Compare commits

1012 Commits

Author SHA1 Message Date
TheCrazyInsanity e9ca12fb1b Merge db9b177a04 into d748981834 2025-12-02 17:49:29 -05:00
Linus Torvalds d748981834 - The mandatory pile of cleanups the cat drags in every merge window

Merge tag 'x86_cleanups_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 cleanups from Borislav Petkov:

 - The mandatory pile of cleanups the cat drags in every merge window

* tag 'x86_cleanups_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/boot: Clean up whitespace in a20.c
  x86/mm: Delete disabled debug code
  x86/{boot,mtrr}: Remove unused function declarations
  x86/percpu: Use BIT_WORD() and BIT_MASK() macros
  x86/cpufeatures: Correct LKGS feature flag description
  x86/idtentry: Add missing '*' to kernel-doc lines
2025-12-02 12:17:47 -08:00
Linus Torvalds 2ae20d6510 - Add support for AMD's Smart Data Cache Injection feature which allows
for direct insertion of data from I/O devices into the L3 cache, thus
   bypassing DRAM and saving its bandwidth; the resctrl side of the feature
   allows the size of the L3 used for data injection to be controlled
 
 - Add Intel Clearwater Forest to the list of CPUs which support Sub-NUMA
   clustering
 
 - Other fixes and cleanups

Merge tag 'x86_cache_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 resource control updates from Borislav Petkov:

 - Add support for AMD's Smart Data Cache Injection feature which allows
   for direct insertion of data from I/O devices into the L3 cache, thus
   bypassing DRAM and saving its bandwidth; the resctrl side of the
   feature allows the size of the L3 used for data injection to be
   controlled

 - Add Intel Clearwater Forest to the list of CPUs which support
   Sub-NUMA clustering

 - Other fixes and cleanups

* tag 'x86_cache_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  fs/resctrl: Update bit_usage to reflect io_alloc
  fs/resctrl: Introduce interface to modify io_alloc capacity bitmasks
  fs/resctrl: Modify struct rdt_parse_data to pass mode and CLOSID
  fs/resctrl: Introduce interface to display io_alloc CBMs
  fs/resctrl: Add user interface to enable/disable io_alloc feature
  fs/resctrl: Introduce interface to display "io_alloc" support
  x86,fs/resctrl: Implement "io_alloc" enable/disable handlers
  x86,fs/resctrl: Detect io_alloc feature
  x86/resctrl: Add SDCIAE feature in the command line options
  x86/cpufeatures: Add support for L3 Smart Data Cache Injection Allocation Enforcement
  fs/resctrl: Consider sparse masks when initializing new group's allocation
  x86/resctrl: Support Sub-NUMA Cluster (SNC) mode on Clearwater Forest
2025-12-02 11:55:58 -08:00
Linus Torvalds 2a47c26e55 - Add microcode staging support on Intel: it moves the sole microcode
blobs loading to a non-critical path so that microcode loading
   latencies are kept at minimum. The actual "directing" the hardware to
   load microcode is the only step which is done on the critical path.
   This scheme is also opportunistic as in: on a failure, the machinery
   falls back to normal loading
 
 - Add the capability to the AMD side of the loader to select one of two
   per-family/model/stepping patches: one is pre-Entrysign and the other
   is post-Entrysign; with the goal to take care of machines which
   haven't updated their BIOS yet - something they should absolutely do
   as this is the only proper Entrysign fix
 
 - Other small cleanups and fixlets

Merge tag 'x86_microcode_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 microcode loading updates from Borislav Petkov:

 - Add microcode staging support on Intel: it moves the sole microcode
   blobs loading to a non-critical path so that microcode loading
   latencies are kept at minimum. The actual "directing" the hardware to
   load microcode is the only step which is done on the critical path.

   This scheme is also opportunistic as in: on a failure, the machinery
   falls back to normal loading

 - Add the capability to the AMD side of the loader to select one of two
   per-family/model/stepping patches: one is pre-Entrysign and the other
   is post-Entrysign; with the goal to take care of machines which
   haven't updated their BIOS yet - something they should absolutely do
   as this is the only proper Entrysign fix

 - Other small cleanups and fixlets

* tag 'x86_microcode_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/microcode: Mark early_parse_cmdline() as __init
  x86/microcode/AMD: Select which microcode patch to load
  x86/microcode/intel: Enable staging when available
  x86/microcode/intel: Support mailbox transfer
  x86/microcode/intel: Implement staging handler
  x86/microcode/intel: Define staging state struct
  x86/microcode/intel: Establish staging control logic
  x86/microcode: Introduce staging step to reduce late-loading time
  x86/cpu/topology: Make primary thread mask available with SMP=n
2025-12-02 11:35:49 -08:00
Linus Torvalds a61288200e - The second part of the AMD MCA interrupts rework after the last-minute
show-stopper from the last merge window was sorted out. After this,
   the AMD MCA deferred errors, thresholding and corrected errors
   interrupt handlers use common MCA code and are tightly integrated
   into the core MCA code, thereby getting rid of considerable
   duplication. All culminating into allowing CMCI error thresholding
   storms to be detected at AMD too, using the common infrastructure
 
 - Add support for two new MCA bank bits on AMD Zen6 which denote whether
   the error address logged is a system physical address, which obviates
   the need for it to be translated before further error recovery can be
   done

Merge tag 'ras_core_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 RAS updates from Borislav Petkov:

 - The second part of the AMD MCA interrupts rework after the
   last-minute show-stopper from the last merge window was sorted out.
   After this, the AMD MCA deferred errors, thresholding and corrected
   errors interrupt handlers use common MCA code and are tightly
   integrated into the core MCA code, thereby getting rid of
   considerable duplication. All culminating into allowing CMCI error
   thresholding storms to be detected at AMD too, using the common
   infrastructure

 - Add support for two new MCA bank bits on AMD Zen6 which denote
   whether the error address logged is a system physical address, which
   obviates the need for it to be translated before further error
   recovery can be done

* tag 'ras_core_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Handle AMD threshold interrupt storms
  x86/mce: Do not clear bank's poll bit in mce_poll_banks on AMD SMCA systems
  x86/mce: Add support for physical address valid bit
  x86/mce: Save and use APEI corrected threshold limit
  x86/mce/amd: Define threshold restart function for banks
  x86/mce/amd: Remove redundant reset_block()
  x86/mce/amd: Support SMCA Corrected Error Interrupt
  x86/mce/amd: Enable interrupt vectors once per-CPU on SMCA systems
  x86/mce: Unify AMD DFR handler with MCA Polling
  x86/mce: Unify AMD THR handler with MCA Polling
2025-12-02 11:04:37 -08:00
Linus Torvalds 49219bba01 - imh_edac: Add a new EDAC driver for Intel Diamond Rapids and
future incarnations of this memory controller architecture
 
 - amd64_edac: Remove the legacy csrow sysfs interface which has been
   deprecated and unused (we assume) for at least a decade
 
 - Add the capability to fall back to BIOS-provided address translation
   functionality (ACPI PRM) which can be used on systems unsupported by
   the current AMD address translation library
 
 - The usual fixes, fixlets, cleanups and improvements all over the place

Merge tag 'edac_updates_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras

Pull EDAC updates from Borislav Petkov:

 - imh_edac: Add a new EDAC driver for Intel Diamond Rapids and future
   incarnations of this memory controller architecture

 - amd64_edac: Remove the legacy csrow sysfs interface which has been
   deprecated and unused (we assume) for at least a decade

 - Add the capability to fall back to BIOS-provided address translation
   functionality (ACPI PRM) which can be used on systems unsupported by
   the current AMD address translation library

 - The usual fixes, fixlets, cleanups and improvements all over the
   place

* tag 'edac_updates_for_v6.19_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
  RAS/AMD/ATL: Replace bitwise_xor_bits() with hweight16()
  EDAC/igen6: Fix error handling in igen6_edac driver
  EDAC/imh: Setup 'imh_test' debugfs testing node
  EDAC/{skx_comm,imh}: Detect 2-level memory configuration
  EDAC/skx_common: Extend the maximum number of DRAM chip row bits
  EDAC/{skx_common,imh}: Add EDAC driver for Intel Diamond Rapids servers
  EDAC/skx_common: Prepare for skx_set_hi_lo()
  EDAC/skx_common: Prepare for skx_get_edac_list()
  EDAC/{skx_common,skx,i10nm}: Make skx_register_mci() independent of pci_dev
  EDAC/ghes: Replace deprecated strcpy() in ghes_edac_report_mem_error()
  EDAC/ie31200: Fix error handling in ie31200_register_mci
  RAS/CEC: Replace use of system_wq with system_percpu_wq
  EDAC: Remove the legacy EDAC sysfs interface
  EDAC/amd64: Remove NUM_CONTROLLERS macro
  EDAC/amd64: Generate ctl_name string at runtime
  RAS/AMD/ATL: Require PRM support for future systems
  ACPI: PRM: Add acpi_prm_handler_available()
  RAS/AMD/ATL: Return error codes from helper functions
2025-12-02 10:45:50 -08:00
Linus Torvalds 7f8d5f70ff Tree wide cleanup of the remaining users of in_irq() which got replaced
by in_hardirq() and marked deprecated in 2020.

Merge tag 'core-core-2025-12-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull core irq cleanup from Thomas Gleixner:
 "Tree wide cleanup of the remaining users of in_irq() which got
  replaced by in_hardirq() and marked deprecated in 2020"

* tag 'core-core-2025-12-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  treewide: Remove in_irq()
2025-12-02 10:18:49 -08:00
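
The in_irq() removal above is a mechanical rename at each call site. A minimal sketch of one converted site, assuming only the standard preempt/hardirq headers (the demo_* wrapper names are illustrative, not from the tree):

    #include <linux/preempt.h>

    /* Before: the spelling that was marked deprecated in 2020. */
    static bool demo_in_hard_interrupt_old(void)
    {
            return in_irq();
    }

    /* After: the treewide replacement applied by this merge. */
    static bool demo_in_hard_interrupt_new(void)
    {
            return in_hardirq();
    }
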
Linus Torvalds d42e504a55 Update to the time/timers core:
- Prevent a thundering herd problem when the timekeeper CPU is delayed
     and a large number of CPUs compete to acquire jiffies_lock to do the
     update. Limit it to one CPU with a separate "uncontended" atomic
     variable.
 
   - A set of improvements for the timer migration mechanism:
 
     - Support imbalanced NUMA trees correctly
 
     - Support dynamic exclusion of CPUs from the migrator duty to allow the
       cpuset/isolation mechanism to exclude them from handling timers of
       remote idle CPUs.
 
    - The usual small updates, cleanups and enhancements

Merge tag 'timers-core-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer core updates from Thomas Gleixner:

 - Prevent a thundering herd problem when the timekeeper CPU is delayed
   and a large number of CPUs compete to acquire jiffies_lock to do the
   update. Limit it to one CPU with a separate "uncontended" atomic
   variable.

 - A set of improvements for the timer migration mechanism:

     - Support imbalanced NUMA trees correctly

     - Support dynamic exclusion of CPUs from the migrator duty to allow
       the cpuset/isolation mechanism to exclude them from handling
       timers of remote idle CPUs

 - The usual small updates, cleanups and enhancements

* tag 'timers-core-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers/migration: Exclude isolated cpus from hierarchy
  cpumask: Add initialiser to use cleanup helpers
  sched/isolation: Force housekeeping if isolcpus and nohz_full don't leave any
  cgroup/cpuset: Rename update_unbound_workqueue_cpumask() to update_isolation_cpumasks()
  timers/migration: Use scoped_guard on available flag set/clear
  timers/migration: Add mask for CPUs available in the hierarchy
  timers/migration: Rename 'online' bit to 'available'
  selftests/timers/nanosleep: Add tests for return of remaining time
  selftests/timers: Clean up kernel version check in posix_timers
  time: Fix a few typos in time[r] related code comments
  time: tick-oneshot: Add missing Return and parameter descriptions to kernel-doc
  hrtimer: Store time as ktime_t in restart block
  timers/migration: Remove dead code handling idle CPU checking for remote timers
  timers/migration: Remove unused "cpu" parameter from tmigr_get_group()
  timers/migration: Assert that hotplug preparing CPU is part of stable active hierarchy
  timers/migration: Fix imbalanced NUMA trees
  timers/migration: Remove locking on group connection
  timers/migration: Convert "while" loops to use "for"
  tick/sched: Limit non-timekeeper CPUs calling jiffies update
2025-12-02 09:58:33 -08:00
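
The thundering-herd fix above amounts to letting a single CPU attempt the shared jiffies update while the others skip instead of queueing on jiffies_lock. A generic userspace sketch of that gating idea, not the kernel's actual tick code (the atomic_flag choice and all names here are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_flag update_claimed = ATOMIC_FLAG_INIT;

    /*
     * Run do_update() on at most one caller at a time; contenders bail
     * out immediately instead of piling up on the lock that the update
     * itself takes.
     */
    static bool try_shared_update(void (*do_update)(void))
    {
            if (atomic_flag_test_and_set_explicit(&update_claimed,
                                                  memory_order_acquire))
                    return false;   /* someone else is already updating */

            do_update();            /* e.g. advance the shared counter */

            atomic_flag_clear_explicit(&update_claimed, memory_order_release);
            return true;
    }
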
Linus Torvalds 5028f42416 Updates for clocksource and clockevent drivers:
- A new driver for the Realtek system timer
 
  - Prevent the unbinding of timers when the drivers do not support that.
 
  - Expand the timer counter readout for the SPRD driver to 64 bit to allow
    IoT device suspend times of more than 36 hours, which is the current
    limit of the 32-bit readout
 
  - The usual small cleanups, fixes and enhancements all over the place.

Merge tag 'timers-clocksource-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull clocksource updates from Thomas Gleixner:
 "Updates for clocksource and clockevent drivers:

   - A new driver for the Realtek system timer

   - Prevent the unbinding of timers when the drivers do not support
     that

   - Expand the timer counter readout for the SPRD driver to 64 bit
      to allow IoT device suspend times of more than 36 hours, which
      is the current limit of the 32-bit readout

   - The usual small cleanups, fixes and enhancements all over the
     place"

* tag 'timers-clocksource-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  clocksource/drivers: Add Realtek system timer driver
  dt-bindings: timer: Add Realtek SYSTIMER
  clocksource/drivers/stm32-lp: Drop unused module alias
  clocksource/drivers/rda: Add sched_clock_register for RDA8810PL SoC
  clocksource/drivers/nxp-stm: Prevent driver unbind
  clocksource/drivers/nxp-pit: Prevent driver unbind
  clocksource/drivers/arm_arch_timer_mmio: Prevent driver unbind
  clocksource/drivers/nxp-stm: Fix section mismatches
  clocksource/drivers/sh_cmt: Always leave device running after probe
  clocksource/drivers/stm: Fix double deregistration on probe failure
  clocksource/drivers/ralink: Fix resource leaks in init error path
  clocksource/drivers/timer-sp804: Fix read_current_timer() issue when clock source is not registered
  clocksource/drivers/sprd: Enable register for timer counter from 32 bit to 64 bit
2025-12-02 09:54:27 -08:00
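
For context, the 36-hour figure above follows directly from the counter width, assuming the SPRD timer runs at the common 32.768 kHz rate: a 32-bit counter wraps after 2^32 / 32768 Hz = 131072 seconds, roughly 36.4 hours, while a 64-bit readout pushes the wrap far beyond any realistic suspend time.
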
Linus Torvalds 9ce62ebbb7 Updates for [PCI] MSI related code:
- Remove one variant of PCI/MSI management as all users have been
    converted to use per device domains. That reduces the variants to two:
 
    The modern and the real archaic legacy variant, which keeps the usual
    suspects in the museum category alive.
 
  - Rework the platform MSI device ID detection mechanism in the ARM GIC
    world to address resource leaks, duplicated code and other details. This
    requires a corresponding preparatory step in the PCI/iproc driver.
 
  - Trivial core code cleanups

Merge tag 'irq-msi-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull MSI updates from Thomas Gleixner:
 "Updates for [PCI] MSI related code:

   - Remove one variant of PCI/MSI management as all users have been
     converted to use per device domains. That reduces the variants to
     two:

     The modern and the real archaic legacy variant, which keeps the
     usual suspects in the museum category alive.

   - Rework the platform MSI device ID detection mechanism in the ARM
     GIC world to address resource leaks, duplicated code and other
     details. This requires a corresponding preparatory step in the
     PCI/iproc driver.

   - Trivial core code cleanups"

* tag 'irq-msi-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  irqchip/gic-its: Rework platform MSI deviceID detection
  PCI: iproc: Implement MSI controller node detection with of_msi_xlate()
  genirq/msi: Slightly simplify msi_domain_alloc()
  PCI/MSI: Delete pci_msi_create_irq_domain()
2025-12-02 09:35:59 -08:00
Linus Torvalds 15b87bec89 Boring updates for interrupt drivers:
- Support for a couple of new ARM64 and RISCV SoC variants and their
     magic interrupt controllers which either can reuse existing code or
     require quirks due to a botched hardware implementation.
 
   - More section mismatch fixes.
 
   - The usual cleanups and fixes all over the place.

Merge tag 'irq-drivers-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull irq driver updates from Thomas Gleixner:
 "Boring updates for interrupt drivers:

   - Support for a couple of new ARM64 and RISCV SoC variants and their
     magic interrupt controllers which either can reuse existing code or
     require quirks due to a botched hardware implementation

   - More section mismatch fixes

   - The usual cleanups and fixes all over the place"

* tag 'irq-drivers-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  irqchip/meson-gpio: Add support for Amlogic S6 S7 and S7D SoCs
  dt-bindings: interrupt-controller: Add support for Amlogic S6 S7 and S7D SoCs
  dt-bindings: interrupt-controller: aspeed,ast2700: Correct #interrupt-cells and interrupts count
  irqchip/aclint-sswi: Add Nuclei UX900 support
  dt-bindings: interrupt-controller: Add Anlogic DR1V90 ACLINT SSWI
  dt-bindings: interrupt-controller: Add Anlogic DR1V90 ACLINT MSWI
  dt-bindings: interrupt-controller: Add Anlogic DR1V90 PLIC
  irqchip/irq-bcm7038-l1: Remove unused reg_mask_status()
  irqchip/sifive-plic: Fix call to __plic_toggle() in M-Mode code path
  irqchip/sifive-plic: Add support for UltraRISC DP1000 PLIC
  irqchip/sifive-plic: Cache the interrupt enable state
  dt-bindings: interrupt-controller: Add UltraRISC DP1000 PLIC
  dt-bindings: vendor-prefixes: Add UltraRISC
  irqchip/qcom-irq-combiner: Rename driver structure
  irqchip/riscv-imsic: Inline imsic_vector_from_local_id()
  irqchip/riscv-imsic: Embed the vector array in lpriv
  irqchip/riscv-imsic: Remove redundant irq_data lookups
  irqchip/ts4800: Drop unused module alias
  irqchip/mvebu-pic: Drop unused module alias
  irqchip/meson-gpio: Drop unused module alias
  ...
2025-12-02 09:32:53 -08:00
Linus Torvalds 6863c8385c Updates for the interrupt core and treewide cleanups:
- Rework of the Per Processor Interrupt (PPI) management on ARM[64].
 
     PPI support was built under the assumption that the systems are
     homogeneous so that the same CPU local device types are connected to
     them. That's unfortunately wishful thinking and created horrible
     workarounds.
 
     This rework provides affinity management for PPIs so that they can be
     individually configured in the firmware tables and mops up the related
     drivers all over the place.
 
   - Prevent CPUSET/isolation changes from arbitrarily affining interrupt
     threads to random CPUs, which would ignore user or driver settings.
 
   - Plug a harmless race in the interrupt affinity proc interface, which
     allows a half-updated mask to be seen
 
   - Adjust the priority of secondary interrupt threads on RT, so that the
     combination of primary and secondary thread emulates the hardware
     interrupt plus thread scenario. Having them at the same priority can
     cause starvation issues in some drivers.

Merge tag 'irq-core-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull irq core updates from Thomas Gleixner:
 "Updates for the interrupt core and treewide cleanups:

   - Rework of the Per Processor Interrupt (PPI) management on ARM[64]

     PPI support was built under the assumption that the systems are
      homogeneous so that the same CPU local device types are connected to
     them. That's unfortunately wishful thinking and created horrible
     workarounds.

     This rework provides affinity management for PPIs so that they can
     be individually configured in the firmware tables and mops up the
     related drivers all over the place.

   - Prevent CPUSET/isolation changes from arbitrarily affining interrupt
     threads to random CPUs, which would ignore user or driver settings.

   - Plug a harmless race in the interrupt affinity proc interface,
      which allowed a half-updated mask to be seen

   - Adjust the priority of secondary interrupt threads on RT, so that
     the combination of primary and secondary thread emulates the
     hardware interrupt plus thread scenario. Having them at the same
     priority can cause starvation issues in some drivers"

* tag 'irq-core-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  genirq: Remove cpumask availability check on kthread affinity setting
  genirq: Fix interrupt threads affinity vs. cpuset isolated partitions
  genirq: Prevent early spurious wake-ups of interrupt threads
  genirq: Use raw_spinlock_irq() in irq_set_affinity_notifier()
  genirq/manage: Reduce priority of forced secondary interrupt handler
  genirq/proc: Fix race in show_irq_affinity()
  genirq: Fix percpu_devid irq affinity documentation
  perf: arm_pmu: Kill last use of per-CPU cpu_armpmu pointer
  irqdomain: Kill of_node_to_fwnode() helper
  genirq: Kill irq_{g,s}et_percpu_devid_partition()
  irqchip: Kill irq-partition-percpu
  irqchip/apple-aic: Drop support for custom PMU irq partitions
  irqchip/gic-v3: Drop support for custom PPI partitions
  coresight: trbe: Request specific affinities for per CPU interrupts
  perf: arm_spe_pmu: Request specific affinities for per CPU interrupts
  perf: arm_pmu: Request specific affinities for per CPU NMIs/interrupts
  genirq: Add request_percpu_irq_affinity() helper
  genirq: Allow per-cpu interrupt sharing for non-overlapping affinities
  genirq: Update request_percpu_nmi() to take an affinity
  genirq: Add affinity to percpu_devid interrupt requests
  ...
2025-12-02 09:14:26 -08:00
Linus Torvalds 312f5b1866 Two small updates for debugobjects:
- Allow pool refill on RT enabled kernels before the scheduler is up
       and running to prevent pool exhaustion
 
     - Correct the lockdep override to prevent false positives.

Merge tag 'core-debugobjects-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull debugobjects update from Thomas Gleixner:
 "Two small updates for debugobjects:

   - Allow pool refill on RT enabled kernels before the scheduler is up
     and running to prevent pool exhaustion

   - Correct the lockdep override to prevent false positives"

* tag 'core-debugobjects-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  debugobjects: Use LD_WAIT_CONFIG instead of LD_WAIT_SLEEP
  debugobjects: Allow to refill the pool before SYSTEM_SCHEDULING
2025-12-02 09:07:48 -08:00
Linus Torvalds 2b09f480f0 A large overhaul of the restartable sequences and CID management:
The recent enablement of RSEQ in glibc resulted in regressions which are
   caused by the related overhead. It turned out that the decision to invoke
   the exit to user work was not really a decision. More or less each
   context switch caused that. There is a long list of small issues which
   sums up nicely and results in a 3-4% regression in I/O benchmarks.
 
   The other detail which caused issues due to extra work in context switch
   and task migration is the CID (memory context ID) management. It also
   requires to use a task work to consolidate the CID space, which is
   executed in the context of an arbitrary task and results in sporadic
   uncontrolled exit latencies.
 
   The rewrite addresses this by:
 
   - Removing deprecated and long unsupported functionality
 
   - Moving the related data into dedicated data structures which are
     optimized for fast path processing.
 
   - Caching values so actual decisions can be made
 
   - Replacing the current implementation with an optimized inlined variant.
 
   - Separating fast and slow path for architectures which use the generic
     entry code, so that only fault and error handling goes into the
     TIF_NOTIFY_RESUME handler.
 
   - Rewriting the CID management so that it becomes mostly invisible in the
     context switch path. That moves the work of switching modes into the
     fork/exit path, which is a reasonable tradeoff. That work is only
     required when a process creates more threads than the cpuset it is
     allowed to run on or when enough threads exit after that. An artificial
     thread pool benchmark which triggers this did not degrade; it actually
     improved significantly.
 
     The main effect in migration heavy scenarios is that runqueue lock held
     time and therefore contention goes down significantly.

Merge tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull rseq updates from Thomas Gleixner:
 "A large overhaul of the restartable sequences and CID management:

  The recent enablement of RSEQ in glibc resulted in regressions which
  are caused by the related overhead. It turned out that the decision to
  invoke the exit to user work was not really a decision. More or less
  each context switch caused that. There is a long list of small issues
  which sums up nicely and results in a 3-4% regression in I/O
  benchmarks.

  The other detail which caused issues due to extra work in context
  switch and task migration is the CID (memory context ID) management.
   It also requires using a task work to consolidate the CID space,
  which is executed in the context of an arbitrary task and results in
  sporadic uncontrolled exit latencies.

  The rewrite addresses this by:

   - Removing deprecated and long unsupported functionality

   - Moving the related data into dedicated data structures which are
     optimized for fast path processing.

   - Caching values so actual decisions can be made

   - Replacing the current implementation with an optimized inlined
     variant.

   - Separating fast and slow path for architectures which use the
     generic entry code, so that only fault and error handling goes into
     the TIF_NOTIFY_RESUME handler.

   - Rewriting the CID management so that it becomes mostly invisible in
     the context switch path. That moves the work of switching modes
     into the fork/exit path, which is a reasonable tradeoff. That work
     is only required when a process creates more threads than the
     cpuset it is allowed to run on or when enough threads exit after
     that. An artificial thread pool benchmark which triggers this did
     not degrade; it actually improved significantly.

     The main effect in migration heavy scenarios is that runqueue lock
     held time and therefore contention goes down significantly"

* tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/mmcid: Switch over to the new mechanism
  sched/mmcid: Implement deferred mode change
  irqwork: Move data struct to a types header
  sched/mmcid: Provide CID ownership mode fixup functions
  sched/mmcid: Provide new scheduler CID mechanism
  sched/mmcid: Introduce per task/CPU ownership infrastructure
  sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex
  sched/mmcid: Provide precomputed maximal value
  sched/mmcid: Move initialization out of line
  signal: Move MMCID exit out of sighand lock
  sched/mmcid: Convert mm CID mask to a bitmap
  cpumask: Cache num_possible_cpus()
  sched/mmcid: Use cpumask_weighted_or()
  cpumask: Introduce cpumask_weighted_or()
  sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
  sched/mmcid: Move scheduler code out of global header
  sched: Fixup whitespace damage
  sched/mmcid: Cacheline align MM CID storage
  sched/mmcid: Use proper data structures
  sched/mmcid: Revert the complex CID management
  ...
2025-12-02 08:48:53 -08:00
Linus Torvalds 1dce50698a Scoped user mode access and related changes:
- Implement the missing u64 user access function on ARM when
    CONFIG_CPU_SPECTRE=n. This makes it possible to access a 64bit value in
    generic code with [unsafe_]get_user(). All other architectures and ARM
    variants provide the relevant accessors already.
 
  - Ensure that ASM GOTO jump label usage in the user mode access helpers
    always goes through a local C scope label indirection inside the
    helpers. This is required because compilers do not support an ASM
    GOTO target leaving an auto cleanup scope. GCC silently fails to emit
    the cleanup invocation and CLANG fails the build.
 
    This provides generic wrapper macros and the conversion of affected
    architecture code to use them.
 
  - Scoped user mode access with auto cleanup
 
    Access to user mode memory can be required in hot code paths, but if it
    has to be done with user controlled pointers, the access is shielded
    with a speculation barrier, so that the CPU cannot speculate around the
    address range check. Those speculation barriers impact performance quite
    significantly. This can be avoided by "masking" the provided pointer so
    it is guaranteed to be in the valid user memory access range and
    otherwise to point to a guaranteed unpopulated address space. This has
    to be done without branches so it creates an address dependency for the
    access, which the CPU cannot speculate ahead.
 
    This results in repeating and error prone programming patterns:
 
      	    if (can_do_masked_user_access())
                     from = masked_user_read_access_begin((from));
             else if (!user_read_access_begin(from, sizeof(*from)))
                     return -EFAULT;
             unsafe_get_user(val, from, Efault);
             user_read_access_end();
             return 0;
       Efault:
             user_read_access_end();
             return -EFAULT;
 
     which can be replaced with scopes and automatic cleanup:
 
             scoped_user_read_access(from, Efault)
                     unsafe_get_user(val, from, Efault);
             return 0;
        Efault:
             return -EFAULT;
 
  - Convert code which implements the above pattern over to
    scope_user.*.access(). This also corrects a couple of imbalanced
    masked_*_begin() instances which are harmless on most architectures, but
    prevent PowerPC from implementing the masking optimization.
 
  - Add a missing speculation barrier in copy_from_user_iter()

Merge tag 'core-uaccess-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scoped user access updates from Thomas Gleixner:
 "Scoped user mode access and related changes:

   - Implement the missing u64 user access function on ARM when
     CONFIG_CPU_SPECTRE=n.

     This makes it possible to access a 64bit value in generic code with
     [unsafe_]get_user(). All other architectures and ARM variants
     provide the relevant accessors already.

   - Ensure that ASM GOTO jump label usage in the user mode access
     helpers always goes through a local C scope label indirection
     inside the helpers.

     This is required because compilers do not support an ASM GOTO
     target leaving an auto cleanup scope. GCC silently fails to emit
     the cleanup invocation and CLANG fails the build.

     [ Editor's note: gcc-16 will have fixed the code generation issue
       in commit f68fe3ddda4 ("eh: Invoke cleanups/destructors in asm
       goto jumps [PR122835]"). But we obviously have to deal with clang
       and older versions of gcc, so.. - Linus ]

     This provides generic wrapper macros and the conversion of affected
     architecture code to use them.

   - Scoped user mode access with auto cleanup

     Access to user mode memory can be required in hot code paths, but
     if it has to be done with user controlled pointers, the access is
     shielded with a speculation barrier, so that the CPU cannot
     speculate around the address range check. Those speculation
     barriers impact performance quite significantly.

     This cost can be avoided by "masking" the provided pointer so it is
     guaranteed to be in the valid user memory access range and
     otherwise to point to a guaranteed unpopulated address space. This
     has to be done without branches so it creates an address dependency
     for the access, which the CPU cannot speculate ahead.

     This results in repeating and error prone programming patterns:

       	    if (can_do_masked_user_access())
                      from = masked_user_read_access_begin((from));
              else if (!user_read_access_begin(from, sizeof(*from)))
                      return -EFAULT;
              unsafe_get_user(val, from, Efault);
              user_read_access_end();
              return 0;
        Efault:
              user_read_access_end();
              return -EFAULT;

      which can be replaced with scopes and automatic cleanup:

              scoped_user_read_access(from, Efault)
                      unsafe_get_user(val, from, Efault);
              return 0;
         Efault:
              return -EFAULT;

   - Convert code which implements the above pattern over to
     scope_user.*.access(). This also corrects a couple of imbalanced
     masked_*_begin() instances which are harmless on most
     architectures, but prevent PowerPC from implementing the masking
     optimization.

   - Add a missing speculation barrier in copy_from_user_iter()"

* tag 'core-uaccess-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  lib/strn*,uaccess: Use masked_user_{read/write}_access_begin when required
  scm: Convert put_cmsg() to scoped user access
  iov_iter: Add missing speculation barrier to copy_from_user_iter()
  iov_iter: Convert copy_from_user_iter() to masked user access
  select: Convert to scoped user access
  x86/futex: Convert to scoped user access
  futex: Convert to get/put_user_inline()
  uaccess: Provide put/get_user_inline()
  uaccess: Provide scoped user access regions
  arm64: uaccess: Use unsafe wrappers for ASM GOTO
  s390/uaccess: Use unsafe wrappers for ASM GOTO
  riscv/uaccess: Use unsafe wrappers for ASM GOTO
  powerpc/uaccess: Use unsafe wrappers for ASM GOTO
  x86/uaccess: Use unsafe wrappers for ASM GOTO
  uaccess: Provide ASM GOTO safe wrappers for unsafe_*_user()
  ARM: uaccess: Implement missing __get_user_asm_dword()
2025-12-02 08:01:39 -08:00
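
Put together, a complete read helper built from the scoped pattern quoted above looks roughly like the sketch below. scoped_user_read_access() and unsafe_get_user() are the interfaces named in this pull request; the function name and the u32 payload are only illustrative:

    #include <linux/uaccess.h>

    /*
     * Minimal sketch: fault-tolerant read of one u32 from user space using
     * the scoped form. The access-end cleanup is emitted automatically when
     * the scope is left, including on the Efault path.
     */
    static int demo_get_user_u32(u32 __user *from, u32 *dst)
    {
            u32 val;

            scoped_user_read_access(from, Efault)
                    unsafe_get_user(val, from, Efault);

            *dst = val;
            return 0;
    Efault:
            return -EFAULT;
    }
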
Linus Torvalds 4a26e7032d Core kernel bug handling infrastructure changes for v6.19:
- Improve WARN(), which has vararg printf like arguments,
     to work with the x86 #UD based WARN-optimizing infrastructure
     by hiding the format in the bug_table and replacing this
     first argument with the address of the bug-table entry,
     while making the actual function that's called a UD1 instruction.
     (Peter Zijlstra)
 
   - Introduce the CONFIG_DEBUG_BUGVERBOSE_DETAILED Kconfig switch
     (Ingo Molnar, s390 support by Heiko Carstens)
 
 Fixes and cleanups:
 
   - bugs/s390: Remove private WARN_ON() implementation (Heiko Carstens)
 
   - <asm/bugs.h>: Make i386 use GENERIC_BUG_RELATIVE_POINTERS
     (Peter Zijlstra)
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>

Merge tag 'core-bugs-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull bug handling infrastructure updates from Ingo Molnar:
 "Core updates:

   - Improve WARN(), which has vararg printf like arguments, to work
     with the x86 #UD based WARN-optimizing infrastructure by hiding the
     format in the bug_table and replacing this first argument with the
     address of the bug-table entry, while making the actual function
     that's called a UD1 instruction (Peter Zijlstra)

   - Introduce the CONFIG_DEBUG_BUGVERBOSE_DETAILED Kconfig switch (Ingo
     Molnar, s390 support by Heiko Carstens)

  Fixes and cleanups:

   - bugs/s390: Remove private WARN_ON() implementation (Heiko Carstens)

   - <asm/bugs.h>: Make i386 use GENERIC_BUG_RELATIVE_POINTERS (Peter
     Zijlstra)"

* tag 'core-bugs-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (31 commits)
  x86/bugs: Make i386 use GENERIC_BUG_RELATIVE_POINTERS
  x86/bug: Fix BUG_FORMAT vs KASLR
  x86_64/bug: Inline the UD1
  x86/bug: Implement WARN_ONCE()
  x86_64/bug: Implement __WARN_printf()
  x86/bug: Use BUG_FORMAT for DEBUG_BUGVERBOSE_DETAILED
  x86/bug: Add BUG_FORMAT basics
  bug: Allow architectures to provide __WARN_printf()
  bug: Implement WARN_ON() using __WARN_FLAGS()
  bug: Add report_bug_entry()
  bug: Add BUG_FORMAT_ARGS infrastructure
  bug: Clean up CONFIG_GENERIC_BUG_RELATIVE_POINTERS
  bug: Add BUG_FORMAT infrastructure
  x86: Rework __bug_table helpers
  bugs/s390: Remove private WARN_ON() implementation
  bugs/core: Reorganize fields in the first line of WARNING output, add ->comm[] output
  bugs/sh: Concatenate 'cond_str' with '__FILE__' in __WARN_FLAGS(), to extend WARN_ON/BUG_ON output
  bugs/parisc: Concatenate 'cond_str' with '__FILE__' in __WARN_FLAGS(), to extend WARN_ON/BUG_ON output
  bugs/riscv: Concatenate 'cond_str' with '__FILE__' in __BUG_FLAGS(), to extend WARN_ON/BUG_ON output
  bugs/riscv: Pass in 'cond_str' to __BUG_FLAGS()
  ...
2025-12-01 21:33:01 -08:00
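
The WARN() rework described above keeps the format string out of the call site: it lives in the bug table entry, and the address of the trapping UD1 instruction is what identifies that entry. A stripped-down, purely illustrative model of the lookup (hypothetical struct and function names, not the kernel's real bug_entry layout):

    #include <stddef.h>

    struct demo_bug_entry {
            unsigned long   trap_addr;  /* address of the trapping UD1 */
            const char      *fmt;       /* WARN() format, moved out of the call site */
            unsigned short  line;
            unsigned short  flags;
    };

    /*
     * Trap handler side: map the faulting instruction address back to its
     * table entry so the stored format (plus arguments recovered per the
     * calling convention) can be printed.
     */
    static const struct demo_bug_entry *
    demo_find_bug(const struct demo_bug_entry *table, size_t nr, unsigned long addr)
    {
            for (size_t i = 0; i < nr; i++)
                    if (table[i].trap_addr == addr)
                            return &table[i];
            return NULL;
    }
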
Linus Torvalds dcd8637edb Core x86 changes for v6.19:
- x86/alternatives: Drop unnecessary test after call to
    alt_replace_call() (Juergen Gross)
 
  - x86/dumpstack: Prevent KASAN false positive warnings in
    __show_regs() (Tengda Wu)
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>

Merge tag 'x86-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull core x86 updates from Ingo Molnar:

 - x86/alternatives: Drop unnecessary test after call to
   alt_replace_call() (Juergen Gross)

 - x86/dumpstack: Prevent KASAN false positive warnings in
   __show_regs() (Tengda Wu)

* tag 'x86-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/dumpstack: Prevent KASAN false positive warnings in __show_regs()
  x86/alternative: Drop not needed test after call of alt_replace_call()
2025-12-01 21:31:02 -08:00
Linus Torvalds e7d81c1ed6 A single fix for an ancient prototype in the math-emu code, by Arnd Bergmann.
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Merge tag 'x86-build-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 math-emu fix from Ingo Molnar:
 "A single fix for an ancient prototype in the math-emu code, by Arnd
  Bergmann"

* tag 'x86-build-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/math-emu: Fix div_Xsig() prototype
2025-12-01 21:28:23 -08:00
Linus Torvalds de2f75d55e x86/apic changes for v6.19:
- x86/apic: Fix the frequency in apic=verbose log output (Julian Stecklina)
 
   - Simplify mp_irqdomain_alloc() slightly (Christophe JAILLET)
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>

Merge tag 'x86-apic-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 apic updates from Ingo Molnar:

 - x86/apic: Fix the frequency in apic=verbose log output (Julian
   Stecklina)

 - Simplify mp_irqdomain_alloc() slightly (Christophe JAILLET)

* tag 'x86-apic-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/apic: Fix frequency in apic=verbose log output
  x86/ioapic: Simplify mp_irqdomain_alloc() slightly
2025-12-01 21:26:35 -08:00
Linus Torvalds 6d2c10e889 Scheduler changes for v6.19:
Scalability and load-balancing improvements:
 
   - Enable scheduler feature NEXT_BUDDY (Mel Gorman)
 
   - Reimplement NEXT_BUDDY to align with EEVDF goals (Mel Gorman)
 
   - Skip sched_balance_running cmpxchg when balance is not due (Tim Chen)
 
   - Implement generic code for architecture specific sched domain
     NUMA distances (Tim Chen)
 
   - Optimize the NUMA distances of the sched-domains builds of Intel
     Granite Rapids (GNR) and Clearwater Forest (CWF) platforms
     (Tim Chen)
 
   - Implement proportional newidle balance: a randomized algorithm
     that runs newidle balancing proportional to its success rate.
     (Peter Zijlstra)
 
 Scheduler infrastructure changes:
 
   - Implement the 'sched_change' scoped_guard() pattern for
     the entire scheduler (Peter Zijlstra)
 
   - More broadly utilize the sched_change guard (Peter Zijlstra)
 
   - Add support to pick functions to take runqueue-flags (Joel Fernandes)
 
   - Provide and use set_need_resched_current() (Peter Zijlstra)
 
 Fair scheduling enhancements:
 
   - Forfeit vruntime on yield (Fernand Sieber)
   - Only update stats for allowed CPUs when looking for dst group (Adam Li)
 
 CPU-core scheduling enhancements:
 
   - Optimize core cookie matching check (Fernand Sieber)
 
 Deadline scheduler fixes:
 
   - Only set free_cpus for online runqueues (Doug Berger)
   - Fix dl_server time accounting (Peter Zijlstra)
   - Fix dl_server stop condition (Peter Zijlstra)
 
 Proxy scheduling fixes:
 
   - Yield the donor task (Fernand Sieber)
 
 Fixes and cleanups:
 
   - Fix do_set_cpus_allowed() locking (Peter Zijlstra)
   - Fix migrate_disable_switch() locking (Peter Zijlstra)
   - Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked() (Hao Jia)
   - Increase sched_tick_remote timeout (Phil Auld)
   - sched/deadline: Use cpumask_weight_and() in dl_bw_cpus() (Shrikanth Hegde)
   - sched/deadline: Clean up select_task_rq_dl() (Shrikanth Hegde)
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmktd+MRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1hRFw//TNK2tN9D/U3rh66RU/evkv4ZoEfdkqwl
 L3MDs9evupkggELFBjImd+lxDTjKnvq6aSb9ryJEbmivJMVUAcCF1fBILcs5m51N
 pCIapavjsENMdCPRzC7xDE3t5GSAQ35NIWkLbr5d4bpEIp8YmchOKoikWNMT09A1
 ifHJ+BMWiX5AY3r0bcvsi8XRzo/bSw1OA+gTh0BUyD9VBbIrqjvYR+W6nJXw6FKB
 rHp+VlaidFIbfsi25KU7Ixn52K4Cqm0AlapMIlDZABPZpKFAkhDCRSb6CGigZl4T
 rNSSmnkmF6GFVVIKnNfWiW2OsojnS9mm1tAiM8huu+KumUqFXhfzlfoNPF9oMphR
 TCeyK66br55/WK86Of5zowxQLwj0/nLHCT9HbV4Bre7kakTUJjtFKBqxBcdoENjt
 bx/tLoFeS4EeMSULAT0vZvE064XT0gHnRU0UXSBwRTuNcjMsN6RQVFWfkBHRx90g
 HLoK2AkvdqUFeiHWXiIx9wu6CpWBPPmiGL3+7RyXJ93z2EccmCRP5jmzYFxNoaD/
 uKnz9gYETNyVIhU44pSJcImpxr2m9SKfqetouhaYrPi9SRhhOBP+ZSzNCUPMQeIN
 ZVpzYinMhZo4m0c8W2hitlGLmj3hIgt7vesyiFlqMJrS3Y5PNnPsN608PI27IGLa
 GSbXtohZZIM=
 =BZY4
 -----END PGP SIGNATURE-----

Merge tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "Scalability and load-balancing improvements:

   - Enable scheduler feature NEXT_BUDDY (Mel Gorman)

   - Reimplement NEXT_BUDDY to align with EEVDF goals (Mel Gorman)

   - Skip sched_balance_running cmpxchg when balance is not due (Tim
     Chen)

   - Implement generic code for architecture specific sched domain NUMA
     distances (Tim Chen)

   - Optimize the NUMA distances of the sched-domains builds of Intel
     Granite Rapids (GNR) and Clearwater Forest (CWF) platforms (Tim
     Chen)

   - Implement proportional newidle balance: a randomized algorithm that
     runs newidle balancing in proportion to its success rate; an
     illustrative sketch follows this message. (Peter Zijlstra)

  Scheduler infrastructure changes:

   - Implement the 'sched_change' scoped_guard() pattern for the entire
     scheduler (Peter Zijlstra)

   - More broadly utilize the sched_change guard (Peter Zijlstra)

   - Add support to pick functions to take runqueue-flags (Joel
     Fernandes)

   - Provide and use set_need_resched_current() (Peter Zijlstra)

  Fair scheduling enhancements:

   - Forfeit vruntime on yield (Fernand Sieber)

   - Only update stats for allowed CPUs when looking for dst group (Adam
     Li)

  CPU-core scheduling enhancements:

   - Optimize core cookie matching check (Fernand Sieber)

  Deadline scheduler fixes:

   - Only set free_cpus for online runqueues (Doug Berger)

   - Fix dl_server time accounting (Peter Zijlstra)

   - Fix dl_server stop condition (Peter Zijlstra)

  Proxy scheduling fixes:

   - Yield the donor task (Fernand Sieber)

  Fixes and cleanups:

   - Fix do_set_cpus_allowed() locking (Peter Zijlstra)

   - Fix migrate_disable_switch() locking (Peter Zijlstra)

   - Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
     (Hao Jia)

   - Increase sched_tick_remote timeout (Phil Auld)

   - sched/deadline: Use cpumask_weight_and() in dl_bw_cpus() (Shrikanth
     Hegde)

   - sched/deadline: Clean up select_task_rq_dl() (Shrikanth Hegde)"
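
  An illustrative sketch of the randomized gating idea behind "proportional
  newidle balance" (this is not the kernel implementation; the structure and
  names below are hypothetical, only get_random_u32() is a real kernel API):

      #include <linux/random.h>
      #include <linux/types.h>

      /* Run newidle balancing in proportion to how often it recently helped. */
      struct newidle_stats {
          unsigned int attempts;
          unsigned int pulled;
      };

      static bool should_try_newidle_balance(struct newidle_stats *st)
      {
          unsigned int ratio;

          if (!st->attempts)
              return true;                    /* no history yet: just try */

          /* recent success rate, scaled to 0..255 */
          ratio = (st->pulled << 8) / st->attempts;

          /* attempt the balance pass with probability ~ success rate */
          return (get_random_u32() & 0xff) < ratio;
      }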

* tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits)
  sched: Provide and use set_need_resched_current()
  sched/fair: Proportional newidle balance
  sched/fair: Small cleanup to update_newidle_cost()
  sched/fair: Small cleanup to sched_balance_newidle()
  sched/fair: Revert max_newidle_lb_cost bump
  sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals
  sched/fair: Enable scheduler feature NEXT_BUDDY
  sched: Increase sched_tick_remote timeout
  sched/fair: Have SD_SERIALIZE affect newidle balancing
  sched/fair: Skip sched_balance_running cmpxchg when balance is not due
  sched/deadline: Minor cleanup in select_task_rq_dl()
  sched/deadline: Use cpumask_weight_and() in dl_bw_cpus
  sched/deadline: Document dl_server
  sched/deadline: Fix dl_server stop condition
  sched/deadline: Fix dl_server time accounting
  sched/core: Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
  sched/eevdf: Fix min_vruntime vs avg_vruntime
  sched/core: Add comment explaining force-idle vruntime snapshots
  sched/core: Optimize core cookie matching check
  sched/proxy: Yield the donor task
  ...
2025-12-01 21:04:45 -08:00
Linus Torvalds 6c26fbe8c9 Performance events changes for v6.19:
Callchain support:
 
  - Add support for deferred user-space stack unwinding for
    perf, enabled on x86. (Peter Zijlstra, Steven Rostedt)
 
  - unwind_user/x86: Enable frame pointer unwinding on x86
    (Josh Poimboeuf)
 
 x86 PMU support and infrastructure:
 
  - x86/insn: Simplify for_each_insn_prefix() (Peter Zijlstra)
 
  - x86/insn,uprobes,alternative: Unify insn_is_nop()
    (Peter Zijlstra)
 
 Intel PMU driver:
 
  - Large series to prepare for and implement architectural PEBS
    support for Intel platforms such as Clearwater Forest (CWF)
    and Panther Lake (PTL). (Dapeng Mi, Kan Liang)
 
  - Check dynamic constraints (Kan Liang)
 
  - Optimize PEBS extended config (Peter Zijlstra)
 
  - cstates: Remove PC3 support from LunarLake (Zhang Rui)
 
  - cstates: Add Pantherlake support (Zhang Rui)
 
  - cstates: Clearwater Forest support (Zide Chen)
 
 AMD PMU driver:
 
  - x86/amd: Check event before enable to avoid GPF (George Kennedy)
 
 Fixes and cleanups:
 
  - task_work: Fix NMI race condition (Peter Zijlstra)
 
  - perf/x86: Fix NULL event access and potential PEBS record loss
    (Dapeng Mi)
 
  - Misc other fixes and cleanups.
    (Dapeng Mi, Ingo Molnar, Peter Zijlstra)
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmktcU0RHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1gNKw//ThLmbkoGJ0/yLOdEcW8rA/7HB43Oz6j9
 k0Vs7zwDBMRFP4zQg2XeF5SH7CWS9p/nI3eMhorgmH77oJCvXJxVtD5991zmlZhf
 eafOar5ZMVaoMz+tK8WWiENZyuN0bt0mumZmz9svXR3KV1S/q18XZ8bCas0itwnq
 D0T3Gqi/Z39gJIy7bHNgLoFY2zvI9b2EJNDKlzHk3NJ7UamA4GuMHN0cM2dIzKGK
 2L+wXOe2BH9YYzYrz/cdKq7sBMjOvFsCQ/5jh23A2Yu6JI4nJbw0WmexZRK1OWCp
 GAdMjBuqIShibLRxK746WRO9iut49uTsah4iSG80hXzhpwf7VaegOarost1nLaqm
 zweIOr3iwJRf273r6IqRuaporVHpQYMj2w2H63z36sQtGtkKHNyxZ50b6bqpwwjU
 LikLEJ9Bmh3mlvlXsOx2wX6dTb1fUk+cy2ezCDKUHqOLjqy4dM8V+jYhuRO4yxXz
 mj9aHZKgyuREt8yo/3nLqAzF5Okj9cXp7H6F1hCKWuCoAhNXkrvYcvbg8h6aRxOX
 2vGhMYjpElkl/DG6OWCSwuqCt9nVEC/dazW9fKQjh4S0CFOVopaMGSkGcS/xUPub
 92J4XMDEJX4RJ6dfspeQr97+1fETXEIWNv4WbKnDjqJlAucU1gnOTprVnAYUjcWw
 74320FjGN1E=
 =/8GE
 -----END PGP SIGNATURE-----

Merge tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull performance events updates from Ingo Molnar:
 "Callchain support:

   - Add support for deferred user-space stack unwinding for perf,
     enabled on x86. (Peter Zijlstra, Steven Rostedt)

   - unwind_user/x86: Enable frame pointer unwinding on x86 (Josh
     Poimboeuf)

  x86 PMU support and infrastructure:

   - x86/insn: Simplify for_each_insn_prefix() (Peter Zijlstra)

   - x86/insn,uprobes,alternative: Unify insn_is_nop() (Peter Zijlstra)

  Intel PMU driver:

   - Large series to prepare for and implement architectural PEBS
     support for Intel platforms such as Clearwater Forest (CWF) and
     Panther Lake (PTL). (Dapeng Mi, Kan Liang)

   - Check dynamic constraints (Kan Liang)

   - Optimize PEBS extended config (Peter Zijlstra)

   - cstates:
      - Remove PC3 support from LunarLake (Zhang Rui)
      - Add Pantherlake support (Zhang Rui)
      - Clearwater Forest support (Zide Chen)

  AMD PMU driver:

   - x86/amd: Check event before enable to avoid GPF (George Kennedy)

  Fixes and cleanups:

   - task_work: Fix NMI race condition (Peter Zijlstra)

   - perf/x86: Fix NULL event access and potential PEBS record loss
     (Dapeng Mi)

   - Misc other fixes and cleanups (Dapeng Mi, Ingo Molnar, Peter
     Zijlstra)"

* tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (38 commits)
  perf/x86/intel: Fix and clean up intel_pmu_drain_arch_pebs() type use
  perf/x86/intel: Optimize PEBS extended config
  perf/x86/intel: Check PEBS dyn_constraints
  perf/x86/intel: Add a check for dynamic constraints
  perf/x86/intel: Add counter group support for arch-PEBS
  perf/x86/intel: Setup PEBS data configuration and enable legacy groups
  perf/x86/intel: Update dyn_constraint base on PEBS event precise level
  perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR
  perf/x86/intel: Process arch-PEBS records or record fragments
  perf/x86/intel/ds: Factor out PEBS group processing code to functions
  perf/x86/intel/ds: Factor out PEBS record processing code to functions
  perf/x86/intel: Initialize architectural PEBS
  perf/x86/intel: Correct large PEBS flag check
  perf/x86/intel: Replace x86_pmu.drain_pebs calling with static call
  perf/x86: Fix NULL event access and potential PEBS record loss
  perf/x86: Remove redundant is_x86_event() prototype
  entry,unwind/deferred: Fix unwind_reset_info() placement
  unwind_user/x86: Fix arch=um build
  perf: Support deferred user unwind
  unwind_user/x86: Teach FP unwind about start of function
  ...
2025-12-01 20:42:01 -08:00
Linus Torvalds 63e6995005 objtool updates for v6.19:
- klp-build livepatch module generation (Josh Poimboeuf)
 
    Introduce new objtool features and a klp-build
    script to generate livepatch modules using a
    source .patch as input.
 
    This builds on concepts from the longstanding out-of-tree
    kpatch project which began in 2012 and has been used for
    many years to generate livepatch modules for production kernels.
    However, this is a complete rewrite which incorporates
    hard-earned lessons from 12+ years of maintaining kpatch.
 
    Key improvements compared to kpatch-build:
 
     - Integrated with objtool: Leverages objtool's existing control-flow
       graph analysis to help detect changed functions.
 
     - Works on vmlinux.o: Supports late-linked objects, making it
       compatible with LTO, IBT, and similar.
 
     - Simplified code base: ~3k fewer lines of code.
 
     - Upstream: No more out-of-tree #ifdef hacks, far less cruft.
 
     - Cleaner internals: Vastly simplified logic for symbol/section/reloc
       inclusion and special section extraction.
 
     - Robust __LINE__ macro handling: Avoids false positive binary diffs
       caused by the __LINE__ macro by introducing a fix-patch-lines script
       which injects #line directives into the source .patch to preserve
       the original line numbers at compile time.
 
  - Disassemble code with libopcodes instead of running objdump
    (Alexandre Chartre)
 
  - Disassemble support (-d option to objtool) by Alexandre Chartre,
    which supports the decoding of various Linux kernel code generation
    specials such as alternatives:
 
       17ef:  sched_balance_find_dst_group+0x62f                 mov    0x34(%r9),%edx
       17f3:  sched_balance_find_dst_group+0x633               | <alternative.17f3>             | X86_FEATURE_POPCNT
       17f3:  sched_balance_find_dst_group+0x633               | call   0x17f8 <__sw_hweight64> | popcnt %rdi,%rax
       17f8:  sched_balance_find_dst_group+0x638                 cmp    %eax,%edx
 
    ... jump table alternatives:
 
       1895:  sched_use_asym_prio+0x5                            test   $0x8,%ch
       1898:  sched_use_asym_prio+0x8                            je     0x18a9 <sched_use_asym_prio+0x19>
       189a:  sched_use_asym_prio+0xa                          | <jump_table.189a>                        | JUMP
       189a:  sched_use_asym_prio+0xa                          | jmp    0x18ae <sched_use_asym_prio+0x1e> | nop2
       189c:  sched_use_asym_prio+0xc                            mov    $0x1,%eax
       18a1:  sched_use_asym_prio+0x11                           and    $0x80,%ecx
 
    ... exception table alternatives:
 
     native_read_msr:
       5b80:  native_read_msr+0x0                                                     mov    %edi,%ecx
       5b82:  native_read_msr+0x2                                                   | <ex_table.5b82> | EXCEPTION
       5b82:  native_read_msr+0x2                                                   | rdmsr           | resume at 0x5b84 <native_read_msr+0x4>
       5b84:  native_read_msr+0x4                                                     shl    $0x20,%rdx
 
    .... x86 feature flag decoding (also see the X86_FEATURE_POPCNT
         example in sched_balance_find_dst_group() above):
 
       2faaf:  start_thread_common.constprop.0+0x1f                                    jne    0x2fba4 <start_thread_common.constprop.0+0x114>
       2fab5:  start_thread_common.constprop.0+0x25                                  | <alternative.2fab5>                  | X86_FEATURE_ALWAYS                                  | X86_BUG_NULL_SEG
       2fab5:  start_thread_common.constprop.0+0x25                                  | jmp    0x2faba <.altinstr_aux+0x2f4> | jmp    0x4b0 <start_thread_common.constprop.0+0x3f> | nop5
       2faba:  start_thread_common.constprop.0+0x2a                                    mov    $0x2b,%eax
 
    ... NOP sequence shortening:
 
       1048e2:  snapshot_write_finalize+0xc2                                            je     0x104917 <snapshot_write_finalize+0xf7>
       1048e4:  snapshot_write_finalize+0xc4                                            nop6
       1048ea:  snapshot_write_finalize+0xca                                            nop11
       1048f5:  snapshot_write_finalize+0xd5                                            nop11
       104900:  snapshot_write_finalize+0xe0                                            mov    %rax,%rcx
       104903:  snapshot_write_finalize+0xe3                                            mov    0x10(%rdx),%rax
 
    ... and much more.
 
  - Function validation tracing support (Alexandre Chartre)
 
  - Various -ffunction-sections fixes (Josh Poimboeuf)
 
  - Clang AutoFDO (Automated Feedback-Directed Optimizations) support (Josh Poimboeuf)
 
  - Misc fixes and cleanups (Borislav Petkov, Chen Ni,
    Dylan Hatch, Ingo Molnar, John Wang, Josh Poimboeuf,
    Pankaj Raghav, Peter Zijlstra, Thorsten Blum)
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmktavcRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1j3IhAAvc9tRV8SJcohim6DrkPGxCN/S80uzt5S
 q8v1x5tBzMYmUxftfpoLsPCri6Ww0jprNuhnbRCvWAzXFuW79HWBNdVkEO7V/cym
 OsCKQv3r0mWv5UXP3o8VM5K3tnU61wOAIx3yZCz5XKWeOg6NPXBJCSGWYpLuA7z0
 1wUWAXuHgmj4RHMlHu5x0FZnSqGU3/TkUDGAqdxrY+myhdwm0Ul+dSwWGQdQjCgA
 59Y/gDsWWEe5BVL56suwKZ1e+8UFnpbncbWkjELD6euJpYpDSNMOW/S6PYqOOz5M
 rjMv06XIX5ma7QQbF5fMG/sXW64tZtc090UocDnx7hpDq9mLEyNNkXsqRQlmd8Wt
 wG19IaeWo8aG9DTQkiv8OhtmssPKZHJsVjRUvXGnjktvxnsYSomgOT1lNme38dJD
 X9jHgZCFMdPsQmG0dp00Y0oejfTChqIDef7qSpYwT96R7l9VQQF7K7AxfJwSeLGO
 3hClZ0Gz/u9NiJTUUWTxUmR+YEy+1xIeaQSDq6t4JRtNJaMZlcevfVW+F2Lm04XH
 9eSeF7bJS2XKrlLHVdPgWCGZOmee+ghdQ7svsyEGpzdzaAZ7UveTucHJ9CvW3Fft
 Dcrl8rxX2NiD2PLz03HCHR/JVUDc3W3Exrer1TD8PD4LcZhFoBEGQbZ/gFlkBTxb
 TOcemtJT03U=
 =yPrS
 -----END PGP SIGNATURE-----

Merge tag 'objtool-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull objtool updates from Ingo Molnar:

 - klp-build livepatch module generation (Josh Poimboeuf)

   Introduce new objtool features and a klp-build script to generate
   livepatch modules using a source .patch as input.

   This builds on concepts from the longstanding out-of-tree kpatch
   project which began in 2012 and has been used for many years to
   generate livepatch modules for production kernels. However, this is a
   complete rewrite which incorporates hard-earned lessons from 12+
   years of maintaining kpatch.

   Key improvements compared to kpatch-build:

    - Integrated with objtool: Leverages objtool's existing control-flow
      graph analysis to help detect changed functions.

    - Works on vmlinux.o: Supports late-linked objects, making it
      compatible with LTO, IBT, and similar.

    - Simplified code base: ~3k fewer lines of code.

    - Upstream: No more out-of-tree #ifdef hacks, far less cruft.

    - Cleaner internals: Vastly simplified logic for
      symbol/section/reloc inclusion and special section extraction.

    - Robust __LINE__ macro handling: Avoids false positive binary diffs
      caused by the __LINE__ macro by introducing a fix-patch-lines
      script which injects #line directives into the source .patch to
      preserve the original line numbers at compile time (a minimal
      #line example follows this list).

 - Disassemble code with libopcodes instead of running objdump
   (Alexandre Chartre)

 - Disassemble support (-d option to objtool) by Alexandre Chartre,
   which supports the decoding of various Linux kernel code generation
   specials such as alternatives:

      17ef:  sched_balance_find_dst_group+0x62f                 mov    0x34(%r9),%edx
      17f3:  sched_balance_find_dst_group+0x633               | <alternative.17f3>             | X86_FEATURE_POPCNT
      17f3:  sched_balance_find_dst_group+0x633               | call   0x17f8 <__sw_hweight64> | popcnt %rdi,%rax
      17f8:  sched_balance_find_dst_group+0x638                 cmp    %eax,%edx

   ... jump table alternatives:

      1895:  sched_use_asym_prio+0x5                            test   $0x8,%ch
      1898:  sched_use_asym_prio+0x8                            je     0x18a9 <sched_use_asym_prio+0x19>
      189a:  sched_use_asym_prio+0xa                          | <jump_table.189a>                        | JUMP
      189a:  sched_use_asym_prio+0xa                          | jmp    0x18ae <sched_use_asym_prio+0x1e> | nop2
      189c:  sched_use_asym_prio+0xc                            mov    $0x1,%eax
      18a1:  sched_use_asym_prio+0x11                           and    $0x80,%ecx

   ... exception table alternatives:

    native_read_msr:
      5b80:  native_read_msr+0x0                                                     mov    %edi,%ecx
      5b82:  native_read_msr+0x2                                                   | <ex_table.5b82> | EXCEPTION
      5b82:  native_read_msr+0x2                                                   | rdmsr           | resume at 0x5b84 <native_read_msr+0x4>
      5b84:  native_read_msr+0x4                                                     shl    $0x20,%rdx

   .... x86 feature flag decoding (also see the X86_FEATURE_POPCNT
        example in sched_balance_find_dst_group() above):

      2faaf:  start_thread_common.constprop.0+0x1f                                    jne    0x2fba4 <start_thread_common.constprop.0+0x114>
      2fab5:  start_thread_common.constprop.0+0x25                                  | <alternative.2fab5>                  | X86_FEATURE_ALWAYS                                  | X86_BUG_NULL_SEG
      2fab5:  start_thread_common.constprop.0+0x25                                  | jmp    0x2faba <.altinstr_aux+0x2f4> | jmp    0x4b0 <start_thread_common.constprop.0+0x3f> | nop5
      2faba:  start_thread_common.constprop.0+0x2a                                    mov    $0x2b,%eax

   ... NOP sequence shortening:

      1048e2:  snapshot_write_finalize+0xc2                                            je     0x104917 <snapshot_write_finalize+0xf7>
      1048e4:  snapshot_write_finalize+0xc4                                            nop6
      1048ea:  snapshot_write_finalize+0xca                                            nop11
      1048f5:  snapshot_write_finalize+0xd5                                            nop11
      104900:  snapshot_write_finalize+0xe0                                            mov    %rax,%rcx
      104903:  snapshot_write_finalize+0xe3                                            mov    0x10(%rdx),%rax

   ... and much more.

 - Function validation tracing support (Alexandre Chartre)

 - Various -ffunction-sections fixes (Josh Poimboeuf)

 - Clang AutoFDO (Automated Feedback-Directed Optimizations) support
   (Josh Poimboeuf)

 - Misc fixes and cleanups (Borislav Petkov, Chen Ni, Dylan Hatch, Ingo
   Molnar, John Wang, Josh Poimboeuf, Pankaj Raghav, Peter Zijlstra,
   Thorsten Blum)
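
  For reference, the #line mechanism that the fix-patch-lines script relies on
  is standard C; a minimal stand-alone example (not taken from the script
  itself):

      #include <stdio.h>

      int main(void)
      {
          printf("%d\n", __LINE__);   /* reports the physical line number */
      #line 100 "original.c"
          printf("%d\n", __LINE__);   /* reports 100: the injected number, so
                                         __LINE__-derived data stays stable */
          return 0;
      }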

* tag 'objtool-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (129 commits)
  objtool: Fix segfault on unknown alternatives
  objtool: Build with disassembly can fail when including bdf.h
  objtool: Trim trailing NOPs in alternative
  objtool: Add wide output for disassembly
  objtool: Compact output for alternatives with one instruction
  objtool: Improve naming of group alternatives
  objtool: Add Function to get the name of a CPU feature
  objtool: Provide access to feature and flags of group alternatives
  objtool: Fix address references in alternatives
  objtool: Disassemble jump table alternatives
  objtool: Disassemble exception table alternatives
  objtool: Print addresses with alternative instructions
  objtool: Disassemble group alternatives
  objtool: Print headers for alternatives
  objtool: Preserve alternatives order
  objtool: Add the --disas=<function-pattern> action
  objtool: Do not validate IBT for .return_sites and .call_sites
  objtool: Improve tracing of alternative instructions
  objtool: Add functions to better name alternatives
  objtool: Identify the different types of alternatives
  ...
2025-12-01 20:18:59 -08:00
Linus Torvalds b53440f8e5 Locking updates for v6.19:
Mutexes:
 
  - Redo __mutex_init() to reduce generated code size
    (Sebastian Andrzej Siewior)
 
 Seqlocks:
 
  - Introduce scoped_seqlock_read() (Peter Zijlstra)
 
  - Change thread_group_cputime() to use scoped_seqlock_read()
    (Oleg Nesterov)
 
  - Change do_task_stat() to use scoped_seqlock_read()
    (Oleg Nesterov)
 
  - Change do_io_accounting() to use scoped_seqlock_read()
    (Oleg Nesterov)
 
  - Fix the incorrect documentation of read_seqbegin_or_lock() /
    need_seqretry() (Oleg Nesterov)
 
  - Allow KASAN to fail optimizing (Peter Zijlstra)
 
 Local lock updates:
 
  - Fix all kernel-doc warnings (Randy Dunlap)
 
  - Add the <linux/local_lock*.h> headers to MAINTAINERS
    (Sebastian Andrzej Siewior)
 
  - Reduce the risk of shadowing via s/l/__l/ and s/tl/__tl/
    (Vincent Mailhol)
 
 Lock debugging:
 
  - spinlock/debug: Fix data-race in do_raw_write_lock
    (Alexander Sverdlin)
 
 Atomic primitives infrastructure:
 
  - atomic: Skip alignment check for try_cmpxchg() old arg
    (Arnd Bergmann)
 
 Rust runtime integration:
 
  - sync: atomic: Enable generated Atomic<T> usage (Boqun Feng)
 
  - sync: atomic: Implement Debug for Atomic<Debug> (Boqun Feng)
 
  - debugfs: Remove Rust native atomics and replace them with
    Linux versions (Boqun Feng)
 
  - debugfs: Implement Reader for Mutex<T> only when T is Unpin
    (Boqun Feng)
 
  - lock: guard: Add T: Unpin bound to DerefMut (Daniel Almeida)
 
  - lock: Pin the inner data (Daniel Almeida)
 
  - lock: Add a Pin<&mut T> accessor (Daniel Almeida)
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmktVmgRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1hDqw//e9BTs9Yx6yLxZVRDagS7KrDKy3V3OMg1
 9h+tXzCosGeNz7XwVzt590wsACJvX7/QtqgTFQy/GPYBW56WeVOSYSpA6I4s43G1
 sz+EAc4BYPBWEwBQxCfPDwHI/M0MD0Im6mjpATc6Uv1Ct0+KS8iFXRnGUALe4bis
 8aKV8ZHo81Wnu6v1B8GroExHolL/AMORYfEYHABpWEe+GpwxqQcRdZjc/eiEUzOg
 umwMC9bNc5FAiPlku9Mh6pcBPjkMd9bGjVEIG8deJhm/aD8f/b0mgaxyaKgoHx8J
 ptauY3kLYBLuV793U37SXuQVw6o2LGHCYvN1fX+g1D0YTIuqIq9Pz7ObZs7w4xDd
 6iIK4QYP4Gjkvn0ZA275eI3ItcBEjJ2FD3ZDbkXNj+O4vEOrmG/OX4h2H5WGq/AU
 zr9YfmkRn0InPeHeLU1UM3NdbKgwc/Bd6MubSwX4v7G7ws4CGDtlvA2d3xg5q8Ls
 MQoAV+9QtiZ9prQjtd8nukgmh/+okPmCcnuEVXhSOZHpPjqXXnyUCTPyKXVkltdF
 1u4oUHiQY7Jydfn0wZgEV4nASDeV2gz5BwKoSAuKvYc5HGhXnXxvzyJyHJy3dL8R
 afGGQ3XfQhA0hUKoMiQFUk7p7dvjdAiHxN1EcvxxJqWVsaE/Gpik1GOm+FRn7Oqs
 UMvspgGrbI4=
 =KPgY
 -----END PGP SIGNATURE-----

Merge tag 'locking-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
 "Mutexes:

   - Redo __mutex_init() to reduce generated code size (Sebastian
     Andrzej Siewior)

  Seqlocks:

   - Introduce scoped_seqlock_read() (Peter Zijlstra)

   - Change thread_group_cputime() to use scoped_seqlock_read() (Oleg
     Nesterov)

   - Change do_task_stat() to use scoped_seqlock_read() (Oleg Nesterov)

   - Change do_io_accounting() to use scoped_seqlock_read() (Oleg
     Nesterov)

   - Fix the incorrect documentation of read_seqbegin_or_lock() /
     need_seqretry() (Oleg Nesterov)

   - Allow KASAN to fail optimizing (Peter Zijlstra)

  Local lock updates:

   - Fix all kernel-doc warnings (Randy Dunlap)

   - Add the <linux/local_lock*.h> headers to MAINTAINERS (Sebastian
     Andrzej Siewior)

   - Reduce the risk of shadowing via s/l/__l/ and s/tl/__tl/ (Vincent
     Mailhol)

  Lock debugging:

   - spinlock/debug: Fix data-race in do_raw_write_lock (Alexander
     Sverdlin)

  Atomic primitives infrastructure:

   - atomic: Skip alignment check for try_cmpxchg() old arg (Arnd
     Bergmann)

  Rust runtime integration:

   - sync: atomic: Enable generated Atomic<T> usage (Boqun Feng)

   - sync: atomic: Implement Debug for Atomic<Debug> (Boqun Feng)

   - debugfs: Remove Rust native atomics and replace them with Linux
     versions (Boqun Feng)

   - debugfs: Implement Reader for Mutex<T> only when T is Unpin (Boqun
     Feng)

   - lock: guard: Add T: Unpin bound to DerefMut (Daniel Almeida)

   - lock: Pin the inner data (Daniel Almeida)

   - lock: Add a Pin<&mut T> accessor (Daniel Almeida)"
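
  For context, the open-coded seqlock read pattern that scoped_seqlock_read()
  encapsulates looks roughly like this (a sketch using the long-standing
  read_seqbegin()/read_seqretry() API; the new scoped form itself is not
  reproduced here):

      #include <linux/seqlock.h>

      static u64 read_sample(seqlock_t *lock, const u64 *value)
      {
          unsigned int seq;
          u64 v;

          do {
              seq = read_seqbegin(lock);      /* snapshot the sequence count */
              v = *value;                     /* speculative read of the data */
          } while (read_seqretry(lock, seq)); /* retry if a writer interfered */

          return v;
      }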

* tag 'locking-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  locking/local_lock: Fix all kernel-doc warnings
  locking/local_lock: s/l/__l/ and s/tl/__tl/ to reduce the risk of shadowing
  locking/local_lock: Add the <linux/local_lock*.h> headers to MAINTAINERS
  locking/mutex: Redo __mutex_init() to reduce generated code size
  rust: debugfs: Replace the usage of Rust native atomics
  rust: sync: atomic: Implement Debug for Atomic<Debug>
  rust: sync: atomic: Make Atomic*Ops pub(crate)
  seqlock: Allow KASAN to fail optimizing
  rust: debugfs: Implement Reader for Mutex<T> only when T is Unpin
  seqlock: Change do_io_accounting() to use scoped_seqlock_read()
  seqlock: Change do_task_stat() to use scoped_seqlock_read()
  seqlock: Change thread_group_cputime() to use scoped_seqlock_read()
  seqlock: Introduce scoped_seqlock_read()
  documentation: seqlock: fix the wrong documentation of read_seqbegin_or_lock/need_seqretry
  atomic: Skip alignment check for try_cmpxchg() old arg
  rust: lock: Add a Pin<&mut T> accessor
  rust: lock: Pin the inner data
  rust: lock: guard: Add T: Unpin bound to DerefMut
  locking/spinlock/debug: Fix data-race in do_raw_write_lock
2025-12-01 19:50:58 -08:00
Linus Torvalds 1b5dd29869 vfs-6.19-rc1.fd_prepare.fs
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZwAKCRCRxhvAZXjc
 op0AAP4oNVJkFyvgKoPos5K2EGFB8M7merGhpYtsOoeg8UK6OwD/UySQErHsXQDR
 sUDDa5uFOhfrkcfM8REtAN4wF8p5qAc=
 =QgFD
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.fd_prepare.fs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull fd prepare updates from Christian Brauner:
 "This adds the FD_ADD() and FD_PREPARE() primitive. They simplify the
  common pattern of get_unused_fd_flags() + create file + fd_install()
  that is used extensively throughout the kernel and currently requires
  cumbersome cleanup paths.

  FD_ADD() - For simple cases where a file is installed immediately:

      fd = FD_ADD(O_CLOEXEC, vfio_device_open_file(device));
      if (fd < 0)
          vfio_device_put_registration(device);
      return fd;

  FD_PREPARE() - For cases requiring access to the fd or file, or
  additional work before publishing:

      FD_PREPARE(fdf, O_CLOEXEC, sync_file->file);
      if (fdf.err) {
          fput(sync_file->file);
          return fdf.err;
      }

      data.fence = fd_prepare_fd(fdf);
      if (copy_to_user((void __user *)arg, &data, sizeof(data)))
          return -EFAULT;

      return fd_publish(fdf);

  The primitives are centered around struct fd_prepare. FD_PREPARE()
  encapsulates all allocation and cleanup logic and must be followed by
  a call to fd_publish() which associates the fd with the file and
  installs it into the caller's fdtable. If fd_publish() isn't called,
  both are deallocated automatically. FD_ADD() is a shorthand that does
  fd_publish() immediately and never exposes the struct to the caller.

  I've implemented this in a way that it's compatible with the cleanup
  infrastructure while also being usable separately. IOW, it's centered
  around struct fd_prepare which is aliased to class_fd_prepare_t and so
  we can make use of all the basic guard infrastructure"
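
  For contrast, the open-coded pattern that FD_ADD() condenses typically looks
  like this (a sketch built on the long-standing get_unused_fd_flags(),
  fd_install() and put_unused_fd() helpers; make_file() is a stand-in for the
  real file-creation call):

      #include <linux/file.h>
      #include <linux/err.h>
      #include <linux/fcntl.h>

      static int example_install_file(struct file *(*make_file)(void))
      {
          struct file *file;
          int fd;

          fd = get_unused_fd_flags(O_CLOEXEC);    /* reserve an fd number */
          if (fd < 0)
              return fd;

          file = make_file();
          if (IS_ERR(file)) {
              put_unused_fd(fd);      /* the cleanup path FD_ADD() makes implicit */
              return PTR_ERR(file);
          }

          fd_install(fd, file);       /* publish: the fd now owns the file */
          return fd;
      }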

* tag 'vfs-6.19-rc1.fd_prepare.fs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (42 commits)
  io_uring: convert io_create_mock_file() to FD_PREPARE()
  file: convert replace_fd() to FD_PREPARE()
  vfio: convert vfio_group_ioctl_get_device_fd() to FD_ADD()
  tty: convert ptm_open_peer() to FD_ADD()
  ntsync: convert ntsync_obj_get_fd() to FD_PREPARE()
  media: convert media_request_alloc() to FD_PREPARE()
  hv: convert mshv_ioctl_create_partition() to FD_ADD()
  gpio: convert linehandle_create() to FD_PREPARE()
  pseries: port papr_rtas_setup_file_interface() to FD_ADD()
  pseries: convert papr_platform_dump_create_handle() to FD_ADD()
  spufs: convert spufs_gang_open() to FD_PREPARE()
  papr-hvpipe: convert papr_hvpipe_dev_create_handle() to FD_PREPARE()
  spufs: convert spufs_context_open() to FD_PREPARE()
  net/socket: convert __sys_accept4_file() to FD_ADD()
  net/socket: convert sock_map_fd() to FD_ADD()
  net/kcm: convert kcm_ioctl() to FD_PREPARE()
  net/handshake: convert handshake_nl_accept_doit() to FD_PREPARE()
  secretmem: convert memfd_secret() to FD_ADD()
  memfd: convert memfd_create() to FD_ADD()
  bpf: convert bpf_token_create() to FD_PREPARE()
  ...
2025-12-01 17:32:07 -08:00
Linus Torvalds ffbf700df2 vfs-6.19-rc1.autofs
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZwAKCRCRxhvAZXjc
 ov48AP4qgtSo78euYDtsxkgU1IKRow1Hc3L/rql6uIP7dFtQ8AEAjZjCMK3vDSZy
 DUqStlgPZ3/GWyzdnDKOoAuwvn56gQQ=
 =KElr
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.autofs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull autofs update from Christian Brauner:
 "Prevent futile mount triggers in private mount namespaces.

  Fix a problematic loop in autofs when a mount namespace contains
  autofs mounts that are propagation private and there is no
  namespace-specific automount daemon to handle possible automounting.

  Previously, attempted path resolution would loop until MAXSYMLINKS was
  reached before failing, causing significant noise in the log.

  The fix adds a check in autofs ->d_automount() so that the VFS can
  immediately return EPERM in this case. Since the mount is propagation
  private, EPERM is the most appropriate error code"

* tag 'vfs-6.19-rc1.autofs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  autofs: dont trigger mount if it cant succeed
2025-12-01 16:38:21 -08:00
Linus Torvalds d0deeb803c vfs-6.19-rc1.ovl
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZwAKCRCRxhvAZXjc
 othbAP97FNSOMUMAXTUxocE7vMgq3B/MG54e22ZrYhnZeP8NsgEA3zo4GpPCeM0p
 e8EjiLz0wUlveF68MZUg52eXT5/nTAE=
 =tbY/
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.ovl' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull overlayfs cred guard conversion from Christian Brauner:
 "This converts all of overlayfs to use credential guards, eliminating
  manual credential management throughout the filesystem.

  Credential guard conversion:

   - Convert all of overlayfs to use credential guards, replacing the
     manual ovl_override_creds()/ovl_revert_creds() pattern with scoped
     guards.

     This makes credential handling visually explicit and eliminates a
     class of potential bugs from mismatched override/revert calls.

     (1) Basic credential guard (with_ovl_creds)
     (2) Creator credential guard (ovl_override_creator_creds):

         Introduced a specialized guard for file creation operations
         that handles the two-phase credential override (mounter
         credentials, then fs{g,u}id override). The new pattern is much
         clearer:

         with_ovl_creds(dentry->d_sb) {
                 scoped_class(prepare_creds_ovl, cred, dentry, inode, mode) {
                         if (IS_ERR(cred))
                                 return PTR_ERR(cred);
                         /* creation operations */
                 }
         }

     (3) Copy-up credential guard (ovl_cu_creds):

         Introduced a specialized guard for copy-up operations,
         simplifying the previous struct ovl_cu_creds helper and
         associated functions.

         Ported ovl_copy_up_workdir() and ovl_copy_up_tmpfile() to this
         pattern.

  Cleanups:

   - Remove ovl_revert_creds() after all callers converted to guards

   - Remove struct ovl_cu_creds and associated functions

   - Drop ovl_setup_cred_for_create() after conversion

   - Refactor ovl_fill_super(), ovl_lookup(), ovl_iterate(),
     ovl_rename() for cleaner credential guard scope

   - Introduce struct ovl_renamedata to simplify rename handling

   - Don't override credentials for ovl_check_whiteouts() (unnecessary)

   - Remove unneeded semicolon"

* tag 'vfs-6.19-rc1.ovl' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (54 commits)
  ovl: remove unneeded semicolon
  ovl: remove struct ovl_cu_creds and associated functions
  ovl: port ovl_copy_up_tmpfile() to cred guard
  ovl: mark *_cu_creds() as unused temporarily
  ovl: port ovl_copy_up_workdir() to cred guard
  ovl: add copy up credential guard
  ovl: drop ovl_setup_cred_for_create()
  ovl: port ovl_create_or_link() to new ovl_override_creator_creds cleanup guard
  ovl: mark ovl_setup_cred_for_create() as unused temporarily
  ovl: reflow ovl_create_or_link()
  ovl: port ovl_create_tmpfile() to new ovl_override_creator_creds cleanup guard
  ovl: add ovl_override_creator_creds cred guard
  ovl: remove ovl_revert_creds()
  ovl: port ovl_fill_super() to cred guard
  ovl: refactor ovl_fill_super()
  ovl: port ovl_lower_positive() to cred guard
  ovl: port ovl_lookup() to cred guard
  ovl: refactor ovl_lookup()
  ovl: port ovl_copyfile() to cred guard
  ovl: port ovl_rename() to cred guard
  ...
2025-12-01 16:31:21 -08:00
Linus Torvalds a8058f8442 vfs-6.19-rc1.directory.locking
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZwAKCRCRxhvAZXjc
 op9tAQCJ//STOkvYHfqgsdRD+cW9MRg/gPzfVZgnV1FTyf8sMgEA0IsY5zCZB9eh
 9FdD0E57P8PlWRwWZ+LktnWBzRAUqwI=
 =MOVR
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.directory.locking' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull directory locking updates from Christian Brauner:
 "This contains the work to add centralized APIs for directory locking
  operations.

  This series is part of a larger effort to change directory operation
  locking to allow multiple concurrent operations in a directory. The
  ultimate goal is to lock the target dentry(s) rather than the whole
  parent directory.

  To help with changing the locking protocol, this series centralizes
  locking and lookup in new helper functions. The helpers establish a
  pattern where it is the dentry that is being locked and unlocked
  (currently the lock is held on dentry->d_parent->d_inode, but that can
  change in the future).

  This also changes vfs_mkdir() to unlock the parent on failure, as well
  as dput()ing the dentry. This allows end_creating() to only require
  the target dentry (which may be IS_ERR() after vfs_mkdir()), not the
  parent"

* tag 'vfs-6.19-rc1.directory.locking' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  nfsd: fix end_creating() conversion
  VFS: introduce end_creating_keep()
  VFS: change vfs_mkdir() to unlock on failure.
  ecryptfs: use new start_creating/start_removing APIs
  Add start_renaming_two_dentries()
  VFS/ovl/smb: introduce start_renaming_dentry()
  VFS/nfsd/ovl: introduce start_renaming() and end_renaming()
  VFS: add start_creating_killable() and start_removing_killable()
  VFS: introduce start_removing_dentry()
  smb/server: use end_removing_noperm for target of smb2_create_link()
  VFS: introduce start_creating_noperm() and start_removing_noperm()
  VFS/nfsd/cachefiles/ovl: introduce start_removing() and end_removing()
  VFS/nfsd/cachefiles/ovl: add start_creating() and end_creating()
  VFS: tidy up do_unlinkat()
  VFS: introduce start_dirop() and end_dirop()
  debugfs: rename end_creating() to debugfs_end_creating()
2025-12-01 16:13:46 -08:00
Linus Torvalds db74a7d02a vfs-6.19-rc1.directory.delegations
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZgAKCRCRxhvAZXjc
 ooiEAPwNZfkqiSs6G1B2EmjFpMrA2BDqskaOsnN2sywra0sNewD9EQxJwlYXUn+z
 nNUIAvmegJGg2OiU2UaNGwxMR3lR3w8=
 =YELr
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.directory.delegations' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull directory delegations update from Christian Brauner:
 "This contains the work for recall-only directory delegations for
  knfsd.

  Add support for simple, recallable-only directory delegations. This
  was decided at the fall NFS Bakeathon where the NFS client and server
  maintainers discussed how to merge directory delegation support.

  The approach starts with recallable-only delegations for several reasons:

   1. RFC8881 has gaps that are being addressed in RFC8881bis. In
      particular, it requires directory position information for
      CB_NOTIFY callbacks, which is difficult to implement properly
      under Linux. The spec is being extended to allow that information
      to be omitted.

   2. Client-side support for CB_NOTIFY still lags. The client side
      involves heuristics about when to request a delegation.

   3. Early indication shows simple, recallable-only delegations can
      help performance. Anna Schumaker mentioned seeing a multi-minute
      speedup in xfstests runs with them enabled.

  With these changes, userspace can also request a read lease on a
  directory that will be recalled on conflicting accesses. This may be
  useful for applications like Samba. Users can disable leases
  altogether via the fs.leases-enable sysctl if needed.

  VFS changes:

   - Dedicated Type for Delegations

     Introduce struct delegated_inode to track inodes that may have
     delegations that need to be broken. This replaces the previous
     approach of passing raw inode pointers through the delegation
     breaking code paths, providing better type safety and clearer
     semantics for the delegation machinery.

   - Break parent directory delegations in open(..., O_CREAT) codepath

   - Allow mkdir to wait for delegation break on parent

   - Allow rmdir to wait for delegation break on parent

   - Add try_break_deleg calls for parents to vfs_link(), vfs_rename(),
     and vfs_unlink()

   - Make vfs_create(), vfs_mknod(), and vfs_symlink() break delegations
     on parent directory

   - Clean up argument list for vfs_create()

   - Expose delegation support to userland

  Filelock changes:

   - Make lease_alloc() take a flags argument

   - Rework the __break_lease API to use flags

   - Add struct delegated_inode

   - Push the S_ISREG check down to ->setlease handlers

   - Lift the ban on directory leases in generic_setlease

  NFSD changes:

   - Allow filecache to hold S_IFDIR files

   - Allow DELEGRETURN on directories

   - Wire up GET_DIR_DELEGATION handling

  Fixes:

   - Fix kernel-doc warnings in __fcntl_getlease

   - Add needed headers for new struct delegation definition"
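
  As a rough illustration of the userspace side mentioned above, a read lease
  on a directory goes through the existing fcntl() lease API once the ban is
  lifted (a minimal sketch; a real program would handle the SIGIO lease-break
  notification and release the lease with F_UNLCK):

      #define _GNU_SOURCE
      #include <fcntl.h>
      #include <stdio.h>
      #include <unistd.h>

      int main(void)
      {
          int dirfd = open("/some/dir", O_RDONLY | O_DIRECTORY);

          if (dirfd < 0)
              return 1;

          /* Previously rejected for directories; recalled on conflicting access. */
          if (fcntl(dirfd, F_SETLEASE, F_RDLCK) < 0) {
              perror("F_SETLEASE");   /* e.g. leases disabled via fs.leases-enable */
              return 1;
          }

          pause();
          return 0;
      }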

* tag 'vfs-6.19-rc1.directory.delegations' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  vfs: add needed headers for new struct delegation definition
  filelock: __fcntl_getlease: fix kernel-doc warnings
  vfs: expose delegation support to userland
  nfsd: wire up GET_DIR_DELEGATION handling
  nfsd: allow DELEGRETURN on directories
  nfsd: allow filecache to hold S_IFDIR files
  filelock: lift the ban on directory leases in generic_setlease
  vfs: make vfs_symlink break delegations on parent dir
  vfs: make vfs_mknod break delegations on parent directory
  vfs: make vfs_create break delegations on parent directory
  vfs: clean up argument list for vfs_create()
  vfs: break parent dir delegations in open(..., O_CREAT) codepath
  vfs: allow rmdir to wait for delegation break on parent
  vfs: allow mkdir to wait for delegation break on parent
  vfs: add try_break_deleg calls for parents to vfs_{link,rename,unlink}
  filelock: push the S_ISREG check down to ->setlease handlers
  filelock: add struct delegated_inode
  filelock: rework the __break_lease API to use flags
  filelock: make lease_alloc() take a flags argument
2025-12-01 15:34:41 -08:00
Linus Torvalds 4664fb427c vfs-6.19-rc1.minix
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZgAKCRCRxhvAZXjc
 olEcAP4qG313oT/tm4W3nC4g2k8S//KqET97B80pSX0K3DvQEwD+LSCf1Th3RnsV
 EAMHczmCtRlbcFPqYOFVAMS8VxOyVg0=
 =7Ca4
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.minix' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull minix fixes from Christian Brauner:
 "Fix two syzbot corruption bugs in the minix filesystem.

  Syzbot fuzzes filesystems by trying to mount and manipulate
  deliberately corrupted images. This should not lead to BUG_ONs and
  WARN_ONs for easy-to-detect corruptions.

   - Add error handling to minix filesystem for inode corruption
     detection, enabling the filesystem to report such corruptions
     cleanly.

   - Fix a drop_nlink warning in minix_rmdir() triggered by corrupted
     directory link counts.

   - Fix a drop_nlink warning in minix_rename() triggered by corrupted
     inode link counts"

* tag 'vfs-6.19-rc1.minix' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  Fix a drop_nlink warning in minix_rename
  Fix a drop_nlink warning in minix_rmdir
  Add error handling to minix filesystem for inode corruption detection
2025-12-01 15:22:40 -08:00
Linus Torvalds 978d337c2e vfs-6.19-rc1.guards
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZgAKCRCRxhvAZXjc
 opxBAQCjNjr0yTSoaGRM0CJXg79Of3DLIlBdB7TygibTN16WhwEA+VKWoHL5eRjg
 PZlwZD4Ei2ymeQYxi+6owTF8G806tAs=
 =m/Bt
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.guards' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull superblock lock guard updates from Christian Brauner:
 "This starts the work of introducing guards for superblock related
  locks.

  Introduce super_write_guard for scoped superblock write protection.

  This provides a guard-based alternative to the manual sb_start_write()
  and sb_end_write() pattern, allowing the compiler to automatically
  handle the cleanup"
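
  The manual pattern being wrapped is the existing sb_start_write() /
  sb_end_write() pair; roughly (a sketch, and the guard(super_write)(sb)
  spelling below is an assumption based on the kernel's usual guard()
  convention, not quoted from the merged code):

      #include <linux/fs.h>

      /* Before: manual bracketing of a superblock write section. */
      static void touch_fs_manual(struct super_block *sb)
      {
          sb_start_write(sb);
          /* ... modify the filesystem ... */
          sb_end_write(sb);
      }

      /* After: scoped form; sb_end_write() runs automatically on scope exit. */
      static void touch_fs_guarded(struct super_block *sb)
      {
          guard(super_write)(sb);     /* guard name assumed from "super_write_guard" */
          /* ... modify the filesystem ... */
      }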

* tag 'vfs-6.19-rc1.guards' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  xfs: use super write guard in xfs_file_ioctl()
  open: use super write guard in do_ftruncate()
  btrfs: use super write guard in relocating_repair_kthread()
  ext4: use super write guard in write_mmp_block()
  btrfs: use super write guard in sb_start_write()
  btrfs: use super write guard btrfs_run_defrag_inode()
  btrfs: use super write guard in btrfs_reclaim_bgs_work()
  fs: add super_write_guard
2025-12-01 14:39:03 -08:00
Linus Torvalds afdf0fb340 vfs-6.19-rc1.fs_header
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZgAKCRCRxhvAZXjc
 oq2EAQD09y/qVU81E7Qg7Cn4n5/3WTlnQjx0aSvhb4p6dFUcFwD+K9uVJNP8x8tA
 xTaPt59nZbEX9BIAwtLChSPa4CZsnwM=
 =XrvE
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.fs_header' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull fs header updates from Christian Brauner:
 "This contains initial work to start splitting up fs.h.

  Begin the long-overdue work of splitting up the monolithic fs.h
  header. The header has grown to over 3000 lines and includes types and
  functions for many different subsystems, making it difficult to
  navigate and causing excessive compilation dependencies.

  This series introduces new focused headers for superblock-related
  code:

   - Rename fs_types.h to fs_dirent.h to better reflect its actual
     content (directory entry types)

   - Add fs/super_types.h containing superblock type definitions

   - Add fs/super.h containing superblock function declarations

  This is the first step in a longer effort to modularize the VFS
  headers.

  Cleanups:

   - Inode Field Layout Optimization (Mateusz Guzik)

     Move inode fields used during fast path lookup closer together to
     improve cache locality during path resolution.

   - current_umask() Optimization (Mateusz Guzik)

     Inline current_umask() and move it to fs_struct.h. This improves
     performance by avoiding function call overhead for this
     frequently-used function, and places it in a more appropriate
     header since it operates on fs_struct"
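
  For reference, current_umask() is a one-liner, which is why inlining it into
  fs_struct.h is cheap (a sketch of the presumed inline form):

      /* In <linux/fs_struct.h> after the move (presumed shape): */
      static inline int current_umask(void)
      {
          return current->fs->umask;  /* the calling task's fs_struct */
      }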

* tag 'vfs-6.19-rc1.fs_header' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  fs: move inode fields used during fast path lookup closer together
  fs: inline current_umask() and move it to fs_struct.h
  fs: add fs/super.h header
  fs: add fs/super_types.h header
  fs: rename fs_types.h to fs_dirent.h
2025-12-01 14:18:01 -08:00
Linus Torvalds 1d18101a64 kernel-6.19-rc1.cred
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZQAKCRCRxhvAZXjc
 orJLAP9UD+dX6cicJDkzFZowDakmoIQkR5ZSDwChSlmvLcmquwEAlSq4svVd9Bdl
 7kOFUk71DqhVHrPAwO7ap0BxehokEAA=
 =Cli6
 -----END PGP SIGNATURE-----

Merge tag 'kernel-6.19-rc1.cred' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull cred guard updates from Christian Brauner:
 "This contains substantial credential infrastructure improvements
  adding guard-based credential management that simplifies code and
  eliminates manual reference counting in many subsystems.

  Features:

   - Kernel Credential Guards

     Add with_kernel_creds() and scoped_with_kernel_creds() guards that
     allow using the kernel credentials without allocating and copying
     them. This was requested by Linus after seeing repeated
     prepare_kernel_creds() calls that duplicate the kernel credentials
     only to drop them again later.

     The new guards completely avoid the allocation and never expose the
     temporary variable to hold the kernel credentials anywhere in
     callers.

   - Generic Credential Guards

     Add scoped_with_creds() guards for the common override_creds() and
     revert_creds() pattern. This builds on earlier work that made
     override_creds()/revert_creds() completely reference count free.

   - Prepare Credential Guards

     Add prepare credential guards for the more complex pattern of
     preparing a new set of credentials and overriding the current
     credentials with them:
      - prepare_creds()
      - modify new creds
      - override_creds()
      - revert_creds()
      - put_cred()

  Cleanups:

   - Make init_cred static since it should not be directly accessed

   - Add kernel_cred() helper to properly access the kernel credentials

   - Fix scoped_class() macro that was introduced two cycles ago

   - coredump: split out do_coredump() from vfs_coredump() for cleaner
     credential handling

   - coredump: move revert_cred() before coredump_cleanup()

   - coredump: mark struct mm_struct as const

   - coredump: pass struct linux_binfmt as const

   - sev-dev: use guard for path"
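
  For contrast with the new guards, the open-coded sequence listed above looks
  like this in full (a sketch using the long-standing credential API; the
  capability tweak is just an example of "modify new creds"):

      #include <linux/cred.h>

      static int run_with_elevated_creds(void (*work)(void))
      {
          struct cred *new;
          const struct cred *old;

          new = prepare_creds();          /* copy the current credentials */
          if (!new)
              return -ENOMEM;

          cap_raise(new->cap_effective, CAP_DAC_READ_SEARCH);

          old = override_creds(new);      /* switch current->cred to the copy */
          work();
          revert_creds(old);              /* restore the original credentials */
          put_cred(new);                  /* drop the reference to the copy */

          return 0;
      }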

* tag 'kernel-6.19-rc1.cred' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (36 commits)
  trace: use override credential guard
  trace: use prepare credential guard
  coredump: use override credential guard
  coredump: use prepare credential guard
  coredump: split out do_coredump() from vfs_coredump()
  coredump: mark struct mm_struct as const
  coredump: pass struct linux_binfmt as const
  coredump: move revert_cred() before coredump_cleanup()
  sev-dev: use override credential guards
  sev-dev: use prepare credential guard
  sev-dev: use guard for path
  cred: add prepare credential guard
  net/dns_resolver: use credential guards in dns_query()
  cgroup: use credential guards in cgroup_attach_permissions()
  act: use credential guards in acct_write_process()
  smb: use credential guards in cifs_get_spnego_key()
  nfs: use credential guards in nfs_idmap_get_key()
  nfs: use credential guards in nfs_local_call_write()
  nfs: use credential guards in nfs_local_call_read()
  erofs: use credential guards
  ...
2025-12-01 13:45:41 -08:00
Linus Torvalds f2e74ecfba vfs-6.19-rc1.folio
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZQAKCRCRxhvAZXjc
 onGBAQDtqeO0jZzS7q9UxlJ84Wj/H9w+9INpO4jMxtWK4svhUAEAghG4qVxRvkE2
 Qh+wrpTPIC7OCQ78k8psDRmkj9cn8QA=
 =FCVN
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.folio' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull folio updates from Christian Brauner:
 "Add a new folio_next_pos() helper function that returns the file
  position of the first byte after the current folio. This is a common
  operation in filesystems when needing to know the end of the current
  folio.

  The helper is lifted from btrfs which already had its own version, and
  is now used across multiple filesystems and subsystems:
   - btrfs
   - buffer
   - ext4
   - f2fs
   - gfs2
   - iomap
   - netfs
   - xfs
   - mm

  This fixes a long-standing bug in ocfs2 on 32-bit systems with files
  larger than 2GiB. Presumably this is not a common configuration, but
  the fix is backported anyway. The other filesystems did not have bugs,
  they were just mildly inefficient.

  This also introduces uoff_t as the unsigned version of loff_t. A recent
  commit inadvertently changed a comparison from being unsigned (on
  64-bit systems) to being signed (which it had always been on 32-bit
  systems), leading to sporadic fstests failures.

  Generally file sizes are restricted to being a signed integer, but in
  places where -1 is passed to indicate "up to the end of the file", it
  is convenient to have an unsigned type to ensure comparisons are
  always unsigned regardless of architecture"
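
  Given the description above, the helper is presumably just the folio's start
  position plus its size (a sketch in terms of the existing folio_pos() and
  folio_size() helpers; the return type in the merged code may differ, e.g. it
  may use the new uoff_t, presumably an unsigned 64-bit counterpart of loff_t):

      #include <linux/mm.h>
      #include <linux/pagemap.h>

      static inline loff_t folio_next_pos(struct folio *folio)
      {
          /* file position of the first byte after this folio */
          return folio_pos(folio) + folio_size(folio);
      }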

* tag 'vfs-6.19-rc1.folio' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  fs: Add uoff_t
  mm: Use folio_next_pos()
  xfs: Use folio_next_pos()
  netfs: Use folio_next_pos()
  iomap: Use folio_next_pos()
  gfs2: Use folio_next_pos()
  f2fs: Use folio_next_pos()
  ext4: Use folio_next_pos()
  buffer: Use folio_next_pos()
  btrfs: Use folio_next_pos()
  filemap: Add folio_next_pos()
2025-12-01 10:26:38 -08:00
Linus Torvalds 212c4053a1 vfs-6.19-rc1.coredump
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZQAKCRCRxhvAZXjc
 oji0AQC5jl35xh04fJKB343InVAxtRFp8mSkJJ9Bx6x7xA7a+QEAiBMxYilUgYIW
 bZMcI5LU+gNO/1y076QkVt84jTUQLww=
 =WIBZ
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.coredump' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull pidfd and coredump updates from Christian Brauner:
 "Features:

   - Expose coredump signal via pidfd

     Expose the signal that caused the coredump through the pidfd
     interface. The recent changes to rework coredump handling to rely
     on unix sockets are in the process of being used in systemd. The
     previous systemd coredump container interface requires the coredump
     file descriptor and basic information including the signal number
     to be sent to the container. This means the signal number needs to
     be available before sending the coredump to the container.

   - Add supported_mask field to pidfd

     Add a new supported_mask field to struct pidfd_info that indicates
     which information fields are supported by the running kernel. This
     allows userspace to detect feature availability without relying on
     error codes or kernel version checks.

  Cleanups:

   - Drop struct pidfs_exit_info and prepare to drop exit_info pointer,
     simplifying the internal publication mechanism for exit and
     coredump information retrievable via the pidfd ioctl

   - Use guard() for task_lock in pidfs

   - Reduce wait_pidfd lock scope

   - Add missing PIDFD_INFO_SIZE_VER1 constant

   - Add missing BUILD_BUG_ON() assert on struct pidfd_info

  Fixes:

   - Fix PIDFD_INFO_COREDUMP handling

  Selftests:

   - Split out coredump socket tests and common helpers into separate
     files for better organization

   - Fix userspace coredump client detection issues

   - Handle edge-triggered epoll correctly

   - Ignore ENOSPC errors in tests

   - Add debug logging to coredump socket tests, socket protocol tests,
     and test helpers

   - Add tests for PIDFD_INFO_COREDUMP_SIGNAL

   - Add tests for supported_mask field

   - Update pidfd header for selftests"
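
  A rough userspace sketch of the intended feature-detection flow; the
  PIDFD_GET_INFO ioctl, struct pidfd_info and its mask field already exist,
  while supported_mask, PIDFD_INFO_COREDUMP_SIGNAL and the coredump_signal
  field name below are taken from this summary or assumed, not quoted from a
  released header:

      #include <sys/ioctl.h>
      #include <linux/pidfd.h>
      #include <stdio.h>

      static void query_coredump_signal(int pidfd)
      {
          struct pidfd_info info = {
              .mask = PIDFD_INFO_COREDUMP,    /* fields we are asking for */
          };

          if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0) {
              perror("PIDFD_GET_INFO");
              return;
          }

          /* New: detect support without relying on error codes or versions. */
          if (!(info.supported_mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
              fprintf(stderr, "coredump signal not reported by this kernel\n");
              return;
          }

          if (info.mask & PIDFD_INFO_COREDUMP)
              printf("coredump signal: %d\n", info.coredump_signal);  /* field name assumed */
      }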

* tag 'vfs-6.19-rc1.coredump' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (23 commits)
  pidfs: reduce wait_pidfd lock scope
  selftests/coredump: add second PIDFD_INFO_COREDUMP_SIGNAL test
  selftests/coredump: add first PIDFD_INFO_COREDUMP_SIGNAL test
  selftests/coredump: ignore ENOSPC errors
  selftests/coredump: add debug logging to coredump socket protocol tests
  selftests/coredump: add debug logging to coredump socket tests
  selftests/coredump: add debug logging to test helpers
  selftests/coredump: handle edge-triggered epoll correctly
  selftests/coredump: fix userspace coredump client detection
  selftests/coredump: fix userspace client detection
  selftests/coredump: split out coredump socket tests
  selftests/coredump: split out common helpers
  selftests/pidfd: add second supported_mask test
  selftests/pidfd: add first supported_mask test
  selftests/pidfd: update pidfd header
  pidfs: expose coredump signal
  pidfs: drop struct pidfs_exit_info
  pidfs: prepare to drop exit_info pointer
  pidfd: add a new supported_mask field
  pidfs: add missing BUILD_BUG_ON() assert on struct pidfd_info
  ...
2025-12-01 10:17:39 -08:00
Linus Torvalds 415d34b92c namespace-6.19-rc1
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZQAKCRCRxhvAZXjc
 ooKwAP4kR5kMjHlthf8jHmmCjVU3nQFO9hUZsIQL9gFJLOIQMAD+LLoTaq1WJufl
 oSgZpREXZVmI1TK61eR6EZMB1YikGAo=
 =TExi
 -----END PGP SIGNATURE-----

Merge tag 'namespace-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull namespace updates from Christian Brauner:
 "This contains substantial namespace infrastructure changes including a new
  system call, active reference counting, and extensive header cleanups.
  The branch depends on the shared kbuild branch for -fms-extensions support.

  Features:

   - listns() system call

     Add a new listns() system call that allows userspace to iterate
     through namespaces in the system. This provides a programmatic
     interface to discover and inspect namespaces, addressing
     longstanding limitations:

     Currently, there is no direct way for userspace to enumerate
     namespaces. Applications must resort to scanning /proc/*/ns/ across
     all processes, which is:
      - Inefficient - requires iterating over all processes
      - Incomplete - misses namespaces not attached to any running
        process but kept alive by file descriptors, bind mounts, or
        parent references
      - Permission-heavy - requires access to /proc for many processes
      - No ordering or ownership information
      - No filtering per namespace type

     The listns() system call solves these problems:

       ssize_t listns(const struct ns_id_req *req, u64 *ns_ids,
                      size_t nr_ns_ids, unsigned int flags);

       struct ns_id_req {
             __u32 size;
             __u32 spare;
             __u64 ns_id;
             struct /* listns */ {
                     __u32 ns_type;
                     __u32 spare2;
                     __u64 user_ns_id;
             };
       };

     Features include:
      - Pagination support for large namespace sets
      - Filtering by namespace type (MNT_NS, NET_NS, USER_NS, etc.)
      - Filtering by owning user namespace
      - Permission checks respecting namespace isolation

   - Active Reference Counting

     Introduce an active reference count that tracks namespace
     visibility to userspace. A namespace is visible in the following
     cases:
      (1) The namespace is in use by a task
      (2) The namespace is persisted through a VFS object (namespace file
          descriptor or bind-mount)
      (3) The namespace is a hierarchical type and is the parent of child
          namespaces

     The active reference count does not regulate lifetime (that's still
     done by the normal reference count) - it only regulates visibility
     to namespace file handles and listns().

     This prevents resurrection of namespaces that are pinned only for
     internal kernel reasons (e.g., user namespaces held by
     file->f_cred, lazy TLB references on idle CPUs, etc.) which should
     not be accessible via (1)-(3).

   - Unified Namespace Tree

     Introduce a unified tree structure for all namespaces with:
      - Fixed IDs assigned to initial namespaces
      - Lookup based solely on inode number
      - Maintained list of owned namespaces per user namespace
      - Simplified rbtree comparison helpers

  Cleanups:

    - Header Reorganization:
      - Move namespace types into separate header (ns_common_types.h)
      - Decouple nstree from ns_common header
      - Move nstree types into separate header
      - Switch to new ns_tree_{node,root} structures with helper functions
      - Use guards for ns_tree_lock

   - Initial Namespace Reference Count Optimization
      - Make all reference counts on initial namespaces a nop to avoid
        pointless cacheline ping-pong for namespaces that can never go
        away
      - Drop custom reference count initialization for initial namespaces
      - Add NS_COMMON_INIT() macro and use it for all namespaces
      - pid: rely on common reference count behavior

   - Miscellaneous Cleanups
      - Rename exit_task_namespaces() to exit_nsproxy_namespaces()
      - Rename is_initial_namespace() and make argument const
      - Use boolean to indicate anonymous mount namespace
      - Simplify owner list iteration in nstree
      - nsfs: raise SB_I_NODEV, SB_I_NOEXEC, and DCACHE_DONTCACHE explicitly
      - nsfs: use inode_just_drop()
      - pidfs: raise DCACHE_DONTCACHE explicitly
      - pidfs: simplify PIDFD_GET__NAMESPACE ioctls
      - libfs: allow to specify s_d_flags
      - cgroup: add cgroup namespace to tree after owner is set
      - nsproxy: fix free_nsproxy() and simplify create_new_namespaces()

  Fixes:

   - setns(pidfd, ...) race condition

     Fix a subtle race when using pidfds with setns(). When the target
     task exits after prepare_nsset() but before commit_nsset(), the
     namespace's active reference count might have been dropped. If
     setns() then installs the namespaces, it would bump the active
     reference count from zero without taking the required reference on
     the owner namespace, leading to underflow when later decremented.

     The fix resurrects the ownership chain if necessary - if the caller
     succeeded in grabbing passive references, the setns() should
     succeed even if the target task exits or gets reaped.

   - Return EFAULT on put_user() error instead of success

   - Make sure references are dropped outside of RCU lock (some
     namespaces like mount namespace sleep when putting the last
     reference)

   - Don't skip active reference count initialization for network
     namespace

   - Add asserts for active refcount underflow

   - Add asserts for initial namespace reference counts (both passive
     and active)

   - ipc: enable is_ns_init_id() assertions

   - Fix kernel-doc comments for internal nstree functions

   - Selftests
      - 15 active reference count tests
      - 9 listns() functionality tests
      - 7 listns() permission tests
      - 12 inactive namespace resurrection tests
      - 3 threaded active reference count tests
      - commit_creds() active reference tests
      - Pagination and stress tests
      - EFAULT handling test
      - nsid tests fixes"

* tag 'namespace-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (103 commits)
  pidfs: simplify PIDFD_GET_<type>_NAMESPACE ioctls
  nstree: fix kernel-doc comments for internal functions
  nsproxy: fix free_nsproxy() and simplify create_new_namespaces()
  selftests/namespaces: fix nsid tests
  ns: drop custom reference count initialization for initial namespaces
  pid: rely on common reference count behavior
  ns: add asserts for initial namespace active reference counts
  ns: add asserts for initial namespace reference counts
  ns: make all reference counts on initial namespace a nop
  ipc: enable is_ns_init_id() assertions
  fs: use boolean to indicate anonymous mount namespace
  ns: rename is_initial_namespace()
  ns: make is_initial_namespace() argument const
  nstree: use guards for ns_tree_lock
  nstree: simplify owner list iteration
  nstree: switch to new structures
  nstree: add helper to operate on struct ns_tree_{node,root}
  nstree: move nstree types into separate header
  nstree: decouple from ns_common header
  ns: move namespace types into separate header
  ...
2025-12-01 09:47:41 -08:00
Linus Torvalds ebaeabfa5a vfs-6.19-rc1.writeback
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZQAKCRCRxhvAZXjc
 or4UAP9FbpFsZd0DpsYnKuv7kFepl291PuR0x2dKmseJ/wcf8AEAzI8FR5wd/fey
 25ZNdExoUojAOj5wVn+jUep3u54jBws=
 =/toi
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull writeback updates from Christian Brauner:
 "Features:

   - Allow file systems to increase the minimum writeback chunk size.

     The relatively low minimum writeback size of 4MiB means that
     inodes being written back on rotational media are switched a lot. Besides
     introducing additional seeks, this also can lead to extreme file
     fragmentation on zoned devices when a lot of files are cached
     relative to the available writeback bandwidth.

     This adds a superblock field that allows the file system to
     override the default size, and sets it to the zone size for zoned
     XFS.
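
     As a hedged sketch (the field name comes from the XFS patch in this
     series; the file system, helper, and 64MiB value are made up), a
     file system could override the default at mount time:

       #include <linux/fs.h>
       #include <linux/sizes.h>

       static int examplefs_fill_super(struct super_block *sb,
                                       void *data, int silent)
       {
               /* Ask writeback to push at least 64MiB per inode before
                * switching to the next dirty inode on this superblock. */
               sb->s_min_writeback_pages = SZ_64M >> PAGE_SHIFT;
               return 0;
       }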

   - Add logging for slow writeback when it exceeds
     sysctl_hung_task_timeout_secs. This helps identify tasks waiting
     for a long time and pinpoint potential issues. Recording the
     starting jiffies is also useful when debugging a crashed vmcore.

   - Wake up waiting tasks when finishing the writeback of a chunk

  Cleanups:

   - filemap_* writeback interface cleanups.

     Adding filemap_fdatawrite_wbc ended up being a mistake, as all but
     the original btrfs caller should be using better high level
     interfaces instead.

     This series removes all these low-level interfaces, switches btrfs
     to a more specific interface, and cleans up other too low-level
     interfaces. With this the writeback_control that is passed to the
     writeback code is only initialized in three places.

   - Remove __filemap_fdatawrite, __filemap_fdatawrite_range, and
     filemap_fdatawrite_wbc

   - Add filemap_flush_nr helper for btrfs

   - Push struct writeback_control into start_delalloc_inodes in btrfs

   - Rename filemap_fdatawrite_range_kick to filemap_flush_range

   - Stop opencoding filemap_fdatawrite_range in 9p, ocfs2, and mm

   - Make wbc_to_tag() inline and use it in fs"

* tag 'vfs-6.19-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  fs: Make wbc_to_tag() inline and use it in fs.
  xfs: set s_min_writeback_pages for zoned file systems
  writeback: allow the file system to override MIN_WRITEBACK_PAGES
  writeback: cleanup writeback_chunk_size
  mm: rename filemap_fdatawrite_range_kick to filemap_flush_range
  mm: remove __filemap_fdatawrite_range
  mm: remove filemap_fdatawrite_wbc
  mm: remove __filemap_fdatawrite
  mm,btrfs: add a filemap_flush_nr helper
  btrfs: push struct writeback_control into start_delalloc_inodes
  btrfs: use the local tmp_inode variable in start_delalloc_inodes
  ocfs2: don't opencode filemap_fdatawrite_range in ocfs2_journal_submit_inode_data_buffers
  9p: don't opencode filemap_fdatawrite_range in v9fs_mmap_vm_close
  mm: don't opencode filemap_fdatawrite_range in filemap_invalidate_inode
  writeback: Add logging for slow writeback (exceeds sysctl_hung_task_timeout_secs)
  writeback: Wake up waiting tasks when finishing the writeback of a chunk.
2025-12-01 09:20:51 -08:00
Linus Torvalds 9368f0f941 vfs-6.19-rc1.inode
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZAAKCRCRxhvAZXjc
 omMSAP9GLhavxyWQ24Q+49CNWWRQWDY1wTOiUK2BwtIvZ0YEcAD8D1dAiMckL5pC
 RwEAVA5p+y+qi+bZP0KXCBxQddoTIQM=
 =zo/J
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs inode updates from Christian Brauner:
 "Features:

   - Hide inode->i_state behind accessors. Open-coded accesses prevent
     asserting they are done correctly. One obvious aspect is locking,
     but significantly more can be checked. For example it can be
     detected when the code is clearing flags which are already missing,
     or is setting flags when it is illegal (e.g., I_FREEING when
     ->i_count > 0)

   - Provide accessors for ->i_state, convert all filesystems using
     coccinelle and manual conversions (btrfs, ceph, smb, f2fs, gfs2,
     overlayfs, nilfs2, xfs), and make plain ->i_state access fail to
     compile
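
     As a hedged sketch of the accessor pattern (the field and helper
     names and the exact assertions here are illustrative, not the ones
     merged):

       /* The field is renamed so that open-coded inode->i_state no
        * longer compiles; everything goes through helpers instead. */
       static inline unsigned long inode_state_read(const struct inode *inode)
       {
               return READ_ONCE(inode->i_state_private);
       }

       static inline void inode_state_add(struct inode *inode,
                                          unsigned long flags)
       {
               lockdep_assert_held(&inode->i_lock);
               /* Example assertion: setting I_FREEING while the inode
                * still has users is illegal. */
               WARN_ON_ONCE((flags & I_FREEING) &&
                            atomic_read(&inode->i_count));
               WRITE_ONCE(inode->i_state_private,
                          inode->i_state_private | flags);
       }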

   - Rework I_NEW handling to operate without fences, simplifying the
     code after the accessor infrastructure is in place

  Cleanups:

   - Move wait_on_inode() from writeback.h to fs.h

   - Spell out fenced ->i_state accesses with explicit smp_wmb/smp_rmb
     for clarity

   - Cosmetic fixes to LRU handling

   - Push list presence check into inode_io_list_del()

   - Touch up predicts in __d_lookup_rcu()

   - ocfs2: retire ocfs2_drop_inode() and I_WILL_FREE usage

   - Assert on ->i_count in iput_final()

   - Assert ->i_lock held in __iget()

  Fixes:

   - Add missing fences to I_NEW handling"

* tag 'vfs-6.19-rc1.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (22 commits)
  dcache: touch up predicts in __d_lookup_rcu()
  fs: push list presence check into inode_io_list_del()
  fs: cosmetic fixes to lru handling
  fs: rework I_NEW handling to operate without fences
  fs: make plain ->i_state access fail to compile
  xfs: use the new ->i_state accessors
  nilfs2: use the new ->i_state accessors
  overlayfs: use the new ->i_state accessors
  gfs2: use the new ->i_state accessors
  f2fs: use the new ->i_state accessors
  smb: use the new ->i_state accessors
  ceph: use the new ->i_state accessors
  btrfs: use the new ->i_state accessors
  Manual conversion to use ->i_state accessors of all places not covered by coccinelle
  Coccinelle-based conversion to use ->i_state accessors
  fs: provide accessors for ->i_state
  fs: spell out fenced ->i_state accesses with explicit smp_wmb/smp_rmb
  fs: move wait_on_inode() from writeback.h to fs.h
  fs: add missing fences to I_NEW handling
  ocfs2: retire ocfs2_drop_inode() and I_WILL_FREE usage
  ...
2025-12-01 09:02:34 -08:00
Linus Torvalds b04b2e7a61 vfs-6.19-rc1.misc
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZAAKCRCRxhvAZXjc
 onGCAQDEHKNEuZMhkyd3K5YsJtMzZlW/uXp4+Wddeob+5yQp0wEA09xN4CJNMwhP
 J6Kjaa80hWfrFacqSvyMUwQHHw6mngs=
 =5Mom
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull misc vfs updates from Christian Brauner:
 "Features:

   - Cheaper MAY_EXEC handling for path lookup. This elides MAY_WRITE
     permission checks during path lookup and adds the
     IOP_FASTPERM_MAY_EXEC flag so filesystems like btrfs can avoid
     expensive permission work.

   - Hide dentry_cache behind runtime const machinery.

   - Add German Maglione as virtiofs co-maintainer.

  Cleanups:

   - Tidy up and inline step_into() and walk_component() for improved
     code generation.

   - Re-enable IOCB_NOWAIT writes to files. This refactors file
     timestamp update logic, fixing a layering bypass in btrfs when
     updating timestamps on device files and improving FMODE_NOCMTIME
     handling in VFS now that nfsd started using it.

   - Path lookup optimizations extracting slowpaths into dedicated
     routines and adding branch prediction hints for mntput_no_expire(),
     fd_install(), lookup_slow(), and various other hot paths.

   - Enable clang's -fms-extensions flag, requiring a JFS rename to
     avoid conflicts.

   - Remove spurious exports in fs/file_attr.c.

   - Stop duplicating union pipe_index declaration. This depends on the
     shared kbuild branch that brings in -fms-extensions support which
     is merged into this branch.

   - Use MD5 library instead of crypto_shash in ecryptfs.

   - Use largest_zero_folio() in iomap_dio_zero().

   - Replace simple_strtol/strtoul with kstrtoint/kstrtouint in init and
     initrd code.

   - Various typo fixes.

  Fixes:

   - Fix emergency sync for btrfs. Btrfs requires an explicit sync_fs()
     call with wait == 1 to commit super blocks. The emergency sync path
     never passed this, leaving btrfs data uncommitted during emergency
     sync.

   - Use local kmap in watch_queue's post_one_notification().

   - Add hint prints in sb_set_blocksize() for LBS dependency on THP"

* tag 'vfs-6.19-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (35 commits)
  MAINTAINERS: add German Maglione as virtiofs co-maintainer
  fs: inline step_into() and walk_component()
  fs: tidy up step_into() & friends before inlining
  orangefs: use inode_update_timestamps directly
  btrfs: fix the comment on btrfs_update_time
  btrfs: use vfs_utimes to update file timestamps
  fs: export vfs_utimes
  fs: lift the FMODE_NOCMTIME check into file_update_time_flags
  fs: refactor file timestamp update logic
  include/linux/fs.h: trivial fix: regualr -> regular
  fs/splice.c: trivial fix: pipes -> pipe's
  fs: mark lookup_slow() as noinline
  fs: add predicts based on nd->depth
  fs: move mntput_no_expire() slowpath into a dedicated routine
  fs: remove spurious exports in fs/file_attr.c
  watch_queue: Use local kmap in post_one_notification()
  fs: touch up predicts in path lookup
  fs: move fd_install() slowpath into a dedicated routine and provide commentary
  fs: hide dentry_cache behind runtime const machinery
  fs: touch predicts in do_dentry_open()
  ...
2025-12-01 08:44:26 -08:00
Linus Torvalds 1885cdbfbb vfs-6.19-rc1.iomap
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaSmOZAAKCRCRxhvAZXjc
 ooCXAQCwzX2GS/55QHV6JXBBoNxguuSQ5dCj91ZmTfHzij0xNAEAhKEBw7iMGX72
 c2/x+xYf+Pc6mAfxdus5RLMggqBFPAk=
 =jInB
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.19-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull iomap updates from Christian Brauner:
 "FUSE iomap Support for Buffered Reads:

    This adds iomap support for FUSE buffered reads and readahead. This
    enables granular uptodate tracking with large folios so only
    non-uptodate portions need to be read. Also fixes a race condition
    with large folios + writeback cache that could cause data corruption
    on partial writes followed by reads.

     - Refactored iomap read/readahead bio logic into helpers
     - Added caller-provided callbacks for read operations
     - Moved buffered IO bio logic into new file
     - FUSE now uses iomap for read_folio and readahead

  Zero Range Folio Batch Support:

    Add folio batch support for iomap_zero_range() to handle dirty
    folios over unwritten mappings. Fix raciness issues where dirty data
    could be lost during zero range operations.

     - filemap_get_folios_tag_range() helper for dirty folio lookup
     - Optional zero range dirty folio processing
     - XFS fills dirty folios on zero range of unwritten mappings
     - Removed old partial EOF zeroing optimization

  DIO Write Completions from Interrupt Context:

    Restore pre-iomap behavior where pure overwrite completions run
    inline rather than being deferred to workqueue. Reduces context
    switches for high-performance workloads like ScyllaDB.

     - Removed unused IOCB_DIO_CALLER_COMP code
     - Error completions always run in user context (fixes zonefs)
     - Reworked REQ_FUA selection logic
     - Inverted IOMAP_DIO_INLINE_COMP to IOMAP_DIO_OFFLOAD_COMP

  Buffered IO Cleanups:

    Some performance and code clarity improvements:

     - Replace manual bitmap scanning with find_next_bit()
     - Simplify read skip logic for writes
     - Optimize pending async writeback accounting
     - Better variable naming
     - Documentation for iomap_finish_folio_write() requirements

  Misaligned Vectors for Zoned XFS:

    Enables sub-block aligned vectors in XFS always-COW mode for zoned
    devices via new IOMAP_DIO_FSBLOCK_ALIGNED flag.

  Bug Fixes:

     - Allocate s_dio_done_wq for async reads (fixes syzbot report after
       error completion changes)
     - Fix iomap_read_end() for already uptodate folios (regression fix)"

* tag 'vfs-6.19-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (40 commits)
  iomap: allocate s_dio_done_wq for async reads as well
  iomap: fix iomap_read_end() for already uptodate folios
  iomap: invert the polarity of IOMAP_DIO_INLINE_COMP
  iomap: support write completions from interrupt context
  iomap: rework REQ_FUA selection
  iomap: always run error completions in user context
  fs, iomap: remove IOCB_DIO_CALLER_COMP
  iomap: use find_next_bit() for uptodate bitmap scanning
  iomap: use find_next_bit() for dirty bitmap scanning
  iomap: simplify when reads can be skipped for writes
  iomap: simplify ->read_folio_range() error handling for reads
  iomap: optimize pending async writeback accounting
  docs: document iomap writeback's iomap_finish_folio_write() requirement
  iomap: account for unaligned end offsets when truncating read range
  iomap: rename bytes_pending/bytes_accounted to bytes_submitted/bytes_not_submitted
  xfs: support sub-block aligned vectors in always COW mode
  iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag
  xfs: error tag to force zeroing on debug kernels
  iomap: remove old partial eof zeroing optimization
  xfs: fill dirty folios on zero range of unwritten mappings
  ...
2025-12-01 08:14:00 -08:00
Borislav Petkov (AMD) e2349c5811 Merge remote-tracking branches 'ras/edac-amd-atl', 'ras/edac-drivers' and 'ras/edac-misc' into edac-updates
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2025-12-01 12:06:08 +01:00
Ingo Molnar 6ec33db1aa objtool: Fix segfault on unknown alternatives
So 'objtool --link -d vmlinux.o' gets surprised by this endbr64+endbr64 pattern
in ___bpf_prog_run():

	___bpf_prog_run:
	1e7680:  ___bpf_prog_run+0x0                                                     push   %r12
	1e7682:  ___bpf_prog_run+0x2                                                     mov    %rdi,%r12
	1e7685:  ___bpf_prog_run+0x5                                                     push   %rbp
	1e7686:  ___bpf_prog_run+0x6                                                     xor    %ebp,%ebp
	1e7688:  ___bpf_prog_run+0x8                                                     push   %rbx
	1e7689:  ___bpf_prog_run+0x9                                                     mov    %rsi,%rbx
	1e768c:  ___bpf_prog_run+0xc                                                     movzbl (%rbx),%esi
	1e768f:  ___bpf_prog_run+0xf                                                     movzbl %sil,%edx
	1e7693:  ___bpf_prog_run+0x13                                                    mov    %esi,%eax
	1e7695:  ___bpf_prog_run+0x15                                                    mov    0x0(,%rdx,8),%rdx
	1e769d:  ___bpf_prog_run+0x1d                                                    jmp    0x1e76a2 <__x86_indirect_thunk_rdx>
	1e76a2:  ___bpf_prog_run+0x22                                                    endbr64
	1e76a6:  ___bpf_prog_run+0x26                                                    endbr64
	1e76aa:  ___bpf_prog_run+0x2a                                                    mov    0x4(%rbx),%edx

And crashes due to blindly dereferencing alt->insn->alt_group.

Bail out on NULL ->alt_group, which produces this warning and continues
with the disassembly, instead of a segfault:

  .git/O/vmlinux.o: warning: objtool: <alternative.1e769d>: failed to disassemble alternative

Cc: Alexandre Chartre <alexandre.chartre@oracle.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2025-12-01 10:42:27 +01:00
Randy Dunlap 43decb6b62 locking/local_lock: Fix all kernel-doc warnings
Modify kernel-doc comments in local_lock.h to prevent warnings:

  Warning: include/linux/local_lock.h:9 function parameter 'lock' not described in 'local_lock_init'
  Warning: include/linux/local_lock.h:56 function parameter 'lock' not described in 'local_trylock_init'
  Warning: include/linux/local_lock.h:56 expecting prototype for local_lock_init(). Prototype was for local_trylock_init() instead

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251128065925.917917-1-rdunlap@infradead.org
2025-12-01 06:56:16 +01:00
Vincent Mailhol 719e357fc0 locking/local_lock: s/l/__l/ and s/tl/__tl/ to reduce the risk of shadowing
The Linux kernel coding style advises avoiding common variable
names in function-like macros to reduce the risk of namespace
collisions.

Throughout local_lock_internal.h, several macros use the rather common
variable names 'l' and 'tl'. This has already resulted in an actual
collision: the __local_lock_acquire() function-like macro currently
shadows the parameter 'l' of the:

  class_##_name##_t class_##_name##_constructor(_type *l)

function factory from <linux/cleanup.h>.

Rename the variable 'l' to '__l' and the variable 'tl' to '__tl'
throughout the file to fix the current namespace collision and
to prevent future ones.
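
A toy illustration of the hazard (not the kernel code; the macro and
types here are made up):

  #include <stdio.h>

  struct my_lock { int taken; };

  /* Function-like macro that declares the very common name 'l'. */
  #define LOCK_AND_RUN(lockp, stmt)             \
  do {                                          \
          struct my_lock *l = (lockp);          \
          l->taken = 1;                         \
          stmt;                                 \
          l->taken = 0;                         \
  } while (0)

  static struct my_lock global_lock;

  /* The parameter is also called 'l', like the cleanup.h constructor
   * parameter mentioned above. */
  static void constructor(struct my_lock *l)
  {
          /* Inside the macro body 'l' silently refers to the macro's
           * own variable (global_lock), not this parameter -- exactly
           * the shadowing that the rename to '__l' avoids. */
          LOCK_AND_RUN(&global_lock, printf("taken=%d\n", l->taken));
  }

  int main(void)
  {
          struct my_lock mine = { .taken = 0 };

          constructor(&mine);     /* prints "taken=1": it read global_lock */
          return 0;
  }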

[ bigeasy: Rebase, update all l and tl instances in macros ]

Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://patch.msgid.link/20251127144140.215722-3-bigeasy@linutronix.de
2025-12-01 06:56:16 +01:00
Sebastian Andrzej Siewior 52ed746147 locking/local_lock: Add the <linux/local_lock*.h> headers to MAINTAINERS
The local_lock_t has never been added to the MAINTAINERS file since its
inclusion.

Add local_lock_t to the locking primitives section.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://patch.msgid.link/20251127144140.215722-2-bigeasy@linutronix.de
2025-12-01 06:56:10 +01:00
Sebastian Andrzej Siewior 51d7a05452 locking/mutex: Redo __mutex_init() to reduce generated code size
mutex_init() invokes __mutex_init(), providing the name of the lock and
a pointer to the lock class. With LOCKDEP enabled this information is
useful, but without LOCKDEP it is not used at all. Passing the pointer
to the lock class might be considered negligible, but the name of the
lock is passed as well and the string is stored. This information
wastes storage.

Split __mutex_init() into a _generic() variant doing the initialisation
of the lock and a _lockdep() version which does _generic() plus the
lockdep bits. Restrict the lockdep version to lockdep-enabled builds,
allowing the compiler to remove the unused parameter.
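
The shape of the split, as a hedged sketch (names and fields are
simplified; the actual helpers and config plumbing in the patch differ):

  /* Always present: plain initialisation without any lockdep data. */
  static inline void __mutex_init_generic(struct mutex *lock)
  {
          atomic_long_set(&lock->owner, 0);
          raw_spin_lock_init(&lock->wait_lock);
          INIT_LIST_HEAD(&lock->wait_list);
  }

  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  void __mutex_init_lockdep(struct mutex *lock, const char *name,
                            struct lock_class_key *key);

  #define mutex_init(m)                                         \
  do {                                                          \
          static struct lock_class_key __key;                   \
          __mutex_init_lockdep((m), #m, &__key);                \
  } while (0)
  #else
  /* Without lockdep neither the name string nor the class key is
   * ever passed, so the compiler does not have to emit them. */
  #define mutex_init(m)   __mutex_init_generic(m)
  #endif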

This results in the following size reduction:

        text     data       bss        dec  filename
  | 30237599  8161430   1176624   39575653  vmlinux.defconfig
  | 30233269  8149142   1176560   39558971  vmlinux.defconfig.patched
     -4.2KiB   -12KiB

  | 32455099  8471098  12934684   53860881  vmlinux.defconfig.lockdep
  | 32455100  8471098  12934684   53860882  vmlinux.defconfig.patched.lockdep

  | 27152407  7191822   2068040   36412269  vmlinux.defconfig.preempt_rt
  | 27145937  7183630   2067976   36397543  vmlinux.defconfig.patched.preempt_rt
     -6.3KiB    -8KiB

  | 29382020  7505742  13784608   50672370  vmlinux.defconfig.preempt_rt.lockdep
  | 29376229  7505742  13784544   50666515  vmlinux.defconfig.patched.preempt_rt.lockdep
     -5.6KiB

[peterz: folded fix from boqun]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Waiman Long <longman@redhat.com>
Link: https://lkml.kernel.org/r/20251125145425.68319-1-boqun.feng@gmail.com
Link: https://patch.msgid.link/20251105142350.Tfeevs2N@linutronix.de
2025-12-01 06:51:57 +01:00
Linus Torvalds 7d0a66e4bb Linux 6.18 2025-11-30 14:42:10 -08:00
Harry Fellowes d911fe6e94 x86/boot: Clean up whitespace in a20.c
Remove trailing whitespace on empty lines.

No functional changes.

  [ bp: Massage commit message. ]

Signed-off-by: Harry Fellowes <harryfellowes1@gmail.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20250825192832.6444-3-harryfellowes1@gmail.com
2025-11-28 20:29:52 +01:00
Christian Brauner 0512bf9701
Merge patch series "file: FD_{ADD,PREPARE}()"
Christian Brauner <brauner@kernel.org> says:

This now removes roughly double the code that it adds.

I've been playing with this to allow for moderately flexible usage of
the get_unused_fd_flags() + create file + fd_install() pattern that's
used quite extensively and requires cumbersome cleanup paths.
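
For reference, a hypothetical sketch of that pattern and its cleanup
paths (the device type and helpers here are made up):

    static int example_get_fd(struct example_device *dev,
                              struct example_info __user *uarg)
    {
            struct example_info info = { .flags = 0 };
            struct file *file;
            int fd;

            fd = get_unused_fd_flags(O_CLOEXEC);
            if (fd < 0)
                    return fd;

            file = example_device_open_file(dev);
            if (IS_ERR(file)) {
                    put_unused_fd(fd);              /* cleanup path #1 */
                    return PTR_ERR(file);
            }

            info.fd = fd;
            if (copy_to_user(uarg, &info, sizeof(info))) {
                    put_unused_fd(fd);              /* cleanup path #2 */
                    fput(file);
                    return -EFAULT;
            }

            fd_install(fd, file);
            return fd;
    }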

How callers allocate files is really heterogeneous so it's not really
convenient to fold them into a single class. It's possible to split them
into subclasses like for anon inodes. I think that's not necessarily
nice either. This adds two primitives:

(1) FD_ADD() for the simple cases where a file is installed:

    fd = FD_ADD(O_CLOEXEC, vfio_device_open_file(device));
    if (fd < 0)
            vfio_device_put_registration(device);
    return fd;

(2) FD_PREPARE() that captures all the cases where access to the fd or
    file, or additional work before publishing the fd, is needed:

    FD_PREPARE(fdf, O_CLOEXEC, sync_file->file);
    if (fdf.err) {
            fput(sync_file->file);
            return fdf.err;
    }

    data.fence = fd_prepare_fd(fdf);
    if (copy_to_user((void __user *)arg, &data, sizeof(data)))
            return -EFAULT;

    return fd_publish(fdf);

I've converted all of the easy cases over to it and it gets rid of an
awful lot of convoluted cleanup logic. There are a bunch of other cases
that can also be converted after a bit of massaging.

It's centered around a simple struct. FD_PREPARE() encapsulates all of
the allocation and cleanup logic and must be followed by a call to
fd_publish() which associates the fd with the file and installs it into
the caller's fdtable. If fd_publish() isn't called, both are deallocated.
FD_ADD() is a shorthand that does the fd_publish() and never exposes the
struct to the caller. That's often the case when callers don't need
access to anything after installing the fd.

It mandates a specific order, namely that first we allocate the fd and
then instantiate the file. But that shouldn't be a problem. Nearly
everyone I've converted used this order anyway.

There's a bunch of additional cases where it would be easy to convert
them to this pattern. For example, the whole sync file stuff in dma
currently returns the containing structure of the file instead of the
file itself even though it's only used to allocate files. Changing that
would make it fall into the FD_PREPARE() pattern easily. I've not done
that work yet.

There's room for extending this in a way that we'd have subclasses for
some particularly often-used patterns but as I said I'm not even sure
that's worth it.

* patches from https://patch.msgid.link/20251123-work-fd-prepare-v4-0-b6efa1706cfd@kernel.org: (47 commits)
  kvm: convert kvm_vcpu_ioctl_get_stats_fd() to FD_PREPARE()
  kvm: convert kvm_arch_supports_gmem_init_shared() to FD_PREPARE()
  io_uring: convert io_create_mock_file() to FD_PREPARE()
  file: convert replace_fd() to FD_PREPARE()
  vfio: convert vfio_group_ioctl_get_device_fd() to FD_PREPARE()
  tty: convert ptm_open_peer() to FD_PREPARE()
  ntsync: convert ntsync_obj_get_fd() to FD_PREPARE()
  media: convert media_request_alloc() to FD_PREPARE()
  hv: convert mshv_ioctl_create_partition() to FD_PREPARE()
  gpio: convert linehandle_create() to FD_PREPARE()
  dma: port sw_sync_ioctl_create_fence() to FD_PREPARE()
  pseries: port papr_rtas_setup_file_interface() to FD_PREPARE()
  pseries: convert papr_platform_dump_create_handle() to FD_PREPARE()
  spufs: convert spufs_gang_open() to FD_PREPARE()
  papr-hvpipe: convert papr_hvpipe_dev_create_handle() to FD_PREPARE()
  spufs: convert spufs_context_open() to FD_PREPARE()
  net/socket: convert __sys_accept4_file() to FD_PREPARE()
  net/socket: convert sock_map_fd() to FD_PREPARE()
  net/sctp: convert sctp_getsockopt_peeloff_common() to FD_PREPARE()
  net/kcm: convert kcm_ioctl() to FD_PREPARE()
  ...

Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-0-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:36 +01:00
Christian Brauner 6fb1022918
io_uring: convert io_create_mock_file() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-45-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:36 +01:00
Christian Brauner 99d4f12f17
file: convert replace_fd() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-44-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:36 +01:00
Christian Brauner 5f3ea1c201
vfio: convert vfio_group_ioctl_get_device_fd() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-43-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:36 +01:00
Christian Brauner 3fd5edfe1d
tty: convert ptm_open_peer() to FD_ADD()
Christian Brauner <brauner@kernel.org> says:

The fix sent in [1] was squashed into this commit.

Fixes: https://lore.kernel.org/37ac7af5-584f-4768-a462-4d1071c43eaf@sirena.org.uk [1]
Reported-by: Mark Brown <broonie@kernel.org> [1]
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> [1]
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-42-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:36 +01:00
Christian Brauner af66279a01
ntsync: convert ntsync_obj_get_fd() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-41-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:36 +01:00
Christian Brauner 6f504cbf10
media: convert media_request_alloc() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-40-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:35 +01:00
Christian Brauner c99dc44562
hv: convert mshv_ioctl_create_partition() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-39-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:35 +01:00
Christian Brauner da7e394bf5
gpio: convert linehandle_create() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-38-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:35 +01:00
Christian Brauner 6ae8da4846
pseries: port papr_rtas_setup_file_interface() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-36-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:35 +01:00
Christian Brauner 274d937006
pseries: convert papr_platform_dump_create_handle() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-35-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:35 +01:00
Christian Brauner 0b9d4a6b51
spufs: convert spufs_gang_open() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-34-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:35 +01:00
Christian Brauner 6d3789d347
papr-hvpipe: convert papr_hvpipe_dev_create_handle() to FD_PREPARE()
Fixes a UAF for src_info as well.

Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-33-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:35 +01:00
Christian Brauner 843e7b5c29
spufs: convert spufs_context_open() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-32-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:34 +01:00
Christian Brauner 4667d63872
net/socket: convert __sys_accept4_file() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-31-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:34 +01:00
Christian Brauner 245f0d1c62
net/socket: convert sock_map_fd() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-30-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:34 +01:00
Christian Brauner 0d52d06a19
net/kcm: convert kcm_ioctl() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-28-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:34 +01:00
Christian Brauner fe67b063f6
net/handshake: convert handshake_nl_accept_doit() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-27-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:34 +01:00
Christian Brauner 910c361f9a
secretmem: convert memfd_secret() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-26-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:34 +01:00
Christian Brauner 1afcbbe5d6
memfd: convert memfd_create() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-25-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:34 +01:00
Christian Brauner 981bec8f69
bpf: convert bpf_token_create() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-24-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:33 +01:00
Christian Brauner 798c2da490
bpf: convert bpf_iter_new_fd() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-23-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:33 +01:00
Christian Brauner f2573685bd
ipc: convert do_mq_open() to FD_ADD()
Christian Brauner <brauner@kernel.org> says:

The fix sent in [1] was squashed into this commit.

Fixes: https://lore.kernel.org/c41de645-8234-465f-a3be-f0385e3a163c@sirena.org.uk [1]
Reported-by: Mark Brown <broonie@kernel.org> [1]
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> [1]
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-22-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:33 +01:00
Christian Brauner 1ad7810c6d
exec: convert begin_new_exec() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-21-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:33 +01:00
Christian Brauner 7352c6fce3
af_unix: convert unix_file_open() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-19-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:33 +01:00
Christian Brauner 34dfce523c
dma: convert dma_buf_fd() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-18-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:33 +01:00
Christian Brauner 993f30468e
xfs: convert xfs_open_by_handle() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-17-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:33 +01:00
Christian Brauner 39f6e7581a
userfaultfd: convert new_userfaultfd() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-16-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:33 +01:00
Christian Brauner 14010faa1b
timerfd: convert timerfd_create() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-15-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:32 +01:00
Christian Brauner 5b755da105
signalfd: convert do_signalfd4() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-14-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:32 +01:00
Christian Brauner 360fbf808a
open: convert do_sys_openat2() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-13-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:32 +01:00
Christian Brauner 13dce771bb
eventpoll: convert do_epoll_create() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-12-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:32 +01:00
Christian Brauner 0f4288410c
autofs: convert autofs_dev_ioctl_open_mountpoint() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-11-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:32 +01:00
Christian Brauner 3d8aefd49a
nsfs: convert ns_ioctl() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-10-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:32 +01:00
Christian Brauner 00de6e2448
nsfs: convert open_namespace() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-9-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:32 +01:00
Christian Brauner 7129098f4f
fanotify: convert fanotify_init() to FD_PREPARE()
Christian Brauner <brauner@kernel.org> says:

The fix sent in [1] was squashed into this commit.

Link: https://lore.kernel.org/20251127201618.2115275-1-kuniyu@google.com [1]
Reported-by: syzbot+321168dfa622eda99689@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/lkml/6928b121.a70a0220.d98e3.0110.GAE@google.com
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-8-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:31 +01:00
Christian Brauner 05885f4165
namespace: convert fsmount() to FD_PREPARE()
Christian Brauner <brauner@kernel.org> says:

A variant of the fix sent in [1] was squashed into this commit.

Link: https://lore.kernel.org/20251128035149.392402-1-kartikey406@gmail.com [1]
Reported-by: Deepanshu Kartikey <kartikey406@gmail.com>
Reported-by: syzbot+94048264da5715c251f9@syzkaller.appspotmail.com
Tested-by: syzbot+94048264da5715c251f9@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=94048264da5715c251f9
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-7-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:31 +01:00
Christian Brauner 416b0d1659
namespace: convert open_tree_attr() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-6-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:31 +01:00
Christian Brauner 542a406543
namespace: convert open_tree() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-5-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:31 +01:00
Christian Brauner fbe58faa69
fhandle: convert do_handle_open() to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-4-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:31 +01:00
Christian Brauner a5fa9ab846
eventfd: convert do_eventfd() to FD_PREPARE()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-3-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:31 +01:00
Christian Brauner 8797dd5600
anon_inodes: convert to FD_ADD()
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-2-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:31 +01:00
Christian Brauner 011703a9ac
file: add FD_{ADD,PREPARE}()
I've been playing with this to allow for moderately flexible usage of
the get_unused_fd_flags() + create file + fd_install() pattern that's
used quite extensively.

How callers allocate files is really heterogeneous so it's not really
convenient to fold them into a single class. It's possible to split them
into subclasses like for anon inodes. I think that's not necessarily
nice either.

My take is to add two primitives:
(1) FD_ADD() for the simple cases where a file is installed:

    fd = FD_ADD(O_CLOEXEC, open_file(some, args));
    if (fd >= 0)
            kvm_get_kvm(vcpu->kvm);
    return fd;

(2) FD_PREPARE() that captures all the cases where access to the fd or
    file, or additional work before publishing the fd, is needed:

    FD_PREPARE(fdf, open_flag, file_open_handle(&path, open_flag));
    if (fdf.err)
            return fdf.err;

    if (copy_to_user(/* something something */))
            return -EFAULT;

    return fd_publish(fdf);

I've converted all of the easy cases over to it and it gets rid of an
awful lot of convoluted cleanup logic.

It's centered around struct fd_prepare. FD_PREPARE() encapsulates all of
the allocation and cleanup logic and must be followed by a call to
fd_publish() which associates the fd with the file and installs it into
the caller's fdtable. If fd_publish() isn't called, both are deallocated.

It mandates a specific order, namely that first we allocate the fd and
then instantiate the file. But that shouldn't be a problem; nearly
everyone I've converted uses this exact pattern anyway.

There's a bunch of additional cases where it would be easy to convert
them to this pattern. For example, the whole sync file stuff in dma
currently returns the containing structure of the file instead of the
file itself even though it's only used to allocate files. Changing that
would make it fall into the FD_PREPARE() pattern easily. I've not done
that work yet.

There's room for extending this in a way that we'd have subclasses for
some particularly often-used patterns but as I said I'm not even sure
that's worth it.

Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-1-b6efa1706cfd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 12:42:23 +01:00
Chen Ni 2579e21be5
ovl: remove unneeded semicolon
Remove unnecessary semicolons reported by Coccinelle/coccicheck and the
semantic patch at scripts/coccinelle/misc/semicolon.cocci.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Fixes: 7ab96df840 ("VFS/nfsd/cachefiles/ovl: add start_creating() and end_creating()")
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 11:05:52 +01:00
Jeff Layton 4be9e04ebf
vfs: add needed headers for new struct delegation definition
The definition of struct delegation uses stdint.h integer types. Add the
necessary headers to ensure that always works.

Fixes: 1602bad16d ("vfs: expose delegation support to userland")
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 10:55:34 +01:00
Mateusz Guzik ca0d620b0a
dcache: touch up predicts in __d_lookup_rcu()
Rationale is that if the parent dentry is the same and the length is the
same, then you have to be unlucky for the name to not match.

At the same time the dentry was literally just found on the hash, so you
have to be even more unlucky to determine it is unhashed.

While here add commentary on why d_unhashed() is necessary. It was
already removed once and brought back in:
2e321806b6 ("Revert "vfs: remove unnecessary d_unhashed() check from __d_lookup_rcu"")

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251127131526.4137768-1-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 10:31:45 +01:00
Randy Dunlap 01c9c30aae
filelock: __fcntl_getlease: fix kernel-doc warnings
Use the correct function name and add description for the @flavor
parameter to avoid these kernel-doc warnings:

Warning: fs/locks.c:1706 function parameter 'flavor' not described in
 '__fcntl_getlease'
WARNING: fs/locks.c:1706 expecting prototype for fcntl_getlease().
 Prototype was for __fcntl_getlease() instead

Fixes: 1602bad16d ("vfs: expose delegation support to userland")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251128000826.457120-1-rdunlap@infradead.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 10:30:41 +01:00
Neil Brown eeec741ee0
nfsd: fix end_creating() conversion
Avoid a double-unlock as nfs_create_locked() will have unlocked the
parent and do the dput() manually.

Christian Brauner <brauner@kernel.org> says:

I've taken Neil's proposed fix from [1] and added a commit message.

Fixes: https://lore.kernel.org/202511252132.2c621407-lkp@intel.com [1]
Fixes: bd6ede8a06 ("VFS/nfsd/cachefiles/ovl: introduce start_removing() and end_removing()")
Signed-off-by: Neil Brown <neil@brown.name>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-28 09:51:16 +01:00
Sebastian Andrzej Siewior 37de2dbc31 debugobjects: Use LD_WAIT_CONFIG instead of LD_WAIT_SLEEP
fill_pool_map is used to suppress nesting violations caused by acquiring
a spinlock_t (from within the memory allocator) while holding a
raw_spinlock_t. The annotation used is wrong.

LD_WAIT_SLEEP is for always-sleeping lock types such as mutex_t.
LD_WAIT_CONFIG is for lock types which spin normally but sleep on
PREEMPT_RT, such as spinlock_t.

Use LD_WAIT_CONFIG as override.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251127153652.291697-3-bigeasy@linutronix.de
2025-11-27 16:55:34 +01:00
Sebastian Andrzej Siewior 06e0ae988f debugobjects: Allow to refill the pool before SYSTEM_SCHEDULING
The pool of free objects is refilled on several occasions such as object
initialisation. On PREEMPT_RT refilling is limited to preemptible
sections due to sleeping locks used by the memory allocator. The system
boots with interrupts disabled, so the pool cannot be refilled.

If too many objects are initialized and the pool gets empty then
debugobjects disables itself.

Refilling can also happen early in boot with interrupts disabled as
long as the scheduler is not operational. If the scheduler cannot
preempt a task then a sleeping lock cannot be contended.

Additionally, allow refilling the pool while the scheduler is not yet
operational.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251127153652.291697-2-bigeasy@linutronix.de
2025-11-27 16:55:34 +01:00
Brendan Jackman 3d1f108845 x86/mm: Delete disabled debug code
This code doesn't run. Since 2008:

  4f9c11dd49 ("x86, 64-bit: adjust mapping of physical pagetables to work with Xen")

the kernel has gained more flexible logging and tracing capabilities;
presumably if anyone wanted to take advantage of this log message they would
have got rid of the "if (0)" so they could use these capabilities.

Since they haven't, just delete it.

Signed-off-by: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20251003-x86-init-cleanup-v1-1-f2b7994c2ad6@google.com
2025-11-27 14:32:16 +01:00
Stefan Hajnoczi ebf8538979
MAINTAINERS: add German Maglione as virtiofs co-maintainer
German Maglione is a co-maintainer of the virtiofsd userspace device
implementation (https://gitlab.com/virtio-fs/virtiofsd) and is currently
one of the most active virtiofs developers outside the kernel.

I have not worked on virtiofs except to review kernel patches for a few
years now and would like German to take over from me gradually. It is
healthier to have a kernel maintainer who is actively involved. I expect
to remove myself in a few months.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Link: https://patch.msgid.link/20251126211548.598469-1-stefanha@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-27 10:00:09 +01:00
Peter Zijlstra b0a848f4a4 x86/bugs: Make i386 use GENERIC_BUG_RELATIVE_POINTERS
Linus figured less #ifdef is more better and making x86-32 use
GENERIC_BUG_RELATIVE_POINTERS removes one layer of macro magic from
the bug.h bits.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-11-27 09:32:48 +01:00
Peter Zijlstra d62e4f2b95 x86/bug: Fix BUG_FORMAT vs KASLR
Encoding a relative NULL pointer doesn't work with KASLR. When the
whole kernel image gets shifted, the __bug_table and the target string
get shifted by the same amount and the relative offset is preserved.

However, when the target is an absolute 0 value and the __bug_table
gets moved about, the end result is a pointer equivalent to
kaslr_offset(), not NULL.

Notably, this will generate SHN_UNDEF relocations, and Ard would
really like to not have those at all.

Use the empty string to denote no-string.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-11-27 09:32:47 +01:00
Alexandre Chartre 59bfa64082 objtool: Build with disassembly can fail when including bfd.h
Building objtool with disassembly support can fail when including
the bfd.h file:

  In file included from tools/objtool/include/objtool/arch.h:108,
                   from check.c:14:
  /usr/include/bfd.h:35:2: error: #error config.h must be included before this header
     35 | #error config.h must be included before this header
        |  ^~~~~

This check is present in the bfd.h file generated from the binutils
source code, but it is not necessarily present in the bfd.h file
provided in a binutils package (for example, it is not present in
the binutils RPM).

The solution to this issue is to define the PACKAGE macro before
including bfd.h. This is the solution suggested by the binutils
developer in bug 14243, and it is used by other kernel tools
which also use bfd.h (perf and bpf).
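
A hedged sketch of the workaround (the PACKAGE value is arbitrary and
only illustrative):

  /* config.h normally defines PACKAGE; define it ourselves so the
   * packaged bfd.h does not hit its "#error config.h must be
   * included before this header" check. */
  #ifndef PACKAGE
  #define PACKAGE "objtool"
  #endif
  #include <bfd.h>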

Fixes: 5995330382 ("objtool: Disassemble code with libopcodes instead of running objdump")
Closes: https://lore.kernel.org/all/3fa261fd-3b46-4cbe-b48d-7503aabc96cb@oracle.com/
Reported-by: Nathan Chancellor <nathan@kernel.org>
Suggested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://sourceware.org/bugzilla/show_bug.cgi?id=14243
Link: https://patch.msgid.link/20251126134519.1760889-1-alexandre.chartre@oracle.com
2025-11-27 09:32:46 +01:00
Thomas Gleixner 2437f79880 - Use 64-bits for timer compensation for IoT usage where the suspend
time is much longer than what 32-bits can provide (Enlin Mu)
 
 - Add delay support on sp804 for ARM32 platforms (Stephen Eta Zhou)
 
 - Fix missing resource release on error in the probe path of the
   ralink driver (Haotian Zhang)
 
 - Fix double deregistration on probe failure in the NXP STM driver
   (Johan Hovold)
 
 - Disable runtime PM for the Renesas SH CMT timer because it is
   incompatible with PREEMPT_RT=y (Niklas Söderlund)
 
 - Fix section mismatches in the NXP STM driver (Johan Hovold)
 
 - Prevent unbinding the NXP PIT, STM and MMIO ARM Arch timers as
   the code does not support bind/unbind (Johan Hovold)
 
 - Use the clocksource instead of ticks on the RDA8810PL platform
   (Enlin Mu)
 
 - Drop the unused module alias for the STM32-LP (Johan Hovold)
 
 - Add Realtek system timer driver (Hao-Wen Ting)
 -----BEGIN PGP SIGNATURE-----
 
 iQEzBAABCAAdFiEEGn3N4YVz0WNVyHskqDIjiipP6E8FAmkm6t4ACgkQqDIjiipP
 6E8xqQgAlXnV3vRJmEbjd3ILECvbKMLI2haHV2eA+75P+DvbfriL+ePMHkfkOPI6
 CC5UhCSy410cQLO88tzy5+9K8Po2KnHxb+lVS2P6zzcdefL5ZWMZ9Q+CAOwSo1s9
 An1A4nUgcTB52mAR+jlz++SF1VV/fMvskMrtiTg8bSIScSc+xi4sEC3GaZR09qSG
 RODtzmVsyeoHQ1u6ziRJen8GzpX1q6vUP0eAAr+vXqTUXdCuUL8P20h2mwzxPJWH
 mFo53OuKVbTMOoY1Av7euvO1ZZ1tsHsS4NxJfD1qatq+eh1As1dxYodB4dp44qZt
 jjnVuj0QrE40VB6EnHAA4kKb6WWpow==
 =WXsR
 -----END PGP SIGNATURE-----

Merge tag 'timers-v6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/daniel.lezcano/linux into timers/clocksource

Pull clocksource/event changes from Daniel Lezcano:

    - Use 64-bits for timer compensation for IoT usage where the suspend
      time is much longer than what 32-bits can provide (Enlin Mu)

    - Add delay support on sp804 for ARM32 platforms (Stephen Eta Zhou)

    - Fix missing resource release on error in the probe path of the
      ralink driver (Haotian Zhang)

    - Fix double deregistration on probe failure in the NXP STM driver
      (Johan Hovold)

    - Disable runtime PM for the Renesas SH CMT timer because it is
      incompatible with PREEMPT_RT=y (Niklas Söderlund)

    - Fix section mismatches in the NXP STM driver (Johan Hovold)

    - Prevent unbinding the NXP PIT, STM and MMIO ARM Arch timers as
      the code does not support bind/unbind (Johan Hovold)

    - Use the clocksource instead of ticks on the RDA8810PL platform
      (Enlin Mu)

    - Drop the unused module alias for the STM32-LP (Johan Hovold)

    - Add Realtek system timer driver (Hao-Wen Ting)

Link: https://lore.kernel.org/all/9303b790-28d4-4bd9-b01d-28fb05493596@linaro.org
2025-11-26 15:36:52 +01:00
Christian Brauner f403e1206b
Merge patch series "fs: tidy up step_into() & friends before inlining"
Cleanup step_into() and walk_component() and inline them both.

* patches from https://patch.msgid.link/20251120003803.2979978-1-mjguzik@gmail.com:
  fs: inline step_into() and walk_component()
  fs: tidy up step_into() & friends before inlining

Link: https://patch.msgid.link/20251120003803.2979978-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:52:07 +01:00
Mateusz Guzik 177fdbae39
fs: inline step_into() and walk_component()
The primary consumer is link_path_walk(), calling walk_component() every
time which in turn calls step_into().

Inlining these saves the overhead of 2 function calls per path component
and allows the compiler to do a better job optimizing them in place.

step_into() had absolutely atrocious assembly to facilitate the
slowpath. In order to lessen the burden at the callsite all the hard
work is moved into step_into_slowpath() and instead an inline-able
fastpath is implemented for rcu-walk.

The new fastpath is a stripped-down version of step_into()'s RCU
handling with a d_managed() check from handle_mounts().

Benchmarked as follows on Sapphire Rapids:
1. the "before" was a kernel with not-yet-merged optimizations (notably
   elision of calls to security_inode_permission() and marking ext4
   inodes as not having acls as applicable)
2. "after" is the same + the prep patch + this patch
3. benchmark consists of issuing 205 calls to access(2) in a loop with
   pathnames lifted out of gcc and the linker building real code, most
   of which have several path components and 118 of which fail with
   -ENOENT.
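
A hedged sketch of such a loop (the paths and counts here are made up;
the real list was lifted from a gcc/linker invocation):

  #include <stdio.h>
  #include <time.h>
  #include <unistd.h>

  static const char *paths[] = {
          "/usr/lib/gcc/x86_64-linux-gnu/13/crtbeginS.o",
          "/usr/lib/x86_64-linux-gnu/libc.so",
          "/no/such/file/anywhere",       /* fails with -ENOENT */
  };

  int main(void)
  {
          const int npaths = sizeof(paths) / sizeof(paths[0]);
          struct timespec start, end;
          long rounds = 200000;
          double secs;

          clock_gettime(CLOCK_MONOTONIC, &start);
          for (long r = 0; r < rounds; r++)
                  for (int i = 0; i < npaths; i++)
                          access(paths[i], R_OK);
          clock_gettime(CLOCK_MONOTONIC, &end);

          secs = (end.tv_sec - start.tv_sec) +
                 (end.tv_nsec - start.tv_nsec) / 1e9;
          printf("%.0f ops/s\n", rounds / secs);
          return 0;
  }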

Result in terms of ops/s:
before:	21619
after:	22536 (+4%)

profile before:
  20.25%  [kernel]                  [k] __d_lookup_rcu
  10.54%  [kernel]                  [k] link_path_walk
  10.22%  [kernel]                  [k] entry_SYSCALL_64
   6.50%  libc.so.6                 [.] __GI___access
   6.35%  [kernel]                  [k] strncpy_from_user
   4.87%  [kernel]                  [k] step_into
   3.68%  [kernel]                  [k] kmem_cache_alloc_noprof
   2.88%  [kernel]                  [k] walk_component
   2.86%  [kernel]                  [k] kmem_cache_free
   2.14%  [kernel]                  [k] set_root
   2.08%  [kernel]                  [k] lookup_fast

after:
  23.38%  [kernel]                  [k] __d_lookup_rcu
  11.27%  [kernel]                  [k] entry_SYSCALL_64
  10.89%  [kernel]                  [k] link_path_walk
   7.00%  libc.so.6                 [.] __GI___access
   6.88%  [kernel]                  [k] strncpy_from_user
   3.50%  [kernel]                  [k] kmem_cache_alloc_noprof
   2.01%  [kernel]                  [k] kmem_cache_free
   2.00%  [kernel]                  [k] set_root
   1.99%  [kernel]                  [k] lookup_fast
   1.81%  [kernel]                  [k] do_syscall_64
   1.69%  [kernel]                  [k] entry_SYSCALL_64_safe_stack

While walk_component() and step_into() of course disappear from the
profile, link_path_walk() barely gains any overhead despite the
inlining, thanks to the added fast path, while completing more walks
per second.

I did not investigate why overhead grew a lot on __d_lookup_rcu().

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251120003803.2979978-2-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:52:02 +01:00
Mateusz Guzik 9d2a6211a7
fs: tidy up step_into() & friends before inlining
Symlink handling is already marked as unlikely and pushing out some of
it into pick_link() reduces register spillage on entry to step_into()
with gcc 14.2.

The compiler needed additional convincing that handle_mounts() is
unlikely to fail.

At the same time neither clang nor gcc could be convinced to tail-call
into pick_link().

While pick_link() takes the address of a stack-based object as an argument
(which definitely prevents the optimization), splitting it into a separate
<dentry, mount> tuple did not help. The issue persists even when
compiled without the stack protector. As such, nothing was done about this
for the time being so as not to grow the diff.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251120003803.2979978-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:52:02 +01:00
Christian Brauner 1ed45a4ddc
Merge patch series "re-enable IOCB_NOWAIT writes to files v2"
Christoph Hellwig <hch@lst.de> says:

[Fix] the layering bypass in btrfs when updating timestamps on device
files for devices removed from btrfs usage, and FMODE_NOCMTIME handling
in the VFS now that nfsd started using it.  Note that I'm still not sure
that nfsd usage is fully correct for all file systems, as only XFS
explicitly supports FMODE_NOCMTIME, but at least the generic code does
the right thing now.

* patches from https://patch.msgid.link/20251120064859.2911749-1-hch@lst.de:
  orangefs: use inode_update_timestamps directly
  btrfs: fix the comment on btrfs_update_time
  btrfs: use vfs_utimes to update file timestamps
  fs: export vfs_utimes
  fs: lift the FMODE_NOCMTIME check into file_update_time_flags
  fs: refactor file timestamp update logic

Link: https://patch.msgid.link/20251120064859.2911749-1-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:50:17 +01:00
Christoph Hellwig eff094a58d
orangefs: use inode_update_timestamps directly
Orangefs has no i_version handling and __orangefs_setattr already
explicitly marks the inode dirty.  So instead of using
the flags return value from generic_update_time, just call the
lower level inode_update_timestamps helper directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251120064859.2911749-7-hch@lst.de
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:50:10 +01:00
Christoph Hellwig f981264ae7
btrfs: fix the comment on btrfs_update_time
Since commit e41f941a23 ("Btrfs: move over to use ->update_time") this
is not a copy of the high-level file_update_time helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251120064859.2911749-6-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:50:10 +01:00
Christoph Hellwig ded9958704
btrfs: use vfs_utimes to update file timestamps
Btrfs updates the device node timestamps for block device special files
when it stops using the device.

Commit 8f96a5bfa1 ("btrfs: update the bdev time directly when closing")
switched that update from the correct layering to directly call the
low-level helper on the bdev inode.  This is wrong and got fixed in
commit 54fde91f52 ("btrfs: update device path inode time instead of
bd_inode") by updating the file system inode instead of the bdev inode,
but this kept the incorrect bypassing of the VFS interfaces and file
system ->update_times method.  Fix this by using the proper vfs_utimes
interface.

Fixes: 8f96a5bfa1 ("btrfs: update the bdev time directly when closing")
Fixes: 54fde91f52 ("btrfs: update device path inode time instead of bd_inode")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251120064859.2911749-5-hch@lst.de
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:50:10 +01:00
Christoph Hellwig 0139836652
fs: export vfs_utimes
This will be used to replace an incorrect direct call into
generic_update_time in btrfs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251120064859.2911749-4-hch@lst.de
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:50:10 +01:00
Christoph Hellwig 7f30e7a423
fs: lift the FMODE_NOCMTIME check into file_update_time_flags
FMODE_NOCMTIME used to be just a hack for the legacy XFS handle-based
"invisible I/O", but commit e5e9b24ab8 ("nfsd: freeze c/mtime updates
with outstanding WRITE_ATTRS delegation") started using it from
generic callers.

I'm not sure other file systems are actually ready for this in general,
so the above commit should get a closer look, but for it to make any
sense, file_update_time needs to respect the flag.

Lift the check from file_modified_flags to file_update_time so that
users of file_update_time inherit the behavior and so that all the
checks are done in one place.
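
A minimal sketch of the lifted check (simplified and with a reduced
signature, not the exact kernel helper):

  /* Sketch: skip c/mtime updates for files opened with FMODE_NOCMTIME,
   * so that every caller of the helper inherits the check.
   */
  static int file_update_time_flags(struct file *file, unsigned int flags)
  {
          if (file->f_mode & FMODE_NOCMTIME)
                  return 0;

          return inode_update_time(file_inode(file), S_MTIME | S_CTIME);
  }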

Fixes: e5e9b24ab8 ("nfsd: freeze c/mtime updates with outstanding WRITE_ATTRS delegation")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251120064859.2911749-3-hch@lst.de
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:50:10 +01:00
Christoph Hellwig 3cd9a42f1b
fs: refactor file timestamp update logic
Currently the two high-level APIs use two helper functions to implement
almost all of the logic.  Refactor the two helpers and the common logic
into a new file_update_time_flags routine that gets passed the iocb
flags, or 0 in the case of file_update_time, so that the entire logic is
contained in a single function and can be easily understood and modified.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251120064859.2911749-2-hch@lst.de
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-26 14:50:10 +01:00
Hao-Wen Ting d1780dce95 clocksource/drivers: Add Realtek system timer driver
Add a system timer driver for Realtek SoCs.

This driver registers the 1 MHz global hardware counter on Realtek
platforms as a clock event device. Since this hardware counter starts
counting automatically after SoC power-on, no clock initialization is
required. Because the counter does not stop or get affected by CPU power
down, and it supports oneshot mode, it is typically used as a tick
broadcast timer.
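
As a rough illustration of how such a free-running 1 MHz counter is
exposed as a oneshot clock event device (names and deltas are
hypothetical, not the actual driver code):

  static struct clock_event_device rtk_systimer_ced = {
          .name           = "rtk-systimer",
          .features       = CLOCK_EVT_FEAT_ONESHOT,
          .rating         = 300,
          .set_next_event = rtk_systimer_set_next_event, /* programs the compare match */
  };

  /* 1 MHz counter; min/max delta values chosen for illustration only */
  clockevents_config_and_register(&rtk_systimer_ced, 1000000, 1, 0xffffffff);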

Signed-off-by: Hao-Wen Ting <haowen.ting@realtek.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251126060110.198330-3-haowen.ting@realtek.com
2025-11-26 11:25:15 +01:00
Hao-Wen Ting 40caba2bd0 dt-bindings: timer: Add Realtek SYSTIMER
The Realtek SYSTIMER (System Timer) is a 64-bit global hardware counter
operating at a fixed 1MHz frequency. Thanks to its compare match
interrupt capability, the timer natively supports oneshot mode for tick
broadcast functionality.

Signed-off-by: Hao-Wen Ting <haowen.ting@realtek.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://patch.msgid.link/20251126060110.198330-2-haowen.ting@realtek.com
2025-11-26 11:25:15 +01:00
Johan Hovold ed92a968a9 clocksource/drivers/stm32-lp: Drop unused module alias
The driver cannot be built as a module so drop the unused platform
module alias.

Note that platform aliases are not needed for OF probing should it ever
become possible to build the driver as a module.

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20251111154516.1698-1-johan@kernel.org
2025-11-26 11:25:15 +01:00
Enlin Mu 627f3f3716 clocksource/drivers/rda: Add sched_clock_register for RDA8810PL SoC
The current system log timestamp accuracy is tick based, which cannot
meet the usage requirements; nanosecond resolution is needed.
Therefore, the sched_clock_register function needs to be added.
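
A minimal sketch of the registration (the reader and register name are
illustrative, not the actual RDA8810PL code):

  static u64 notrace rda_sched_clock_read(void)
  {
          /* hypothetical counter register of the RDA timer block */
          return readl_relaxed(rda_timer_base + RDA_TIMER_CURVAL);
  }

  /* 32-bit counter running at the timer clock rate */
  sched_clock_register(rda_sched_clock_read, 32, timer_rate);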

[ dlezcano: Fixed typos ]

Signed-off-by: Enlin Mu <enlin.mu@unisoc.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20251107063347.3692-1-enlin.mu@linux.dev
2025-11-26 11:25:11 +01:00
Johan Hovold 6a2416892e clocksource/drivers/nxp-stm: Prevent driver unbind
Clockevents cannot be deregistered so suppress the bind attributes to
prevent the driver from being unbound and releasing the underlying
resources after registration.

Even if the driver can currently only be built-in, also switch to
builtin_platform_driver() to prevent it from being unloaded should
modular builds ever be enabled.
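
The pattern boils down to (sketch, identifiers illustrative):

  static struct platform_driver nxp_stm_driver = {
          .probe  = nxp_stm_probe,
          .driver = {
                  .name                 = "nxp-stm",
                  .suppress_bind_attrs  = true, /* no bind/unbind via sysfs */
          },
  };
  builtin_platform_driver(nxp_stm_driver);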

Fixes: cec32ac758 ("clocksource/drivers/nxp-timer: Add the System Timer Module for the s32gx platforms")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20251111153226.579-4-johan@kernel.org
2025-11-26 11:25:03 +01:00
Johan Hovold e25f964cf4 clocksource/drivers/nxp-pit: Prevent driver unbind
The driver does not support unbinding (e.g. as clockevents cannot be
deregistered) so suppress the bind attributes to prevent the driver from
being unbound and rebound after registration (and disabling the timer
when reprobing fails).

Even if the driver can currently only be built-in, also switch to
builtin_platform_driver() to prevent it from being unloaded should
modular builds ever be enabled.

Fixes: bee33f22d7 ("clocksource/drivers/nxp-pit: Add NXP Automotive s32g2 / s32g3 support")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20251111153226.579-3-johan@kernel.org
2025-11-26 11:24:57 +01:00
Johan Hovold 6aa10f0e2e clocksource/drivers/arm_arch_timer_mmio: Prevent driver unbind
Clockevents cannot be deregistered so suppress the bind attributes to
prevent the driver from being unbound and releasing the underlying
resources after registration.

Fixes: 4891f01527 ("clocksource/drivers/arm_arch_timer: Add standalone MMIO driver")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://patch.msgid.link/20251111153226.579-2-johan@kernel.org
2025-11-26 11:24:47 +01:00
Johan Hovold b452d2c97e clocksource/drivers/nxp-stm: Fix section mismatches
Platform drivers can be probed after their init sections have been
discarded (e.g. on probe deferral or manual rebind through sysfs) so the
probe function must not live in init. Device managed resource actions
similarly cannot be discarded.

The "_probe" suffix of the driver structure name prevents modpost from
warning about this so replace it to catch any similar future issues.
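
In other words (sketch, names hypothetical):

  /* Wrong: the probe path can run after .init.text was discarded,
   * e.g. on probe deferral or a manual rebind through sysfs.
   */
  static int __init stm_timer_probe(struct platform_device *pdev);

  /* Right: keep the probe function (and devm action callbacks) in
   * regular sections.
   */
  static int stm_timer_probe(struct platform_device *pdev);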

Fixes: cec32ac758 ("clocksource/drivers/nxp-timer: Add the System Timer Module for the s32gx platforms")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: stable@vger.kernel.org	# 6.16
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20251017054943.7195-1-johan@kernel.org
2025-11-26 11:24:44 +01:00
Niklas Söderlund 62524f285c clocksource/drivers/sh_cmt: Always leave device running after probe
The CMT device can be used as both a clocksource and a clockevent
provider. The driver tries to be smart and power itself on and off, as
well as enable and disable its clock when it's not in operation.
This behavior is slightly altered if the CMT is used as an early
platform device, in which case the device is left powered on after probe,
but the clock is still enabled and disabled at runtime.

This has worked for a long time, but recent improvements in PREEMPT_RT
and PROVE_LOCKING have highlighted an issue. As the CMT registers itself
as a clockevent provider, clockevents_register_device(), it needs to use
raw spinlocks internally as this is the context in which the clockevent
framework interacts with the CMT driver. However, in the context of
holding a raw spinlock the CMT driver can't really manage its power
state or clock with calls to pm_runtime_*() and clk_*() as these calls
end up in other platform drivers using regular spinlocks to control
power and clocks.

This mix of spinlock contexts trips a lockdep warning.

    =============================
    [ BUG: Invalid wait context ]
    6.17.0-rc3-arm64-renesas-03071-gb3c4f4122b28-dirty #21 Not tainted
    -----------------------------
    swapper/1/0 is trying to lock:
    ffff00000898d180 (&dev->power.lock){-...}-{3:3}, at: __pm_runtime_resume+0x38/0x88
    ccree e6601000.crypto: ARM CryptoCell 630P Driver: HW version 0xAF400001/0xDCC63000, Driver version 5.0
    other info that might help us debug this:
    ccree e6601000.crypto: ARM ccree device initialized
    context-{5:5}
    2 locks held by swapper/1/0:
     #0: ffff80008173c298 (tick_broadcast_lock){-...}-{2:2}, at: __tick_broadcast_oneshot_control+0xa4/0x3a8
     #1: ffff0000089a5858 (&ch->lock){....}-{2:2}
    usbcore: registered new interface driver usbhid
    , at: sh_cmt_start+0x30/0x364
    stack backtrace:
    CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Not tainted 6.17.0-rc3-arm64-renesas-03071-gb3c4f4122b28-dirty #21 PREEMPT
    Hardware name: Renesas Salvator-X 2nd version board based on r8a77965 (DT)
    Call trace:
     show_stack+0x14/0x1c (C)
     dump_stack_lvl+0x6c/0x90
     dump_stack+0x14/0x1c
     __lock_acquire+0x904/0x1584
     lock_acquire+0x220/0x34c
     _raw_spin_lock_irqsave+0x58/0x80
     __pm_runtime_resume+0x38/0x88
     sh_cmt_start+0x54/0x364
     sh_cmt_clock_event_set_oneshot+0x64/0xb8
     clockevents_switch_state+0xfc/0x13c
     tick_broadcast_set_event+0x30/0xa4
     __tick_broadcast_oneshot_control+0x1e0/0x3a8
     tick_broadcast_oneshot_control+0x30/0x40
     cpuidle_enter_state+0x40c/0x680
     cpuidle_enter+0x30/0x40
     do_idle+0x1f4/0x26c
     cpu_startup_entry+0x34/0x40
     secondary_start_kernel+0x11c/0x13c
     __secondary_switched+0x74/0x78

For non-PREEMPT_RT builds this is not really an issue, but for
PREEMPT_RT builds, where normal spinlocks can sleep, it might be. Be
cautious and always leave the power and clock running after
probe.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://patch.msgid.link/20251016182022.1837417-1-niklas.soderlund+renesas@ragnatech.se
2025-11-26 11:24:40 +01:00
Johan Hovold 6b38a8b31e clocksource/drivers/stm: Fix double deregistration on probe failure
The purpose of the devm_add_action_or_reset() helper is to call the
action function in case adding an action ever fails, so drop the clock
source deregistration from the error path to avoid deregistering twice.
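
A minimal sketch of the intended usage (identifiers illustrative):

  ret = devm_add_action_or_reset(dev, stm_clocksource_unregister, cs);
  if (ret)
          return ret; /* the action already ran; do not also deregister here */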

Fixes: cec32ac758 ("clocksource/drivers/nxp-timer: Add the System Timer Module for the s32gx platforms")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20251017055039.7307-1-johan@kernel.org
2025-11-26 11:24:37 +01:00
Haotian Zhang 2ba8e2aae1 clocksource/drivers/ralink: Fix resource leaks in init error path
The ralink_systick_init() function does not release all acquired resources
on its error paths. If irq_of_parse_and_map() or a subsequent call fails,
the previously created I/O memory mapping and IRQ mapping are leaked.

Add goto-based error handling labels to ensure that all allocated
resources are correctly freed.
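
The shape of the fix is the classic goto unwind (sketch, with a
stand-in for the remaining init steps):

  base = of_iomap(np, 0);
  if (!base)
          return -ENXIO;

  irq = irq_of_parse_and_map(np, 0);
  if (!irq) {
          ret = -EINVAL;
          goto err_unmap;
  }

  ret = register_remaining_pieces(np);    /* hypothetical helper */
  if (ret)
          goto err_dispose;

  return 0;

  err_dispose:
          irq_dispose_mapping(irq);
  err_unmap:
          iounmap(base);
          return ret;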

Fixes: 1f2acc5a8a ("MIPS: ralink: Add support for systick timer found on newer ralink SoC")
Signed-off-by: Haotian Zhang <vulab@iscas.ac.cn>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20251030090710.1603-1-vulab@iscas.ac.cn
2025-11-26 11:24:34 +01:00
Stephen Eta Zhou 640594a04f clocksource/drivers/timer-sp804: Fix read_current_timer() issue when clock source is not registered
Register a valid read_current_timer() function for the
SP804 timer on ARM32.

On ARM32 platforms, when the SP804 timer is selected as the clocksource,
the driver does not register a valid read_current_timer() function.
As a result, features that rely on this API—such as rdseed—consistently
return incorrect values.

To fix this, a delay_timer structure is registered during the SP804
driver's initialization. The read_current_timer() function is implemented
using the existing sp804_read() logic, and the timer frequency is reused
from the already-initialized clocksource.
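
A rough sketch of the ARM32 wiring (simplified, not the actual patch):

  static unsigned long sp804_read_current_timer(void)
  {
          return sp804_read();    /* existing counter read logic */
  }

  static struct delay_timer sp804_delay_timer = {
          .read_current_timer = sp804_read_current_timer,
  };

  /* reuse the rate the already-registered clocksource runs at */
  sp804_delay_timer.freq = rate;
  register_current_timer_delay(&sp804_delay_timer);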

Signed-off-by: Stephen Eta Zhou <stephen.eta.zhou@gmail.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20250525-sp804-fix-read_current_timer-v4-1-87a9201fa4ec@gmail.com
2025-11-26 11:24:32 +01:00
Enlin Mu 576c564ec3 clocksource/drivers/sprd: Enable register for timer counter from 32 bit to 64 bit
Using 32 bits for suspend compensation, the maximum compensation time is
about 36 hours (the working clock is 32 kHz). In some IoT devices, the
suspend time may be long, even exceeding 36 hours. Therefore, a 64-bit
timer counter is needed for counting.
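
For reference, the 36 hour figure follows directly from the counter
width: 2^32 ticks at a 32768 Hz working clock is 131072 seconds, or
roughly 36.4 hours, after which a 32-bit counter wraps; a 64-bit
counter at the same rate takes millions of years to wrap.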

Signed-off-by: Enlin Mu <enlin.mu@unisoc.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Link: https://patch.msgid.link/20251106021830.34846-1-enlin.mu@linux.dev
2025-11-26 11:24:26 +01:00
Thomas Gleixner 653fda7ae7 sched/mmcid: Switch over to the new mechanism
Now that all pieces are in place, change the implementations of
sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict
ownership scheme and switch context_switch() over to use the new
mm_cid_schedin() functionality.

The common case is that there is no mode change required, which makes
fork() and exit() just update the user count and the constraints.

In case a new user would exceed the CID space limit, the fork() context
handles the transition to per CPU mode with mm::mm_cid::mutex held. exit()
handles the transition back to per task mode when the user count drops
below the switch back threshold. fork() might also be forced to handle a
deferred switch back to per task mode, when an affinity change increased the
number of allowed CPUs enough.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.280380631@linutronix.de
2025-11-25 19:45:42 +01:00
Thomas Gleixner 9da6ccbcea sched/mmcid: Implement deferred mode change
When affinity changes cause an increase of the number of CPUs allowed for
tasks which are related to an MM, that might result in a situation where
the ownership mode can go back from per CPU mode to per task mode.

As affinity changes happen with the runqueue lock held, there is no way to do
the actual mode change and the required fixup right there.

Add the infrastructure to defer it to a workqueue. The scheduled work can
race with a fork() or exit(). Whatever happens first takes care of it.
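
The deferral itself is the standard workqueue pattern (sketch with
hypothetical names):

  static void mm_cid_mode_fixup_work(struct work_struct *work)
  {
          /* fork() or exit() may already have done the switch; whoever
           * gets there first wins, so recheck before doing the fixup.
           */
  }
  static DECLARE_WORK(mm_cid_fixup_work, mm_cid_mode_fixup_work);

  /* from the affinity change path, where the runqueue lock is held: */
  schedule_work(&mm_cid_fixup_work);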

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.216484739@linutronix.de
2025-11-25 19:45:42 +01:00
Thomas Gleixner c809f081fe irqwork: Move data struct to a types header
... to avoid header recursion hell.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.152813625@linutronix.de
2025-11-25 19:45:41 +01:00
Thomas Gleixner fbd0e71dc3 sched/mmcid: Provide CID ownership mode fixup functions
CIDs are either owned by tasks or by CPUs. The ownership mode depends on
the number of tasks related to an MM and the number of CPUs on which these
tasks are theoretically allowed to run. Theoretically because that
number is the superset of CPU affinities of all tasks, which only grows and
never shrinks.

Switching to per CPU mode happens when the user count becomes greater than
the maximum number of CIDs, which is calculated by:

	opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
	max_cids = min(1.25 * opt_cids, nr_cpu_ids);

The +25% allowance is useful for tight CPU masks in scenarios where only a
few threads are created and destroyed, to avoid frequent mode
switches. Though this allowance shrinks the closer opt_cids becomes to
nr_cpu_ids, which is the (unfortunate) hard ABI limit.

At the point of switching to per CPU mode the new user is not yet visible
in the system, so the task which initiated the fork() runs the fixup
function: mm_cid_fixup_tasks_to_cpu() walks the thread list and either
transfers each task's owned CID to the CPU the task runs on or drops it into
the CID pool if a task is not on a CPU at that point in time. Tasks which
schedule in before the task walk reaches them do the handover in
mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes it's
guaranteed that no task related to that MM owns a CID anymore.

Switching back to task mode happens when the user count goes below the
threshold which was recorded on the per CPU mode switch:

	pcpu_thrs = min(opt_cids - (opt_cids / 4), nr_cpu_ids / 2);

This threshold is updated when an affinity change increases the number of
allowed CPUs for the MM, which might cause a switch back to per task mode.
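
Put together, the mode switch thresholds amount to something like this
(illustrative sketch of the arithmetic above, not the actual kernel code):

  unsigned int opt_cids  = min(mm_cid->nr_cpus_allowed, mm_cid->users);
  unsigned int max_cids  = min(opt_cids + opt_cids / 4, nr_cpu_ids);    /* +25% */
  unsigned int pcpu_thrs = min(opt_cids - opt_cids / 4, nr_cpu_ids / 2);

  if (mm_cid->users > max_cids)
          switch_to_per_cpu_mode();       /* CIDs become CPU owned */
  else if (per_cpu_mode && mm_cid->users < pcpu_thrs)
          switch_to_per_task_mode();      /* CIDs become task owned again */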

If the switch back was initiated by an exiting task, then that task runs the
fixup function. If it was initiated by an affinity change, then it's run
either in the deferred update function in the context of a workqueue or by a
task which forks a new one or by a task which exits. Whatever happens
first. mm_cid_fixup_cpus_to_task() walks through the possible CPUs and
either transfers the CPU owned CIDs to a related task which runs on the CPU
or drops it into the pool. Tasks which schedule in on a CPU which the walk
did not cover yet do the handover themselves.

This transition from CPU to per task ownership happens in two phases:

 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the task
    CID and denotes that the CID is only temporarily owned by the
    task. When it schedules out the task drops the CID back into the
    pool if this bit is set.

 2) The initiating context walks the per CPU space and after completion
    clears mm:mm_cid.transit. After that point the CIDs are strictly
    task owned again.

This two phase transition is required to prevent CID space exhaustion
during the transition as a direct transfer of ownership would fail if
two tasks are scheduled in on the same CPU before the fixup freed per
CPU CIDs.

When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
related to that MM is owned by a CPU anymore.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.088189028@linutronix.de
2025-11-25 19:45:41 +01:00
Thomas Gleixner 9a723ed7fa sched/mmcid: Provide new scheduler CID mechanism
The MM CID management has two fundamental requirements:

  1) It has to guarantee that at no given point in time the same CID is
     used by concurrent tasks in userspace.

  2) The CID space must not exceed the number of possible CPUs in a
     system. While most allocators (glibc, tcmalloc, jemalloc) do not
     care about that, there seems to be at least some LTTng library
     depending on it.

The CID space compaction itself is not a functional correctness
requirement; it is only a useful optimization mechanism to reduce the
memory footprint in unused user space pools.

The optimal CID space is:

    min(nr_tasks, nr_cpus_allowed);

Where @nr_tasks is the number of actual user space threads associated with
the mm and @nr_cpus_allowed is the superset of all task affinities. It is
growth-only as it would be insane to take a racy snapshot of all task
affinities when the affinity of one task changes just to redo it 2
milliseconds later when the next task changes its affinity.

That means that as long as the number of tasks is lower than or equal to the
number of CPUs allowed, each task owns a CID. If the number of tasks
exceeds the number of CPUs allowed it switches to per CPU mode, where the
CPUs own the CIDs and the tasks borrow them as long as they are scheduled
in.

For transition periods CIDs can go beyond the optimal space as long as they
don't go beyond the number of possible CPUs.

The current upstream implementation adds overhead into task migration to
keep the CID with the task. It also has to do the CID space consolidation
work from a task work in the exit to user space path. As that work is
assigned to a random task related to an MM, this can inflict unwanted exit
latencies.

Implement the context switch parts of a strict ownership mechanism to
address this.

This removes most of the work from the task which schedules out. Only
during transitioning from per CPU to per task ownership it is required to
drop the CID when leaving the CPU to prevent CID space exhaustion. Other
than that scheduling out is just a single check and branch.

The task which schedules in has to check whether:

    1) The ownership mode changed
    2) The CID is within the optimal CID space

In stable situations this results in zero work. The only short disruption
is when ownership mode changes or when the associated CID is not in the
optimal CID space. The latter only happens when tasks exit and therefore
the optimal CID space shrinks.

That mechanism is strictly optimized for the common case where no change
happens. The only case where it actually causes a temporary one-time spike
is on mode changes, when and only when a lot of tasks related to an MM
schedule at exactly the same time and eventually have to compete for
allocating a CID from the bitmap.

In the sysbench test case which triggered the spinlock contention in the
initial CID code, __schedule() drops significantly in perf top on a 128
Core (256 threads) machine when running sysbench with 255 threads, which
fits into the task mode limit of 256 together with the parent thread:

  Upstream  rseq/perf branch  +CID rework
  0.42%     0.37%             0.32%          [k] __schedule

Increasing the number of threads to 256, which puts the test process into
per CPU mode looks about the same.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.023984859@linutronix.de
2025-11-25 19:45:41 +01:00
Thomas Gleixner 23343b6b09 sched/mmcid: Introduce per task/CPU ownership infrastructure
The MM CID management has two fundamental requirements:

  1) It has to guarantee that at no given point in time the same CID is
     used by concurrent tasks in userspace.

  2) The CID space must not exceed the number of possible CPUs in a
     system. While most allocators (glibc, tcmalloc, jemalloc) do not care
     about that, there seems to be at least librseq depending on it.

The CID space compaction itself is not a functional correctness
requirement; it is only a useful optimization mechanism to reduce the
memory footprint in unused user space pools.

The optimal CID space is:

    min(nr_tasks, nr_cpus_allowed);

Where @nr_tasks is the number of actual user space threads associated with
the mm and @nr_cpus_allowed is the superset of all task affinities. It is
growth-only as it would be insane to take a racy snapshot of all task
affinities when the affinity of one task changes just to redo it 2
milliseconds later when the next task changes its affinity.

That means that as long as the number of tasks is lower than or equal to the
number of CPUs allowed, each task owns a CID. If the number of tasks
exceeds the number of CPUs allowed it switches to per CPU mode, where the
CPUs own the CIDs and the tasks borrow them as long as they are scheduled
in.

For transition periods CIDs can go beyond the optimal space as long as they
don't go beyond the number of possible CPUs.

The current upstream implementation adds overhead into task migration to
keep the CID with the task. It also has to do the CID space consolidation
work from a task work in the exit to user space path. As that work is
assigned to a random task related to an MM, this can inflict unwanted exit
latencies.

This can be done differently by implementing a strict CID ownership
mechanism. Either the CIDs are owned by the tasks or by the CPUs. The
latter provides less locality when tasks are heavily migrating, but there
is no justification to optimize for overcommit scenarios and thereby
penalize everyone else.

Provide the basic infrastructure to implement this:

  - Change the UNSET marker to BIT(31) from ~0U
  - Add the ONCPU marker as BIT(30)
  - Add the TRANSIT marker as BIT(29)

That makes checking for ownership trivial and provides a simple check for
UNSET as well. The TRANSIT marker is required to prevent CID space
exhaustion when switching from per CPU to per task mode.
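
A minimal sketch of what the markers enable (constant names from the
description above, helpers illustrative):

  #define MM_CID_UNSET    BIT(31)
  #define MM_CID_ONCPU    BIT(30)
  #define MM_CID_TRANSIT  BIT(29)

  /* ownership and state checks become trivial bit tests */
  static inline bool cid_is_unset(unsigned int cid)    { return cid & MM_CID_UNSET; }
  static inline bool cid_on_cpu(unsigned int cid)      { return cid & MM_CID_ONCPU; }
  static inline bool cid_in_transit(unsigned int cid)  { return cid & MM_CID_TRANSIT; }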

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251119172549.960252358@linutronix.de
2025-11-25 19:45:41 +01:00
Thomas Gleixner 51dd92c71a sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex
Prepare for the new CID management scheme which puts the CID ownership
transition into the fork() and exit() slow path by serializing
sched_mm_cid_fork()/exit() with it, so task list and cpu mask walks can be
done in interruptible and preemptible code.

The contention on it is not worse than on other concurrency controls in the
fork()/exit() machinery.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.895826703@linutronix.de
2025-11-25 19:45:41 +01:00
Thomas Gleixner b0c3d51b54 sched/mmcid: Provide precomputed maximal value
Reading mm::mm_users and mm::mm_cid::nr_cpus_allowed every time to compute
the maximal CID value is just wasteful as that value only changes on
fork(), exit() and eventually when the affinity changes.

So it can be easily precomputed at those points and provided in mm::mm_cid
for consumption in the hot path.

But there is an issue with using mm::mm_users for accounting because that
does not necessarily reflect the number of user space tasks as other kernel
code can take temporary references on the MM which skew the picture.

Solve that by adding a users counter to struct mm_mm_cid, which is modified
by fork() and exit() and used for precomputing under mm_mm_cid::lock.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.832764634@linutronix.de
2025-11-25 19:45:40 +01:00
Thomas Gleixner bf070520e3 sched/mmcid: Move initialization out of line
It's getting bigger soon, so just move it out of line to the rest of the
code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.769636491@linutronix.de
2025-11-25 19:45:40 +01:00
Thomas Gleixner 2b1642b881 signal: Move MMCID exit out of sighand lock
There is no longer a need to keep this under the sighand lock as neither
the current code nor the upcoming replacement depends on the exit state of
a task.

That allows to use a mutex in the exit path.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.706439391@linutronix.de
2025-11-25 19:45:40 +01:00
Thomas Gleixner 539115f08c sched/mmcid: Convert mm CID mask to a bitmap
This is truly a bitmap and just conveniently uses a cpumask because the
maximum size of the bitmap is nr_cpu_ids.

But that prevents searching for a zero bit in a limited range, which
is helpful to provide an efficient mechanism to consolidate the CID space
when the number of users decreases.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Link: https://patch.msgid.link/20251119172549.642866767@linutronix.de
2025-11-25 19:45:40 +01:00
Thomas Gleixner 35a5c37cb9 cpumask: Cache num_possible_cpus()
Reevaluating num_possible_cpus() over and over does not make sense. That
becomes a constant after init as cpu_possible_mask is marked ro_after_init.

Cache the value during initialization and provide that for consumption.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Link: https://patch.msgid.link/20251119172549.578653738@linutronix.de
2025-11-25 19:45:40 +01:00
Mateusz Guzik 003a660730
fs: push list presence check into inode_io_list_del()
For consistency with sb routines.

ext4 is the only consumer outside of evict(). Damage-controlling it is
outside of the scope of this cleanup.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251103230911.516866-1-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:34:49 +01:00
Mateusz Guzik 4c6b40877b
fs: cosmetic fixes to lru handling
1. inode_bit_waitqueue() was somehow placed between __inode_add_lru() and
   inode_add_lru(). Move it up.
2. assert ->i_lock is held in __inode_add_lru instead of just claiming it is
   needed
3. s/__inode_add_lru/__inode_lru_list_add/ for consistency with itself
   (inode_lru_list_del()) and similar routines for sb and io list
   management
4. push list presence check into inode_lru_list_del(), just like sb and
   io list

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251029131428.654761-2-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:34:49 +01:00
Mateusz Guzik a27628f436
fs: rework I_NEW handling to operate without fences
In the inode hash code grab the state while ->i_lock is held. If found
to be set, synchronize the sleep once more with the lock held.

In the real world the flag is not set most of the time.

Apart from being simpler to reason about, it comes with a minor speed up
as now clearing the flag does not require the smp_mb() fence.

While here rename wait_on_inode() to wait_on_new_inode() to line it up
with __wait_on_freeing_inode().

Christian Brauner <brauner@kernel.org> says:

As per the discussion in [1] I folded in the diff sent in [2].

Link: https://lore.kernel.org/69238e4d.a70a0220.d98e3.006e.GAE@google.com [1]
Link: https://lore.kernel.org/c2kpawomkbvtahjm7y5mposbhckb7wxthi3iqy5yr22ggpucrm@ufvxwy233qxo [2]
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251010221737.1403539-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:32:39 +01:00
Christoph Hellwig 7fd8720dff
iomap: allocate s_dio_done_wq for async reads as well
Since commit 222f2c7c6d14 ("iomap: always run error completions in user
context"), read error completions are deferred to s_dio_done_wq.  This
means the workqueue also needs to be allocated for async reads.

Fixes: 222f2c7c6d14 ("iomap: always run error completions in user context")
Reported-by: syzbot+a2b9a4ed0d61b1efb3f5@syzkaller.appspotmail.com
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251124140013.902853-1-hch@lst.de
Tested-by: syzbot+a2b9a4ed0d61b1efb3f5@syzkaller.appspotmail.com
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:19 +01:00
Joanne Koong d7ff85d4b8
iomap: fix iomap_read_end() for already uptodate folios
There are some cases where, when iomap_read_end() is called, the folio
may already have been marked uptodate. For example, if the iomap block
needed zeroing, then the folio may have been marked uptodate after the
zeroing.

iomap_read_end() should unlock the folio instead of calling
folio_end_read(), which is how these cases were handled prior to commit
f8eaf79406 ("iomap: simplify ->read_folio_range() error handling for
reads"). Calling folio_end_read() on an uptodate folio leads to buggy
behavior: marking an already uptodate folio as uptodate again XORs the
flag, leaving the folio marked not uptodate.
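
The fix therefore boils down to the pre-f8eaf79406 pattern (sketch):

  if (folio_test_uptodate(folio))
          folio_unlock(folio);            /* already uptodate, e.g. after zeroing */
  else
          folio_end_read(folio, error == 0);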

Fixes: f8eaf79406 ("iomap: simplify ->read_folio_range() error handling for reads")
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251118211111.1027272-2-joannelkoong@gmail.com
Tested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reported-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:19 +01:00
Christian Brauner 5ec58e6acd
Merge patch series "enable iomap dio write completions from interrupt context v2"
Christoph Hellwig <hch@lst.de> says:

Currently iomap defers all write completions to interrupt context.  This
was based on my assumption that no one cares about the latency of those
to simplify the code vs the old direct-io.c.  It turns out someone cared,
as Avi reported a lot of context switches with ScyllaDB, which at least
in older kernels with workqueue scheduling issues caused really high
tail latencies.

Fortunately allowing the direct completions is pretty easy with all the
other iomap changes we had since.

While doing this I've also found dead code which gets removed (patch 1)
and an incorrect assumption in zonefs that read completions are called
in user context, which it assumes for its error handling.  Fix this by
always calling error completions from user context (patch 2).
Against the vfs-6.19.iomap branch.

* patches from https://patch.msgid.link/20251113170633.1453259-1-hch@lst.de:
  iomap: invert the polarity of IOMAP_DIO_INLINE_COMP
  iomap: support write completions from interrupt context
  iomap: rework REQ_FUA selection
  iomap: always run error completions in user context
  fs, iomap: remove IOCB_DIO_CALLER_COMP

Link: https://patch.msgid.link/20251113170633.1453259-1-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:19 +01:00
Christoph Hellwig 76192a42c2
iomap: invert the polarity of IOMAP_DIO_INLINE_COMP
Replace IOMAP_DIO_INLINE_COMP with a flag to indicate that the
completion should be offloaded.  This removes a tiny bit of boilerplate
code, but more importantly just makes the code easier to follow as this
new flag gets set most of the time and only cleared in one place, while
it was the inverse for the old version.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113170633.1453259-6-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:19 +01:00
Christoph Hellwig eca9dc2089
iomap: support write completions from interrupt context
Completions for pure overwrites don't need to be deferred to a workqueue
as there is no work to be done, or at least no work that needs a user
context.  Set the IOMAP_DIO_INLINE_COMP by default for writes like we
already do for reads, and then clear it for all the cases that actually
do need a user context for completions to update the inode size or
record updates to the logical to physical mapping.

I've audited all users of the ->end_io callback, and they only require
user context for I/O that involves unwritten extents, COW, size
extensions, or error handling and all those are still run from workqueue
context.

This restores the behavior of the old pre-iomap direct I/O code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113170633.1453259-5-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:19 +01:00
Christoph Hellwig 29086a31b3
iomap: rework REQ_FUA selection
The way iomap_dio_can_use_fua and its caller are structured is
a bit confusing, as the main guarding condition is hidden in the
helper, and the secondary conditions are split between caller and
callee.

Refactor the code, so that iomap_dio_bio_iter itself tracks if a write
might need metadata updates based on the iomap type and flags, and
then have a condition based on that to use the FUA flag.

Note that this also moves the REQ_OP_WRITE assignment to the end of
the branch to improve readability a bit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113170633.1453259-4-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:18 +01:00
Christoph Hellwig ddb4873286
iomap: always run error completions in user context
At least zonefs expects error completions to be able to sleep.  Because
error completions aren't performance critical, just defer them to workqueue
context unconditionally.

Fixes: 8dcc1a9d90 ("fs: New zonefs file system")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113170633.1453259-3-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:18 +01:00
Christoph Hellwig f9f8514999
fs, iomap: remove IOCB_DIO_CALLER_COMP
This was added by commit 099ada2c87 ("io_uring/rw: add write support
for IOCB_DIO_CALLER_COMP") and disabled a little later by commit
838b35bb6a ("io_uring/rw: disable IOCB_DIO_CALLER_COMP") because it
didn't work.  Remove all the related code that sat unused for 2 years.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113170633.1453259-2-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:18 +01:00
Christian Brauner f53d302ee8
Merge patch series "iomap: buffered io changes"
This series contains several fixes and cleanups:

* Renaming bytes_pending/bytes_accounted to
  bytes_submitted/bytes_not_submitted for improved code clarity

* Accounting for unaligned end offsets when truncating read ranges

* Adding documentation for iomap_finish_folio_write() requirements

* Optimizing pending async writeback accounting logic

* Simplifying error handling in ->read_folio_range() for read operations

* Streamlining logic for skipping reads during write operations

* Replacing manual bitmap scanning with find_next_bit() for both dirty
  and uptodate bitmaps, improving performance

* patches from https://patch.msgid.link/20251111193658.3495942-1-joannelkoong@gmail.com:
  iomap: use find_next_bit() for uptodate bitmap scanning
  iomap: use find_next_bit() for dirty bitmap scanning
  iomap: simplify when reads can be skipped for writes
  iomap: simplify ->read_folio_range() error handling for reads
  iomap: optimize pending async writeback accounting
  docs: document iomap writeback's iomap_finish_folio_write() requirement
  iomap: account for unaligned end offsets when truncating read range
  iomap: rename bytes_pending/bytes_accounted to bytes_submitted/bytes_not_submitted

Link: https://patch.msgid.link/20251111193658.3495942-1-joannelkoong@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:10 +01:00
Joanne Koong b56c1c54f2
iomap: use find_next_bit() for uptodate bitmap scanning
Use find_next_bit()/find_next_zero_bit() for iomap uptodate bitmap
scanning. This uses __ffs() internally and is more efficient for
finding the next uptodate or non-uptodate bit than iterating through the
bitmap range testing every bit.
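
The scan pattern is the usual one (sketch; field names may differ from
the actual iomap code):

  /* first uptodate block at or after 'start', then the end of that run */
  unsigned long first = find_next_bit(ifs->state, nr_blocks, start);
  unsigned long last  = find_next_zero_bit(ifs->state, nr_blocks, first);

  /* blocks in [first, last) are uptodate and need no read */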

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-10-joannelkoong@gmail.com
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:10 +01:00
Joanne Koong fed9c62d28
iomap: use find_next_bit() for dirty bitmap scanning
Use find_next_bit()/find_next_zero_bit() for iomap dirty bitmap
scanning. This uses __ffs() internally and is more efficient for
finding the next dirty or clean bit than iterating through the bitmap
range testing every bit.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-9-joannelkoong@gmail.com
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:22:10 +01:00
Askar Safin 54ca9e913e
include/linux/fs.h: trivial fix: regualr -> regular
Trivial fix.

Signed-off-by: Askar Safin <safinaskar@gmail.com>
Link: https://patch.msgid.link/20251120195140.571608-1-safinaskar@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:13:09 +01:00
Askar Safin bef0202fb7
fs/splice.c: trivial fix: pipes -> pipe's
Trivial fix.

Signed-off-by: Askar Safin <safinaskar@gmail.com>
Link: https://patch.msgid.link/20251120211316.706725-1-safinaskar@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:11:16 +01:00
Matthew Wilcox (Oracle) 37d369fa97
fs: Add uoff_t
In a recent commit, I inadvertently changed a comparison from being an
unsigned comparison (on 64-bit systems) to being a signed comparison
(which it had always been on 32-bit systems).  This led to a sporadic
fstests failure.

To make sure this comparison is always unsigned, introduce a new type,
uoff_t which is the unsigned version of loff_t.  Generally file sizes
are restricted to being a signed integer, but in these two places it is
convenient to pass -1 to indicate "up to the end of the file".
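
Conceptually (sketch; the in-tree definition may differ):

  /* unsigned counterpart of loff_t, so comparisons like the one below
   * are unsigned on both 32-bit and 64-bit builds
   */
  typedef unsigned long long uoff_t;

  static bool range_covers(uoff_t pos, uoff_t end)
  {
          /* end may be (uoff_t)-1, meaning "up to the end of the file" */
          return pos <= end;
  }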

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251123220518.1447261-1-willy@infradead.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:07:42 +01:00
Mateusz Guzik 8d79ec9e7f
fs: mark lookup_slow() as noinline
Otherwise it gets inlined, notably in walk_component(), which convinces
the compiler to push/pop additional registers in the fast path to
accommodate the existence of the inlined version.

Shortens the fast path of that routine from 87 to 71 bytes.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251119144930.2911698-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:04:38 +01:00
Mateusz Guzik 7c179096e7
fs: add predicts based on nd->depth
Stats on nd->depth usage during the venerable kernel build were collected like so:
bpftrace -e 'kprobe:terminate_walk,kprobe:walk_component,kprobe:legitimize_links
{ @[probe] = lhist(((struct nameidata *)arg0)->depth, 0, 8, 1); }'

@[kprobe:legitimize_links]:
[0, 1)           6554906 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1, 2)              3534 |                                                    |

@[kprobe:terminate_walk]:
[0, 1)          12153664 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|

@[kprobe:walk_component]:
[0, 1)          53075749 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1, 2)            971421 |                                                    |
[2, 3)             84946 |                                                    |

Additionally a custom probe was added for depth within link_path_walk():
bpftrace -e 'kprobe:link_path_walk_probe { @[probe] = lhist(arg0, 0, 8, 1); }'
@[kprobe:link_path_walk_probe]:
[0, 1)           7528231 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1, 2)            407905 |@@                                                  |

Given these results:
1. terminate_walk() is called towards the end of the lookup and in this
   test it never had any links to clean up.
2. legitimize_links() is also called towards the end of lookup and most
   of the time there is 0 depth. Patch consumers to avoid calling into it
   in that case.
3. walk_component() is typically called with WALK_MORE and zero depth,
   checked in that order. Check depth first and predict it is 0 (see the
   sketch after this list).
4. link_path_walk() also does not deal with a symlink most of the time
   when !*name
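
A simplified sketch of the resulting predicts (not the literal diff):

  /* walk_component(): depth is almost always 0, so test it first */
  if (unlikely(nd->depth) && !(flags & WALK_MORE))
          put_link(nd);

  /* terminate_walk() and the legitimize_links() callers: skip the link
   * cleanup path entirely in the common zero-depth case
   */
  if (unlikely(nd->depth))
          drop_links(nd);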

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251119142954.2909394-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-25 10:04:01 +01:00
Alexandre Chartre c0a67900dc objtool: Trim trailing NOPs in alternative
When disassembling alternatives, replace trailing NOPs with a single
indication of the number of bytes covered by NOPs.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-31-alexandre.chartre@oracle.com
2025-11-24 20:40:48 +01:00
Alexandre Chartre aff95e0d4e objtool: Add wide output for disassembly
Add the --wide option to provide a wide output when disassembling.
With this option, the disassembly of alternatives is displayed
side-by-side instead of one above the other.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-30-alexandre.chartre@oracle.com
2025-11-24 20:40:48 +01:00
Alexandre Chartre 07d70b271a objtool: Compact output for alternatives with one instruction
When disassembling, if an instruction has alternatives which are all
made of a single instruction then print each alternative on a single
line (instruction + description) so that the output is more compact.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-29-alexandre.chartre@oracle.com
2025-11-24 20:40:48 +01:00
Alexandre Chartre 56967b9a77 objtool: Improve naming of group alternatives
Improve the naming of group alternatives by showing the feature name and
flags used by the alternative.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-28-alexandre.chartre@oracle.com
2025-11-24 20:40:48 +01:00
Alexandre Chartre 8308fd0019 objtool: Add function to get the name of a CPU feature
Add a function to get the name of a CPU feature. The function is
architecture dependent and currently only implemented for x86. The
feature names are automatically generated from the cpufeatures.h
include file.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-27-alexandre.chartre@oracle.com
2025-11-24 20:39:47 +01:00
Peter Zijlstra 860238af7a x86_64/bug: Inline the UD1
(Ab)use the static_call infrastructure to convert all:

  call __WARN_trap

instances into the desired:

  ud1 (%edx), %rdi

eliminating the CALL/RET, but more importantly, fixing the
fact that all WARNs will have:

  RIP: 0010:__WARN_trap+0

Basically, by making it a static_call trampoline call, objtool will
collect the callsites, and then the inline rewrite will hit the
special case and replace the code with the magic instruction.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115758.456717741@infradead.org
2025-11-24 20:23:25 +01:00
Peter Zijlstra 11bb4944f0 x86/bug: Implement WARN_ONCE()
Implement WARN_ONCE like WARN using BUGFLAG_ONCE.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115758.339309119@infradead.org
2025-11-24 20:23:25 +01:00
Peter Zijlstra 5b472b6e5b x86_64/bug: Implement __WARN_printf()
The basic idea is to have __WARN_printf() be a vararg function such
that the compiler can do the optimal calling convention for us. This
function body will be a #UD and then set up a va_list in the exception
from pt_regs.

But because the trap will be in a called function, the bug_entry must
be passed in. Have that be the first argument, with the format tucked
away inside the bug_entry.

The comments should clarify the real fun details.

The big downside is that all WARNs will now show:

 RIP: 0010:__WARN_trap:+0

One possible solution is to simply discard the top frame when
unwinding. A follow up patch takes care of this slightly differently
by abusing the x86 static_call implementation.

This changes (with the next patches):

	WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
		  "corrupted preempt_count: %s/%d/0x%x\n",

from:

	cmpl    $2, %ecx	#, _7
        jne     .L1472
	...

  .L1472:
        cmpb    $0, __already_done.11(%rip)
        je      .L1513
	...

  .L1513
	movb    $1, __already_done.11(%rip)
	movl    1424(%r14), %edx        # _15->pid, _15->pid
        leaq    1912(%r14), %rsi        #, _17
        movq    $.LC43, %rdi    #,
        call    __warn_printk   #
	ud2
  .pushsection __bug_table,"aw"
        2:
        .long 1b - .    # bug_entry::bug_addr
        .long .LC1 - .  # bug_entry::file
        .word 5093      # bug_entry::line
        .word 2313      # bug_entry::flags
        .org 2b + 12
  .popsection
  .pushsection .discard.annotate_insn,"M", @progbits, 8
        .long 1b - .
        .long 8         # ANNOTYPE_REACHABLE
  .popsection

into:

	cmpl    $2, %ecx        #, _7
        jne     .L1442  #,
	...

  .L1442:
        lea (2f)(%rip), %rdi
  1:
  .pushsection __bug_table,"aw"
        2:
        .long 1b - .    # bug_entry::bug_addr
        .long .LC43 - . # bug_entry::format
        .long .LC1 - .  # bug_entry::file
        .word 5093      # bug_entry::line
        .word 2323      # bug_entry::flags
        .org 2b + 16
  .popsection
        movl    1424(%r14), %edx        # _19->pid, _19->pid
        leaq    1912(%r14), %rsi        #, _13
	ud1 (%edx), %rdi

Notably, by pushing everything into the exception handler it can take
care of the ONCE thing.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115758.213813530@infradead.org
2025-11-24 20:23:05 +01:00
Peter Zijlstra 4f1b701f24 x86/bug: Use BUG_FORMAT for DEBUG_BUGVERBOSE_DETAILED
Since we have an explicit format string, use it for the condition string
instead of frobbing it in the file string.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115758.097401406@infradead.org
2025-11-24 20:22:21 +01:00
Peter Zijlstra 0a52d339d3 x86/bug: Add BUG_FORMAT basics
Opt-in to BUG_FORMAT for x86_64, adjust the BUGTABLE helper and for
now, just store NULL pointers.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.980264454@infradead.org
2025-11-24 20:22:11 +01:00
Nikolay Borisov 69acbdbbef RAS/AMD/ATL: Replace bitwise_xor_bits() with hweight16()
Using hweight16() and checking whether the lsb is set is functionally
equivalent to what bitwise_xor_bits() does. It also results in better
generated code: previously gcc would inline the function 4 times, whereas with
hweight16() the resulting code boils down to 2 instructions - POPCNT and AND -
and all relevant CPUs support POPCNT.

An alternative would have been to use the __builtin_parity() function provided
by both Clang and GCC; however, under some circumstances the compiler can choose
not to inline it and instead generate a library call, which is unsupported in
the kernel.

No functional changes.

  [ bp: Massage commit message. ]

Signed-off-by: Nikolay Borisov <nik.borisov@suse.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20251124142517.1708451-1-nik.borisov@suse.com
2025-11-24 17:00:37 +01:00
Yue Haibing e6a11a526e x86/{boot,mtrr}: Remove unused function declarations
Commits

  28be1b454c ("x86/boot: Remove unused copy_*_gs() functions")
  34d2819f20 ("x86, mtrr: Remove unused mtrr/state.c")

removed the functions but left the prototypes. Remove them.

  [ bp: Merge into a single patch. ]

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20251120121037.1479334-1-yuehaibing@huawei.com
2025-11-22 21:26:36 +01:00
Lorenzo Pieralisi 9c1fbc56ca irqchip/gic-its: Rework platform MSI deviceID detection
The current code retrieving platform devices' MSI devID in the GIC ITS MSI
parent helpers suffers from some minor issues:

- It leaks a struct device_node reference
- It is duplicated between GICv3 and GICv5 for no good reason
- It does not use the OF phandle iterator code that simplifies
  the msi-parent property parsing

Consolidate GIC v3 and v5 deviceID retrieval into a single function that
addresses the full set of issues in one go, converting the msi-parent parsing
loop to the more modern OF phandle iterator API and fixing the struct
device_node reference leak in the process.

Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://patch.msgid.link/20251021124103.198419-6-lpieralisi@kernel.org
2025-11-22 17:09:03 +01:00
Lorenzo Pieralisi 4f32612f6a PCI: iproc: Implement MSI controller node detection with of_msi_xlate()
The functionality implemented in the iproc driver in order to detect an
OF MSI controller node is now fully provided by of_msi_xlate().

Replace the current msi-map/msi-parent parsing code with of_msi_xlate().

Since of_msi_xlate() is also a deviceID mapping API, pass in a fictitious
0 as the deviceID - the driver only requires detecting the OF MSI controller
node, not the deviceID mapping per se (the of_msi_xlate() return value is
ignored for the same reason).

Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20251021124103.198419-5-lpieralisi@kernel.org
2025-11-22 17:09:03 +01:00
Thomas Gleixner ebb922c920 Linux 6.18-rc3
-----BEGIN PGP SIGNATURE-----
 
 iQFSBAABCgA8FiEEq68RxlopcLEwq+PEeb4+QwBBGIYFAmj+p+UeHHRvcnZhbGRz
 QGxpbnV4LWZvdW5kYXRpb24ub3JnAAoJEHm+PkMAQRiGKsIH/1EFGYZDVJ7pTOcO
 qJY/xfu5YNd4ezZTGMW5SgJK+lAdJwkmbu8PUlcOhXKRVvACG9Tud/+pZzw966C5
 pk9pF9vpCXq2Zz6dk3/XGFARUPUlDA4uJ/jiPTNVA8yy+V18u+Ds55Y+rhv9MkcW
 n/Fi+fiYfjqAaqP328mWH9z51ibRqH3WQfqVdjzClzoSC31BuJUVEZi9s5FZ7C9Q
 OCvRLp8WvTpcQ7ab7WH/wCgznXEKyRM/OxaNtXWztod9GLqOmWoFiHUxWfEQ/gg+
 KzgbgQOeXI6q7U8xJZ/711ZFzGLR9VBEPN0HnqxRNr8fCpzJ9FKFGTFD2HcBgUjy
 F9JH3nk=
 =YBEg
 -----END PGP SIGNATURE-----

Merge tag 'v6.18-rc3' into irq/msi

Pick up OF changes to resolve dependencies
2025-11-22 17:07:57 +01:00
Babu Moger ac7de456a3 fs/resctrl: Update bit_usage to reflect io_alloc
The "shareable_bits" and "bit_usage" resctrl files associated with cache
resources give insight into how instances of a cache are used.

Update the annotated capacity bitmasks displayed by "bit_usage" to include the
cache portions allocated for I/O via the "io_alloc" feature. "shareable_bits"
is a global bitmask of cache shareable with I/O and thus cannot present the
per-domain I/O allocations possible with the "io_alloc" feature. Revise the
"shareable_bits" documentation to direct users to "bit_usage" for accurate
cache usage information.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/e02a0d424129fd7f3e45822a559b1c614ae4652a.1762995456.git.babu.moger@amd.com
2025-11-22 14:30:34 +01:00
Babu Moger 28fa2cce7a fs/resctrl: Introduce interface to modify io_alloc capacity bitmasks
The io_alloc feature in resctrl enables system software to configure the
portion of the cache allocated for I/O traffic. When supported, the
io_alloc_cbm file in resctrl provides access to capacity bitmasks (CBMs)
allocated for I/O devices.

Enable users to modify io_alloc CBMs by writing to the io_alloc_cbm resctrl
file when the io_alloc feature is enabled.

Mirror the CBMs between CDP_CODE and CDP_DATA when CDP is enabled to present
consistent I/O allocation information to user space.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/67609641b03ccfba18a8ee0bf9dbd1f3dcbecda3.1762995456.git.babu.moger@amd.com
2025-11-22 14:28:31 +01:00
Babu Moger af1242eeca fs/resctrl: Modify struct rdt_parse_data to pass mode and CLOSID
parse_cbm() requires resource group mode and CLOSID to validate the capacity
bitmask (CBM). It is passed via struct rdtgroup in struct rdt_parse_data.

The io_alloc feature also uses CBMs to indicate which portions of cache are
allocated for I/O traffic. The CBMs are provided by user space and need to be
validated the same as CBMs provided for general (CPU) cache allocation.
parse_cbm() cannot be used as-is since io_alloc does not have rdtgroup context.

Pass the resource group mode and CLOSID directly to parse_cbm() via struct
rdt_parse_data, instead of through the rdtgroup struct, to facilitate calling
parse_cbm() to verify the CBM of the io_alloc feature.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/f8ec6ab5cf594d906a3fe75f56793d5fbd63f38f.1762995456.git.babu.moger@amd.com
2025-11-22 13:10:12 +01:00
Babu Moger 77b6623262 fs/resctrl: Introduce interface to display io_alloc CBMs
Introduce the "io_alloc_cbm" resctrl file to display the capacity bitmasks
(CBMs) that represent the portions of each cache instance allocated
for I/O traffic on a cache resource that supports the "io_alloc" feature.

io_alloc_cbm resides in the info directory of a cache resource, for example,
/sys/fs/resctrl/info/L3/. Since the resource name is part of the path, it
is not necessary to display the resource name as done in the schemata file.

When CDP is enabled, io_alloc routes traffic using the highest CLOSID
associated with the CDP_CODE resource and that CLOSID becomes unusable for
the CDP_DATA resource. The highest CLOSID of CDP_CODE and CDP_DATA resources
will be kept in sync to ensure a consistent user interface. In preparation for
this, access the CBMs for I/O traffic through the highest CLOSID of either the
CDP_CODE or CDP_DATA resource.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/55a3ff66a70e7ce8239f022e62b334e9d64af604.1762995456.git.babu.moger@amd.com
2025-11-22 11:37:21 +01:00
Frederic Weisbecker 3de5e46e50 genirq: Remove cpumask availability check on kthread affinity setting
Failing to allocate the affinity mask of an interrupt descriptor fails the
whole descriptor initialization. It is then guaranteed that the cpumask is
always available whenever the related interrupt objects are alive, such as
the kthread handler.

Therefore remove the superfluous check since it is merely a historical
leftover. Also get rid of the comments above it, which are obsolete and
useless.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251121143500.42111-4-frederic@kernel.org
2025-11-22 09:26:18 +01:00
Frederic Weisbecker 801afdfbfc genirq: Fix interrupt threads affinity vs. cpuset isolated partitions
When a cpuset isolated partition is created / updated or destroyed, the
interrupt threads are affined blindly to all the non-isolated CPUs. This
happens without taking into account the interrupt threads initial affinity
that becomes ignored.

For example in a system with 8 CPUs, if an interrupt and its kthread are
initially affine to CPU 5, creating an isolated partition with only CPU 2
inside will eventually end up affining the interrupt kthread to all CPUs
but CPU 2 (that is CPUs 0,1,3-7), losing the kthread preference for CPU 5.

Besides the blind re-affining, this doesn't take care of the actual low
level interrupt which isn't migrated. As of today the only way to isolate
non-managed interrupts, along with their kthreads, is to overwrite their
affinity separately, for example through /proc/irq/

To avoid doing that manually, future development should focus on updating
the interrupt's affinity whenever cpuset isolated partitions are updated.

In the meantime, cpuset shouldn't fiddle with interrupt threads directly.
To prevent that, set the PF_NO_SETAFFINITY flag on them.

This is done through kthread_bind_mask() by affining them initially to all
possible CPUs as at that point the interrupt is not started up which means
the affinity of the hard interrupt is not known. The thread will adjust
that once it reaches the handler, which is guaranteed to happen after the
initial affinity of the hard interrupt is established.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251121143500.42111-3-frederic@kernel.org
2025-11-22 09:26:18 +01:00
Frederic Weisbecker 68775ca79a genirq: Prevent early spurious wake-ups of interrupt threads
During initialization, the interrupt thread is created before the interrupt
is enabled. The interrupt enablement happens before the actual kthread wake
up point. Once the interrupt is enabled the hardware can raise an interrupt
and once setup_irq() drops the descriptor lock an interrupt wake-up can
happen.

Even when such an interrupt can be considered premature, this is not a
problem in general because at the point where the descriptor lock is
dropped and the wakeup can happen, the data which is used by the thread is
fully initialized.

Though from the perspective of least surprise, the initial wakeup really
should be performed by the setup code and not randomly by a premature
interrupt.

Prevent this by performing a wake-up only if the target is in state
TASK_INTERRUPTIBLE, which the thread uses in wait_for_interrupt().

If the thread is still in state TASK_UNINTERRUPTIBLE, the wake-up is not
lost because after the setup code completed the initial wake-up the thread
will observe the IRQTF_RUNTHREAD and proceed with the handling.

[ tglx: Simplified the changes and extended the changelog. ]

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251121143500.42111-2-frederic@kernel.org
2025-11-22 09:26:18 +01:00
Babu Moger 9445c7059c fs/resctrl: Add user interface to enable/disable io_alloc feature
AMD's SDCIAE forces all SDCI lines to be placed into the L3 cache portions
identified by the highest-supported L3_MASK_n register, where n is the maximum
supported CLOSID.

To support this, when the io_alloc resctrl feature is enabled, reserve the highest
CLOSID exclusively for I/O allocation traffic, making it no longer available for
general CPU cache allocation.

Introduce a user interface to enable/disable the io_alloc feature and encourage users
to enable io_alloc only when running workloads that can benefit from this
functionality. On enable, initialize the io_alloc CLOSID with all usable CBMs
across all the domains.

Since CLOSIDs are managed by resctrl fs, it is least invasive to make "io_alloc
is supported by maximum supported CLOSID" part of the initial resctrl fs
support for io_alloc. Take care to minimally (only in error messages) expose
this use of CLOSID for io_alloc to user space so that this is not required from
other architectures that may support io_alloc differently in the future.

When resctrl is mounted with "-o cdp" to enable code/data prioritization,
there are two L3 resources that can support I/O allocation: L3CODE and
L3DATA.  From resctrl fs perspective the two resources share a CLOSID and
the architecture's available CLOSID are halved to support this.

The architecture's underlying CLOSID used by SDCIAE when CDP is enabled is the
CLOSID associated with the CDP_CODE resource, but from resctrl's perspective
there is only one CLOSID for both CDP_CODE and CDP_DATA. CDP_DATA is thus not
usable for general (CPU) cache allocation nor I/O allocation.

Keep the CDP_CODE and CDP_DATA I/O alloc status in sync to avoid any confusion
to user space.  That is, enabling io_alloc on CDP_CODE does so on CDP_DATA and
vice-versa, and keep the I/O allocation CBMs of CDP_CODE and CDP_DATA in sync.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/c7d3037795e653e22b02d8fc73ca80d9b075031c.1762995456.git.babu.moger@amd.com
2025-11-21 23:01:54 +01:00
Babu Moger 48068e5650 fs/resctrl: Introduce interface to display "io_alloc" support
Introduce the "io_alloc" resctrl file to the "info" area of a cache resource,
for example /sys/fs/resctrl/info/L3/io_alloc. "io_alloc" indicates support for
the "io_alloc" feature that allows direct insertion of data from I/O
devices into the cache.

Restrict exposing support for "io_alloc" to the L3 resource that is the only
resource where this feature can be backed by AMD's L3 Smart Data Cache
Injection Allocation Enforcement (SDCIAE). With that, the "io_alloc" file is
only visible to user space if the L3 resource supports "io_alloc".

Doing so nevertheless makes the file visible for all cache resources, for example
also the L2 cache (if it supports cache allocation). As a consequence, add the
capability for the file to report the expected "enabled" and "disabled", as well
as "not supported".

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/e8b116a8f424128b227734bb1d433c14af478d90.1762995456.git.babu.moger@amd.com
2025-11-21 22:49:42 +01:00
Babu Moger 556d2892aa x86,fs/resctrl: Implement "io_alloc" enable/disable handlers
"io_alloc" is the generic name of the new resctrl feature that enables system
software to configure the portion of cache allocated for I/O traffic. On AMD
systems, "io_alloc" resctrl feature is backed by AMD's L3 Smart Data Cache
Injection Allocation Enforcement (SDCIAE).

Introduce the architecture-specific functions that resctrl fs should call to
enable, disable, or check status of the "io_alloc" feature. Change SDCIAE state
by setting (to enable) or clearing (to disable) bit 1 of
MSR_IA32_L3_QOS_EXT_CFG on all logical processors within the cache domain.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/9e9070100c320eab5368e088a3642443dee95ed7.1762995456.git.babu.moger@amd.com
2025-11-21 22:35:22 +01:00
Babu Moger 7923ae7698 x86,fs/resctrl: Detect io_alloc feature
AMD's SDCIAE (SDCI Allocation Enforcement) PQE feature enables system software
to control the portions of L3 cache used for direct insertion of data from I/O
devices into the L3 cache.

Introduce a generic resctrl cache resource property "io_alloc_capable" as the
first part of the new "io_alloc" resctrl feature that will support AMD's
SDCIAE. Any architecture can set a cache resource as "io_alloc_capable" if
a portion of the cache can be allocated for I/O traffic.

Set the "io_alloc_capable" property for the L3 cache resource on x86 (AMD)
systems that support SDCIAE.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/df85a9a6081674fd3ef6b4170920485512ce2ded.1762995456.git.babu.moger@amd.com
2025-11-21 22:04:59 +01:00
Babu Moger 4d4840b125 x86/resctrl: Add SDCIAE feature in the command line options
Add a kernel command-line parameter to enable or disable the exposure of
the L3 Smart Data Cache Injection Allocation Enforcement (SDCIAE) hardware
feature to resctrl.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://patch.msgid.link/c623edf7cb369ba9da966de47d9f1b666778a40e.1762995456.git.babu.moger@amd.com
2025-11-21 22:03:23 +01:00
Babu Moger 3767def18f x86/cpufeatures: Add support for L3 Smart Data Cache Injection Allocation Enforcement
Smart Data Cache Injection (SDCI) is a mechanism that enables direct insertion
of data from I/O devices into the L3 cache. By directly caching data from I/O
devices rather than first storing the I/O data in DRAM, SDCI reduces demands on
DRAM bandwidth and reduces latency to the processor consuming the I/O data.

The SDCIAE (SDCI Allocation Enforcement) PQE feature allows system software to
control the portion of the L3 cache used for SDCI.

When enabled, SDCIAE forces all SDCI lines to be placed into the L3 cache
partitions identified by the highest-supported L3_MASK_n register, where n is
the maximum supported CLOSID.

Add CPUID feature bit that can be used to configure SDCIAE.

The SDCIAE feature details are documented in:

  AMD64 Architecture Programmer's Manual Volume 2: System Programming,
  Publication # 24593, Revision 3.41, section 19.4.7 L3 Smart Data Cache
  Injection Allocation Enforcement (SDCIAE).

available at https://bugzilla.kernel.org/show_bug.cgi?id=206537

Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/83ca10d981c48e86df2c3ad9658bb3ba3544c763.1762995456.git.babu.moger@amd.com
2025-11-21 22:03:07 +01:00
Smita Koralahalli 5c4663ed1e x86/mce: Handle AMD threshold interrupt storms
Extend the logic of handling CMCI storms to AMD threshold interrupts.

Rely on an approach similar to Intel's CMCI to mitigate storms per CPU and
per bank. But, unlike CMCI, do not set thresholds and reduce the interrupt rate
on a storm. Rather, disable the interrupt on the corresponding CPU and bank.
Re-enable the interrupts if enough consecutive polls of the bank show no
corrected errors (30, as programmed by Intel).

Turning off the threshold interrupts would be a better solution on AMD systems
as other error severities will still be handled even if the threshold
interrupts are disabled.

  [ Tony: Small tweak because mce_handle_storm() isn't a pointer now ]
  [ Yazen: Rebase and simplify ]
  [ Avadhut: Remove check to not clear bank's bit in mce_poll_banks and fix
    checkpatch warnings. ]

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20251121190542.2447913-3-avadhut.naik@amd.com
2025-11-21 20:41:10 +01:00
Avadhut Naik d7ac083f09 x86/mce: Do not clear bank's poll bit in mce_poll_banks on AMD SMCA systems
Currently, when a CMCI storm detected on a Machine Check bank subsides, the
bank's corresponding bit in the mce_poll_banks per-CPU variable is cleared
unconditionally by cmci_storm_end().

On AMD SMCA systems, this essentially disables polling on that particular bank
on that CPU. Consequently, any subsequent correctable errors or storms will not
be logged.

Since AMD SMCA systems allow banks to be managed by both polling and
interrupts, the polling banks bitmap for a CPU, i.e., mce_poll_banks, should
not be modified when a storm subsides.

Fixes: 7eae17c4ad ("x86/mce: Add per-bank CMCI storm mitigation")
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20251121190542.2447913-2-avadhut.naik@amd.com
2025-11-21 20:33:12 +01:00
Ma Ke ef1b6d9049 EDAC/igen6: Fix error handling in igen6_edac driver
The igen6_edac driver calls device_initialize() for all memory
controllers in igen6_register_mci(), but misses corresponding
put_device() calls in error paths and during normal shutdown in
igen6_unregister_mcis().

Adding the missing put_device() calls improves code readability and
ensures proper reference counting for the device structure.

Found by code review.

Signed-off-by: Ma Ke <make24@iscas.ac.cn>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20251105090244.23327-1-make24@iscas.ac.cn
2025-11-21 10:20:51 -08:00
Qiuxu Zhuo 5f40ea7f41 EDAC/imh: Setup 'imh_test' debugfs testing node
Set up the following debugfs testing node to enable fake memory error
address decoding tests for the imh_edac driver.

  /sys/kernel/debug/edac/imh_test/addr

Tested-by: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20251119134132.2389472-8-qiuxu.zhuo@intel.com
2025-11-21 10:20:51 -08:00
Qiuxu Zhuo f619613f30 EDAC/{skx_comm,imh}: Detect 2-level memory configuration
Detect 2-level memory configurations and notify the 'skx_common' library
to enable ADXL 2-level memory error decoding.

Tested-by: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20251119134132.2389472-7-qiuxu.zhuo@intel.com
2025-11-21 10:20:51 -08:00
Qiuxu Zhuo 39abdcbdad EDAC/skx_common: Extend the maximum number of DRAM chip row bits
The allowed maximum number of row bits for DRAM chips in the Diamond
Rapids server processor is up to 19. Extend the current maximum row
bits from 18 to 19.

Tested-by: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20251119134132.2389472-6-qiuxu.zhuo@intel.com
2025-11-21 10:20:51 -08:00
Qiuxu Zhuo 9fc67b1170 EDAC/{skx_common,imh}: Add EDAC driver for Intel Diamond Rapids servers
Intel Diamond Rapids CPUs include Integrated Memory and I/O Hubs (IMH).
The memory controllers within the IMHs provide memory stacks to the
processor. Create a new driver for these IMH-based memory controllers
rather than applying additional patches to the existing i10nm_edac.c
for the following reasons:

1) The memory controllers are not presented as PCI devices; instead,
   the detection and all their registers have been transitioned to
   MMIO-based memory spaces.

2) Validation processes are costly. Modifications to i10nm_edac would
   require extensive validation checks against multiple platforms,
   including Ice Lake, Sapphire Rapids, Emerald Rapids, Granite Rapids,
   Sierra Forest, and Grand Ridge.

3) Future Intel CPUs will likely only need patches on top of this new
   EDAC driver. Validation can be limited to Diamond Rapids servers
   and future Intel CPU generations.

[Tony: Fix kerneldoc for struct local_reg]
[randconfig: Added dependencies on NFIT and DMI]

Tested-by: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20251119134132.2389472-5-qiuxu.zhuo@intel.com
2025-11-21 10:19:43 -08:00
Alexandre Chartre be5ee60ac5 objtool: Provide access to feature and flags of group alternatives
Each alternative of a group alternative depends on a specific
feature and flags. Provide access to the feature/flags for each
alternative as an attribute (feature) in struct alt_group.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-26-alexandre.chartre@oracle.com
2025-11-21 15:30:14 +01:00
Alexandre Chartre 4aae0d3f77 objtool: Fix address references in alternatives
When using the --disas option, alternatives are disassembled but
address references in non-default alternatives can be incorrect.

The problem is that alternatives are shown as if they were replacing the
original code of the alternative. So if an alternative is referencing
an address inside the alternative then the reference has to be
adjusted to the location of the original code.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-25-alexandre.chartre@oracle.com
2025-11-21 15:30:14 +01:00
Alexandre Chartre 7e017720aa objtool: Disassemble jump table alternatives
When using the --disas option, also disassemble jump tables.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-24-alexandre.chartre@oracle.com
2025-11-21 15:30:14 +01:00
Alexandre Chartre 78df4590c5 objtool: Disassemble exception table alternatives
When using the --disas option, also disassemble exception tables
(EX_TABLE).

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-23-alexandre.chartre@oracle.com
2025-11-21 15:30:14 +01:00
Alexandre Chartre 15e7ad8667 objtool: Print addresses with alternative instructions
All alternatives are disassembled side-by-side when using the --disas
option. However, the address of each instruction is not printed because
instructions from different alternatives are not necessarily aligned.

Change this behavior to print the address of each instruction. Spaces
will appear between instructions from the same alternative when
instructions from different alternatives do not have the same alignment.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-22-alexandre.chartre@oracle.com
2025-11-21 15:30:13 +01:00
Alexandre Chartre a4f1599672 objtool: Disassemble group alternatives
When using the --disas option, disassemble all group alternatives.
Jump tables and exception tables (which are handled as alternatives)
are not disassembled at the moment.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-21-alexandre.chartre@oracle.com
2025-11-21 15:30:13 +01:00
Alexandre Chartre 87343e6642 objtool: Print headers for alternatives
When using the --disas option, objtool doesn't currently disassemble
any alternative. Print a header for each alternative. This identifies
places where alternatives are present but alternative code is not yet
disassembled.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-20-alexandre.chartre@oracle.com
2025-11-21 15:30:13 +01:00
Alexandre Chartre 7ad7a4a720 objtool: Preserve alternatives order
Preserve the order in which alternatives are defined. Currently
objtool stores alternatives in a list in reverse order.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-19-alexandre.chartre@oracle.com
2025-11-21 15:30:12 +01:00
Alexandre Chartre 5f326c8897 objtool: Add the --disas=<function-pattern> action
Add the --disas=<function-pattern> actions to disassemble the specified
functions. The function pattern can be a single function name (e.g.
--disas foo to disassemble the function with the name "foo"), or a shell
wildcard pattern (e.g. --disas foo* to disassemble all functions with a
name starting with "foo").

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-18-alexandre.chartre@oracle.com
2025-11-21 15:30:12 +01:00
Alexandre Chartre c3b7d044fc objtool: Do not validate IBT for .return_sites and .call_sites
The .return_sites and .call_sites sections reference text addresses,
but not with the intent to indirect branch to them, so they don't
need to be validated for IBT.

This is useful when running objtool on object files which already
have .return_sites or .call_sites sections, for example to re-run
objtool after it has reported an error or a warning.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-17-alexandre.chartre@oracle.com
2025-11-21 15:30:12 +01:00
Alexandre Chartre 350c7ab857 objtool: Improve tracing of alternative instructions
When tracing function validation, improve the reporting of
alternative instructions by more clearly showing where the different
alternatives begin and end.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-16-alexandre.chartre@oracle.com
2025-11-21 15:30:11 +01:00
Alexandre Chartre 9b580accac objtool: Add functions to better name alternatives
Add the disas_alt_name() and disas_alt_type_name() functions to provide a
name and a type name for an alternative. This will be used to
better name alternatives when tracing their execution.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-15-alexandre.chartre@oracle.com
2025-11-21 15:30:11 +01:00
Alexandre Chartre d490aa2197 objtool: Identify the different types of alternatives
Alternative code, including jump tables and exception tables, is represented
with the same struct alternative structure. But there is no obvious way to
identify whether the struct represents alternative instructions, a jump
table or an exception table.

So add a type to struct alternative to clearly identify the type of
alternative.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-14-alexandre.chartre@oracle.com
2025-11-21 15:30:11 +01:00
Alexandre Chartre 26a453fb56 objtool: Improve register reporting during function validation
When tracing function validation, instruction state changes can
report changes involving registers. These registers are reported
with the name "r<num>" (e.g. "r3"). Print the CPU specific register
name instead of a generic name (e.g. print "rbx" instead of "r3"
on x86).

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-13-alexandre.chartre@oracle.com
2025-11-21 15:30:10 +01:00
Alexandre Chartre fcb268b47a objtool: Trace instruction state changes during function validation
During function validation, objtool maintains a per-instruction state,
in particular to track call frame information. When tracing validation,
print any instruction state changes.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-12-alexandre.chartre@oracle.com
2025-11-21 15:30:10 +01:00
Alexandre Chartre 70589843b3 objtool: Add option to trace function validation
Add an option to trace and have information during the validation
of specified functions. Functions are specified with the --trace
option which can be a single function name (e.g. --trace foo to
trace the function with the name "foo"), or a shell wildcard
pattern (e.g. --trace foo* to trace all functions with a name
starting with "foo").

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-11-alexandre.chartre@oracle.com
2025-11-21 15:30:09 +01:00
Alexandre Chartre de0248fbbf objtool: Record symbol name max length
Keep track of the maximum length of symbol names. This will help
format the code flow between different functions.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-10-alexandre.chartre@oracle.com
2025-11-21 15:30:09 +01:00
Alexandre Chartre a0e5bf9fd6 objtool: Extract code to validate instruction from the validate branch loop
The code to validate a branch loops through all instructions of the
branch and validates each instruction. Move the code that validates an
instruction into a separate function.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-9-alexandre.chartre@oracle.com
2025-11-21 15:30:08 +01:00
Alexandre Chartre 0bb080ba64 objtool: Disassemble instruction on warning or backtrace
When an instruction warning (WARN_INSN) or backtrace (BT_INSN) is issued,
disassemble the instruction to provide more context.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-8-alexandre.chartre@oracle.com
2025-11-21 15:30:08 +01:00
Alexandre Chartre d4e13c2149 objtool: Store instruction disassembly result
When disassembling an instruction store the result instead of directly
printing it.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-7-alexandre.chartre@oracle.com
2025-11-21 15:30:08 +01:00
Alexandre Chartre 5d859dff26 objtool: Print symbol during disassembly
Print symbols referenced during disassembly instead of just printing
raw addresses. Also handle address relocation.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-6-alexandre.chartre@oracle.com
2025-11-21 15:30:07 +01:00
Alexandre Chartre f348a44c10 tool build: Remove annoying newline in build output
Remove the newline which is printed during feature discovery
when nothing else is printed.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-5-alexandre.chartre@oracle.com
2025-11-21 15:30:07 +01:00
Alexandre Chartre 5995330382 objtool: Disassemble code with libopcodes instead of running objdump
objtool executes the objdump command to disassemble code. Use libopcodes
instead to have more control about the disassembly scope and output.
If libopcodes is not present then objtool is built without disassembly
support.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-4-alexandre.chartre@oracle.com
2025-11-21 15:30:07 +01:00
Alexandre Chartre 1013f2e37b objtool: Create disassembly context
Create a structure to store information for disassembling functions.
For now, it is just a wrapper around an objtool file.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-3-alexandre.chartre@oracle.com
2025-11-21 15:30:06 +01:00
Alexandre Chartre 55d2a473f3 objtool: Move disassembly functions to a separated file
objtool disassembles functions which have warnings. Move the code
to do that to a dedicated file. The code is just moved; it is not
changed.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/20251121095340.464045-2-alexandre.chartre@oracle.com
2025-11-21 15:30:06 +01:00
Peter Zijlstra b9b2c455f4 bug: Allow architectures to provide __WARN_printf()
In addition to providing __WARN_FLAGS(), allow an architecture to also
provide __WARN_printf().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.807154591@infradead.org
2025-11-21 11:21:32 +01:00
Peter Zijlstra 3fd45b871f bug: Implement WARN_ON() using __WARN_FLAGS()
This completes 3bc3c9c3ab ("bugs/core: Pass down the condition
string of WARN_ON_ONCE(cond) warnings to __WARN_FLAGS()") and makes
WARN_ON() and WARN_ON_ONCE() behaviour consistent.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.690999560@infradead.org
2025-11-21 11:21:32 +01:00
Peter Zijlstra 7d2c27a0ec bug: Add report_bug_entry()
Add a report_bug() variant where the bug_entry is already known. This
is useful when the exception instruction is not instantiated per-site
but instead has a single instance. In such a case the bug_entry
address might be passed along in a known register or something.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.575795595@infradead.org
2025-11-21 11:21:31 +01:00
Peter Zijlstra 5c47b7f3d1 bug: Add BUG_FORMAT_ARGS infrastructure
Add BUG_FORMAT_ARGS; when an architecture is able to provide a va_list
given pt_regs, use this to print format arguments.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.457339417@infradead.org
2025-11-21 11:21:31 +01:00
Peter Zijlstra 30b82568b0 bug: Clean up CONFIG_GENERIC_BUG_RELATIVE_POINTERS
Three repeated CONFIG_GENERIC_BUG_RELATIVE_POINTERS #ifdefs right
after one another yield unreadable code. Add a helper.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.341703850@infradead.org
2025-11-21 11:21:31 +01:00
Peter Zijlstra d292dbb564 bug: Add BUG_FORMAT infrastructure
Add BUG_FORMAT; an architecture opt-in feature that allows adding the
WARN_printf() format string to the bug_entry table.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.223371452@infradead.org
2025-11-21 11:21:30 +01:00
Peter Zijlstra 1be1fac648 x86: Rework __bug_table helpers
Rework the __bug_table helpers such that extension becomes easier.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251110115757.111187573@infradead.org
2025-11-21 11:21:30 +01:00
Peter Zijlstra 2ace527183 Merge branch 'objtool/core'
Bring in the UDB and objtool data annotations to avoid conflicts while further extending the bug exceptions.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
2025-11-21 11:21:20 +01:00
Avadhut Naik 821f5fe4db x86/mce: Add support for physical address valid bit
Starting with Zen6, AMD's Scalable MCA systems will incorporate two new bits in
MCA_STATUS and MCA_CONFIG MSRs. These bits will indicate if a valid System
Physical Address (SPA) is present in MCA_ADDR.

PhysAddrValidSupported bit (MCA_CONFIG[11]) serves as the architectural
indicator and states if PhysAddrV bit (MCA_STATUS[54]) is Reserved or if it
indicates validity of SPA in MCA_ADDR.

PhysAddrV bit (MCA_STATUS[54]) advertises if MCA_ADDR contains valid SPA or if
it is implementation specific.

Use and prefer MCA_STATUS[PhysAddrV] when checking for a usable address.

Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20251118191731.181269-1-avadhut.naik@amd.com
2025-11-21 10:32:28 +01:00
Yazen Ghannam eeb3f76d73 x86/mce: Save and use APEI corrected threshold limit
The MCA threshold limit generally is not something that needs to change during
runtime. It is common for a system administrator to decide on a policy for
their managed systems.

If MCA thresholding is OS-managed, then the threshold limit must be set at
every boot. However, many systems allow the user to set a value in their BIOS.
And this is reported through an APEI HEST entry even if thresholding is not in
FW-First mode.

Use this value, if available, to set the OS-managed threshold limit.  Users
can still override it through sysfs if desired for testing or debug.

APEI is parsed after MCE is initialized. So reset the thresholding blocks
later to pick up the threshold limit.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
2025-11-21 10:32:28 +01:00
Josh Poimboeuf 11991999a2 Revert "objtool: Warn on functions with ambiguous -ffunction-sections section names"
This reverts commit 9c7dc1dd89.

The check-function-names.sh script now provides the function name
checking functionality for all architectures, making the objtool check
redundant.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/c7d549d4de8bd1490d106b99630eea5efc69a4dd.1763669451.git.jpoimboe@kernel.org
2025-11-21 10:04:10 +01:00
Josh Poimboeuf 93863f3f85 kbuild: Check for functions with ambiguous -ffunction-sections section names
Commit 9c7dc1dd89 ("objtool: Warn on functions with ambiguous
-ffunction-sections section names") only works for drivers which are
compiled on architectures supported by objtool.

Make a script to perform the same check for all architectures.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/a6a49644a34964f7e02f3a8ce43af03e72817180.1763669451.git.jpoimboe@kernel.org
2025-11-21 10:04:10 +01:00
Josh Poimboeuf 3186333713 tty: synclink_gt: Fix namespace collision and startup() section placement with -ffunction-sections
When compiled with -ffunction-sections (e.g., for LTO, livepatch, dead
code elimination, AutoFDO, or Propeller), the startup() function gets
compiled into the .text.startup section (or in some cases
.text.startup.constprop.0 or .text.startup.isra.0).

However, the .text.startup and .text.startup.* sections are also used by
the compiler for __attribute__((constructor)) code.

This naming conflict causes the vmlinux linker script to wrongly place
startup() function code in .init.text, which gets freed during boot.

Some builds have a mix of objects, both with and without
-ffunction-sections, so it's not possible for the linker script to
disambiguate with #ifdef CONFIG_FUNCTION_SECTIONS or similar.  This
means that "startup" unfortunately needs to be prohibited as a function
name.

Rename startup() to startup_hw().  For consistency, also rename its
shutdown() counterpart to shutdown_hw().

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/f0ee750f35c878172cc09916a0724b74e62eadc2.1763669451.git.jpoimboe@kernel.org
2025-11-21 10:04:10 +01:00
Josh Poimboeuf 845c09e474 tty: amiserial: Fix namespace collision and startup() section placement with -ffunction-sections
When compiled with -ffunction-sections (e.g., for LTO, livepatch, dead
code elimination, AutoFDO, or Propeller), the startup() function gets
compiled into the .text.startup section (or in some cases
.text.startup.constprop.0 or .text.startup.isra.0).

However, the .text.startup and .text.startup.* sections are also used by
the compiler for __attribute__((constructor)) code.

This naming conflict causes the vmlinux linker script to wrongly place
startup() function code in .init.text, which gets freed during boot.

Some builds have a mix of objects, both with and without
-ffunction-sections, so it's not possible for the linker script to
disambiguate with #ifdef CONFIG_FUNCTION_SECTIONS or similar.  This
means that "startup" unfortunately needs to be prohibited as a function
name.

Rename startup() to rs_startup().  For consistency, also rename its
shutdown() counterpart to rs_shutdown().

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/9e56afff5268b0b12b99a8aa9bf244d6ebdcdf47.1763669451.git.jpoimboe@kernel.org
2025-11-21 10:04:09 +01:00
Josh Poimboeuf 2c715c9de2 media: atomisp: gc2235: Fix namespace collision and startup() section placement with -ffunction-sections
When compiled with -ffunction-sections (e.g., for LTO, livepatch, dead
code elimination, AutoFDO, or Propeller), the startup() function gets
compiled into the .text.startup section (or in some cases
.text.startup.constprop.0 or .text.startup.isra.0).

However, the .text.startup and .text.startup.* sections are also used by
the compiler for __attribute__((constructor)) code.

This naming conflict causes the vmlinux linker script to wrongly place
startup() function code in .init.text, which gets freed during boot.

Some builds have a mix of objects, both with and without
-ffunction-sections, so it's not possible for the linker script to
disambiguate with #ifdef CONFIG_FUNCTION_SECTIONS or similar.  This
means that "startup" unfortunately needs to be prohibited as a function
name.

Rename startup() to gc2235_startup().

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/d28103a6edf7beceb5e3c6fa24e49dbad1350389.1763669451.git.jpoimboe@kernel.org
2025-11-21 10:04:09 +01:00
Josh Poimboeuf da6202139a serial: icom: Fix namespace collision and startup() section placement with -ffunction-sections
When compiled with -ffunction-sections (e.g., for LTO, livepatch, dead
code elimination, AutoFDO, or Propeller), the startup() function gets
compiled into the .text.startup section (or in some cases
.text.startup.constprop.0 or .text.startup.isra.0).

However, the .text.startup and .text.startup.* sections are also used by
the compiler for __attribute__((constructor)) code.

This naming conflict causes the vmlinux linker script to wrongly place
startup() function code in .init.text, which gets freed during boot.

Some builds have a mix of objects, both with and without
-ffunction-sections, so it's not possible for the linker script to
disambiguate with #ifdef CONFIG_FUNCTION_SECTIONS or similar.  This
means that "startup" unfortunately needs to be prohibited as a function
name.

Rename startup() to icom_startup().  For consistency, also rename its
shutdown() counterpart to icom_shutdown().

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patch.msgid.link/1aee9ef69f9d40405676712b34f0c397706e7023.1763669451.git.jpoimboe@kernel.org
2025-11-21 10:04:09 +01:00
Josh Poimboeuf 106f11d43b objtool: Remove second pass of .cold function correlation
The .cold function parent/child correlation logic has two passes: one in
read_symbols() and one in add_jump_destinations().

The second pass was added with commit cd77849a69 ("objtool: Fix GCC 8
cold subfunction detection for aliased functions") to ensure that if the
parent symbol had aliases then the canonical symbol was chosen as the
parent.

That solution was rather clunky, not to mention incomplete due to the
existence of alternatives and switch tables.  Now that we have
sym->alias, the canonical alias fix can be done much simpler in the
first pass, making the second pass obsolete.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/bdab245a38000a5407f663a031f39e14c67a43d4.1763671318.git.jpoimboe@kernel.org
2025-11-21 10:04:08 +01:00
Josh Poimboeuf a91a61b290 objtool: Skip non-canonical aliased symbols in add_jump_table_alts()
If a symbol has aliases, make add_jump_table_alts() skip the
non-canonical ones to avoid any surprises.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/169aa17564b9aadb74897945ea74ac2eb70c5b13.1763671318.git.jpoimboe@kernel.org
2025-11-21 10:04:08 +01:00
Josh Poimboeuf 9205a322cf objtool: Return canonical symbol when aliases exist in symbol finding helpers
When symbol alias ambiguity exists in the symbol finding helper
functions, return the canonical sym->alias, as that's the one which gets
used by validate_branch() and elsewhere.

This doesn't fix any known issues, just makes the symbol alias behavior
more robust.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/450470a4897706af77453ad333e18af5ebab653c.1763671318.git.jpoimboe@kernel.org
2025-11-21 10:04:08 +01:00
Josh Poimboeuf 16f366c5a6 objtool: Don't alias undefined symbols
Objtool is mistakenly aliasing all undefined symbols.  That's obviously
wrong, though it has no consequence since objtool happens to only use
sym->alias for defined symbols.  Fix it regardless.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/bc401173a7717757eee672fc1ca5a20451d77b86.1763671318.git.jpoimboe@kernel.org
2025-11-21 10:04:08 +01:00
Josh Poimboeuf 2c2acca2ea objtool: Fix .cold function detection for duplicate symbols
The objtool .cold child/parent correlation is done in two phases: first
in elf_add_symbol() and later in add_jump_destinations().

The first phase is rather crude and can pick the wrong parent if there
are duplicates with the same name.

The second phase usually fixes that, but only if the parent has a direct
jump to the child.  It does *not* work if the only branch from the
parent to the child is an alternative or jump table entry.

Make the first phase more robust by looking for the parent in the same
STT_FILE as the child.

Fixes the following objtool warnings in an AutoFDO build with a large
CLANG_AUTOFDO_PROFILE profile:

  vmlinux.o: warning: objtool: rdev_add_key() falls through to next function rdev_add_key.cold()
  vmlinux.o: warning: objtool: rdev_set_default_key() falls through to next function rdev_set_default_key.cold()

Fixes: 13810435b9 ("objtool: Support GCC 8's cold subfunctions")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/82c7b52e40efa75dd10e1c550cc75c1ce10ac2c9.1763671318.git.jpoimboe@kernel.org
2025-11-21 10:04:07 +01:00
Josh Poimboeuf 024020e2b6 objtool: Support Clang AUTOFDO .cold functions
AutoFDO enables -fsplit-machine-functions which can move the cold parts
of a function to a <func>.cold symbol in a .text.split.<func> section.

Unlike GCC, the Clang <func>.cold symbols are not marked STT_FUNC.  This
confuses objtool in several ways, resulting in warnings like the
following:

  vmlinux.o: warning: objtool: apply_retpolines.cold+0xfc: unsupported instruction in callable function
  vmlinux.o: warning: objtool: machine_check_poll.cold+0x2e: unsupported instruction in callable function
  vmlinux.o: warning: objtool: free_deferred_objects.cold+0x1f: relocation to !ENDBR: free_deferred_objects.cold+0x26
  vmlinux.o: warning: objtool: rpm_idle.cold+0xe0: relocation to !ENDBR: rpm_idle.cold+0xe7
  vmlinux.o: warning: objtool: tcp_rcv_state_process.cold+0x1c: relocation to !ENDBR: tcp_rcv_state_process.cold+0x23

Fix it by marking the .cold symbols as STT_FUNC.

Fixes: 2fd65f7afd ("AutoFDO: Enable machine function split optimization for AutoFDO")
Closes: https://lore.kernel.org/20251103215244.2080638-2-xur@google.com
Reported-by: Rong Xu <xur@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: xur@google.com
Tested-by: xur@google.com
Link: https://patch.msgid.link/20a67326f04b2a361c031b56d58e8a803b3c5893.1763671318.git.jpoimboe@kernel.org
2025-11-21 10:04:07 +01:00
Peter Zijlstra c04507ac50 sched: Provide and use set_need_resched_current()
set_tsk_need_resched(current) requires set_preempt_need_resched() to
work correctly outside of the scheduler.

Provide set_need_resched_current() which wraps this correctly and replace
all the open coded instances.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251116174750.665769842@linutronix.de
2025-11-20 22:26:09 +01:00
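As a rough sketch only (not necessarily the exact kernel implementation),
the set_need_resched_current() wrapper described above could look like
this; the lockdep assertion is an assumption about the calling context:

	static __always_inline void set_need_resched_current(void)
	{
		/* Callers are expected to run with interrupts disabled. */
		lockdep_assert_irqs_disabled();
		set_tsk_need_resched(current);
		set_preempt_need_resched();
	}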
Gabriele Monaco 7dec062cfc timers/migration: Exclude isolated cpus from hierarchy
The timer migration mechanism allows active CPUs to pull timers from
idle ones to improve the overall idle time. This is however undesired
when CPU intensive workloads run on isolated cores, as the algorithm
would move the timers from housekeeping to isolated cores, negatively
affecting the isolation.

Exclude isolated cores from the timer migration algorithm and extend the
concept of unavailable cores, currently used for offline ones, to
isolated ones:
* A core is unavailable if isolated or offline;
* A core is available if non-isolated and online;

A core is considered unavailable as isolated if it belongs to:
* the isolcpus (domain) list
* an isolated cpuset
Except if it is:
* in the nohz_full list (already idle for the hierarchy)
* the nohz timekeeper core (must be available to handle global timers)

CPUs are added to the hierarchy during late boot, excluding isolated
ones, the hierarchy is also adapted when the cpuset isolation changes.

Due to how the timer migration algorithm works, any CPU that is part of the
hierarchy can have its global timers pulled by remote CPUs and has to pull
remote timers itself; skipping only the pulling of remote timers would break
the logic.
For this reason, prevent isolated CPUs from pulling remote global
timers, but also the other way around: any global timer started on an
isolated CPU will run there. This does not break the concept of
isolation (global timers don't come from outside the CPU) and, if
considered inappropriate, can usually be mitigated with other isolation
techniques (e.g. IRQ pinning).

This effect was noticed on a 128-core machine running oslat on the
isolated cores (1-31,33-63,65-95,97-127). The tool monopolises the CPUs,
and the CPUs with the lowest count in a timer migration hierarchy (here 1
and 65) appear as always active and continuously pull global timers from
the housekeeping CPUs. This ends up moving driver work (e.g.
delayed work) to isolated CPUs and causes latency spikes:

before the change:

 # oslat -c 1-31,33-63,65-95,97-127 -D 62s
 ...
  Maximum:     1203 10 3 4 ... 5 (us)

after the change:

 # oslat -c 1-31,33-63,65-95,97-127 -D 62s
 ...
  Maximum:      10 4 3 4 3 ... 5 (us)

The same behaviour was observed on a machine with as few as 20 cores /
40 threads, with isolcpus set to 1-9,11-39, using rtla-osnoise-top.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: John B. Wyatt IV <jwyatt@redhat.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://patch.msgid.link/20251120145653.296659-8-gmonaco@redhat.com
2025-11-20 20:17:32 +01:00
Yury Norov b56651007f cpumask: Add initialiser to use cleanup helpers
Now we can simplify code that allocates cpumasks for local needs.

Automatic variables have to be initialised at declaration, or at least
before any possibility for the logic to return, so that the compiler
doesn't call the associated destructor function on a random stack value.

Because cpumask_var_t, depending on the CPUMASK_OFFSTACK config, is
either a pointer or an array, we have to have a macro for initialisation.

So define a CPUMASK_VAR_NULL macro, which initialises the struct cpumask
pointer with NULL when CPUMASK_OFFSTACK is enabled, and is effectively a
no-op when CPUMASK_OFFSTACK is disabled (the initialisation is optimised
out with -O2).

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://patch.msgid.link/20251120145653.296659-7-gmonaco@redhat.com
2025-11-20 20:17:32 +01:00
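A plausible shape for such a macro (a sketch; the actual definition in the
tree may differ) is:

	#ifdef CONFIG_CPUMASK_OFFSTACK
	/* cpumask_var_t is a 'struct cpumask *': start life as NULL */
	#define CPUMASK_VAR_NULL	NULL
	#else
	/* cpumask_var_t is an on-stack array: empty initialiser, optimised out at -O2 */
	#define CPUMASK_VAR_NULL	{ }
	#endif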
Gabriele Monaco 185bccc797 sched/isolation: Force housekeeping if isolcpus and nohz_full don't leave any
Currently the user can set up isolcpus and nohz_full in such a way that
no housekeeping CPU is left (i.e. no CPU that is neither domain isolated
nor nohz full). This can be a problem for other subsystems (e.g. the
timer wheel migration).

Prevent this configuration by invalidating the last setting in case the
union of isolcpus (domain) and nohz_full covers all CPUs.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Waiman Long <longman@redhat.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://patch.msgid.link/20251120145653.296659-6-gmonaco@redhat.com
2025-11-20 20:17:31 +01:00
Gabriele Monaco 22f8e41680 cgroup/cpuset: Rename update_unbound_workqueue_cpumask() to update_isolation_cpumasks()
update_unbound_workqueue_cpumask() updates unbound workqueue settings
when there is a change in isolated CPUs, but it can also serve other
subsystems that require updates when the isolated CPUs change.

Generalise the name to update_isolation_cpumasks() to prepare for calling
functions unrelated to workqueues in that spot.

[longman: Change the function name to update_isolation_cpumasks()]

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Chen Ridong <chenridong@huaweicloud.com>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://patch.msgid.link/20251120145653.296659-5-gmonaco@redhat.com
2025-11-20 20:17:31 +01:00
Gabriele Monaco 4c2374ed86 timers/migration: Use scoped_guard on available flag set/clear
Clean up tmigr_clear_cpu_available() and tmigr_set_cpu_available() to
prepare for easier checks on the available flag.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251120145653.296659-4-gmonaco@redhat.com
2025-11-20 20:17:31 +01:00
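A sketch of the scoped_guard() pattern this refers to; the struct, field
and flag names are assumptions, not the exact timer_migration.c code:

	static void tmigr_clear_cpu_available(unsigned int cpu)
	{
		struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);

		scoped_guard(raw_spinlock_irq, &tmc->lock) {
			tmc->available = false;
		}	/* lock dropped automatically when the scope ends */
	}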
Gabriele Monaco a048ca5f00 timers/migration: Add mask for CPUs available in the hierarchy
Keep track of the CPUs available for timer migration in a cpumask. This
prepares the ground to generalise the concept of unavailable CPUs.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251120145653.296659-3-gmonaco@redhat.com
2025-11-20 20:17:31 +01:00
Gabriele Monaco 8312cab5ff timers/migration: Rename 'online' bit to 'available'
The timer migration hierarchy excludes offline CPUs via the
tmigr_is_not_available function, which is essentially checking the
online bit for the CPU.

Rename the online bit to available, along with all references in function
names and tracepoints, to generalise the concept of available CPUs.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251120145653.296659-2-gmonaco@redhat.com
2025-11-20 20:17:31 +01:00
Thomas Gleixner 79c11fb3da sched/mmcid: Use cpumask_weighted_or()
Use cpumask_weighted_or() instead of cpumask_or() followed by
cpumask_weight() on the result, which walks the same bitmap twice. This
results in 10-20% fewer cycles, which reduces the runqueue lock hold time.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Link: https://patch.msgid.link/20251119172549.511736272@linutronix.de
2025-11-20 12:14:54 +01:00
Thomas Gleixner 437cb3ded2 cpumask: Introduce cpumask_weighted_or()
CID management ORs two cpumasks and then calculates the weight of the
result. That's inefficient as it has to walk the same bitmaps twice. As
this is done with the runqueue lock held, there is a real benefit in
speeding it up. Depending on the system this results in 10-20% fewer
cycles spent with the runqueue lock held for a 4K cpumask.

Provide cpumask_weighted_or() and the corresponding bitmap functions which
return the weight of the OR result right away.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.448263340@linutronix.de
2025-11-20 12:14:54 +01:00
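An illustrative use of the new helper (local variable names are assumed),
replacing the two-pass pattern with a single walk:

	/* before: walks the resulting bitmap twice */
	cpumask_or(dst, src1, src2);
	weight = cpumask_weight(dst);

	/* after: OR and weight computed in a single pass */
	weight = cpumask_weighted_or(dst, src1, src2);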
Thomas Gleixner 0d032a43eb sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
mm_update_cpus_allowed() is not required to be invoked for affinity changes
due to migrate_disable() and migrate_enable().

migrate_disable() restricts the task temporarily to a CPU on which the task
was already allowed to run, so nothing changes. migrate_enable() restores
the actual task affinity mask.

If that mask changed between migrate_disable() and migrate_enable() then
that change was already accounted for.

Move the invocation to the proper place to avoid that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.385208276@linutronix.de
2025-11-20 12:14:54 +01:00
Thomas Gleixner b08ef5fc8f sched/mmcid: Move scheduler code out of global header
This is only used in the scheduler core code, so there is no point in
having it in a global header.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Link: https://patch.msgid.link/20251119172549.321259077@linutronix.de
2025-11-20 12:14:53 +01:00
Thomas Gleixner 925b7847bb sched: Fixup whitespace damage
With whitespace checks enabled in the editor this makes eyes bleed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.258651925@linutronix.de
2025-11-20 12:14:53 +01:00
Thomas Gleixner be4463fa2c sched/mmcid: Cacheline align MM CID storage
Both the per CPU storage and the data in mm_struct are heavily used in
context switch. As they can end up next to other frequently modified data,
they are subject to false sharing.

Make them cache line aligned.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.194111661@linutronix.de
2025-11-20 12:14:53 +01:00
Thomas Gleixner 8cea569ca7 sched/mmcid: Use proper data structures
Having a lot of CID functionality specific members in struct task_struct
and struct mm_struct is not really making the code easier to read.

Encapsulate the CID specific parts in data structures and keep them
separate from the stuff they are embedded in.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.131573768@linutronix.de
2025-11-20 12:14:52 +01:00
Thomas Gleixner 77d7dc8bef sched/mmcid: Revert the complex CID management
The CID management is a complex beast, which affects both scheduling and
task migration. The compaction mechanism forces random tasks of a process
into task work on exit to user space causing latency spikes.

Revert to the initial simple bitmap allocation mechanics, which are known
to have scalability issues, as that allows gradually building up
replacement functionality in a reviewable way.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.068197830@linutronix.de
2025-11-20 12:14:52 +01:00
Christian Brauner 101bf15887
Merge patch series "ovl: convert copyup credential override to cred guard"
Christian Brauner <brauner@kernel.org> says:

This simplifies the copyup specific credential override.

The current code is centered around a helper struct ovl_cu_creds and is
a bit convoluted. We can simplify this by using a cred guard. This will
also allow us to remove the helper struct and associated functions.

* patches from https://patch.msgid.link/20251114-work-ovl-cred-guard-copyup-v1-0-ea3fb15cf427@kernel.org:
  ovl: remove struct ovl_cu_creds and associated functions
  ovl: port ovl_copy_up_tmpfile() to cred guard
  ovl: mark *_cu_creds() as unused temporarily
  ovl: port ovl_copy_up_workdir() to cred guard
  ovl: add copy up credential guard

Link: https://patch.msgid.link/20251114-work-ovl-cred-guard-copyup-v1-0-ea3fb15cf427@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:27 +01:00
Christian Brauner c0fb968656
Merge patch series "ovl: convert creation credential override to cred guard"
Christian Brauner <brauner@kernel.org> says:

This cleans up the creation specific credential override.

The current code to override credentials for creation operations is
pretty difficult to understand as we override the credentials twice:

(1) override with the mounter's credentials
(2) copy the mounter's credentials and override the fs{g,u}id with the inode {u,g}id

And then we elide the revert_creds() because it would be an idempotent
revert. That elision doesn't buy us anything anymore though because it's
all reference-count-less anyway.

The fact that this is done in a function and that the revert is
happening in the original override makes this a lot to grasp.

By introducing a cleanup guard for the creation case we can make this a
lot easier to understand and extremely visually prevalent:

with_ovl_creds(dentry->d_sb) {
	scoped_class(prepare_creds_ovl, cred, dentry, inode, mode) {
		if (IS_ERR(cred))
			return PTR_ERR(cred);

		ovl_path_upper(dentry->d_parent, &realparentpath);

		/* more stuff you want to do */
	}
}

I think this is a big improvement over what we have now.

* patches from https://patch.msgid.link/20251117-work-ovl-cred-guard-prepare-v2-0-bd1c97a36d7b@kernel.org:
  ovl: drop ovl_setup_cred_for_create()
  ovl: port ovl_create_or_link() to new ovl_override_creator_creds cleanup guard
  ovl: mark ovl_setup_cred_for_create() as unused temporarily
  ovl: reflow ovl_create_or_link()
  ovl: port ovl_create_tmpfile() to new ovl_override_creator_creds cleanup guard
  ovl: add ovl_override_creator_creds cred guard

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-prepare-v2-0-bd1c97a36d7b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:27 +01:00
Christian Brauner 2c42b6ce4a
ovl: remove struct ovl_cu_creds and associated functions
Now that we have this all ported to a cred guard, remove the struct and
the associated helpers.

Link: https://patch.msgid.link/20251114-work-ovl-cred-guard-copyup-v1-5-ea3fb15cf427@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:27 +01:00
Christian Brauner 72f098f0dd
ovl: port ovl_copy_up_tmpfile() to cred guard
Remove the complicated struct ovl_cu_creds dance and use our new copy up
cred guard.

Link: https://patch.msgid.link/20251114-work-ovl-cred-guard-copyup-v1-4-ea3fb15cf427@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:27 +01:00
Christian Brauner 643b8a2c0a
ovl: mark *_cu_creds() as unused temporarily
They will become unused in the next patch and we'll drop them after the
conversion is finished together with the struct. This keeps the changes
small and reviewable.

Link: https://patch.msgid.link/20251114-work-ovl-cred-guard-copyup-v1-3-ea3fb15cf427@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:27 +01:00
Christian Brauner bdba9c79c8
ovl: port ovl_copy_up_workdir() to cred guard
Remove the complicated struct ovl_cu_creds dance and use our new copy up
cred guard.

Link: https://patch.msgid.link/20251114-work-ovl-cred-guard-copyup-v1-2-ea3fb15cf427@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:27 +01:00
Christian Brauner 81b77b5b0a
ovl: add copy up credential guard
Add a credential guard for copy up. This will allow us to do away with
struct ovl_cu_creds and simplify the code.

Link: https://patch.msgid.link/20251114-work-ovl-cred-guard-copyup-v1-1-ea3fb15cf427@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:27 +01:00
Christian Brauner 89a11f004f
ovl: drop ovl_setup_cred_for_create()
It is now unused and can be removed.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-prepare-v2-6-bd1c97a36d7b@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:26 +01:00
Christian Brauner e566bff963
ovl: port ovl_create_or_link() to new ovl_override_creator_creds cleanup guard
This clearly indicates the double-credential override and makes the code
a lot easier to grasp with one glance.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-prepare-v2-5-bd1c97a36d7b@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:26 +01:00
Christian Brauner 8a227c2766
ovl: mark ovl_setup_cred_for_create() as unused temporarily
The function will become unused in the next patch.
We'll remove it in later patches to keep the diff legible.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-prepare-v2-4-bd1c97a36d7b@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:26 +01:00
Christian Brauner d6ef072d09
ovl: reflow ovl_create_or_link()
Reflow the creation routine in preparation of porting it to a guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-prepare-v2-3-bd1c97a36d7b@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:26 +01:00
Christian Brauner 8d7fc461e4
ovl: port ovl_create_tmpfile() to new ovl_override_creator_creds cleanup guard
This clearly indicates the double-credential override and makes the code
a lot easier to grasp with one glance.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-prepare-v2-2-bd1c97a36d7b@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:26 +01:00
Christian Brauner f37b334728
ovl: add ovl_override_creator_creds cred guard
The current code to override credentials for creation operations is
pretty difficult to understand. We effectively override the credentials
twice:

(1) override with the mounter's credentials
(2) copy the mounter's credentials and override the fs{g,u}id with the inode {u,g}id

And then we elide the revert because it would be an idempotent revert.
That elision doesn't buy us anything anymore though because I've made it
all work without any reference counting anyway. All it does is mix the
two credential overrides together.

We can use a cleanup guard to clarify the creation codepaths and make
them easier to understand.

This just introduces the cleanup guard keeping the patch reviewable.
We'll convert the caller in follow-up patches and then drop the
duplicated code.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-prepare-v2-1-bd1c97a36d7b@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:26 +01:00
Christian Brauner 5c06bc9f06
Merge patch series "ovl: convert to cred guard"
Christian Brauner <brauner@kernel.org> says:

This adds an overlayfs specific extension of the cred guard
infrastructure I introduced. This allows all of overlayfs to be ported
to cred guards. I refactored a few functions to reduce the scope of the
cred guard. I think this is beneficial as it's visually very easy to
grasp the scope in one go. Lightly tested.

* patches from https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-0-b31603935724@kernel.org: (42 commits)
  ovl: remove ovl_revert_creds()
  ovl: port ovl_fill_super() to cred guard
  ovl: refactor ovl_fill_super()
  ovl: port ovl_lower_positive() to cred guard
  ovl: port ovl_lookup() to cred guard
  ovl: refactor ovl_lookup()
  ovl: port ovl_copyfile() to cred guard
  ovl: port ovl_rename() to cred guard
  ovl: refactor ovl_rename()
  ovl: introduce struct ovl_renamedata
  ovl: port ovl_listxattr() to cred guard
  ovl: port ovl_xattr_get() to cred guard
  ovl: port ovl_xattr_set() to cred guard
  ovl: port ovl_nlink_end() to cred guard
  ovl: port ovl_nlink_start() to cred guard
  ovl: port ovl_check_empty_dir() to cred guard
  ovl: port ovl_dir_llseek() to cred guard
  ovl: refactor ovl_iterate() and port to cred guard
  ovl: don't override credentials for ovl_check_whiteouts()
  ovl: port ovl_maybe_lookup_lowerdata() to cred guard
  ...

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-0-b31603935724@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:25 +01:00
Christian Brauner 850e32512a
ovl: remove ovl_revert_creds()
The wrapper isn't needed anymore. Overlayfs completely relies on its
cleanup guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-42-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:25 +01:00
Christian Brauner 217e78d1b7
ovl: port ovl_fill_super() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-41-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:25 +01:00
Christian Brauner fc95cda673
ovl: refactor ovl_fill_super()
Split the core into a separate helper in preparation of converting the
caller to the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-40-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:25 +01:00
Christian Brauner db7cfe8783
ovl: port ovl_lower_positive() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-39-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:25 +01:00
Christian Brauner 6b6ef7d16f
ovl: port ovl_lookup() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-38-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:25 +01:00
Christian Brauner 15da486ad3
ovl: refactor ovl_lookup()
Split the core into a separate helper in preparation of converting the
caller to the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-37-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:25 +01:00
Christian Brauner 14d35fda5b
ovl: port ovl_copyfile() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-36-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:24 +01:00
Christian Brauner ca0c657f25
ovl: port ovl_rename() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-35-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:24 +01:00
Christian Brauner a1da840198
ovl: refactor ovl_rename()
Extract the code that runs under overridden credentials into a separate
ovl_rename_upper() helper function and the code that runs before/after to
ovl_rename_start/end(). Error handling is simplified:
the helpers return errors directly instead of using goto labels.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-34-b31603935724@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:24 +01:00
Christian Brauner fb9f31fe9f
ovl: introduce struct ovl_renamedata
Add a struct ovl_renamedata to group rename-related state that was
previously stored in local variables. Embed struct renamedata directly,
aligning with the VFS.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-33-b31603935724@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:24 +01:00
Christian Brauner 0b5800172c
ovl: port ovl_listxattr() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-32-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:24 +01:00
Christian Brauner ae64b54185
ovl: port ovl_xattr_get() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-31-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:24 +01:00
Christian Brauner d605301726
ovl: port ovl_xattr_set() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-30-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:24 +01:00
Christian Brauner 9e5ec68f3a
ovl: port ovl_nlink_end() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-29-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:23 +01:00
Christian Brauner 062c5b48d2
ovl: port ovl_nlink_start() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-28-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:23 +01:00
Christian Brauner 67bc75e6f4
ovl: port ovl_check_empty_dir() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-27-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:23 +01:00
Christian Brauner 5517646e14
ovl: port ovl_dir_llseek() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-26-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:23 +01:00
Christian Brauner d25e4b739f
ovl: refactor ovl_iterate() and port to cred guard
Factor out ovl_iterate_merged() and move some code into
ovl_iterate_real() for easier use of the scoped ovl cred guard.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-25-b31603935724@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:23 +01:00
Christian Brauner 198d182288
ovl: don't override credentials for ovl_check_whiteouts()
The function is only called when rdd->dentry is non-NULL:

if (!err && rdd->first_maybe_whiteout && rdd->dentry)
    err = ovl_check_whiteouts(realpath, rdd);

| Caller                        | Sets rdd->dentry? | Can call ovl_check_whiteouts()? |
|-------------------------------|-------------------|---------------------------------|
| ovl_dir_read_merged()         | ✓ Yes (line 430)  | ✓ YES                           |
| ovl_dir_read_impure()         | ✗ No              | ✗ NO                            |
| ovl_check_d_type_supported()  | ✗ No              | ✗ NO                            |
| ovl_workdir_cleanup_recurse() | ✗ No              | ✗ NO                            |
| ovl_indexdir_cleanup()        | ✗ No              | ✗ NO                            |

VFS layer (.iterate_shared file operation)
  → ovl_iterate()
      [CRED OVERRIDE]
      → ovl_cache_get()
          → ovl_dir_read_merged()
              → ovl_dir_read()
                  → ovl_check_whiteouts()
      [CRED REVERT]

ovl_unlink()
  → ovl_do_remove()
      → ovl_check_empty_dir()
          [CRED OVERRIDE]
          → ovl_dir_read_merged()
              → ovl_dir_read()
                  → ovl_check_whiteouts()
          [CRED REVERT]

ovl_rename()
  → ovl_check_empty_dir()
      [CRED OVERRIDE]
      → ovl_dir_read_merged()
          → ovl_dir_read()
              → ovl_check_whiteouts()
      [CRED REVERT]

All valid callchains already override credentials so drop the override.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-24-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:23 +01:00
Christian Brauner cb3c8cbaed
ovl: port ovl_maybe_lookup_lowerdata() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-23-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:23 +01:00
Christian Brauner b1c47b3abc
ovl: port ovl_maybe_validate_verity() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-22-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:22 +01:00
Christian Brauner 4975e683c2
ovl: port ovl_fileattr_get() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-21-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:22 +01:00
Christian Brauner af1d5d62f3
ovl: port ovl_fileattr_set() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-20-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:22 +01:00
Christian Brauner a3860a808f
ovl: port ovl_fiemap() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-19-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:22 +01:00
Christian Brauner 8e9698d6e4
ovl: port ovl_set_or_remove_acl() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-18-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:22 +01:00
Christian Brauner 71ac28fbcd
ovl: port do_ovl_get_acl() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-17-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:22 +01:00
Christian Brauner 47eba7f7fd
ovl: port ovl_get_link() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-16-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:22 +01:00
Christian Brauner d81999b40b
ovl: port ovl_permission() to cred guard
Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-15-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:21 +01:00
Christian Brauner 81707ae827
ovl: port ovl_getattr() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-14-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:21 +01:00
Christian Brauner 7aedfa5a52
ovl: port ovl_setattr() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-13-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:21 +01:00
Christian Brauner 9763970984
ovl: port ovl_flush() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-12-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:21 +01:00
Christian Brauner 8e8f4df93c
ovl: port ovl_fadvise() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-11-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:21 +01:00
Christian Brauner 2468017783
ovl: port ovl_fallocate() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-10-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:21 +01:00
Christian Brauner 07a891c346
ovl: port ovl_fsync() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-9-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:21 +01:00
Christian Brauner 1fc4bc77c7
ovl: port ovl_llseek() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-8-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:20 +01:00
Christian Brauner b27ebb3d4b
ovl: port ovl_open_realfile() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-7-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:20 +01:00
Christian Brauner 5f51dfe768
ovl: port ovl_create_tmpfile() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-6-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:20 +01:00
Christian Brauner 8368eb837e
ovl: port ovl_do_remove() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-5-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:20 +01:00
Christian Brauner ff4f6e4689
ovl: port ovl_set_link_redirect() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-4-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:20 +01:00
Christian Brauner 8c9531edcf
ovl: port ovl_create_or_link() to cred guard
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-3-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:20 +01:00
Christian Brauner 87809f12e0
ovl: port ovl_copy_up_flags() to cred guards
Use the scoped ovl cred guard.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-2-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:20 +01:00
Christian Brauner 6f5c84162a
ovl: add override_creds cleanup guard extension for overlayfs
Overlayfs plucks the relevant creds from the superblock. Extend the
override_creds cleanup class I added to override_creds_ovl, which uses
the ovl_override_creds() function as its initialization helper. Add
with_ovl_creds() based on this new class.

Link: https://patch.msgid.link/20251117-work-ovl-cred-guard-v4-1-b31603935724@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:58:19 +01:00
Christian Brauner 658d1322fa
Merge branch 'vfs-6.19.directory.locking' into base.vfs-6.19.ovl
Bring in the directory locking changes as they touch overlayfs in a
pretty substantial way and we are about to change the credential
override semantics quite substantially as well.

Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:56:47 +01:00
Christian Brauner 2b21a6204d
Merge branch 'kbuild-6.19.fms.extension'
Bring in the shared branch with the kbuild tree to enable
'-fms-extensions' for 6.19. The overlayfs cred guard work
depends on this.

Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 21:56:17 +01:00
Qiuxu Zhuo d4839582bc EDAC/skx_common: Prepare for skx_set_hi_lo()
The upcoming imh_edac driver for Intel Diamond Rapids servers cannot
use skx_get_hi_lo() in skx_common to retrieve the TOHM (Top of High
Memory) and TOLM (Top of Low Memory) parameters. Instead, it obtains
these parameters within its own EDAC driver. To accommodate this,
prepare skx_set_hi_lo() to allow the driver to notify skx_common of
these parameters.

Tested-by: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20251119134132.2389472-4-qiuxu.zhuo@intel.com
2025-11-19 12:11:40 -08:00
Qiuxu Zhuo 9529e69773 EDAC/skx_common: Prepare for skx_get_edac_list()
The Intel EDAC library 'skx_common' maintains the Intel server EDAC device
list for {skx, i10nm}_edac drivers, which use skx_get_all_bus_mappings()
to build and retrieve the EDAC device list.

However, the upcoming Intel EDAC driver, imh_edac, for Diamond Rapids
servers is designed for memory controllers that are MMIO-based devices
rather than PCI devices. Consequently, it can't use
skx_get_all_bus_mappings() due to the absence of a PCI bus. To accommodate
this, prepare skx_get_edac_list() to enable the upcoming imh_edac driver
to obtain the EDAC device list from the skx_common library and build the
EDAC device list independently.

Tested-by: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20251119134132.2389472-3-qiuxu.zhuo@intel.com
2025-11-19 12:11:40 -08:00
Qiuxu Zhuo b3d70059cb EDAC/{skx_common,skx,i10nm}: Make skx_register_mci() independent of pci_dev
Memory controllers in the new Intel server CPUs, such as Diamond Rapids,
are presented as MMIO-based devices rather than PCI devices.
Modify skx_register_mci() to be independent of 'pci_dev' and use a generic
'dev' of 'struct device' to prepare for support of such MMIO-based memory
controllers.

Tested-by: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20251119134132.2389472-2-qiuxu.zhuo@intel.com
2025-11-19 12:11:40 -08:00
Mateusz Guzik bfef6e1f34
fs: move mntput_no_expire() slowpath into a dedicated routine
In the stock variant the compiler spills several registers on the stack
and employs stack smashing protection, adding even more code plus a branch
on exit.

The actual fast path is small enough that the compiler inlines it for
all callers -- the symbol is no longer emitted.

Forcing noinline on it just for code-measurement purposes shows the fast
path dropping from 111 to 39 bytes.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251114201803.2183505-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 14:49:28 +01:00
Christoph Hellwig 6d228c181e
fs: remove spurious exports in fs/file_attr.c
Commit 2f952c9e8f ("fs: split fileattr related helpers into separate
file") added various exports without users despite claiming to be a
simple refactor.  Drop them again.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251119101415.2732320-1-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 12:17:31 +01:00
Davidlohr Bueso c29383a874
watch_queue: Use local kmap in post_one_notification()
Replace the now deprecated kmap_atomic() with kmap_local_page().

Optimize for the non-highmem cases and avoid disabling preemption and
pagefaults, the caller's context is atomic anyway, but that is irrelevant
to kmap. The memcpy itself does not require any such semantics and the
mapping would hold valid across context switches anyway. Further, highmem
is planned to to be removed[1].

[1] https://lore.kernel.org/all/4ff89b72-03ff-4447-9d21-dd6a5fe1550f@app.fastmail.com/

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Link: https://patch.msgid.link/20251118210706.1816303-1-dave@stgolabs.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 12:17:28 +01:00
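The conversion follows the usual kmap_atomic() -> kmap_local_page()
pattern, roughly as below; the buffer names are placeholders rather than
the actual watch_queue code:

	/* before: disables preemption and pagefaults */
	p = kmap_atomic(page);
	memcpy(p + offset, data, len);
	kunmap_atomic(p);

	/* after: plain local mapping, no preemption/pagefault side effects */
	p = kmap_local_page(page);
	memcpy(p + offset, data, len);
	kunmap_local(p);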
Ian Kent 922a6f34c1
autofs: dont trigger mount if it cant succeed
If a mount namespace contains autofs mounts, and they are propagation
private, and there is no namespace specific automount daemon to handle
possible automounting, then attempted path resolution will loop until
MAXSYMLINKS is reached before failing, causing quite a bit of noise in
the log.

Add a check for this in autofs ->d_automount() so that the VFS can
immediately return an error in this case. Since the mount is propagation
private, an EPERM return seems most appropriate.

Suggested-by: Christian Brauner <brauner@kernel.org>

Signed-off-by: Ian Kent <raven@themaw.net>
Link: https://patch.msgid.link/20251118024631.10854-2-raven@themaw.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-19 11:14:02 +01:00
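Conceptually the check amounts to something like the following in the
->d_automount() implementation; the helper name is hypothetical:

	/* No daemon can ever service this propagation-private mount. */
	if (autofs_mount_cannot_succeed(sbi))
		return ERR_PTR(-EPERM);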
Reinette Chatre 5a88a6e92b fs/resctrl: Consider sparse masks when initializing new group's allocation
A new resource group is intended to be created with sane defaults. For a cache
resource this means all cache portions the new group could possibly allocate
into. This includes unused cache portions and shareable cache portions used by
other groups and hardware.

New resource group creation does not take sparse masks into account. After
determining the bitmask reflecting the new group's possible allocations the
bitmask is forced to be contiguous even if the system supports sparse masks.
For example, a new group could by default allocate into a large portion of
cache represented by 0xff0f, but it is instead created with a mask of 0xf.

Do not force a contiguous allocation range if the system supports sparse masks.

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/abbbb008bc09d982d715e79d3b885c10f92c64e0.1763426240.git.reinette.chatre@intel.com
2025-11-18 21:10:56 +01:00
Thorsten Blum cdf5ecc3f6 EDAC/ghes: Replace deprecated strcpy() in ghes_edac_report_mem_error()
strcpy() has been deprecated¹ because it performs no bounds checking on the
destination buffer, which can lead to buffer overflows. Use the safer
strscpy() instead.

¹ https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Link: https://patch.msgid.link/20251118135621.101148-2-thorsten.blum@linux.dev
2025-11-18 16:50:32 +01:00
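The shape of the fix, with illustrative field names rather than the actual
ghes_edac ones:

	/* before: no bounds checking on the destination buffer */
	strcpy(dimm->label, label);

	/* after: bounded copy, result is always NUL-terminated */
	strscpy(dimm->label, label, sizeof(dimm->label));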
Chengkaitao 9d3faec60b genirq: Use raw_spinlock_irq() in irq_set_affinity_notifier()
Since irq_set_affinity_notifier() may sleep, it is always called with
interrupts enabled, so raw_spinlock_irqsave() can be replaced with
raw_spinlock_irq().

Signed-off-by: Chengkaitao <chengkaitao@kylinos.cn>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251118012754.61805-1-pilgrimtao@gmail.com
2025-11-18 16:19:40 +01:00
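The change boils down to the following pattern; the locked section shown
is illustrative:

	unsigned long flags;

	/* before: saves and restores the interrupt state */
	raw_spin_lock_irqsave(&desc->lock, flags);
	desc->affinity_notify = notify;
	raw_spin_unlock_irqrestore(&desc->lock, flags);

	/* after: the function may sleep, so interrupts are known to be enabled */
	raw_spin_lock_irq(&desc->lock);
	desc->affinity_notify = notify;
	raw_spin_unlock_irq(&desc->lock);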
Dan Carpenter 80adaccf0e rseq: Delete duplicate if statement in rseq_virt_userspace_exit()
This if statement is indented weirdly.  It's a duplicate and doesn't
affect runtime (beyond wasting a little time).  Delete it.

Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/aRxP3YcwscrP1BU_@stanley.mountain
2025-11-18 15:56:55 +01:00
Christophe Leroy 4322c8f81c lib/strn*,uaccess: Use masked_user_{read/write}_access_begin when required
Properly use masked_user_read_access_begin() and
masked_user_write_access_begin() instead of masked_user_access_begin() in
order to match user_read_access_end() and user_write_access_end().  This is
important for architectures like PowerPC that enable user reads and user
writes separately.

That means masked_user_read_access_begin() is used when user memory is
exclusively read during the window and masked_user_write_access_begin()
is used when user memory is exclusively written during the window.
masked_user_access_begin() remains and is used when both reads and
writes are performed during the open window. Each of them is expected
to be terminated by the matching user_read_access_end(),
user_write_access_end() and user_access_end() respectively.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/cb5e4b0fa49ea9c740570949d5e3544423389757.1763396724.git.christophe.leroy@csgroup.eu
2025-11-18 15:27:35 +01:00
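The intended pairing, sketched for a read-only user access window (the
surrounding code is illustrative, mirroring the strncpy_from_user-style
pattern, not a specific call site):

	if (can_do_masked_user_access())
		src = masked_user_read_access_begin(src);
	else if (!user_read_access_begin(src, sizeof(*src)))
		return -EFAULT;

	unsafe_get_user(val, src, Efault);
	user_read_access_end();
	return 0;

Efault:
	user_read_access_end();
	return -EFAULT;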
Christophe Leroy 1c204914bc scm: Convert put_cmsg() to scoped user access
Replace the open coded implementation with the scoped user access guard.

That also corrects the imbalance between masked_user_access_begin() and
user_write_access_end(), which would affect PowerPC when it gains masked
user access support.

No functional change intended.

[ tglx: Amend change log ]

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/793219313f641eda09a892d06768d2837246bf9f.1763396724.git.christophe.leroy@csgroup.eu
2025-11-18 15:27:34 +01:00
Christophe Leroy 803abedbd5 iov_iter: Add missing speculation barrier to copy_from_user_iter()
The results of "access_ok()" can be mis-speculated.  The result is that
the CPU can end speculatively:

	if (access_ok(from, size))
		// Right here

For the same reason as done in copy_from_user() in commit 74e19ef0ff
("uaccess: Add speculation barrier to copy_from_user()"), add a speculation
barrier to copy_from_user_iter().

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/6b73e69cc7168c89df4eab0a216e3ed4cca36b0a.1763396724.git.christophe.leroy@csgroup.eu
2025-11-18 15:27:34 +01:00
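In essence this mirrors what copy_from_user() already does; a simplified
sketch of the shape of the fix, not the exact iov_iter code:

	if (access_ok(from, size)) {
		/* Prevent speculation past the access_ok() check. */
		barrier_nospec();
		ret = raw_copy_from_user(to, from, size);
	}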
Christophe Leroy 4db1df7a72 iov_iter: Convert copy_from_user_iter() to masked user access
copy_from_user_iter() lacks a speculation barrier; adding one would
degrade performance on some architectures like x86, which would be
unfortunate as copy_from_user_iter() is a critical hotpath function.

Convert copy_from_user_iter() to using masked user access on architectures
that support it. This allows adding the speculation barrier without
impacting performance.

This is similar to what was done for copy_from_user() in commit
0fc810ae3a ("x86/uaccess: Avoid barrier_nospec() in 64-bit
copy_from_user()")

[ tglx: Massage change log ]

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/58e4b07d469ca68a2b9477fe2c1ccc8a44cef131.1763396724.git.christophe.leroy@csgroup.eu
2025-11-18 15:27:34 +01:00
Josh Poimboeuf 2092007aa3 objtool/klp: Only enable --checksum when needed
With CONFIG_KLP_BUILD enabled, checksums are only needed during a
klp-build run.  There's no need to enable them for normal kernel builds.

This also has the benefit of softening the xxhash dependency.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Link: https://patch.msgid.link/edbb1ca215e4926e02edb493b68b9d6d063e902f.1762990139.git.jpoimboe@kernel.org
2025-11-18 09:59:26 +01:00
Josh Poimboeuf ee0b48faba objtool: Set minimum xxhash version to 0.8
XXH3 is only supported starting with xxhash 0.8.  Enforce that.

Fixes: 0d83da43b1 ("objtool/klp: Add --checksum option to generate per-function checksums")
Closes: https://lore.kernel.org/SN6PR02MB41579B83CD295C9FEE40EED6D4FCA@SN6PR02MB4157.namprd02.prod.outlook.com
Reported-by: Michael Kelley <mhklinux@outlook.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Michael Kelley <mhklinux@outlook.com>
Link: https://patch.msgid.link/7227c94692a3a51840278744c7af31b4797c6b96.1762990139.git.jpoimboe@kernel.org
2025-11-18 09:59:25 +01:00
Peter Zijlstra 33cf66d883 sched/fair: Proportional newidle balance
Add a randomized algorithm that runs newidle balancing proportional to
its success rate.

This improves schbench significantly:

 6.18-rc4:			2.22 Mrps/s
 6.18-rc4+revert:		2.04 Mrps/s
 6.18-rc4+revert+random:	2.18 Mrps/s

Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:

 6.17:			-6%
 6.17+revert:		 0%
 6.17+revert+random:	-1%

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
2025-11-17 17:13:16 +01:00
Peter Zijlstra 08d473dd87 sched/fair: Small cleanup to update_newidle_cost()
Simplify code by adding a few variables.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://patch.msgid.link/20251107161739.655208666@infradead.org
2025-11-17 17:13:15 +01:00
Peter Zijlstra e78e70dbf6 sched/fair: Small cleanup to sched_balance_newidle()
Pull out the !sd check to simplify code.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://patch.msgid.link/20251107161739.525916173@infradead.org
2025-11-17 17:13:15 +01:00
Peter Zijlstra d206fbad93 sched/fair: Revert max_newidle_lb_cost bump
Many people reported regressions on their database workloads due to:

  155213a2ae ("sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails")

For instance Adam Li reported a 6% regression on SpecJBB.

Conversely this will regress schbench again; on my machine from 2.22
Mrps/s down to 2.04 Mrps/s.

Reported-by: Joseph Salisbury <joseph.salisbury@oracle.com>
Reported-by: Adam Li <adamli@os.amperecomputing.com>
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reported-by: Hazem Mohamed Abuelfotoh <abuehaze@amazon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://lkml.kernel.org/r/20250626144017.1510594-2-clm@fb.com
Link: https://lkml.kernel.org/r/006c9df2-b691-47f1-82e6-e233c3f91faf@oracle.com
Link: https://patch.msgid.link/20251107161739.406147760@infradead.org
2025-11-17 17:13:15 +01:00
Mel Gorman e837456fdc sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals
Reimplement NEXT_BUDDY preemption to take into account the deadline and
eligibility of the wakee with respect to the waker. In the event
multiple buddies could be considered, the one with the earliest deadline
is selected.

Sync wakeups are treated differently to every other type of wakeup. The
WF_SYNC assumption is that the waker promises to sleep in the very near
future. This is violated in enough cases that WF_SYNC should be treated
as a suggestion instead of a contract. If a waker does go to sleep almost
immediately then the delay in wakeup is negligible. In other cases, it's
throttled based on the accumulated runtime of the waker so there is a
chance that some batched wakeups have been issued before preemption.

For all other wakeups, preemption happens if the wakee has an earlier
deadline than the waker and is eligible to run.

While many workloads were tested, the two main targets were a modified
dbench4 benchmark and hackbench because they are on opposite ends of the
spectrum -- one prefers throughput by avoiding preemption and the other
relies on preemption.

First is the dbench throughput data; even though it is a poor metric, it
is the default one. The test machine is a 2-socket machine and the
backing filesystem is XFS as a lot of the IO work is dispatched to kernel
threads. It's important to note that these results are not representative
across all machines, especially Zen machines, as different bottlenecks
are exposed on different machines and filesystems.

dbench4 Throughput (misleading but traditional)
                            6.18-rc1               6.18-rc1
                             vanilla   sched-preemptnext-v5
Hmean     1       1268.80 (   0.00%)     1269.74 (   0.07%)
Hmean     4       3971.74 (   0.00%)     3950.59 (  -0.53%)
Hmean     7       5548.23 (   0.00%)     5420.08 (  -2.31%)
Hmean     12      7310.86 (   0.00%)     7165.57 (  -1.99%)
Hmean     21      8874.53 (   0.00%)     9149.04 (   3.09%)
Hmean     30      9361.93 (   0.00%)    10530.04 (  12.48%)
Hmean     48      9540.14 (   0.00%)    11820.40 (  23.90%)
Hmean     79      9208.74 (   0.00%)    12193.79 (  32.42%)
Hmean     110     8573.12 (   0.00%)    11933.72 (  39.20%)
Hmean     141     7791.33 (   0.00%)    11273.90 (  44.70%)
Hmean     160     7666.60 (   0.00%)    10768.72 (  40.46%)

As throughput is misleading, the benchmark is modified to use a short
loadfile and report the completion time in milliseconds.

dbench4 Loadfile Execution Time
                             6.18-rc1               6.18-rc1
                              vanilla   sched-preemptnext-v5
Amean      1         14.62 (   0.00%)       14.69 (  -0.46%)
Amean      4         18.76 (   0.00%)       18.85 (  -0.45%)
Amean      7         23.71 (   0.00%)       24.38 (  -2.82%)
Amean      12        31.25 (   0.00%)       31.87 (  -1.97%)
Amean      21        45.12 (   0.00%)       43.69 (   3.16%)
Amean      30        61.07 (   0.00%)       54.33 (  11.03%)
Amean      48        95.91 (   0.00%)       77.22 (  19.49%)
Amean      79       163.38 (   0.00%)      123.08 (  24.66%)
Amean      110      243.91 (   0.00%)      175.11 (  28.21%)
Amean      141      343.47 (   0.00%)      239.10 (  30.39%)
Amean      160      401.15 (   0.00%)      283.73 (  29.27%)
Stddev     1          0.52 (   0.00%)        0.51 (   2.45%)
Stddev     4          1.36 (   0.00%)        1.30 (   4.04%)
Stddev     7          1.88 (   0.00%)        1.87 (   0.72%)
Stddev     12         3.06 (   0.00%)        2.45 (  19.83%)
Stddev     21         5.78 (   0.00%)        3.87 (  33.06%)
Stddev     30         9.85 (   0.00%)        5.25 (  46.76%)
Stddev     48        22.31 (   0.00%)        8.64 (  61.27%)
Stddev     79        35.96 (   0.00%)       18.07 (  49.76%)
Stddev     110       59.04 (   0.00%)       30.93 (  47.61%)
Stddev     141       85.38 (   0.00%)       40.93 (  52.06%)
Stddev     160       96.38 (   0.00%)       39.72 (  58.79%)

That is still looking good and the variance is reduced quite a bit.
Finally, fairness is a concern so the next report tracks how many
milliseconds it takes for all clients to complete a loadfile. This
one is tricky because dbench makes no effort to synchronise clients so
the durations at benchmark start time differ substantially from typical
runtimes. This problem could be mitigated by warming up the benchmark
for a number of minutes but it's a matter of opinion whether that
counts as an evasion of inconvenient results.

dbench4 All Clients Loadfile Execution Time
                             6.18-rc1               6.18-rc1
                              vanilla   sched-preemptnext-v5
Amean      1         15.06 (   0.00%)       15.07 (  -0.03%)
Amean      4        603.81 (   0.00%)      524.29 (  13.17%)
Amean      7        855.32 (   0.00%)     1331.07 ( -55.62%)
Amean      12      1890.02 (   0.00%)     2323.97 ( -22.96%)
Amean      21      3195.23 (   0.00%)     2009.29 (  37.12%)
Amean      30     13919.53 (   0.00%)     4579.44 (  67.10%)
Amean      48     25246.07 (   0.00%)     5705.46 (  77.40%)
Amean      79     29701.84 (   0.00%)    15509.26 (  47.78%)
Amean      110    22803.03 (   0.00%)    23782.08 (  -4.29%)
Amean      141    36356.07 (   0.00%)    25074.20 (  31.03%)
Amean      160    17046.71 (   0.00%)    13247.62 (  22.29%)
Stddev     1          0.47 (   0.00%)        0.49 (  -3.74%)
Stddev     4        395.24 (   0.00%)      254.18 (  35.69%)
Stddev     7        467.24 (   0.00%)      764.42 ( -63.60%)
Stddev     12      1071.43 (   0.00%)     1395.90 ( -30.28%)
Stddev     21      1694.50 (   0.00%)     1204.89 (  28.89%)
Stddev     30      7945.63 (   0.00%)     2552.59 (  67.87%)
Stddev     48     14339.51 (   0.00%)     3227.55 (  77.49%)
Stddev     79     16620.91 (   0.00%)     8422.15 (  49.33%)
Stddev     110    12912.15 (   0.00%)    13560.95 (  -5.02%)
Stddev     141    20700.13 (   0.00%)    14544.51 (  29.74%)
Stddev     160     9079.16 (   0.00%)     7400.69 (  18.49%)

This is more of a mixed bag but it at least shows that fairness
is not crippled.

The hackbench results are more neutral but this is still important.
It's possible to boost the dbench figures by a large amount but only by
crippling the performance of a workload like hackbench. The WF_SYNC
behaviour is important for these workloads and is why the WF_SYNC
changes are not a separate patch.

hackbench-process-pipes
                          6.18-rc1             6.18-rc1
                             vanilla   sched-preemptnext-v5
Amean     1        0.2657 (   0.00%)      0.2150 (  19.07%)
Amean     4        0.6107 (   0.00%)      0.6060 (   0.76%)
Amean     7        0.7923 (   0.00%)      0.7440 (   6.10%)
Amean     12       1.1500 (   0.00%)      1.1263 (   2.06%)
Amean     21       1.7950 (   0.00%)      1.7987 (  -0.20%)
Amean     30       2.3207 (   0.00%)      2.5053 (  -7.96%)
Amean     48       3.5023 (   0.00%)      3.9197 ( -11.92%)
Amean     79       4.8093 (   0.00%)      5.2247 (  -8.64%)
Amean     110      6.1160 (   0.00%)      6.6650 (  -8.98%)
Amean     141      7.4763 (   0.00%)      7.8973 (  -5.63%)
Amean     172      8.9560 (   0.00%)      9.3593 (  -4.50%)
Amean     203     10.4783 (   0.00%)     10.8347 (  -3.40%)
Amean     234     12.4977 (   0.00%)     13.0177 (  -4.16%)
Amean     265     14.7003 (   0.00%)     15.5630 (  -5.87%)
Amean     296     16.1007 (   0.00%)     17.4023 (  -8.08%)

Processes using pipes are impacted but the variance (not presented) indicates
it's close to noise and the results are not always reproducible. If executed
across multiple reboots, it may show neutral or small gains so the worst
measured results are presented.

Hackbench using sockets is more reliably neutral as the wakeup
mechanisms are different between sockets and pipes.

hackbench-process-sockets
                          6.18-rc1             6.18-rc1
                             vanilla   sched-preemptnext-v2
Amean     1        0.3073 (   0.00%)      0.3263 (  -6.18%)
Amean     4        0.7863 (   0.00%)      0.7930 (  -0.85%)
Amean     7        1.3670 (   0.00%)      1.3537 (   0.98%)
Amean     12       2.1337 (   0.00%)      2.1903 (  -2.66%)
Amean     21       3.4683 (   0.00%)      3.4940 (  -0.74%)
Amean     30       4.7247 (   0.00%)      4.8853 (  -3.40%)
Amean     48       7.6097 (   0.00%)      7.8197 (  -2.76%)
Amean     79      14.7957 (   0.00%)     16.1000 (  -8.82%)
Amean     110     21.3413 (   0.00%)     21.9997 (  -3.08%)
Amean     141     29.0503 (   0.00%)     29.0353 (   0.05%)
Amean     172     36.4660 (   0.00%)     36.1433 (   0.88%)
Amean     203     39.7177 (   0.00%)     40.5910 (  -2.20%)
Amean     234     42.1120 (   0.00%)     43.5527 (  -3.42%)
Amean     265     45.7830 (   0.00%)     50.0560 (  -9.33%)
Amean     296     50.7043 (   0.00%)     54.3657 (  -7.22%)

As schbench has been mentioned in numerous bugs recently, the results
are interesting. A test case that represents the default schbench
behaviour is

schbench Wakeup Latency (usec)
                                       6.18.0-rc1             6.18.0-rc1
                                          vanilla   sched-preemptnext-v5
Amean     Wakeup-50th-80          7.17 (   0.00%)        6.00 (  16.28%)
Amean     Wakeup-90th-80         46.56 (   0.00%)       19.78 (  57.52%)
Amean     Wakeup-99th-80        119.61 (   0.00%)       89.94 (  24.80%)
Amean     Wakeup-99.9th-80     3193.78 (   0.00%)      328.22 (  89.72%)

schbench Requests Per Second (ops/sec)
                                  6.18.0-rc1             6.18.0-rc1
                                     vanilla   sched-preemptnext-v5
Hmean     RPS-20th-80     8900.91 (   0.00%)     9176.78 (   3.10%)
Hmean     RPS-50th-80     8987.41 (   0.00%)     9217.89 (   2.56%)
Hmean     RPS-90th-80     9123.73 (   0.00%)     9273.25 (   1.64%)
Hmean     RPS-max-80      9193.50 (   0.00%)     9301.47 (   1.17%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251112122521.1331238-3-mgorman@techsingularity.net
2025-11-17 17:13:15 +01:00
Mel Gorman aceccac58a sched/fair: Enable scheduler feature NEXT_BUDDY
The NEXT_BUDDY feature reinforces wakeup preemption to encourage the last
wakee to be scheduled sooner on the assumption that the waker/wakee share
cache-hot data. In CFS, it was paired with LAST_BUDDY to switch back on
the assumption that the pair of tasks still share data but also relied
on START_DEBIT and the exact WAKEUP_PREEMPTION implementation to get
good results.

NEXT_BUDDY has been disabled since commit 0ec9fab3d1 ("sched: Improve
latencies and throughput") and LAST_BUDDY was removed in commit 5e963f2bd4
("sched/fair: Commit to EEVDF"). The reasoning is not clear but as vruntime
spread is mentioned so the expectation is that NEXT_BUDDY had an impact
on overall fairness. It was not noted why LAST_BUDDY was removed but it
is assumed that it's very difficult to reason what LAST_BUDDY's correct
and effective behaviour should be while still respecting EEVDFs goals.
Peter Zijlstra noted during review;

	I think I was just struggling to make sense of things and figured
	less is more and axed it.

	I have vague memories trying to work through the dynamics of
	a wakeup-stack and the EEVDF latency requirements and getting
	a head-ache.

NEXT_BUDDY is easier to reason about given that it's a point-in-time
decision on the wakee's deadline and eligibility relative to the waker. Enable
NEXT_BUDDY as a preparation patch to document that the decision to ignore
the current implementation is deliberate. While not presented, the results
were at best neutral and often much more variable.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251112122521.1331238-2-mgorman@techsingularity.net
2025-11-17 17:13:15 +01:00
Phil Auld aaab6bb54a sched: Increase sched_tick_remote timeout
Increase the sched_tick_remote WARN_ON timeout to remove false
positives due to temporarily busy HK cpus. The suggestion
was 30 seconds to catch really stuck remote tick processing
but not trigger it too easily.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Phil Auld <pauld@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://patch.msgid.link/20250911161300.437944-1-pauld@redhat.com
2025-11-17 17:13:15 +01:00
Peter Zijlstra 522fb20fbd sched/fair: Have SD_SERIALIZE affect newidle balancing
Also serialize the possibly much more frequent newidle balancing for
the 'expensive' domains that have SD_SERIALIZE set.

Initial benchmarking by K Prateek and Tim showed no negative effect.

Split out from the larger patch moving sched_balance_running around
for ease of bisect and such.

Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Seconded-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/df068896-82f9-458d-8fff-5a2f654e8ffd@amd.com
Link: https://patch.msgid.link/6fed119b723c71552943bfe5798c93851b30a361.1762800251.git.tim.c.chen@linux.intel.com

# Conflicts:
#	kernel/sched/fair.c
2025-11-17 17:13:09 +01:00
Tim Chen 3324b2180c sched/fair: Skip sched_balance_running cmpxchg when balance is not due
The NUMA sched domain sets the SD_SERIALIZE flag by default, allowing
only one NUMA load balancing operation to run system-wide at a time.

Currently, each sched group leader directly under NUMA domain attempts
to acquire the global sched_balance_running flag via cmpxchg() before
checking whether load balancing is due or whether it is the designated
load balancer for that NUMA domain. On systems with a large number
of cores, this causes significant cache contention on the shared
sched_balance_running flag.

This patch reduces unnecessary cmpxchg() operations by first checking
that the balancer is the designated leader for a NUMA domain from
should_we_balance(), and the balance interval has expired before
trying to acquire sched_balance_running to load balance a NUMA
domain.
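
A rough sketch of the reordering (a userspace model with stubbed-out
checks; the real code uses should_we_balance() and the per-domain
balance interval described above):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int sched_balance_running;

    static bool designated_balancer(void) { return true; }   /* stub */
    static bool interval_expired(void)    { return true; }   /* stub */

    void balance_numa_domain(void)
    {
            int expected = 0;

            /* Cheap, mostly-local checks first... */
            if (!designated_balancer() || !interval_expired())
                    return;

            /* ...and only then contend on the shared flag. */
            if (!atomic_compare_exchange_strong(&sched_balance_running,
                                                &expected, 1))
                    return;

            /* load balance the NUMA domain here */

            atomic_store(&sched_balance_running, 0);
    }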

On a 2-socket Granite Rapids system with sub-NUMA clustering enabled,
running an OLTP workload, 7.8% of total CPU cycles were previously spent
in sched_balance_domains() contending on sched_balance_running before
this change.

         : 104              static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
         : 105              {
         : 106              return arch_cmpxchg(&v->counter, old, new);
    0.00 :   ffffffff81326e6c:       xor    %eax,%eax
    0.00 :   ffffffff81326e6e:       mov    $0x1,%ecx
    0.00 :   ffffffff81326e73:       lock cmpxchg %ecx,0x2394195(%rip)        # ffffffff836bb010 <sched_balance_running>
         : 110              sched_balance_domains():
         : 12234            if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
   99.39 :   ffffffff81326e7b:       test   %eax,%eax
    0.00 :   ffffffff81326e7d:       jne    ffffffff81326e99 <sched_balance_domains+0x209>
         : 12238            if (time_after_eq(jiffies, sd->last_balance + interval)) {
    0.00 :   ffffffff81326e7f:       mov    0x14e2b3a(%rip),%rax        # ffffffff828099c0 <jiffies_64>
    0.00 :   ffffffff81326e86:       sub    0x48(%r14),%rax
    0.00 :   ffffffff81326e8a:       cmp    %rdx,%rax

After applying this fix, sched_balance_domains() is gone from the profile
and there is a 5% throughput improvement.

[peterz: made it so that redo retains the 'lock' and split out the
         CPU_NEWLY_IDLE change to a separate patch]
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Reviewed-by: Srikar Dronamraju <srikar@linux.ibm.com>
Tested-by: Mohini Narkhede <mohini.narkhede@intel.com>
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Link: https://patch.msgid.link/6fed119b723c71552943bfe5798c93851b30a361.1762800251.git.tim.c.chen@linux.intel.com
2025-11-17 17:12:00 +01:00
Christian Brauner a71e4f103a
pidfs: simplify PIDFD_GET_<type>_NAMESPACE ioctls
We have reworked namespaces sufficiently that all this special-casing
shouldn't be needed anymore.

Link: https://patch.msgid.link/20251117-eidesstattlich-apotheke-36d2e644079f@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-17 16:23:13 +01:00
Thomas Weißschuh 308bc2e338 selftests/timers/nanosleep: Add tests for return of remaining time
If interrupted by a signal clock_nanosleep() returns the remaining time
into the structure pointed to by the rmtp parameter. So far this
functionality was not tested by the timer selftests.

Extend the nanosleep selftest to cover this feature.
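
For reference, the rmtp semantics being exercised look like this in a
minimal standalone example (not the selftest code itself):

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    static void on_sigint(int sig) { (void)sig; }   /* just interrupt the sleep */

    int main(void)
    {
            struct sigaction sa;
            struct timespec req = { .tv_sec = 5 }, rem = { 0 };
            int ret;

            memset(&sa, 0, sizeof(sa));
            sa.sa_handler = on_sigint;
            sigaction(SIGINT, &sa, NULL);

            /* Relative sleep: when a signal interrupts it, clock_nanosleep()
             * returns EINTR and stores the unslept time via the rmtp pointer. */
            ret = clock_nanosleep(CLOCK_MONOTONIC, 0, &req, &rem);
            if (ret == EINTR)
                    printf("interrupted, %lld.%09ld s remaining\n",
                           (long long)rem.tv_sec, rem.tv_nsec);
            return 0;
    }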

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251106-nanosleep-rtmp-selftest-v1-1-f9212fb295fe@linutronix.de
2025-11-14 20:34:50 +01:00
Wake Liu 05d89fe7e4 selftests/timers: Clean up kernel version check in posix_timers
Several tests in the posix_timers selftest which test timer behavior
related to SIG_IGN fail on kernels older than 6.13. This is due to
a refactoring of signal handling in commit caf77435dd ("signal:
Handle ignored signals in do_sigaction(action != SIG_IGN)").

A previous attempt to fix this by adding a kernel version check to each
of the nine affected tests was suboptimal, as it resulted in emitting
the same skip message nine times.

Following the suggestion from Thomas Gleixner, this is refactored to
perform a single version check in main(). To satisfy the kselftest
framework's requirement for the test count to match the declared plan,
the plan is now conditionally set to 10 (for older kernels) or 19.

While setting the plan conditionally may seem complex, it is the
better approach to avoid the alternatives: either running tests on
unsupported kernels that are known to fail, or emitting a noisy series
of nine identical skip messages. A single informational message is now
printed instead when the tests are skipped.
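
A hedged sketch of the shape of that check (illustrative only; the
helper name and the exact TAP output are not taken from the selftest):

    #include <stdbool.h>
    #include <stdio.h>
    #include <sys/utsname.h>

    static bool kernel_at_least(int major, int minor)
    {
            struct utsname uts;
            int maj = 0, min = 0;

            if (uname(&uts) < 0)
                    return false;
            sscanf(uts.release, "%d.%d", &maj, &min);
            return maj > major || (maj == major && min >= minor);
    }

    int main(void)
    {
            bool sig_ign_ok = kernel_at_least(6, 13);

            /* One plan and one informational message instead of nine skips. */
            printf("1..%d\n", sig_ign_ok ? 19 : 10);
            if (!sig_ign_ok)
                    printf("# SIG_IGN timer tests need kernel >= 6.13, skipping\n");
            /* ... run the tests ... */
            return 0;
    }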

Signed-off-by: Wake Liu <wakel@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250807085042.1690931-1-wakel@google.com/
Link: https://patch.msgid.link/20251103114502.584940-1-wakel@google.com
2025-11-14 20:34:50 +01:00
Jianyun Gao 4518767be9 time: Fix a few typos in time[r] related code comments
Signed-off-by: Jianyun Gao <jianyungao89@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20250927093411.1509275-1-jianyungao89@gmail.com
2025-11-14 20:34:50 +01:00
Sunday Adelodun e54dd0474c time: tick-oneshot: Add missing Return and parameter descriptions to kernel-doc
Several functions in kernel/time/tick-oneshot.c are missing parameter and
return value descriptions in their kernel-doc comments. This causes
warnings during doc generation.

Update the kernel-doc blocks to include detailed @param and Return:
descriptions for better clarity and to fix kernel-doc warnings.  No
functional code changes are made.

Signed-off-by: Sunday Adelodun <adelodunolaoluwa@yahoo.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251106113938.34693-3-adelodunolaoluwa@yahoo.com
2025-11-14 20:17:44 +01:00
Thomas Weißschuh 4702f4eceb hrtimer: Store time as ktime_t in restart block
The hrtimer core uses ktime_t to represent times, use that also for the
restart block. CPU timers internally use nanoseconds instead of ktime_t
but use the same restart block, so use the correct accessors for those.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251110-restart-block-expiration-v1-3-5d39cc93df4f@linutronix.de
2025-11-14 16:31:19 +01:00
Christian Brauner 523ac76880
Merge patch series "Create and use APIs to centralise locking for directory ops."
NeilBrown <neilb@ownmail.net> says:

This series is the next part of my effort to change directory-op
locking to allow multiple concurrent ops in a directory.  Ultimately we
will (in my plan) lock the target dentry(s) rather than the whole
parent directory.

To help with changing the locking protocol, this series centralises
locking and lookup in some helpers.  The various helpers are introduced
and then used in the same patch - roughly one patch per helper though
with various exceptions.

I haven't introduced these helpers into the various filesystems that
Al's tree-in-dcache series is changing.  That series introduces and
uses similar helpers tuned to the specific needs of that set of
filesystems.  Ultimately all the helpers will use the same backends
which can then be adjusted when it is time to change the locking
protocol.

One change that deserves highlighting is in patch 13 where vfs_mkdir()
is changed to unlock the parent on failure, as well as the current
behaviour of dput()ing the dentry on failure.  Once this change is in
place, the final step of both create and remove sequences only
requires the target dentry, not the parent.  So e.g.  end_creating() is
only given the dentry (which may be IS_ERR() after vfs_mkdir()).  This
helps establish the pattern that it is the dentry that is being locked
and unlocked (the lock is currently held on dentry->d_parent->d_inode,
but that can change).

* patches from https://patch.msgid.link/20251113002050.676694-1-neilb@ownmail.net:
  VFS: introduce end_creating_keep()
  VFS: change vfs_mkdir() to unlock on failure.
  ecryptfs: use new start_creating/start_removing APIs
  Add start_renaming_two_dentries()
  VFS/ovl/smb: introduce start_renaming_dentry()
  VFS/nfsd/ovl: introduce start_renaming() and end_renaming()
  VFS: add start_creating_killable() and start_removing_killable()
  VFS: introduce start_removing_dentry()
  smb/server: use end_removing_noperm for the target of smb2_create_link()
  VFS: introduce start_creating_noperm() and start_removing_noperm()
  VFS/nfsd/cachefiles/ovl: introduce start_removing() and end_removing()
  VFS/nfsd/cachefiles/ovl: add start_creating() and end_creating()
  VFS: tidy up do_unlinkat()
  VFS: introduce start_dirop() and end_dirop()
  debugfs: rename end_creating() to debugfs_end_creating()

Link: https://patch.msgid.link/20251113002050.676694-1-neilb@ownmail.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:16:04 +01:00
NeilBrown cf296b294c
VFS: introduce end_creating_keep()
Occasionally the caller of end_creating() wants to keep using the dentry.
Rather than requiring them to dget() the dentry (when not an error)
before calling end_creating(), provide end_creating_keep() which does
this.

cachefiles and overlayfs make use of this.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-16-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:58 +01:00
NeilBrown fe497f0759
VFS: change vfs_mkdir() to unlock on failure.
vfs_mkdir() already drops the reference to the dentry on failure but it
leaves the parent locked.
This complicates end_creating() which needs to unlock the parent even
though the dentry is no longer available.

If we change vfs_mkdir() to unlock on failure as well as releasing the
dentry, we can remove the "parent" arg from end_creating() and simplify
the rules for calling it.

Note that cachefiles_get_directory() can choose to substitute an error
instead of actually calling vfs_mkdir(), for fault injection.  In that
case it needs to call end_creating(), just as vfs_mkdir() now does on
error.

ovl_create_real() will now unlock on error.  So the conditional
end_creating() after the call is removed, and end_creating() is called
internally on error.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-15-neilb@ownmail.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:58 +01:00
NeilBrown f046fbb4d8
ecryptfs: use new start_creating/start_removing APIs
This requires the addition of start_creating_dentry() which is given the
dentry which has already been found, and asks for it to be locked and
its parent validated.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-14-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:58 +01:00
NeilBrown 833d2b3a07
Add start_renaming_two_dentries()
A few callers want to lock for a rename and already have both dentries.
Also debugfs does want to perform a lookup but doesn't want permission
checking, so start_renaming_dentry() cannot be used.

This patch introduces start_renaming_two_dentries() which is given both
dentries.  debugfs performs one lookup itself.  As it will only continue
with a negative dentry and as those cannot be renamed or unlinked, it is
safe to do the lookup before getting the rename locks.

overlayfs uses start_renaming_two_dentries() in three places and  selinux
uses it twice in sel_make_policy_nodes().

In sel_make_policy_nodes() we now lock for rename twice instead of just
once so the combined operation is no longer atomic w.r.t the parent
directory locks.  As selinux_state.policy_mutex is held across the whole
operation this does not open up any interesting races.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-13-neilb@ownmail.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:58 +01:00
NeilBrown ac50950ca1
VFS/ovl/smb: introduce start_renaming_dentry()
Several callers perform a rename on a dentry they already have, and only
require lookup for the target name.  This includes smb/server and a few
different places in overlayfs.

start_renaming_dentry() performs the required lookup and takes the
required lock using lock_rename_child()

It is used in three places in overlayfs and in ksmbd_vfs_rename().

In the ksmbd case, the parent of the source is not important - the
source must be renamed from wherever it is.  So start_renaming_dentry()
allows rd->old_parent to be NULL and only checks it if it is non-NULL.
On success rd->old_parent will be the parent of old_dentry with an extra
reference taken.  Other start_renaming functions also now take the extra
reference and end_renaming() now drops this reference as well.

ovl_lookup_temp(), ovl_parent_lock(), and ovl_parent_unlock() are
all removed as they are no longer needed.

OVL_TEMPNAME_SIZE and ovl_tempname() are now declared in overlayfs.h so
that ovl_check_rename_whiteout() can access them.

ovl_copy_up_workdir() now always cleans up on error.

Reviewed-by: Namjae Jeon <linkinjeon@kernel.org>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-12-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:57 +01:00
NeilBrown 5c87527299
VFS/nfsd/ovl: introduce start_renaming() and end_renaming()
start_renaming() combines name lookup and locking to prepare for rename.
It is used when two names need to be looked up as in nfsd and overlayfs -
cases where one or both dentries are already available will be handled
separately.

__start_renaming() avoids the inode_permission check and hash
calculation and is suitable after filename_parentat() in do_renameat2().
It subsumes quite a bit of code from that function.

start_renaming() does calculate the hash and check X permission and is
suitable elsewhere:
- nfsd_rename()
- ovl_rename()

In ovl, ovl_do_rename_rd() is factored out of ovl_do_rename(), which
itself will be gone by the end of the series.

Acked-by: Chuck Lever <chuck.lever@oracle.com> (for nfsd parts)
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: NeilBrown <neil@brown.name>

--
Changes since v3:
 - added missing dput() in ovl_rename when "whiteout" is non-NULL.

Changes since v2:
 - in __start_renaming() some labels have been renamed, and err
   is always set before a "goto out_foo" rather than passing the
   error in a dentry*.
 - ovl_do_rename() changed to call the new ovl_do_rename_rd() rather
   than keeping duplicate code
 - code around ovl_cleanup() call in ovl_rename() restructured.

Link: https://patch.msgid.link/20251113002050.676694-11-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:57 +01:00
NeilBrown ff7c4ea11a
VFS: add start_creating_killable() and start_removing_killable()
These are similar to start_creating() and start_removing(), but allow a
fatal signal to abort waiting for the lock.

They are used in btrfs for subvol creation and removal.

btrfs_may_create() no longer needs IS_DEADDIR() and
start_creating_killable() includes that check.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-10-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:57 +01:00
NeilBrown 7bb1eb45e4
VFS: introduce start_removing_dentry()
start_removing_dentry() is similar to start_removing() but instead of
providing a name for lookup, the target dentry is given.

start_removing_dentry() checks that the dentry is still hashed and in
the parent, and if so it locks and increases the refcount so that
end_removing() can be used to finish the operation.

This is used in cachefiles, overlayfs, smb/server, and apparmor.

There will be other users including ecryptfs.

As start_removing_dentry() takes an extra reference to the dentry (to be
put by end_removing()), there is no need to explicitly take an extra
reference to stop d_delete() from using dentry_unlink_inode() to negate
the dentry - as in cachefiles_delete_object(), and ksmbd_vfs_unlink().

cachefiles_bury_object() now gets an extra ref to the victim, which it
drops.  As it includes the needed end_removing() calls, the caller
doesn't need them.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Namjae Jeon <linkinjeon@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-9-neilb@ownmail.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:57 +01:00
NeilBrown 1ead2213dd
smb/server: use end_removing_noperm for the target of smb2_create_link()
Sometimes smb2_create_link() needs to remove the target before creating
the link.
It uses ksmbd_vfs_kern_locked(), and is the only user of that interface.

To match the new naming, that function is changed to
ksmbd_vfs_kern_start_removing(), and related functions or flags are also
renamed.

The lock actually happens in ksmbd_vfs_path_lookup() and that is changed
to use start_removing_noperm() - permission to perform lookup in the
parent was already checked in vfs_path_parent_lookup().

Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-8-neilb@ownmail.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:56 +01:00
NeilBrown c9ba789dad
VFS: introduce start_creating_noperm() and start_removing_noperm()
xfs, fuse, ipc/mqueue need variants of start_creating or start_removing
which do not check permissions.
This patch adds _noperm versions of these functions.

Note that do_mq_open() was only calling mntget() so it could call
path_put() - it didn't really need an extra reference on the mnt.
Now it doesn't call mntget() and uses end_creating() which does
the dput() half of path_put().

Also mq_unlink() previously passed
   d_inode(dentry->d_parent)
as the dir inode to vfs_unlink().  This is after locking
   d_inode(mnt->mnt_root)
These two inodes are the same, but normally calls use the textual
parent.
So I've changed the vfs_unlink() call to be given d_inode(mnt->mnt_root).

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>

--
changes since v2:
 - dir arg passed to vfs_unlink() in mq_unlink() changed to match
   the dir passed to lookup_noperm()
 - restore assignment to path->mnt even though the mntget() is removed.

Link: https://patch.msgid.link/20251113002050.676694-7-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:56 +01:00
NeilBrown bd6ede8a06
VFS/nfsd/cachefiles/ovl: introduce start_removing() and end_removing()
start_removing() is similar to start_creating() but will only return a
positive dentry with the expectation that it will be removed.  This is
used by nfsd, cachefiles, and overlayfs.  They are changed to also use
end_removing() to terminate the action begun by start_removing().  This
is a simple alias for end_dirop().

Apart from changes to the error paths, as we no longer need to unlock on
a lookup error, an effect on callers is that they don't need to test if
the found dentry is positive or negative - they can be sure it is
positive.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-6-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:56 +01:00
NeilBrown 7ab96df840
VFS/nfsd/cachefiles/ovl: add start_creating() and end_creating()
start_creating() is similar to simple_start_creating() but is not so
simple.
It takes a qstr for the name, includes permission checking, and does NOT
report an error if the name already exists, returning a positive dentry
instead.

This is currently used by nfsd, cachefiles, and overlayfs.

end_creating() is called after the dentry has been used.
end_creating() drops the reference to the dentry as it is generally no
longer needed.  This is exactly the first section of end_creating_path()
so that function is changed to call the new end_creating()

These calls help encapsulate locking rules so that directory locking can
be changed.

Occasionally this change means that the parent lock is held for a
shorter period of time, for example in cachefiles_commit_tmpfile().
As this function now unlocks after an unlink and before the following
lookup, it is possible that the lookup could again find a positive
dentry, so a while loop is introduced there.

In overlayfs the ovl_lookup_temp() function has ovl_tempname()
split out to be used in ovl_start_creating_temp().  The other use
of ovl_lookup_temp() is preparing for a rename.  When rename handling
is updated, ovl_lookup_temp() will be removed.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-5-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:56 +01:00
NeilBrown 3661a78874
VFS: tidy up do_unlinkat()
The simplification of locking in the previous patch opens up some room
for tidying up do_unlinkat()

- change all "exit" labels to describe what will happen at the label.
- always goto an exit label on an error - unwrap the "if (!IS_ERR())" branch.
- Move the "slashes" handing inline, but mark it as unlikely()
- simplify use of the "inode" variable - we no longer need to test for NULL.

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-4-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:56 +01:00
NeilBrown 4037d966f0
VFS: introduce start_dirop() and end_dirop()
The fact that directory operations (create,remove,rename) are protected
by a lock on the parent is known widely throughout the kernel.
In order to change this - to instead lock the target dentry  - it is
best to centralise this knowledge so it can be changed in one place.

This patch introduces start_dirop() which is local to VFS code.
It performs the required locking for create and remove.  Rename
will be handled separately.

Various functions with names like start_creating() or start_removing_path(),
some of which already exist, will export this functionality beyond the VFS.

end_dirop() is the partner of start_dirop().  It drops the lock and
releases the reference on the dentry.
It *is* exported so that various end_creating etc functions can be inline.

As vfs_mkdir() drops the dentry on error we cannot use end_dirop() as
that won't unlock when the dentry IS_ERR().  For now we need an explicit
unlock when dentry IS_ERR().  I hope to change vfs_mkdir() to unlock
when it drops a dentry so that explicit unlock can go away.

end_dirop() can always be called on the result of start_dirop(), but not
after vfs_mkdir().  After a vfs_mkdir() we still may need the explicit
unlock as seen in end_creating_path().

As well as adding start_dirop() and end_dirop()
this patch uses them in:
 - simple_start_creating (which requires sharing lookup_noperm_common()
        with libfs.c)
 - start_removing_path / start_removing_user_path_at
 - filename_create / end_creating_path()
 - do_rmdir(), do_unlinkat()

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-3-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:56 +01:00
NeilBrown 8b45b9a882
debugfs: rename end_creating() to debugfs_end_creating()
By not using the generic end_creating() name here we are free to use it
more globally for a more generic function.
This should have been done when start_creating() was renamed.

For consistency, also rename failed_creating().

Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://patch.msgid.link/20251113002050.676694-2-neilb@ownmail.net
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:15:55 +01:00
Kriish Sharma cc7d6c65b8
nstree: fix kernel-doc comments for internal functions
Documentation build reported:

  Warning: kernel/nstree.c:325 function parameter 'ns_tree' not described in '__ns_tree_adjoined_rcu'
  Warning: kernel/nstree.c:325 expecting prototype for ns_tree_adjoined_rcu(). Prototype was for __ns_tree_adjoined_rcu() instead
  Warning: kernel/nstree.c:353 expecting prototype for ns_tree_gen_id(). Prototype was for __ns_tree_gen_id() instead

The kernel-doc comments for `__ns_tree_adjoined_rcu()` and
`__ns_tree_gen_id()` had mismatched function names and a missing
parameter description. This patch updates the function names in the
kernel-doc headers and adds the missing `@ns_tree` parameter description
for `__ns_tree_adjoined_rcu()`.

Fixes: 885fc8ac0a ("nstree: make iterator generic")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511061542.0LO7xKs8-lkp@intel.com
Signed-off-by: Kriish Sharma <kriish.sharma2006@gmail.com>
Link: https://patch.msgid.link/20251111112533.2254432-1-kriish.sharma2006@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:10:38 +01:00
Christian Brauner cefd55bd21
nsproxy: fix free_nsproxy() and simplify create_new_namespaces()
Make it possible to handle NULL being passed to the reference count
helpers instead of forcing the caller to handle this. Afterwards we can
nicely allow a cleanup guard to handle nsproxy freeing.

Active reference count handling is not done in nsproxy_free() but rather
in free_nsproxy() as nsproxy_free() is also called from setns() failure
paths where a new nsproxy has been prepared but has not been marked as
active via switch_task_namespaces().

Link: https://lore.kernel.org/690bfb9e.050a0220.2e3c35.0013.GAE@google.com
Link: https://patch.msgid.link/20251111-sakralbau-guthaben-7dcc277d337f@brauner
Fixes: 3c9820d5c64a ("ns: add active reference count")
Reported-by: syzbot+0b2e79f91ff6579bfa5b@syzkaller.appspotmail.com
Reported-by: syzbot+0a8655a80e189278487e@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-14 13:10:38 +01:00
Mateusz Guzik 030e86dfda
fs: touch up predicts in path lookup
Rationale:
- ND_ROOT_PRESET is only set in a condition already marked unlikely
- LOOKUP_IS_SCOPED already has unlikely on it, but inconsistently
  applied
- set_root() only fails if there is a bug
- most names are not empty (see !*s)
- most of the time path_init() does not encounter LOOKUP_CACHED without
  LOOKUP_RCU
- LOOKUP_IN_ROOT is a rarely seen flag

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251105150630.756606-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-13 14:22:25 +01:00
Xianwei Zhao fc584d871c irqchip/meson-gpio: Add support for Amlogic S6 S7 and S7D SoCs
The Amlogic S6/S7/S7D SoCs support GPIO interrupt lines:

    S6 IRQ Number:
    - 99:98    2 pins on bank CC
    - 97       1 pin  on bank TESTN
    - 96:81   16 pins on bank A
    - 80:65   16 pins on bank Z
    - 64:45   20 pins on bank X
    - 44:37    8 pins on bank H offs H1
    - 36:32    5 pins on bank F
    - 31:25    7 pins on bank D
    - 24:22    3 pins on bank E
    - 21:14    8 pins on bank C
    - 13:0    14 pins on bank B

    S7 IRQ Number:
    - 83:82    2 pins on bank CC
    - 81       1 pin  on bank TESTN
    - 80:68   13 pins on bank Z
    - 67:48   20 pins on bank X
    - 47:36   12 pins on bank H
    - 35:24   12 pins on bank D
    - 23:22    2 pins on bank E
    - 21:14    8 pins on bank C
    - 13:0    14 pins on bank B

    S7D IRQ Number:
    - 83:82    2 pins on bank CC
    - 81:75    7 pins on bank DV
    - 74       1 pin  on bank TESTN
    - 73:61   13 pins on bank Z
    - 60:41   20 pins on bank X
    - 40:29   12 pins on bank H
    - 28:24    5 pins on bank D
    - 23:22    2 pins on bank E
    - 21:14    8 pins on bank C
    - 13:0    14 pins on bank B

Add the required compatibles and interrupt count initializers.

Signed-off-by: Xianwei Zhao <xianwei.zhao@amlogic.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://patch.msgid.link/20251105-irqchip-gpio-s6-s7-s7d-v1-2-b4d1fe4781c1@amlogic.com
2025-11-13 14:04:16 +01:00
Xianwei Zhao e4ca152008 dt-bindings: interrupt-controller: Add support for Amlogic S6 S7 and S7D SoCs
Update the device tree binding document for GPIO interrupt controller of
Amlogic S6 S7 and S7D SoCs.

Signed-off-by: Xianwei Zhao <xianwei.zhao@amlogic.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://patch.msgid.link/20251105-irqchip-gpio-s6-s7-s7d-v1-1-b4d1fe4781c1@amlogic.com
2025-11-13 14:04:16 +01:00
Josh Poimboeuf 9c7dc1dd89 objtool: Warn on functions with ambiguous -ffunction-sections section names
When compiled with -ffunction-sections, a function named startup() will
be placed in .text.startup.  However, .text.startup is also used by the
compiler for functions with __attribute__((constructor)).

That creates an ambiguity for the vmlinux linker script, which needs to
differentiate those two cases.

Similar naming conflicts exist for functions named exit(), split(),
unlikely(), hot() and unknown().
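
A minimal illustration of the collision (compile with
gcc -O2 -ffunction-sections -c and list the sections with readelf -S):

    /* With -ffunction-sections this lands in a section named .text.startup... */
    void startup(void)
    {
    }

    /* ...which, per the description above, is also where the compiler places
     * code for functions marked __attribute__((constructor)). */
    __attribute__((constructor))
    static void init_at_load(void)
    {
    }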

One potential solution would be to use '#ifdef CC_USING_FUNCTION_SECTIONS'
to create two distinct implementations of the TEXT_MAIN macro.  However,
-ffunction-sections can be (and is) enabled or disabled on a per-object
basis (for example via ccflags-y or AUTOFDO_PROFILE).

So the recently unified TEXT_MAIN macro (commit 1ba9f89794
("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros")) is
necessary.  This means there's no way for the linker script to
disambiguate things.

Instead, use objtool to warn on any function names whose resulting
section names might create ambiguity when the kernel is compiled (in
whole or in part) with -ffunction-sections.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: live-patching@vger.kernel.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/65fedea974fe14be487c8867a0b8d0e4a294ce1e.1762991150.git.jpoimboe@kernel.org
2025-11-13 08:03:10 +01:00
Josh Poimboeuf 0330b7fbbf drivers/xen/xenbus: Fix namespace collision and split() section placement with AutoFDO
When compiling the kernel with -ffunction-sections enabled, the split()
function gets compiled into the .text.split section.  In some cases it
can even be cloned into .text.split.constprop.0 or .text.split.isra.0.

However, .text.split.* is already reserved for use by the Clang
-fsplit-machine-functions flag, which is used by AutoFDO.  That may
place part of a function's code in a .text.split.<func> section.

This naming conflict causes the vmlinux linker script to wrongly place
split() with other .text.split.* code, rather than where it belongs with
regular text.

Fix it by renaming split() to split_strings().

Fixes: 6568f14cb5 ("vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: live-patching@vger.kernel.org
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/92a194234a0f757765e275b288bb1a7236c2c35c.1762991150.git.jpoimboe@kernel.org
2025-11-13 08:03:10 +01:00
Josh Poimboeuf 56255fa968 media: atomisp: Fix namespace collision and startup() section placement with -ffunction-sections
When compiling the kernel with -ffunction-sections (e.g., for LTO,
livepatch, dead code elimination, AutoFDO, or Propeller), the startup()
function gets compiled into the .text.startup section.  In some cases it
can even be cloned into .text.startup.constprop.0 or
.text.startup.isra.0.

However, the .text.startup and .text.startup.* section names are already
reserved for use by the compiler for __attribute__((constructor)) code.

This naming conflict causes the vmlinux linker script to wrongly place
startup() function code in .init.text, which gets freed during boot.

Fix that by renaming startup() to ov2722_startup().

Fixes: 6568f14cb5 ("vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: live-patching@vger.kernel.org
Cc: Hans de Goede <hansg@kernel.org>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/bf8cd823a3f11f64cc82167913be5013c72afa57.1762991150.git.jpoimboe@kernel.org
2025-11-13 08:03:09 +01:00
Josh Poimboeuf f6a8919d61 vmlinux.lds: Fix TEXT_MAIN to include .text.start and friends
Since:

  6568f14cb5 ("vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN")

the TEXT_MAIN macro uses a series of patterns to prevent the
.text.startup[.*] and .text.exit[.*] sections from getting
linked into the vmlinux runtime .text.

That commit is a tad too aggressive: it also inadvertently filters out
valid runtime text sections like .text.start and
.text.start.constprop.0, which can be generated for a function named
start() when -ffunction-sections is enabled.

As a result, those sections become orphans when building with
CONFIG_LD_DEAD_CODE_DATA_ELIMINATION for arm:

  arm-linux-gnueabi-ld: warning: orphan section `.text.start.constprop.0' from `drivers/usb/host/sl811-hcd.o' being placed in section `.text.start.constprop.0'
  arm-linux-gnueabi-ld: warning: orphan section `.text.start.constprop.0' from `drivers/media/dvb-frontends/drxk_hard.o' being placed in section `.text.start.constprop.0'
  arm-linux-gnueabi-ld: warning: orphan section `.text.start' from `drivers/media/dvb-frontends/stv0910.o' being placed in section `.text.start'
  arm-linux-gnueabi-ld: warning: orphan section `.text.start.constprop.0' from `drivers/media/pci/ddbridge/ddbridge-sx8.o' being placed in section `.text.start.constprop.0'

Fix that by explicitly adding the partial "substring" sections (.text.s,
.text.st, .text.sta, etc) and their cloned derivatives.

While this unfortunately means that TEXT_MAIN continues to grow,
these changes are ultimately necessary for proper support of
-ffunction-sections.

Fixes: 6568f14cb5 ("vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: live-patching@vger.kernel.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/cd588144e63df901a656b06b566855019c4a931d.1762991150.git.jpoimboe@kernel.org
Closes: https://lore.kernel.org/oe-kbuild-all/202511040812.DFGedJiy-lkp@intel.com/
2025-11-13 08:03:09 +01:00
Ingo Molnar d851f2b2b2 Linux 6.18-rc5
-----BEGIN PGP SIGNATURE-----
 
 iQFSBAABCgA8FiEEq68RxlopcLEwq+PEeb4+QwBBGIYFAmkRH1seHHRvcnZhbGRz
 QGxpbnV4LWZvdW5kYXRpb24ub3JnAAoJEHm+PkMAQRiGUCgH/j+fMbEg618ajVS2
 SWdAXZKEDVtCqN6bq9VT3g3rwk/zNgvppjMdCBqyXFpjvkGGIxlZnNgiTVuTLzvR
 cjl0c5C1a38lJ+DzmLjTF1TJ3t0CcA/8l2iWKu3Dm1ch2yuxm5ZcM2b9ujBholf7
 pYd7jZ7JhVm5eXD7U5X03AkZPUWAIx/Nip37cO7RLGzlkRSGLB7OXq3TB2u4e2ti
 gDpP4O+cgOqSuS71Hz0/8T6KIVQ9IZ/qzANWAYeHZD2DQwI3OZXI1WRnc1iw401o
 QaMaV21NirKwAANKetvbj7FgtmpdfQs/7FA+yR7YW2ARTpkc1EXrxgMZ6NuphGKE
 kYQo55g=
 =QaZ2
 -----END PGP SIGNATURE-----

Merge tag 'v6.18-rc5' into objtool/core, to pick up fixes

Signed-off-by: Ingo Molnar <mingo@kernel.org>
2025-11-13 07:58:43 +01:00
Boqun Feng f74cf399e0 rust: debugfs: Replace the usage of Rust native atomics
Rust native atomics are not allowed to be used in the kernel due to the
mismatch between their memory model and the Linux kernel memory model,
hence remove the usage of Rust native atomics in debugfs.

Reviewed-by: Matthew Maurer <mmaurer@google.com>
Acked-by: Danilo Krummrich <dakr@kernel.org>
Tested-by: David Gow <davidgow@google.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://patch.msgid.link/20251022035324.70785-4-boqun.feng@gmail.com
2025-11-12 08:56:42 -08:00
Boqun Feng 013f912eb5 rust: sync: atomic: Implement Debug for Atomic<Debug>
If `Atomic<T>` is `Debug` then it's a `debugfs::Writer`, therefore make
it so since 1) debugfs needs to support `Atomic<T>` and 2) it's rather
trivial to implement `Debug` for `Atomic<Debug>`.

Tested-by: David Gow <davidgow@google.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://patch.msgid.link/20251022035324.70785-3-boqun.feng@gmail.com
2025-11-12 08:56:41 -08:00
Boqun Feng 14e9a18b07 rust: sync: atomic: Make Atomic*Ops pub(crate)
In order to write code over a generic Atomic<T> we need to make
Atomic*Ops public so that functions like `.load()` and `.store()` are
available. Make these pub(crate) to begin with so the usage in the kernel
crate is supported.

Tested-by: David Gow <davidgow@google.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://patch.msgid.link/20251022035324.70785-2-boqun.feng@gmail.com
2025-11-12 08:56:38 -08:00
Mateusz Guzik 9eda581bfe
fs: move fd_install() slowpath into a dedicated routine and provide commentary
On stock kernel gcc 14 emits avoidable register spillage:
	endbr64
	call   ffffffff81374630 <__fentry__>
	push   %r13
	push   %r12
	push   %rbx
	sub    $0x8,%rsp
	[snip]

Total fast path is 99 bytes.

Moving the slowpath out avoids it and shortens the fast path to 74
bytes.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251110095634.1433061-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 12:19:09 +01:00
Mateusz Guzik 21b561dab1
fs: hide dentry_cache behind runtime const machinery
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251105153622.758836-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 12:19:09 +01:00
Mateusz Guzik e41c1f4291
fs: touch predicts in do_dentry_open()
Helps out some of the asm, the routine is still a mess.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251109125254.1288882-1-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 12:19:09 +01:00
Baokun Li 50b2a4f19b
bdev: add hint prints in sb_set_blocksize() for LBS dependency on THP
Support for block sizes greater than the page size depends on large
folios, which in turn require CONFIG_TRANSPARENT_HUGEPAGE to be enabled.

Because the code is wrapped in multiple layers of abstraction, this
dependency is rather obscure, so users may not realize it and may be
unsure how to enable LBS.

As suggested by Theodore, I have added hint messages in sb_set_blocksize
so that users can distinguish whether a mount failure with block size
larger than page size is due to lack of filesystem support or the absence
of CONFIG_TRANSPARENT_HUGEPAGE.
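
A hedged sketch of the kind of hint being added (pr_warn() and IS_ENABLED()
are the usual kernel helpers, but the placement and message wording here
are illustrative, not the exact diff):

    if (size > PAGE_SIZE && !IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
            pr_warn("%s: block size %d > page size %lu needs CONFIG_TRANSPARENT_HUGEPAGE\n",
                    sb->s_id, size, PAGE_SIZE);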

Suggested-by: Theodore Ts'o <tytso@mit.edu>
Link: https://patch.msgid.link/20251110043226.GD2988753@mit.edu
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Link: https://patch.msgid.link/20251110124714.1329978-1-libaokun@huaweicloud.com
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 12:19:09 +01:00
Christian Brauner 04f0955b60
Merge patch series "cheaper MAY_EXEC handling for path lookup"
Mateusz Guzik <mjguzik@gmail.com> says:

In short, MAY_WRITE checks are elided.

This obsoletes the idea of pre-computing if perm checks are necessary as
that turned out to be too hairy. The new code has 2 more branches per
path component compared to that idea, but the perf difference for
typical paths (< 6 components) was basically within noise. To be
revisited if someone(tm) removes other slowdowns.

Instead of the pre-computing thing I added IOP_FASTPERM_MAY_EXEC so that
filesystems like btrfs can still avoid the hard work.

* patches from https://patch.msgid.link/20251107142149.989998-1-mjguzik@gmail.com:
  fs: retire now stale MAY_WRITE predicts in inode_permission()
  btrfs: utilize IOP_FASTPERM_MAY_EXEC
  fs: speed up path lookup with cheaper handling of MAY_EXEC

Link: https://patch.msgid.link/20251107142149.989998-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 12:19:08 +01:00
Mateusz Guzik a0a28c4e41
fs: retire now stale MAY_WRITE predicts in inode_permission()
The primary non-MAY_WRITE consumer now uses lookup_inode_permission_may_exec().

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251107142149.989998-4-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 12:19:08 +01:00
Mateusz Guzik 3e18f6256e
btrfs: utilize IOP_FASTPERM_MAY_EXEC
Root filesystem was ext4, btrfs was mounted on /testfs.

Then issuing access(2) in a loop on /testfs/repos/linux/include/linux/fs.h
on Sapphire Rapids (ops/s):

before: 3447976
after:	3620879 (+5%)

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251107142149.989998-3-mjguzik@gmail.com
Acked-by: David Sterba <dsterba@suse.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 12:19:08 +01:00
Mateusz Guzik e631df89cd
fs: speed up path lookup with cheaper handling of MAY_EXEC
The generic inode_permission() routine does work which is known to be of
no significance for lookup. There are checks for MAY_WRITE, while the
requested permission is MAY_EXEC. Additionally devcgroup_inode_permission()
is called to check for devices, but it is an invariant the inode is a
directory.

Absent a ->permission func, execution lands in generic_permission()
which checks upfront if the requested permission is granted for
everyone.

We can elide the branches which are guaranteed to be false and cut
straight to the check if everyone happens to be allowed MAY_EXEC on the
inode (which holds true most of the time).
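
A hedged model of that fast check (standalone C, not the kernel code;
generic_permission() does considerably more when this check fails):

    #include <stdbool.h>
    #include <sys/stat.h>

    /* If the 'other' class is already allowed to execute/search, MAY_EXEC
     * can be granted without inspecting owner, group or ACLs. */
    bool everyone_may_exec(mode_t i_mode)
    {
            return (i_mode & S_IXOTH) != 0;
    }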

Moreover, filesystems which provide their own ->permission routine can
take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC
flag on their inodes, which they can legitimately do if their MAY_EXEC
handling matches generic_permission().

As a simple benchmark, as part of compilation gcc issues access(2) on
numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o

Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s):
before: 3797556
after:  3987789 (+5%)

Note: this depends on the not-yet-landed ext4 patch to mark inodes with
cache_no_acl()

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251107142149.989998-2-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 12:19:08 +01:00
Rasmus Villemoes 854e8df2ce
fs/pipe: stop duplicating union pipe_index declaration
Now that we build with -fms-extensions, union pipe_index can be
included as an anonymous member in struct pipe_inode_info, avoiding
the duplication.
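
A minimal illustration of the -fms-extensions pattern (field names are
simplified stand-ins, not the exact kernel definitions):

    typedef unsigned int pipe_index_t;

    union pipe_index {
            unsigned long head_tail;
            struct {
                    pipe_index_t head;
                    pipe_index_t tail;
            };
    };

    struct pipe_info {
            union pipe_index;       /* anonymous member; needs -fms-extensions */
            unsigned int ring_size;
    };

    unsigned long snapshot(struct pipe_info *p)
    {
            /* head, tail and head_tail are all reachable directly on p */
            return p->head_tail;
    }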

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Link: https://patch.msgid.link/20251023082142.2104456-1-linux@rasmusvillemoes.dk
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 12:18:56 +01:00
Ingo Molnar 9929dffce5 perf/x86/intel: Fix and clean up intel_pmu_drain_arch_pebs() type use
The following commit introduced a build failure on x86-32:

  21954c8a0ff ("perf/x86/intel: Process arch-PEBS records or record fragments")

  ...

  arch/x86/events/intel/ds.c:2983:24: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast]

The forced type conversions to 'u64' and 'void *' are not 32-bit clean,
but they are also entirely unnecessary: ->pebs_vaddr is 'void *' already,
and integer-compatible pointer arithmetic will work just fine on it.

Fix & simplify the code.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Fixes: d21954c8a0 ("perf/x86/intel: Process arch-PEBS records or record fragments")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Link: https://patch.msgid.link/20251029102136.61364-10-dapeng1.mi@linux.intel.com
2025-11-12 12:12:28 +01:00
Joanne Koong a298febc47
iomap: simplify when reads can be skipped for writes
Currently, the logic for skipping the read range for a write is

if (!(iter->flags & IOMAP_UNSHARE) &&
    (from <= poff || from >= poff + plen) &&
    (to <= poff || to >= poff + plen))

which breaks down to skipping the read if any of these are true:
a) from <= poff && to <= poff
b) from <= poff && to >= poff + plen
c) from >= poff + plen && to <= poff
d) from >= poff + plen && to >= poff + plen

This can be simplified to
if (!(iter->flags & IOMAP_UNSHARE) && from <= poff && to >= poff + plen)

from the following reasoning:

a) from <= poff && to <= poff
This reduces to 'to <= poff' since it is guaranteed that 'from <= to'
(since to = from + len). It is not possible for 'to <= poff' to be true
here because we only reach here if plen > 0 (thanks to the preceding 'if
(plen == 0)' check that would break us out of the loop). If 'to <=
poff', plen would have to be 0 since poff and plen get adjusted in
lockstep for uptodate blocks. This means we can eliminate this check.

c) from >= poff + plen && to <= poff
This is not possible since 'from <= to' and 'plen > 0'. We can eliminate
this check.

d) from >= poff + plen && to >= poff + plen
This reduces to 'from >= poff + plen' since 'from <= to'.
It is not possible for 'from >= poff + plen' to be true here. We only
reach here if plen > 0 and for writes, poff and plen will always be
block-aligned, which means poff <= from < poff + plen. We can eliminate
this check.

The only valid check is b) from <= poff && to >= poff + plen.
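
Expressed as a plain C sketch (not the iomap code itself, just the two
conditions side by side, with IOMAP_UNSHARE modelled as a bool):

  #include <stdbool.h>
  #include <stdio.h>

  static bool skip_read_old(unsigned from, unsigned to, unsigned poff,
                            unsigned plen, bool unshare)
  {
          return !unshare &&
                 (from <= poff || from >= poff + plen) &&
                 (to <= poff || to >= poff + plen);
  }

  static bool skip_read_new(unsigned from, unsigned to, unsigned poff,
                            unsigned plen, bool unshare)
  {
          return !unshare && from <= poff && to >= poff + plen;
  }

  int main(void)
  {
          /* a write fully covering the range [poff, poff + plen) */
          unsigned from = 0, to = 4096, poff = 0, plen = 4096;

          printf("old: %d new: %d\n",
                 skip_read_old(from, to, poff, plen, false),
                 skip_read_new(from, to, poff, plen, false));
          return 0;
  }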

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-7-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 10:50:32 +01:00
Joanne Koong f8eaf79406
iomap: simplify ->read_folio_range() error handling for reads
Instead of requiring that the caller calls iomap_finish_folio_read()
even if the ->read_folio_range() callback returns an error, account for
this internally in iomap instead, which makes the interface simpler and
makes it match writeback's ->read_folio_range() error handling
expectations.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-6-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 10:50:32 +01:00
Joanne Koong 6b1fd2281f
iomap: optimize pending async writeback accounting
Pending writebacks must be accounted for to determine when all requests
have completed and writeback on the folio should be ended. Currently
this is done by atomically incrementing ifs->write_bytes_pending for
every range to be written back.

Instead, the number of atomic operations can be minimized by setting
ifs->write_bytes_pending to the folio size, internally tracking how many
bytes are written back asynchronously, and then after sending off all
the requests, decrementing ifs->write_bytes_pending by the number of
bytes not written back asynchronously. Now, for N ranges written back,
only N + 2 atomic operations are required instead of 2N + 2.
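
The bookkeeping can be sketched in userspace terms as follows (illustrative
only, not the iomap code; the folio size and range lengths are made up):

  #include <stdatomic.h>
  #include <stddef.h>
  #include <stdio.h>

  static atomic_size_t write_bytes_pending;

  int main(void)
  {
          const size_t folio_size = 16384;
          const size_t range_len[] = { 4096, 8192 };      /* hypothetical dirty ranges */
          const size_t nranges = sizeof(range_len) / sizeof(range_len[0]);
          size_t submitted = 0;

          atomic_store(&write_bytes_pending, folio_size);         /* 1 atomic op */
          for (size_t i = 0; i < nranges; i++)
                  submitted += range_len[i];                      /* submission: no atomics */
          /* drop the bytes that will never be written back asynchronously: 1 atomic op */
          atomic_fetch_sub(&write_bytes_pending, folio_size - submitted);

          /* each async completion subtracts its own length: N atomic ops */
          for (size_t i = 0; i < nranges; i++)
                  atomic_fetch_sub(&write_bytes_pending, range_len[i]);

          printf("remaining: %zu\n", atomic_load(&write_bytes_pending));
          return 0;
  }

That is the N + 2 total mentioned above, versus 2N + 2 when every range is
both atomically added on submission and subtracted on completion.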

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-5-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 10:50:32 +01:00
Joanne Koong 7e6cea5ae2
docs: document iomap writeback's iomap_finish_folio_write() requirement
Document that iomap_finish_folio_write() must be called after writeback
on the range completes.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-4-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 10:50:32 +01:00
Joanne Koong 9d875e0eef
iomap: account for unaligned end offsets when truncating read range
The end position to start truncating from may be at an offset into a
block, which under the current logic would result in overtruncation.

Adjust the calculation to account for unaligned end offsets.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-3-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 10:50:31 +01:00
Joanne Koong a0f1cabe29
iomap: rename bytes_pending/bytes_accounted to bytes_submitted/bytes_not_submitted
The naming "bytes_pending" and "bytes_accounted" may be confusing and
could be better named. Rename this to "bytes_submitted" and
"bytes_not_submitted" to make it more clear that these are bytes we
passed to the IO helper to read in.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://patch.msgid.link/20251111193658.3495942-2-joannelkoong@gmail.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 10:50:31 +01:00
Christian Brauner 76c63ff12e
Merge patch series "vfs: recall-only directory delegations for knfsd"
Jeff Layton <jlayton@kernel.org> says:

At the fall NFS Bakeathon last week, the NFS client and server
maintainers had a discussion about how to merge support for directory
delegations. We decided to start with just merging support for simple,
recallable-only directory delegation support, for a number of reasons:

1/ RFC8881 has some gaps in coverage that we are hoping to have
addressed in RFC8881bis. In particular, it's written such that CB_NOTIFY
callbacks require directory position information. That will be hard to
do properly under Linux, so we're planning to extend the spec to allow
that information to be omitted.

2/ client-side support for CB_NOTIFY still lags a bit. The client side
is tricky, as it involves heuristics about when to request a delegation.

3/ we have some early indication that simple, recallable-only
delegations can help performance in some cases. Anna mentioned seeing a
multi-minute speedup in xfstests runs with them enabled. This needs more
investigation, but it's promising and seems like enough justification to
merge support.

This patchset is quite similar to the set I initially posted back in
early 2024. We've merged some GET_DIR_DELEGATION handling patches
since then, but the VFS layer support is basically the same.

One thing that I want to make clear is that with this patchset, userspace
can request a read lease on a directory that will be recalled on
conflicting accesses. I saw no reason to prevent this, and I think it may
be something useful for applications like Samba.

As always, users can disable leases altogether via the fs.leases-enable
sysctl if this is an issue, but I wanted to point this out in case
anyone sees footguns here.

* patches from https://patch.msgid.link/20251111-dir-deleg-ro-v6-0-52f3feebb2f2@kernel.org:
  vfs: expose delegation support to userland
  nfsd: wire up GET_DIR_DELEGATION handling
  nfsd: allow DELEGRETURN on directories
  nfsd: allow filecache to hold S_IFDIR files
  filelock: lift the ban on directory leases in generic_setlease
  vfs: make vfs_symlink break delegations on parent dir
  vfs: make vfs_mknod break delegations on parent directory
  vfs: make vfs_create break delegations on parent directory
  vfs: clean up argument list for vfs_create()
  vfs: break parent dir delegations in open(..., O_CREAT) codepath
  vfs: allow rmdir to wait for delegation break on parent
  vfs: allow mkdir to wait for delegation break on parent
  vfs: add try_break_deleg calls for parents to vfs_{link,rename,unlink}
  filelock: push the S_ISREG check down to ->setlease handlers
  filelock: add struct delegated_inode
  filelock: rework the __break_lease API to use flags
  filelock: make lease_alloc() take a flags argument

Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-0-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:46 +01:00
Jeff Layton 1602bad16d
vfs: expose delegation support to userland
Now that support for recallable directory delegations is available,
expose this functionality to userland with new F_SETDELEG and F_GETDELEG
commands for fcntl().

Note that this also allows userland to request an FL_DELEG type lease on
files too. Userland applications that do so will get signalled on metadata
changes in addition to data changes (FL_LEASE leases are limited to the
latter).

These commands accept a new "struct delegation" argument that contains a
flags field for future expansion.
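
A hypothetical usage sketch follows; the presence of F_SETDELEG in <fcntl.h>,
the exact layout of struct delegation beyond the flags field mentioned above,
and how the delegation type is selected are all assumptions here, not details
taken from this patch:

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  #ifndef F_SETDELEG
  struct delegation {             /* assumed layout: just the flags field */
          unsigned int flags;
  };
  #endif

  int main(void)
  {
          struct delegation deleg = { .flags = 0 };
          int fd = open("/tmp", O_RDONLY | O_DIRECTORY);

          if (fd < 0)
                  return 1;
  #ifdef F_SETDELEG
          if (fcntl(fd, F_SETDELEG, &deleg) == -1)        /* request a recallable delegation */
                  perror("F_SETDELEG");
  #else
          (void)deleg;            /* headers without the new command: nothing to do */
  #endif
          close(fd);
          return 0;
  }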

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-17-52f3feebb2f2@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:37 +01:00
Jeff Layton 8b99f6a8c1
nfsd: wire up GET_DIR_DELEGATION handling
Add a new routine for acquiring a read delegation on a directory. These
are recallable-only delegations with no support for CB_NOTIFY. That will
be added in a later phase.

Since the same CB_RECALL/DELEGRETURN infrastructure is used for regular
and directory delegations, a normal nfs4_delegation is used to represent
a directory delegation.

Reviewed-by: NeilBrown <neil@brown.name>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-16-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:37 +01:00
Jeff Layton 80c8afddc8
nfsd: allow DELEGRETURN on directories
As Trond pointed out: "...provided that the presented stateid is
actually valid, it is also sufficient to uniquely identify the file to
which it is associated (see RFC8881 Section 8.2.4), so the filehandle
should be considered mostly irrelevant for operations like DELEGRETURN."

Don't ask fh_verify to filter on file type.

Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-15-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:37 +01:00
Jeff Layton 544a0ee152
nfsd: allow filecache to hold S_IFDIR files
The filecache infrastructure will only handle S_IFREG files at the
moment. Directory delegations will require adding support for opening
S_IFDIR inodes.

Plumb a "type" argument into nfsd_file_do_acquire() and have all of the
existing callers set it to S_IFREG. Add a new nfsd_file_acquire_dir()
wrapper that nfsd can call to request a nfsd_file that holds a directory
open.

For now, there is no need for a fsnotify_mark for directories, as
CB_NOTIFY is not yet supported. Change nfsd_file_do_acquire() to avoid
allocating one for non-S_IFREG inodes.

Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-14-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:36 +01:00
Jeff Layton d0eab9fc10
filelock: lift the ban on directory leases in generic_setlease
With the addition of the try_break_lease calls in directory changing
operations, allow generic_setlease to hand them out. Write leases on
directories are never allowed however, so continue to reject them.

For now, there is no API for requesting delegations from userland, so
ensure that userland is prevented from acquiring a lease on a directory.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-13-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:36 +01:00
Jeff Layton 92bf53577f
vfs: make vfs_symlink break delegations on parent dir
In order to add directory delegation support, we must break delegations
on the parent on any change to the directory.

Add a delegated_inode parameter to vfs_symlink() and have it break the
delegation. do_symlinkat() can then wait on the delegation break before
proceeding.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-12-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:36 +01:00
Jeff Layton e8960c1b2e
vfs: make vfs_mknod break delegations on parent directory
In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

Add a new delegated_inode pointer to vfs_mknod() and have the
appropriate callers wait when there is an outstanding delegation. All
other callers just set the pointer to NULL.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-11-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:36 +01:00
Jeff Layton c826229c6a
vfs: make vfs_create break delegations on parent directory
In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

Add a delegated_inode parameter to vfs_create. Most callers are
converted to pass in NULL, but do_mknodat() is changed to wait for a
delegation break if there is one.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-10-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:36 +01:00
Jeff Layton 85bbffcad7
vfs: clean up argument list for vfs_create()
As Neil points out:

"I would be in favour of dropping the "dir" arg because it is always
d_inode(dentry->d_parent) which is stable."

...and...

"Also *every* caller of vfs_create() passes ".excl = true".  So maybe we
don't need that arg at all."

Drop both arguments from vfs_create() and fix up the callers.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-9-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:36 +01:00
Jeff Layton 134796f43a
vfs: break parent dir delegations in open(..., O_CREAT) codepath
In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

Add a delegated_inode parameter to lookup_open and have it break the
delegation. Then, open_last_lookups can wait for the delegation break
and retry the call to lookup_open once it's done.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-8-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:35 +01:00
Jeff Layton 4fa76319cd
vfs: allow rmdir to wait for delegation break on parent
In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

Add a delegated_inode struct to vfs_rmdir() and populate that
pointer with the parent inode if it's non-NULL. Most existing in-kernel
callers pass in a NULL pointer.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-7-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:35 +01:00
Jeff Layton e12d203b8c
vfs: allow mkdir to wait for delegation break on parent
In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

Add a new delegated_inode parameter to vfs_mkdir. All of the existing
callers set that to NULL for now, except for do_mkdirat which will
properly block until the lease is gone.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-6-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:35 +01:00
Jeff Layton b46ebf9a76
vfs: add try_break_deleg calls for parents to vfs_{link,rename,unlink}
In order to add directory delegation support, we need to break
delegations on the parent whenever there is going to be a change in the
directory.

vfs_link, vfs_unlink, and vfs_rename all have existing delegation break
handling for the children in the rename. Add the necessary calls for
breaking delegations in the parent(s) as well.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-5-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:35 +01:00
Jeff Layton e6d28ebc17
filelock: push the S_ISREG check down to ->setlease handlers
When nfsd starts requesting directory delegations, setlease handlers may
see requests for leases on directories. Push the !S_ISREG check down
into the non-trivial setlease handlers, so we can selectively enable
them where they're supported.

FUSE is special: It's the only filesystem that supports atomic_open and
allows kernel-internal leases. atomic_open is issued when the VFS
doesn't know the state of the dentry being opened. If the file doesn't
exist, it may be created, in which case the dir lease should be broken.

The existing kernel-internal lease implementation has no provision for
this. Ensure that we don't allow directory leases by default going
forward by explicitly disabling them there.

Reviewed-by: NeilBrown <neil@brown.name>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-4-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:35 +01:00
Jeff Layton 6976ed2dd0
filelock: add struct delegated_inode
The current API requires a pointer to an inode pointer. It's easy for
callers to get this wrong. Add a new delegated_inode structure and use
that to pass back any inode that needs to be waited on.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-3-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:34 +01:00
Jeff Layton 4be9f3cc58
filelock: rework the __break_lease API to use flags
Currently __break_lease takes both a type and an openmode. With the
addition of directory leases, that makes less sense. Declare a set of
LEASE_BREAK_* flags that can be used to control how lease breaks work
instead of requiring a type and an openmode.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-2-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:34 +01:00
Jeff Layton 6fc5f2b19e
filelock: make lease_alloc() take a flags argument
__break_lease() currently overrides the flc_flags field in the lease
after allocating it. A forthcoming patch will add the ability to request
a FL_DELEG type lease.

Instead of overriding the flags field, add a flags argument to
lease_alloc() and lease_init() so the field is set correctly at allocation time.

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-1-52f3feebb2f2@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-12 09:38:31 +01:00
Ryan Chen 7083e14225 dt-bindings: interrupt-controller: aspeed,ast2700: Correct #interrupt-cells and interrupts count
Update the AST2700 interrupt controller binding to match the actual
hardware and the irq-aspeed-intc driver behavior.

 - Interrupts:

    First-level INTC banks request multiple interrupt lines to the root
    GIC, with a maximum of 10 per bank. Second-level INTC banks request
    only one interrupt line to their parent INTC-IC. Therefore, set the
    interrupts property to allow a minimum of 1 and a maximum of 10
    entries.

 - #interrupt-cells:

    Set '#interrupt-cells' to <1> since the aspeed intc driver does not
    support specifying a trigger type; only the interrupt index is used.

Signed-off-by: Ryan Chen <ryan_chen@aspeedtech.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://patch.msgid.link/20251030060155.2342604-2-ryan_chen@aspeedtech.com
2025-11-11 22:20:45 +01:00
Junhui Liu 47a4ebbf91 irqchip/aclint-sswi: Add Nuclei UX900 support
Reuse the generic ACLINT SSWI probe for Nuclei UX900 since it is
compliant with the ACLINT specification.

Signed-off-by: Junhui Liu <junhui.liu@pigmoral.tech>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251021-dr1v90-basic-dt-v3-9-5478db4f664a@pigmoral.tech
2025-11-11 22:17:22 +01:00
Junhui Liu a1c3a7d7ee dt-bindings: interrupt-controller: Add Anlogic DR1V90 ACLINT SSWI
Add SSWI support for Anlogic DR1V90 SoC, which uses Nuclei UX900 with a
TIMER unit compliant with the ACLINT specification.

Signed-off-by: Junhui Liu <junhui.liu@pigmoral.tech>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251021-dr1v90-basic-dt-v3-6-5478db4f664a@pigmoral.tech
2025-11-11 22:17:21 +01:00
Junhui Liu 579951da64 dt-bindings: interrupt-controller: Add Anlogic DR1V90 ACLINT MSWI
Add MSWI support for Anlogic DR1V90 SoC, which uses Nuclei UX900 with a
TIMER unit compliant with the ACLINT specification.

Signed-off-by: Junhui Liu <junhui.liu@pigmoral.tech>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251021-dr1v90-basic-dt-v3-5-5478db4f664a@pigmoral.tech
2025-11-11 22:17:21 +01:00
Junhui Liu b90ac5fe32 dt-bindings: interrupt-controller: Add Anlogic DR1V90 PLIC
Add PLIC support for Anlogic DR1V90.

Signed-off-by: Junhui Liu <junhui.liu@pigmoral.tech>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20251021-dr1v90-basic-dt-v3-4-5478db4f664a@pigmoral.tech
2025-11-11 22:17:21 +01:00
Krzysztof Kozlowski 45cc441de7 irqchip/irq-bcm7038-l1: Remove unused reg_mask_status()
reg_mask_status() is not referenced anywhere, leading to a W=1 warning:

  irq-bcm7038-l1.c:85:28: error: unused function 'reg_mask_status' [-Werror,-Wunused-function]

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20251106155200.337399-2-krzysztof.kozlowski@linaro.org
2025-11-11 22:11:17 +01:00
Charles Mirabile a045359e72 irqchip/sifive-plic: Fix call to __plic_toggle() in M-Mode code path
The code path for M-Mode Linux that disables interrupts for other contexts
was missed when refactoring __plic_toggle().

Since the new version caches updates to the state for the primary context,
its use in this codepath is no longer desirable even if it could be made
correct.

Replace the calls to __plic_toggle() with a loop that simply disables all
of the interrupts in groups of 32 with a direct mmio write.

Fixes: 14ff9e54dd ("irqchip/sifive-plic: Cache the interrupt enable state")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Charles Mirabile <cmirabil@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251103161813.2437427-1-cmirabil@redhat.com
Closes: https://lore.kernel.org/oe-kbuild-all/202510271316.AQM7gCCy-lkp@intel.com/
2025-11-11 22:11:16 +01:00
Shrikanth Hegde 65177ea9f6 sched/deadline: Minor cleanup in select_task_rq_dl()
In select_task_rq_dl(), there is only one goto statement; there is no
need for it.

No functional changes.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20251014100342.978936-2-sshegde@linux.ibm.com
2025-11-11 17:27:55 +01:00
Shrikanth Hegde b4bfacd392 sched/deadline: Use cpumask_weight_and() in dl_bw_cpus
When cpumask_subset(a, b) holds, cpumask_weight(a) is the same as
cpumask_weight_and(a, b), and counting CPUs with for_each_cpu_and(a, b)
can be replaced by cpumask_weight_and(a, b).

No functional change. It could save a few cycles since cpumask_weight_and()
is more efficient, and it needs one less stack variable.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20251014100342.978936-3-sshegde@linux.ibm.com
2025-11-11 17:27:55 +01:00
Peter Zijlstra 2614069c59 sched/deadline: Document dl_server
Place the notes that resulted from going through the dl_server code in a
comment.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-11-11 17:27:50 +01:00
Peter Zijlstra f5a538c07d sched/deadline: Fix dl_server stop condition
Gabriel reported that the dl_server doesn't stop as expected.

The problem was found to be the fact that idle time and fair runtime are
treated equally. Both will count towards dl_server runtime and push the
activation forwards when it is in the zero-laxity wait state.

Notably:

  dl_server_update_idle()
    update_curr_dl_se()
      if (dl_defer && dl_throttled && dl_runtime_exceeded())
        hrtimer_try_to_cancel(); // stop timer
	replenish_dl_new_period()
	  deadline = now + dl_deadline; // fwd period
	  runtime = dl_runtime;
        start_dl_timer(); // restart timer

And while we do want idle time accounted towards the *current* activation of
the dl_server -- after all, a fair task could've ran if we had any -- we don't
necessarily want idle time to cause or push forward an activation.

Introduce dl_defer_idle to make this distinction. It will be set once idle time
pushed the activation forward, once set idle time will only be allowed to
consume any runtime but not push the activation. This will then cause
dl_server_timer() to fire, which will stop the dl_server.

Any non-idle time accounting during this phase will clear dl_defer_idle, so
only a full period of idle will cause the dl_server to stop.

Reported-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251101000057.GA2184199@noisy.programming.kicks-ass.net
2025-11-11 12:33:39 +01:00
Peter Zijlstra e636ffb9e3 sched/deadline: Fix dl_server time accounting
The dl_server time accounting code is a little odd. The normal scheduler
pattern is to update curr before doing something, such that the old state is
fully accounted before changing state.

Notably, the dl_server_timer() needs to propagate the current time accounting
since the current task could be run by the dl_server and thus this can affect
dl_se->runtime. Similarly for dl_server_start().

And since the (deferred) dl_server wants idle time accounted, rework
sched_idle_class time accounting to be more like all the others.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251020141130.GJ3245006@noisy.programming.kicks-ass.net
2025-11-11 12:33:38 +01:00
Hao Jia e40cea333e sched/core: Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
Since commit d4c64207b8 ("sched: Cleanup the sched_change NOCLOCK usage"),
update_rq_clock() is called in do_set_cpus_allowed() -> sched_change_begin()
to update the rq clock. This results in a duplicate call to update_rq_clock()
in __set_cpus_allowed_ptr_locked().

While holding the rq lock and before calling do_set_cpus_allowed(),
there is nothing that depends on an updated rq_clock.

Therefore, remove the redundant update_rq_clock() in
__set_cpus_allowed_ptr_locked() to avoid the warning about double
rq clock updates.

Fixes: d4c64207b8 ("sched: Cleanup the sched_change NOCLOCK usage")
Signed-off-by: Hao Jia <jiahao1@lixiang.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20251029093655.31252-1-jiahao.kernel@gmail.com
2025-11-11 12:33:38 +01:00
Peter Zijlstra 79f3f9bedd sched/eevdf: Fix min_vruntime vs avg_vruntime
Basically, from the constraint that the sum of lag is zero, you can
infer that the 0-lag point is the weighted average of the individual
vruntime, which is what we're trying to compute:

        \Sum w_i * v_i
  avg = --------------
           \Sum w_i

Now, since vruntime takes the whole u64 (worse, it wraps), this
multiplication term in the numerator is not something we can compute;
instead we do the min_vruntime (v0 henceforth) thing like:

  v_i = (v_i - v0) + v0

This does two things:
 - it keeps the key: (v_i - v0) 'small';
 - it creates a relative 0-point in the modular space.

If you do that substitution and work it all out, you end up with:

        \Sum w_i * (v_i - v0)
  avg = --------------------- + v0
              \Sum w_i

Since you cannot very well track a ratio like that (and not suffer
terrible numerical problems) we simply track the numerator and
denominator individually and only perform the division when strictly
needed.

Notably, the numerator lives in cfs_rq->avg_vruntime and the denominator
lives in cfs_rq->avg_load.

The one extra 'funny' is that these numbers track the entities in the
tree, and current is typically outside of the tree, so avg_vruntime()
adds current when needed before doing the division.

(vruntime_eligible() elides the division by cross-wise multiplication)

Anyway, as mentioned above, we currently use the CFS era min_vruntime
for this purpose. However, this thing can only move forward, while the
above avg can in fact move backward (when a non-eligible task leaves,
the average becomes smaller). This can cause trouble when, through
happenstance (or construction), these values drift far enough apart to
wreck the game.

Replace cfs_rq::min_vruntime with cfs_rq::zero_vruntime which is kept
near/at avg_vruntime, following its motion.

The down-side is that this requires computing the avg more often.

Fixes: 147f3efaa2 ("sched/fair: Implement an EEVDF-like scheduling policy")
Reported-by: Zicheng Qu <quzicheng@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251106111741.GC4068168@noisy.programming.kicks-ass.net
Cc: stable@vger.kernel.org
2025-11-11 12:33:38 +01:00
Peter Zijlstra 9359d9785d sched/core: Add comment explaining force-idle vruntime snapshots
I always end up having to re-read these emails every time I look at
this code. And a future patch is going to change this story a little.
This means it is past time to stick them in a comment so it can be
modified and stay current.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200506143506.GH5298@hirez.programming.kicks-ass.net
Link: https://lkml.kernel.org/r/20200515103844.GG2978@hirez.programming.kicks-ass.net
Link: https://patch.msgid.link/20251106111603.GB4068168@noisy.programming.kicks-ass.net
2025-11-11 12:33:37 +01:00
Fernand Sieber 7f829bde94 sched/core: Optimize core cookie matching check
Return true early if the core cookie matches. This avoids the SMT mask
loop that checks for an idle core, which might be more expensive on wide
platforms.

Signed-off-by: Fernand Sieber <sieberf@amazon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Reviewed-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Link: https://patch.msgid.link/20251105152538.470586-1-sieberf@amazon.com
2025-11-11 12:33:37 +01:00
Fernand Sieber 127b90315c sched/proxy: Yield the donor task
When executing a task in proxy context, handle yields as if they were
requested by the donor task. This matches the traditional PI semantics
of yield() as well.

This avoids scenarios like the proxy task yielding, pick-next-task selecting
the same previously blocked donor, running the proxy task again, etc.

Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202510211205.1e0f5223-lkp@intel.com
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Fernand Sieber <sieberf@amazon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251106104022.195157-1-sieberf@amazon.com
2025-11-11 12:33:36 +01:00
Mateusz Guzik dca3aa666f
fs: move inode fields used during fast path lookup closer together
This should avoid *some* cache misses.

Successful path lookup is guaranteed to load at least ->i_mode,
->i_opflags and ->i_acl. At the same time the common case will avoid
looking at more fields.

struct inode is not guaranteed to have any particular alignment; notably,
ext4 has it aligned only to 8 bytes, meaning nearby fields might happen
to be on the same or only adjacent cache lines depending on luck (or no
luck).

According to pahole:
        umode_t                    i_mode;               /*     0     2 */
        short unsigned int         i_opflags;            /*     2     2 */
        kuid_t                     i_uid;                /*     4     4 */
        kgid_t                     i_gid;                /*     8     4 */
        unsigned int               i_flags;              /*    12     4 */
        struct posix_acl *         i_acl;                /*    16     8 */
        struct posix_acl *         i_default_acl;        /*    24     8 */

->i_acl is unnecessarily separated by 8 bytes from the other fields.
With struct inode being offset 48 bytes into the cacheline this means an
avoidable miss. Note it will still be there for the 56 byte case.

New layout:
        umode_t                    i_mode;               /*     0     2 */
        short unsigned int         i_opflags;            /*     2     2 */
        unsigned int               i_flags;              /*     4     4 */
        struct posix_acl *         i_acl;                /*     8     8 */
        struct posix_acl *         i_default_acl;        /*    16     8 */
        kuid_t                     i_uid;                /*    24     4 */
        kgid_t                     i_gid;                /*    28     4 */

I verified with pahole there are no size or hole changes.

This is a stopgap until someone(tm) sanitizes the layout in the first
place, allocation methods aside.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251109121931.1285366-1-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:49:54 +01:00
Christian Brauner 18b5c40048
Merge patch series "ns: header cleanups and initial namespace reference count improvements"
Christian Brauner <brauner@kernel.org> says:

Clean up the namespace headers by splitting them into types and helpers.
Better separate common namespace types and functions from namespace tree
types and functions.

Fix the reference counts of initial namespaces so we don't do any
pointless cacheline ping-pong for them when we know they can never go
away. Add a bunch of asserts for both the passive and active reference
counts to catch any changes that would break it.

* patches from https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-0-e8a9264e0fb9@kernel.org:
  selftests/namespaces: fix nsid tests
  ns: drop custom reference count initialization for initial namespaces
  pid: rely on common reference count behavior
  ns: add asserts for initial namespace active reference counts
  ns: add asserts for initial namespace reference counts
  ns: make all reference counts on initial namespace a nop
  ipc: enable is_ns_init_id() assertions
  fs: use boolean to indicate anonymous mount namespace
  ns: rename is_initial_namespace()
  ns: make is_initial_namespace() argument const
  nstree: use guards for ns_tree_lock
  nstree: simplify owner list iteration
  nstree: switch to new structures
  nstree: add helper to operate on struct ns_tree_{node,root}
  nstree: move nstree types into separate header
  nstree: decouple from ns_common header
  ns: move namespace types into separate header

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-0-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:37 +01:00
Christian Brauner 6453937581
selftests/namespaces: fix nsid tests
Ensure that we always kill and cleanup all processes.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-17-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:32 +01:00
Christian Brauner c2bbd2db52
ns: drop custom reference count initialization for initial namespaces
Initial namespaces don't modify their reference count anymore.
They remain fixed at one so drop the custom refcount initializations.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-16-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:32 +01:00
Christian Brauner 282879afa0
pid: rely on common reference count behavior
Now that we changed the generic reference counting mechanism for all
namespaces to never manipulate reference counts of initial namespaces we
can drop the special handling for pid namespaces.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-15-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:32 +01:00
Christian Brauner 7118daabb6
ns: add asserts for initial namespace active reference counts
They always remain fixed at one. Notice when that assumption is broken.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-14-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:32 +01:00
Christian Brauner 2b60d56acc
ns: add asserts for initial namespace reference counts
They always remain fixed at one. Notice when that assumption is broken.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-13-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:31 +01:00
Christian Brauner 657aeb436d
ns: make all reference counts on initial namespace a nop
They are always active, so there is no need to cause needless cacheline ping-pong.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-12-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:31 +01:00
Christian Brauner 3826d5dd06
ipc: enable is_ns_init_id() assertions
The ipc namespace may call put_ipc_ns() and get_ipc_ns() before it is
added to the namespace tree. Assign the id early like we do for a some
other namespaces.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-11-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:31 +01:00
Christian Brauner d9a44089ac
fs: use boolean to indicate anonymous mount namespace
Stop playing games with the namespace id and use a boolean instead:

* This will remove the special-casing we need to do everywhere for mount
  namespaces.

* It will allow us to use asserts on the namespace id for initial
  namespaces everywhere.

* It will allow us to put anonymous mount namespaces on the namespaces
  trees in the future and thus make them available to statmount() and
  listmount().

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-10-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:31 +01:00
Christian Brauner 6bf253855a
ns: rename is_initial_namespace()
Rename is_initial_namespace() to ns_init_inum() and make it symmetrical
with the ns id variant.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-9-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:31 +01:00
Christian Brauner ed93c0697a
ns: make is_initial_namespace() argument const
We don't modify the data structure at all so pass it as const.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-8-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:31 +01:00
Christian Brauner 298ab06ae4
nstree: use guards for ns_tree_lock
Make use of the guard infrastructure for ns_tree_lock.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-7-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:31 +01:00
Christian Brauner 8a30420c89
nstree: simplify owner list iteration
Make use of list_for_each_entry_from_rcu().

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-6-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:30 +01:00
Christian Brauner a657bc8a75
nstree: switch to new structures
Switch the nstree management to the new combined structures.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-5-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:30 +01:00
Christian Brauner d12ea8062f
nstree: add helper to operate on struct ns_tree_{node,root}
Add helpers that work on the rbtree and rculist combined.
This will make the code a lot more manageable and legible.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-4-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:30 +01:00
Christian Brauner 1c64fb02ac
nstree: move nstree types into separate header
Introduce two new fundamental data structures for namespace tree
management in a separate header file.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-3-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:30 +01:00
Christian Brauner ea1549e628
nstree: decouple from ns_common header
Forward declare struct ns_common and remove the include of ns_common.h.
We want ns_common.h to possibly include nstree structures but not the
other way around.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-2-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:30 +01:00
Christian Brauner 2b9a0f21fb
ns: move namespace types into separate header
Add a dedicated header for namespace types.

Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-1-e8a9264e0fb9@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 10:01:30 +01:00
Christian Brauner a67ee4e2ba
Merge branch 'kbuild-6.19.fms.extension'
Bring in the shared branch with the kbuild tree to enable
'-fms-extensions' for 6.19. Further namespace cleanup work
requires this extension.

Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-11 09:59:08 +01:00
Ma Ke f18e71cd6c EDAC/ie31200: Fix error handling in ie31200_register_mci
ie31200_register_mci() calls device_initialize() for priv->dev
unconditionally. However, in the error path, put_device() is not
called, leading to an imbalance. Similarly, in the unload path,
put_device() is missing.

Although edac_mc_free() eventually frees the memory, it does not
release the device initialized by device_initialize(). For code
readability and proper pairing of device_initialize()/put_device(),
add put_device() calls in both error and unload paths.

Found by code review.

Signed-off-by: Ma Ke <make24@iscas.ac.cn>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Link: https://patch.msgid.link/20251106084735.35017-1-make24@iscas.ac.cn
2025-11-10 17:06:10 -08:00
Christian Brauner ae901e5e2e
Merge patch series "ns: fixes for namespace iteration and active reference counting"
Christian Brauner <brauner@kernel.org> says:

* Make sure to initialize the active reference count for the initial
  network namespace and prevent __ns_common_init() from returning too
  early.

* Make sure that passive reference counts are dropped outside of rcu
  read locks as some namespaces such as the mount namespace do in fact
  sleep when putting the last reference.

* The setns() system call supports:

  (1) namespace file descriptors (nsfd)
  (2) process file descriptors (pidfd)

  When using nsfds the namespaces will remain active because they are
  pinned by the vfs. However, when pidfds are used things are more
  complicated.

  When the target task exits and passes through exit_nsproxy_namespaces(),
  or is reaped and thus also passes through exit_cred_namespaces(), after
  the setns()'ing task has called prepare_nsset() but before commit_nsset(),
  the active reference count of the set of namespaces it wants to setns()
  to might have been dropped already:

    P1                                                              P2

    pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
                                                                    pidfd = pidfd_open(pid_p1)
                                                                    setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
                                                                    prepare_nsset()

    exit(0)
    // ns->__ns_active_ref        == 1
    // parent_ns->__ns_active_ref == 1
    -> exit_nsproxy_namespaces()
    -> exit_cred_namespaces()

    // ns_active_ref_put() will also put
    // the reference on the owner of the
    // namespace. If the only reason the
    // owning namespace was alive was
    // because it was a parent of @ns
    // its active reference count now goes
    // to zero... --------------------------------
    //                                           |
    // ns->__ns_active_ref        == 0           |
    // parent_ns->__ns_active_ref == 0           |
                                                 |                  commit_nsset()
                                                 -----------------> // If setns()
                                                                    // now manages to install the namespaces
                                                                    // it will call ns_active_ref_get()
                                                                    // on them thus bumping the active reference
                                                                    // count from zero again but without also
                                                                    // taking the required reference on the owner.
                                                                    // Thus we get:
                                                                    //
                                                                    // ns->__ns_active_ref        == 1
                                                                    // parent_ns->__ns_active_ref == 0

    When later someone does ns_active_ref_put() on @ns it will underflow
    parent_ns->__ns_active_ref leading to a splat from our asserts
    thinking there are still active references when in fact the counter
    just underflowed.

  So resurrect the ownership chain if necessary as well. If the caller
  succeeded in grabbing passive references to the set of namespaces, the
  setns() should simply succeed even if the target task exits or gets
  reaped in the meantime.

  The race is rare and can only be triggered when using pidfds to setns()
  to namespaces. Also note that active references on initial namespaces are
  nops.

  Since we now always handle parent references directly we can drop
  ns_ref_active_get_owner() when adding a namespace to a namespace tree.
  This is now all handled uniformly in the places where the new namespaces
  actually become active.

* patches from https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org:
  selftests/namespaces: test for efault
  selftests/namespaces: add active reference count regression test
  ns: add asserts for active refcount underflow
  ns: handle setns(pidfd, ...) cleanly
  ns: return EFAULT on put_user() error
  ns: make sure references are dropped outside of rcu lock
  ns: don't increment or decrement initial namespaces
  ns: don't skip active reference count initialization

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 15:54:02 +01:00
Christian Brauner 07d7ad46da
selftests/namespaces: test for efault
Ensure that put_user() can fail and that namespace cleanup works
correctly.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-8-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 15:53:56 +01:00
Borislav Petkov (AMD) 249092174c tools/objtool: Copy the __cleanup unused variable fix for older clang
Copy from

  54da6a0924 ("locking: Introduce __cleanup() based infrastructure")

the bits which mark the variable carrying a cleanup attribute as unused, so that
my clang 15 can dispose of it properly instead of warning that it is unused, which
then fails the build due to -Werror.

Suggested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/20251031114919.GBaQSiPxZrziOs3RCW@fat_crate.local
2025-11-10 12:46:08 +01:00
Uros Bizjak fd4e025526 x86/percpu: Use BIT_WORD() and BIT_MASK() macros
Use BIT_WORD() and BIT_MASK() macros from <linux/bits.h>
in <arch/x86/include/asm/percpu.h> instead of open-coding them.

No functional change intended.
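
For reference, a userspace illustration of what the macros expand to; the
definitions below mirror <linux/bits.h>, with BITS_PER_LONG derived from
unsigned long here:

  #include <limits.h>
  #include <stdio.h>

  #define BITS_PER_LONG   (sizeof(unsigned long) * CHAR_BIT)
  #define BIT_WORD(nr)    ((nr) / BITS_PER_LONG)
  #define BIT_MASK(nr)    (1UL << ((nr) % BITS_PER_LONG))

  int main(void)
  {
          unsigned int nr = 75;

          /* open-coded form vs. the macro form the patch switches to */
          printf("word: %zu vs %zu\n", nr / BITS_PER_LONG, BIT_WORD(nr));
          printf("mask: %#lx vs %#lx\n", 1UL << (nr % BITS_PER_LONG), BIT_MASK(nr));
          return 0;
  }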

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20250907184915.78041-1-ubizjak@gmail.com
2025-11-10 11:55:54 +01:00
Christian Brauner aa70b9cf68
Merge branch 'kbuild-6.19.fms.extension'
Bring in the shared branch with the Kbuild tree for enabling
'-fms-extensions' for 6.19.

Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 10:41:58 +01:00
Christian Brauner 3c60b0b1e5 Shared branch between Kbuild and other trees for enabling '-fms-extensions' for 6.19
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQR74yXHMTGczQHYypIdayaRccAalgUCaQzbRwAKCRAdayaRccAa
 ls8lAP9Dj1mOl+KTtajMvDnDTym4Sso9CaFP+5maFAv9CflAIwEA5QEtSwI9sMcH
 ty8x9Y6TTuib+ns37i2jxR8cIt4jHwU=
 =WA6M
 -----END PGP SIGNATURE-----
gpgsig -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaRGyjgAKCRCRxhvAZXjc
 okJ7AQDFRmoidpqHmRlvZQ3aismKkNHfx2k67QtRlX+YxDi8rQEAmmKyKUiX/SZV
 39TroGfiJ5ytQuXwz3QxG/34cA+kAgQ=
 =HqhX
 -----END PGP SIGNATURE-----

Merge patch "kbuild: Add '-fms-extensions' to areas with dedicated CFLAGS"

Nathan Chancellor <nathan@kernel.org> says:

Shared branch between Kbuild and other trees for enabling
'-fms-extensions' for 6.19.

* tag 'kbuild-ms-extensions-6.19' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/kbuild/linux:
  kbuild: Add '-fms-extensions' to areas with dedicated CFLAGS
  Kbuild: enable -fms-extensions
  jfs: Rename _inline to avoid conflict with clang's '-fms-extensions'

Link: https://patch.msgid.link/20251101-kbuild-ms-extensions-dedicated-cflags-v1-1-38004aba524b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 10:38:07 +01:00
Christian Brauner 88efd7c699
selftests/namespaces: add active reference count regression test
Add a regression test for setns() with pidfd.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-7-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 10:20:54 +01:00
Christian Brauner 57b39aabb9
ns: add asserts for active refcount underflow
Add a few more assert to detect active reference count underflows.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-6-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 10:20:54 +01:00
Christian Brauner f8d5a8970d
ns: handle setns(pidfd, ...) cleanly
The setns() system call supports:

(1) namespace file descriptors (nsfd)
(2) process file descriptors (pidfd)

When using nsfds the namespaces will remain active because they are
pinned by the vfs. However, when pidfds are used things are more
complicated.

When the target task exits and passes through exit_nsproxy_namespaces(),
or is reaped and thus also passes through exit_cred_namespaces(), after
the setns()'ing task has called prepare_nsset() but before commit_nsset(),
the active reference count of the set of namespaces it wants to setns()
to might have been dropped already:

  P1                                                              P2

  pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
                                                                  pidfd = pidfd_open(pid_p1)
                                                                  setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
                                                                  prepare_nsset()

  exit(0)
  // ns->__ns_active_ref        == 1
  // parent_ns->__ns_active_ref == 1
  -> exit_nsproxy_namespaces()
  -> exit_cred_namespaces()

  // ns_active_ref_put() will also put
  // the reference on the owner of the
  // namespace. If the only reason the
  // owning namespace was alive was
  // because it was a parent of @ns
  // its active reference count now goes
  // to zero... --------------------------------
  //                                           |
  // ns->__ns_active_ref        == 0           |
  // parent_ns->__ns_active_ref == 0           |
                                               |                  commit_nsset()
                                               -----------------> // If setns()
                                                                  // now manages to install the namespaces
                                                                  // it will call ns_active_ref_get()
                                                                  // on them thus bumping the active reference
                                                                  // count from zero again but without also
                                                                  // taking the required reference on the owner.
                                                                  // Thus we get:
                                                                  //
                                                                  // ns->__ns_active_ref        == 1
                                                                  // parent_ns->__ns_active_ref == 0

  When later someone does ns_active_ref_put() on @ns it will underflow
  parent_ns->__ns_active_ref leading to a splat from our asserts
  thinking there are still active references when in fact the counter
  just underflowed.

So resurrect the ownership chain if necessary as well. If the caller
succeeded in grabbing passive references to the set of namespaces, the
setns() should simply succeed even if the target task exits or gets
reaped in the meantime and thus has dropped all active references to its
namespaces.

The race is rare and can only be triggered when using pidfds to setns()
to namespaces. Also note that active references on initial namespaces are
nops.

Since we now always handle parent references directly we can drop
ns_ref_active_get_owner() when adding a namespace to a namespace tree.
This is now all handled uniformly in the places where the new namespaces
actually become active.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-5-ae8a4ad5a3b3@kernel.org
Fixes: 3c9820d5c64a ("ns: add active reference count")
Reported-by: syzbot+1957b26299cf3ff7890c@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 10:20:54 +01:00
Christian Brauner a51dce7c32
ns: return EFAULT on put_user() error
Don't return EINVAL, return EFAULT just like we do in other system
calls.
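
A minimal illustration of the convention (not the exact call site):

  if (put_user(karg, uptr))
          return -EFAULT;         /* fault while copying to userspace, not -EINVAL */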

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-4-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 10:20:54 +01:00
Christian Brauner 2ec2aff3c8
ns: make sure references are dropped outside of rcu lock
The mount namespace may in fact sleep when putting the last passive
reference so we need to drop the namespace reference outside of the rcu
read lock. Do this by delaying the put until the next iteration where
we've already moved on to the next namespace and legitimized it. Once we
drop the rcu read lock to call put_user() we will also drop the
reference to the previous namespace in the tree.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-3-ae8a4ad5a3b3@kernel.org
Fixes: 76b6f5dfb3 ("nstree: add listns()")
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 10:20:53 +01:00
Christian Brauner 7cd3d20441
ns: don't increment or decrement initial namespaces
There's no need to bump the active reference counts of initial
namespaces as they're always active and can simply remain at 1.
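
A minimal sketch of the idea (the initial-namespace check and the exact type
of __ns_active_ref are assumed here):

  static inline void ns_active_ref_get(struct ns_common *ns)
  {
          if (ns_is_initial(ns))
                  return;         /* initial namespaces simply stay at 1 */
          refcount_inc(&ns->__ns_active_ref);
  }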

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-2-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 10:20:53 +01:00
Christian Brauner 0355dcae2d
ns: don't skip active reference count initialization
Don't skip active reference count initialization for initial namespaces.
Doing this will break network namespace active reference counting.

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-1-ae8a4ad5a3b3@kernel.org
Fixes: 3a18f80918 ("ns: add active reference count")
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-10 10:20:50 +01:00
Arnd Bergmann 780813d701 x86/math-emu: Fix div_Xsig() prototype
The third argument of div_Xsig() is the output of the division, but is marked
'const', which means the compiler is not expecting it to be updated and may
generate bad code around the call. clang-21 now warns about the pattern since
an uninitialized variable is passed by reference into a 'const' argument in
two places:

  arch/x86/math-emu/poly_atan.c:93:28: error: variable 'argSignif' is uninitialized \
  when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer]
     93 |         div_Xsig(&Numer, &Denom, &argSignif);
        |                                   ^~~~~~~~~
  arch/x86/math-emu/poly_l2.c:195:29: error: variable 'argSignif' is uninitialized \
  when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer]
    195 |                 div_Xsig(&Numer, &Denom, &argSignif);
        |                                           ^~~~~~~~~

The implementation is in assembly, so the problem has gone unnoticed since the
code was added in the linux-1.1 days. Remove the 'const' marker here.
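
The change boils down to dropping the 'const' from the output parameter,
roughly (parameter names and the qualifiers of the input parameters are
assumed here):

  -asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
  +asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, Xsig *dest);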

Fixes: e19a1bdb835c ("Import 1.1.38")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20250807205334.123231-1-arnd@kernel.org
2025-11-09 21:01:08 +01:00
Julian Stecklina ed4f9638d9 x86/apic: Fix frequency in apic=verbose log output
When apic=verbose is specified, the LAPIC timer calibration prints its results
to the console. At least while debugging virtualization code, the CPU and bus
frequencies are printed incorrectly.

Specifically, for a 1.7 GHz CPU with 1 GHz bus frequency and HZ=1000,
the log includes a superfluous 0 after the period:

  ..... calibration result: 999978
  ..... CPU clock speed is 1696.0783 MHz.
  ..... host bus clock speed is 999.0978 MHz.

Looking at the code, this only worked as intended for HZ=100. After the fix,
the correct frequency is printed:

  ..... calibration result: 999828
  ..... CPU clock speed is 1696.507 MHz.
  ..... host bus clock speed is 999.828 MHz.

There is no functional change to the LAPIC calibration here, beyond the
printing format changes.
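
For reference, the arithmetic behind the corrected output can be sketched as
follows (standalone illustration only, not the kernel's printing code):

  unsigned long khz = 999828UL * HZ / 1000;       /* bus cycles/jiffy -> kHz */
  pr_info("..... host bus clock speed is %lu.%03lu MHz.\n",
          khz / 1000, khz % 1000);                /* prints 999.828 MHz for HZ=1000 */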

  [ bp: - Massage commit message
        - Figures it should apply this patch about ~4 years later
        - Massage it into the current code ]

Suggested-by: Markus Napierkowski <markus.napierkowski@cyberus-technology.de>
Signed-off-by: Julian Stecklina <julian.stecklina@cyberus-technology.de>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20211030142148.143261-1-js@alien8.de
2025-11-07 17:48:14 +01:00
Peter Zijlstra 2093d8cf80 perf/x86/intel: Optimize PEBS extended config
Similar to enable_acr_event, avoid the branch.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-11-07 15:08:23 +01:00
Peter Zijlstra 02da693f66 perf/x86/intel: Check PEBS dyn_constraints
Handle the interaction between ("perf/x86/intel: Update dyn_constraint
base on PEBS event precise level") and ("perf/x86/intel: Add a check
for dynamic constraints").

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-11-07 15:08:23 +01:00
Kan Liang bd24f9beed perf/x86/intel: Add a check for dynamic constraints
The current event scheduler has a limit: if the counter constraint of an
event is not a subset of any other counter constraint with an equal or
higher weight, the counters may not be fully utilized.

To workaround it, the commit bc1738f6ee ("perf, x86: Fix event
scheduler for constraints with overlapping counters") introduced an
overlap flag, which is hardcoded to the event constraint that may
trigger the limit. It only works for static constraints.

Many features on and after Intel PMON v6 require dynamic constraints. An
event constraint is decided by both static and dynamic constraints at
runtime. See commit 4dfe3232cc ("perf/x86: Add dynamic constraint").
The dynamic constraints come from CPUID enumeration, so it's impossible to
hardcode them in advance, and it's not practical to set the overlap flag on
all events since that is harmful to the scheduler.

For the existing Intel platforms, the dynamic constraints don't trigger
the limit. A real fix is not required.

However, for virtualization, a VMM may give a weird CPUID enumeration to a
guest. It's impossible to anticipate what that weird enumeration will be. A
check is introduced which can list the possible breakage if a weird
enumeration is used.

Check the dynamic constraints enumerated for normal, branch counter
logging, and auto-counter reload events.
Check both PEBS and non-PEBS constraints.

Closes: https://lore.kernel.org/lkml/20250416195610.GC38216@noisy.programming.kicks-ass.net/
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20250512175542.2000708-1-kan.liang@linux.intel.com
2025-11-07 15:08:23 +01:00
Dapeng Mi bb5f13df3c perf/x86/intel: Add counter group support for arch-PEBS
Based on the previous adaptive PEBS counter snapshot support, add counter
group support for architectural PEBS. Since arch-PEBS shares the same
counter group layout with adaptive PEBS, directly reuse the
__setup_pebs_counter_group() helper to process the arch-PEBS counter group.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-13-dapeng1.mi@linux.intel.com
2025-11-07 15:08:22 +01:00
Dapeng Mi 52448a0a73 perf/x86/intel: Setup PEBS data configuration and enable legacy groups
Different from legacy PEBS, arch-PEBS provides per-counter PEBS data
configuration by programming the IA32_PMC_GPx/FXx_CFG_C MSRs.

This patch obtains the PEBS data configuration from the event attribute,
writes it to the IA32_PMC_GPx/FXx_CFG_C MSR and enables the corresponding
PEBS groups.

Note that this patch only enables XMM SIMD regs sampling for arch-PEBS;
sampling of the other SIMD regs (OPMASK/YMM/ZMM) on arch-PEBS will be
supported once PMI-based sampling of those regs is supported.

Co-developed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-12-dapeng1.mi@linux.intel.com
2025-11-07 15:08:22 +01:00
Dapeng Mi e89c5d1f29 perf/x86/intel: Update dyn_constraint base on PEBS event precise level
arch-PEBS provides CPUID leaves to enumerate which counters support PEBS
sampling and precise-distribution PEBS sampling. Thus PEBS constraints
should be configured dynamically based on these counter and
precise-distribution bitmaps instead of being defined statically.

Update the event's dyn_constraint based on the PEBS event's precise level.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-11-dapeng1.mi@linux.intel.com
2025-11-07 15:08:22 +01:00
Dapeng Mi 2721e8da2d perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR
Arch-PEBS introduces a new MSR, IA32_PEBS_BASE, to store the arch-PEBS
buffer's physical address. This patch allocates the arch-PEBS buffer and
then initializes the IA32_PEBS_BASE MSR with the buffer's physical address.

Co-developed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-10-dapeng1.mi@linux.intel.com
2025-11-07 15:08:22 +01:00
Dapeng Mi d21954c8a0 perf/x86/intel: Process arch-PEBS records or record fragments
A significant difference from adaptive PEBS is that an arch-PEBS record
supports fragments, which means an arch-PEBS record could be split into
several independent fragments, each with its own arch-PEBS header.

This patch defines the architectural PEBS record layout structures and adds
helpers to process arch-PEBS records or fragments. Only legacy PEBS
groups like the basic, GPR, XMM and LBR groups are supported in this patch;
capturing of the newly added YMM/ZMM/OPMASK vector registers will be
supported in the future.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-9-dapeng1.mi@linux.intel.com
2025-11-07 15:08:21 +01:00
Dapeng Mi 167cde7dc9 perf/x86/intel/ds: Factor out PEBS group processing code to functions
Adaptive PEBS and arch-PEBS share a lot of the same code to process PEBS
groups, like the basic, GPR and meminfo groups. Extract this shared code
into generic functions to avoid duplication.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-8-dapeng1.mi@linux.intel.com
2025-11-07 15:08:21 +01:00
Dapeng Mi 8807d92270 perf/x86/intel/ds: Factor out PEBS record processing code to functions
Besides some PEBS record layout differences, arch-PEBS can share most of
the PEBS record processing code with adaptive PEBS. Thus, factor out this
common processing code into independent inline functions, so it can be
reused by the subsequent arch-PEBS handler.

Suggested-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-7-dapeng1.mi@linux.intel.com
2025-11-07 15:08:21 +01:00
Dapeng Mi d243d0bb64 perf/x86/intel: Initialize architectural PEBS
arch-PEBS leverages the CPUID.23H.4/5 sub-leaves to enumerate the arch-PEBS
supported capabilities and counters bitmap. This patch parses these 2
sub-leaves and initializes the arch-PEBS capabilities and corresponding
structures.

Since the IA32_PEBS_ENABLE and MSR_PEBS_DATA_CFG MSRs no longer exist
for arch-PEBS, arch-PEBS doesn't need to manipulate these MSRs. Thus add
a simple pair of __intel_pmu_pebs_enable/disable() callbacks for
arch-PEBS.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-6-dapeng1.mi@linux.intel.com
2025-11-07 15:08:20 +01:00
Dapeng Mi 5e4e355ae7 perf/x86/intel: Correct large PEBS flag check
The current large PEBS flag check only checks whether sample_regs_user
contains unsupported GPRs but doesn't check whether sample_regs_intr does.

Currently PEBS HW supports sampling all perf-supported GPRs, so the missing
check doesn't cause a real issue. But that won't be true anymore once the
subsequent patches add support for sampling the SSP register: SSP sampling
is not supported by adaptive PEBS HW and will only be supported starting
with arch-PEBS HW. So correct this issue.

Fixes: a47ba4d77e ("perf/x86: Enable free running PEBS for REGS_USER/INTR")
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-5-dapeng1.mi@linux.intel.com
2025-11-07 15:08:20 +01:00
Dapeng Mi ee98b8bfc7 perf/x86/intel: Replace x86_pmu.drain_pebs calling with static call
Use the x86_pmu_drain_pebs static call instead of calling the
x86_pmu.drain_pebs function pointer directly.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-4-dapeng1.mi@linux.intel.com
2025-11-07 15:08:20 +01:00
Dapeng Mi 7e772a93eb perf/x86: Fix NULL event access and potential PEBS record loss
When intel_pmu_drain_pebs_icl() is called to drain PEBS records, the
perf_event_overflow() could be called to process the last PEBS record.

perf_event_overflow() can trigger the interrupt throttle and stop all
events of the group, as the call chain below shows:

perf_event_overflow()
  -> __perf_event_overflow()
    ->__perf_event_account_interrupt()
      -> perf_event_throttle_group()
        -> perf_event_throttle()
          -> event->pmu->stop()
            -> x86_pmu_stop()

The side effect of stopping the events is that all corresponding event
pointers in cpuc->events[] array are cleared to NULL.

Assume there are two PEBS events (event a and event b) in a group. When
intel_pmu_drain_pebs_icl() calls perf_event_overflow() to process the
last PEBS record of PEBS event a, interrupt throttle is triggered and
all pointers of event a and event b are cleared to NULL. Then
intel_pmu_drain_pebs_icl() tries to process the last PEBS record of
event b and encounters NULL pointer access.

To avoid this issue, move cpuc->events[] clearing from x86_pmu_stop()
to x86_pmu_del(). It's safe since cpuc->active_mask or
cpuc->pebs_enabled is always checked before accessing the event pointer
from cpuc->events[].

Closes: https://lore.kernel.org/oe-lkp/202507042103.a15d2923-lkp@intel.com
Fixes: 9734e25fbf ("perf: Fix the throttle logic for a group")
Reported-by: kernel test robot <oliver.sang@intel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-3-dapeng1.mi@linux.intel.com
2025-11-07 15:08:19 +01:00
Dapeng Mi c7f69dc073 perf/x86: Remove redundant is_x86_event() prototype
Two is_x86_event() prototypes are defined in perf_event.h. Remove the
redundant one.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251029102136.61364-2-dapeng1.mi@linux.intel.com
2025-11-07 15:08:19 +01:00
Marco Crivellari 24e3848a2e RAS/CEC: Replace use of system_wq with system_percpu_wq
Switch to using system_percpu_wq because system_wq is going away as part of
a workqueue restructuring.

Currently if a user enqueues a work item using schedule_delayed_work() the
used workqueue is "system_wq" (per-cpu workqueue) while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same applies to
schedule_work() that is using system_wq and queue_work(), that makes use of
WORK_CPU_UNBOUND again.

This lack of consistency cannot be addressed without refactoring the API.
For more details see those commits and the Link tag below.

  128ea9f6cc ("workqueue: Add system_percpu_wq and system_dfl_wq")
  930c2ea566 ("workqueue: Add new WQ_PERCPU flag")

  [ bp: Massage commit message. ]

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de
2025-11-07 13:48:28 +01:00
Avadhut Naik 8616025ae6 EDAC: Remove the legacy EDAC sysfs interface
Commit

  1997471069 ("edac: add a new per-dimm API and make the old per-virtual-rank API obsolete")

introduced a new per-DIMM sysfs interface for EDAC making the old
per-virtual-rank sysfs interface obsolete.

Since this new sysfs interface was introduced more than a decade ago, remove
the obsolete legacy interface.

Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251106015727.1987246-1-avadhut.naik@amd.com
2025-11-06 13:21:29 +01:00
Avadhut Naik 6a85796915 EDAC/amd64: Remove NUM_CONTROLLERS macro
Currently, the NUM_CONTROLLERS macro is used to limit the number of memory
controllers (UMCs) available per node. The number of UMCs available per node,
however, is already cached by the max_mcs variable of struct amd64_pvt.

Allocate the relevant data structures dynamically using the variable instead
of static allocation through the macro.

The max_mcs variable is used for legacy systems too. These systems have a max
of 2 controllers. Since the default value of max_mcs, set in per_family_init(),
is 2, these legacy systems are also covered.

Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251106015727.1987246-1-avadhut.naik@amd.com
2025-11-06 12:51:33 +01:00
Avadhut Naik e9abd990ae EDAC/amd64: Generate ctl_name string at runtime
Currently, the ctl_name string is statically assigned based on the family and
model of the SOC when the amd64_edac module is loaded.

This, however, is not really needed since the string can be generated and
assigned at runtime through scnprintf().

Remove all static assignments and generate the string at runtime. Also,
clean up the switch cases which became defunct and consolidate identical cases.

Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251106015727.1987246-1-avadhut.naik@amd.com
2025-11-06 12:35:59 +01:00
Christian Brauner c8e00cdc74
Merge patch series "credential guards: credential preparation"
Christian Brauner <brauner@kernel.org> says:

This converts most users combining

* prepare_creds()
* modify new creds
* override_creds()
* revert_creds()
* put_cred()

to rely on credentials guards.
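
A rough sketch of the conversion pattern (the concrete guard names are taken
from the patch titles below; treat this as illustrative, not the exact API):

  /* before: manual pairing */
  new = prepare_creds();
  if (!new)
          return -ENOMEM;
  new->fsuid = GLOBAL_ROOT_UID;           /* modify the new creds */
  old = override_creds(new);
  /* ... privileged work ... */
  revert_creds(old);
  put_cred(new);

  /* after: scope-bound, reverted and put automatically on exit */
  CLASS(prepare_creds, new)();
  if (!new)
          return -ENOMEM;
  new->fsuid = GLOBAL_ROOT_UID;
  scoped_guard(override_creds, new) {
          /* ... privileged work ... */
  }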

* patches from https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-0-b447b82f2c9b@kernel.org:
  trace: use override credential guard
  trace: use prepare credential guard
  coredump: use override credential guard
  coredump: use prepare credential guard
  coredump: split out do_coredump() from vfs_coredump()
  coredump: mark struct mm_struct as const
  coredump: pass struct linux_binfmt as const
  coredump: move revert_cred() before coredump_cleanup()
  sev-dev: use override credential guards
  sev-dev: use prepare credential guard
  sev-dev: use guard for path
  cred: add prepare credential guard

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-0-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:52 +01:00
Christian Brauner 06765b6efc
trace: use override credential guard
Use override credential guards for scoped credential override with
automatic restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-12-b447b82f2c9b@kernel.org
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:52 +01:00
Christian Brauner 2ed6a34de9
trace: use prepare credential guard
Use the prepare credential guard for allocating a new set of
credentials.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-11-b447b82f2c9b@kernel.org
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:52 +01:00
Christian Brauner 545985dd37
coredump: use override credential guard
Use override credential guards for scoped credential override with
automatic restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-10-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:52 +01:00
Christian Brauner 8ed3473c5a
coredump: use prepare credential guard
Use the prepare credential guard for allocating a new set of
credentials.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-9-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:52 +01:00
Christian Brauner af9803d4b8
coredump: split out do_coredump() from vfs_coredump()
Make the function easier to follow and prepare for some of the following
changes.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-8-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:52 +01:00
Christian Brauner 313a335057
coredump: mark struct mm_struct as const
We don't actually modify it.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-7-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:51 +01:00
Christian Brauner 1ec760fb42
coredump: pass struct linux_binfmt as const
We don't actually modify it.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-6-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:51 +01:00
Christian Brauner eb937201ba
coredump: move revert_cred() before coredump_cleanup()
There's no need to pin the credentials across the coredump_cleanup()
call. Nothing in there depends on elevated credentials.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-5-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:51 +01:00
Christian Brauner b7b4f7554b
sev-dev: use override credential guards
Use override credential guards for scoped credential override with
automatic restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-4-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 23:11:42 +01:00
Christian Brauner 73fd0dba0b
Merge patch series "fs: introduce super write guard"
Christian Brauner <brauner@kernel.org> says:

I'm in the process of adding a few more guards for vfs constructs.
I've chosen the easy case of super_start_write() and super_end_write()
and converted eligible callers. I think long-term we can move a lot of
the manual placement to completely rely on guards - where sensible.
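
A minimal sketch of what such a guard looks like when built on the kernel's
cleanup.h machinery (the actual definition in the patch may differ):

  DEFINE_GUARD(super_write, struct super_block *,
               super_start_write(_T), super_end_write(_T))

  int example(struct super_block *sb)
  {
          guard(super_write)(sb);
          /* the write section is ended automatically when the scope is left */
          return 0;
  }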

* patches from https://patch.msgid.link/20251104-work-guards-v1-0-5108ac78a171@kernel.org:
  xfs: use super write guard in xfs_file_ioctl()
  open: use super write guard in do_ftruncate()
  btrfs: use super write guard in relocating_repair_kthread()
  ext4: use super write guard in write_mmp_block()
  btrfs: use super write guard in sb_start_write()
  btrfs: use super write guard btrfs_run_defrag_inode()
  btrfs: use super write guard in btrfs_reclaim_bgs_work()
  fs: add super_write_guard

Link: https://patch.msgid.link/20251104-work-guards-v1-0-5108ac78a171@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:59:31 +01:00
Christian Brauner ab5f296076
xfs: use super write guard in xfs_file_ioctl()
Link: https://patch.msgid.link/20251104-work-guards-v1-8-5108ac78a171@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:59:27 +01:00
Christian Brauner 97f9d2d282
open: use super write guard in do_ftruncate()
Link: https://patch.msgid.link/20251104-work-guards-v1-7-5108ac78a171@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:52:15 +01:00
Christian Brauner b7b8aca68e
btrfs: use super write guard in relocating_repair_kthread()
Link: https://patch.msgid.link/20251104-work-guards-v1-6-5108ac78a171@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:52:15 +01:00
Christian Brauner 2774bac21f
ext4: use super write guard in write_mmp_block()
Link: https://patch.msgid.link/20251104-work-guards-v1-5-5108ac78a171@kernel.org
Acked-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:52:15 +01:00
Christian Brauner 6e5b78cb17
btrfs: use super write guard in sb_start_write()
Link: https://patch.msgid.link/20251104-work-guards-v1-4-5108ac78a171@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:52:15 +01:00
Christian Brauner e79a4512cc
btrfs: use super write guard btrfs_run_defrag_inode()
Link: https://patch.msgid.link/20251104-work-guards-v1-3-5108ac78a171@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:52:15 +01:00
Christian Brauner a5e3d0be9e
btrfs: use super write guard in btrfs_reclaim_bgs_work()
Link: https://patch.msgid.link/20251104-work-guards-v1-2-5108ac78a171@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:52:15 +01:00
Christian Brauner 8e4d576ed3
fs: add super_write_guard
Link: https://patch.msgid.link/20251104-work-guards-v1-1-5108ac78a171@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:52:15 +01:00
Mateusz Guzik 5b8ed52866
fs: inline current_umask() and move it to fs_struct.h
There is no good reason to have this as a func call, other than avoiding
the churn of adding fs_struct.h as needed.
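
For reference, the helper is small enough that inlining it is the obvious
choice; it boils down to (sketch of the existing behaviour):

  static inline umode_t current_umask(void)
  {
          return current->fs->umask;
  }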

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251104170448.630414-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:51:23 +01:00
Christian Brauner 723cd9872d
Merge patch series "fs: start to split up fs.h"
Christian Brauner <brauner@kernel.org> says:

Take first steps to split up fs.h. Add fs/super_types.h and fs/super.h
headers that contain the types and functions associated with super
blocks respectively.

* patches from https://patch.msgid.link/20251104-work-fs-header-v1-0-fb39a2efe39e@kernel.org:
  fs: add fs/super.h header
  fs: add fs/super_types.h header
  fs: rename fs_types.h to fs_dirent.h

Link: https://patch.msgid.link/20251104-work-fs-header-v1-0-fb39a2efe39e@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:51:23 +01:00
Christian Brauner f7b3d14165
fs: add fs/super.h header
Split out super block associated functions into a separate header.

Link: https://patch.msgid.link/20251104-work-fs-header-v1-3-fb39a2efe39e@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:51:21 +01:00
Christian Brauner e0b62a4dee
fs: add fs/super_types.h header
Split out super block associated structures into a separate header.

Link: https://patch.msgid.link/20251104-work-fs-header-v1-2-fb39a2efe39e@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 22:47:52 +01:00
Yazen Ghannam 56f17be67a x86/mce/amd: Define threshold restart function for banks
Prepare for CMCI storm support by moving the common bank/block iterator code
to a helper function.

Include a parameter to switch the interrupt enable. This will be used by the
CMCI storm handling function.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
2025-11-05 22:38:31 +01:00
Yazen Ghannam 3206b41604 x86/mce/amd: Remove redundant reset_block()
Many of the checks in reset_block() are done again in the block reset
function. So drop the redundant checks.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
2025-11-05 22:34:53 +01:00
Yazen Ghannam 4efaec6e16 x86/mce/amd: Support SMCA Corrected Error Interrupt
AMD systems optionally support MCA thresholding which provides the ability for
hardware to send an interrupt when a set error threshold is reached. This
feature counts errors of all severities, but it is commonly used to report
correctable errors with an interrupt rather than polling.

Scalable MCA systems allow the platform to take control of this feature. In
this case, the OS will not see the feature configuration and control bits in
the MCA_MISC* registers. The OS will not receive the MCA thresholding
interrupt, and it will need to poll for correctable errors.

A "corrected error interrupt" will be available on Scalable MCA systems. This
will be used in the same configuration where the platform controls MCA
thresholding. However, the platform will now be able to send the MCA
thresholding interrupt to the OS.

Check for, and enable, this feature during per-CPU SMCA init.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
2025-11-05 22:10:23 +01:00
Yazen Ghannam 134b1eabe6 x86/mce/amd: Enable interrupt vectors once per-CPU on SMCA systems
Scalable MCA systems have a per-CPU register that gives the APIC LVT offset
for the thresholding and deferred error interrupts.

Currently, this register is read once to set up the deferred error interrupt
and then read again for each thresholding block. Furthermore, the APIC LVT
registers are configured each time, but they only need to be configured once
per-CPU.

Move the APIC LVT setup to the early part of CPU init, so that the registers
are set up once. Also, this ensures that the kernel is ready to service the
interrupts before the individual error sources (each MCA bank) are enabled.

Apply this change only to SMCA systems to avoid breaking any legacy behavior.
The deferred error interrupt is technically advertised by the SUCCOR feature.
However, this was first made available on SMCA systems.  Therefore, only set
up the deferred error interrupt on SMCA systems and simplify the code.

Guidance from hardware designers is that the LVT offsets provided from the
platform should be used. The kernel should not try to enforce specific values.
However, the kernel should check that an LVT offset is not reused for multiple
sources.

Therefore, remove the extra checking and value enforcement from the MCE code.
The "reuse/conflict" case is already handled in setup_APIC_eilvt().

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
2025-11-05 16:51:27 +01:00
Yazen Ghannam 7cb735d7c0 x86/mce: Unify AMD DFR handler with MCA Polling
AMD systems optionally support a deferred error interrupt. The interrupt
should be used as another signal to trigger MCA polling. This is similar to
how other MCA interrupts are handled.

Deferred errors do not require any special handling related to the interrupt,
e.g. resetting or rearming the interrupt, etc.

However, Scalable MCA systems include a pair of registers, MCA_DESTAT and
MCA_DEADDR, that should be checked for valid errors. This check should be done
whenever MCA registers are polled. Currently, the deferred error interrupt
does this check, but the MCA polling function does not.

Call the MCA polling function when handling the deferred error interrupt. This
keeps all "polling" cases in a common function.

Add an SMCA status check helper. This will do the same status check and
register clearing that the interrupt handler has done. And it extends the
common polling flow to find AMD deferred errors.

Clear the MCA_DESTAT register at the end of the handler rather than the
beginning. This maintains the procedure that the 'status' register must be
cleared as the final step.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
2025-11-05 16:41:32 +01:00
Peter Zijlstra cf76553aaa entry,unwind/deferred: Fix unwind_reset_info() placement
Stephen reported that on KASAN builds he's seeing:

vmlinux.o: warning: objtool: user_exc_vmm_communication+0x15a: call to __kasan_check_read() leaves .noinstr.text section
vmlinux.o: warning: objtool: exc_debug_user+0x182: call to __kasan_check_read() leaves .noinstr.text section
vmlinux.o: warning: objtool: exc_int3+0x123: call to __kasan_check_read() leaves .noinstr.text section
vmlinux.o: warning: objtool: noist_exc_machine_check+0x17a: call to __kasan_check_read() leaves .noinstr.text section
vmlinux.o: warning: objtool: fred_exc_machine_check+0x17e: call to __kasan_check_read() leaves .noinstr.text section

This turns out to be atomic ops from unwind_reset_info() that have
explicit instrumentation. Place unwind_reset_info() in the preceding
instrumentation_begin() section.
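
In other words (simplified sketch of the placement, not the exact entry code):

  instrumentation_begin();
  /* ... other instrumented exit work ... */
  unwind_reset_info();            /* its instrumented atomics are now legal here */
  instrumentation_end();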

Fixes: c6439bfaab ("Merge tag 'trace-deferred-unwind-v6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251105100014.GY4068168@noisy.programming.kicks-ass.net
2025-11-05 13:57:32 +01:00
Christian Brauner 0d534518ce
Merge patch series "Fix two syzbot corruption bugs in minix filesystem"
Jori Koolstra <jkoolstra@xs4all.nl> says:

Syzbot fuzzes fs/ by trying to mount and manipulate deliberately
corrupted filesystems. This should not lead to BUG_ONs and WARN_ONs for
easy-to-detect corruptions. This series adds code to be able to report
such corruptions and fixes two syzbot bugs of this kind.

* patches from https://patch.msgid.link/20251104143005.3283980-1-jkoolstra@xs4all.nl:
  Fix a drop_nlink warning in minix_rename
  Fix a drop_nlink warning in minix_rmdir
  Add error handling to minix filesystem for inode corruption detection

Link: https://patch.msgid.link/20251104143005.3283980-1-jkoolstra@xs4all.nl
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 13:45:26 +01:00
Jori Koolstra 009a2ba403
Fix a drop_nlink warning in minix_rename
Syzbot found a drop_nlink warning that is triggered by an easy to
detect nlink corruption. This patch adds sanity checks to minix_unlink
and minix_rename to prevent the warning and instead return EFSCORRUPTED
to the caller.
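
The added checks have roughly this shape (sketch only; minix_error_inode() is
a stand-in for the reporting helper introduced by the first patch):

  if (!inode->i_nlink) {
          minix_error_inode(inode, "unexpected zero nlink");
          return -EFSCORRUPTED;
  }
  drop_nlink(inode);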

The changes were tested using the syzbot reproducer as well as local
testing.

Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
Link: https://patch.msgid.link/20251104143005.3283980-4-jkoolstra@xs4all.nl
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: syzbot+a65e824272c5f741247d@syzkaller.appspotmail.com
Closes: https://syzbot.org/bug?extid=a65e824272c5f741247d
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 13:45:21 +01:00
Jori Koolstra d3e0e8661c
Fix a drop_nlink warning in minix_rmdir
Syzbot found a drop_nlink warning that is triggered by an easy to
detect nlink corruption of a directory. This patch adds a sanity check
to minix_rmdir to prevent the warning and instead return EFSCORRUPTED to
the caller.

The changes were tested using the syzbot reproducer as well as local
testing.

Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
Link: https://patch.msgid.link/20251104143005.3283980-3-jkoolstra@xs4all.nl
Reviewed-by: Jan Kara <jack@suse.cz>
Reported-by: syzbot+4e49728ec1cbaf3b91d2@syzkaller.appspotmail.com
Closes: https://syzbot.org/bug?extid=4e49728ec1cbaf3b91d2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 13:45:21 +01:00
Jori Koolstra 21215ce7a9
Add error handling to minix filesystem for inode corruption detection
We would like to provide early and specific warnings of filesystem
corruption without running into generic WARN_ONs and BUG_ONs.
Towards this goal, ext4, e.g., has an EFSCORRUPTED errno and a
standardized inode corruption message format. This patch adds this
errno and message format to the minix filesystem.

Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
Link: https://patch.msgid.link/20251104143005.3283980-2-jkoolstra@xs4all.nl
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 13:45:21 +01:00
Yazen Ghannam 34da4a5d68 x86/mce: Unify AMD THR handler with MCA Polling
AMD systems optionally support an MCA thresholding interrupt. The interrupt
should be used as another signal to trigger MCA polling. This is similar to
how the Intel Corrected Machine Check interrupt (CMCI) is handled.

AMD MCA thresholding is managed using the MCA_MISC registers within an MCA
bank. The OS will need to modify the hardware error count field in order to
reset the threshold limit and rearm the interrupt. Management of the MCA_MISC
register should be done as a follow up to the basic MCA polling flow. It
should not be the main focus of the interrupt handler.

Furthermore, future systems will have the ability to send an MCA thresholding
interrupt to the OS even when the OS does not manage the feature, i.e.
MCA_MISC registers are Read-as-Zero/Locked.

Call the common MCA polling function when handling the MCA thresholding
interrupt. This will allow the OS to find any valid errors whether or not the
MCA thresholding feature is OS-managed. Also, this allows the common MCA
polling options and kernel parameters to apply to AMD systems.

Add a callback to the MCA polling function to check and reset any threshold
blocks that have reached their threshold limit.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
2025-11-05 13:41:18 +01:00
Christian Brauner ca3557a686
Merge patch series "alloc misaligned vectors for zoned XFS v2"
Christoph Hellwig <hch@lst.de> says:

This series enables the new block layer support for misaligned
individual vectors for zoned XFS.

The first patch is from Qu and was supposedly already applied to
the vfs iomap 6.19 branch, but I can't find it there. The next
two are small fixups for it, and the last one makes use of this
new functionality in XFS.

* patches from https://patch.msgid.link/20251031131045.1613229-1-hch@lst.de:
  xfs: support sub-block aligned vectors in always COW mode
  iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag

Link: https://patch.msgid.link/20251031131045.1613229-1-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 13:09:32 +01:00
Christoph Hellwig 8caec6c9fe
xfs: support sub-block aligned vectors in always COW mode
Now that the block layer and iomap have grown support to indicate
the bio sector size explicitly instead of assuming the device sector
size, we can ask for logical block size alignment and thus support
direct I/O writes where the overall size is logical block size
aligned, but the boundaries between vectors might not be.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251031131045.1613229-3-hch@lst.de
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 13:09:27 +01:00
Qu Wenruo 001397f5ef
iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag
Btrfs requires all of its bios to be fs block aligned. Normally that's
totally fine, but with the incoming block size larger than page size
(bs > ps) support the requirement is no longer met for direct IOs,
because iomap_dio_bio_iter() calls bio_iov_iter_get_pages(), which only
requires alignment to bdev_logical_block_size().

In the real world that value is either 512 or 4K. On 4K page sized
systems this means bio_iov_iter_get_pages() can break the bio at any page
boundary, breaking btrfs' requirement for bs > ps cases.

To address this problem, introduce a new public iomap dio flag,
IOMAP_DIO_FSBLOCK_ALIGNED.

When __iomap_dio_rw() is called with that new flag, iomap_dio::flags will
inherit it, and iomap_dio_bio_iter() will take the fs block size into
account when calculating the alignment and pass that alignment to
bio_iov_iter_get_pages(), respecting the fs block size requirement.

The initial user of this flag will be btrfs, which needs to calculate the
checksum for direct read and thus requires the biovec to be fs block
aligned for the incoming bs > ps support.
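
Usage from a filesystem then just means passing the flag on its direct I/O
path, roughly (the ops structures below are placeholders):

  dio = __iomap_dio_rw(iocb, iter, &myfs_iomap_ops, &myfs_dio_ops,
                       IOMAP_DIO_FSBLOCK_ALIGNED, NULL, 0);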

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
[hch: also align pos/len, incorporate the trace flags from Darrick]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251031131045.1613229-2-hch@lst.de
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 13:09:27 +01:00
Christian Brauner 560507cbc1
Merge patch series "iomap: zero range folio batch support"
Brian Foster <bfoster@redhat.com> says:

This adds folio batch support for iomap. This initially only targets
zero range, the use case being zeroing of dirty folios over unwritten
mappings. There is potential to support other operations in the future:
iomap seek data/hole has similar raciness issues as zero range, the
prospect of using this for buffered write has been raised for granular
locking purposes, etc.

The one major caveat with this zero range implementation is that it
doesn't look at iomap_folio_state to determine whether to zero a
sub-folio portion of the folio. Instead it just relies on whether the
folio was dirty or not. This means that spurious zeroing of unwritten
ranges is possible if a folio is dirty but the target range includes a
subrange that is not.

The reasoning is that this is essentially a complexity tradeoff. The
current use cases for iomap_zero_range() are limited mostly to partial
block zeroing scenarios. It's relatively harmless to zero an unwritten
block (i.e. not a correctness issue), and this is something that
filesystems have done in the past without much notice or issue. The
advantage is less code and this makes it a little easier to use a
filemap lookup function for the batch rather than open coding more logic
in iomap. That said, this can probably be enhanced to look at ifs in the
future if the use case expands and/or other operations justify it.

WRT testing, I've tested with and without a local hack to redirect
fallocate zero range calls to iomap_zero_range() in XFS. This helps test
beyond the partial block/folio use case, i.e. to cover boundary
conditions like full folio batch handling, etc. I recently added patch 7
in spirit of that, which turns this logic into an XFS errortag. Further
comments on that are inline with patch 7.

* patches from https://lore.kernel.org/20251003134642.604736-1-bfoster@redhat.com:
  xfs: error tag to force zeroing on debug kernels
  iomap: remove old partial eof zeroing optimization
  xfs: fill dirty folios on zero range of unwritten mappings
  xfs: always trim mapping to requested range for zero range
  iomap: optional zero range dirty folio processing
  iomap: remove pos+len BUG_ON() to after folio lookup
  filemap: add helper to look up dirty folios in a range

Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:25 +01:00
Brian Foster 66d78a1147
xfs: error tag to force zeroing on debug kernels
iomap_zero_range() has to cover various corner cases that are
difficult to test on production kernels because it is used in fairly
limited use cases. For example, it is currently only used by XFS and
mostly only in partial block zeroing cases.

While it's possible to test most of these functional cases, we can
provide more robust test coverage by co-opting fallocate zero range
to invoke zeroing of the entire range instead of the more efficient
block punch/allocate sequence. Add an errortag to occasionally
invoke forced zeroing.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:25 +01:00
Brian Foster 39be21386d
iomap: remove old partial eof zeroing optimization
iomap_zero_range() optimizes the partial eof block zeroing use case
by force zeroing if the mapping is dirty. This is to avoid frequent
flushing on file extending workloads, which hurts performance.

Now that the folio batch mechanism provides a more generic solution
and is used by the only real zero range user (XFS), this isolated
optimization is no longer needed. Remove the unnecessary code and
let callers use the folio batch or fall back to flushing by default.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:25 +01:00
Brian Foster 77c475692c
xfs: fill dirty folios on zero range of unwritten mappings
Use the iomap folio batch mechanism to select folios to zero on zero
range of unwritten mappings. Trim the resulting mapping if the batch
is filled (unlikely for current use cases) to distinguish between a
range to skip and one that requires another iteration due to a full
batch.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:25 +01:00
Brian Foster 5c13dde963
xfs: always trim mapping to requested range for zero range
Refactor and tweak the IOMAP_ZERO logic in preparation to support
filling the folio batch for unwritten mappings. Drop the superfluous
imap offset check since the hole case has already been filtered out.
Split the delalloc case handling into a sub-branch, and always trim
the imap to the requested offset/count so it can be more easily used
to bound the range to look up in pagecache.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:25 +01:00
Christian Brauner 4966b46652
Merge patch series "fuse: use iomap for buffered reads + readahead"
Joanne Koong <joannelkoong@gmail.com> says:

This series adds fuse iomap support for buffered reads and readahead.
This is needed so that granular uptodate tracking can be used in fuse when
large folios are enabled, so that only the non-uptodate portions of a folio
need to be read in instead of the entire folio. It is also needed in order
to turn on large folios for servers that use the writeback cache. Without
it there is a race condition that may lead to data corruption when a
partial write is followed by a read that happens before the write has
undergone writeback: the folio will not be marked uptodate from the partial
write, so the read will read in the entire folio from disk, overwriting the
partial write.

This is on top of two locally-patched iomap patches [1] [2] patched on top of
commit f1c864be6e88 ("Merge branch 'vfs-6.18.async' into vfs.all") in
Christian's vfs.all tree.

This series was run through fstests on fuse passthrough_hp with an
out-of-kernel patch enabling fuse large folios.

This patchset does not enable large folios on fuse yet. That will be part
of a different patchset.

* patches from https://lore.kernel.org/20250926002609.1302233-1-joannelkoong@gmail.com:
  fuse: remove fc->blkbits workaround for partial writes
  fuse: use iomap for readahead
  fuse: use iomap for read_folio
  iomap: make iomap_read_folio() a void return
  iomap: move buffered io bio logic into new file
  iomap: add caller-provided callbacks for read and readahead
  iomap: set accurate iter->pos when reading folio ranges
  iomap: track pending read bytes more optimally
  iomap: rename iomap_readpage_ctx struct to iomap_read_folio_ctx
  iomap: rename iomap_readpage_iter() to iomap_read_folio_iter()
  iomap: iterate over folio mapping in iomap_readpage_iter()
  iomap: store read/readahead bio generically
  iomap: move read/readahead bio submission logic into helper function
  iomap: move bio read logic into helper function

Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:24 +01:00
Brian Foster 395ed1ef00
iomap: optional zero range dirty folio processing
The only way zero range can currently process unwritten mappings
with dirty pagecache is to check whether the range is dirty before
mapping lookup and then flush when at least one underlying mapping
is unwritten. This ordering is required to prevent iomap lookup from
racing with folio writeback and reclaim.

Since zero range can skip ranges of unwritten mappings that are
clean in cache, this operation can be improved by allowing the
filesystem to provide a set of dirty folios that require zeroing. In
turn, rather than flush or iterate file offsets, zero range can
iterate on folios in the batch and advance over clean or uncached
ranges in between.

Add a folio_batch in struct iomap and provide a helper for
filesystems to populate the batch at lookup time. Update the folio
lookup path to return the next folio in the batch, if provided, and
advance the iter if the folio starts beyond the current offset.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:24 +01:00
Joanne Koong 93570c652b
fuse: remove fc->blkbits workaround for partial writes
Now that fuse is integrated with iomap for read/readahead, we can remove
the workaround that was added in commit bd24d2108e ("fuse: fix fuseblk
i_blkbits for iomap partial writes"), which was needed to avoid a race
condition where an iomap partial write may be overwritten by a read if
blocksize < PAGE_SIZE. With iomap read/readahead this is protected against
since there is granular uptodate tracking of blocks, so the workaround can
be removed.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:24 +01:00
Brian Foster 49590716be
iomap: remove pos+len BUG_ON() to after folio lookup
The bug checks at the top of iomap_write_begin() assume the pos/len
reflect exactly the next range to process. This may no longer be the
case once the get folio path is able to process a folio batch from
the filesystem. On top of that, len is already trimmed to within the
iomap/srcmap by iomap_length(), so these checks aren't terribly
useful. Remove the unnecessary BUG_ON() checks.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:24 +01:00
Joanne Koong 4ea907108a
fuse: use iomap for readahead
Do readahead in fuse using iomap. This gives us granular uptodate
tracking for large folios, which optimizes how much data needs to be
read in. If some portions of the folio are already uptodate (eg through
a prior write), we only need to read in the non-uptodate portions.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:24 +01:00
Brian Foster f8d98072fe
filemap: add helper to look up dirty folios in a range
Add a new filemap_get_folios_dirty() helper to look up existing dirty
folios in a range and add them to a folio_batch. This is to support
optimization of certain iomap operations that only care about dirty
folios in a target range. For example, zero range only zeroes the subset
of dirty pages over unwritten mappings, seek hole/data may use similar
logic in the future, etc.

Note that the helper is intended for use under internal fs locks.
Therefore it trylocks folios in order to filter out clean folios.
This loosely follows the logic from filemap_range_has_writeback().
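
A rough usage sketch (the exact signature is assumed to mirror
filemap_get_folios()):

  struct folio_batch fbatch;
  pgoff_t start = pos >> PAGE_SHIFT;
  pgoff_t end = (pos + len - 1) >> PAGE_SHIFT;

  folio_batch_init(&fbatch);
  /* collect existing dirty folios in [start, end] under the fs's own locks */
  filemap_get_folios_dirty(inode->i_mapping, &start, end, &fbatch);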

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:24 +01:00
Joanne Koong 03e9618e82
fuse: use iomap for read_folio
Read folio data into the page cache using iomap. This gives us granular
uptodate tracking for large folios, which optimizes how much data needs
to be read in. If some portions of the folio are already uptodate (eg
through a prior write), we only need to read in the non-uptodate
portions.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:23 +01:00
Joanne Koong d4e88bb08e
iomap: make iomap_read_folio() a void return
No errors are propagated in iomap_read_folio(). Change
iomap_read_folio() to a void return to make this clearer to callers.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:23 +01:00
Christoph Hellwig [1] c2b1adc462
iomap: move buffered io bio logic into new file
Move bio logic in the buffered io code into its own file and remove
CONFIG_BLOCK gating for iomap read/readahead.

[1] https://lore.kernel.org/linux-fsdevel/aMK2GuumUf93ep99@infradead.org/

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:23 +01:00
Joanne Koong b2f35ac414
iomap: add caller-provided callbacks for read and readahead
Add caller-provided callbacks for read and readahead so that they can be
used generically, especially by filesystems that are not block-based.

In particular, this:
* Modifies the read and readahead interface to take in a
  struct iomap_read_folio_ctx that is publicly defined as:

  struct iomap_read_folio_ctx {
	const struct iomap_read_ops *ops;
	struct folio *cur_folio;
	struct readahead_control *rac;
	void *read_ctx;
  };

  where struct iomap_read_ops is defined as:

  struct iomap_read_ops {
      int (*read_folio_range)(const struct iomap_iter *iter,
                             struct iomap_read_folio_ctx *ctx,
                             size_t len);
      void (*read_submit)(struct iomap_read_folio_ctx *ctx);
  };

  read_folio_range() reads in the folio range and must be provided by
  the caller. read_submit() is optional and is used for submitting any
  pending read requests (see the sketch after this list).

* Modifies existing filesystems that use iomap for read and readahead to
  use the new API, through the new statically inlined helpers
  iomap_bio_read_folio() and iomap_bio_readahead(). There is no change
  in functionality for those filesystems.
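
As a sketch of how a non-block-based filesystem might wire this up (the
myfs_* names are placeholders, not part of the series):

  static int myfs_read_folio_range(const struct iomap_iter *iter,
                                   struct iomap_read_folio_ctx *ctx,
                                   size_t len)
  {
          /* read 'len' bytes at iter->pos into ctx->cur_folio via the fs transport */
          return 0;
  }

  static const struct iomap_read_ops myfs_read_ops = {
          .read_folio_range       = myfs_read_folio_range,
          /* .read_submit is optional */
  };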

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:23 +01:00
Joanne Koong fb7a10ac47
iomap: set accurate iter->pos when reading folio ranges
Advance iter to the correct position before calling an IO helper to read
in a folio range. This allows the helper to reliably use iter->pos to
determine the starting offset for reading.

This will simplify the interface for reading in folio ranges when iomap
read/readahead supports caller-provided callbacks.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:23 +01:00
Joanne Koong d43558ae67
iomap: track pending read bytes more optimally
Instead of incrementing read_bytes_pending for every folio range read in
(which requires acquiring the spinlock to do so), set read_bytes_pending
to the folio size when the first range is asynchronously read in, keep
track of how many bytes total are asynchronously read in, and adjust
read_bytes_pending accordingly after issuing requests to read in all the
necessary ranges.

iomap_read_folio_ctx->cur_folio_in_bio can be removed since a non-zero
value for pending bytes necessarily indicates the folio is in the bio.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Suggested-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:57:23 +01:00
Kaushlendra Kumar a6446829f8
init: Replace simple_strtoul() with kstrtouint() in root_delay_setup()
Replace deprecated simple_strtoul() with kstrtouint() for better error
handling and input validation. Return 0 on parsing failure to indicate
invalid parameter, maintaining existing behavior for valid inputs.

The simple_strtoul() function is deprecated in favor of the kstrtoint()
family of functions, which provide better error handling and are recommended
for new code and replacements.
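
For illustration, roughly the shape of such a conversion (the actual patch
may differ in details; root_delay is assumed to be an unsigned int):

  static int __init root_delay_setup(char *str)
  {
        /* return 0 on parse failure to flag the parameter as invalid */
        if (kstrtouint(str, 0, &root_delay))
                return 0;
        return 1;
  }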

Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Link: https://patch.msgid.link/20251103080627.1844645-1-kaushlendra.kumar@intel.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:49:38 +01:00
Christian Brauner a4db63b88f
Merge patch series "fs: fully sync all fsese even for an emergency sync"
Qu Wenruo <wqu@suse.com> says:

The first patch is a cleanup related to sync_inodes_one_sb() callback.
Since it always wait for the writeback, there is no need to pass any
parameter for it.

The second patch is a fix mostly affecting btrfs, as btrfs requires an
explicit sync_fs() call with wait == 1 to commit its super blocks, and
sync_bdevs() won't cut it at all.

However the current emergency sync never passes wait == 1, which means
btrfs writes back all dirty data and metadata but never updates its super
block, so everything still points back to the old data/metadata.

This leads to a problem where btrfs doesn't seem to do anything during an
emergency sync.

The second patch fixes the problem by passing wait == 1 for the second
iteration of sync_fs_one_sb().

* patches from https://patch.msgid.link/cover.1762142636.git.wqu@suse.com:
  fs: fully sync all fses even for an emergency sync
  fs: do not pass a parameter for sync_inodes_one_sb()

Link: https://patch.msgid.link/cover.1762142636.git.wqu@suse.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:30:05 +01:00
Qu Wenruo 2706659d64
fs: fully sync all fses even for an emergency sync
[BUG]
There is a bug report that during an emergency sync, btrfs only writes back
all the dirty data and metadata without a full transaction commit, so the
super block still points to the old trees and the end user can only see the
old data, not the newer one.

[CAUSE]
Initially this looks like a btrfs-specific bug, since ext4 is not affected
by it.

But the root problem here is a combination of btrfs features and the no-wait
nature of emergency sync.

First, do_sync_work() calls sync_inodes_one_sb() for every fs to write back
all the dirty pages of that fs.

Btrfs properly writes back all dirty pages, including both data and the
updated metadata. So far so good.

Then sync_fs_one_sb() is called with @nowait, which in the case of btrfs
means no full transaction commit, thus no super block update.

At this stage, btrfs is only one super block update away from being fully
committed. I believe it's more or less the same for other fses too.

The problem is the next step, sync_bdevs().
Normally other fses have their super block already updated in the page
cache of the block device, but btrfs only updates the super block during
full transaction commit.

So sync_bdevs() may work for other fses, but not for btrfs: btrfs is still
using its older super block, which points back to the old metadata and
data.

Thus if a power loss happens after an emergency sync, the end user will
only see the old data, not the newer data, even though everything but the
super block has already been written back.

[FIX]
Since the emergency sync is already executing in a workqueue, I didn't see
much need to only do a nowait sync, especially since sync_inodes_one_sb()
always waits for the writeback to finish.

Instead, for the second iteration of sync_fs_one_sb(), pass wait == 1 into
it, so fses like btrfs can properly commit their super blocks.
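
For orientation, an illustrative sketch of the resulting shape of the
emergency sync worker; the in-tree code may differ in details beyond the
wait == 1 change described above:

  static void sync_fs_one_sb(struct super_block *sb, void *arg)
  {
        if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
            sb->s_op->sync_fs)
                sb->s_op->sync_fs(sb, *(int *)arg);
  }

  static void do_sync_work(struct work_struct *work)
  {
        int nowait = 0, wait = 1;

        /* sync twice to reduce the chance of skipping temporarily locked inodes */
        iterate_supers(sync_inodes_one_sb, NULL);
        iterate_supers(sync_fs_one_sb, &nowait);
        sync_bdevs(false);
        iterate_supers(sync_inodes_one_sb, NULL);
        iterate_supers(sync_fs_one_sb, &wait); /* wait == 1: let btrfs commit its super blocks */
        sync_bdevs(false);
        /* original printk() and kfree(work) elided */
  }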

Reported-by: Askar Safin <safinaskar@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/20251101150429.321537-1-safinaskar@gmail.com/
Signed-off-by: Qu Wenruo <wqu@suse.com>
Link: https://patch.msgid.link/7b7fd40c5fe440b633b6c0c741d96ce93eb5a89a.1762142636.git.wqu@suse.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:29:59 +01:00
Qu Wenruo fbc22c2996
fs: do not pass a parameter for sync_inodes_one_sb()
The function sync_inodes_one_sb() always waits for the writeback and
ignores the optional parameter.

Explicitly pass NULL as parameter for the call sites inside
do_sync_work().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Link: https://patch.msgid.link/8079af1c4798cb36887022a8c51547a727c353cf.1762142636.git.wqu@suse.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 12:29:59 +01:00
Christian Brauner 0485a18d91
fs: rename fs_types.h to fs_dirent.h
We will split out a bunch of types into a separate header.
So free up the appropriate name for it.

Link: https://patch.msgid.link/20251104-work-fs-header-v1-1-fb39a2efe39e@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 09:51:30 +01:00
Christian Brauner 390d967653
pidfs: reduce wait_pidfd lock scope
There's no need to hold the lock after we realized that pid->attr is
set. We're holding a reference to struct pid so it won't go away and
pidfs_exit() is called once per struct pid.

Link: https://patch.msgid.link/20251105-work-pidfs-wait_pidfd-lock-v1-1-02638783be07@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-05 00:09:06 +01:00
Borislav Petkov (AMD) 47955b58cf x86/cpufeatures: Correct LKGS feature flag description
Quotation marks in cpufeatures.h comments are special and when the
comment begins with a quoted string, that string lands in /proc/cpuinfo,
turning it into a user-visible one.

The LKGS comment doesn't begin with a quoted string, but just in case drop
the quoted "kernel" in there to avoid confusion. And while at it, simply
change the description to what the LKGS instruction does, for more clarity.

No functional changes.
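
For reference, the comment convention in cpufeatures.h works roughly like
this (the bit positions below are placeholders, not real feature bits):

  #define X86_FEATURE_EXAMPLE_A (21*32 + 0) /* Plain comment: flag shown in /proc/cpuinfo under its macro-derived name */
  #define X86_FEATURE_EXAMPLE_B (21*32 + 1) /* "other_name" A leading quoted string overrides the displayed name */
  #define X86_FEATURE_EXAMPLE_C (21*32 + 2) /* "" Empty quotes hide the flag from /proc/cpuinfo entirely */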

Reviewed-by: Xin Li (Intel) <xin@zytor.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20251015103548.10194-1-bp@kernel.org
2025-11-04 23:09:34 +01:00
Christian Brauner a45ff1c7c9
Merge patch series "coredump: cleanups & pidfd extension"
Christian Brauner <brauner@kernel.org> says:

The recent changes to rework coredump handling to rely on unix sockets
are in the process of being used in systemd. Yu reported one shortcoming,
namely that the signal causing the coredump wasn't available before the
crashing process was reaped.

The previous systemd coredump container interface requires the coredump
file descriptor and basic information, including the signal number, to be
sent to the container. This means we need to have the signal number
available before sending the coredump to the container.

In general, the extension makes sense and fits with the rest of the
coredump information.

In addition to this extension this fixes a bunch of the tests that were
failing and reworks the publication mechanism for exit and coredump info
retrievable via the pidfd ioctl.

* patches from https://patch.msgid.link/20251028-work-coredump-signal-v1-0-ca449b7b7aa0@kernel.org: (22 commits)
  selftests/coredump: add second PIDFD_INFO_COREDUMP_SIGNAL test
  selftests/coredump: add first PIDFD_INFO_COREDUMP_SIGNAL test
  selftests/coredump: ignore ENOSPC errors
  selftests/coredump: add debug logging to coredump socket protocol tests
  selftests/coredump: add debug logging to coredump socket tests
  selftests/coredump: add debug logging to test helpers
  selftests/coredump: handle edge-triggered epoll correctly
  selftests/coredump: fix userspace coredump client detection
  selftests/coredump: fix userspace client detection
  selftests/coredump: split out coredump socket tests
  selftests/coredump: split out common helpers
  selftests/pidfd: add second supported_mask test
  selftests/pidfd: add first supported_mask test
  selftests/pidfd: update pidfd header
  pidfs: expose coredump signal
  pidfs: drop struct pidfs_exit_info
  pidfs: prepare to drop exit_info pointer
  pidfd: add a new supported_mask field
  pidfs: add missing BUILD_BUG_ON() assert on struct pidfd_info
  pidfs: add missing PIDFD_INFO_SIZE_VER1
  ...

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-0-ca449b7b7aa0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:05:03 +01:00
Christian Brauner cbb842548a
selftests/coredump: add second PIDFD_INFO_COREDUMP_SIGNAL test
Verify that when using simple socket-based coredump (@ pattern),
the coredump_signal field is correctly exposed as SIGABRT.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-22-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:05:01 +01:00
Christian Brauner 619e2227cc
selftests/coredump: add first PIDFD_INFO_COREDUMP_SIGNAL test
Verify that when using simple socket-based coredump (@ pattern),
the coredump_signal field is correctly exposed as SIGSEGV.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-21-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:59 +01:00
Christian Brauner 32ae33f796
selftests/coredump: ignore ENOSPC errors
If we crash multiple processes at the same time we may run out of space.
Just ignore those errors. They're not actually all that relevant for the
test.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-20-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:57 +01:00
Christian Brauner 408a0ed9ee
selftests/coredump: add debug logging to coredump socket protocol tests
So it's easier to figure out bugs.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-19-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:55 +01:00
Christian Brauner 2343cbee9f
selftests/coredump: add debug logging to coredump socket tests
So it's easier to figure out bugs.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-18-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:53 +01:00
Christian Brauner d5694db5dc
selftests/coredump: add debug logging to test helpers
so we can easily figure out why something failed.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-17-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:51 +01:00
Christian Brauner 305e6b167c
selftests/coredump: handle edge-triggered epoll correctly
by putting the file descriptor into non-blocking mode.
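
The general pattern (illustrative, not the selftest's code): with EPOLLET a
readiness event is only delivered on state transitions, so the descriptor
has to be non-blocking and drained until read() returns EAGAIN:

  #include <errno.h>
  #include <fcntl.h>
  #include <unistd.h>

  static int set_nonblock(int fd)
  {
        int flags = fcntl(fd, F_GETFL, 0);

        if (flags < 0)
                return -1;
        return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
  }

  static void drain(int fd, char *buf, size_t len)
  {
        for (;;) {
                ssize_t ret = read(fd, buf, len);

                if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
                        break;  /* nothing left, wait for the next edge-triggered event */
                if (ret <= 0)
                        break;  /* error or EOF */
        }
  }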

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-16-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:48 +01:00
Christian Brauner 8b64f54c81
selftests/coredump: fix userspace coredump client detection
PIDFD_INFO_COREDUMP is only retrievable until the task has exited. After it
has exited, task->mm is NULL. So if the task didn't actually coredump, we
can't retrieve its dumpability settings anymore. Only if the task did
coredump will we have stashed the coredump information in the respective
struct pid.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-15-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:46 +01:00
Christian Brauner 32ae9fa406
selftests/coredump: fix userspace client detection
We need to request PIDFD_INFO_COREDUMP in the first place.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-14-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:44 +01:00
Christian Brauner c09ea6659e
selftests/coredump: split out coredump socket tests
Split the coredump socket tests into separate files.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-13-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:42 +01:00
Christian Brauner c71147f42b
selftests/coredump: split out common helpers
into separate files.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-12-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:40 +01:00
Christian Brauner 2593deaac8
selftests/pidfd: add second supported_mask test
Verify that supported_mask is returned even when other fields are
requested.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-11-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:38 +01:00
Christian Brauner e12f734208
selftests/pidfd: add first supported_mask test
Verify that when PIDFD_INFO_SUPPORTED_MASK is requested, the kernel
returns the supported_mask field indicating which flags the kernel
supports.
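
A hedged sketch of such a query; PIDFD_INFO_SUPPORTED_MASK and the
supported_mask member come from this series, and the exact uapi header and
struct layout should be treated as assumptions:

  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/pidfd.h>

  static void print_supported_mask(int pidfd)
  {
        struct pidfd_info info = { .mask = PIDFD_INFO_SUPPORTED_MASK };

        if (ioctl(pidfd, PIDFD_GET_INFO, &info) == 0 &&
            (info.mask & PIDFD_INFO_SUPPORTED_MASK))
                printf("supported_mask: 0x%llx\n",
                       (unsigned long long)info.supported_mask);
  }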

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-10-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:36 +01:00
Christian Brauner a945535dfd
selftests/pidfd: update pidfd header
Include the new defines and members.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-9-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 22:04:32 +01:00
Christian Brauner 89c545e29e
sev-dev: use prepare credential guard
Use the prepare credential guard for allocating a new set of
credentials.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-3-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:37:01 +01:00
Christian Brauner 4c5941ca11
sev-dev: use guard for path
Just use a guard and also move the path_put() out of the credential
change's scope. There's no need to do this with the overridden
credentials.

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-2-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:37:00 +01:00
Christian Brauner c8ad3098e1
cred: add prepare credential guard
A lot of code uses the following pattern:

* prepare new credentials
* modify them for their use-case
* drop them

Make supporting that easier with the new guard infrastructure.
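
For context, the open-coded shape of that dance with the long-standing cred
API, which is what the new guard wraps; the helper and the fsuid tweak are
purely illustrative:

  static int with_root_fsuid(int (*do_something)(void))
  {
        const struct cred *old;
        struct cred *new;
        int ret;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        new->fsuid = GLOBAL_ROOT_UID;   /* example modification */

        old = override_creds(new);      /* temporarily switch to the new creds */
        ret = do_something();
        revert_creds(old);
        put_cred(new);                  /* drop the prepared credentials */
        return ret;
  }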

Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-1-b447b82f2c9b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:57 +01:00
Christian Brauner a85787996a
Merge patch series "credentials guards: the easy cases"
Christian Brauner <brauner@kernel.org> says:

This converts all users of override_creds() to rely on credentials
guards. Leave all those that do the prepare_creds() + modify creds +
override_creds() dance alone for now. Some of them qualify for their own
variant.
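
The conversion pattern in a nutshell; the scoped_with_creds() usage shown
here is an assumed form of the new guard, see the first patch of the series
for the actual macro:

  /* before: manual save/restore around the privileged section */
  const struct cred *old = override_creds(cred);

  do_work();
  revert_creds(old);

  /* after: the override is reverted automatically when the scope is left */
  scoped_with_creds(cred) {
        do_work();
  }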

* patches from https://patch.msgid.link/20251103-work-creds-guards-simple-v1-0-a3e156839e7f@kernel.org:
  net/dns_resolver: use credential guards in dns_query()
  cgroup: use credential guards in cgroup_attach_permissions()
  act: use credential guards in acct_write_process()
  smb: use credential guards in cifs_get_spnego_key()
  nfs: use credential guards in nfs_idmap_get_key()
  nfs: use credential guards in nfs_local_call_write()
  nfs: use credential guards in nfs_local_call_read()
  erofs: use credential guards
  binfmt_misc: use credential guards
  backing-file: use credential guards for mmap
  backing-file: use credential guards for splice write
  backing-file: use credential guards for splice read
  backing-file: use credential guards for writes
  backing-file: use credential guards for reads
  aio: use credential guards
  cred: add {scoped_}with_creds() guards

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-0-a3e156839e7f@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:53 +01:00
Christian Brauner 4037e28cd4
net/dns_resolver: use credential guards in dns_query()
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-16-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:51 +01:00
Christian Brauner b66c7af4d8
cgroup: use credential guards in cgroup_attach_permissions()
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-15-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:50 +01:00
Christian Brauner 5db84abd2a
act: use credential guards in acct_write_process()
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-14-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:49 +01:00
Christian Brauner c5c92c624a
smb: use credential guards in cifs_get_spnego_key()
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-13-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:48 +01:00
Christian Brauner f41799b2e1
nfs: use credential guards in nfs_idmap_get_key()
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-12-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:45 +01:00
Christian Brauner bff3c841f7
nfs: use credential guards in nfs_local_call_write()
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-11-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:43 +01:00
Christian Brauner 94afb627df
nfs: use credential guards in nfs_local_call_read()
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-10-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:42 +01:00
Christian Brauner 5e88d1aadc
erofs: use credential guards
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-9-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:40 +01:00
Christian Brauner ff2044cd27
binfmt_misc: use credential guards
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-8-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:39 +01:00
Christian Brauner 6e1d1c1fa7
backing-file: use credential guards for mmap
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-7-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:38 +01:00
Christian Brauner b688171f91
backing-file: use credential guards for splice write
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-6-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:37 +01:00
Christian Brauner c3076d146e
backing-file: use credential guards for splice read
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-5-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:36 +01:00
Christian Brauner f119feaa06
backing-file: use credential guards for writes
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-4-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:35 +01:00
Christian Brauner 4f0a482578
backing-file: use credential guards for reads
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-3-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:34 +01:00
Christian Brauner 84c1a329b4
aio: use credential guards
Use credential guards for scoped credential override with automatic
restoration on scope exit.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-2-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:33 +01:00
Christian Brauner 019e52e8d3
cred: add scoped_with_creds() guards
and implement scoped_with_kernel_creds() on top of it.

Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-1-a3e156839e7f@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:29 +01:00
Christian Brauner e0876bde29
Merge patch series "creds: add {scoped_}with_kernel_creds()"
Christian Brauner <brauner@kernel.org> says:

A few months ago I did work to make override_creds()/revert_creds()
completely reference count free - mostly for the sake of
overlayfs but it has been beneficial to everyone using this.

In a recent pull request from Jens that introduced another round of
override_creds()/revert_creds() for nbd, Linus asked whether we could
avoid the prepare_kernel_cred() calls that duplicate the kernel
credentials and then drop them again later.

Yes, we can actually. We can use the guard infrastructure to completely
avoid the allocation and then also to never expose the temporary
variable to hold the kernel credentials anywhere in the callers.

So add with_kernel_creds() and scoped_with_kernel_creds() for this
purpose. Also take the opportunity to fixup the scoped_class() macro I
introduced two cycles ago.
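
The duplication being avoided looks roughly like this today (illustrative;
do_privileged_work() is a placeholder):

  static int run_with_kernel_creds(void (*do_privileged_work)(void))
  {
        const struct cred *old;
        struct cred *kcred = prepare_kernel_cred(&init_task);

        if (!kcred)
                return -ENOMEM;
        old = override_creds(kcred);
        do_privileged_work();
        revert_creds(old);
        put_cred(kcred);        /* drop the duplicated kernel creds again */
        return 0;
  }

With the new scoped_with_kernel_creds() guard (usage form assumed), the
allocation and the temporary cred variable disappear entirely:

  scoped_with_kernel_creds() {
        do_privileged_work();
  }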

* patches from https://patch.msgid.link/20251103-work-creds-init_cred-v1-0-cb3ec8711a6a@kernel.org:
  unix: don't copy creds
  target: don't copy kernel creds
  nbd: don't copy kernel creds
  firmware: don't copy kernel creds
  cred: add {scoped_}with_kernel_creds
  cred: make init_cred static
  cred: add kernel_cred() helper
  cleanup: fix scoped_class()

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-0-cb3ec8711a6a@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:24 +01:00
Christian Brauner 1ad5b411af
unix: don't copy creds
No need to copy kernel credentials.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-8-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:21 +01:00
Christian Brauner 0f0e7cee34
target: don't copy kernel creds
Get rid of all the boilerplate and tightly scope when the task runs with
kernel creds.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-7-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:18 +01:00
Christian Brauner 4601b7923d
nbd: don't copy kernel creds
No need to copy kernel credentials.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-6-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:16 +01:00
Christian Brauner b9e3594e70
firmware: don't copy kernel creds
No need to copy kernel credentials.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-5-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:10 +01:00
Christian Brauner ae40e6c657
cred: add scoped_with_kernel_creds()
Add a new cleanup class for override creds. We can make use of this in a
bunch of places going forward.

Based on this, add scoped_with_kernel_creds(), which can be used to
temporarily assume kernel credentials for specific tasks such as firmware
loading or coredump socket connections. At no point will the caller interact
with the kernel credentials directly.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-4-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:07 +01:00
Christian Brauner 40314c2818
cred: make init_cred static
There's zero need to expose struct init_cred. The very few places that
need access can just go through init_task which is already exported.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-3-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:36:02 +01:00
Christian Brauner 4c7ceeb62d
cred: add kernel_cred() helper
Access kernel creds based off of init_task. This will let us avoid any
direct access to init_cred.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-2-cb3ec8711a6a@kernel.org
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:35:52 +01:00
Christian Brauner 4e97bae1b4
cleanup: fix scoped_class()
This is a class, not a guard so why on earth is it checking for guard
pointers or conditional lock acquisition? None of it makes any sense at
all.

I'm not sure what happened back then. Maybe I had a brief psychedelic
period that I completely forgot about and spaced out into a zone where
that initial macro implementation made any sense at all.

Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-1-cb3ec8711a6a@kernel.org
Fixes: 5c21c5f22d ("cleanup: add a scoped version of CLASS()")
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-04 12:35:49 +01:00
Peter Zijlstra 1fe4002cf7 x86/ptrace: Always inline trivial accessors
A KASAN build bloats these single load/store helpers such that the compiler
fails to inline them:

  vmlinux.o: error: objtool: irqentry_exit+0x5e8: call to instruction_pointer_set() with UACCESS enabled

Make sure the compiler isn't allowed to do anything stupid.
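
The kind of trivial accessor being forced inline (this mirrors the x86
ptrace.h helper named in the objtool warning, modulo details):

  static __always_inline void instruction_pointer_set(struct pt_regs *regs,
                                                      unsigned long val)
  {
        regs->ip = val;
  }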

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251031105435.GU4068168@noisy.programming.kicks-ass.net
2025-11-04 08:36:20 +01:00
Peter Zijlstra 323d93f043 cleanup: Always inline everything
KASAN bloat caused cleanup helper functions to not get inlined:

  vmlinux.o: error: objtool: irqentry_exit+0x323: call to class_user_rw_access_destructor() with UACCESS enabled

Force inline all the cleanup helpers like they already are on normal
builds.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251031105435.GU4068168@noisy.programming.kicks-ass.net
2025-11-04 08:35:58 +01:00
Thomas Gleixner 32034df66b rseq: Switch to TIF_RSEQ if supported
TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially
with the RSEQ fast path depending on it, but not really handling it.

Define a separate TIF_RSEQ in the generic TIF space and enable the full
separation of fast and slow path for architectures which utilize that.

That avoids the hassle with invocations of resume_user_mode_work() from
hypervisors, which clear TIF_NOTIFY_RESUME. It turns the re-evaluation that
is therefore required at the end of vcpu_run() into a NOOP on architectures
which utilize the generic TIF space and have a separate TIF_RSEQ.

The hypervisor TIF handling does not include the separate TIF_RSEQ as there
is no point in doing so. The guest neither knows nor cares about the VMM
host application's RSEQ state. That state is only relevant when the ioctl()
returns to user space.

The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure
handling, but this only happens within exit_to_user_mode_loop(), so
arguably the hypervisor ioctl() code is long done when this happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.903622031@linutronix.de
2025-11-04 08:35:37 +01:00
Thomas Gleixner 7a5201ea19 rseq: Split up rseq_exit_to_user_mode()
Separate the interrupt and syscall exit handling. Syscall exit does not
require clearing the user_irq bit as it can't be set. On interrupt exit it
can be set when the interrupt did not result in a scheduling event and
therefore the return path did not invoke the TIF work handling, which would
have cleared it.

The debug check for the event state is also not really required even when
debug mode is enabled via the static key. Debug mode largely aids user
space by enabling a larger number of validation checks, which cause a
segfault when a malformed critical section is detected. In production mode
the critical section handling takes the content mostly as is and lets user
space keep the pieces when it screwed up.

On kernel changes in that area the state check is useful, but that can be
done when lockdep is enabled, which is anyway a required test scenario for
fundamental changes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.842785700@linutronix.de
2025-11-04 08:35:30 +01:00
Thomas Gleixner 70fe25a3bc entry: Split up exit_to_user_mode_prepare()
exit_to_user_mode_prepare() is used for both interrupts and syscalls, but
there is extra rseq work, which is only required for in the interrupt exit
case.

Split up the function and provide wrappers for syscalls and interrupts,
which allows separating the rseq exit work in the next step.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.782234789@linutronix.de
2025-11-04 08:35:17 +01:00
Thomas Gleixner 3db6b38dfe rseq: Switch to fast path processing on exit to user
Now that all bits and pieces are in place, hook the RSEQ handling fast path
function into exit_to_user_mode_prepare() after the TIF work bits have been
handled. In case of fast path failure, TIF_NOTIFY_RESUME has been raised
and the caller needs to take another turn through the TIF handling slow
path.

This only works for architectures which use the generic entry code.
Architectures who still have their own incomplete hacks are not supported
and won't be.

This results in the following improvements:

  Kernel build        Before                After                 Reduction

  exit to user:       80692981              80514451
  signal checks:         32581                   121                     99%
  slowpath runs:       1201408   1.49%           198   0.00%            100%
  fastpath runs:                              675941   0.84%             N/A
  id updates:          1233989   1.53%         50541   0.06%             96%
  cs checks:           1125366   1.39%             0   0.00%            100%
    cs cleared:        1125366    100%             0                    100%
    cs fixup:                0      0%             0

  RSEQ selftests      Before                After                 Reduction

  exit to user:      386281778             387373750
  signal checks:      35661203                     0                    100%
  slowpath runs:     140542396  36.38%            100   0.00%           100%
  fastpath runs:                             9509789   2.51%             N/A
  id updates:        176203599  45.62%       9087994   2.35%             95%
  cs checks:         175587856  45.46%       4728394   1.22%             98%
    cs cleared:      172359544  98.16%       1319307  27.90%             99%
    cs fixup:          3228312   1.84%       3409087  72.10%

The 'cs cleared' and 'cs fixup' percentages are not relative to the exit to
user invocations; they are relative to the actual 'cs check' invocations.

While some of this could have been avoided in the original code, like the
obvious clearing of CS when it's already clear, the main problem of going
through TIF_NOTIFY_RESUME cannot be solved. In some workloads the RSEQ
notify handler is invoked more than once before going out to user
space. Doing this once when everything has stabilized is the only solution
to avoid this.

The initial attempt to completely decouple it from the TIF work turned out
to be suboptimal for workloads which do a lot of quick and short system
calls. Even if the fast path decision is only 4 instructions (including a
conditional branch), this adds up quickly and becomes measurable when the
rate for actually having to handle rseq is in the low single digit
percentage range of user/kernel transitions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.701201365@linutronix.de
2025-11-04 08:34:39 +01:00
Thomas Gleixner 05b44aef70 rseq: Implement fast path for exit to user
Implement the actual logic for handling RSEQ updates in a fast path after
handling the TIF work and at the point where the task is actually returning
to user space.

This is the right point to do that because at this point the CPU and the MM
CID are stable and can no longer change due to yet another reschedule.
That happens when the task is handling it via TIF_NOTIFY_RESUME in
resume_user_mode_work(), which is invoked from the exit to user mode work
loop.

The function is invoked after the TIF work is handled and runs with
interrupts disabled, which means it cannot resolve page faults. It
therefore disables page faults and in case the access to the user space
memory faults, it:

  - notes the fail in the event struct
  - raises TIF_NOTIFY_RESUME
  - returns false to the caller

The caller has to go back to the TIF work, which runs with interrupts
enabled and therefore can resolve the page faults. This happens mostly on
fork() when the memory is marked COW.

If the user memory inspection finds invalid data, the function returns
false as well and sets the fatal flag in the event struct along with
TIF_NOTIFY_RESUME. The slow path notify handler has to evaluate that flag
and terminate the task with SIGSEGV as documented.
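
A hedged sketch of that fault handling flow; the function and the event
field names here are illustrative, not the patch's actual symbols:

  static bool rseq_exit_fastpath(struct task_struct *t)
  {
        bool ok;

        pagefault_disable();    /* interrupts are off, faults cannot be resolved here */
        /* hypothetical helper: write the IDs and inspect the critical section */
        ok = rseq_update_user_memory(t);
        pagefault_enable();

        if (!ok) {
                /* note the failure and defer to the TIF_NOTIFY_RESUME slow path */
                set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
                return false;
        }
        return true;
  }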

The initial decision to invoke any of this is based on one flag in the
event struct: @sched_switch. The decision is in pseudo ASM:

      load	tsk::event::sched_switch
      jnz	inspect_user_space
      mov	$0, tsk::event::events
      ...
      leave

So for the common case where the task was not scheduled out, this really
boils down to three instructions before going out if the compiler is not
completely stupid (and yes, some of them are).

If the condition is true, then it checks whether the CPU ID or MM CID have
changed. If so, then the CPU/MM IDs have to be updated and are thereby
cached for the next round. The update unconditionally retrieves the user
space critical section address to spare another user*begin/end() pair.  If
that's not zero and tsk::event::user_irq is set, then the critical section
is analyzed and acted upon. If either zero or the entry came via syscall
the critical section analysis is skipped.

If the comparison is false then the critical section has to be analyzed
because the event flag is then only true when entry from user was by
interrupt.

This is provided without the actual hookup to let reviewers focus on the
implementation details. The hookup happens in the next step.

Note: As with quite some other optimizations this depends on the generic
entry infrastructure and is not enabled to be sucked into random
architecture implementations.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.638929615@linutronix.de
2025-11-04 08:34:18 +01:00
Thomas Gleixner 39a167560a rseq: Optimize event setting
After removing the various condition bits earlier it turns out that one
extra piece of information is needed to avoid setting event::sched_switch and
TIF_NOTIFY_RESUME unconditionally on every context switch.

The update of the RSEQ user space memory is only required when either

  the task was interrupted in user space and schedules

or

  the CPU or MM CID changes in schedule() independent of the entry mode

Right now only the interrupt from user information is available.

Add an event flag, which is set when the CPU or MM CID or both change.

Evaluate this event in the scheduler to decide whether the sched_switch
event and the TIF bit need to be set.

It's an extra conditional in context_switch(), but the downside of
unconditionally handling RSEQ after a context switch to user is way more
significant. The utilized boolean logic minimizes this to a single
conditional branch.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.578058898@linutronix.de
2025-11-04 08:34:03 +01:00
Thomas Gleixner e2d4f42271 rseq: Rework the TIF_NOTIFY handler
Replace the whole logic with a new implementation, which is shared with
signal delivery and the upcoming exit fast path.

Contrary to the original implementation, this ignores invocations from
KVM/IO-uring, which invoke resume_user_mode_work() with the @regs argument
set to NULL.

The original implementation updated the CPU/Node/MM CID fields, but that
was just a side effect, which was addressing the problem that this
invocation cleared TIF_NOTIFY_RESUME, which in turn could cause an update
on return to user space to be lost.

This problem has been addressed differently, so that it's no longer
required to do that update before entering the guest.

That might be considered a user-visible change when the host thread's TLS
memory is mapped into the guest, but as this was never intentionally
supported, this abuse of kernel internal implementation details is not
considered an ABI break.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.517640811@linutronix.de
2025-11-04 08:33:54 +01:00
Thomas Gleixner 9f6ffd4ceb rseq: Separate the signal delivery path
Completely separate the signal delivery path from the notify handler as
they have different semantics versus the event handling.

The signal delivery only needs to ensure that the interrupted user context
was not in a critical section or the section is aborted before it switches
to the signal frame context. The signal frame context does not have the
original instruction pointer anymore, so that can't be handled on exit to
user space.

No point in updating the CPU/CID ids as they might change again before the
task returns to user space for real.

The fast path optimization, which checks for the 'entry from user via
interrupt' condition is only available for architectures which use the
generic entry code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.455429038@linutronix.de
2025-11-04 08:33:47 +01:00
Thomas Gleixner 0f085b4188 rseq: Provide and use rseq_set_ids()
Provide a new and straightforward implementation to set the IDs (CPU ID,
Node ID and MM CID), which can be later inlined into the fast path.

It does all operations in one scoped_user_rw_access() section and also
retrieves the critical section member (rseq::cs_rseq) from user space to avoid
another user..begin/end() pair. This is in preparation for optimizing the
fast path to avoid extra work when not required.

On rseq registration set the CPU ID fields to RSEQ_CPU_ID_UNINITIALIZED and
node and MM CID to zero. That's the same as the kernel internal reset
values. That makes the debug validation in the exit code work correctly on
the first exit to user space.

Use it to replace the whole related zoo in rseq.c
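
For orientation, a minimal sketch of the idea using the long-standing
user_access_begin()/unsafe_put_user() API instead of the scoped helper added
by this series; the member names follow the uapi struct rseq:

  static bool rseq_set_ids_sketch(struct rseq __user *urseq, u32 cpu_id,
                                  u32 node_id, u32 mm_cid)
  {
        if (!user_access_begin(urseq, sizeof(*urseq)))
                return false;
        unsafe_put_user(cpu_id, &urseq->cpu_id_start, efault);
        unsafe_put_user(cpu_id, &urseq->cpu_id, efault);
        unsafe_put_user(node_id, &urseq->node_id, efault);
        unsafe_put_user(mm_cid, &urseq->mm_cid, efault);
        user_access_end();
        return true;
  efault:
        user_access_end();
        return false;
  }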

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.393972266@linutronix.de
2025-11-04 08:33:33 +01:00
Thomas Gleixner eaa9088d56 rseq: Use static branch for syscall exit debug when GENERIC_IRQ_ENTRY=y
Make the syscall exit debug mechanism available via the static branch on
architectures which utilize the generic entry code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.333440475@linutronix.de
2025-11-04 08:33:27 +01:00
Thomas Gleixner c1cbad8f99 rseq: Make exit debugging static branch based
Disconnect it from the config switch and use the static debug branch. This
is a temporary measure for validating the rework. At the end this check
needs to be hidden behind lockdep as it has nothing to do with the other
debug infrastructure, which mainly aids user space debugging by enabling a
zoo of checks which terminate misbehaving tasks instead of letting them
keep the hard to diagnose pieces.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.272660745@linutronix.de
2025-11-04 08:33:20 +01:00
Thomas Gleixner f7ee1964ac rseq: Replace the original debug implementation
Just utilize the new infrastructure and put the original one to rest.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.212510692@linutronix.de
2025-11-04 08:33:12 +01:00
Thomas Gleixner abc850e761 rseq: Provide and use rseq_update_user_cs()
Provide a straightforward implementation to check for and eventually
clear/fixup critical sections in user space.

The non-debug version does only the minimal sanity checks and aims for
efficiency.

There are two attack vectors, which are checked for:

  1) An abort IP which is in the kernel address space. That would cause at
     least x86 to return to kernel space via IRET.

  2) A rogue critical section descriptor with an abort IP pointing to some
     arbitrary address, which is not preceded by the RSEQ signature.

If the section descriptors are invalid then the resulting misbehaviour of
the user space application is not the kernel's problem.
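
A sketch of the two checks above; the helper name and error handling are
illustrative only:

  static bool rseq_abort_ip_plausible(unsigned long abort_ip, u32 expected_sig)
  {
        u32 sig;

        /* 1) the abort handler must live in the user address space */
        if (!access_ok((void __user *)abort_ip, sizeof(sig)))
                return false;
        /* 2) the four bytes preceding the abort IP must carry the RSEQ signature */
        if (get_user(sig, (u32 __user *)(abort_ip - sizeof(sig))))
                return false;
        return sig == expected_sig;
  }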

The kernel provides a run-time switchable debug slow path, which implements
the full zoo of checks including termination of the task when one of the
gazillion conditions is not met.

Replace the zoo in rseq.c with it and invoke it from the TIF_NOTIFY_RESUME
handler. Move the remainders into the CONFIG_DEBUG_RSEQ section, which will
be replaced and removed in a subsequent step.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.151465632@linutronix.de
2025-11-04 08:32:57 +01:00
Thomas Gleixner 9c37cb6e80 rseq: Provide static branch for runtime debugging
Config-based debug is rarely turned on and is not easily available when
things go wrong.

Provide a static branch to allow permanent integration of debug mechanisms
along with the usual toggles in Kconfig, command line and debugfs.
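
Illustrative use of such a static branch; the key name is a placeholder, not
necessarily the symbol added by the patch:

  DEFINE_STATIC_KEY_FALSE(rseq_debug_enabled);

  static __always_inline bool rseq_debug(void)
  {
        /* compiles to a NOP-patched branch until the key is flipped at runtime */
        return static_branch_unlikely(&rseq_debug_enabled);
  }

  /* toggled via static_branch_enable()/static_branch_disable() from the
     Kconfig default, command line parsing or the debugfs knob */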

Requested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.089270547@linutronix.de
2025-11-04 08:32:49 +01:00
Thomas Gleixner 5412910487 rseq: Expose lightweight statistics in debugfs
Analyzing the call frequency without actually using tracing is helpful for
analysis of this infrastructure. The overhead is minimal as it just
increments a per-CPU counter associated with each operation.

The debugfs readout provides a racy sum of all counters.
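
Illustrative shape of such lightweight statistics; the names are
placeholders:

  enum rseq_stat { RSEQ_STAT_EXIT, RSEQ_STAT_FIXUP, RSEQ_STAT_NR };

  static DEFINE_PER_CPU(unsigned long, rseq_stats[RSEQ_STAT_NR]);

  static inline void rseq_stat_inc(enum rseq_stat s)
  {
        this_cpu_inc(rseq_stats[s]);    /* no locking, just a per-CPU increment */
  }

  /* debugfs read side: a racy but cheap sum over all CPUs */
  static unsigned long rseq_stat_sum(enum rseq_stat s)
  {
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                sum += per_cpu(rseq_stats[s], cpu);
        return sum;
  }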

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.027916598@linutronix.de
2025-11-04 08:32:41 +01:00
Thomas Gleixner dab344753e rseq: Provide tracepoint wrappers for inline code
Provide tracepoint wrappers for the upcoming RSEQ exit to user space inline
fast path, so that the header can be safely included by code which defines
actual trace points.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.967114316@linutronix.de
2025-11-04 08:32:35 +01:00
Thomas Gleixner 2fc0e4b412 rseq: Record interrupt from user space
For RSEQ the only relevant reason to inspect and eventually fixup (abort)
user space critical sections is when user space was interrupted and the
task was scheduled out.

If the user to kernel entry was from a syscall no fixup is required. If
user space invokes a syscall from a critical section it can keep the
pieces as documented.

This is only supported on architectures which utilize the generic entry
code. If your architecture does not use it, bad luck.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.905067101@linutronix.de
2025-11-04 08:32:23 +01:00
Thomas Gleixner 4b7de6df20 rseq: Cache CPU ID and MM CID values
In preparation for rewriting RSEQ exit to user space handling provide
storage to cache the CPU ID and MM CID values which were written to user
space. That prepares for a quick check, which avoids the update when
nothing changed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.841964081@linutronix.de
2025-11-04 08:32:14 +01:00
Thomas Gleixner 4fc9225d19 sched: Move MM CID related functions to sched.h
There is nothing mm specific in that and including mm.h can cause header
recursion hell.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.778457951@linutronix.de
2025-11-04 08:32:04 +01:00
Thomas Gleixner 7702a9c285 entry: Inline irqentry_enter/exit_from/to_user_mode()
There is no point in having this as a function which just inlines
enter_from_user_mode(). The function call overhead is larger than the
function itself.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.715309918@linutronix.de
2025-11-04 08:31:47 +01:00
Thomas Gleixner 54a5ab5624 entry: Remove syscall_enter_from_user_mode_prepare()
Open code the only user in the x86 syscall code and reduce the zoo of
functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.652839989@linutronix.de
2025-11-04 08:31:37 +01:00
Thomas Gleixner 5204be1679 entry: Clean up header
Clean up the include ordering, kernel-doc and other trivialities before
making further changes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.590338411@linutronix.de
2025-11-04 08:31:14 +01:00
Thomas Gleixner faba9d250e rseq: Introduce struct rseq_data
In preparation for a major rewrite of this code, provide a data structure
for rseq management.

Put all the rseq related data into it (except for the debug part), which
allows simplifying fork/execve by using memset() and memcpy() instead of
adding new fields to initialize over and over.

Create a storage struct for event management as well and put the
sched_switch event and an indicator for RSEQ on a task into it as a
start. That uses a union, which allows masking and clearing the whole lot
efficiently.

The indicators are explicitly not a bit field. Bit fields generate abysmal
code.

The boolean members are defined as u8 as that actually guarantees that it
fits. There seem to be strange architecture ABIs which need more than 8
bits for a boolean.

The has_rseq member is redundant vs. task::rseq, but it turns out that
boolean operations and quick checks on the union generate better code than
fiddling with separate entities and data types.

This struct will be extended over time to carry more information.
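
A hedged sketch of the layout described above; only sched_switch and
has_rseq are named in the text, everything else (member names, the exact
per-task fields) is a guess for illustration:

  struct rseq_event {
        union {
                u32     all;                    /* mask/clear all indicators at once */
                struct {
                        u8      sched_switch;   /* task was scheduled out */
                        u8      has_rseq;       /* rseq is registered for this task */
                };
        };
  };

  struct rseq_data {
        struct rseq __user      *usrptr;        /* registered user space struct rseq */
        u32                     len;
        u32                     sig;
        struct rseq_event       event;
  };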

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.527086690@linutronix.de
2025-11-04 08:30:50 +01:00
Thomas Gleixner 566d8015f7 rseq: Avoid CPU/MM CID updates when no event pending
There is no need to update these values unconditionally if there is no
event pending.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.462964916@linutronix.de
2025-11-04 08:30:43 +01:00
Thomas Gleixner 83409986f4 rseq, virt: Retrigger RSEQ after vcpu_run()
Hypervisors invoke resume_user_mode_work() before entering the guest, which
clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user
space context available to them, so the rseq notify handler skips
inspecting the critical section, but updates the CPU/MM CID values
unconditionally so that a possibly pending rseq event is not lost on the
way to user space.

This is a pointless exercise as the task might be rescheduled before
actually returning to user space and it creates unnecessary work in the
vcpu_run() loops.

It's way more efficient to ignore that invocation based on @regs == NULL
and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the
vcpu_run() loop before returning from the ioctl().

This ensures that a pending RSEQ update is not lost and the IDs are updated
before returning to user space.

Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into
a NOOP.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Sean Christopherson <seanjc@google.com>
Link: https://patch.msgid.link/20251027084306.399495855@linutronix.de
2025-11-04 08:30:23 +01:00
Thomas Gleixner d923739e2e rseq: Simplify the event notification
Since commit 0190e4198e ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
flags") the bits in task::rseq_event_mask are meaningless and just extra
work in terms of setting them individually.

Aside from that, the only relevant point where an event has to be raised is
context switch. Neither the CPU nor MM CID can change without going through
a context switch.

Collapse them all into a single boolean, which simplifies the code a lot, and
remove the pointless invocations which have been sprinkled all over the
place for no value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.336978188@linutronix.de
2025-11-04 08:30:09 +01:00
Thomas Gleixner 067b3b41b4 rseq: Simplify registration
There is no point in reading the critical section element in the newly
registered user space RSEQ struct first in order to clear it.

Just clear it and be done with it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.274661227@linutronix.de
2025-11-04 08:30:05 +01:00
Thomas Gleixner 41b43a6ba3 rseq: Remove the ksig argument from rseq_handle_notify_resume()
There is no point in this being visible in the resume_to_user_mode()
handling.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.211520245@linutronix.de
2025-11-04 08:30:01 +01:00
Thomas Gleixner 77f19e4d4f rseq: Move algorithm comment to top
Move the comment which documents the RSEQ algorithm to the top of the file,
so it does not create horrible diffs later when the actual implementation
is fed into the mincer.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.149519580@linutronix.de
2025-11-04 08:29:52 +01:00
Thomas Gleixner fdc0f39d28 rseq: Condense the inline stubs
Scrolling over tons of pointless

	{
	}

lines to find the actual code is annoying at best.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.085971048@linutronix.de
2025-11-04 08:29:08 +01:00
Thomas Gleixner 3ca59da7aa rseq: Avoid pointless evaluation in __rseq_notify_resume()
The RSEQ critical section mechanism only clears the event mask when a
critical section is registered; otherwise it is stale and collects
bits.

That means once a critical section is installed the first invocation of
that code when TIF_NOTIFY_RESUME is set will abort the critical section,
even when the TIF bit was not raised by the rseq preempt/migrate/signal
helpers.

This also has a performance implication because TIF_NOTIFY_RESUME is a
multiplexing TIF bit, which is utilized by quite a bit of infrastructure. That
means every invocation of __rseq_notify_resume() goes unconditionally
through the heavy lifting of user space access and consistency checks even
if there is no reason to do so.

Keeping the stale event mask around when exiting to user space also
prevents it from being utilized by the upcoming time slice extension
mechanism.

Avoid this by reading and clearing the event mask before doing the user
space critical section access with interrupts or preemption disabled, which
ensures that the read and clear operation is CPU local atomic versus
scheduling and the membarrier IPI.

This is correct as after re-enabling interrupts/preemption any relevant
event will set the bit again and raise TIF_NOTIFY_RESUME, which makes the
user space exit code take another round of TIF bit clearing.

If the event mask was non-zero, invoke the slow path. On debug kernels the
slow path is invoked unconditionally and the result of the event mask
evaluation is handed in.

Add an exit path check after the TIF bit loop, which validates on debug
kernels that the event mask is zero before exiting to user space.

While at it reword the convoluted comment why the pt_regs pointer can be
NULL under certain circumstances.
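
Roughly, the described fast path corresponds to the sketch below. The field
and helper names (rseq_event_pending, rseq_slowpath) are placeholders for
illustration only, not the actual kernel identifiers:

	static void rseq_notify_resume_sketch(struct task_struct *t,
					      struct pt_regs *regs)
	{
		bool event;

		/* CPU-local atomic read-and-clear vs. scheduler and membarrier IPI */
		scoped_guard(irq) {
			event = t->rseq_event_pending;
			t->rseq_event_pending = false;
		}

		/* Slow path only if something happened; debug kernels always verify */
		if (event || IS_ENABLED(CONFIG_DEBUG_RSEQ))
			rseq_slowpath(t, regs, event);
	}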

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.022571576@linutronix.de
2025-11-04 08:28:38 +01:00
Thomas Gleixner 3ce17e6909 select: Convert to scoped user access
Replace the open coded implementation with the scoped user access guard.

No functional change intended.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027083745.862419776@linutronix.de
2025-11-04 08:28:34 +01:00
Thomas Gleixner e02718c986 x86/futex: Convert to scoped user access
Replace the open coded implementation with the scoped user access
guards.

No functional change intended.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251027083745.799714344@linutronix.de
2025-11-04 08:28:29 +01:00
Thomas Gleixner e4e28fd698 futex: Convert to get/put_user_inline()
Replace the open coded implementation with the new get/put_user_inline()
helpers. This might be replaced by a regular get/put_user(), but that needs
a proper performance evaluation.

No functional change intended.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251027083745.736737934@linutronix.de
2025-11-04 08:28:23 +01:00
Thomas Gleixner b2cfc0cd68 uaccess: Provide put/get_user_inline()
Provide convenience wrappers around scoped user access similar to
put/get_user(), which reduce the usage sites to:

       if (!get_user_inline(val, ptr))
       		return -EFAULT;

Should only be used if there is a demonstrable performance benefit.
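
For illustration, a minimal sketch combining both helpers. It assumes that
put_user_inline() follows the same calling convention as get_user_inline()
shown above; FLAG is a placeholder constant, not something from the patch:

	static int set_flag(u32 __user *uptr)
	{
		u32 val;

		if (!get_user_inline(val, uptr))
			return -EFAULT;

		val |= FLAG;

		if (!put_user_inline(val, uptr))
			return -EFAULT;

		return 0;
	}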

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027083745.609031602@linutronix.de
2025-11-04 08:28:15 +01:00
Thomas Gleixner e497310b4f uaccess: Provide scoped user access regions
User space access regions are tedious and require similar code patterns all
over the place:

     	if (!user_read_access_begin(from, sizeof(*from)))
		return -EFAULT;
	unsafe_get_user(val, from, Efault);
	user_read_access_end();
	return 0;
Efault:
	user_read_access_end();
	return -EFAULT;

This got worse with the recent addition of masked user access, which
optimizes the speculation prevention:

	if (can_do_masked_user_access())
		from = masked_user_read_access_begin((from));
	else if (!user_read_access_begin(from, sizeof(*from)))
		return -EFAULT;
	unsafe_get_user(val, from, Efault);
	user_read_access_end();
	return 0;
Efault:
	user_read_access_end();
	return -EFAULT;

There have been issues with using the wrong user_*_access_end() variant in
the error path and other typical Copy&Pasta problems, e.g. using the wrong
fault label in the user accessor, which ends up using the wrong access
end variant.

These patterns beg for scopes with automatic cleanup. The resulting outcome
is:
    	scoped_user_read_access(from, Efault)
		unsafe_get_user(val, from, Efault);
	return 0;
  Efault:
	return -EFAULT;

The scope guarantees the proper cleanup for the access mode is invoked both
in the success and the failure (fault) path.

The scoped_user_$MODE_access() macros are implemented as self terminating
nested for() loops. Thanks to Andrew Cooper for pointing me at them. The
scope can therefore be left with 'break', 'goto' and 'return'.  Even
'continue' "works" due to the self termination mechanism. Both GCC and
clang optimize all the convoluted macro maze out and the above results with
clang in:

 b80:	f3 0f 1e fa          	       endbr64
 b84:	48 b8 ef cd ab 89 67 45 23 01  movabs $0x123456789abcdef,%rax
 b8e:	48 39 c7    	               cmp    %rax,%rdi
 b91:	48 0f 47 f8          	       cmova  %rax,%rdi
 b95:	90                   	       nop
 b96:	90                   	       nop
 b97:	90                   	       nop
 b98:	31 c9                	       xor    %ecx,%ecx
 b9a:	8b 07                	       mov    (%rdi),%eax
 b9c:	89 06                	       mov    %eax,(%rsi)
 b9e:	85 c9                	       test   %ecx,%ecx
 ba0:	0f 94 c0             	       sete   %al
 ba3:	90                   	       nop
 ba4:	90                   	       nop
 ba5:	90                   	       nop
 ba6:	c3                   	       ret

Which looks as compact as it gets. The NOPs are placeholders for STAC/CLAC.
GCC emits the fault path separately:

 bf0:	f3 0f 1e fa          	       endbr64
 bf4:	48 b8 ef cd ab 89 67 45 23 01  movabs $0x123456789abcdef,%rax
 bfe:	48 39 c7             	       cmp    %rax,%rdi
 c01:	48 0f 47 f8          	       cmova  %rax,%rdi
 c05:	90                   	       nop
 c06:	90                   	       nop
 c07:	90                   	       nop
 c08:	31 d2                	       xor    %edx,%edx
 c0a:	8b 07                	       mov    (%rdi),%eax
 c0c:	89 06                	       mov    %eax,(%rsi)
 c0e:	85 d2                	       test   %edx,%edx
 c10:	75 09                	       jne    c1b <afoo+0x2b>
 c12:	90                   	       nop
 c13:	90                   	       nop
 c14:	90                   	       nop
 c15:	b8 01 00 00 00       	       mov    $0x1,%eax
 c1a:	c3                   	       ret
 c1b:	90                   	       nop
 c1c:	90                   	       nop
 c1d:	90                   	       nop
 c1e:	31 c0                	       xor    %eax,%eax
 c20:	c3                   	       ret

The fault labels for the scoped*() macros and the fault labels for the
actual user space accessors can be shared and must be placed outside of the
scope.

If masked user access is enabled on an architecture, then the pointer
handed in to scoped_user_$MODE_access() can be modified to point to a
guaranteed faulting user address. This modification is only scope local as
the pointer is aliased inside the scope. When the scope is left the alias
is no longer in effect. IOW the original pointer value is preserved so it
can be used e.g. for fixup or diagnostic purposes in the fault path.
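
For illustration, a write-side counterpart to the read example above. It
assumes a scoped_user_write_access() variant per the scoped_user_$MODE_access()
naming; this sketch is not taken verbatim from the patch:

	static int put_val(u32 __user *to, u32 val)
	{
		scoped_user_write_access(to, Efault)
			unsafe_put_user(val, to, Efault);
		return 0;
	Efault:
		return -EFAULT;
	}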

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027083745.546420421@linutronix.de
2025-11-04 08:27:52 +01:00
Thomas Gleixner 2db48d8bf8 arm64: uaccess: Use unsafe wrappers for ASM GOTO
Clang propagates a provided label, which is outside of a cleanup scope, to
ASM GOTO despite the fact that __raw_get_mem() has a local label for that
purpose:

  "error: cannot jump from this asm goto statement to one of its possible targets"

Using the unsafe wrapper with the extra local label indirection cures that.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2025-11-04 08:27:20 +01:00
Christian Brauner 8ebfb9896c
Merge patch series "nstree: listns()"
Christian Brauner <brauner@kernel.org> says:

As announced a while ago this is the next step building on the nstree
work from prior cycles. There's a bunch of fixes and semantic cleanups
in here and a ton of tests.

Currently listns() is relying on active namespace reference counts which
are introduced alongside this series.

While a namespace is on the namespace trees with a valid reference count
it is possible to reopen it through a namespace file handle. This is all
fine but has some issues that should be addressed.

On current kernels a namespace is visible to userspace in the
following cases:

(1) The namespace is in use by a task.
(2) The namespace is persisted through a VFS object (namespace file
    descriptor or bind-mount).
    Note that (2) only cares about direct persistence of the namespace
    itself, not indirect persistence via e.g. file->f_cred file references
    or similar.
(3) The namespace is a hierarchical namespace type and is the parent of
    a single or multiple child namespaces.

Case (3) is interesting because it is possible that a parent namespace
fulfills neither (1) nor (2), i.e., it is invisible to userspace, but it may
still be resurrected through the NS_GET_PARENT ioctl().

Currently namespace file handles allow much broader access to namespaces
than what is possible via (1)-(3). The reason is that namespaces may
remain pinned for completely internal reasons yet be inaccessible to
userspace.

For example, a user namespace may remain pinned by get_cred() calls to
stash the opener's credentials into file->f_cred. As it stands, file
handles allow resurrecting such a user namespace even though this
should not be possible via (1)-(3). This is a fundamental uapi change
that we shouldn't make if we don't have to.

Consider the following insane case: Various architectures support the
CONFIG_MMU_LAZY_TLB_REFCOUNT option which uses lazy TLB destruction.
When this option is set a userspace task's struct mm_struct may be used
for kernel threads such as the idle task and will only be destroyed once
the cpu's runqueue switches back to another task. But because of ptrace()
permission checks struct mm_struct stashes the user namespace of the
task that struct mm_struct originally belonged to. The kernel thread
will take a reference on the struct mm_struct and thus pin it.

So on an idle system user namespaces can be persisted for arbitrary
amounts of time, which also means that they can be resurrected using
namespace file handles. That makes no sense whatsoever. The problem is
of course exacerbated on large systems with a huge number of CPUs.

To handle this nicely we introduce an active reference count which
tracks (1)-(3). This is easy to do as all of these things are already
managed centrally. Only (1)-(3) will count towards the active reference
count and only namespaces which are active may be opened via namespace
file handles.

The problem is that namespaces may be resurrected, which means that they
can become temporarily inactive and be reactivated some time later.
Currently the only example of this is the SIOCGSKNS socket ioctl. The
SIOCGSKNS ioctl allows opening a network namespace file descriptor based
on a socket file descriptor.

If a socket is tied to a network namespace that subsequently becomes
inactive but that socket is persisted by another process in another
network namespace (e.g., via SCM_RIGHTS or pidfd_getfd()), then the
SIOCGSKNS ioctl will resurrect this network namespace.

So calls to open_related_ns() and open_namespace() will end up
resurrecting the corresponding namespace tree.

Note that the active reference count does not regulate the lifetime of
the namespace itself. This is still done by the normal reference count.
The active reference count can only be elevated if the regular reference
count is elevated.

The active reference count also doesn't regulate the presence of a
namespace on the namespace trees. It only regulates its visibility to
namespace file handles (and in later patches to listns()).

A namespace remains on the namespace trees from creation until its
actual destruction. This will allow the kernel to always reach any
namespace trivially and it will also enable subsystems like bpf to walk
the namespace lists on the system for tracing or general introspection
purposes.

Note that different namespaces have different visibility lifetimes on
current kernels. While most namespaces are immediately released when the
last task using them exits, the user- and pid namespace are persisted
and thus both remain accessible via /proc/<pid>/ns/<ns_type>.

The user namespace lifetime is aligned with struct cred and is only
released through exit_creds(). However, it becomes inaccessible to
userspace once the last task using it is reaped, i.e., when
release_task() is called and all proc entries are flushed. Similarly,
the pid namespace is also visible until the last task using it has been
reaped and the associated pid numbers are freed.

The active reference counts of the user- and pid namespace are
decremented once the task is reaped.

Based on the namespace trees and the active reference count, a new
listns() system call is added that allows userspace to iterate through
namespaces in the system. This provides a programmatic interface to
discover and inspect namespaces, enhancing the existing namespace APIs.

Currently, there is no direct way for userspace to enumerate namespaces
in the system. Applications must resort to scanning /proc/<pid>/ns/
across all processes, which is:

1. Inefficient - requires iterating over all processes
2. Incomplete - misses inactive namespaces that aren't attached to any
   running process but are kept alive by file descriptors, bind mounts,
   or parent namespace references
3. Permission-heavy - requires access to /proc for many processes
4. No ordering or ownership.
5. No filtering per namespace type: Must always iterate and check all
   namespaces.

The list goes on. The listns() system call solves these problems by
providing direct kernel-level enumeration of namespaces. It is similar
to listmount() but obviously tailored to namespaces.

/*
 * @req: Pointer to struct ns_id_req specifying search parameters
 * @ns_ids: User buffer to receive namespace IDs
 * @nr_ns_ids: Size of ns_ids buffer (maximum number of IDs to return)
 * @flags: Reserved for future use (must be 0)
 */
ssize_t listns(const struct ns_id_req *req, u64 *ns_ids,
               size_t nr_ns_ids, unsigned int flags);

Returns:
- On success: Number of namespace IDs written to ns_ids
- On error: Negative error code

/*
 * @size: Structure size
 * @ns_id: Starting point for iteration; use 0 for first call, then
 *         use the last returned ID for subsequent calls to paginate
 * @ns_type: Bitmask of namespace types to include (from enum ns_type):
 *           0: Return all namespace types
 *           MNT_NS: Mount namespaces
 *           NET_NS: Network namespaces
 *           USER_NS: User namespaces
 *           etc. Can be OR'd together
 * @user_ns_id: Filter results to namespaces owned by this user namespace:
 *              0: Return all namespaces (subject to permission checks)
 *              LISTNS_CURRENT_USER: Namespaces owned by caller's user namespace
 *              Other value: Namespaces owned by the specified user namespace ID
 */
struct ns_id_req {
        __u32 size;         /* sizeof(struct ns_id_req) */
        __u32 spare;        /* Reserved, must be 0 */
        __u64 ns_id;        /* Last seen namespace ID (for pagination) */
        __u32 ns_type;      /* Filter by namespace type(s) */
        __u32 spare2;       /* Reserved, must be 0 */
        __u64 user_ns_id;   /* Filter by owning user namespace */
};

Example 1: List all namespaces

void list_all_namespaces(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,      /* Start from beginning */
		.ns_type = 0,    /* All types */
		.user_ns_id = 0, /* All user namespaces */
	};
	uint64_t ids[100];
	ssize_t ret;

	printf("All namespaces in the system:\n");
	do {
		ret = listns(&req, ids, 100, 0);
		if (ret < 0) {
			perror("listns");
			break;
		}

		for (ssize_t i = 0; i < ret; i++)
			printf("  Namespace ID: %llu\n", (unsigned long long)ids[i]);

		/* Continue from last seen ID */
		if (ret > 0)
			req.ns_id = ids[ret - 1];
	} while (ret == 100); /* Buffer was full, more may exist */
}

Example 2 : List network namespaces only

void list_network_namespaces(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = NET_NS, /* Only network namespaces */
		.user_ns_id = 0,
	};
	uint64_t ids[100];
	ssize_t ret;

	ret = listns(&req, ids, 100, 0);
	if (ret < 0) {
		perror("listns");
		return;
	}

	printf("Network namespaces: %zd found\n", ret);
	for (ssize_t i = 0; i < ret; i++)
		printf("  netns ID: %llu\n", (unsigned long long)ids[i]);
}

Example 3 : List namespaces owned by current user namespace

void list_owned_namespaces(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = 0,                      /* All types */
		.user_ns_id = LISTNS_CURRENT_USER, /* Current userns */
	};
	uint64_t ids[100];
	ssize_t ret;

	ret = listns(&req, ids, 100, 0);
	if (ret < 0) {
		perror("listns");
		return;
	}

	printf("Namespaces owned by my user namespace: %zd\n", ret);
	for (ssize_t i = 0; i < ret; i++)
		printf("  ns ID: %llu\n", (unsigned long long)ids[i]);
}

Example 4 : List multiple namespace types

void list_network_and_mount_namespaces(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = NET_NS | MNT_NS, /* Network and mount */
		.user_ns_id = 0,
	};
	uint64_t ids[100];
	ssize_t ret;

	ret = listns(&req, ids, 100, 0);
	printf("Network and mount namespaces: %zd found\n", ret);
}

Example 5 : Pagination through large namespace sets

void list_all_with_pagination(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = 0,
		.user_ns_id = 0,
	};
	uint64_t ids[50];
	size_t total = 0;
	ssize_t ret;

	printf("Enumerating all namespaces with pagination:\n");

	while (1) {
		ret = listns(&req, ids, 50, 0);
		if (ret < 0) {
			perror("listns");
			break;
		}
		if (ret == 0)
			break; /* No more namespaces */

		total += ret;
		printf("  Batch: %zd namespaces\n", ret);

		/* Last ID in this batch becomes start of next batch */
		req.ns_id = ids[ret - 1];

		if (ret < 50)
			break; /* Partial batch = end of results */
	}

	printf("Total: %zu namespaces\n", total);
}

listns() respects namespace isolation and capabilities:

(1) Global listing (user_ns_id = 0):
    - Requires CAP_SYS_ADMIN in the namespace's owning user namespace
    - OR the namespace must be in the caller's namespace context (e.g.,
      a namespace the caller is currently using)
    - User namespaces additionally allow listing if the caller has
      CAP_SYS_ADMIN in that user namespace itself
(2) Owner-filtered listing (user_ns_id != 0):
    - Requires CAP_SYS_ADMIN in the specified owner user namespace
    - OR the namespace must be in the caller's namespace context
    - This allows unprivileged processes to enumerate namespaces they own
(3) Visibility:
    - Only "active" namespaces are listed
    - A namespace is active if it has a non-zero __ns_ref_active count
    - This includes namespaces used by running processes, held by open
      file descriptors, or kept active by bind mounts
    - Inactive namespaces (kept alive only by internal kernel
      references) are not visible via listns()

* patches from https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-0-2e6f823ebdc0@kernel.org: (74 commits)
  selftests/namespace: test listns() pagination
  selftests/namespace: add stress test
  selftests/namespace: commit_creds() active reference tests
  selftests/namespace: third threaded active reference count test
  selftests/namespace: second threaded active reference count test
  selftests/namespace: first threaded active reference count test
  selftests/namespaces: twelfth inactive namespace resurrection test
  selftests/namespaces: eleventh inactive namespace resurrection test
  selftests/namespaces: tenth inactive namespace resurrection test
  selftests/namespaces: ninth inactive namespace resurrection test
  selftests/namespaces: eighth inactive namespace resurrection test
  selftests/namespaces: seventh inactive namespace resurrection test
  selftests/namespaces: sixth inactive namespace resurrection test
  selftests/namespaces: fifth inactive namespace resurrection test
  selftests/namespaces: fourth inactive namespace resurrection test
  selftests/namespaces: third inactive namespace resurrection test
  selftests/namespaces: second inactive namespace resurrection test
  selftests/namespaces: first inactive namespace resurrection test
  selftests/namespaces: seventh listns() permission test
  selftests/namespaces: sixth listns() permission test
  ...

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-0-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:25 +01:00
Christian Brauner 2cc1c01fe9
selftests/namespace: test listns() pagination
Minimal test case to reproduce KASAN out-of-bounds in listns pagination.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-72-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:25 +01:00
Christian Brauner fc85885692
selftests/namespace: add stress test
Stress tests for namespace active reference counting.

These tests validate that the active reference counting system can
handle high load scenarios including rapid namespace
creation/destruction, large numbers of concurrent namespaces, and
various edge cases under stress.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-71-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:25 +01:00
Christian Brauner d18cf3f9a4
selftests/namespace: commit_creds() active reference tests
Test credential changes and their impact on namespace active references.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-70-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:24 +01:00
Christian Brauner 80fedf8168
selftests/namespace: third threaded active reference count test
Test that namespaces become inactive after subprocess with multiple
threads exits. Create a subprocess that unshares user and network
namespaces, then creates two threads that share those namespaces. Verify
that after all threads and subprocess exit, the namespaces are no longer
listed by listns() and cannot be opened by open_by_handle_at().

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-69-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:24 +01:00
Christian Brauner ee86103238
selftests/namespace: second threaded active reference count test
Test that a namespace remains active while a thread holds an fd to it.
Even after the thread exits, the namespace should remain active as long
as another thread holds a file descriptor to it.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-68-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:24 +01:00
Christian Brauner 29f083c499
selftests/namespace: first threaded active reference count test
Test that namespace becomes inactive after thread exits. This verifies
active reference counting works with threads, not just processes.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-67-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:24 +01:00
Christian Brauner c89d100f6a
selftests/namespaces: twelfth inactive namespace resurrection test
Test multi-level namespace resurrection across three user namespace levels.

This test creates a complex namespace hierarchy with three levels of user
namespaces and a network namespace at the deepest level. It verifies that
the resurrection semantics work correctly when SIOCGSKNS is called on a
socket from an inactive namespace tree, and that listns() and
open_by_handle_at() correctly respect visibility rules.

Hierarchy after child processes exit (all with 0 active refcount):

         net_L3A (0)                <- Level 3 network namespace
             |
             +
         userns_L3 (0)              <- Level 3 user namespace
             |
             +
         userns_L2 (0)              <- Level 2 user namespace
             |
             +
         userns_L1 (0)              <- Level 1 user namespace
             |
             x
         init_user_ns

The test verifies:
1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain
2. After resurrection, all namespaces are visible in listns()
3. Resurrected namespaces can be reopened via file handles
4. Closing the netns FD cascades down: the entire ownership chain
   (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again
5. Inactive namespaces disappear from listns() and cannot be reopened
6. Calling SIOCGSKNS again on the same socket resurrects the tree again
7. After second resurrection, namespaces are visible and can be reopened

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-66-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:24 +01:00
Christian Brauner c80168b677
selftests/namespaces: eleventh inactive namespace resurrection test
Test combined listns() and file handle operations with socket-kept
netns. Create a netns, keep it alive with a socket, verify it appears in
listns(), then reopen it via file handle obtained from listns() entry.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-65-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:24 +01:00
Christian Brauner 3798991a9f
selftests/namespaces: tenth inactive namespace resurrection test
Test that socket-kept netns can be reopened via file handle.
Verify that a network namespace kept alive by a socket FD can be
reopened using file handles even after the creating process exits.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-64-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:24 +01:00
Christian Brauner b9d09f568b
selftests/namespaces: ninth inactive namespace resurrection test
Test that socket-kept netns appears in listns() output.
Verify that a network namespace kept alive by a socket FD appears in
listns() output even after the creating process exits, and that it
disappears when the socket is closed.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-63-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:24 +01:00
Christian Brauner 6de17ec3cc
selftests/namespaces: eighth inactive namespace resurrection test
Test IPv6 sockets also work with SIOCGSKNS.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-62-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:23 +01:00
Christian Brauner 54a29d1233
selftests/namespaces: seventh inactive namespace resurrection test
Test socket keeps netns active after creating process exits. Verify that
as long as the socket FD exists, the namespace remains active.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-61-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:23 +01:00
Christian Brauner aec2237695
selftests/namespaces: sixth inactive namespace resurrection test
Test multiple sockets keep the same network namespace active. Create
multiple sockets, verify closing some doesn't affect others.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-60-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:23 +01:00
Christian Brauner 2b9fa5bf0c
selftests/namespaces: fifth inactive namespace resurrection test
Test SIOCGSKNS fails on non-socket file descriptors.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-59-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:23 +01:00
Christian Brauner 40226da471
selftests/namespaces: fourth inactive namespace resurrection test
Test SIOCGSKNS across setns. Create a socket in netns A, switch to netns
B, verify SIOCGSKNS still returns netns A.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-58-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:23 +01:00
Christian Brauner 5aec9f455c
selftests/namespaces: third inactive namespace resurrection test
Test SIOCGSKNS with different socket types (TCP, UDP, RAW).

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-57-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:23 +01:00
Christian Brauner c0f06da568
selftests/namespaces: second inactive namespace resurrection test
Test that socket file descriptors keep network namespaces active. Create
a network namespace, create a socket in it, then exit the namespace. The
namespace should remain active while the socket FD is held.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-56-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:23 +01:00
Christian Brauner a1e49d8d18
selftests/namespaces: first inactive namespace resurrection test
Test basic SIOCGSKNS functionality. Create a socket and verify SIOCGSKNS
returns the correct network namespace.
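
A minimal userspace sketch of the ioctl being exercised (illustrative only,
not the selftest itself): SIOCGSKNS returns a new file descriptor referring
to the socket's network namespace.

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <linux/sockios.h>
	#include <unistd.h>

	int main(void)
	{
		int sk = socket(AF_INET, SOCK_DGRAM, 0);

		if (sk < 0)
			return 1;

		/* Ask the kernel for an fd referring to the socket's netns. */
		int nsfd = ioctl(sk, SIOCGSKNS);
		if (nsfd < 0) {
			perror("ioctl(SIOCGSKNS)");
			close(sk);
			return 1;
		}

		printf("netns fd: %d\n", nsfd);
		close(nsfd);
		close(sk);
		return 0;
	}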

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-55-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:22 +01:00
Christian Brauner 39bcc7ae57
selftests/namespaces: seventh listns() permission test
Test that dropping CAP_SYS_ADMIN restricts what we can see.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-54-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:22 +01:00
Christian Brauner cff66421ee
selftests/namespaces: sixth listns() permission test
Test that we can see user namespaces we have CAP_SYS_ADMIN inside of.
This is different from seeing namespaces owned by a user namespace.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-53-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:22 +01:00
Christian Brauner 1c28817eb3
selftests/namespaces: fifth listns() permission test
Test that CAP_SYS_ADMIN in parent user namespace allows seeing
child user namespace's owned namespaces.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-52-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:22 +01:00
Christian Brauner 6f360f2b2f
selftests/namespaces: fourth listns() permission test
Test permission checking with LISTNS_CURRENT_USER.
Verify that listing with LISTNS_CURRENT_USER respects permissions.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-51-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:22 +01:00
Christian Brauner 2635f93989
selftests/namespaces: third listns() permission test
Test that users cannot see namespaces from unrelated user namespaces.
Create two sibling user namespaces, verify they can't see each other's
owned namespaces.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-50-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:22 +01:00
Christian Brauner ec38237731
selftests/namespaces: second listns() permission test
Test that users with CAP_SYS_ADMIN in a user namespace can see
all namespaces owned by that user namespace.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-49-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:22 +01:00
Christian Brauner 1f8ee4a1f9
selftests/namespaces: first listns() permission test
Test that unprivileged users can only see namespaces they're currently
in. Create a namespace, drop privileges, verify we can only see our own
namespaces.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-48-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:22 +01:00
Christian Brauner 674294a479
selftests/namespaces: ninth listns() test
Test error cases for listns().

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-47-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:21 +01:00
Christian Brauner b0de4c80fb
selftests/namespaces: eighth listns() test
Test that hierarchical active reference propagation keeps parent
user namespaces visible in listns().

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-46-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:21 +01:00
Christian Brauner 6aeca1dd49
selftests/namespaces: seventh listns() test
Test listns() with multiple namespace types filter.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-45-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:21 +01:00
Christian Brauner bc8da67e0e
selftests/namespaces: sixth listns() test
Test listns() with specific user namespace ID.
Create a user namespace and list namespaces it owns.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-44-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:21 +01:00
Christian Brauner 4080b9d946
selftests/namespaces: fifth listns() test
Test that listns() only returns active namespaces.
Create a namespace, let it become inactive, verify it's not listed.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-43-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:21 +01:00
Christian Brauner abac8de3e5
selftests/namespaces: fourth listns() test
Test listns() with LISTNS_CURRENT_USER.
List namespaces owned by current user namespace.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-42-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:21 +01:00
Christian Brauner 46909d1343
selftests/namespaces: third listns() test
Test listns() pagination.
List namespaces in batches.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-41-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:21 +01:00
Christian Brauner 6a68c7f919
selftests/namespaces: second listns() test
Test listns() with type filtering.
List only network namespaces.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-40-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:21 +01:00
Christian Brauner e2ff8d8864
selftests/namespaces: first listns() test
Test basic listns() functionality with the unified namespace tree.
List all active namespaces globally.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-39-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:20 +01:00
Christian Brauner 158c5c786e
selftests/namespaces: add listns() wrapper
Add a wrapper for the listns() system call.
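
In rough terms, such a wrapper could look like the sketch below. The syscall
number shown is a placeholder and the struct definition only mirrors the
description in the series cover letter; the real selftest header should be
preferred.

	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/types.h>

	/* Placeholder syscall number for illustration only. */
	#ifndef __NR_listns
	#define __NR_listns 469
	#endif

	/* Mirrors struct ns_id_req as described in the cover letter. */
	struct ns_id_req {
		__u32 size;
		__u32 spare;
		__u64 ns_id;
		__u32 ns_type;
		__u32 spare2;
		__u64 user_ns_id;
	};

	static inline ssize_t sys_listns(const struct ns_id_req *req,
					 __u64 *ns_ids, size_t nr_ns_ids,
					 unsigned int flags)
	{
		return syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags);
	}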

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-38-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:20 +01:00
Christian Brauner da3c02b70c
selftests/namespaces: fifteenth active reference count tests
Test different namespace types (net, uts, ipc) all contributing
active references to the same owning user namespace.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-37-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:20 +01:00
Christian Brauner a9d84bf7bf
selftests/namespaces: fourteenth active reference count tests
Test that user namespace as a child also propagates correctly.
Create user_A -> user_B, verify when user_B is active that user_A
is also active. This is different from non-user namespace children.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-36-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:20 +01:00
Christian Brauner 2a94bf7bb8
selftests/namespaces: thirteenth active reference count tests
Test that parent stays active as long as ANY child is active.
Create parent user namespace with two child net namespaces.
Parent should remain active until BOTH children are inactive.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-35-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:20 +01:00
Christian Brauner 04aee1a346
selftests/namespaces: twelfth active reference count tests
Test hierarchical propagation with deep namespace hierarchy.
Create: init_user_ns -> user_A -> user_B -> net_ns
When net_ns is active, both user_A and user_B should be active.
This verifies the conditional recursion in __ns_ref_active_put() works.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-34-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:20 +01:00
Christian Brauner 26d238ea6a
selftests/namespaces: eleventh active reference count tests
Test that different namespace types with same owner all contribute
active references to the owning user namespace.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-33-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:20 +01:00
Christian Brauner e7585a9ef5
selftests/namespaces: tenth active reference count tests
Test multiple children sharing same parent.
Parent should stay active as long as ANY child is active.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-32-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:20 +01:00
Christian Brauner a8ce47a1ac
selftests/namespaces: ninth active reference count tests
Test multi-level hierarchy (3+ levels deep).
Grandparent → Parent → Child
When child is active, both parent AND grandparent should be active.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-31-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:19 +01:00
Christian Brauner 94f8711080
selftests/namespaces: eighth active reference count tests
Test that bind mounts keep namespaces in the tree even when inactive

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-30-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:19 +01:00
Christian Brauner 4b971b07e4
selftests/namespaces: seventh active reference count tests
Test hierarchical active reference propagation.
When a child namespace is active, its owning user namespace should also
be active automatically due to hierarchical active reference propagation.
This ensures parents are always reachable when children are active.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-29-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:19 +01:00
Christian Brauner 47a5fd8ce1
selftests/namespaces: sixth active reference count tests
Test that an open file descriptor keeps a namespace active.
Even after the creating process exits, the namespace should remain
active as long as an fd is held open.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-28-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:19 +01:00
Christian Brauner c4803b255f
selftests/namespaces: fifth active reference count tests
Test PID namespace active ref tracking

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-27-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:19 +01:00
Christian Brauner 28655ff253
selftests/namespaces: fourth active reference count tests
Test user namespace active ref tracking via credential lifecycle.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-26-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:19 +01:00
Christian Brauner c6e25d930b
selftests/namespaces: third active reference count tests
Test that a namespace remains active while a process is using it,
even after the creating process exits.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-25-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:19 +01:00
Christian Brauner 721c7e41b1
selftests/namespaces: second active reference count tests
Test namespace lifecycle: create a namespace in a child process, get a
file handle while it's active, then try to reopen after the process
exits (namespace becomes inactive).

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-24-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:19 +01:00
Christian Brauner 6bdce845fd
selftests/namespaces: first active reference count tests
Test that initial namespaces can be reopened via file handle. Initial
namespaces should always have a ref count of one from boot.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-23-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:18 +01:00
Christian Brauner e2b6e5eadc
selftests/filesystems: remove CLONE_NEWPIDNS from setup_userns() helper
This is effectively unused and, after reviewing all of the tests that
rely on it, doesn't really serve any purpose.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-22-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:18 +01:00
Christian Brauner 6fc9baa49d
nsfs: update tools header
Ensure all the new uapi bits are visible for the selftests.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-21-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:18 +01:00
Christian Brauner b36d4b6aa8
arch: hookup listns() system call
Add the listns() system call to all architectures.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-20-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:18 +01:00
Christian Brauner 76b6f5dfb3
nstree: add listns()
Add a new listns() system call that allows userspace to iterate through
namespaces in the system. This provides a programmatic interface to
discover and inspect namespaces, enhancing the existing namespace APIs.

Currently, there is no direct way for userspace to enumerate namespaces
in the system. Applications must resort to scanning /proc/<pid>/ns/
across all processes, which is:

1. Inefficient - requires iterating over all processes
2. Incomplete - misses inactive namespaces that aren't attached to any
   running process but are kept alive by file descriptors, bind mounts,
   or parent namespace references
3. Permission-heavy - requires access to /proc for many processes
4. No ordering or ownership.
5. No filtering per namespace type: Must always iterate and check all
   namespaces.

The list goes on. The listns() system call solves these problems by
providing direct kernel-level enumeration of namespaces. It is similar
to listmount() but obviously tailored to namespaces.

/*
 * @req: Pointer to struct ns_id_req specifying search parameters
 * @ns_ids: User buffer to receive namespace IDs
 * @nr_ns_ids: Size of ns_ids buffer (maximum number of IDs to return)
 * @flags: Reserved for future use (must be 0)
 */
ssize_t listns(const struct ns_id_req *req, u64 *ns_ids,
               size_t nr_ns_ids, unsigned int flags);

Returns:
- On success: Number of namespace IDs written to ns_ids
- On error: Negative error code

/*
 * @size: Structure size
 * @ns_id: Starting point for iteration; use 0 for first call, then
 *         use the last returned ID for subsequent calls to paginate
 * @ns_type: Bitmask of namespace types to include (from enum ns_type):
 *           0: Return all namespace types
 *           MNT_NS: Mount namespaces
 *           NET_NS: Network namespaces
 *           USER_NS: User namespaces
 *           etc. Can be OR'd together
 * @user_ns_id: Filter results to namespaces owned by this user namespace:
 *              0: Return all namespaces (subject to permission checks)
 *              LISTNS_CURRENT_USER: Namespaces owned by caller's user namespace
 *              Other value: Namespaces owned by the specified user namespace ID
 */
struct ns_id_req {
        __u32 size;         /* sizeof(struct ns_id_req) */
        __u32 spare;        /* Reserved, must be 0 */
        __u64 ns_id;        /* Last seen namespace ID (for pagination) */
        __u32 ns_type;      /* Filter by namespace type(s) */
        __u32 spare2;       /* Reserved, must be 0 */
        __u64 user_ns_id;   /* Filter by owning user namespace */
};

Example 1: List all namespaces

void list_all_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,          /* Start from beginning */
        .ns_type = 0,        /* All types */
        .user_ns_id = 0,     /* All user namespaces */
    };
    uint64_t ids[100];
    ssize_t ret;

    printf("All namespaces in the system:\n");
    do {
        ret = listns(&req, ids, 100, 0);
        if (ret < 0) {
            perror("listns");
            break;
        }

        for (ssize_t i = 0; i < ret; i++)
            printf("  Namespace ID: %llu\n", (unsigned long long)ids[i]);

        /* Continue from last seen ID */
        if (ret > 0)
            req.ns_id = ids[ret - 1];
    } while (ret == 100);  /* Buffer was full, more may exist */
}

Example 2: List network namespaces only

void list_network_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = NET_NS,   /* Only network namespaces */
        .user_ns_id = 0,
    };
    uint64_t ids[100];
    ssize_t ret;

    ret = listns(&req, ids, 100, 0);
    if (ret < 0) {
        perror("listns");
        return;
    }

    printf("Network namespaces: %zd found\n", ret);
    for (ssize_t i = 0; i < ret; i++)
        printf("  netns ID: %llu\n", (unsigned long long)ids[i]);
}

Example 3: List namespaces owned by current user namespace

void list_owned_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = 0,                      /* All types */
        .user_ns_id = LISTNS_CURRENT_USER, /* Current userns */
    };
    uint64_t ids[100];
    ssize_t ret;

    ret = listns(&req, ids, 100, 0);
    if (ret < 0) {
        perror("listns");
        return;
    }

    printf("Namespaces owned by my user namespace: %zd\n", ret);
    for (ssize_t i = 0; i < ret; i++)
        printf("  ns ID: %llu\n", (unsigned long long)ids[i]);
}

Example 4: List multiple namespace types

void list_network_and_mount_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = NET_NS | MNT_NS,  /* Network and mount */
        .user_ns_id = 0,
    };
    uint64_t ids[100];
    ssize_t ret;

    ret = listns(&req, ids, 100, 0);
    printf("Network and mount namespaces: %zd found\n", ret);
}

Example 5: Pagination through large namespace sets

void list_all_with_pagination(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = 0,
        .user_ns_id = 0,
    };
    uint64_t ids[50];
    size_t total = 0;
    ssize_t ret;

    printf("Enumerating all namespaces with pagination:\n");

    while (1) {
        ret = listns(&req, ids, 50, 0);
        if (ret < 0) {
            perror("listns");
            break;
        }
        if (ret == 0)
            break;  /* No more namespaces */

        total += ret;
        printf("  Batch: %zd namespaces\n", ret);

        /* Last ID in this batch becomes start of next batch */
        req.ns_id = ids[ret - 1];

        if (ret < 50)
            break;  /* Partial batch = end of results */
    }

    printf("Total: %zu namespaces\n", total);
}

Permission Model

listns() respects namespace isolation and capabilities:

(1) Global listing (user_ns_id = 0):
    - Requires CAP_SYS_ADMIN in the namespace's owning user namespace
    - OR the namespace must be in the caller's namespace context (e.g.,
      a namespace the caller is currently using)
    - User namespaces additionally allow listing if the caller has
      CAP_SYS_ADMIN in that user namespace itself
(2) Owner-filtered listing (user_ns_id != 0):
    - Requires CAP_SYS_ADMIN in the specified owner user namespace
    - OR the namespace must be in the caller's namespace context
    - This allows unprivileged processes to enumerate namespaces they own
(3) Visibility:
    - Only "active" namespaces are listed
    - A namespace is active if it has a non-zero __ns_ref_active count
    - This includes namespaces used by running processes, held by open
      file descriptors, or kept active by bind mounts
    - Inactive namespaces (kept alive only by internal kernel
      references) are not visible via listns()

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-19-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:18 +01:00
Christian Brauner 560e25e70f
nstree: add unified namespace list
Allow walking the unified namespace list completely locklessly.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-18-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:18 +01:00
Christian Brauner a202a50092
nstree: simplify rbtree comparison helpers
They all do the same basic thing.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-17-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:18 +01:00
Christian Brauner 3c1a52f2a6
nstree: maintain list of owned namespaces
The namespace tree doesn't express the ownership concept of namespaces
appropriately. Maintain a list of directly owned namespaces per user
namespace. This will allow userspace and the kernel to use the listns()
system call to walk the namespace tree by owning user namespace. The
rbtree is used to find the relevant namespace entry point from which
iteration can continue, and the owner list can be used to walk the tree
completely lock free.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-16-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:17 +01:00
Christian Brauner 3760342fd6
nstree: assign fixed ids to the initial namespaces
The initial set of namespaces comes with fixed inode numbers, making it
easy for userspace to identify them solely based on that information.
This has long preceded anything done here.

Similarly, let's assign fixed namespace ids to the initial namespaces.

Kill the cookie and use a sequentially increasing number. This has the
nice side-effect that the owning user namespace will always have a
namespace id that is smaller than any of its descendant namespaces.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-15-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:17 +01:00
Christian Brauner 04173501a6
nstree: allow lookup solely based on inode
The namespace file handle struct nsfs_file_handle is uapi and userspace
is expressly allowed to generate file handles without going through
name_to_handle_at().

Allow userspace to generate a file handle where both the inode number
and the namespace type are zero and just pass in the unique namespace
id. The kernel uses the unified namespace tree to find the namespace and
open the file handle.

When the kernel creates a file handle via name_to_handle_at() it will
always fill in the type and the inode number allowing userspace to
retrieve core information.
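
A minimal userspace sketch of that id-only lookup. The struct layout and
the FILEID_NSFS handle type name used here are assumptions for
illustration (they are not spelled out above), and open_by_handle_at()
still needs its usual privileges:

int open_ns_by_id(int mnt_fd, uint64_t ns_id)
{
	/* Assumed uapi layout: a 64-bit namespace id followed by a 32-bit
	 * type and a 32-bit inode number, both of which stay zero here. */
	struct nsfs_file_handle {
		uint64_t ns_id;
		uint32_t ns_type;
		uint32_t ns_inum;
	};
	char buf[sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)];
	struct file_handle *fh = (struct file_handle *)buf;
	struct nsfs_file_handle *nsfh = (struct nsfs_file_handle *)fh->f_handle;

	memset(buf, 0, sizeof(buf));
	fh->handle_bytes = sizeof(*nsfh);
	fh->handle_type = FILEID_NSFS;	/* assumed handle type constant */
	nsfh->ns_id = ns_id;		/* type and inode left zero */

	return open_by_handle_at(mnt_fd, fh, O_RDONLY);
}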

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-14-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:17 +01:00
Christian Brauner 2ccaebc686
nstree: introduce a unified tree
This will allow userspace to lookup and stat a namespace simply by its
identifier without having to know what type of namespace it is.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-13-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:17 +01:00
Christian Brauner 8895d2a3db
ns: use anonymous struct to group list member
Make it easier to spot that they belong together conceptually.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-12-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:17 +01:00
Christian Brauner 3a18f80918
ns: add active reference count
The namespace tree is, among other things, currently used to support
file handles for namespaces. When a namespace is created it is placed on
the namespace trees and when it is destroyed it is removed from the
namespace trees.

While a namespace is on the namespace trees with a valid reference count
it is possible to reopen it through a namespace file handle. This is all
fine but has some issues that should be addressed.

On current kernels a namespace is visible to userspace in the
following cases:

(1) The namespace is in use by a task.
(2) The namespace is persisted through a VFS object (namespace file
    descriptor or bind-mount).
    Note that (2) only cares about direct persistence of the namespace
    itself, not indirect persistence via e.g. file->f_cred references or
    similar.
(3) The namespace is a hierarchical namespace type and is the parent of
    a single or multiple child namespaces.

Case (3) is interesting because it is possible that a parent namespace
fulfills neither (1) nor (2), i.e., is invisible to userspace, but may
still be resurrected through the NS_GET_PARENT ioctl().

Currently, namespace file handles allow much broader access to
namespaces than what is possible via (1)-(3). The reason is that
namespaces may remain pinned for completely internal reasons yet be
inaccessible to userspace.

For example, a user namespace may remain pinned by get_cred() calls that
stash the opener's credentials into file->f_cred. As it stands, file
handles allow resurrecting such a user namespace even though this should
not be possible via (1)-(3). This is a fundamental uapi change that we
shouldn't make if we don't have to.

Consider the following insane case: Various architectures support the
CONFIG_MMU_LAZY_TLB_REFCOUNT option which uses lazy TLB destruction.
When this option is set a userspace task's struct mm_struct may be used
for kernel threads such as the idle task and will only be destroyed once
the cpu's runqueue switches back to another task. But because of ptrace()
permission checks struct mm_struct stashes the user namespace of the
task that struct mm_struct originally belonged to. The kernel thread
will take a reference on the struct mm_struct and thus pin it.

So on an idle system user namespaces can be persisted for arbitrary
amounts of time which also means that they can be resurrected using
namespace file handles. That makes no sense whatsoever. The problem is
of course exacerbated on large systems with a huge number of cpus.

To handle this nicely we introduce an active reference count which
tracks (1)-(3). This is easy to do as all of these things are already
managed centrally. Only (1)-(3) will count towards the active reference
count and only namespaces which are active may be opened via namespace
file handles.

The problem is that namespaces may be resurrected, which means that they
can become temporarily inactive and be reactivated some time later.
Currently the only example of this is the SIOCGSKNS socket ioctl. The
SIOCGSKNS ioctl allows opening a network namespace file descriptor based
on a socket file descriptor.

If a socket is tied to a network namespace that subsequently becomes
inactive but that socket is persisted by another process in another
network namespace (e.g., via SCM_RIGHTS of pidfd_getfd()) then the
SIOCGSKNS ioctl will resurrect this network namespace.

So calls to open_related_ns() and open_namespace() will end up
resurrecting the corresponding namespace tree.

Note that the active reference count does not regulate the lifetime of
the namespace itself. This is still done by the normal reference count.
The active reference count can only be elevated if the regular reference
count is elevated.

The active reference count also doesn't regulate the presence of a
namespace on the namespace trees. It only regulates its visibility to
namespace file handles (and in later patches to listns()).

A namespace remains on the namespace trees from creation until its
actual destruction. This will allow the kernel to always reach any
namespace trivially and it will also enable subsystems like bpf to walk
the namespace lists on the system for tracing or general introspection
purposes.

Note that different namespaces have different visibility lifetimes on
current kernels. While most namespaces are immediately released when the
last task using them exits, the user- and pid namespace are persisted
and thus both remain accessible via /proc/<pid>/ns/<ns_type>.

The user namespace lifetime is aligned with struct cred and is only
released through exit_creds(). However, it becomes inaccessible to
userspace once the last task using it is reaped, i.e., when
release_task() is called and all proc entries are flushed. Similarly,
the pid namespace is also visible until the last task using it has been
reaped and the associated pid numbers are freed.

The active reference counts of the user- and pid namespace are
decremented once the task is reaped.
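
As a rough userspace illustration of case (2) above, simply holding an
open file descriptor to a namespace keeps it active, and a file handle
obtained from it can later be used to reopen it. This is only a sketch:
it assumes name_to_handle_at() works on nsfs files as described in the
earlier patches, and the caller must provide a struct file_handle with
room for MAX_HANDLE_SZ bytes of handle data.

int pin_net_namespace(struct file_handle *fh, int *mount_id)
{
	/* The open fd alone keeps the namespace active (case (2)),
	 * even once no task uses it anymore. */
	int nsfd = open("/proc/self/ns/net", O_RDONLY | O_CLOEXEC);

	if (nsfd < 0)
		return -1;

	fh->handle_bytes = MAX_HANDLE_SZ;
	if (name_to_handle_at(AT_FDCWD, "/proc/self/ns/net", fh, mount_id, 0) < 0) {
		close(nsfd);
		return -1;
	}
	/* Keep nsfd open; as long as it is, open_by_handle_at() on fh
	 * can reopen this namespace. */
	return nsfd;
}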

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-11-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:17 +01:00
Christian Brauner 4b06b70c82
ns: rename to exit_nsproxy_namespaces()
The current naming is very misleading as this really isn't exiting all
of the task's namespaces. It is only exiting the namespaces that hang
off of nsproxy. Reflect that in the name.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-10-2e6f823ebdc0@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:17 +01:00
Christian Brauner 6b053576ed
ns: add __ns_ref_read()
Implement ns_ref_read() the same way as ns_ref_{get,put}().
No point in making that any more special or different from the other
helpers.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-9-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:16 +01:00
Christian Brauner 3dd50c5866
ns: initialize ns_list_node for initial namespaces
Make sure that the list is always initialized for initial namespaces.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-8-2e6f823ebdc0@kernel.org
Fixes: 885fc8ac0a ("nstree: make iterator generic")
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:16 +01:00
Christian Brauner 0b1765830c
ns: use NS_COMMON_INIT() for all namespaces
Now that we have a common initializer use it for all static namespaces.

Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:16 +01:00
Christian Brauner d915fe20e5
ns: add NS_COMMON_INIT()
Add an initializer that can be used for the common ns initialization of
static namespaces such as most init namespaces.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/87ecqhy2y5.ffs@tglx
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:16 +01:00
Christian Brauner 8627bc8c7d
ns: add missing authorship
I authored the files a short while ago.

Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:39:20 +01:00
Thomas Gleixner 43cc54d8db s390/uaccess: Use unsafe wrappers for ASM GOTO
ASM GOTO is miscompiled by GCC when it is used inside an auto cleanup scope:

bool foo(u32 __user *p, u32 val)
{
	scoped_guard(pagefault)
		unsafe_put_user(val, p, efault);
	return true;
efault:
	return false;
}

It ends up leaking the pagefault disable counter in the fault path. clang
at least fails the build.

S390 is not affected for unsafe_*_user() as it uses its own local label
already, but __get/put_kernel_nofault() lack that.

Rename them to arch_*_kernel_nofault() which makes the generic uaccess
header wrap it with a local label that makes both compilers emit correct
code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Link: https://patch.msgid.link/20251027083745.483079889@linutronix.de
2025-11-03 15:26:10 +01:00
Thomas Gleixner 0988ea18c6 riscv/uaccess: Use unsafe wrappers for ASM GOTO
ASM GOTO is miscompiled by GCC when it is used inside an auto cleanup scope:

bool foo(u32 __user *p, u32 val)
{
	scoped_guard(pagefault)
		unsafe_put_user(val, p, efault);
	return true;
efault:
	return false;
}

It ends up leaking the pagefault disable counter in the fault path. clang
at least fails the build.

Rename unsafe_*_user() to arch_unsafe_*_user() which makes the generic
uaccess header wrap it with a local label that makes both compilers emit
correct code. Same for the kernel_nofault() variants.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027083745.419351819@linutronix.de
2025-11-03 15:26:10 +01:00
Thomas Gleixner 5002dd5314 powerpc/uaccess: Use unsafe wrappers for ASM GOTO
ASM GOTO is miscompiled by GCC when it is used inside an auto cleanup scope:

bool foo(u32 __user *p, u32 val)
{
	scoped_guard(pagefault)
		unsafe_put_user(val, p, efault);
	return true;
efault:
	return false;
}

It ends up leaking the pagefault disable counter in the fault path. clang
at least fails the build.

Rename unsafe_*_user() to arch_unsafe_*_user() which makes the generic
uaccess header wrap it with a local label that makes both compilers emit
correct code. Same for the kernel_nofault() variants.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027083745.356628509@linutronix.de
2025-11-03 15:26:09 +01:00
Thomas Gleixner 14219398e3 x86/uaccess: Use unsafe wrappers for ASM GOTO
ASM GOTO is miscompiled by GCC when it is used inside an auto cleanup scope:

bool foo(u32 __user *p, u32 val)
{
	scoped_guard(pagefault)
		unsafe_put_user(val, p, efault);
	return true;
efault:
	return false;
}

It ends up leaking the pagefault disable counter in the fault path. clang
at least fails the build.

Rename unsafe_*_user() to arch_unsafe_*_user() which makes the generic
uaccess header wrap it with a local label that makes both compilers emit
correct code. Same for the kernel_nofault() variants.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027083745.294359925@linutronix.de
2025-11-03 15:26:09 +01:00
Thomas Gleixner 3eb6660f26 uaccess: Provide ASM GOTO safe wrappers for unsafe_*_user()
ASM GOTO is miscompiled by GCC when it is used inside an auto cleanup scope:

bool foo(u32 __user *p, u32 val)
{
	scoped_guard(pagefault)
		unsafe_put_user(val, p, efault);
	return true;
efault:
	return false;
}

 e80:	e8 00 00 00 00       	call   e85 <foo+0x5>
 e85:	65 48 8b 05 00 00 00 00 mov    %gs:0x0(%rip),%rax
 e8d:	83 80 04 14 00 00 01 	addl   $0x1,0x1404(%rax)   // pf_disable++
 e94:	89 37                	mov    %esi,(%rdi)
 e96:	83 a8 04 14 00 00 01 	subl   $0x1,0x1404(%rax)   // pf_disable--
 e9d:	b8 01 00 00 00       	mov    $0x1,%eax           // success
 ea2:	e9 00 00 00 00       	jmp    ea7 <foo+0x27>      // ret
 ea7:	31 c0                	xor    %eax,%eax           // fail
 ea9:	e9 00 00 00 00       	jmp    eae <foo+0x2e>      // ret

which is broken as it leaks the pagefault disable counter on failure.

Clang at least fails the build.

Linus suggested to add a local label into the macro scope and let that
jump to the actual caller supplied error label.

	__label__ local_label;                                  \
	arch_unsafe_get_user(x, ptr, local_label);              \
	if (0) {                                                \
	local_label:                                            \
		goto label;                                     \
	}

That works for both GCC and clang.

clang:

 c80:	0f 1f 44 00 00       	   nopl   0x0(%rax,%rax,1)
 c85:	65 48 8b 0c 25 00 00 00 00 mov    %gs:0x0,%rcx
 c8e:	ff 81 04 14 00 00    	   incl   0x1404(%rcx)	   // pf_disable++
 c94:	31 c0                	   xor    %eax,%eax        // set retval to false
 c96:	89 37                      mov    %esi,(%rdi)      // write
 c98:	b0 01                	   mov    $0x1,%al         // set retval to true
 c9a:	ff 89 04 14 00 00    	   decl   0x1404(%rcx)     // pf_disable--
 ca0:	2e e9 00 00 00 00    	   cs jmp ca6 <foo+0x26>   // ret

The exception table entry points correctly to c9a

GCC:

 f70:   e8 00 00 00 00          call   f75 <baz+0x5>
 f75:   65 48 8b 05 00 00 00 00 mov    %gs:0x0(%rip),%rax
 f7d:   83 80 04 14 00 00 01    addl   $0x1,0x1404(%rax)  // pf_disable++
 f84:   8b 17                   mov    (%rdi),%edx
 f86:   89 16                   mov    %edx,(%rsi)
 f88:   83 a8 04 14 00 00 01    subl   $0x1,0x1404(%rax) // pf_disable--
 f8f:   b8 01 00 00 00          mov    $0x1,%eax         // success
 f94:   e9 00 00 00 00          jmp    f99 <baz+0x29>    // ret
 f99:   83 a8 04 14 00 00 01    subl   $0x1,0x1404(%rax) // pf_disable--
 fa0:   31 c0                   xor    %eax,%eax         // fail
 fa2:   e9 00 00 00 00          jmp    fa7 <baz+0x37>    // ret

The exception table entry points correctly to f99

So both compilers optimize out the extra goto and emit correct and
efficient code.

Provide a generic wrapper to do that to avoid modifying all the affected
architecture specific implementation with that workaround.

The only change required for architectures is to rename unsafe_*_user() to
arch_unsafe_*_user(). That's done in subsequent changes.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/877bweujtn.ffs@tglx
2025-11-03 15:26:09 +01:00
Thomas Gleixner 44c5b6768e ARM: uaccess: Implement missing __get_user_asm_dword()
When CONFIG_CPU_SPECTRE=n, get_user() is missing the 8-byte ASM variant
for no good reason. This prevents using get_user(u64) in generic code.

Implement it as a sequence of two 4-byte reads with LE/BE awareness and
make the type of the intermediate variable to read into (unsigned long
or long long) dependent on the target type.

The __long_type() macro and idea was lifted from PowerPC. Thanks to
Christophe for pointing it out.
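
A rough userspace model of the two-halves idea (the kernel version is
inline asm with fault handling; this only illustrates the
endianness-aware combination, and the names are purely illustrative):

#include <stdint.h>

static uint64_t load_u64_halves(const uint32_t *p)
{
	uint32_t a = p[0], b = p[1];

#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return ((uint64_t)a << 32) | b;	/* first word is the high half */
#else
	return ((uint64_t)b << 32) | a;	/* first word is the low half */
#endif
}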

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202509120155.pFgwfeUD-lkp@intel.com/
Link: https://patch.msgid.link/20251027083745.168468637@linutronix.de
2025-11-03 15:26:09 +01:00
Lukas Wunner 51d0656959 genirq/manage: Reduce priority of forced secondary interrupt handler
Crystal reports that the PCIe Advanced Error Reporting driver gets stuck
in an infinite loop on PREEMPT_RT:

Both the primary interrupt handler aer_irq() as well as the secondary
handler aer_isr() are forced into threads with identical priority.
Crystal writes that on the ARM system in question, the primary handler
has to clear an error in the Root Error Status register...

   "before the next error happens, or else the hardware will set the
    Multiple ERR_COR Received bit.  If that bit is set, then aer_isr()
    can't rely on the Error Source Identification register, so it scans
    through all devices looking for errors -- and for some reason, on
    this system, accessing the AER registers (or any Config Space above
    0x400, even though there are capabilities located there) generates
    an Unsupported Request Error (but returns valid data).  Since this
    happens more than once, without aer_irq() preempting, it causes
    another multi error and we get stuck in a loop."

The issue does not show on non-PREEMPT_RT because the primary handler
runs in hardirq context and thus can preempt the threaded secondary
handler, clear the Root Error Status register and prevent the secondary
handler from getting stuck.

Emulate the same behavior on PREEMPT_RT by assigning a lower default
priority to the secondary handler if the primary handler is forced into
a thread.

Reported-by: Crystal Wood <crwood@redhat.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Crystal Wood <crwood@redhat.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://patch.msgid.link/f6dcdb41be2694886b8dbf4fe7b3ab89e9d5114c.1761569303.git.lukas@wunner.de
Closes: https://lore.kernel.org/r/20250902224441.368483-1-crwood@redhat.com/
2025-11-01 21:30:02 +01:00
Frederic Weisbecker ba14500e4b timers/migration: Remove dead code handling idle CPU checking for remote timers
Idle migrators don't walk the whole tree in order to find out if there
are timers to migrate because they recorded the next deadline to be
verified within a single check in tmigr_requires_handle_remote().

Remove the related dead code and data.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024132536.39841-7-frederic@kernel.org
2025-11-01 20:38:25 +01:00
Frederic Weisbecker 93643b90d6 timers/migration: Remove unused "cpu" parameter from tmigr_get_group()
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024132536.39841-6-frederic@kernel.org
2025-11-01 20:38:25 +01:00
Frederic Weisbecker 3c8eb36e2a timers/migration: Assert that hotplug preparing CPU is part of stable active hierarchy
The CPU doing the prepare work for a remote target must be online from
the tree point of view and its hierarchy must be active, otherwise
propagating its active state up to the new root branch would be either
incorrect or racy.

Assert those conditions with more sanity checks.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024132536.39841-5-frederic@kernel.org
2025-11-01 20:38:25 +01:00
Frederic Weisbecker 5eb579dfd4 timers/migration: Fix imbalanced NUMA trees
When a CPU from a new node boots, the old root may happen to be
connected to the new root even if their node mismatch, as depicted in
the following scenario:

1) CPU 0 boots and creates the first group for node 0.

   [GRP0:0]
    node 0
      |
    CPU 0

2) CPU 1 from node 1 boots and creates a new top that corresponds to
   node 1, but it also connects the old root from node 0 to the new root
   from node 1 by mistake.

             [GRP1:0]
              node 1
            /        \
           /          \
   [GRP0:0]             [GRP0:1]
    node 0               node 1
      |                    |
    CPU 0                CPU 1

3) This eventually leads to an imbalanced tree where some node 0 CPUs
   migrate node 1 timers (and vice versa) way before reaching the
   crossnode groups, resulting in more frequent remote memory accesses
   than expected.

                      [GRP2:0]
                      NUMA_NO_NODE
                     /             \
             [GRP1:0]              [GRP1:1]
              node 1               node 0
            /        \                |
           /          \             [...]
   [GRP0:0]             [GRP0:1]
    node 0               node 1
      |                    |
    CPU 0...              CPU 1...

A balanced tree should only contain groups having children that belong
to the same node:

                      [GRP2:0]
                      NUMA_NO_NODE
                     /             \
             [GRP1:0]              [GRP1:0]
              node 0               node 1
            /        \             /      \
           /          \           /        \
   [GRP0:0]          [...]      [...]    [GRP0:1]
    node 0                                node 1
      |                                     |
    CPU 0...                              CPU 1...

In order to fix this, the hierarchy must be unfolded up to the crossnode
level as soon as a node mismatch is detected. For example the stage 2
above should lead to this layout:

                      [GRP2:0]
                      NUMA_NO_NODE
                     /             \
             [GRP1:0]              [GRP1:1]
              node 0               node 1
              /                         \
             /                           \
        [GRP0:0]                        [GRP0:1]
        node 0                           node 1
          |                                |
       CPU 0                             CPU 1

This means that not only GRP1:0 must be created but also GRP1:1 and
GRP2:0 in order to prepare a balanced tree for next CPUs to boot.

Fixes: 7ee9887703 ("timers: Implement the hierarchical pull model")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024132536.39841-4-frederic@kernel.org
2025-11-01 20:38:25 +01:00
Frederic Weisbecker fa9620355d timers/migration: Remove locking on group connection
Initializing the tmc's group, the group's number of children and the
group's parent can all be done without locking because:

  1) Reading the group's parent and its group mask is done locklessly.

  2) The connections prepared for a given CPU hierarchy are visible to the
     target CPU once online, thanks to the CPU hotplug enforced memory
     ordering.

  3) In case of a newly created upper level, the new root and its
     connections and initialization are made visible by the CPU which made
     the connections. When that CPUs goes idle in the future, the new link
     is published by tmigr_inactive_up() through the atomic RmW on
     ->migr_state.

  4) If CPUs were still walking up the active hierarchy, they could observe
     the new root earlier. In this case the ordering is enforced by an
     early initialization of the group mask and by barriers that maintain
     address dependency as explained in:

     b729cc1ec2 ("timers/migration: Fix another race between hotplug and idle entry/exit")
     de3ced72a7 ("timers/migration: Enforce group initialization visibility to tree walkers")

  5) Timers are propagated by a chain of group locking from the bottom to
     the top. And while doing so, the tree also propagates group links
     and initialization. Therefore remote expiration, which also relies
     on group locking, will observe those links and initialization while
     holding the root lock before walking the tree remotely and update
     remote timers. This is especially important for migrators in the
     active hierarchy that may observe the new root early.

Therefore the locking is unnecessary at initialization. If anything, it
just brings confusion. Remove it.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024132536.39841-3-frederic@kernel.org
2025-11-01 20:38:25 +01:00
Frederic Weisbecker 6c181b5667 timers/migration: Convert "while" loops to use "for"
Both the "do while" and "while" loops in tmigr_setup_groups() eventually
mimic the behaviour of "for" loops.

Simplify accordingly.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024132536.39841-2-frederic@kernel.org
2025-11-01 20:38:24 +01:00
Steve Wahl 4138787408 tick/sched: Limit non-timekeeper CPUs calling jiffies update
On large NUMA systems, while running a test program that saturates the
inter-processor and inter-NUMA links, acquiring the jiffies_lock can be
very expensive.

If the cpu designated to do jiffies updates (tick_do_timer_cpu) gets
delayed and other cpus decide to do the jiffies update themselves, a large
number of them decide to do so at the same time.

The inexpensive check against tick_next_period is far quicker than actually
acquiring the lock, so most of these get in line to obtain the lock.  If
obtaining the lock is slow enough, this spirals into the vast majority of
CPUs continuously being stuck waiting for this lock, just to obtain it and
find out that time has already been updated by another cpu. For example, on
one random entry to kdb by manually-injected NMI, 2912 of 3840 CPUs were
observed to be stuck there.

To avoid this, allow only one non-timekeeper CPU to call
tick_do_update_jiffies64() at any given time, resetting ts->stalled jiffies
only if the jiffies update function is actually called.

With this change, when manually interrupting the test, at most two CPUs
are observed to invoke tick_do_update_jiffies64() - the timekeeper and
one other.

Signed-off-by: Steve Wahl <steve.wahl@hpe.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Link: https://patch.msgid.link/20251027183456.343407-1-steve.wahl@hpe.com
2025-11-01 20:25:53 +01:00
Muchun Song 9ea2b810d5 genirq/proc: Fix race in show_irq_affinity()
Reading /proc/irq/N/smp_affinity* races with irq_set_affinity() and
irq_move_masked_irq(), leading to old or torn output for users.

After a user writes a new CPU mask to /proc/irq/N/affinity*, the syscall
returns success, yet a subsequent read of the same file immediately returns
a value different from what was just written.

That's due to a race between show_irq_affinity() and irq_move_masked_irq()
which lets the read observe a transient, inconsistent affinity mask.

Cure it by guarding the read with irq_desc::lock.

[ tglx: Massaged change log ]

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251028090408.76331-1-songmuchun@bytedance.com
2025-10-31 22:30:05 +01:00
Marc Zyngier 68c4c159a0 genirq: Fix percpu_devid irq affinity documentation
Stephen points out that some of the percpu_devid irq affinity
documentation is either missing or not matching the data structures.

Address all the issues in one go.

Fixes: 87b0031f7f ("irqdomain: Add firmware info reporting interface")
Fixes: 258e7d28a3 ("genirq: Add affinity to percpu_devid interrupt requests")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251030143032.2035987-1-maz@kernel.org
2025-10-31 22:25:34 +01:00
Mateusz Guzik 20052f2ef0
fs: touch up predicts in putname()
1. we already expect the refcount is 1.
2. path creation predicts name == iname

I verified this straightens out the asm, no functional changes.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251029134952.658450-1-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:18:08 +01:00
Christian Brauner a77a59592f
Merge patch series "Add and use folio_next_pos()"
Matthew Wilcox (Oracle) <willy@infradead.org> says:

It's relatively common in filesystems to want to know the end of the
current folio we're looking at.  So common in fact that btrfs has its own
helper for that.  Lift that helper to filemap and use it everywhere that
I've noticed it could be used.  This actually fixes a long-standing bug
in ocfs2 on 32-bit systems with files larger than 2GiB.  Presumably this
is not a common configuration, but I've marked it for backport anyway.

The other filesystems are all fine; none of them have a bug, they're
just mildly inefficient.  I think this should all go in via Christian's
tree, ideally with acks from the various fs maintainers (cc'd on their
individual patches).

* patches from https://patch.msgid.link/20251024170822.1427218-1-willy@infradead.org:
  mm: Use folio_next_pos()
  xfs: Use folio_next_pos()
  netfs: Use folio_next_pos()
  iomap: Use folio_next_pos()
  gfs2: Use folio_next_pos()
  f2fs: Use folio_next_pos()
  ext4: Use folio_next_pos()
  buffer: Use folio_next_pos()
  btrfs: Use folio_next_pos()
  filemap: Add folio_next_pos()

Link: https://patch.msgid.link/20251024170822.1427218-1-willy@infradead.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:44 +01:00
Matthew Wilcox (Oracle) 60a70e6143
mm: Use folio_next_pos()
This is one instruction more efficient than open-coding folio_pos() +
folio_size().  It's the equivalent of (x + y) << z rather than
x << z + y << z.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-11-willy@infradead.org
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:38 +01:00
Matthew Wilcox (Oracle) ac0a11113d
xfs: Use folio_next_pos()
This is one instruction more efficient than open-coding folio_pos() +
folio_size().  It's the equivalent of (x + y) << z rather than
x << z + y << z.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-10-willy@infradead.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Carlos Maiolino <cem@kernel.org>
Cc: linux-xfs@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:38 +01:00
Matthew Wilcox (Oracle) 2408900d40
netfs: Use folio_next_pos()
This is one instruction more efficient than open-coding folio_pos() +
folio_size().  It's the equivalent of (x + y) << z rather than
x << z + y << z.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-9-willy@infradead.org
Acked-by: David Howells <dhowells@redhat.com>
Reviewed-by: Paulo Alcantara (Red Hat) <pc@manguebit.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Paulo Alcantara <pc@manguebit.org>
Cc: netfs@lists.linux.dev
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:38 +01:00
Matthew Wilcox (Oracle) ac97520804
iomap: Use folio_next_pos()
This is one instruction more efficient than open-coding folio_pos() +
folio_size().  It's the equivalent of (x + y) << z rather than
x << z + y << z.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-8-willy@infradead.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: linux-xfs@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:38 +01:00
Matthew Wilcox (Oracle) 5f0fc78532
gfs2: Use folio_next_pos()
This is one instruction more efficient than open-coding folio_pos() +
folio_size().  It's the equivalent of (x + y) << z rather than
x << z + y << z.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-7-willy@infradead.org
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: gfs2@lists.linux.dev
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:38 +01:00
Matthew Wilcox (Oracle) 4fcafa30b7
f2fs: Use folio_next_pos()
This is one instruction more efficient than open-coding folio_pos() +
folio_size().  It's the equivalent of (x + y) << z rather than
x << z + y << z.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-6-willy@infradead.org
Reviewed-by: Chao Yu <chao@kernel.org>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Chao Yu <chao@kernel.org>
Cc: linux-f2fs-devel@lists.sourceforge.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:38 +01:00
Matthew Wilcox (Oracle) 4db47b2521
ext4: Use folio_next_pos()
This is one instruction more efficient than open-coding folio_pos() +
folio_size().  It's the equivalent of (x + y) << z rather than
x << z + y << z.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-5-willy@infradead.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:37 +01:00
Matthew Wilcox (Oracle) 6870892b64
buffer: Use folio_next_pos()
This is one instruction more efficient than open-coding folio_pos() +
folio_size().  It's the equivalent of (x + y) << z rather than
x << z + y << z.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-4-willy@infradead.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:37 +01:00
Matthew Wilcox (Oracle) 48f3784b17
btrfs: Use folio_next_pos()
btrfs defined its own variant of folio_next_pos() called folio_end().
This is an ambiguous name as 'end' might be exclusive or inclusive.
Switch to the new folio_next_pos().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-3-willy@infradead.org
Acked-by: David Sterba <dsterba@suse.com>
Cc: Chris Mason <clm@fb.com>
Cc: David Sterba <dsterba@suse.com>
Cc: linux-btrfs@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:37 +01:00
Matthew Wilcox (Oracle) 4511fd86db
filemap: Add folio_next_pos()
Replace the open-coded implementation in ocfs2 (which loses the top
32 bits on 32-bit architectures) with a helper in pagemap.h.
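
A small userspace illustration of the truncation being fixed, assuming a
4 KiB page size and a 32-bit index type purely for demonstration (this
is not the kernel helper itself):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t index = 0x00100000;		  /* page index at the 4 GiB mark */
	uint32_t truncated = index << 12;	  /* 32-bit shift wraps to 0 */
	uint64_t widened = (uint64_t)index << 12; /* widen first: full offset */

	printf("truncated=%u widened=%llu\n",
	       truncated, (unsigned long long)widened);
	return 0;
}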

Fixes: 35edec1d52 (ocfs2: update truncate handling of partial clusters)
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://patch.msgid.link/20251024170822.1427218-2-willy@infradead.org
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: ocfs2-devel@lists.linux.dev
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 13:11:37 +01:00
Josh Poimboeuf 6568f14cb5 vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN
An ftrace warning was reported in ftrace_init_ool_stub():

   WARNING: arch/powerpc/kernel/trace/ftrace.c:234 at ftrace_init_ool_stub+0x188/0x3f4, CPU#0: swapper/0

The problem is that the linker script is placing .text.startup in .text
rather than in .init.text, due to an inadvertent match of the TEXT_MAIN
'.text.[0-9a-zA-Z_]*' pattern.

This bug existed for some configurations before, but is only now coming
to light due to the TEXT_MAIN macro unification in commit 1ba9f89794
("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros").

The .text.startup section consists of constructors which are used by
KASAN, KCSAN, and GCOV.  The constructors are only called during boot,
so .text.startup is supposed to match the INIT_TEXT pattern so it can be
placed in .init.text and freed after init.  But since INIT_TEXT comes
*after* TEXT_MAIN in the linker script, TEXT_MAIN needs to manually
exclude .text.startup.

Update TEXT_MAIN to exclude .text.startup (and its .text.startup.*
variant from -ffunction-sections), along with .text.exit and
.text.exit.* which should match EXIT_TEXT.

Specifically, use a series of more specific glob patterns to match
generic .text.* sections (for -ffunction-sections) while explicitly
excluding .text.startup[.*] and .text.exit[.*].

Also update INIT_TEXT and EXIT_TEXT to explicitly match their
-ffunction-sections variants (.text.startup.* and .text.exit.*).

Fixes: 1ba9f89794 ("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros")
Closes: https://lore.kernel.org/72469502-ca37-4287-90b9-a751cecc498c@linux.ibm.com
Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Debugged-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Link: https://patch.msgid.link/07f74b4e5c43872572b7def30f2eac45f28675d9.1761872421.git.jpoimboe@kernel.org
2025-10-31 11:19:21 +01:00
Christian Brauner 36a304de26
nstree: simplify return
node_to_ns() already checks for NULL; the assert isn't really helpful
and will have to be dropped later anyway.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-7-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:16:24 +01:00
Christian Brauner 768b1565d9
cgroup: add cgroup namespace to tree after owner is set
Otherwise we trip VFS_WARN_ON_ONCE() in __ns_tree_add_raw().

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-6-2e6f823ebdc0@kernel.org
Fixes: 7c60593985 ("cgroup: support ns lookup")
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:16:24 +01:00
Christian Brauner 4af033dad6
nsfs: raise SB_I_NODEV and SB_I_NOEXEC
There's zero need for nsfs to allow device nodes or execution.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-5-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:16:24 +01:00
Christian Brauner b21cba8d87
pidfs: raise DCACHE_DONTCACHE explicitly
While pidfs dentries are never hashed, and thus retain_dentry() will
never consider placing them on the LRU, it isn't great to always have to
go and remember that. Raise DCACHE_DONTCACHE explicitly as a visual
marker that dentries aren't kept but freed immediately instead.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-4-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:16:24 +01:00
Christian Brauner 6dbe134e4b
nsfs: raise DCACHE_DONTCACHE explicitly
While nsfs dentries are never hashed, and thus retain_dentry() will
never consider placing them on the LRU, it isn't great to always have to
go and remember that. Raise DCACHE_DONTCACHE explicitly as a visual
marker that dentries aren't kept but freed immediately instead.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-3-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:16:23 +01:00
Christian Brauner 1e9a9be249
nsfs: use inode_just_drop()
Currently nsfs uses the default inode_generic_drop() fallback which
drops the inode when it's unlinked or when it's unhashed. Since nsfs
never hashes inodes that always amounts to dropping the inode.

But that's just annoying to have to reason through every time we look at
this code. Switch to inode_just_drop() which always drops the inode
explicitly. This also aligns the behavior with pidfs which does the
same.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-2-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:16:23 +01:00
Christian Brauner c9822fad80
libfs: allow to specify s_d_flags
Make it possible for pseudo filesystems to specify default dentry flags.

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-1-2e6f823ebdc0@kernel.org
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:16:23 +01:00
Eric Biggers 0bbb838f38
ecryptfs: Use MD5 library instead of crypto_shash
eCryptfs uses MD5 for a couple unusual purposes: to "mix" the key into
the IVs for file contents encryption (similar to ESSIV), and to prepend
some key-dependent bytes to the plaintext when encrypting filenames
(which is useless since eCryptfs encrypts the filenames with ECB).

Currently, eCryptfs computes these MD5 hashes using the crypto_shash
API.  Update it to instead use the MD5 library API.  This is simpler and
faster: the library doesn't require memory allocations, can't fail, and
provides direct access to MD5 without overhead such as indirect calls.

To preserve the existing behavior of eCryptfs support being disabled
when the kernel is booted with "fips=1", make ecryptfs_get_tree() check
fips_enabled itself.  Previously it relied on crypto_alloc_shash("md5")
failing.  I don't know for sure that this is actually needed; e.g., it
could be argued that eCryptfs's use of MD5 isn't for a security purpose
as far as FIPS is concerned.  But this preserves the existing behavior.

Tested by verifying that an existing eCryptfs can still be mounted with
a kernel that has this commit, with all the files matching.  Also tested
creating a filesystem with this commit and mounting+reading it without.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Link: https://patch.msgid.link/20251011200010.193140-1-ebiggers@kernel.org
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:12:35 +01:00
Pankaj Raghav 10436adf9d
iomap: use largest_zero_folio() in iomap_dio_zero()
iomap_dio_zero() uses custom allocated memory of zeroes for padding.
This was a temporary solution until there was a way to request a zero
folio larger than PAGE_SIZE.

Use the largest_zero_folio() function instead of the custom allocated
memory of zeroes. There is no guarantee that largest_zero_folio() will
always return a PMD-sized folio. Adapt the code so that it also works
if largest_zero_folio() returns a ZERO_PAGE.

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:12:35 +01:00
Thorsten Blum b2c43efc3c
initrd: Replace simple_strtol with kstrtoint to improve ramdisk_start_setup
Replace simple_strtol() with the recommended kstrtoint() for parsing the
'ramdisk_start=' boot parameter. Unlike simple_strtol(), which returns
a long, kstrtoint() converts the string directly to an integer and
avoids implicit casting.

Check the return value of kstrtoint() and reject invalid values. This
adds error handling while preserving existing behavior for valid values,
and removes use of the deprecated simple_strtol() helper.

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-31 10:12:32 +01:00
Nathan Chancellor 5ff8ad3909 kbuild: Add '-fms-extensions' to areas with dedicated CFLAGS
This is a follow up to commit c4781dc3d1 ("Kbuild: enable
-fms-extensions") but in a separate change due to being substantially
different from the initial submission.

There are many places within the kernel that use their own CFLAGS
instead of the main KBUILD_CFLAGS, meaning code written with the main
kernel's use of '-fms-extensions' in mind that may be tangentially
included in these areas will result in "error: declaration does not
declare anything" messages from the compiler.

Add '-fms-extensions' to all these areas to ensure consistency, along
with -Wno-microsoft-anon-tag to silence clang's warning about use of the
extension that the kernel cares about using. parisc does not build with
clang so it does not need this warning flag. LoongArch does not need it
either because -W flags from KBUILD_FLAGS are pulled into cflags-vdso.

Reported-by: Christian Brauner <brauner@kernel.org>
Closes: https://lore.kernel.org/20251030-meerjungfrau-getrocknet-7b46eacc215d@brauner/
Reviewed-by: Christian Brauner <brauner@kernel.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
2025-10-30 21:26:28 -04:00
Chen Ni 5eccd32239 objtool: Remove unneeded semicolon
Remove unnecessary semicolons reported by Coccinelle/coccicheck and the
semantic patch at scripts/coccinelle/misc/semicolon.cocci.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Link: https://patch.msgid.link/20251020020916.1070369-1-nichen@iscas.ac.cn
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-30 08:29:46 -07:00
Thorsten Blum 0ccf30fc64 x86/smpboot: Mark native_play_dead() as __noreturn
native_play_dead() ends by calling the non-returning function
hlt_play_dead() and therefore also never returns.

The !CONFIG_HOTPLUG_CPU stub version of native_play_dead()
unconditionally calls BUG() and does not return either.

Add the __noreturn attribute to both function definitions and their
declaration to document this behavior and to potentially improve
compiler optimizations.

Remove the obsolete comment, and add native_play_dead() to the objtool's
list of __noreturn functions.

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://patch.msgid.link/20251027155107.183136-1-thorsten.blum@linux.dev
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-30 08:29:41 -07:00
Yu Peng ca8313fd83 x86/microcode: Mark early_parse_cmdline() as __init
Fix section mismatch warning reported by modpost:

  .text:early_parse_cmdline() -> .init.data:boot_command_line

The function early_parse_cmdline() is only called during init and accesses
init data, so mark it __init to match its usage.

  [ bp: This happens only when the toolchain fails to inline the function and
    I haven't been able to reproduce it with any toolchain I'm using. Patch is
    obviously correct regardless. ]

Signed-off-by: Yu Peng <pengyu@kylinos.cn>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/all/20251030123757.1410904-1-pengyu@kylinos.cn
2025-10-30 14:33:31 +01:00
Borislav Petkov (AMD) 8d17104506 x86/microcode/AMD: Select which microcode patch to load
All microcode patches up to the proper BIOS Entrysign fix are loaded
only after the sha256 signature carried in the driver has been verified.

Microcode patches after the Entrysign fix has been applied do not need
that signature verification anymore.

In order to not abandon machines which haven't received the BIOS update
yet, add the capability to select which microcode patch to load.

The corresponding microcode container supplied through firmware-linux
has been modified to carry two patches per CPU type
(family/model/stepping) so that the proper one gets selected.

Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Tested-by: Waiman Long <longman@redhat.com>
Link: https://patch.msgid.link/20251027133818.4363-1-bp@kernel.org
2025-10-30 14:29:54 +01:00
Christian Brauner 036375522b pidfs: expose coredump signal
Userspace needs access to the signal that caused the coredump before the
coredumping process has been reaped. Expose it as part of the coredump
information in struct pidfd_info. After the process has been reaped that
info is also available as part of PIDFD_INFO_EXIT's exit_code field.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-8-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-30 14:25:14 +01:00
Christian Brauner 90df6ff685 pidfs: drop struct pidfs_exit_info
This is not needed anymore now that we have the new scheme to guarantee
all-or-nothing information exposure.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-7-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-30 14:25:14 +01:00
Christian Brauner ad6e3ea683 pidfs: prepare to drop exit_info pointer
There will likely be more info that we need to store in struct
pidfs_attr. We need to make sure that some of the information such as
exit info or coredump info that consists of multiple bits is either
available completely or not at all, but never partially. Currently we
use a pointer that we assign to. That doesn't scale. We can't waste a
pointer for each multi-part information struct we want to expose. Use a
bitmask instead.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-6-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-30 14:25:14 +01:00
Christian Brauner dfd78546c9 pidfd: add a new supported_mask field
Some of the future fields in struct pidfd_info can be optional. If the
kernel has nothing to emit in that field, then it doesn't set the flag
in the reply. This presents a problem: There is currently no way to know
what mask flags the kernel supports since one can't always count on them
being in the reply.

Add a new PIDFD_INFO_SUPPORTED_MASK flag and field that the kernel can
set in the reply. Userspace can use this to determine if the fields it
requires from the kernel are supported. This also gives us a way to
deprecate fields in the future, if that should become necessary.
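
A sketch of how userspace might use this. PIDFD_GET_INFO and struct
pidfd_info are the existing pidfd query interface; the supported_mask
field and PIDFD_INFO_SUPPORTED_MASK flag are the additions described
here, so treat the exact spelling as illustrative:

struct pidfd_info info = {
	.mask = PIDFD_INFO_SUPPORTED_MASK | PIDFD_INFO_EXIT,
};

if (ioctl(pidfd, PIDFD_GET_INFO, &info) == 0 &&
    (info.mask & PIDFD_INFO_SUPPORTED_MASK) &&
    !(info.supported_mask & PIDFD_INFO_EXIT))
	fprintf(stderr, "kernel cannot report exit info via this pidfd\n");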

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-5-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-30 14:25:13 +01:00
Christian Brauner d8fc51d8fa pidfs: add missing BUILD_BUG_ON() assert on struct pidfd_info
Validate that the size of struct pidfd_info is correctly updated.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-4-ca449b7b7aa0@kernel.org
Fixes: 1d8db6fd69 ("pidfs, coredump: add PIDFD_INFO_COREDUMP")
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-30 14:25:13 +01:00
Christian Brauner 4061c43a99 pidfs: add missing PIDFD_INFO_SIZE_VER1
We grew struct pidfd_info not too long ago.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-3-ca449b7b7aa0@kernel.org
Fixes: 1d8db6fd69 ("pidfs, coredump: add PIDFD_INFO_COREDUMP")
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-30 14:25:13 +01:00
Christian Brauner fe0e6ce3fd pidfs: fix PIDFD_INFO_COREDUMP handling
When PIDFD_INFO_COREDUMP is requested we raise it unconditionally in the
returned mask even if no coredump actually took place. This was done
because we assumed that the later check of whether ->coredump_mask is
non-zero would detect that it is zero and then retrieve the dumpability
settings from the task's mm. This has issues though because there are
tasks that might not have an mm. Also it's just not very cleanly
implemented. Fix this.

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-2-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-30 14:25:13 +01:00
Christian Brauner ccb3851ce7 pidfs: use guard() for task_lock
Use a guard().

Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-1-ca449b7b7aa0@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-30 14:25:13 +01:00
Peter Zijlstra aa7387e79a unwind_user/x86: Fix arch=um build
Add CONFIG_HAVE_UNWIND_USER_FP guards to make sure this code
doesn't break arch=um builds.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Closes: https://lore.kernel.org/oe-kbuild-all/202510291919.FFGyU7nq-lkp@intel.com/
2025-10-30 09:43:14 +01:00
Rasmus Villemoes c4781dc3d1
Kbuild: enable -fms-extensions
Once in a while, it turns out that enabling -fms-extensions could
allow some slightly prettier code. But every time it has come up, the
code that had to be used instead has been deemed "not too awful" and
not worth introducing another compiler flag for.

That's probably true for each individual case, but then it's somewhat
of a chicken/egg situation.

If we just "bite the bullet" as Linus says and enable it once and for
all, it is available whenever a use case turns up, and no individual
case has to justify it.

A lore.kernel.org search provides these examples:

- https://lore.kernel.org/lkml/200706301813.58435.agruen@suse.de/
- https://lore.kernel.org/lkml/20180419152817.GD25406@bombadil.infradead.org/
- https://lore.kernel.org/lkml/170622208395.21664.2510213291504081000@noble.neil.brown.name/
- https://lore.kernel.org/lkml/87h6475w9q.fsf@prevas.dk/
- https://lore.kernel.org/lkml/CAHk-=wjeZwww6Zswn6F_iZTpUihTSNKYppLqj36iQDDhfntuEw@mail.gmail.com/

Undoubtedly, there are more places in the code where this could also
be used but where -fms-extensions just didn't come up in any
discussion.
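
For reference, the main construct -fms-extensions unlocks is the anonymous
struct/union member declared by tag; an illustrative example (not from any
patch):

	struct name { const char *str; int len; };

	struct symbol {
		struct name;		/* anonymous member, only valid with -fms-extensions */
		unsigned long addr;
	};

	/* sym.str and sym.len are then directly accessible on a struct symbol */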

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: David Sterba <dsterba@suse.com>
Link: https://patch.msgid.link/20251020142228.1819871-2-linux@rasmusvillemoes.dk
[nathan: Move disabled clang warning to scripts/Makefile.extrawarn and
         adjust comment]
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
2025-10-29 16:23:47 -07:00
Nathan Chancellor a6773e6932
jfs: Rename _inline to avoid conflict with clang's '-fms-extensions'
Building fs/jfs with clang and '-fms-extensions' errors with:

  In file included from fs/jfs/jfs_unicode.c:8:
  fs/jfs/jfs_incore.h:86:13: error: type name does not allow function specifier to be specified
     86 |                                         unchar _inline[128];
        |                                                ^
  fs/jfs/jfs_incore.h:86:20: error: expected member name or ';' after declaration specifiers
     86 |                                         unchar _inline[128];
        |                                         ~~~~~~~~~~~~~~^

'-fms-extensions' in clang enables several other Microsoft specific
keywords such as _inline [1], presumably for compatibility with MSVC, as
Microsoft's documentation [2] mentions:

  For compatibility with previous versions, _inline and _forceinline are
  synonyms for __inline and __forceinline, respectively

Rename the _inline array in 'struct jfs_inode_info' to _inline_sym to
avoid this conflict, which is not a large workaround as this member is
only ever referred to via the i_inline macro.

Link: 249883d0c5/clang/include/clang/Basic/TokenKinds.def (L744-L79) [1]
Link: https://learn.microsoft.com/en-us/cpp/c-language/inline-functions [2]
Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Link: https://patch.msgid.link/20251023-jfs-fix-conflict-with-clang-ms-ext-v1-1-e219d59a1e68@kernel.org
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
2025-10-29 16:22:21 -07:00
Julian Sun 4952f35f05
fs: Make wbc_to_tag() inline and use it in fs.
The logic in wbc_to_tag() is widely used in file systems, so modify this
function to be inline and use it in file systems.

This patch has only passed compilation tests, but it should be fine.
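
A sketch of the tag selection this helper encapsulates (signature and exact
body are assumptions, not copied from the patch):

	static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc)
	{
		if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
			return PAGECACHE_TAG_TOWRITE;
		return PAGECACHE_TAG_DIRTY;
	}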

Signed-off-by: Julian Sun <sunjunchao@bytedance.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 23:33:48 +01:00
Christian Brauner 891bea757c
Merge patch series "allow file systems to increase the minimum writeback chunk size v2"
Christoph Hellwig <hch@lst.de> says:

The relatively low minimal writeback size of 4MiB means that
written back inodes on rotational media are switched a lot.  Besides
introducing additional seeks, this also can lead to extreme file
fragmentation on zoned devices when a lot of files are cached relative
to the available writeback bandwidth.

Add a superblock field that allows the file system to override the
default size, and set it to the zone size for zoned XFS.

* patches from https://patch.msgid.link/20251017034611.651385-1-hch@lst.de:
  xfs: set s_min_writeback_pages for zoned file systems
  writeback: allow the file system to override MIN_WRITEBACK_PAGES
  writeback: cleanup writeback_chunk_size

Link: https://patch.msgid.link/20251017034611.651385-1-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:54:36 +01:00
Christoph Hellwig 015a544077
xfs: set s_min_writeback_pages for zoned file systems
Set s_min_writeback_pages to the zone size, so that writeback always
writes up to a full zone.  This ensures that writeback does not add
spurious file fragmentation when writing back a large number of
files that are larger than the zone size.

Fixes: 4e4d520755 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251017034611.651385-4-hch@lst.de
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:54:31 +01:00
Christoph Hellwig 90db4d4441
writeback: allow the file system to override MIN_WRITEBACK_PAGES
The relatively low minimal writeback size of 4MiB means that written back
inodes on rotational media are switched a lot.  Besides introducing
additional seeks, this also can lead to extreme file fragmentation on
zoned devices when a lot of files are cached relative to the available
writeback bandwidth.

Add a superblock field that allows the file system to override the
default size.
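
A hedged sketch of how a file system would opt in, e.g. at fill_super time
(the field name is taken from this series; the value computed here is an
illustrative assumption):

	/* write back at least a full zone's worth of pages per inode switch */
	sb->s_min_writeback_pages = zone_size_bytes >> PAGE_SHIFT;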

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251017034611.651385-3-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:54:31 +01:00
Christoph Hellwig 151d0922bf
writeback: cleanup writeback_chunk_size
Return the pages directly when calculated instead of first assigning
them back to a variable, and directly return for the data integrity /
tagged case instead of going through an else clause.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251017034611.651385-2-hch@lst.de
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Nirjhar Roy (IBM) <nirjhar.roy.lists@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:54:31 +01:00
Christian Brauner 211c43d093
Merge patch series "filemap_* writeback interface cleanups v2"
Christoph Hellwig <hch@lst.de> says:

While looking at the filemap writeback code, I think adding
filemap_fdatawrite_wbc ended up being a mistake, as all but the original
btrfs caller should be using better high level interfaces instead.  This
series removes all these, switches btrfs to a more specific interfaces
and also cleans up another too low-level interface.  With this the
writeback_control that is passed to the writeback code is only
initialized in three places, although there are a lot more places in
file system code that never reach the common writeback code.

* patches from https://patch.msgid.link/20251024080431.324236-1-hch@lst.de:
  mm: rename filemap_fdatawrite_range_kick to filemap_flush_range
  mm: remove __filemap_fdatawrite_range
  mm: remove filemap_fdatawrite_wbc
  mm: remove __filemap_fdatawrite
  mm,btrfs: add a filemap_flush_nr helper
  btrfs: push struct writeback_control into start_delalloc_inodes
  btrfs: use the local tmp_inode variable in start_delalloc_inodes
  ocfs2: don't opencode filemap_fdatawrite_range in ocfs2_journal_submit_inode_data_buffers
  9p: don't opencode filemap_fdatawrite_range in v9fs_mmap_vm_close
  mm: don't opencode filemap_fdatawrite_range in filemap_invalidate_inode

Link: https://patch.msgid.link/20251024080431.324236-1-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:48 +01:00
Christoph Hellwig c28d67b33c
mm: rename filemap_fdatawrite_range_kick to filemap_flush_range
Rename filemap_fdatawrite_range_kick to filemap_flush_range because it
is the ranged version of filemap_flush.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-11-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:42 +01:00
Christoph Hellwig 45cbce5b88
mm: remove __filemap_fdatawrite_range
Use filemap_fdatawrite_range and filemap_fdatawrite_range_kick instead
of the low-level __filemap_fdatawrite_range that requires the caller
to know the internals of the writeback_control structure, and remove
__filemap_fdatawrite_range itself now that it is trivial and only two
callers would be left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-10-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:42 +01:00
Christoph Hellwig 1bcb413d0c
mm: remove filemap_fdatawrite_wbc
Replace filemap_fdatawrite_wbc, which exposes a writeback_control to the
callers with a filemap_writeback helper that takes all the possible
arguments and declares the writeback_control itself.
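
A hedged sketch of the shape of such a helper; the name comes from this
description, but the signature and body are assumptions (simplified, e.g.
cgroup writeback attachment is omitted):

	static int filemap_writeback(struct address_space *mapping,
				     enum writeback_sync_modes sync_mode,
				     loff_t start, loff_t end, long nr_to_write)
	{
		struct writeback_control wbc = {
			.sync_mode	= sync_mode,
			.nr_to_write	= nr_to_write,
			.range_start	= start,
			.range_end	= end,
		};

		return do_writepages(mapping, &wbc);
	}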

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-9-hch@lst.de
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:41 +01:00
Christoph Hellwig 7359651448
mm: remove __filemap_fdatawrite
And rewrite filemap_fdatawrite to use filemap_fdatawrite_range instead
to have a simpler call chain.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-8-hch@lst.de
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:41 +01:00
Christoph Hellwig 7fabcb7fba
mm,btrfs: add a filemap_flush_nr helper
Abstract out the btrfs-specific behavior of kicking off I/O on a number
of pages on an address_space into a well-defined helper.

Note: there is no kerneldoc comment for the new function because it is
not part of the public API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-7-hch@lst.de
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:41 +01:00
Christoph Hellwig c9501112e3
btrfs: push struct writeback_control into start_delalloc_inodes
In preparation for changing the filemap_fdatawrite_wbc API to not expose
the writeback_control to the callers, push the wbc declaration next to
the filemap_fdatawrite_wbc call and just pass the nr_to_write value to
start_delalloc_inodes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-6-hch@lst.de
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:41 +01:00
Christoph Hellwig 41e52c6447
btrfs: use the local tmp_inode variable in start_delalloc_inodes
start_delalloc_inodes has a struct inode * pointer available in the
main loop, use it instead of re-calculating it from the btrfs inode.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-5-hch@lst.de
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:41 +01:00
Christoph Hellwig 890f141da0
ocfs2: don't opencode filemap_fdatawrite_range in ocfs2_journal_submit_inode_data_buffers
Use filemap_fdatawrite_range instead of opencoding the logic using
filemap_fdatawrite_wbc.  There is a slight change in the conversion
as nr_to_write is now set to LONG_MAX instead of double the number
of the pages in the range.  LONG_MAX is the usual nr_to_write for
WB_SYNC_ALL writeback, and the value expected by lower layers here.
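
An illustrative before/after of such a conversion (generic names, not the
actual hunk):

	/* before: open-coded writeback_control fed to filemap_fdatawrite_wbc() */
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_ALL,
		.range_start	= start,
		.range_end	= end,
		/* nr_to_write derived from the number of pages in the range */
	};
	ret = filemap_fdatawrite_wbc(mapping, &wbc);

	/* after: the ranged helper, which uses LONG_MAX for WB_SYNC_ALL writeback */
	ret = filemap_fdatawrite_range(mapping, start, end);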

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-4-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:41 +01:00
Christoph Hellwig 3c2e5cee5e
9p: don't opencode filemap_fdatawrite_range in v9fs_mmap_vm_close
Use filemap_fdatawrite_range instead of opencoding the logic using
filemap_fdatawrite_wbc.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-3-hch@lst.de
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:41 +01:00
Christoph Hellwig a21134b5d6
mm: don't opencode filemap_fdatawrite_range in filemap_invalidate_inode
Use filemap_fdatawrite_range instead of opencoding the logic using
filemap_fdatawrite_wbc.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251024080431.324236-2-hch@lst.de
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-29 15:50:40 +01:00
Tengda Wu ced37e9cea x86/dumpstack: Prevent KASAN false positive warnings in __show_regs()
When triggering a stack dump via sysrq (echo t > /proc/sysrq-trigger),
KASAN may report false-positive out-of-bounds access:

  BUG: KASAN: out-of-bounds in __show_regs+0x4b/0x340
  Call Trace:
    dump_stack_lvl
    print_address_description.constprop.0
    print_report
    __show_regs
    show_trace_log_lvl
    sched_show_task
    show_state_filter
    sysrq_handle_showstate
    __handle_sysrq
    write_sysrq_trigger
    proc_reg_write
    vfs_write
    ksys_write
    do_syscall_64
    entry_SYSCALL_64_after_hwframe

The issue occurs as follows:

  Task A (walk other tasks' stacks)           Task B (running)
  1. echo t > /proc/sysrq-trigger
  show_trace_log_lvl
    regs = unwind_get_entry_regs()
    show_regs_if_on_stack(regs)
                                              2. The stack value pointed by
                                                 `regs` keeps changing, and
                                                 so are the tags in its
                                                 KASAN shadow region.
      __show_regs(regs)
        regs->ax, regs->bx, ...
          3. hit KASAN redzones, OOB

When task A walks task B's stack without suspending it, the continuous changes
in task B's stack (and corresponding KASAN shadow tags) may cause task A to
hit KASAN redzones when accessing obsolete values on the stack, resulting in
false positive reports.

Simply stopping the task before unwinding is not a viable fix, as it would
alter the very state we intend to inspect. This is especially true for diagnosing
misbehaving tasks (e.g., in a hard lockup), where stopping might fail or hide
the root cause by changing the call stack.

Therefore, fix this by disabling KASAN checks during asynchronous stack
unwinding, which is identified when the unwinding task does not match the
current task (task != current).

  [ bp: Align arguments on function's opening brace. ]
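
One common way to express such suppression is sketched below; whether the
patch uses these helpers or annotates the individual accesses instead is an
assumption:

	/* skip KASAN checks while dumping another task's live registers */
	if (task != current)
		kasan_disable_current();

	__show_regs(regs, SHOW_REGS_SHORT, log_lvl);

	if (task != current)
		kasan_enable_current();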

Fixes: 3b3fa11bc7 ("x86/dumpstack: Print any pt_regs found on the stack")
Signed-off-by: Tengda Wu <wutengda@huaweicloud.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://patch.msgid.link/all/20251023090632.269121-1-wutengda@huaweicloud.com
2025-10-29 13:07:21 +01:00
Peter Zijlstra c69993ecdd perf: Support deferred user unwind
Add support for deferred userspace unwind to perf.

Where perf currently relies on in-place stack unwinding; from NMI
context and all that. This moves the userspace part of the unwind to
right before the return-to-userspace.

This has two distinct benefits, the biggest is that it moves the
unwind to a faultable context. It becomes possible to fault in debug
info (.eh_frame, SFrame etc.) that might not otherwise be readily
available. And secondly, it de-duplicates the user callchain where
multiple samples happen during the same kernel entry.

To facilitate this the perf interface is extended with a new record
type:

  PERF_RECORD_CALLCHAIN_DEFERRED

and two new attribute flags:

  perf_event_attr::defer_callchain - to request the user unwind be deferred
  perf_event_attr::defer_output    - to request PERF_RECORD_CALLCHAIN_DEFERRED records

The existing PERF_RECORD_SAMPLE callchain section gets a new
context type:

  PERF_CONTEXT_USER_DEFERRED

After which will come a single entry, denoting the 'cookie' of the
deferred callchain that should be attached here, matching the 'cookie'
field of the above mentioned PERF_RECORD_CALLCHAIN_DEFERRED.

The 'defer_callchain' flag is expected on all events with
PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expected on the event
responsible for collecting side-band events (like mmap, comm etc.).
Setting 'defer_output' on multiple events will get you duplicated
PERF_RECORD_CALLCHAIN_DEFERRED records.

Based on earlier patches by Josh and Steven.
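
A hedged userspace sketch of how an event would request the deferred
unwind (the field names come from this description; the exact uapi layout
is an assumption):

	struct perf_event_attr attr = {
		.type		 = PERF_TYPE_HARDWARE,
		.config		 = PERF_COUNT_HW_CPU_CYCLES,
		.sample_type	 = PERF_SAMPLE_CALLCHAIN,
		.defer_callchain = 1,	/* defer the user part of the callchain */
	};
	/* set .defer_output = 1 only on the event collecting side-band records */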

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net
2025-10-29 10:29:58 +01:00
Peter Zijlstra ae25884ad7 unwind_user/x86: Teach FP unwind about start of function
When userspace is interrupted at the start of a function, before we
get a chance to complete the frame, unwind will miss one caller.

X86 has a uprobe specific fixup for this, add bits to the generic
unwinder to support this.

Suggested-by: Jens Remus <jremus@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251024145156.GM4068168@noisy.programming.kicks-ass.net
2025-10-29 10:29:58 +01:00
Josh Poimboeuf 49cf34c081 unwind_user/x86: Enable frame pointer unwinding on x86
Use ARCH_INIT_USER_FP_FRAME to describe how frame pointers are unwound
on x86, and enable CONFIG_HAVE_UNWIND_USER_FP accordingly so the
unwind_user interfaces can be used.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20250827193828.347397433@kernel.org
2025-10-29 10:29:58 +01:00
Peter Zijlstra c79dd946e3 unwind: Implement compat fp unwind
It is important to be able to unwind compat tasks too.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20250924080119.613695709@infradead.org
2025-10-29 10:29:57 +01:00
Peter Zijlstra 5578534e4b unwind: Simplify unwind_user_next_fp() alignment check
2^log_2(n) == n

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080119.497867836@infradead.org
2025-10-29 10:29:57 +01:00
Peter Zijlstra 639214f65b unwind: Make unwind_task_info::unwind_mask consistent
The unwind_task_info::unwind_mask was manipulated using a mixture of:

  regular store
  WRITE_ONCE()
  try_cmpxchg()
  set_bit()
  atomic_long_*()

Clean up and make it consistently atomic_long_t.
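
A short sketch of the resulting consistent style (context and field
placement assumed):

	atomic_long_t unwind_mask;

	atomic_long_set(&info->unwind_mask, 0);			/* was: plain store / WRITE_ONCE() */
	atomic_long_or(BIT(bit), &info->unwind_mask);		/* was: set_bit() */
	bits = atomic_long_read(&info->unwind_mask);
	atomic_long_try_cmpxchg(&info->unwind_mask, &old, new);	/* was: try_cmpxchg() */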

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20250924080119.384384486@infradead.org
2025-10-29 10:29:57 +01:00
Peter Zijlstra 42b9138f81 unwind: Simplify unwind_user_faultable()
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20250924080119.271671514@infradead.org
2025-10-29 10:29:56 +01:00
Peter Zijlstra 1e74829f36 unwind: Clarify calling context
The get_cookie() function hard relies on IRQs being disabled, but this
isn't immediately obvious when reading the function.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080119.122507632@infradead.org
2025-10-29 10:29:56 +01:00
Peter Zijlstra a38a64712e unwind: Fix unwind_deferred_request() vs NMI
task_work_add(TWA_RESUME) isn't NMI-safe; use TWA_NMI_CURRENT when
used from NMI context.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080119.005422353@infradead.org
2025-10-29 10:29:56 +01:00
Peter Zijlstra ae577ea0bc unwind: Add comment to unwind_deferred_task_exit()
Explain why unwind_deferred_task_exit() exists and what its constraints are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080118.893367437@infradead.org
2025-10-29 10:29:55 +01:00
Peter Zijlstra 52a1ec718b unwind: Simplify unwind_reset_info()
Invert the condition of the first if and make it an early exit to
reduce an indent level for the rest of the function.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080118.777916262@infradead.org
2025-10-29 10:29:55 +01:00
Peter Zijlstra b1164c7d11 unwind: Add required include files
To be self-sufficient, the file needs to include linux/types.h. This
provides things like u32/u64 and struct callback_head.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080118.665787071@infradead.org
2025-10-29 10:29:55 +01:00
Peter Zijlstra c31b9d2f58 unwind: Shorten lines
There are some exceptionally long lines that cause ugly wrapping.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080118.545274393@infradead.org
2025-10-29 10:29:54 +01:00
Peter Zijlstra ef1ea98c8f task_work: Fix NMI race condition
__schedule()
  // disable irqs
      <NMI>
	  task_work_add(current, work, TWA_NMI_CURRENT);
      </NMI>
  // current = next;
  // enable irqs
      <IRQ>
	  task_work_set_notify_irq()
	  test_and_set_tsk_thread_flag(current,
                                       TIF_NOTIFY_RESUME); // wrong task!
      </IRQ>
  // original task skips task work on its next return to user (or exit!)

Fixes: 466e4d801c ("task_work: Add TWA_NMI_CURRENT as an additional notify mode.")
Reported-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://patch.msgid.link/20250924080118.425949403@infradead.org
2025-10-29 10:29:54 +01:00
Zhang Rui 34976eaf5f perf/x86/intel/cstate: Add Pantherlake support
Like Lunarlake, Pantherlake supports CC1/CC6/CC7 and PC2/PC6/PC10.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Link: https://patch.msgid.link/20251023223754.1743928-4-zide.chen@intel.com
2025-10-29 10:29:54 +01:00
Zhang Rui 4ba45f041a perf/x86/intel/cstate: Remove PC3 support from LunarLake
LunarLake doesn't support Package C3. Remove the PC3 residency counter
support from LunarLake.

Fixes: 26579860fb ("perf/x86/intel/cstate: Add Lunarlake support")
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Link: https://patch.msgid.link/20251023223754.1743928-3-zide.chen@intel.com
2025-10-29 10:29:54 +01:00
Zide Chen e39b82f6cb perf/x86/intel/cstate: Add Clearwater Forest support
Clearwater Forest is based on the Darkmont Atom microarchitecture.
From the perspective of C-state residency profiling, it supports the
same residency counters as Sierra Forest: CC1/CC6, PC2/PC6, and MC6.

Please note that the C1E residency counter can only be read via PMT,
not MSR. Therefore, tools relying on the perf_event framework cannot
access the C1E residency.

Signed-off-by: Zhenyu Wang <zhenyuw.linux@gmail.com>
Signed-off-by: Zide Chen <zide.chen@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Link: https://patch.msgid.link/20251023223754.1743928-2-zide.chen@intel.com
2025-10-29 10:29:53 +01:00
Peter Zijlstra 977b9a0054 Merge branch 'linus/master' into sched/core, to resolve conflict
Conflicts:
	kernel/sched/ext.c

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2025-10-29 08:42:28 +01:00
Peter Zijlstra af13e5e437 sched: Fix the do_set_cpus_allowed() locking fix
Commit abfc01077d ("sched: Fix do_set_cpus_allowed() locking")
overlooked that __balance_push_cpu_stop() calls select_fallback_rq()
with rq->lock held. As a result, set_cpus_allowed_force() will
recursively take rq->lock and the machine locks up.

Run select_fallback_rq() earlier, without holding rq->lock. This opens
up a race window where a task could get migrated out from under us, but
that is harmless; we want the task migrated.

select_fallback_rq() itself will not be subject to concurrency as it
will be fully serialized by p->pi_lock, so there is no chance of
set_cpus_allowed_force() getting called with different arguments and
selecting different fallback CPUs for one task.

Fixes: abfc01077d ("sched: Fix do_set_cpus_allowed() locking")
Reported-by: Jan Polensky <japo@linux.ibm.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Jan Polensky <japo@linux.ibm.com>
Closes: https://lore.kernel.org/oe-lkp/202510271206.24495a68-lkp@intel.com
Link: https://patch.msgid.link/20251027110133.GI3245006@noisy.programming.kicks-ass.net
2025-10-28 15:00:48 +01:00
Peter Zijlstra b94d45b6bb seqlock: Allow KASAN to fail optimizing
Some KASAN builds are failing to properly optimize this code --
luckily we don't care about code quality for KASAN builds, so just
exclude it.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Closes: https://lore.kernel.org/oe-kbuild-all/202510251641.idrNXhv5-lkp@intel.com/
2025-10-28 09:58:57 +01:00
Yazen Ghannam 187d1b27a1 RAS/AMD/ATL: Require PRM support for future systems
Currently, the AMD Address Translation Library will fail to load for new,
unrecognized systems (based on Data Fabric revision). The intention is to
prevent the code from executing on new systems and returning incorrect
results.

Recent AMD systems, however, may provide UEFI PRM handlers for address
translation. This is code provided by the platform through BIOS tables.  These
are the preferred method for translation, and the Linux native code can be
used as a fallback.

Future AMD systems are expected to provide PRM handlers by default, and the
Linux native code will not be used.

Adjust the ATL init code so that new, unrecognized systems will default to
using PRM handlers only.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: "Mario Limonciello (AMD)" <superm1@kernel.org>
Link: https://patch.msgid.link/all/20251017-wip-atl-prm-v2-2-7ab1df4a5fbc@amd.com
2025-10-27 19:56:41 +01:00
Marc Zyngier fa9d277738 perf: arm_pmu: Kill last use of per-CPU cpu_armpmu pointer
Having removed the use of the cpu_armpmu per-CPU variable from the
interrupt handling, the only user left is the BRBE scheduler hook.

It is easy to drop the use of this variable by following the pointer to the
generic PMU structure, and get the arm_pmu structure from there.

Perform the conversion and kill cpu_armpmu altogether.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-27-maz@kernel.org
2025-10-27 17:16:37 +01:00
Marc Zyngier ebac4649fc irqdomain: Kill of_node_to_fwnode() helper
There are no in-tree users of this helper since b13b41cc3d ("misc:
ti_fpc202: Switch to of_fwnode_handle()"); it has been replaced with
of_fwnode_handle().

Get rid of it.

Suggested-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-26-maz@kernel.org
2025-10-27 17:16:37 +01:00
Marc Zyngier ee2d50a9f5 genirq: Kill irq_{g,s}et_percpu_devid_partition()
These two helpers do not have any user anymore, and can be removed,
together with the affinity field kept in the irqdesc structure.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-25-maz@kernel.org
2025-10-27 17:16:37 +01:00
Marc Zyngier c620438ef2 irqchip: Kill irq-partition-percpu
This code is now completely unused, and nobody will ever miss it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-24-maz@kernel.org
2025-10-27 17:16:36 +01:00
Marc Zyngier 7443813f10 irqchip/apple-aic: Drop support for custom PMU irq partitions
Similarly to what has been done for GICv3, drop the irq partitioning
support from the AIC driver, effectively merging the two per-cpu interrupts
for the PMU.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Sven Peter <sven@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-23-maz@kernel.org
2025-10-27 17:16:36 +01:00
Marc Zyngier 64b9738eaa irqchip/gic-v3: Drop support for custom PPI partitions
The only thing getting in the way of correctly handling PPIs the way they
were intended is the GICv3 hack that deals with PPI partitions.

Remove that code, allowing the common code to kick in.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-22-maz@kernel.org
2025-10-27 17:16:36 +01:00
Marc Zyngier 4cdf4813f5 coresight: trbe: Request specific affinities for per CPU interrupts
Let the TRBE driver request interrupts with an affinity mask matching the
TRBE implementation affinity.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Acked-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://patch.msgid.link/20251020122944.3074811-21-maz@kernel.org
2025-10-27 17:16:36 +01:00
Marc Zyngier f8112d29ba perf: arm_spe_pmu: Request specific affinities for per CPU interrupts
Let the SPE driver request interrupts with an affinity mask matching the SPE
implementation affinity.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-20-maz@kernel.org
2025-10-27 17:16:36 +01:00
Will Deacon 54b350fa8e perf: arm_pmu: Request specific affinities for per CPU NMIs/interrupts
Let the PMU driver request both NMIs and normal interrupts with an affinity mask
matching the PMU affinity.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-19-maz@kernel.org
2025-10-27 17:16:35 +01:00
Marc Zyngier c734af3b2b genirq: Add request_percpu_irq_affinity() helper
While it would be nice to simply make request_percpu_irq() take an affinity
mask, the churn is likely to be on the irritating side given that most
drivers do not give a damn about affinities.

So take the more innocuous path to provide a helper that parallels
request_percpu_irq(), with an affinity as a bonus argument.

Yes, request_percpu_irq_affinity() is a bit of a mouthful.
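
A hedged usage sketch; the argument order of the new helper is an
assumption, and only request_percpu_irq() below is the existing interface:

	/* new: like request_percpu_irq(), but with an affinity mask */
	err = request_percpu_irq_affinity(irq, handler, "arm-pmu",
					  &pmu_affinity, pmu_dev_id);

	/* existing callers stay as they are */
	err = request_percpu_irq(irq, handler, "arm-pmu", pmu_dev_id);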

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-18-maz@kernel.org
2025-10-27 17:16:35 +01:00
Marc Zyngier bdf4e2ac29 genirq: Allow per-cpu interrupt sharing for non-overlapping affinities
Interrupt sharing for percpu-devid interrupts is forbidden, and for good
reasons. These are interrupts generated *from* a CPU and handled by itself
(timer, for example). Nobody in their right mind would put two devices on
the same pin (and if they have, they get to keep the pieces...).

But this also prevents more benign cases, where devices are connected
to groups of CPUs, and for which the affinities are not overlapping.
Effectively, the only thing they share is the interrupt number, and
nothing else.

Tweak the definition of IRQF_SHARED applied to percpu_devid interrupts to
allow this particular use case. This results in extra validation at the
point of the interrupt being set up and freed, as well as a tiny bit of
extra complexity for interrupts at handling time (to pick the correct
irqaction).

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-17-maz@kernel.org
2025-10-27 17:16:35 +01:00
Marc Zyngier b9c6aa9efc genirq: Update request_percpu_nmi() to take an affinity
Continue spreading the notion of affinity to the per CPU interrupt request
code by updating the call sites that use request_percpu_nmi() (all two of
them) to take an affinity pointer. This pointer is firmly NULL for now.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-16-maz@kernel.org
2025-10-27 17:16:35 +01:00
Marc Zyngier 258e7d28a3 genirq: Add affinity to percpu_devid interrupt requests
Add an affinity field to both the irqaction structure and the interrupt
request primitives. Nothing is making use of it yet, and the only value
used is NULL, which serves as a shorthand for cpu_possible_mask.

This will shortly get used with actual affinities.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-15-maz@kernel.org
2025-10-27 17:16:34 +01:00
Marc Zyngier 9047a39daa genirq: Factor-in percpu irqaction creation
Move the code creating a per-cpu irqaction into its own helper, so that
future changes to this code can be kept localised.

At the same time, fix the documentation which appears to say the wrong
thing when it comes to interrupts being automatically enabled
(percpu_devid interrupts never are).

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251020122944.3074811-14-maz@kernel.org
2025-10-27 17:16:34 +01:00
Marc Zyngier 5c2b2cc472 genirq: Merge irqaction::{dev_id,percpu_dev_id}
When irqaction::percpu_dev_id was introduced, it was hoped that it could be
part of an anonymous union with dev_id, as the two fields are mutually
exclusive.

However, toolchains used at the time were often showing terrible support
for anonymous unions, breaking the build on a number of architectures. It
was therefore decided to keep the two fields separate and address this down
the line.

14 years later, the compiler dark age is over, and there is universal
support for anonymous unions. Get a whole pointer back that can immediately
be spent on something else.
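
A sketch of the resulting layout (surrounding members omitted):

	struct irqaction {
		irq_handler_t		handler;
		union {
			void			*dev_id;
			void __percpu		*percpu_dev_id;
		};
		/* ... */
	};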

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-13-maz@kernel.org
2025-10-27 17:16:34 +01:00
Marc Zyngier 5ff78c8de9 genirq: Kill handle_percpu_devid_fasteoi_nmi()
There is no in-tree user of this flow handler anymore, so simply remove it.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-12-maz@kernel.org
2025-10-27 17:16:34 +01:00
Marc Zyngier 21bbbc50f3 irqchip/gic-v3: Switch high priority PPIs over to handle_percpu_devid_irq()
It so appears that handle_percpu_devid_irq() is extremely similar to
handle_percpu_devid_fasteoi_nmi(), and that the differences do not justify
the horrid machinery in the GICv3 driver to handle the flow handler switch.

Stick with the standard flow handler, even for NMIs.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-11-maz@kernel.org
2025-10-27 17:16:34 +01:00
Marc Zyngier f6c8aced7c perf: arm_spe_pmu: Convert to new interrupt affinity retrieval API
Now that the relevant interrupt controllers are equipped with a callback
returning the affinity of per-CPU interrupts, switch the ARM SPE driver
over to this new method.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jinjie Ruan <ruanjinjie@huawei.com>
Link: https://patch.msgid.link/20251020122944.3074811-10-maz@kernel.org
2025-10-27 17:16:33 +01:00
Marc Zyngier 663783e001 perf: arm_pmu: Convert to the new interrupt affinity retrieval API
Now that the relevant interrupt controllers are equipped with a callback
returning the affinity of per-CPU interrupts, switch the OF side of the ARM
PMU driver over to this new method.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jinjie Ruan <ruanjinjie@huawei.com>
Link: https://patch.msgid.link/20251020122944.3074811-9-maz@kernel.org
2025-10-27 17:16:33 +01:00
Marc Zyngier 541454dd20 coresight: trbe: Convert to the new interrupt affinity retrieval API
Now that the relevant interrupt controllers are equipped with a callback
returning the affinity of per-CPU interrupts, switch the TRBE driver over
to this new method.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Acked-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://patch.msgid.link/20251020122944.3074811-8-maz@kernel.org
2025-10-27 17:16:33 +01:00
Marc Zyngier de575de83c irqchip/apple-aic: Add FW info retrieval support
Plug the new .get_fwspec_info() callback into the Apple AIC driver, using
some of the existing FIQ affinity handling infrastructure.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Acked-by: Sven Peter <sven@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-7-maz@kernel.org
2025-10-27 17:16:33 +01:00
Marc Zyngier 68905ea65c irqchip/gic-v3: Add FW info retrieval support
Plug the new .get_fwspec_info() callback into the GICv3 core driver, using
some of the existing PPI affinity handling infrastructure.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-6-maz@kernel.org
2025-10-27 17:16:33 +01:00
Marc Zyngier 0d5daa938c platform: Add firmware-agnostic irq and affinity retrieval interface
Expand platform_get_irq_optional() to also return an affinity if available,
renaming it to platform_get_irq_affinity() in the process.

platform_get_irq_optional() is preserved with its current semantics by
calling into the new helper with a NULL affinity pointer.
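
A hedged sketch of the compatibility wrapper; the exact type of the
affinity argument is an assumption:

	int platform_get_irq_optional(struct platform_device *dev, unsigned int num)
	{
		return platform_get_irq_affinity(dev, num, NULL);
	}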

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251020122944.3074811-5-maz@kernel.org
2025-10-27 17:16:32 +01:00
Marc Zyngier 5404f5c06d of/irq: Add interrupt affinity reporting interface
Plug the irq_populate_fwspec_info() helper into the OF layer to offer an
interrupt affinity reporting function.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251020122944.3074811-4-maz@kernel.org
2025-10-27 17:16:32 +01:00
Marc Zyngier 5324fe21ba ACPI: irq: Add interrupt affinity reporting interface
Plug the irq_populate_fwspec_info() helper into the ACPI layer to offer an
interrupt affinity reporting function. This is currently only supported for
the CONFIG_ACPI_GENERIC_GSI configurations, but could later be extended to
legacy architectures if necessary.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Acked-by: Rafael J. Wysocki (Intel) <rafael@kernel.org>
Link: https://patch.msgid.link/20251020122944.3074811-3-maz@kernel.org
2025-10-27 17:16:32 +01:00
Marc Zyngier 87b0031f7f irqdomain: Add firmware info reporting interface
Add an irqdomain callback to report firmware-provided information that is
otherwise not available in a generic way. This is reported using a new data
structure (struct irq_fwspec_info).

This callback is optional and the only information that can be reported
currently is the affinity of an interrupt. However, the containing
structure is designed to be extensible, allowing other potentially relevant
information to be reported in the future.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Will Deacon <will@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251020122944.3074811-2-maz@kernel.org
2025-10-27 17:16:32 +01:00
Yazen Ghannam 83be4bee57 ACPI: PRM: Add acpi_prm_handler_available()
Add a helper function to check if a PRM handler/module is present.

This can be used during init time by code that depends on a particular
handler. If the handler is not present, then the code does not need to
be loaded.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: "Mario Limonciello (AMD)" <superm1@kernel.org>
Acked-by: "Rafael J. Wysocki (Intel)" <rafael@kernel.org>
Link: https://patch.msgid.link/all/20251017-wip-atl-prm-v2-1-7ab1df4a5fbc@amd.com
2025-10-27 15:45:22 +01:00
Borislav Petkov (AMD) 4058386498 - Remove dead code leftovers after a recent mitigations cleanup which fail
a Clang build
 
 - Make sure a Retbleed mitigation message is printed only when necessary
 
 - Correct the last Zen1 microcode revision for which Entrysign sha256 check is
   needed
 
 - Fix a NULL ptr deref when mounting the resctrl fs on a system which supports
   assignable counters but where L3 total and local bandwidth monitoring has
   been disabled at boot
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmj+FSYACgkQEsHwGGHe
 VUqh5RAAwTAfMsEs57v6gQqnm/rbNjGXoZuNcT9xhk4jbRC7xCcyJrZVyYA+mWIe
 5rgGOuSThOsOgqJHwVqn4kdym9yUwLradZS8gn5vHFIlDVXDoMRYJuvm8U7PdTug
 UWJv0uw0B393RNb+7yCeEN7Zpe2bvbh25PF66uh/7dQYKmWIaiTVlDhrZ+Ba51IK
 mmJzbVb6zqWrSP3heISZRjfV3rv+/SifUb+wIgWcCzcAb36fFIlUKaEYd/g5249R
 BBcEY5n/eUUKjMJVOki4vDqJyQdPdJCz9yH3qdZaz661Wh9/FVy/rLCQC/O1ruwt
 Ovoi6UJAjleb0OXfi00Hl1LT3v92xH/OwyVCamBAYyaIhTdPaoQS6YADGstt3PTx
 RUc/BG5wHyaOWsG94zVEvqK9MElyjW3DPiBH4E+O7OB348WAfhsbrUDnnaveDSym
 n2LivNnkiaXi8DpPhWL7XsJJjYAy1fi2piDrh952I5oVfhf5iYeNwFjNdtgAft7G
 wNr01qraqdPKfMYHZHdkaqrPH/Qy9DlLuDuTjQqtjGm8lsZK/g+txzQLfeXoDJSe
 RtKtRYlq0bVCOnAuA8MN4xi9H2WaKAZNgavJxywZslmaQuQzh21g7ISwxcAFe07n
 nevcypF1s/dnCUPK8yuKTmFzkwbg7I2OgrmX0RKZdFxY8uzg4Co=
 =EZGc
 -----END PGP SIGNATURE-----

Merge tag 'x86_urgent_for_v6.18_rc3' into x86/microcode

Pick up the below urgent upstream change in order to base more work
on top of it:

- Correct the last Zen1 microcode revision for which Entrysign sha256 check is
  needed

Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2025-10-27 14:06:38 +01:00
Charles Mirabile 539d147ef6 irqchip/sifive-plic: Add support for UltraRISC DP1000 PLIC
Add a new compatible for the plic found in UltraRISC DP1000 with a quirk to
work around a known hardware bug with IRQ claiming in the UR-CP100 cores.

When claiming an interrupt on UR-CP100 cores, all other interrupts must be
disabled before the claim register is accessed to prevent incorrect
handling of the interrupt. This is a hardware bug in the CP100 core
implementation, not specific to the DP1000 SoC.

When the PLIC_QUIRK_CP100_CLAIM_REGISTER_ERRATUM flag is present, a
specialized handler (plic_handle_irq_cp100) disables all interrupts except
for the first pending one before reading the claim register, and then
restores the interrupts before further processing of the claimed interrupt
continues.

This implementation leverages the enable_save optimization, which maintains
the current interrupt enable state in memory, avoiding additional register
reads during the workaround.

The driver matches on "ultrarisc,cp100-plic" to apply the quirk to all
SoCs using UR-CP100 cores, regardless of the specific SoC implementation.
This has no impact on other platforms.

[ tglx: Condensed the code a bit, massaged change log and comments ]

Co-developed-by: Zhang Xincheng <zhangxincheng@ultrarisc.com>
Signed-off-by: Zhang Xincheng <zhangxincheng@ultrarisc.com>
Signed-off-by: Charles Mirabile <cmirabil@redhat.com>
Signed-off-by: Lucas Zampieri <lzampier@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Samuel Holland <samuel.holland@sifive.com>
Link: https://patch.msgid.link/20251024083647.475239-5-lzampier@redhat.com
2025-10-27 12:11:56 +01:00
Matthew Wilcox (Oracle) 70e0a80a1f treewide: Remove in_irq()
This old alias for in_hardirq() has been marked as deprecated since
2020; remove the stragglers.
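
The replacement is purely mechanical, e.g.:

	-	if (in_irq())
	+	if (in_hardirq())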

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024180654.1691095-1-willy@infradead.org
2025-10-24 21:39:27 +02:00
Charles Mirabile 14ff9e54dd irqchip/sifive-plic: Cache the interrupt enable state
Optimize the PLIC driver by maintaining the interrupt enable state in the
handler's enable_save array during normal operation rather than only during
suspend/resume. This eliminates the need to read enable registers during
suspend and makes the enable state immediately available for other
purposes.

Let __plic_toggle() update both the hardware registers and the cached
enable_save state atomically within the existing enable_lock protection.

That allows removing the suspend-time enable register reading, since
handler::enable_save now always reflects the current state.

[ tglx: Massaged change log ]

Signed-off-by: Charles Mirabile <cmirabil@redhat.com>
Signed-off-by: Lucas Zampieri <lzampier@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024083647.475239-4-lzampier@redhat.com
2025-10-24 21:34:32 +02:00
Charles Mirabile 9dfb295a93 dt-bindings: interrupt-controller: Add UltraRISC DP1000 PLIC
Add compatible strings for the PLIC found in UltraRISC DP1000 SoC.

The PLIC is part of the UR-CP100 core and has a hardware bug requiring
a workaround.

Signed-off-by: Charles Mirabile <cmirabil@redhat.com>
Signed-off-by: Lucas Zampieri <lzampier@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20251024083647.475239-3-lzampier@redhat.com
2025-10-24 21:34:32 +02:00
Lucas Zampieri e95f66dd0e dt-bindings: vendor-prefixes: Add UltraRISC
Add vendor prefix for UltraRISC Technology Co., Ltd.

Signed-off-by: Lucas Zampieri <lzampier@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://patch.msgid.link/20251024083647.475239-2-lzampier@redhat.com
2025-10-24 21:34:31 +02:00
Josh Poimboeuf f6af8690d1 perf build: Fix perf build issues with fixdep
Commit a808a2b35f ("tools build: Fix fixdep dependencies") broke the
perf build ("make -C tools/perf") by introducing two inadvertent
conflicts:

  1) tools/build/Makefile includes tools/build/Makefile.include, which
     defines a phony 'fixdep' target.  This conflicts with the $(FIXDEP)
     file target in tools/build/Makefile when OUTPUT is empty, causing
     make to report duplicate recipes for the same target.

  2) The FIXDEP variable in tools/build/Makefile conflicts with the
     previously existing one in tools/perf/Makefile.perf.

Remove the unnecessary include of tools/build/Makefile.include from
tools/build/Makefile, and rename the FIXDEP variable in
tools/perf/Makefile.perf to FIXDEP_BUILT.

Fixes: a808a2b35f ("tools build: Fix fixdep dependencies")
Reported-by: Thorsten Leemhuis <linux@leemhuis.info>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Thorsten Leemhuis <linux@leemhuis.info>
Link: https://patch.msgid.link/8881bc3321bd9fa58802e4f36286eefe3667806b.1760992391.git.jpoimboe@kernel.org
2025-10-23 09:53:49 +02:00
Josh Poimboeuf 9025688bf6 module: Fix device table module aliases
Commit 6717e8f91d ("kbuild: Remove 'kmod_' prefix from
__KBUILD_MODNAME") inadvertently broke module alias generation for
modules which rely on MODULE_DEVICE_TABLE().

It removed the "kmod_" prefix from __KBUILD_MODNAME, which caused
MODULE_DEVICE_TABLE() to generate a symbol name which no longer matched
the format expected by handle_moddevtable() in scripts/mod/file2alias.c.

As a result, modpost failed to find the device tables, leading to
missing module aliases.

Fix this by explicitly adding the "kmod_" string within the
MODULE_DEVICE_TABLE() macro itself, restoring the symbol name to the
format expected by file2alias.c.

Fixes: 6717e8f91d ("kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME")
Reported-by: Alexander Stein <alexander.stein@ew.tq-group.com>
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reported-by: Mark Brown <broonie@kernel.org>
Reported-by: Cosmin Tanislav <demonsingur@gmail.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Cosmin Tanislav <demonsingur@gmail.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Mark Brown <broonie@kernel.org>
Tested-by: Alexander Stein <alexander.stein@ew.tq-group.com>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Tested-by: Anders Roxell <anders.roxell@linaro.org>
Link: https://patch.msgid.link/e52ee3edf32874da645a9e037a7d77c69893a22a.1760982784.git.jpoimboe@kernel.org
2025-10-22 15:21:55 +02:00
Boqun Feng 37d0472c8a rust: debugfs: Implement Reader for Mutex<T> only when T is Unpin
Since we are going to make `Mutex<T>` structurally pin the data (i.e.
`T`), therefore `.lock()` function only returns a `Guard` that can
dereference a mutable reference to `T` if only `T` is `Unpin`, therefore
restrict the impl `Reader` block of `Mutex<T>` to that.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Danilo Krummrich <dakr@kernel.org>
Link: https://patch.msgid.link/20251022034237.70431-1-boqun.feng@gmail.com
2025-10-22 15:21:51 +02:00
Borislav Petkov (AMD) da247eff96 objtool/klp: Add the debian-based package name of xxhash to the hint
Add the debian package name for the devel version of the xxHash package
"libxxhash-dev".

Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20251017194732.7713-1-bp@kernel.org
2025-10-22 13:51:11 +02:00
Oleg Nesterov 795aab353d seqlock: Change do_io_accounting() to use scoped_seqlock_read()
To simplify the code and make it more readable.

[peterz: change to new interface]
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-21 12:31:57 +02:00
Oleg Nesterov b76f72bea2 seqlock: Change do_task_stat() to use scoped_seqlock_read()
To simplify the code and make it more readable.

[peterz: change to new interface]
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-21 12:31:57 +02:00
Oleg Nesterov 488f48b326 seqlock: Change thread_group_cputime() to use scoped_seqlock_read()
To simplify the code and make it more readable.

While at it, change thread_group_cputime() to use __for_each_thread(sig).

[peterz: update to new interface]
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-21 12:31:57 +02:00
Peter Zijlstra cc39f3872c seqlock: Introduce scoped_seqlock_read()
The read_seqbegin/need_seqretry/done_seqretry API is cumbersome and
error prone. With the new helper the "typical" code like

	int seq, nextseq;
	unsigned long flags;

	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&seqlock, &seq);

		// read-side critical section

		nextseq = 1;
	} while (need_seqretry(&seqlock, seq));
	done_seqretry_irqrestore(&seqlock, seq, flags);

can be rewritten as

	scoped_seqlock_read (&seqlock, ss_lock_irqsave) {
		// read-side critical section
	}

Original idea by Oleg Nesterov; with contributions from Linus.

Originally-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-21 12:31:57 +02:00
Oleg Nesterov 28a0ee3119 documentation: seqlock: fix the wrong documentation of read_seqbegin_or_lock/need_seqretry
The comments and pseudo code in Documentation/locking/seqlock.rst are wrong:

	int seq = 0;
	do {
		read_seqbegin_or_lock(&foo_seqlock, &seq);

		/* ... [[read-side critical section]] ... */

	} while (need_seqretry(&foo_seqlock, seq));

read_seqbegin_or_lock() always returns with an even "seq" and need_seqretry()
doesn't change this counter. This means that seq is always even and thus the
locking pass is simply impossible.

IOW, "_or_lock" has no effect and this code doesn't differ from

	do {
		seq = read_seqbegin(&foo_seqlock);

		/* ... [[read-side critical section]] ... */

	} while (read_seqretry(&foo_seqlock, seq));

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-21 12:31:56 +02:00
Arnd Bergmann 44472d1b83 atomic: Skip alignment check for try_cmpxchg() old arg
The 'old' argument in atomic_try_cmpxchg() and related functions is a
pointer to a normal non-atomic integer number, which is not required
to be naturally aligned, unlike the atomic_t/atomic64_t types themselves.

In order to add an alignment check with CONFIG_DEBUG_ATOMIC into the
normal instrument_atomic_read_write() helper, change this check to use
the non-atomic instrument_read_write(), the same way that was done
earlier for try_cmpxchg() in commit ec570320b0 ("locking/atomic:
Correct (cmp)xchg() instrumentation").

This prevents warnings on m68k calling the 32-bit atomic_try_cmpxchg()
with 16-bit aligned arguments as well as several more architectures
including x86-32 when calling atomic64_try_cmpxchg() with 32-bit
aligned u64 arguments.
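
A simplified sketch of the instrumented wrapper after the change, following
the pattern of atomic-instrumented.h (the exact generated code is an
assumption):

	static __always_inline bool
	atomic_try_cmpxchg(atomic_t *v, int *old, int new)
	{
		instrument_atomic_read_write(v, sizeof(*v));
		/* 'old' is a plain int: instrument without the natural-alignment check */
		instrument_read_write(old, sizeof(*old));
		return raw_atomic_try_cmpxchg(v, old, new);
	}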

Reported-by: Finn Thain <fthain@linux-m68k.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/cover.1757810729.git.fthain@linux-m68k.org/
2025-10-21 12:31:56 +02:00
Daniel Almeida 66f1ea83d9 rust: lock: Add a Pin<&mut T> accessor
In order for callers to be able to access the inner T safely if T:
!Unpin, there needs to be a way to get a Pin<&mut T>. Add this accessor
and a corresponding example to tell users how it works.

This requires the pin projection functionality [1] for better ergonomics.

[boqun: Apply Daniel's fix to the code example, add the reference to pin
projection patch and remove out-of-date part in the commit log]

Suggested-by: Benno Lossin <lossin@kernel.org>
Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Daniel Almeida <daniel.almeida@collabora.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Reviewed-by: Benno Lossin <lossin@kernel.org>
Link: https://github.com/Rust-for-Linux/linux/issues/1181
Link: https://lore.kernel.org/rust-for-linux/20250912174148.373530-1-lossin@kernel.org/ [1]
2025-10-21 12:31:56 +02:00
Daniel Almeida 2497a7116f rust: lock: Pin the inner data
In preparation to support Lock<T> where T is pinned, the first thing
that needs to be done is to structurally pin the 'data' member. This
switches the 't' parameter in Lock<T>::new() to take in an impl
PinInit<T> instead of a plain T. This in turn uses the blanket
implementation "impl PinInit<T> for T".

Subsequent patches will touch on Guard<T>.

Suggested-by: Benno Lossin <lossin@kernel.org>
Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Daniel Almeida <daniel.almeida@collabora.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Benno Lossin <lossin@kernel.org>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Link: https://github.com/Rust-for-Linux/linux/issues/1181
2025-10-21 12:31:55 +02:00
Daniel Almeida da123f0ee4 rust: lock: guard: Add T: Unpin bound to DerefMut
A core property of pinned types is that safe code must not hand out a
mutable reference to the inner data, as this trivially allows that data
to be moved.

Enforce this condition by adding a bound on lock::Guard's DerefMut
implementation, so that it's only implemented for pinning-agnostic
types.

Suggested-by: Benno Lossin <lossin@kernel.org>
Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Daniel Almeida <daniel.almeida@collabora.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Benno Lossin <lossin@kernel.org>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Link: https://github.com/Rust-for-Linux/linux/issues/1181
2025-10-21 12:31:55 +02:00
Alexander Sverdlin c14ecb555c locking/spinlock/debug: Fix data-race in do_raw_write_lock
KCSAN reports:

BUG: KCSAN: data-race in do_raw_write_lock / do_raw_write_lock

write (marked) to 0xffff800009cf504c of 4 bytes by task 1102 on cpu 1:
 do_raw_write_lock+0x120/0x204
 _raw_write_lock_irq
 do_exit
 call_usermodehelper_exec_async
 ret_from_fork

read to 0xffff800009cf504c of 4 bytes by task 1103 on cpu 0:
 do_raw_write_lock+0x88/0x204
 _raw_write_lock_irq
 do_exit
 call_usermodehelper_exec_async
 ret_from_fork

value changed: 0xffffffff -> 0x00000001

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 1103 Comm: kworker/u4:1 6.1.111

Commit 1a365e8223 ("locking/spinlock/debug: Fix various data races") has
addressed most of these races, but it appears to be incomplete: in
do_raw_write_lock(), only the debug_write_lock_after() part has been
converted to WRITE_ONCE(), but not the debug_write_lock_before() part.
Do it now.
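
A hedged sketch of the kind of change, based on the existing helpers in
kernel/locking/spinlock_debug.c (not necessarily the exact diff):

	static inline void debug_write_lock_before(rwlock_t *lock)
	{
		RWLOCK_BUG_ON(READ_ONCE(lock->magic) != RWLOCK_MAGIC, lock, "bad magic");
		RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
		RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
							lock, "cpu recursion");
	}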

Fixes: 1a365e8223 ("locking/spinlock/debug: Fix various data races")
Reported-by: Adrian Freihofer <adrian.freihofer@siemens.com>
Signed-off-by: Alexander Sverdlin <alexander.sverdlin@siemens.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Cc: stable@vger.kernel.org
2025-10-21 12:31:55 +02:00
Christophe JAILLET 27d2afa3b4 x86/ioapic: Simplify mp_irqdomain_alloc() slightly
The IRQ return value of irq_find_mapping() is only tested
for existence, not used for anything else.

So, this call can be replaced by a slightly simpler
irq_resolve_mapping() call, which reduces generated
code size a bit (x86-64 allmodconfig):

   text	   data	    bss	    dec	    hex	filename
  82142	  38633	  18048	 138823	  21e47	arch/x86/kernel/apic/io_apic.o.before
  81932	  38633	  18048	 138613	  21d75	arch/x86/kernel/apic/io_apic.o.after
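
The transformation is essentially the following hedged sketch
(irq_find_mapping() returns an IRQ number, irq_resolve_mapping() returns
a struct irq_data pointer, so a plain NULL check is enough):

	/* before: reverse lookup of the IRQ number, only to test it */
	if (irq_find_mapping(domain, hwirq))
		...

	/* after: same existence test without the number lookup */
	if (irq_resolve_mapping(domain, hwirq))
		...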

[ mingo: Fixed & simplified the changelog ]

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-janitors@vger.kernel.org
Link: https://patch.msgid.link/cb3a4968538637aac3a5ae4f5ecc4f5eb43376ea.1760861877.git.christophe.jaillet@wanadoo.fr
2025-10-21 08:47:33 +02:00
Julian Sun d6e6215907
writeback: Add logging for slow writeback (exceeds sysctl_hung_task_timeout_secs)
When a writeback work lasts for sysctl_hung_task_timeout_secs, we want
to identify that there are tasks waiting for a long time; this helps us
pinpoint potential issues.

Additionally, recording the starting jiffies is useful when debugging a
crashed vmcore.

Signed-off-by: Julian Sun <sunjunchao@bytedance.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:39 +02:00
Julian Sun 1888635532
writeback: Wake up waiting tasks when finishing the writeback of a chunk.
Writing back a large number of pages can take a lot of time.
This issue is exacerbated when the underlying device is slow or
subject to block layer rate limiting, which in turn triggers
unexpected hung task warnings.

We can trigger a wake-up once a chunk has been written back and the
waiting time for writeback exceeds half of
sysctl_hung_task_timeout_secs.
This action allows the hung task detector to be aware of the writeback
progress, thereby eliminating these unexpected hung task warnings.

This patch has passed the xfstests 'check -g quick' test based on ext4,
with no additional failures introduced.

Signed-off-by: Julian Sun <sunjunchao@bytedance.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:39 +02:00
Christian Brauner 11f2af2a80
Merge patch series "hide ->i_state behind accessors"
Mateusz Guzik <mjguzik@gmail.com> says:

Open-coded accesses prevent asserting they are done correctly. One
obvious aspect is locking, but significantly more can be checked. For
example, it can be detected when the code is clearing flags which are
already missing, or is setting flags when it is illegal (e.g., I_FREEING
when ->i_count > 0).

In order to keep things manageable this patchset merely gets the thing
off the ground with only lockdep checks baked in.

Current consumers can be trivially converted.

Suppose flags I_A and I_B are to be handled.

If ->i_lock is held, then:

state = inode->i_state          => state = inode_state_read(inode)
inode->i_state |= (I_A | I_B)   => inode_state_set(inode, I_A | I_B)
inode->i_state &= ~(I_A | I_B)  => inode_state_clear(inode, I_A | I_B)
inode->i_state = I_A | I_B      => inode_state_assign(inode, I_A | I_B)

If ->i_lock is not held or only held conditionally:

state = inode->i_state          => state = inode_state_read_once(inode)
inode->i_state |= (I_A | I_B)   => inode_state_set_raw(inode, I_A | I_B)
inode->i_state &= ~(I_A | I_B)  => inode_state_clear_raw(inode, I_A | I_B)
inode->i_state = I_A | I_B      => inode_state_assign_raw(inode, I_A | I_B)

The "_once" vs "_raw" discrepancy stems from the read variant differing
by READ_ONCE as opposed to just lockdep checks.

Finally, if you want to atomically clear flags and set new ones, the
following:

state = inode->i_state;
state &= ~I_A;
state |= I_B;
inode->i_state = state;

turns into:

inode_state_replace(inode, I_A, I_B);

* patches from https://lore.kernel.org/20251009075929.1203950-1-mjguzik@gmail.com:
  fs: make plain ->i_state access fail to compile
  xfs: use the new ->i_state accessors
  nilfs2: use the new ->i_state accessors
  overlayfs: use the new ->i_state accessors
  gfs2: use the new ->i_state accessors
  f2fs: use the new ->i_state accessors
  smb: use the new ->i_state accessors
  ceph: use the new ->i_state accessors
  btrfs: use the new ->i_state accessors
  Manual conversion to use ->i_state accessors of all places not covered by coccinelle
  Coccinelle-based conversion to use ->i_state accessors
  fs: provide accessors for ->i_state
  fs: spell out fenced ->i_state accesses with explicit smp_wmb/smp_rmb
  fs: move wait_on_inode() from writeback.h to fs.h

Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:28 +02:00
Mateusz Guzik 2ed81b4bef
fs: make plain ->i_state access fail to compile
... to make sure all accesses are properly validated.

Merely renaming the var to __i_state still lets the compiler make the
following suggestion:
error: 'struct inode' has no member named 'i_state'; did you mean '__i_state'?

Unfortunately some people will add the __'s and call it a day.

In order to make it harder to mess up in this way, hide it behind a
struct. The resulting error message should be convincing in terms of
checking what to do:
error: invalid operands to binary & (have 'struct inode_state_flags' and 'int')

Of course people determined to do a plain access can still do it, but
nothing can be done for that case.
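
A hedged sketch of the approach (field names are illustrative, not the
exact kernel layout):

	struct inode_state_flags {
		unsigned long __state;		/* only touched via the accessors */
	};

	struct inode {
		/* ... other members ... */
		struct inode_state_flags i_state;
	};

	/*
	 * A plain bitwise access such as "inode->i_state & I_NEW" now fails:
	 * error: invalid operands to binary & (have 'struct inode_state_flags' and 'int')
	 */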

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:28 +02:00
Mateusz Guzik 18c61399f6
xfs: use the new ->i_state accessors
Change generated with coccinelle and fixed up by hand as appropriate.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:27 +02:00
Mateusz Guzik a18d43041b
nilfs2: use the new ->i_state accessors
Change generated with coccinelle and fixed up by hand as appropriate.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:27 +02:00
Mateusz Guzik ff175a4fc2
overlayfs: use the new ->i_state accessors
Change generated with coccinelle and fixed up by hand as appropriate.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:27 +02:00
Mateusz Guzik 40a4c512ad
gfs2: use the new ->i_state accessors
Change generated with coccinelle and fixed up by hand as appropriate.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:27 +02:00
Mateusz Guzik ba69118c52
f2fs: use the new ->i_state accessors
Change generated with coccinelle and fixed up by hand as appropriate.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:27 +02:00
Mateusz Guzik f5a67689ba
smb: use the new ->i_state accessors
Change generated with coccinelle and fixed up by hand as appropriate.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:27 +02:00
Mateusz Guzik fa49168ea0
ceph: use the new ->i_state accessors
Change generated with coccinelle and fixed up by hand as appropriate.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:27 +02:00
Mateusz Guzik 7b12a794bf
btrfs: use the new ->i_state accessors
Change generated with coccinelle and fixed up by hand as appropriate.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:26 +02:00
Mateusz Guzik f5aa78e2be
Manual conversion to use ->i_state accessors of all places not covered by coccinelle
Nothing to look at apart from iput_final().

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:26 +02:00
Mateusz Guzik b4dbfd8653
Coccinelle-based conversion to use ->i_state accessors
All places were patched by coccinelle with the default assumption that
->i_lock is held; afterwards, entries were fixed up by hand to use the
unlocked variants as needed.

The script:
@@
expression inode, flags;
@@

- inode->i_state & flags
+ inode_state_read(inode) & flags

@@
expression inode, flags;
@@

- inode->i_state &= ~flags
+ inode_state_clear(inode, flags)

@@
expression inode, flag1, flag2;
@@

- inode->i_state &= ~flag1 & ~flag2
+ inode_state_clear(inode, flag1 | flag2)

@@
expression inode, flags;
@@

- inode->i_state |= flags
+ inode_state_set(inode, flags)

@@
expression inode, flags;
@@

- inode->i_state = flags
+ inode_state_assign(inode, flags)

@@
expression inode, flags;
@@

- flags = inode->i_state
+ flags = inode_state_read(inode)

@@
expression inode, flags;
@@

- READ_ONCE(inode->i_state) & flags
+ inode_state_read(inode) & flags
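
Such a semantic patch would typically be applied tree-wide with an
invocation along these lines (illustrative file name and target
directory, not taken from this commit):

	spatch --sp-file i_state.cocci --in-place --dir fs/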

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:26 +02:00
Mateusz Guzik d8753f788a
fs: provide accessors for ->i_state
Open-coded accesses prevent asserting they are done correctly. One
obvious aspect is locking, but significantly more can be checked. For
example, it can be detected when the code is clearing flags which are
already missing, or is setting flags when it is illegal (e.g., I_FREEING
when ->i_count > 0).

In order to keep things manageable this patchset merely gets the thing
off the ground with only lockdep checks baked in.

Current consumers can be trivially converted.

Suppose flags I_A and I_B are to be handled.

If ->i_lock is held, then:

state = inode->i_state          => state = inode_state_read(inode)
inode->i_state |= (I_A | I_B)   => inode_state_set(inode, I_A | I_B)
inode->i_state &= ~(I_A | I_B)  => inode_state_clear(inode, I_A | I_B)
inode->i_state = I_A | I_B      => inode_state_assign(inode, I_A | I_B)

If ->i_lock is not held or only held conditionally:

state = inode->i_state          => state = inode_state_read_once(inode)
inode->i_state |= (I_A | I_B)   => inode_state_set_raw(inode, I_A | I_B)
inode->i_state &= ~(I_A | I_B)  => inode_state_clear_raw(inode, I_A | I_B)
inode->i_state = I_A | I_B      => inode_state_assign_raw(inode, I_A | I_B)

The "_once" vs "_raw" discrepancy stems from the read variant differing
by READ_ONCE as opposed to just lockdep checks.

Finally, if you want to atomically clear flags and set new ones, the
following:

state = inode->i_state;
state &= ~I_A;
state |= I_B;
inode->i_state = state;

turns into:

inode_state_replace(inode, I_A, I_B);
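
A hedged sketch of what a lockdep-checked accessor pair could look like
(the actual implementation may differ in types and checks):

	static inline unsigned long inode_state_read(struct inode *inode)
	{
		lockdep_assert_held(&inode->i_lock);
		return inode->i_state;
	}

	static inline void inode_state_set(struct inode *inode, unsigned long flags)
	{
		lockdep_assert_held(&inode->i_lock);
		inode->i_state |= flags;
	}

	/* unlocked read variant: no assertion, but goes through READ_ONCE() */
	static inline unsigned long inode_state_read_once(struct inode *inode)
	{
		return READ_ONCE(inode->i_state);
	}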

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:26 +02:00
Mateusz Guzik cb5db358ab
fs: spell out fenced ->i_state accesses with explicit smp_wmb/smp_rmb
The incoming helpers don't ship with _release/_acquire variants, for
the time being anyway.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:26 +02:00
Mateusz Guzik af6023e2ce
fs: move wait_on_inode() from writeback.h to fs.h
The only consumer outside of fs/inode.c is gfs2 and it already includes
fs.h in the relevant file.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:26 +02:00
Mateusz Guzik 31e332b911
fs: add missing fences to I_NEW handling
Suppose there are 2 CPUs racing inode hash lookup func (say ilookup5())
and unlock_new_inode().

In principle the latter can clear the I_NEW flag before prior stores
into the inode were made visible.

The former can in turn observe I_NEW is cleared and proceed to use the
inode, while possibly reading from not-yet-published areas.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:25 +02:00
Mateusz Guzik 0f607a89af
ocfs2: retire ocfs2_drop_inode() and I_WILL_FREE usage
This postpones the writeout to ocfs2_evict_inode(), which I'm told is
fine (tm).

The intent is to retire the I_WILL_FREE flag.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Reviewed-by: Joel Becker <jlbec@evilplan.org>
Reviewed-by: Mark Tinguely <amrk.tinguely@oracle.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:25 +02:00
Mateusz Guzik be97a4b63c
fs: assert on ->i_count in iput_final()
Notably make sure the count is 0 after the return from ->drop_inode(),
provided we are going to drop.

Inspired by suspicious games played by f2fs.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:25 +02:00
Mateusz Guzik dc816f8d92
fs: assert ->i_lock held in __iget()
Also remove the now redundant comment.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:22:25 +02:00
Joanne Koong 87a13819dd
iomap: rename iomap_readpage_ctx struct to iomap_read_folio_ctx
->readpage was deprecated and reads are now on folios.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:21:26 +02:00
Joanne Koong 8805a9c64b
iomap: rename iomap_readpage_iter() to iomap_read_folio_iter()
->readpage was deprecated and reads are now on folios.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:21:26 +02:00
Joanne Koong e0e15340e4
iomap: iterate over folio mapping in iomap_readpage_iter()
Iterate over all non-uptodate ranges of a folio mapping in a single call
to iomap_readpage_iter() instead of leaving the partial iteration to the
caller.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:21:26 +02:00
Joanne Koong 7aa6bc3e87
iomap: adjust read range correctly for non-block-aligned positions
iomap_adjust_read_range() assumes that the position and length passed in
are block-aligned. This is not always the case however, as shown in the
syzbot generated case for erofs. This causes too many bytes to be
skipped for uptodate blocks, which results in returning the incorrect
position and length to read in. If all the blocks are uptodate, this
underflows length and returns a position beyond the folio.

Fix the calculation to also take into account the block offset when
calculating how many bytes can be skipped for uptodate blocks.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:21:25 +02:00
Joanne Koong d1f9893fcd
iomap: store read/readahead bio generically
Store the iomap_readpage_ctx bio generically as a "void *read_ctx".
This makes the read/readahead interface more generic, which allows it to
be used by filesystems that may not be block-based and may not have
CONFIG_BLOCK set.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:21:25 +02:00
Joanne Koong ca82a7ea22
iomap: simplify iomap_iter_advance()
Most callers of iomap_iter_advance() do not need the remaining length
returned. Get rid of the extra iomap_length() call that
iomap_iter_advance() does.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:21:25 +02:00
Joanne Koong 7588469b5e
iomap: move read/readahead bio submission logic into helper function
Move the read/readahead bio submission logic into a separate helper.
This is needed to make iomap read/readahead more generically usable,
especially for filesystems that do not require CONFIG_BLOCK.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:21:25 +02:00
Joanne Koong 573c14c821
iomap: move bio read logic into helper function
Move the iomap_readpage_iter() bio read logic into a separate helper
function, iomap_bio_read_folio_range(). This is needed to make iomap
read/readahead more generically usable, especially for filesystems that
do not require CONFIG_BLOCK.

Additionally rename buffered write's iomap_read_folio_range() function
to iomap_bio_read_folio_range_sync() to better describe its synchronous
behavior.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-10-20 20:21:25 +02:00
Christophe JAILLET ac646f4495 genirq/msi: Slightly simplify msi_domain_alloc()
The return value of irq_find_mapping() is only tested, not used for
anything else.

Replace it with irq_resolve_mapping(), which is used internally by
irq_find_mapping() and allows a simple boolean decision.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/1ce680114cdb8d40b072c54d7f015696a540e5a6.1760863194.git.christophe.jaillet@wanadoo.fr
2025-10-20 20:18:48 +02:00
Johan Hovold a7f25e00c4 irqchip/qcom-irq-combiner: Rename driver structure
The "_probe" suffix of the driver structure name prevents modpost from
warning about section mismatches so replace it to catch any future
issues like the recently fixed probe function being incorrectly marked
as __init.

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2025-10-17 15:18:18 +02:00
Yazen Ghannam 6553c68bc7 RAS/AMD/ATL: Return error codes from helper functions
Pass up error codes from helper functions rather than discarding them.

Suggested-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2025-10-17 14:38:42 +02:00
Nam Cao dce7450093 PCI/MSI: Delete pci_msi_create_irq_domain()
pci_msi_create_irq_domain() is now unused. Delete it.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
2025-10-16 21:09:52 +02:00
Samuel Holland 3a16b05384 irqchip/riscv-imsic: Inline imsic_vector_from_local_id()
This function is only called from one place, which is in the interrupt
handling hot path. Inline it to improve code generation and to take
advantage of this_cpu operations. lpriv and imsic->base_domain can never be
NULL because irq_set_chained_handler() is called after they are allocated.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2025-10-16 18:17:28 +02:00
Samuel Holland 79eaabc61d irqchip/riscv-imsic: Embed the vector array in lpriv
Reduce pointer chasing and the number of allocations by using a flexible
array member for the vector array instead of a separate allocation.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2025-10-16 18:17:28 +02:00
Samuel Holland c475c0b713 irqchip/riscv-imsic: Remove redundant irq_data lookups
imsic_irq_set_affinity() already takes the irq_data pointer as a
parameter, so it is pointless to look it up again from the IRQ number.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2025-10-16 18:17:28 +02:00
Johan Hovold dcc31768ff irqchip/ts4800: Drop unused module alias
The driver has never supported anything but OF probing so drop the
unused platform alias.

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2025-10-16 18:17:28 +02:00
Johan Hovold b03127a4e7 irqchip/mvebu-pic: Drop unused module alias
The driver has never supported anything but OF probing so drop the
unused platform alias.

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
2025-10-16 18:17:28 +02:00
Johan Hovold 867c6aa283 irqchip/meson-gpio: Drop unused module alias
The driver has never supported anything but OF probing so drop the
unused platform alias that was erroneously added by commit a947aa00ed
("irqchip/meson-gpio: Make it possible to build as a module").

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2025-10-16 18:17:27 +02:00
Johan Hovold 1230fbb225 irqchip: Enable compile testing of Broadcom drivers
There seems to be nothing preventing the Broadcom drivers from being
compile tested, so enable that for wider build coverage.

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
2025-10-16 18:17:27 +02:00
Johan Hovold 1e3e330c07 irqchip: Pass platform device to platform drivers
The IRQCHIP_PLATFORM_DRIVER macros can be used to convert OF irqchip
drivers to platform drivers but currently reuse the OF init callback
prototype that only takes OF nodes as arguments. This forces drivers to
do reverse lookups of their struct devices during probe if they need
them for things like dev_printk() and device managed resources.

Half of the drivers doing reverse lookups also currently fail to release
the additional reference taken during the lookup, while other drivers
have had the reference leak plugged in various ways (e.g. using
non-intuitive cleanup constructs which still confuse static checkers).

Switch to using a probe callback that takes a platform device as its
first argument to simplify drivers and plug the remaining (mostly
benign) reference leaks.
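
A hedged sketch of the prototype change (driver name illustrative, macro
wiring omitted):

	/* old: OF init callback, no struct device at hand */
	static int foo_irqchip_init(struct device_node *node,
				    struct device_node *parent);

	/* new: probe callback, the platform device is passed in directly */
	static int foo_irqchip_probe(struct platform_device *pdev);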

Fixes: 32c6c05466 ("irqchip: Add Broadcom BCM2712 MSI-X interrupt controller")
Fixes: 70afdab904 ("irqchip: Add IMX MU MSI controller driver")
Fixes: a6199bb514 ("irqchip: Add Qualcomm MPM controller driver")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Changhuang Liang <changhuang.liang@starfivetech.com>
2025-10-16 18:17:27 +02:00
Randy Dunlap 762a3d1ca2 x86/idtentry: Add missing '*' to kernel-doc lines
Fix kernel-doc warnings by adding the missing '*' to each line.

  Warning: include/asm/idtentry.h:395 bad line:    when raised from kernel mode
  Warning: include/asm/idtentry.h:405 bad line:    when raised from user mode

Since this is in a kernel-doc block, these lines need a leading
" *" on each line to prevent the warnings.

Fixes: a13644f3a5 ("x86/entry/64: Add entry code for #VC handler")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
2025-10-16 17:45:42 +02:00
Peter Zijlstra 73cbcfe255 sched/topology,x86: Fix build warning
A compile warning slipped through:

   arch/x86/kernel/smpboot.c:548:5: warning: no previous prototype for function 'arch_sched_node_distance' [-Wmissing-prototypes]

Fixes: 4d6dd05d07 ("sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode")
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-16 13:01:15 +02:00
Peter Zijlstra 00a155c691 Merge branch 'objtool/core' of https://git.kernel.org/pub/scm/linux/kernel/git/jpoimboe/linux
This series introduces new objtool features and a klp-build script to
generate livepatch modules using a source .patch as input.

This builds on concepts from the longstanding out-of-tree kpatch [1]
project which began in 2012 and has been used for many years to generate
livepatch modules for production kernels.  However, this is a complete
rewrite which incorporates hard-earned lessons from 12+ years of
maintaining kpatch.

Key improvements compared to kpatch-build:

  - Integrated with objtool: Leverages objtool's existing control-flow
    graph analysis to help detect changed functions.

  - Works on vmlinux.o: Supports late-linked objects, making it
    compatible with LTO, IBT, and similar.

  - Simplified code base: ~3k fewer lines of code.

  - Upstream: No more out-of-tree #ifdef hacks, far less cruft.

  - Cleaner internals: Vastly simplified logic for symbol/section/reloc
    inclusion and special section extraction.

  - Robust __LINE__ macro handling: Avoids false positive binary diffs
    caused by the __LINE__ macro by introducing a fix-patch-lines script
    which injects #line directives into the source .patch to preserve
    the original line numbers at compile time.

The primary user interface is the klp-build script which does the
following:

  - Builds an original kernel with -ffunction-sections and
    -fdata-sections, plus objtool function checksumming.

  - Applies the .patch file and rebuilds the kernel using the same
    options.

  - Runs 'objtool klp diff' to detect changed functions and generate
    intermediate binary diff objects.

  - Builds a kernel module which links the diff objects with some
    livepatch module init code (scripts/livepatch/init.c).

  - Finalizes the livepatch module (aka work around linker wreckage)
    using 'objtool klp post-link'.

I've tested with a variety of patches on defconfig and Fedora-config
kernels with both GCC and Clang.
2025-10-16 11:38:19 +02:00
Johan Hovold 3540d99c03 irqchip: Drop leftover brackets
Drop some unnecessary brackets in platform_irqchip_probe() mistakenly
left by commit 9322d1915f ("irqchip: Plug a OF node reference leak in
platform_irqchip_probe()").

Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
2025-10-16 11:30:38 +02:00
Johan Hovold 9b685058ca irqchip/qcom-irq-combiner: Fix section mismatch
Platform drivers can be probed after their init sections have been
discarded so the probe callback must not live in init.

Fixes: f20cc9b00c ("irqchip/qcom: Add IRQ combiner driver")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2025-10-16 11:30:38 +02:00
Johan Hovold f798bdb9aa irqchip/starfive-jh8100: Fix section mismatch
Platform drivers can be probed after their init sections have been
discarded so the irqchip init callback must not live in init.

Fixes: e4e5350361 ("irqchip: Add StarFive external interrupt controller")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Changhuang Liang <changhuang.liang@starfivetech.com>
2025-10-16 11:30:38 +02:00
Johan Hovold 5b338fbb2b irqchip/renesas-rzg2l: Fix section mismatch
Platform drivers can be probed after their init sections have been
discarded so the irqchip init callbacks must not live in init.

Fixes: d011c022ef ("irqchip/renesas-rzg2l: Add support for RZ/Five SoC")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
2025-10-16 11:30:38 +02:00
Johan Hovold 64acfd8e68 irqchip/imx-mu-msi: Fix section mismatch
Platform drivers can be probed after their init sections have been
discarded so the irqchip init callbacks must not live in init.

Fixes: 70afdab904 ("irqchip: Add IMX MU MSI controller driver")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2025-10-16 11:30:38 +02:00
Johan Hovold bbe1775924 irqchip/irq-brcmstb-l2: Fix section mismatch
Platform drivers can be probed after their init sections have been
discarded so the irqchip init callbacks must not live in init.

Fixes: 51d9db5c8f ("irqchip/irq-brcmstb-l2: Switch to IRQCHIP_PLATFORM_DRIVER")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
2025-10-16 11:30:37 +02:00
Johan Hovold bfc0c5beab irqchip/irq-bcm7120-l2: Fix section mismatch
Platform drivers can be probed after their init sections have been
discarded so the irqchip init callbacks must not live in init.

Fixes: 3ac268d5ed ("irqchip/irq-bcm7120-l2: Switch to IRQCHIP_PLATFORM_DRIVER")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
2025-10-16 11:30:37 +02:00
Johan Hovold e9db5332ca irqchip/irq-bcm7038-l1: Fix section mismatch
Platform drivers can be probed after their init sections have been
discarded so the irqchip init callback must not live in init.

Fixes: c057c799e3 ("irqchip/irq-bcm7038-l1: Switch to IRQCHIP_PLATFORM_DRIVER")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
2025-10-16 11:30:37 +02:00
Johan Hovold a8452d1d59 irqchip/bcm2712-mip: Fix section mismatch
Platform drivers can be probed after their init sections have been
discarded so the irqchip init callback must not live in init.

Fixes: 32c6c05466 ("irqchip: Add Broadcom BCM2712 MSI-X interrupt controller")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
2025-10-16 11:30:37 +02:00
Johan Hovold 0435bcc4e5 irqchip/bcm2712-mip: Fix OF node reference imbalance
The init callback must not decrement the reference count of the provided
irqchip OF node.

This should not cause any trouble currently, but if the driver ever
starts probe deferring it could lead to warnings about reference
underflow and saturation.

Fixes: 32c6c05466 ("irqchip: Add Broadcom BCM2712 MSI-X interrupt controller")
Signed-off-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
2025-10-16 11:30:37 +02:00
Peter Zijlstra 4c95380701 sched/ext: Fold balance_scx() into pick_task_scx()
With pick_task() having an rf argument, it is possible to do the
lock-break there and get rid of the weird balance/pick_task hack.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
2025-10-16 11:13:55 +02:00
Joel Fernandes 50653216e4 sched: Add support to pick functions to take rf
Some pick functions, like the internal pick_next_task_fair(), already take
rf but some others don't. We need this for scx's server pick function.
Prepare for this by having pick functions accept it.

[peterz: - added RETRY_TASK handling
         - removed pick_next_task_fair indirection]
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
2025-10-16 11:13:55 +02:00
Peter Zijlstra 1e900f415c sched: Detect per-class runqueue changes
Have enqueue/dequeue set a per-class bit in rq->queue_mask. This then
enables easy tracking of which runqueues are modified over a
lock-break.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
2025-10-16 11:13:55 +02:00
Peter Zijlstra 73ec89a1ce sched: Mandate shared flags for sched_change
Shrikanth noted that the sched_change pattern relies on using shared
flags.

Suggested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-16 11:13:54 +02:00
Peter Zijlstra d4c64207b8 sched: Cleanup the sched_change NOCLOCK usage
Teach the sched_change pattern how to do update_rq_clock(); this
allows for some simplifications / cleanups.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:54 +02:00
Peter Zijlstra 5892cbd85d sched: Match __task_rq_{,un}lock()
In preparation to adding more rules to __task_rq_lock(), such that
__task_rq_unlock() will no longer be equivalent to rq_unlock(),
make sure every __task_rq_lock() is matched by a __task_rq_unlock()
and vice-versa.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:54 +02:00
Peter Zijlstra 46a177fb01 sched: Add locking comments to sched_class methods
'Document' the locking context the various sched_class methods are
called under.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:53 +02:00
Peter Zijlstra 650952d3fb sched: Make __do_set_cpus_allowed() use the sched_change pattern
Now that do_set_cpus_allowed() holds all the regular locks, convert it
to use the sched_change pattern helper.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:53 +02:00
Peter Zijlstra b079d93796 sched: Rename do_set_cpus_allowed()
Hopefully saner naming.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:53 +02:00
Peter Zijlstra abfc01077d sched: Fix do_set_cpus_allowed() locking
All callers of do_set_cpus_allowed() only take p->pi_lock, which is
not sufficient to actually change the cpumask. Again, this is mostly
ok in these cases, but it results in unnecessarily complicated
reasoning.

Furthermore, there is no reason whatsoever not to just take all the
required locks, so do just that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:52 +02:00
Peter Zijlstra 942b8db965 sched: Fix migrate_disable_switch() locking
For some reason migrate_disable_switch() was more complicated than it
needs to be, resulting in mind bending locking of dubious quality.

Recognise that migrate_disable_switch() must be called before a
context switch, but any place before that switch is equally good.
Since the current place results in troubled locking, simply move the
thing before taking rq->lock.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:52 +02:00
Peter Zijlstra 6455ad5346 sched: Move sched_class::prio_changed() into the change pattern
Move sched_class::prio_changed() into the change pattern.

And while there, extend it with sched_class::get_prio() in order to
fix the deadline situation.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:52 +02:00
Peter Zijlstra 1ae5f5dfe5 sched: Cleanup sched_delayed handling for class switches
Use the new sched_class::switching_from() method to dequeue delayed
tasks before switching to another class.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
2025-10-16 11:13:51 +02:00
Peter Zijlstra 637b068282 sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
the change pattern. This completes and makes the pattern more
symmetric.

This changes the order of callbacks slightly:

  OLD                              NEW
				|
				|  switching_from()
  dequeue_task();		|  dequeue_task()
  put_prev_task();		|  put_prev_task()
				|  switched_from()
				|
  ... change task ...		|  ... change task ...
				|
  switching_to();		|  switching_to()
  enqueue_task();		|  enqueue_task()
  set_next_task();		|  set_next_task()
  prev_class->switched_from()	|
  switched_to()			|  switched_to()
				|

Notably, it moves the switched_from() callback right after the
dequeue/put. Existing implementations don't appear to be affected by
this change in location -- specifically the task isn't enqueued on the
class in question in either location.

Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
when changing scheduling classes.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:51 +02:00
Peter Zijlstra 5e42d4c123 sched/deadline: Prepare for switched_from() change
Prepare for the sched_class::switch*() methods getting folded into the
change pattern. As a result of that, the location of switched_from
will change slightly. SCHED_DEADLINE is affected by this change in
location:

  OLD                              NEW
				|
				|  switching_from()
  dequeue_task();		|  dequeue_task()
  put_prev_task();		|  put_prev_task()
				|  switched_from()
				|
  ... change task ...		|  ... change task ...
				|
  switching_to();		|  switching_to()
  enqueue_task();		|  enqueue_task()
  set_next_task();		|  set_next_task()
  prev_class->switched_from()	|
  switched_to()			|  switched_to()
				|

Notably, where switched_from() was called *after* the change to the
task, it will now get called before it. Specifically, switched_from_dl()
uses dl_task(p), which uses p->prio; that prio is changed when switching
class (it might be the reason to switch class in case of PI).

When switched_from_dl() gets called, the task will have left the
deadline class and dl_task() must be false, while when doing
dequeue_dl_entity() the task must be a dl_task(), otherwise we'd have
called a different dequeue method.

Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-16 11:13:51 +02:00
Peter Zijlstra 376f8963bb sched: Re-arrange the {EN,DE}QUEUE flags
Ensure the matched flags are in the low word while the unmatched flags
go into the second word.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:50 +02:00
Peter Zijlstra e9139f765a sched: Employ sched_change guards
As proposed a long while ago -- and half done by scx -- wrap the
scheduler's 'change' pattern in a guard helper.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
2025-10-16 11:13:50 +02:00
Adam Li 82d6e01a06 sched/fair: Only update stats for allowed CPUs when looking for dst group
Load imbalance is observed when the workload frequently forks new threads.
Due to CPU affinity, the workload can run on CPU 0-7 in the first
group, and only on CPU 8-11 in the second group. CPU 12-15 are always idle.

{ 0 1 2 3 4 5 6 7 } {8 9 10 11 12 13 14 15}
  * * * * * * * *    * * *  *

When looking for a dst group for newly forked threads,
update_sg_wakeup_stats() often reports that the second group has more
idle CPUs than the first group. The scheduler then thinks the second
group is less busy and selects the least busy CPUs among CPU 8-11.
Therefore CPU 8-11 can become crowded with newly forked threads, while
at the same time CPU 0-7 can be idle.

A task may not use all the CPUs in a sched group due to CPU affinity.
Only update sched group statistics for the allowed CPUs.

Signed-off-by: Adam Li <adamli@os.amperecomputing.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-16 11:13:50 +02:00
Tim Chen 4d6dd05d07 sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode
It is possible for Granite Rapids (GNR) and Clearwater Forest
(CWF) to have up to 3 dies per package. When sub-numa cluster (SNC-3)
is enabled, each die will become a separate NUMA node in the package
with different distances between dies within the same package.

For example, on GNR, we see the following numa distances for a 2 socket
system with 3 dies per socket:

    package 1       package2
	----------------
	|               |
    ---------       ---------
    |   0   |       |   3   |
    ---------       ---------
	|               |
    ---------       ---------
    |   1   |       |   4   |
    ---------       ---------
	|               |
    ---------       ---------
    |   2   |       |   5   |
    ---------       ---------
	|               |
	----------------

node distances:
node     0    1    2    3    4    5
0:   	10   15   17   21   28   26
1:   	15   10   15   23   26   23
2:   	17   15   10   26   23   21
3:   	21   28   26   10   15   17
4:   	23   26   23   15   10   15
5:   	26   23   21   17   15   10

The node distances above led to 2 problems:

1. Asymmetric routes taken between nodes in different packages led to
asymmetric scheduler domain perspective depending on which node you
are on.  Current scheduler code failed to build domains properly with
asymmetric distances.

2. Multiple remote distances to respective tiles on remote package create
too many levels of domain hierarchies grouping different nodes between
remote packages.

For example, the above GNR topology lead to NUMA domains below:

Sched domains from the perspective of a CPU in node 0, where the number
in bracket represent node number.

NUMA-level 1    [0,1] [2]
NUMA-level 2    [0,1,2] [3]
NUMA-level 3    [0,1,2,3] [5]
NUMA-level 4    [0,1,2,3,5] [4]

Sched domains from the perspective of a CPU in node 4
NUMA-level 1    [4] [3,5]
NUMA-level 2    [3,4,5] [0,2]
NUMA-level 3    [0,2,3,4,5] [1]

Scheduler group peers for load balancing from the perspective of CPU 0
and CPU 4 are different.  An improper task could be chosen for load
balancing between groups such as [0,2,3,4,5] and [1].  Ideally, nodes 0
or 2, which are in the same package as node 1, should be chosen first.
Instead, tasks in the remote package nodes 3, 4 and 5 could be chosen
with an equal chance, which could lead to excessive remote package
migrations and load imbalance between packages.  We should not group
partial remote nodes and local nodes together.

Simplify the remote distances for CWF and GNR for the purpose of
sched domains building, which maintains symmetry and leads to a more
reasonable load balance hierarchy.

The sched domains from the perspective of a CPU in node 0 NUMA-level 1
is now
NUMA-level 1    [0,1] [2]
NUMA-level 2    [0,1,2] [3,4,5]

The sched domains from the perspective of a CPU in node 4 NUMA-level 1
is now
NUMA-level 1    [4] [3,5]
NUMA-level 2    [3,4,5] [0,1,2]

We have the same balancing perspective from node 0 or node 4.  Loads are
now balanced equally between packages.

Co-developed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Zhao Liu <zhao1.liu@intel.com>
2025-10-16 11:13:50 +02:00
Tim Chen 06f2c90885 sched: Create architecture specific sched domain distances
Allow architecture specific sched domain NUMA distances that are
modified from actual NUMA node distances for the purpose of building
NUMA sched domains.

Keep the actual NUMA distances separately if modified distances
are used for building sched domains. The actual distances are
still needed, as NUMA balancing benefits from finding the
NUMA nodes that are actually closer to a task's numa_group.

Consolidate the recording of unique NUMA distances in an array to
sched_record_numa_dist() so the function can be reused to record NUMA
distances when the NUMA distance metric is changed.

No functional change, and no additional distance array is
allocated if there are no arch-specific NUMA distances
being defined.

Co-developed-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
2025-10-16 11:13:49 +02:00
Doug Berger 382748c05e sched/deadline: only set free_cpus for online runqueues
Commit 16b269436b ("sched/deadline: Modify cpudl::free_cpus
to reflect rd->online") introduced the cpudl_set/clear_freecpu
functions to allow the cpu_dl::free_cpus mask to be manipulated
by the deadline scheduler class rq_on/offline callbacks so the
mask would also reflect this state.

Commit 9659e1eeee ("sched/deadline: Remove cpu_active_mask
from cpudl_find()") removed the check of the cpu_active_mask to
save some processing on the premise that the cpudl::free_cpus
mask already reflected the runqueue online state.

Unfortunately, there are cases where it is possible for the
cpudl_clear function to set the free_cpus bit for a CPU when the
deadline runqueue is offline. When this occurs while a CPU is
connected to the default root domain the flag may retain the bad
state after the CPU has been unplugged. Later, a different CPU
that is transitioning through the default root domain may push a
deadline task to the powered down CPU when cpudl_find sees its
free_cpus bit is set. If this happens the task will not have the
opportunity to run.

One example is outlined here:
https://lore.kernel.org/lkml/20250110233010.2339521-1-opendmb@gmail.com

Another occurs when the last deadline task is migrated from a
CPU that has an offlined runqueue. The dequeue_task member of
the deadline scheduler class will eventually call cpudl_clear
and set the free_cpus bit for the CPU.

This commit modifies the cpudl_clear function to be aware of the
online state of the deadline runqueue so that the free_cpus mask
can be updated appropriately.

It is no longer necessary to manage the mask outside of the
cpudl_set/clear functions so the cpudl_set/clear_freecpu
functions are removed. In addition, since the free_cpus mask is
now only updated under the cpudl lock the code was changed to
use the non-atomic __cpumask functions.

Signed-off-by: Doug Berger <opendmb@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-16 11:13:49 +02:00
Fernand Sieber 79104becf4 sched/fair: Forfeit vruntime on yield
If a task yields, the scheduler may decide to pick it again. The task in
turn may decide to yield immediately or shortly after, leading to a tight
loop of yields.

If there's another runnable task at this point, the deadline will be
increased by the slice at each loop. This can cause the deadline to run
away pretty quickly, and lead to elevated run delays later on as the task
doesn't get picked again. The reason the scheduler can pick the same task
again and again despite its deadline increasing is that it may be the
only eligible task at that point.

Fix this by making the task forfeit its remaining vruntime and pushing
its deadline one slice ahead. This implements yield behavior more
authentically.

We limit the forfeiting to eligible tasks. This is because core scheduling
prefers running ineligible tasks rather than force idling. As such, without
this condition, we could end up in a yield loop which makes the vruntime
increase rapidly, leading to anomalous run delays later down the line.
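
A hedged sketch of the adjustment in yield_task_fair(), with se and
cfs_rq referring to the yielding task's entity (names taken from the
EEVDF code, but not necessarily the exact diff):

	if (entity_eligible(cfs_rq, se)) {
		/* forfeit whatever vruntime is left of the current slice */
		se->vruntime = se->deadline;
		/* and push the deadline one full slice ahead */
		se->deadline += calc_delta_fair(se->slice, se);
	}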

Fixes: 147f3efaa2 ("sched/fair: Implement an EEVDF-like scheduling  policy")
Signed-off-by: Fernand Sieber <sieberf@amazon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250401123622.584018-1-sieberf@amazon.com
Link: https://lore.kernel.org/r/20250911095113.203439-1-sieberf@amazon.com
Link: https://lore.kernel.org/r/20250916140228.452231-1-sieberf@amazon.com
2025-10-16 11:13:49 +02:00
Peter Zijlstra 45e1dccc06 x86/insn: Simplify for_each_insn_prefix()
Use the new-found freedom of allowing variable declarations inside
for() to simplify the for_each_insn_prefix() iterator so it no longer
needs an external temporary.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-16 11:13:48 +02:00
Peter Zijlstra 8a5c6cbfe4 x86/insn,uprobes,alternative: Unify insn_is_nop()
Both uprobes and alternatives have insn_is_nop() variants, unify them
and make sure insn_is_nop() works for both x86_64 and i386.

Specifically, uprobe must not compare userspace instructions to kernel
nops as that does not work right in the compat case.

For the uprobe case we therefore must recognise common 32bit and 64bit
nops. Because uprobe will consume the instruction as a nop, it must
not mistakenly claim a non-nop instruction to be a nop. Eg. 'REX.b3
NOP' is 'xchg %r8,%rax' - not a nop.

For the kernel case similar constraints apply: it is used to optimize
NOPs by replacing strings of short(er) nops with longer nops. It must not
claim an instruction is a nop if it really isn't. Not recognising a
nop is non-fatal.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-16 11:13:47 +02:00
George Kennedy 866cf36bfe perf/x86/amd: Check event before enable to avoid GPF
On AMD machines cpuc->events[idx] can become NULL in a subtle race
condition with NMI->throttle->x86_pmu_stop().

Check event for NULL in amd_pmu_enable_all() before enable to avoid a GPF.
This appears to be an AMD only issue.
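
A hedged sketch of the check added in amd_pmu_enable_all() (loop shape
approximated from the existing code):

	for_each_set_bit(idx, cpuc->active_mask, X86_PMC_IDX_MAX) {
		struct perf_event *event = cpuc->events[idx];

		/*
		 * events[idx] can be cleared by a racing
		 * NMI -> throttle -> x86_pmu_stop(); skip it then.
		 */
		if (!event)
			continue;

		amd_pmu_enable_event(event);
	}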

Syzkaller reported a GPF in amd_pmu_enable_all.

INFO: NMI handler (perf_event_nmi_handler) took too long to run: 13.143
    msecs
Oops: general protection fault, probably for non-canonical address
    0xdffffc0000000034: 0000  PREEMPT SMP KASAN NOPTI
KASAN: null-ptr-deref in range [0x00000000000001a0-0x00000000000001a7]
CPU: 0 UID: 0 PID: 328415 Comm: repro_36674776 Not tainted 6.12.0-rc1-syzk
RIP: 0010:x86_pmu_enable_event (arch/x86/events/perf_event.h:1195
    arch/x86/events/core.c:1430)
RSP: 0018:ffff888118009d60 EFLAGS: 00010012
RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000034 RSI: 0000000000000000 RDI: 00000000000001a0
RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002
R13: ffff88811802a440 R14: ffff88811802a240 R15: ffff8881132d8601
FS:  00007f097dfaa700(0000) GS:ffff888118000000(0000) GS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000200001c0 CR3: 0000000103d56000 CR4: 00000000000006f0
Call Trace:
 <IRQ>
amd_pmu_enable_all (arch/x86/events/amd/core.c:760 (discriminator 2))
x86_pmu_enable (arch/x86/events/core.c:1360)
event_sched_out (kernel/events/core.c:1191 kernel/events/core.c:1186
    kernel/events/core.c:2346)
__perf_remove_from_context (kernel/events/core.c:2435)
event_function (kernel/events/core.c:259)
remote_function (kernel/events/core.c:92 (discriminator 1)
    kernel/events/core.c:72 (discriminator 1))
__flush_smp_call_function_queue (./arch/x86/include/asm/jump_label.h:27
    ./include/linux/jump_label.h:207 ./include/trace/events/csd.h:64
    kernel/smp.c:135 kernel/smp.c:540)
__sysvec_call_function_single (./arch/x86/include/asm/jump_label.h:27
    ./include/linux/jump_label.h:207
    ./arch/x86/include/asm/trace/irq_vectors.h:99 arch/x86/kernel/smp.c:272)
sysvec_call_function_single (arch/x86/kernel/smp.c:266 (discriminator 47)
    arch/x86/kernel/smp.c:266 (discriminator 47))
 </IRQ>

Reported-by: syzkaller <syzkaller@googlegroups.com>
Signed-off-by: George Kennedy <george.kennedy@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-16 11:13:47 +02:00
Chang S. Bae bffeb2fd0b x86/microcode/intel: Enable staging when available
With staging support implemented, enable it when the CPU reports the
feature.

  [ bp: Sort in the MSR properly. ]

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Tested-by: Anselm Busse <abusse@amazon.de>
Link: https://lore.kernel.org/20250320234104.8288-1-chang.seok.bae@intel.com
2025-10-15 16:47:50 +02:00
Chang S. Bae 4ab410287b x86/microcode/intel: Support mailbox transfer
The functions for sending microcode data and retrieving the next offset
were previously placeholders, as they need to handle a specific mailbox
format.

While the kernel supports similar mailboxes, none of them are compatible
with this one. Attempts to share code led to unnecessary complexity, so
add a dedicated implementation instead.

  [ bp: Sort the include properly. ]

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Tested-by: Anselm Busse <abusse@amazon.de>
Link: https://lore.kernel.org/20250320234104.8288-1-chang.seok.bae@intel.com
2025-10-15 16:47:43 +02:00
Chang S. Bae afc3b50954 x86/microcode/intel: Implement staging handler
Previously, per-package staging invocations and their associated state
data were established. The next step is to implement the actual staging
handler according to the specified protocol. Below are key aspects to
note:

  (a)  Each staging process must begin by resetting the staging hardware.

  (b)  The staging hardware processes up to a page-sized chunk of the
       microcode image per iteration, requiring software to submit data
       incrementally.

  (c)  Once a data chunk is processed, the hardware responds with an
       offset in the image for the next chunk.

  (d)  The offset may indicate completion or request retransmission of an
       already transferred chunk. As long as the total transferred data
       remains within the predefined limit (twice the image size),
       retransmissions should be acceptable.

Incorporate them in the handler, while data transmission and mailbox
format handling are implemented separately.
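
As a rough sketch of that control flow (helper names such as
staging_reset(), send_chunk(), read_next_offset() and OFFSET_DONE are
illustrative, not the actual functions):

  staging_reset(mmio_base);                               /* (a) */

  offset = 0;
  total  = 0;
  while (offset != OFFSET_DONE) {
          unsigned int chunk = min(PAGE_SIZE, image_size - offset);

          send_chunk(mmio_base, image + offset, chunk);    /* (b) */
          offset = read_next_offset(mmio_base);            /* (c) */

          total += chunk;
          if (total > 2 * image_size)                      /* (d) */
                  return -EIO;    /* too many retransmissions */
  }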

  [ bp: Sort the headers in a reversed name-length order. ]

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Tested-by: Anselm Busse <abusse@amazon.de>
Link: https://lore.kernel.org/20250320234104.8288-1-chang.seok.bae@intel.com
2025-10-15 16:47:37 +02:00
Chang S. Bae 079b90d4ba x86/microcode/intel: Define staging state struct
Define a staging_state struct to simplify function prototypes by consolidating
relevant data, instead of passing multiple local variables.

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Tested-by: Anselm Busse <abusse@amazon.de>
Link: https://lore.kernel.org/20250320234104.8288-1-chang.seok.bae@intel.com
2025-10-15 16:47:31 +02:00
Chang S. Bae 740144bc6b x86/microcode/intel: Establish staging control logic
When microcode staging is initiated, operations are carried out through
an MMIO interface. Each package has a unique interface specified by the
IA32_MCU_STAGING_MBOX_ADDR MSR, which maps to a set of 32-bit registers.

Prepare staging with the following steps:

  1.  Ensure the microcode image is 32-bit aligned to match the MMIO
      register size.

  2.  Identify each MMIO interface based on its per-package scope.

  3.  Invoke the staging function for each identified interface, which
      will be implemented separately.
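
A rough sketch of that preparation, with illustrative identifiers (the
MSR constant and the helpers are stand-ins, not the actual code):

  if (!IS_ALIGNED(image_size, sizeof(u32)))            /* step 1 */
          return false;

  for_each_online_cpu(cpu) {
          if (!is_package_leader(cpu))                  /* step 2 */
                  continue;

          rdmsrl_on_cpu(cpu, MSR_IA32_MCU_STAGING_MBOX_ADDR, &pa);
          mmio = ioremap(pa, MBOX_SIZE);
          if (!mmio) {
                  pr_err("microcode: staging: cannot map mailbox for CPU%d\n", cpu);
                  return false;
          }

          ok = do_stage_one(mmio, image, image_size);   /* step 3, separate patch */
          iounmap(mmio);
          if (!ok)
                  return false;
  }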

  [ bp: Improve error logging. ]

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Tested-by: Anselm Busse <abusse@amazon.de>
Link: https://lore.kernel.org/all/871pznq229.ffs@tglx
2025-10-15 16:47:20 +02:00
Chang S. Bae 7cdda85ed9 x86/microcode: Introduce staging step to reduce late-loading time
As microcode patch sizes continue to grow, late-loading latency spikes can
lead to timeouts and disruptions in running workloads. This trend of
increasing patch sizes is expected to continue, so a foundational solution is
needed to address the issue.

To mitigate the problem, introduce a microcode staging feature. This option
processes most of the microcode update (excluding activation) on
a non-critical path, allowing CPUs to remain operational during the majority
of the update. By offloading work from the critical path, staging can
significantly reduce latency spikes.

Integrate staging as a preparatory step in late-loading. Introduce a new
callback for staging, which is invoked at the beginning of
load_late_stop_cpus(), before CPUs enter the rendezvous phase.

Staging follows an opportunistic model:

  *  If successful, it reduces CPU rendezvous time
  *  If it fails, the process falls back to the legacy path to finish
     loading, but with potentially higher latency.

Extend struct microcode_ops to incorporate staging properties, which will be
implemented in the vendor code separately.
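
Conceptually the hook looks like this (field and function names are
illustrative, not the exact interface):

  struct microcode_ops {
          /* existing callbacks ... */

          /* NEW: opportunistic staging, run before the rendezvous */
          bool    use_staging;
          void    (*staging_microcode)(void);
  };

  static int load_late_stop_cpus(bool is_safe)
  {
          if (microcode_ops->use_staging)
                  microcode_ops->staging_microcode();  /* failure => legacy path, just slower */

          /* ... stop_machine() rendezvous and activation, as before ... */
  }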

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Tested-by: Anselm Busse <abusse@amazon.de>
Link: https://lore.kernel.org/20250320234104.8288-1-chang.seok.bae@intel.com
2025-10-15 16:46:58 +02:00
Chang S. Bae ed44a5625f x86/cpu/topology: Make primary thread mask available with SMP=n
cpu_primary_thread_mask is only defined when CONFIG_SMP=y. However, even
in UP kernels there is always exactly one CPU, which can reasonably be
treated as the primary thread.

Historically, topology_is_primary_thread() always returned true with
CONFIG_SMP=n. A recent commit:

  4b455f5994 ("cpu/SMT: Provide a default topology_is_primary_thread()")

replaced it with a generic implementation with the note:

  "When disabling SMT, the primary thread of the SMT will remain
   enabled/active. Architectures that have a special primary thread (e.g.
   x86) need to override this function. ..."

For consistency and clarity, make the primary thread mask available
regardless of SMP, similar to cpu_possible_mask and cpu_present_mask.

Move __cpu_primary_thread_mask into common code to prevent build issues.
Let cpu_mark_primary_thread() configure the mask even for UP kernels,
alongside other masks. Then, topology_is_primary_thread() can
consistently reference it.
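
The end state can be pictured as follows (a sketch; the mask is now
defined unconditionally, so no #ifdef is needed here):

  static inline bool topology_is_primary_thread(unsigned int cpu)
  {
          return cpumask_test_cpu(cpu, cpu_primary_thread_mask);
  }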

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20250320234104.8288-1-chang.seok.bae@intel.com
2025-10-15 16:46:11 +02:00
Josh Poimboeuf b9976fa464 livepatch: Introduce source code helpers for livepatch modules
Add some helper macros which can be used by livepatch source .patch
files to register callbacks, convert static calls to regular calls where
needed, and patch syscalls.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:19 -07:00
Josh Poimboeuf 78be9facfb livepatch/klp-build: Add --show-first-changed option to show function divergence
Add a --show-first-changed option to identify where changed functions
begin to diverge:

  - Parse 'objtool klp diff' output to find changed functions.

  - Run objtool again on each object with --debug-checksum=<funcs>.

  - Diff the per-instruction checksum debug output to locate the first
    differing instruction.

This can be useful for quickly determining where and why a function
changed.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:19 -07:00
Josh Poimboeuf 2c2f0b8626 livepatch/klp-build: Add --debug option to show cloning decisions
Add a --debug option which gets passed to "objtool klp diff" to enable
debug output related to cloning decisions.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:19 -07:00
Josh Poimboeuf 24ebfcd65a livepatch/klp-build: Introduce klp-build script for generating livepatch modules
Add a klp-build script which automates the generation of a livepatch
module from a source .patch file by performing the following steps:

  - Builds an original kernel with -ffunction-sections and
    -fdata-sections, plus objtool function checksumming.

  - Applies the .patch file and rebuilds the kernel using the same
    options.

  - Runs 'objtool klp diff' to detect changed functions and generate
    intermediate binary diff objects.

  - Builds a kernel module which links the diff objects with some
    livepatch module init code (scripts/livepatch/init.c).

  - Finalizes the livepatch module (aka work around linker wreckage)
    using 'objtool klp post-link'.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:19 -07:00
Josh Poimboeuf 59adee07b5 livepatch/klp-build: Add stub init code for livepatch modules
Add a module initialization stub which can be linked with binary diff
objects to produce a livepatch module.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:19 -07:00
Josh Poimboeuf abaf1f42dd livepatch/klp-build: Introduce fix-patch-lines script to avoid __LINE__ diff noise
The __LINE__ macro creates challenges for binary diffing.  When a .patch
file adds or removes lines, it shifts the line numbers for all code
below it.

This can cause the code generation of functions using __LINE__ to change
due to the line number constant being embedded in a MOV instruction,
despite there being no semantic difference.

Avoid such false positives by adding a fix-patch-lines script which can
be used to insert a #line directive in each patch hunk affecting the
line numbering.  This script will be used by klp-build, which will be
introduced in a subsequent patch.
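
For example (illustrative file name and line numbers):

  /* Before the hunk, this expands with __LINE__ == 42: */
  pr_warn("timeout at line %d\n", __LINE__);   /* MOV $42 ends up in the code */

  /* If a hunk above adds two lines, the same statement now embeds 44 and
   * the function's binary diff changes.  fix-patch-lines pins the
   * numbering back by injecting a directive such as: */
  #line 42 "drivers/foo/bar.c"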

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:19 -07:00
Josh Poimboeuf f2c356d1d0 kbuild,objtool: Defer objtool validation step for CONFIG_KLP_BUILD
In preparation for klp-build, defer objtool validation for
CONFIG_KLP_BUILD kernels until the final pre-link archive (e.g.,
vmlinux.o, module-foo.o) is built.  This will simplify the process of
generating livepatch modules.

Delayed objtool is generally preferred anyway, and is already standard
for IBT and LTO.  Eventually the per-translation-unit mode will be
phased out.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:19 -07:00
Josh Poimboeuf 7ae60ff0b7 livepatch: Add CONFIG_KLP_BUILD
In preparation for introducing klp-build, add a new CONFIG_KLP_BUILD
option.  The initial version will only be supported on x86-64.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:18 -07:00
Josh Poimboeuf 164c9201e1 objtool: Add base objtool support for livepatch modules
In preparation for klp-build, enable "classic" objtool to work on
livepatch modules:

  - Avoid duplicate symbol/section warnings for prefix symbols and the
    .static_call_sites and __mcount_loc sections which may have already
    been extracted by klp diff.

  - Add __klp_funcs to the IBT function pointer section whitelist.

  - Prevent KLP symbols from getting incorrectly classified as cold
    subfunctions.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:18 -07:00
Josh Poimboeuf 2058f6d166 objtool: Refactor prefix symbol creation code
The prefix symbol creation code currently ignores all errors, presumably
because some functions don't have the leading NOPs.

Shuffle the code around a bit, improve the error handling and document
why some errors are ignored.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:18 -07:00
Josh Poimboeuf ebe864b553 objtool/klp: Add post-link subcommand to finalize livepatch modules
Livepatch needs some ELF magic which linkers don't like:

  - Two relocation sections (.rela*, .klp.rela*) for the same text
    section.

  - Use of SHN_LIVEPATCH to mark livepatch symbols.

Unfortunately linkers tend to mangle such things.  To work around that,
klp diff generates a linker-compliant intermediate binary which encodes
the relevant KLP section/reloc/symbol metadata.

After module linking, the .ko then needs to be converted to an actual
livepatch module.  Introduce a new klp post-link subcommand to do so.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:18 -07:00
Josh Poimboeuf 7c2575a640 objtool/klp: Add --debug option to show cloning decisions
Add a --debug option to klp diff which prints cloning decisions and an
indented dependency tree for all cloned symbols and relocations.  This
helps visualize which symbols and relocations were included and why.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:18 -07:00
Josh Poimboeuf dd590d4d57 objtool/klp: Introduce klp diff subcommand for diffing object files
Add a new klp diff subcommand which performs a binary diff between two
object files and extracts changed functions into a new object which can
then be linked into a livepatch module.

This builds on concepts from the longstanding out-of-tree kpatch [1]
project which began in 2012 and has been used for many years to generate
livepatch modules for production kernels.  However, this is a complete
rewrite which incorporates hard-earned lessons from 12+ years of
maintaining kpatch.

Key improvements compared to kpatch-build:

  - Integrated with objtool: Leverages objtool's existing control-flow
    graph analysis to help detect changed functions.

  - Works on vmlinux.o: Supports late-linked objects, making it
    compatible with LTO, IBT, and similar.

  - Simplified code base: ~3k fewer lines of code.

  - Upstream: No more out-of-tree #ifdef hacks, far less cruft.

  - Cleaner internals: Vastly simplified logic for symbol/section/reloc
    inclusion and special section extraction.

  - Robust __LINE__ macro handling: Avoids false positive binary diffs
    caused by the __LINE__ macro by introducing a fix-patch-lines script
    (coming in a later patch) which injects #line directives into the
    source .patch to preserve the original line numbers at compile time.

Note the end result of this subcommand is not yet functionally complete.
Livepatch needs some ELF magic which linkers don't like:

  - Two relocation sections (.rela*, .klp.rela*) for the same text
    section.

  - Use of SHN_LIVEPATCH to mark livepatch symbols.

Unfortunately linkers tend to mangle such things.  To work around that,
klp diff generates a linker-compliant intermediate binary which encodes
the relevant KLP section/reloc/symbol metadata.

After module linking, a klp post-link step (coming soon) will clean up
the mess and convert the linked .ko into a fully compliant livepatch
module.

Note this subcommand requires the diffed binaries to have been compiled
with -ffunction-sections and -fdata-sections, and processed with
'objtool --checksum'.  Those constraints will be handled by a klp-build
script introduced in a later patch.

Without '-ffunction-sections -fdata-sections', reliable object diffing
would be infeasible due to toolchain limitations:

  - For intra-file+intra-section references, the compiler might
    occasionally generate hard-coded instruction offsets instead of
    relocations.

  - Section-symbol-based references can be ambiguous:

    - Overlapping or zero-length symbols create ambiguity as to which
      symbol is being referenced.

    - A reference to the end of a symbol (e.g., checking array bounds)
      can be misinterpreted as a reference to the next symbol, or vice
      versa.

A potential future alternative to '-ffunction-sections -fdata-sections'
would be to introduce a toolchain option that forces symbol-based
(non-section) relocations.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:18 -07:00
Josh Poimboeuf a3493b3338 objtool/klp: Add --debug-checksum=<funcs> to show per-instruction checksums
Add a --debug-checksum=<funcs> option to the check subcommand to print
the calculated checksum of each instruction in the given functions.

This is useful for determining where two versions of a function begin to
diverge.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:18 -07:00
Josh Poimboeuf 0d83da43b1 objtool/klp: Add --checksum option to generate per-function checksums
In preparation for the objtool klp diff subcommand, add a command-line
option to generate a unique checksum for each function.  This will
enable detection of functions which have changed between two versions of
an object file.
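
Conceptually the checksum is an order-sensitive hash over each decoded
instruction of a function, something like this (illustrative, not the
actual objtool code; insn_hash() is a stand-in):

  static u32 func_checksum(struct objtool_file *file, struct symbol *func)
  {
          struct instruction *insn;
          u32 csum = 0;

          sym_for_each_insn(file, func, insn) {
                  /* fold in opcode, length and relocation targets */
                  csum = csum * 31 + insn_hash(insn);
          }

          return csum;
  }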

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:17 -07:00
Josh Poimboeuf f6b740ef5f objtool: Unify STACK_FRAME_NON_STANDARD entry sizes
The C implementation of STACK_FRAME_NON_STANDARD emits 8-byte entries,
whereas the asm version's entries are only 4 bytes.

Make them consistent by converting the asm version to 8-byte entries.

This is much easier than converting the C version to 4-bytes, which
would require awkwardly putting inline asm in a dummy function in order
to pass the 'func' pointer to the asm.
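
For reference, the C side emits an ordinary 8-byte pointer per entry,
roughly like this (sketch of the existing macro, not the patch itself):

  #define STACK_FRAME_NON_STANDARD(func)                                  \
          static void __used                                              \
                  __section(".discard.func_stack_frame_non_standard")     \
                  *__func_stack_frame_non_standard_##func = func

so the asm macro is switched to emit a matching 8-byte entry.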

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:17 -07:00
Josh Poimboeuf aca282ab7e x86/asm: Annotate special section entries
In preparation for the objtool klp diff subcommand, add annotations for
special section entries.  This will enable objtool to determine the size
and location of the entries and to extract them when needed.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:17 -07:00
Josh Poimboeuf 58f36a5756 objtool: Add ANNOTATE_DATA_SPECIAL
In preparation for the objtool klp diff subcommand, add an
ANNOTATE_DATA_SPECIAL macro which annotates special section entries so
that objtool can determine their size and location and extract them
when needed.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:50:16 -07:00
Josh Poimboeuf d2c60bde1c objtool: Move ANNOTATE* macros to annotate.h
In preparation for using the objtool annotation macros in higher-level
objtool.h macros like UNWIND_HINT, move them to their own file.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:49:20 -07:00
Josh Poimboeuf 3b92486fa1 objtool: Add annotype() helper
... for reading annotation types.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:49 -07:00
Josh Poimboeuf 03c19a99ee objtool: Add elf_create_file()
Add interface to enable the creation of a new ELF file.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:49 -07:00
Josh Poimboeuf 2c05ca0262 objtool: Add elf_create_reloc() and elf_init_reloc()
elf_create_rela_section() is quite limited in that it requires the
caller to know how many relocations need to be allocated up front.

In preparation for the objtool klp diff subcommand, allow an arbitrary
number of relocations to be created and initialized on demand after
section creation.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:49 -07:00
Josh Poimboeuf 431dbabf2d objtool: Add elf_create_data()
In preparation for the objtool klp diff subcommand, refactor
elf_add_string() by adding a new elf_add_data() helper which allows the
adding of arbitrary data to a section.

Make both interfaces global so they can be used by the upcoming klp diff
code.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:48 -07:00
Josh Poimboeuf 243e963853 objtool: Generalize elf_create_section()
In preparation for the objtool klp diff subcommand, broaden the
elf_create_section() interface to give callers more control and reduce
duplication of some subtle setup logic.

While at it, make elf_create_rela_section() global so sections can be
created by the upcoming klp diff code.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:48 -07:00
Josh Poimboeuf dd2c29aafd objtool: Generalize elf_create_symbol()
In preparation for the objtool klp diff subcommand, broaden the
elf_create_symbol() interface to give callers more control and reduce
duplication of some subtle setup logic.

While at it, make elf_create_symbol() and elf_create_section_symbol()
global so sections can be created by the upcoming klp diff code.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:48 -07:00
Josh Poimboeuf 02cf323a7e objtool: Simplify special symbol handling in elf_update_symbol()
!sym->sec isn't actually a thing: even STT_UNDEF and other special
symbol types belong to NULL section 0.

Simplify the initialization of 'shndx' accordingly.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:48 -07:00
Josh Poimboeuf a05de0a772 objtool: Refactor add_jump_destinations()
The add_jump_destinations() logic is a bit weird and convoluted after
being incrementally tweaked over the years.  Refactor it to hopefully be
more logical and straightforward.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:48 -07:00
Josh Poimboeuf 935c0b6a05 objtool: Reindent check_options[]
Bring the cmdline check_options[] array back into vertical alignment for
better readability.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:48 -07:00
Josh Poimboeuf 2b91479776 objtool: Resurrect --backup option
The --backup option was removed with the following commit:

  aa8b3e64fd ("objtool: Create backup on error and print args")

... which tied the backup functionality to --verbose, and only for
warnings/errors.

It's a bit inelegant and out of scope to tie that to --verbose.

Bring back the old --backup option, but with the new behavior: only on
warnings/errors, and print the args to make it easier to recreate.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:48 -07:00
Josh Poimboeuf 56754f0f46 objtool: Rename --Werror to --werror
The objtool --Werror option name is stylistically inconsistent: halfway
between GCC's single-dash capitalized -Werror and objtool's double-dash
--lowercase convention, making it unnecessarily hard to remember.

Make the 'W' lower case (--werror) for consistency with objtool's other
options.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:48 -07:00
Josh Poimboeuf 48f1bbaf26 objtool: Avoid emptying lists for duplicate sections
When a to-be-created section already exists, there's no point in
emptying the various lists whose respective sections already exist.
In fact it's better to leave them intact as they might get used later.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:47 -07:00
Josh Poimboeuf a040ab73df objtool: Simplify reloc offset calculation in unwind_read_hints()
Simplify the relocation offset calculation in unwind_read_hints(),
similar to other conversions which have already been done.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:47 -07:00
Josh Poimboeuf a1526bcfcb objtool: Mark prefix functions
In preparation for the objtool klp diff subcommand, introduce a flag to
identify __pfx_*() and __cfi_*() functions in advance so they don't need
to be manually identified every time a check is needed.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:47 -07:00
Josh Poimboeuf c9e9b85d41 objtool: Fix weak symbol hole detection for .cold functions
When ignore_unreachable_insn() looks for weak function holes which jump
to their .cold functions, it assumes the parent function comes before
the corresponding .cold function in the symbol table.  That's not
necessarily the case with -ffunction-sections.

Mark all the holes beforehand (including .cold functions) so the
ordering of the discovery doesn't matter.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:47 -07:00
Josh Poimboeuf 4ea029389b objtool: Mark .cold subfunctions
Introduce a flag to identify .cold subfunctions so they can be detected
easier and faster.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:46:46 -07:00
Josh Poimboeuf 25eac74b6b objtool: Add section/symbol type helpers
Add some helper macros to improve readability.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:25 -07:00
Josh Poimboeuf 96eceff331 objtool: Convert elf iterator macros to use 'struct elf'
'struct objtool_file' is specific to the check code and doesn't belong
in the elf code which is supposed to be objtool_file-agnostic.  Convert
the elf iterator macros to use 'struct elf' instead.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:25 -07:00
Josh Poimboeuf 72e4b6b44e objtool: Remove .parainstructions reference
The .parainstructions section no longer exists since the following
commit:

  60bc276b12 ("x86/paravirt: Switch mixed paravirt/alternative calls to alternatives").

Remove the reference to it.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:24 -07:00
Josh Poimboeuf 31eca25f3a objtool: Clean up compiler flag usage
KBUILD_HOSTCFLAGS and KBUILD_HOSTLDFLAGS aren't defined when objtool is
built standalone.  Also, the EXTRA_WARNINGS flags are rather arbitrary.

Make things simpler and more consistent by specifying compiler flags
explicitly and tweaking the warnings.  Also make a few code tweaks to
make the new warnings happy.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:24 -07:00
Josh Poimboeuf 34244f784c objtool: Const string cleanup
Use 'const char *' where applicable.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:24 -07:00
Josh Poimboeuf 3e4b5f66cf objtool: Check for missing annotation entries in read_annotate()
Add a sanity check to make sure none of the relocations for the
.discard.annotate_insn section have gone missing.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:24 -07:00
Josh Poimboeuf 4cdee7888f objtool: Fix "unexpected end of section" warning for alternatives
Due to the short circuiting logic in next_insn_to_validate(), control
flow may silently transition from .altinstr_replacement to .text without
a corresponding nested call to validate_branch().

As a result the validate_branch() 'sec' variable doesn't get
reinitialized, which can trigger a confusing "unexpected end of section"
warning which blames .altinstr_replacement rather than the offending
fallthrough function.

Fix that by not caching the section.  There's no point in doing that
anyway.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:24 -07:00
Josh Poimboeuf 68245893cf objtool: Fix __pa_symbol() relocation handling
__pa_symbol() generates a relocation which refers to a physical address.
Convert it back to its virtual form before calculating the addend.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:24 -07:00
Josh Poimboeuf 41d24d7858 objtool: Fix x86 addend calculation
On x86, arch_dest_reloc_offset() hardcodes the addend adjustment to
four, but the actual adjustment depends on the relocation type.  Fix
that.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:24 -07:00
Josh Poimboeuf 72567c630d objtool: Fix weak symbol detection
find_symbol_hole_containing() fails to find a symbol hole (aka stripped
weak symbol) if its section has no symbols before the hole.  This breaks
weak symbol detection if -ffunction-sections is enabled.

Fix that by allowing the interval tree to contain section symbols, which
are always at offset zero for a given section.

Fixes a bunch of (-ffunction-sections) warnings like:

  vmlinux.o: warning: objtool: .text.__x64_sys_io_setup+0x10: unreachable instruction

Fixes: 4adb236867 ("objtool: Ignore extra-symbol code")
Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:23 -07:00
Josh Poimboeuf c2a3e7af31 objtool: Fix interval tree insertion for zero-length symbols
Zero-length symbols get inserted in the wrong spot.  Fix that.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:23 -07:00
Josh Poimboeuf 81cf39be35 objtool: Add empty symbols to the symbol tree again
The following commit

  5da6aea375 ("objtool: Fix find_{symbol,func}_containing()")

fixed the issue where overlapping symbols weren't getting sorted
properly in the symbol tree.  Therefore the workaround to skip adding
empty symbols from the following commit

  a2e38dffcd ("objtool: Don't add empty symbols to the rbtree")

is no longer needed.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:23 -07:00
Josh Poimboeuf 4ac2ba35f6 objtool: Remove error handling boilerplate
Up to a certain point in objtool's execution, all errors are fatal and
return -1.  When propagating such errors, just return -1 directly
instead of trying to propagate the original return code.  This helps
make the code more compact and the behavior more explicit.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:23 -07:00
Josh Poimboeuf 2bb23cbf3f objtool: Propagate elf_truncate_section() error in elf_write()
Properly check and propagate the return value of elf_truncate_section()
to avoid silent failures.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:23 -07:00
Josh Poimboeuf 9ebb662fab objtool: Fix broken error handling in read_symbols()
The free(sym) call in the read_symbols() error path is fundamentally
broken: 'sym' doesn't point to any allocated block.  If triggered,
things would go from bad to worse.

Remove the free() and simplify the error paths.  Freeing memory isn't
necessary here anyway, these are fatal errors which lead to an immediate
exit().

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:23 -07:00
Josh Poimboeuf 07e1c3fd86 objtool: Make find_symbol_containing() less arbitrary
In the rare case of overlapping symbols, find_symbol_containing() just
returns the first one it finds.  Make it slightly less arbitrary by
returning the smallest symbol with size > 0.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:23 -07:00
Josh Poimboeuf b37491d72b interval_tree: Fix ITSTATIC usage for *_subtree_search()
For consistency with the other function templates, change
_subtree_search_*() to use the user-supplied ITSTATIC rather than the
hard-coded 'static'.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:22 -07:00
Josh Poimboeuf 9b7eacac22 interval_tree: Sync interval_tree_generic.h with tools
The following commit made an improvement to interval_tree_generic.h, but
didn't sync it to the tools copy:

  1981128578 ("lib/interval_tree: skip the check before go to the right subtree")

Sync it, and add it to objtool's sync-check.sh so they are more likely
to stay in sync going forward.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:22 -07:00
Josh Poimboeuf 3049fc4b5f x86/alternative: Refactor INT3 call emulation selftest
The INT3 call emulation selftest is a bit fragile as it relies on the
compiler not inserting any extra instructions before the
int3_selftest_ip() definition.

Also, the int3_selftest_ip() symbol overlaps with the int3_selftest()
symbol, which can confuse objtool.

Fix those issues by slightly reworking the functionality and moving
int3_selftest_ip() to a separate asm function.  While at it, improve the
naming.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:22 -07:00
Josh Poimboeuf 4109043bff modpost: Ignore unresolved section bounds symbols
In preparation for klp-build livepatch module creation tooling,
suppress warnings for unresolved references to linker-generated
__start_* and __stop_* section bounds symbols.

These symbols are expected to be undefined when modpost runs, as they're
created later by the linker.

Cc: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:22 -07:00
Josh Poimboeuf 6717e8f91d kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME
In preparation for the objtool klp diff subcommand, remove the arbitrary
'kmod_' prefix from __KBUILD_MODNAME and instead add it explicitly in
the __initcall_id() macro.

This change supports the standardization of "unique" symbol naming by
ensuring the non-unique portion of the name comes before the unique
part.  That will enable objtool to properly correlate symbols across
builds.

Cc: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:22 -07:00
Josh Poimboeuf c2d420796a elfnote: Change ELFNOTE() to use __UNIQUE_ID()
In preparation for the objtool klp diff subcommand, replace the custom
unique symbol name generation in ELFNOTE() with __UNIQUE_ID().

This standardizes the naming format for all "unique" symbols, which will
allow objtool to properly correlate them.  Note this also removes the
"one ELF note per line" limitation.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:22 -07:00
Josh Poimboeuf 9f14f1f918 compiler.h: Make addressable symbols less of an eyesore
Avoid underscore overload by changing:

  __UNIQUE_ID___addressable_loops_per_jiffy_868

to the following:

  __UNIQUE_ID_addressable_loops_per_jiffy_868

This matches the format used by other __UNIQUE_ID()-generated symbols
and improves readability for those who stare at ELF symbol table dumps.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:21 -07:00
Josh Poimboeuf afb026b6d3 compiler: Tweak __UNIQUE_ID() naming
In preparation for the objtool klp diff subcommand, add an underscore
between the name and the counter.  This will make it possible for
objtool to distinguish between the non-unique and unique parts of the
symbol name so it can properly correlate the symbols.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:21 -07:00
Josh Poimboeuf 122679ebf9 x86/kprobes: Remove STACK_FRAME_NON_STANDARD annotation
Since commit 877b145f0f ("x86/kprobes: Move trampoline code into
RODATA"), the optprobe template code is no longer analyzed by objtool so
it doesn't need to be ignored.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:21 -07:00
Josh Poimboeuf bf770d6d20 x86/module: Improve relocation error messages
Add the section number and reloc index to relocation error messages to
help find the faulty relocation.

Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:21 -07:00
Josh Poimboeuf 1ba9f89794 vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros
TEXT_MAIN, DATA_MAIN and friends are defined differently depending on
whether certain config options enable -ffunction-sections and/or
-fdata-sections.

There's no technical reason for that beyond voodoo coding.  Keeping the
separate implementations adds unnecessary complexity, fragments the
logic, and increases the risk of subtle bugs.

Unify the macros by using the same input section patterns across all
configs.

This is a prerequisite for the upcoming livepatch klp-build tooling
which will manually enable -ffunction-sections and -fdata-sections via
KCFLAGS.

Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:21 -07:00
Josh Poimboeuf 68e71067ec s390/vmlinux.lds.S: Prevent thunk functions from getting placed with normal text
The s390 indirect thunks are placed in the .text.__s390_indirect_jump_*
sections.

Certain config options which enable -ffunction-sections have a custom
version of the TEXT_TEXT macro:

  .text.[0-9a-zA-Z_]*

That unintentionally matches the thunk sections, causing them to get
grouped with normal text rather than being handled by their intended
rule later in the script:

  *(.text.*_indirect_*)

Fix that by adding another period to the thunk section names, following
the kernel's general convention for distinguishing code-generated text
sections from compiler-generated ones.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Petr Mladek <pmladek@suse.com>
Tested-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:21 -07:00
Dylan Hatch be8374a5ba objtool: Fix standalone --hacks=jump_label
The objtool command line 'objtool --hacks=jump_label foo.o' on its own
should be expected to rewrite jump labels to NOPs. This means the
add_special_section_alts() code path needs to run when only this option
is provided.

This is mainly relevant in certain debugging situations, but could
potentially also fix kernel builds in which objtool is run with
--hacks=jump_label but without --orc, --stackval, --uaccess, or
--hacks=noinstr.

Fixes: de6fbcedf5 ("objtool: Read special sections with alts only when specific options are selected")
Signed-off-by: Dylan Hatch <dylanbhatch@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:21 -07:00
Pankaj Raghav ff5c046648 scripts/faddr2line: Fix "Argument list too long" error
The run_readelf() function reads the entire output of readelf into a
single shell variable. For large object files with extensive debug
information, the size of this variable can exceed the system's
command-line argument length limit.

When this variable is subsequently passed to sed via `echo "${out}"`, it
triggers an "Argument list too long" error, causing the script to fail.

Fix this by redirecting the output of readelf to a temporary file
instead of a variable. The sed commands are then modified to read from
this file, avoiding the argument length limitation entirely.

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:20 -07:00
Pankaj Raghav 6b4679fcbf scripts/faddr2line: Use /usr/bin/env bash for portability
The shebang `#!/bin/bash` assumes a fixed path for the bash interpreter.
This path does not exist on some systems, such as NixOS, causing the
script to fail.

Replace `/bin/bash` with the more portable `#!/usr/bin/env bash`.

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:20 -07:00
John Wang 567f9c428f scripts/faddr2line: Set LANG=C to enforce ASCII output
Force tools like readelf to use the POSIX/C locale by exporting LANG=C.
This ensures ASCII-only output and avoids locale-specific characters
(e.g., UTF-8 symbols or translated strings), which could break text
processing utilities like sed in the script.

Signed-off-by: John Wang <wangzq.jn@gmail.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:20 -07:00
Josh Poimboeuf a808a2b35f tools build: Fix fixdep dependencies
The tools version of fixdep has broken dependencies.  It doesn't get
rebuilt if the host compiler or headers change.

Build fixdep with the tools kbuild infrastructure, so fixdep runs on
itself.  Due to the recursive dependency, its dependency file is
incomplete the very first time it gets built.  In that case build it a
second time to achieve fixdep inception.

Reported-by: Arthur Marsh <arthur.marsh@internode.on.net>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:20 -07:00
Chen Ni 2e985fdb7e objtool: Remove unneeded semicolon
Remove unnecessary semicolons reported by Coccinelle/coccicheck and the
semantic patch at scripts/coccinelle/misc/semicolon.cocci.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
2025-10-14 14:45:20 -07:00
Peter Zijlstra 044f721ccd objtool/x86: Fix NOP decode
For x86_64 the kernel consistently uses 2 instructions for all NOPs:

  90       - NOP
  0f 1f /0 - NOPL

Notably:

 - REP NOP is PAUSE, not a NOP instruction.

 - 0f {0c...0f} is reserved space,
   except for 0f 0d /1, which is PREFETCHW, not a NOP.

 - 0f {19,1c...1f} is reserved space,
   except for 0f 1f /0, which is NOPL.
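
In decoder terms, that boils down to something like this (a sketch, not
the exact objtool change):

  /* Only these two encodings are treated as NOPs. */
  static bool insn_is_nop(struct insn *insn)
  {
          /* 0x90 is NOP, but only without a REP prefix (0xf3 0x90 is PAUSE) */
          if (insn->opcode.bytes[0] == 0x90)
                  return !insn_has_rep_prefix(insn);

          /* 0x0f 0x1f /0 is NOPL; the other 0f 19,1c-1f encodings stay reserved */
          if (insn->opcode.bytes[0] == 0x0f && insn->opcode.bytes[1] == 0x1f)
                  return X86_MODRM_REG(insn->modrm.bytes[0]) == 0;

          return false;
  }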

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-14 13:43:11 +02:00
Peter Zijlstra 76e1851a1b objtool/x86: Add UDB support
Per commit 85a2d4a890 ("x86,ibt: Use UDB instead of 0xEA"), make
sure objtool also recognises UDB as a #UD instruction.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>
2025-10-14 13:43:11 +02:00
Peter Zijlstra c5df4e1ab8 objtool/x86: Remove 0xea hack
This was properly fixed in the decoder with commit 4b626015e1 ("x86/insn:
Stop decoding i64 instructions in x86-64 mode at opcode").

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>
2025-10-14 13:43:10 +02:00
Juergen Gross ad74016b91 x86/alternative: Drop not needed test after call of alt_replace_call()
alt_replace_call() will never return a negative value, so testing the
return value to be less than zero can be dropped.

This makes it possible to switch the return type of alt_replace_call()
and the type of insn_buff_sz to unsigned int.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2025-10-14 10:38:11 +02:00
Chen Yu a0a0999507 x86/resctrl: Support Sub-NUMA Cluster (SNC) mode on Clearwater Forest
Clearwater Forest supports SNC mode. Add it to the snc_cpu_ids[] table.

Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Acked-by: Tony Luck <tony.luck@intel.com>
2025-10-13 16:59:55 +02:00
Ingo Molnar a53d0cf7f1 Merge commit 'linus' into core/bugs, to resolve conflicts
Resolve conflicts with this commit that was developed in parallel
during the merge window:

 8c8efa93db ("x86/bug: Add ARCH_WARN_ASM macro for BUG/WARN asm code sharing with Rust")

 Conflicts:
	arch/riscv/include/asm/bug.h
	arch/x86/include/asm/bug.h

Signed-off-by: Ingo Molnar <mingo@kernel.org>
2025-08-05 11:15:34 +02:00
Heiko Carstens ed845c363d bugs/s390: Remove private WARN_ON() implementation
Besides an odd __builtin_constant_p() optimization the s390 specific
WARN_ON() implementation is identical to the generic variant.
Drop the s390 variant in favor of the generic variant.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org> # Rebased ancestor commits
Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Link: https://lore.kernel.org/r/20250617135042.1878068-2-hca@linux.ibm.com
2025-07-28 08:07:07 +02:00
Ingo Molnar 28ea295f94 bugs/core: Reorganize fields in the first line of WARNING output, add ->comm[] output
With the introduction of the condition string as part of the 'file'
string output of kernel warnings, the first line has become a bit
harder to read:

   WARNING: CPU: 0 PID: 0 at [ptr == 0 && 1] kernel/sched/core.c:8511 sched_init+0x20/0x410

Re-order the fields by importance (higher to lower), make the 'at' meaningful
again, and add '->comm[]' output which is often more valuable than a PID.

Also, remove the 'PID' prefix - in combination with comm it's clear what it is.

These changes make the output only slightly longer:

   WARNING: [ptr == 0 && 1] kernel/sched/core.c:8511 at sched_init+0x20/0x410 CPU#0: swapper/0

while adding more information and making it better organized.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org> # Rebased ancestor commits
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-16-mingo@kernel.org
2025-07-28 08:06:55 +02:00
Ingo Molnar be2ba2fef1 bugs/sh: Concatenate 'cond_str' with '__FILE__' in __WARN_FLAGS(), to extend WARN_ON/BUG_ON output
Extend WARN_ON and BUG_ON style output from:

  WARNING: CPU: 0 PID: 0 at kernel/sched/core.c:8511 sched_init+0x20/0x410

to:

  WARNING: CPU: 0 PID: 0 at [idx < 0 && ptr] kernel/sched/core.c:8511 sched_init+0x20/0x410

Note that the output will be further reorganized later in this series.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org> # Rebased ancestor commits
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: linux-arch@vger.kernel.org
Cc: linux-sh@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-15-mingo@kernel.org
2025-07-28 08:06:43 +02:00
Ingo Molnar f40484925b bugs/parisc: Concatenate 'cond_str' with '__FILE__' in __WARN_FLAGS(), to extend WARN_ON/BUG_ON output
Extend WARN_ON and BUG_ON style output from:

  WARNING: CPU: 0 PID: 0 at kernel/sched/core.c:8511 sched_init+0x20/0x410

to:

  WARNING: CPU: 0 PID: 0 at [idx < 0 && ptr] kernel/sched/core.c:8511 sched_init+0x20/0x410

Note that the output will be further reorganized later in this series.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org> # Rebased ancestor commits
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Helge Deller <deller@gmx.de>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-14-mingo@kernel.org
2025-07-28 08:06:27 +02:00
Ingo Molnar bb39faa71d bugs/riscv: Concatenate 'cond_str' with '__FILE__' in __BUG_FLAGS(), to extend WARN_ON/BUG_ON output
Extend WARN_ON and BUG_ON style output from:

  WARNING: CPU: 0 PID: 0 at kernel/sched/core.c:8511 sched_init+0x20/0x410

to:

  WARNING: CPU: 0 PID: 0 at [idx < 0 && ptr] kernel/sched/core.c:8511 sched_init+0x20/0x410

Note that the output will be further reorganized later in this series.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org> # Rebased ancestor commits
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Link: https://lore.kernel.org/r/20250515124644.2958810-13-mingo@kernel.org
2025-07-28 08:06:06 +02:00
Ingo Molnar 7e8c292692 bugs/riscv: Pass in 'cond_str' to __BUG_FLAGS()
Pass in the condition string from __WARN_FLAGS() to __BUG_FLAGS(),
but don't use it yet.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org> # Rebased ancestor commits
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: linux-arch@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Link: https://lore.kernel.org/r/20250515124644.2958810-12-mingo@kernel.org
2025-07-28 08:05:49 +02:00
Heiko Carstens 6584ff203a bugs/s390: Use 'cond_str' in __EMIT_BUG()
The simple thing would be to add the string as an assembly immediate
input operand. Some older gcc variants cannot handle strings as
immediate input operands for inline assemblies. Doing so may result in
compile errors.

Rewrite the s390 generic bug support very similar to arm64 and
loongarch, and get rid of all input operands to fix this.

  [ peterz: backmerge fix and massage changelog ]

  [ bp: clang integrated assembler concatenates only .ascii strings:
    https://lore.kernel.org/r/202507020528.N0LtekXt-lkp@intel.com ]

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org> # Fixed the tags section
Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: linux-s390@vger.kernel.org
Link: https://lore.kernel.org/r/20250520133927.7932C19-hca@linux.ibm.com
Link: https://lore.kernel.org/r/20250617135042.1878068-3-hca@linux.ibm.com
2025-07-28 08:02:43 +02:00
Ingo Molnar 7ce0f693cb bugs/s390: Pass in 'cond_str' to __EMIT_BUG()
Pass in the condition string from __WARN_FLAGS(), but do not
concatenate it with __FILE__, because it results in s390
assembler build errors that are beyond my s390-asm-fu.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: linux-s390@vger.kernel.org
Cc: <linux-arch@vger.kernel.org>
Link: https://lore.kernel.org/r/20250515124644.2958810-11-mingo@kernel.org
2025-07-28 08:01:56 +02:00
Ingo Molnar d6b894cbfa bugs/LoongArch: Concatenate 'cond_str' with '__FILE__' in __BUG_ENTRY(), to extend WARN_ON/BUG_ON output
Extend WARN_ON and BUG_ON style output from:

  WARNING: CPU: 0 PID: 0 at kernel/sched/core.c:8511 sched_init+0x20/0x410

to:

  WARNING: CPU: 0 PID: 0 at [idx < 0 && ptr] kernel/sched/core.c:8511 sched_init+0x20/0x410

Note that the output will be further reorganized later in this series.

[ peterz: backmerge fix from Nathan ]

Fixed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org> # Cleaned up tags section
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-10-mingo@kernel.org
Link: https://lore.kernel.org/r/20250616-loongarch-fix-warn-cond-llvm-ias-v1-1-6c6d90bb4466@kernel.org
2025-07-28 08:01:23 +02:00
Ingo Molnar 66e94df0dd bugs/LoongArch: Pass in 'cond_str' to __BUG_ENTRY()
Pass in the condition string from __WARN_FLAGS(), but don't use it yet.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-9-mingo@kernel.org
2025-06-13 10:25:32 +02:00
Ingo Molnar 1284579a7f bugs/powerpc: Concatenate 'cond_str' with '__FILE__' in BUG_ENTRY(), to extend WARN_ON/BUG_ON output
Extend WARN_ON and BUG_ON style output from:

  WARNING: CPU: 0 PID: 0 at kernel/sched/core.c:8511 sched_init+0x20/0x410

to:

  WARNING: CPU: 0 PID: 0 at [idx < 0 && ptr] kernel/sched/core.c:8511 sched_init+0x20/0x410

Note that the output will be further reorganized later in this series.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N Rao <naveen@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: linux-arch@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Link: https://lore.kernel.org/r/20250515124644.2958810-8-mingo@kernel.org
2025-06-13 10:25:32 +02:00
Ingo Molnar 1c59c2b284 bugs/powerpc: Pass in 'cond_str' to BUG_ENTRY()
Pass in the condition string from __WARN_FLAGS(), WARN_ON()
and BUG_ON(), but don't use it yet.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N Rao <naveen@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: linux-arch@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Link: https://lore.kernel.org/r/20250515124644.2958810-7-mingo@kernel.org
2025-06-13 10:25:32 +02:00
Ingo Molnar 48ede5be5c bugs/x86: Augment warnings output by concatenating 'cond_str' with the regular __FILE__ string in _BUG_FLAGS()
This allows the reuse of the UD2 based 'struct bug_entry' low-overhead
_BUG_FLAGS() implementation and string-printing backend, without
having to add a new field.

An example:

If we have the following WARN_ON_ONCE() in kernel/sched/core.c:

	WARN_ON_ONCE(idx < 0 && ptr);

Then previously _BUG_FLAGS() would store this string in bug_entry::file:

	"kernel/sched/core.c"

After this patch, it would store and print:

	"[idx < 0 && ptr] kernel/sched/core.c"

Which is an extended string that will be printed in warnings.
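
( A minimal userspace sketch of the concatenation idea, with made-up
  names — only the "[idx < 0 && ptr] " format above is from the patch: )

	#include <stdio.h>

	/* stand-in for the stringified condition passed down as 'cond_str' */
	#define COND_STR	"[idx < 0 && ptr] "

	int main(void)
	{
		/*
		 * Adjacent string literals fuse at compile time, so the
		 * existing bug_entry::file slot can carry "[cond] path"
		 * as a single string without any new struct field.
		 */
		static const char bug_file[] = COND_STR __FILE__;

		puts(bug_file);		/* "[idx < 0 && ptr] <this file>" */
		return 0;
	}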

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-6-mingo@kernel.org
2025-06-13 10:25:32 +02:00
Ingo Molnar 407b9076c1 bugs/x86: Extend _BUG_FLAGS() with the 'cond_str' parameter
Just pass down the parameter, don't do anything with it yet.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-5-mingo@kernel.org
2025-06-13 10:25:32 +02:00
Ingo Molnar 687fac9d1b bugs/core: Introduce the CONFIG_DEBUG_BUGVERBOSE_DETAILED Kconfig switch
Allow configurability of the inclusion of more detailed
WARN_ON() strings, to be implemented in subsequent
commits.

Since the full cost will be around 100K more memory on
an x86 defconfig, disable it by default.

Provide the WARN_CONDITION_STR() macro to allow the conditional
passing of extra strings to lower level BUG/WARN handlers.
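
( A rough sketch of what such a conditional helper could look like —
  an assumption for illustration, not the actual kernel definition: )

	#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED
	/* detailed output configured in: pass the condition string through */
	# define WARN_CONDITION_STR(cond_str)	cond_str
	#else
	/* otherwise expand to an empty literal, so callers can still
	   concatenate it with __FILE__ unconditionally */
	# define WARN_CONDITION_STR(cond_str)	""
	#endif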

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-4-mingo@kernel.org
2025-06-13 10:25:29 +02:00
Ingo Molnar 3bc3c9c3ab bugs/core: Pass down the condition string of WARN_ON_ONCE(cond) warnings to __WARN_FLAGS()
Doing this will allow architecture code to store and print out
this information as part of the WARN_ON and BUG_ON facilities.

The format of the string is '[condition]', for example:

  WARN_ON_ONCE(idx < 0 && ptr);

Will get the '[idx < 0 && ptr]' string literal passed down as 'cond_str'
in __WARN_FLAGS().
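
( Sketch of the stringification step, with made-up macro names, just to
  show how '#cond' yields the '[condition]' literal; the real
  WARN_ON_ONCE()/__WARN_FLAGS() plumbing carries more state: )

	#include <stdio.h>

	#define MY_WARN_FLAGS(cond_str, flags) \
		printf("WARNING at %s%s:%d\n", cond_str, __FILE__, __LINE__)

	#define MY_WARN_ON_ONCE(cond) do {			\
		if (cond)					\
			MY_WARN_FLAGS("[" #cond "] ", 0);	\
	} while (0)

	/*
	 * MY_WARN_ON_ONCE(idx < 0 && ptr) expands its argument twice:
	 * once as the tested expression and once, via '#', as the string
	 * literal "[idx < 0 && ptr] " handed down as cond_str.
	 */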

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-3-mingo@kernel.org
2025-06-13 10:20:52 +02:00
Ingo Molnar aec58b4851 bugs/core: Extend __WARN_FLAGS() with the 'cond_str' parameter
Push the new parameter down into every architecture that defines __WARN_FLAGS():

  arm64
  loongarch
  parisc
  powerpc
  riscv
  s390
  sh
  x86

Don't pass anything substantial down yet, just propagate the
new parameter with empty strings, without generating it or
using it.

( The string is never NULL, so it can be concatenated at the
  preprocessor level. )

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arch@vger.kernel.org
Link: https://lore.kernel.org/r/20250515124644.2958810-2-mingo@kernel.org
2025-06-13 10:20:52 +02:00
777 changed files with 36837 additions and 14125 deletions

View File

@ -406,24 +406,8 @@ index of the MC::
|->mc2
....
Under each ``mcX`` directory each ``csrowX`` is again represented by a
``csrowX``, where ``X`` is the csrow index::
.../mc/mc0/
|
|->csrow0
|->csrow2
|->csrow3
....
Notice that there is no csrow1, which indicates that csrow0 is composed
of single-rank DIMMs. The same should also hold for both channels, in
order for dual-channel mode to be operational. Since both csrow2 and
csrow3 are populated, this indicates a dual-rank set of DIMMs for
channels 0 and 1.
Within each of the ``mcX`` and ``csrowX`` directories are several EDAC
control and attribute files.
Within each of the ``mcX`` directory are several EDAC control and
attribute files.
``mcX`` directories
-------------------
@ -569,7 +553,7 @@ this ``X`` memory module:
- Unbuffered-DDR
.. [#f5] On some systems, the memory controller doesn't have any logic
to identify the memory module. On such systems, the directory is called ``rankX`` and works on a similar way as the ``csrowX`` directories.
to identify the memory module. On such systems, the directory is called ``rankX``.
On modern Intel memory controllers, the memory controller identifies the
memory modules directly. On such systems, the directory is called ``dimmX``.
@ -577,126 +561,6 @@ this ``X`` memory module:
symlinks inside the sysfs mapping that are automatically created by
the sysfs subsystem. Currently, they serve no purpose.
``csrowX`` directories
----------------------
When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the ``csrowX``
directories. As this API doesn't work properly for Rambus, FB-DIMMs and
modern Intel Memory Controllers, this is being deprecated in favor of
``dimmX`` directories.
In the ``csrowX`` directories are EDAC control and attribute files for
this ``X`` instance of csrow:
- ``ue_count`` - Total Uncorrectable Errors count attribute file
This attribute file displays the total count of uncorrectable
errors that have occurred on this csrow. If panic_on_ue is set
this counter will not have a chance to increment, since EDAC
will panic the system.
- ``ce_count`` - Total Correctable Errors count attribute file
This attribute file displays the total count of correctable
errors that have occurred on this csrow. This count is very
important to examine. CEs provide early indications that a
DIMM is beginning to fail. This count field should be
monitored for non-zero values and report such information
to the system administrator.
- ``size_mb`` - Total memory managed by this csrow attribute file
This attribute file displays, in count of megabytes, the memory
that this csrow contains.
- ``mem_type`` - Memory Type attribute file
This attribute file will display what type of memory is currently
on this csrow. Normally, either buffered or unbuffered memory.
Examples:
- Registered-DDR
- Unbuffered-DDR
- ``edac_mode`` - EDAC Mode of operation attribute file
This attribute file will display what type of Error detection
and correction is being utilized.
- ``dev_type`` - Device type attribute file
This attribute file will display what type of DRAM device is
being utilized on this DIMM.
Examples:
- x1
- x2
- x4
- x8
- ``ch0_ce_count`` - Channel 0 CE Count attribute file
This attribute file will display the count of CEs on this
DIMM located in channel 0.
- ``ch0_ue_count`` - Channel 0 UE Count attribute file
This attribute file will display the count of UEs on this
DIMM located in channel 0.
- ``ch0_dimm_label`` - Channel 0 DIMM Label control file
This control file allows this DIMM to have a label assigned
to it. With this label in the module, when errors occur
the output can provide the DIMM label in the system log.
This becomes vital for panic events to isolate the
cause of the UE event.
DIMM Labels must be assigned after booting, with information
that correctly identifies the physical slot with its
silk screen label. This information is currently very
motherboard specific and determination of this information
must occur in userland at this time.
- ``ch1_ce_count`` - Channel 1 CE Count attribute file
This attribute file will display the count of CEs on this
DIMM located in channel 1.
- ``ch1_ue_count`` - Channel 1 UE Count attribute file
This attribute file will display the count of UEs on this
DIMM located in channel 0.
- ``ch1_dimm_label`` - Channel 1 DIMM Label control file
This control file allows this DIMM to have a label assigned
to it. With this label in the module, when errors occur
the output can provide the DIMM label in the system log.
This becomes vital for panic events to isolate the
cause of the UE event.
DIMM Labels must be assigned after booting, with information
that correctly identifies the physical slot with its
silk screen label. This information is currently very
motherboard specific and determination of this information
must occur in userland at this time.
System Logging
--------------

View File

@ -6207,7 +6207,7 @@
rdt= [HW,X86,RDT]
Turn on/off individual RDT features. List is:
cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp,
mba, smba, bmec, abmc.
mba, smba, bmec, abmc, sdciae.
E.g. to turn on cmt and turn off mba use:
rdt=cmt,!mba
@ -6500,6 +6500,10 @@
Memory area to be used by remote processor image,
managed by CMA.
rseq_debug= [KNL] Enable or disable restartable sequence
debug mode. Defaults to CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE.
Format: <bool>
rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling
when CONFIG_RT_GROUP_SCHED=y. Defaults to
!CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED.

View File

@ -39,6 +39,9 @@ properties:
- amlogic,a4-gpio-ao-intc
- amlogic,a5-gpio-intc
- amlogic,c3-gpio-intc
- amlogic,s6-gpio-intc
- amlogic,s7-gpio-intc
- amlogic,s7d-gpio-intc
- amlogic,t7-gpio-intc
- const: amlogic,meson-gpio-intc

View File

@ -25,13 +25,14 @@ properties:
interrupt-controller: true
'#interrupt-cells':
const: 2
const: 1
description:
The first cell is the IRQ number, the second cell is the trigger
type as defined in interrupt.txt in this directory.
interrupts:
maxItems: 6
minItems: 1
maxItems: 10
description: |
Depends on which of INTC0 or INTC1 is used.
INTC0 and INTC1 are two kinds of interrupt controller with enable and raw
@ -74,13 +75,17 @@ examples:
interrupt-controller@12101b00 {
compatible = "aspeed,ast2700-intc-ic";
reg = <0 0x12101b00 0 0x10>;
#interrupt-cells = <2>;
#interrupt-cells = <1>;
interrupt-controller;
interrupts = <GIC_SPI 192 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 193 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 194 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 195 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 196 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 197 IRQ_TYPE_LEVEL_HIGH>;
<GIC_SPI 197 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 198 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 199 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 200 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 201 IRQ_TYPE_LEVEL_HIGH>;
};
};

View File

@ -58,6 +58,7 @@ properties:
- const: andestech,nceplic100
- items:
- enum:
- anlogic,dr1v90-plic
- canaan,k210-plic
- eswin,eic7700-plic
- sifive,fu540-c000-plic
@ -75,6 +76,9 @@ properties:
- sophgo,sg2044-plic
- thead,th1520-plic
- const: thead,c900-plic
- items:
- const: ultrarisc,dp1000-plic
- const: ultrarisc,cp100-plic
- items:
- const: sifive,plic-1.0.0
- const: riscv,plic0

View File

@ -4,18 +4,23 @@
$id: http://devicetree.org/schemas/interrupt-controller/thead,c900-aclint-mswi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Sophgo sg2042 CLINT Machine-level Software Interrupt Device
title: ACLINT Machine-level Software Interrupt Device
maintainers:
- Inochi Amaoto <inochiama@outlook.com>
properties:
compatible:
items:
- enum:
- sophgo,sg2042-aclint-mswi
- sophgo,sg2044-aclint-mswi
- const: thead,c900-aclint-mswi
oneOf:
- items:
- enum:
- sophgo,sg2042-aclint-mswi
- sophgo,sg2044-aclint-mswi
- const: thead,c900-aclint-mswi
- items:
- enum:
- anlogic,dr1v90-aclint-mswi
- const: nuclei,ux900-aclint-mswi
reg:
maxItems: 1

View File

@ -30,6 +30,10 @@ properties:
- const: thead,c900-aclint-sswi
- items:
- const: mips,p8700-aclint-sswi
- items:
- enum:
- anlogic,dr1v90-aclint-sswi
- const: nuclei,ux900-aclint-sswi
reg:
maxItems: 1

View File

@ -0,0 +1,47 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/timer/realtek,rtd1625-systimer.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Realtek System Timer
maintainers:
- Hao-Wen Ting <haowen.ting@realtek.com>
description:
The Realtek SYSTIMER (System Timer) is a 64-bit global hardware counter operating
at a fixed 1MHz frequency. Thanks to its compare match interrupt capability,
the timer natively supports oneshot mode for tick broadcast functionality.
properties:
compatible:
oneOf:
- const: realtek,rtd1625-systimer
- items:
- const: realtek,rtd1635-systimer
- const: realtek,rtd1625-systimer
reg:
maxItems: 1
interrupts:
maxItems: 1
required:
- compatible
- reg
- interrupts
additionalProperties: false
examples:
- |
#include <dt-bindings/interrupt-controller/arm-gic.h>
timer@89420 {
compatible = "realtek,rtd1635-systimer",
"realtek,rtd1625-systimer";
reg = <0x89420 0x18>;
interrupts = <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>;
};

View File

@ -1705,6 +1705,8 @@ patternProperties:
description: Universal Scientific Industrial Co., Ltd.
"^usr,.*":
description: U.S. Robotics Corporation
"^ultrarisc,.*":
description: UltraRISC Technology Co., Ltd.
"^ultratronik,.*":
description: Ultratronik GmbH
"^utoo,.*":

View File

@ -135,6 +135,27 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:
* ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.
``struct iomap_read_ops``
--------------------------
.. code-block:: c
struct iomap_read_ops {
int (*read_folio_range)(const struct iomap_iter *iter,
struct iomap_read_folio_ctx *ctx, size_t len);
void (*submit_read)(struct iomap_read_folio_ctx *ctx);
};
iomap calls these functions:
- ``read_folio_range``: Called to read in the range. This must be provided
by the caller. If it returns success, iomap_finish_folio_read() must be
called once the range has been read in, regardless of whether that read
succeeded or failed.
- ``submit_read``: Submit any pending read requests. This function is
optional.
Internal per-Folio State
------------------------
@ -182,6 +203,28 @@ The ``flags`` argument to ``->iomap_begin`` will be set to zero.
The pagecache takes whatever locks it needs before calling the
filesystem.
Both ``iomap_readahead`` and ``iomap_read_folio`` pass in a ``struct
iomap_read_folio_ctx``:
.. code-block:: c
struct iomap_read_folio_ctx {
const struct iomap_read_ops *ops;
struct folio *cur_folio;
struct readahead_control *rac;
void *read_ctx;
};
``iomap_readahead`` must set:
* ``ops->read_folio_range()`` and ``rac``
``iomap_read_folio`` must set:
* ``ops->read_folio_range()`` and ``cur_folio``
``ops->submit_read()`` and ``read_ctx`` are optional. ``read_ctx`` is used to
pass in any custom data the caller needs accessible in the ops callbacks for
fulfilling reads.
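As a rough sketch (only the struct fields and callback signatures shown
above are from this document; every name prefixed with ``myfs_`` is a
hypothetical placeholder), a ``->read_folio`` caller might be wired up
along these lines:

.. code-block:: c

    /* hypothetical filesystem glue; see the note above */
    static int myfs_read_folio_range(const struct iomap_iter *iter,
                                     struct iomap_read_folio_ctx *ctx,
                                     size_t len);
    static void myfs_submit_read(struct iomap_read_folio_ctx *ctx);

    static const struct iomap_read_ops myfs_read_ops = {
        .read_folio_range = myfs_read_folio_range,  /* required */
        .submit_read      = myfs_submit_read,       /* optional */
    };

    /* ->read_folio() path: cur_folio must be set, rac stays NULL */
    struct iomap_read_folio_ctx ctx = {
        .ops       = &myfs_read_ops,
        .cur_folio = folio,
        .read_ctx  = NULL,  /* optional caller-private data */
    };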
Buffered Writes
---------------
@ -317,6 +360,9 @@ The fields are as follows:
delalloc reservations to avoid having delalloc reservations for
clean pagecache.
This function must be supplied by the filesystem.
If this succeeds, iomap_finish_folio_write() must be called once writeback
completes for the range, regardless of whether the writeback succeeded or
failed.
- ``writeback_submit``: Submit the previous built writeback context.
Block based file systems should use the iomap_ioend_writeback_submit
@ -444,10 +490,6 @@ These ``struct kiocb`` flags are significant for direct I/O with iomap:
Only meaningful for asynchronous I/O, and only if the entire I/O can
be issued as a single ``struct bio``.
* ``IOCB_DIO_CALLER_COMP``: Try to run I/O completion from the caller's
process context.
See ``linux/fs.h`` for more details.
Filesystems should call ``iomap_dio_rw`` from ``->read_iter`` and
``->write_iter``, and set ``FMODE_CAN_ODIRECT`` in the ``->open``
function for the file.

View File

@ -211,7 +211,7 @@ test and set for you.
e.g.::
inode = iget_locked(sb, ino);
if (inode->i_state & I_NEW) {
if (inode_state_read_once(inode) & I_NEW) {
err = read_inode_from_disk(inode);
if (err < 0) {
iget_failed(inode);
@ -1309,3 +1309,16 @@ a different length, use
vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len))
instead.
---
**mandatory**
vfs_mkdir() now returns a dentry - the one returned by ->mkdir(). If
that dentry is different from the dentry passed in, including if it is
an IS_ERR() dentry pointer, the original dentry is dput().
When vfs_mkdir() returns an error, and so both dputs() the original
dentry and doesn't provide a replacement, it also unlocks the parent.
Consequently the return value from vfs_mkdir() can be passed to
end_creating() and the parent will be unlocked precisely when necessary.

View File

@ -17,17 +17,18 @@ AMD refers to this feature as AMD Platform Quality of Service(AMD QoS).
This feature is enabled by the CONFIG_X86_CPU_RESCTRL and the x86 /proc/cpuinfo
flag bits:
=============================================== ================================
RDT (Resource Director Technology) Allocation "rdt_a"
CAT (Cache Allocation Technology) "cat_l3", "cat_l2"
CDP (Code and Data Prioritization) "cdp_l3", "cdp_l2"
CQM (Cache QoS Monitoring) "cqm_llc", "cqm_occup_llc"
MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local"
MBA (Memory Bandwidth Allocation) "mba"
SMBA (Slow Memory Bandwidth Allocation) ""
BMEC (Bandwidth Monitoring Event Configuration) ""
ABMC (Assignable Bandwidth Monitoring Counters) ""
=============================================== ================================
=============================================================== ================================
RDT (Resource Director Technology) Allocation "rdt_a"
CAT (Cache Allocation Technology) "cat_l3", "cat_l2"
CDP (Code and Data Prioritization) "cdp_l3", "cdp_l2"
CQM (Cache QoS Monitoring) "cqm_llc", "cqm_occup_llc"
MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local"
MBA (Memory Bandwidth Allocation) "mba"
SMBA (Slow Memory Bandwidth Allocation) ""
BMEC (Bandwidth Monitoring Event Configuration) ""
ABMC (Assignable Bandwidth Monitoring Counters) ""
SDCIAE (Smart Data Cache Injection Allocation Enforcement) ""
=============================================================== ================================
Historically, new features were made visible by default in /proc/cpuinfo. This
resulted in the feature flags becoming hard to parse by humans. Adding a new
@ -72,6 +73,11 @@ The 'info' directory contains information about the enabled
resources. Each resource has its own subdirectory. The subdirectory
names reflect the resource names.
Most of the files in the resource's subdirectory are read-only, and
describe properties of the resource. Resources that support global
configuration options also include writable files that can be used
to modify those settings.
Each subdirectory contains the following files with respect to
allocation:
@ -90,12 +96,19 @@ related to allocation:
must be set when writing a mask.
"shareable_bits":
Bitmask of shareable resource with other executing
entities (e.g. I/O). User can use this when
setting up exclusive cache partitions. Note that
some platforms support devices that have their
own settings for cache use which can over-ride
these bits.
Bitmask of shareable resource with other executing entities
(e.g. I/O). Applies to all instances of this resource. User
can use this when setting up exclusive cache partitions.
Note that some platforms support devices that have their
own settings for cache use which can over-ride these bits.
When "io_alloc" is enabled, a portion of each cache instance can
be configured for shared use between hardware and software.
"bit_usage" should be used to see which portions of each cache
instance is configured for hardware use via "io_alloc" feature
because every cache instance can have its "io_alloc" bitmask
configured independently via "io_alloc_cbm".
"bit_usage":
Annotated capacity bitmasks showing how all
instances of the resource are used. The legend is:
@ -109,16 +122,16 @@ related to allocation:
"H":
Corresponding region is used by hardware only
but available for software use. If a resource
has bits set in "shareable_bits" but not all
of these bits appear in the resource groups'
schematas then the bits appearing in
"shareable_bits" but no resource group will
be marked as "H".
has bits set in "shareable_bits" or "io_alloc_cbm"
but not all of these bits appear in the resource
groups' schemata then the bits appearing in
"shareable_bits" or "io_alloc_cbm" but no
resource group will be marked as "H".
"X":
Corresponding region is available for sharing and
used by hardware and software. These are the
bits that appear in "shareable_bits" as
well as a resource group's allocation.
used by hardware and software. These are the bits
that appear in "shareable_bits" or "io_alloc_cbm"
as well as a resource group's allocation.
"S":
Corresponding region is used by software
and available for sharing.
@ -136,6 +149,77 @@ related to allocation:
"1":
Non-contiguous 1s value in CBM is supported.
"io_alloc":
"io_alloc" enables system software to configure the portion of
the cache allocated for I/O traffic. File may only exist if the
system supports this feature on some of its cache resources.
"disabled":
Resource supports "io_alloc" but the feature is disabled.
Portions of cache used for allocation of I/O traffic cannot
be configured.
"enabled":
Portions of cache used for allocation of I/O traffic
can be configured using "io_alloc_cbm".
"not supported":
Support not available for this resource.
The feature can be modified by writing to the interface, for example:
To enable::
# echo 1 > /sys/fs/resctrl/info/L3/io_alloc
To disable::
# echo 0 > /sys/fs/resctrl/info/L3/io_alloc
The underlying implementation may reduce resources available to
general (CPU) cache allocation. See architecture specific notes
below. Depending on usage requirements the feature can be enabled
or disabled.
On AMD systems, io_alloc feature is supported by the L3 Smart
Data Cache Injection Allocation Enforcement (SDCIAE). The CLOSID for
io_alloc is the highest CLOSID supported by the resource. When
io_alloc is enabled, the highest CLOSID is dedicated to io_alloc and
no longer available for general (CPU) cache allocation. When CDP is
enabled, io_alloc routes I/O traffic using the highest CLOSID allocated
for the instruction cache (CDP_CODE), making this CLOSID no longer
available for general (CPU) cache allocation for both the CDP_CODE
and CDP_DATA resources.
"io_alloc_cbm":
Capacity bitmasks that describe the portions of cache instances to
which I/O traffic from supported I/O devices is routed when "io_alloc"
is enabled.
CBMs are displayed in the following format:
<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
Example::
# cat /sys/fs/resctrl/info/L3/io_alloc_cbm
0=ffff;1=ffff
CBMs can be configured by writing to the interface.
Example::
# echo 1=ff > /sys/fs/resctrl/info/L3/io_alloc_cbm
# cat /sys/fs/resctrl/info/L3/io_alloc_cbm
0=ffff;1=00ff
# echo "0=ff;1=f" > /sys/fs/resctrl/info/L3/io_alloc_cbm
# cat /sys/fs/resctrl/info/L3/io_alloc_cbm
0=00ff;1=000f
When CDP is enabled "io_alloc_cbm" associated with the CDP_DATA and CDP_CODE
resources may reflect the same values. For example, values read from and
written to /sys/fs/resctrl/info/L3DATA/io_alloc_cbm may be reflected by
/sys/fs/resctrl/info/L3CODE/io_alloc_cbm and vice versa.
Memory bandwidth(MB) subdirectory contains the following files
with respect to allocation:

View File

@ -220,13 +220,14 @@ Read path, three categories:
according to a passed marker. This is used to avoid lockless readers
starvation (too much retry loops) in case of a sharp spike in write
activity. First, a lockless read is tried (even marker passed). If
that trial fails (odd sequence counter is returned, which is used as
the next iteration marker), the lockless read is transformed to a
full locking read and no retry loop is necessary::
that trial fails (sequence counter doesn't match), make the marker
odd for the next iteration, the lockless read is transformed to a
full locking read and no retry loop is necessary, for example::
/* marker; even initialization */
int seq = 0;
int seq = 1;
do {
seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&foo_seqlock, &seq);
/* ... [[read-side critical section]] ... */

View File

@ -14459,10 +14459,11 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching.g
F: Documentation/ABI/testing/sysfs-kernel-livepatch
F: Documentation/livepatch/
F: arch/powerpc/include/asm/livepatch.h
F: include/linux/livepatch.h
F: include/linux/livepatch*.h
F: kernel/livepatch/
F: kernel/module/livepatch.c
F: samples/livepatch/
F: scripts/livepatch/
F: tools/testing/selftests/livepatch/
LLC (802.2)
@ -14536,6 +14537,7 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
F: Documentation/locking/
F: arch/*/include/asm/spinlock*.h
F: include/linux/local_lock*.h
F: include/linux/lockdep*.h
F: include/linux/mutex*.h
F: include/linux/rwlock*.h
@ -21679,6 +21681,11 @@ S: Maintained
F: Documentation/devicetree/bindings/spi/realtek,rtl9301-snand.yaml
F: drivers/spi/spi-realtek-rtl-snand.c
REALTEK SYSTIMER DRIVER
M: Hao-Wen Ting <haowen.ting@realtek.com>
S: Maintained
F: drivers/clocksource/timer-realtek.c
REALTEK WIRELESS DRIVER (rtlwifi family)
M: Ping-Ke Shih <pkshih@realtek.com>
L: linux-wireless@vger.kernel.org
@ -27166,6 +27173,7 @@ F: arch/s390/include/uapi/asm/virtio-ccw.h
F: drivers/s390/virtio/
VIRTIO FILE SYSTEM
M: German Maglione <gmaglione@redhat.com>
M: Vivek Goyal <vgoyal@redhat.com>
M: Stefan Hajnoczi <stefanha@redhat.com>
M: Miklos Szeredi <miklos@szeredi.hu>

View File

@ -2,7 +2,7 @@
VERSION = 6
PATCHLEVEL = 18
SUBLEVEL = 0
EXTRAVERSION = -rc7
EXTRAVERSION =
NAME = Baby Opossum Posse
# *DOCUMENTATION*
@ -1061,6 +1061,9 @@ NOSTDINC_FLAGS += -nostdinc
# perform bounds checking.
KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3)
# Allow including a tagged struct or union anonymously in another struct/union.
KBUILD_CFLAGS += -fms-extensions
# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += -fno-strict-overflow

View File

@ -509,3 +509,4 @@
577 common open_tree_attr sys_open_tree_attr
578 common file_getattr sys_file_getattr
579 common file_setattr sys_file_setattr
580 common listns sys_listns

View File

@ -283,10 +283,17 @@ extern int __put_user_8(void *, unsigned long long);
__gu_err; \
})
/*
* This is a type: either unsigned long, if the argument fits into
* that type, or otherwise unsigned long long.
*/
#define __long_type(x) \
__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
#define __get_user_err(x, ptr, err, __t) \
do { \
unsigned long __gu_addr = (unsigned long)(ptr); \
unsigned long __gu_val; \
__long_type(x) __gu_val; \
unsigned int __ua_flags; \
__chk_user_ptr(ptr); \
might_fault(); \
@ -295,6 +302,7 @@ do { \
case 1: __get_user_asm_byte(__gu_val, __gu_addr, err, __t); break; \
case 2: __get_user_asm_half(__gu_val, __gu_addr, err, __t); break; \
case 4: __get_user_asm_word(__gu_val, __gu_addr, err, __t); break; \
case 8: __get_user_asm_dword(__gu_val, __gu_addr, err, __t); break; \
default: (__gu_val) = __get_user_bad(); \
} \
uaccess_restore(__ua_flags); \
@ -353,6 +361,22 @@ do { \
#define __get_user_asm_word(x, addr, err, __t) \
__get_user_asm(x, addr, err, "ldr" __t)
#ifdef __ARMEB__
#define __WORD0_OFFS 4
#define __WORD1_OFFS 0
#else
#define __WORD0_OFFS 0
#define __WORD1_OFFS 4
#endif
#define __get_user_asm_dword(x, addr, err, __t) \
({ \
unsigned long __w0, __w1; \
__get_user_asm(__w0, addr + __WORD0_OFFS, err, "ldr" __t); \
__get_user_asm(__w1, addr + __WORD1_OFFS, err, "ldr" __t); \
(x) = ((u64)__w1 << 32) | (u64) __w0; \
})
#define __put_user_switch(x, ptr, __err, __fn) \
do { \
const __typeof__(*(ptr)) __user *__pu_ptr = (ptr); \

View File

@ -484,3 +484,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns

View File

@ -19,7 +19,7 @@
unreachable(); \
} while (0)
#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags))
#define __WARN_FLAGS(cond_str, flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags))
#define HAVE_ARCH_BUG

View File

@ -422,9 +422,9 @@ static __must_check __always_inline bool user_access_begin(const void __user *pt
}
#define user_access_begin(a,b) user_access_begin(a,b)
#define user_access_end() uaccess_ttbr0_disable()
#define unsafe_put_user(x, ptr, label) \
#define arch_unsafe_put_user(x, ptr, label) \
__raw_put_mem("sttr", x, uaccess_mask_ptr(ptr), label, U)
#define unsafe_get_user(x, ptr, label) \
#define arch_unsafe_get_user(x, ptr, label) \
__raw_get_mem("ldtr", x, uaccess_mask_ptr(ptr), label, U)
/*

View File

@ -100,7 +100,7 @@ static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs)
static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs)
{
local_irq_disable();
exit_to_user_mode_prepare(regs);
exit_to_user_mode_prepare_legacy(regs);
local_daif_mask();
mte_check_tfsr_exit();
exit_to_user_mode();

View File

@ -1094,7 +1094,7 @@ static void ipi_setup_sgi(int ipi)
irq = ipi_irq_base + ipi;
if (ipi_should_be_nmi(ipi)) {
err = request_percpu_nmi(irq, ipi_handler, "IPI", &irq_stat);
err = request_percpu_nmi(irq, ipi_handler, "IPI", NULL, &irq_stat);
WARN(err, "Could not request IRQ %d as NMI, err=%d\n", irq, err);
} else {
err = request_percpu_irq(irq, ipi_handler, "IPI", &irq_stat);

View File

@ -63,7 +63,7 @@ VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
$(filter -Werror,$(KBUILD_CPPFLAGS)) \
-Werror-implicit-function-declaration \
-Wno-format-security \
-std=gnu11
-std=gnu11 -fms-extensions
VDSO_CFLAGS += -O2
# Some useful compiler-dependent flags from top-level Makefile
VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign)
@ -71,6 +71,7 @@ VDSO_CFLAGS += -fno-strict-overflow
VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes)
VDSO_CFLAGS += -Werror=date-time
VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types)
VDSO_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag)
# Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is
# unreliable.

View File

@ -481,3 +481,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns

View File

@ -917,7 +917,6 @@ CONFIG_MMC=y
CONFIG_MMC_LOONGSON2=m
CONFIG_INFINIBAND=m
CONFIG_EDAC=y
# CONFIG_EDAC_LEGACY_SYSFS is not set
CONFIG_EDAC_LOONGSON=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_DRV_EFI=y

View File

@ -11,7 +11,7 @@
#else
#define __BUGVERBOSE_LOCATION(file, line) \
.pushsection .rodata.str, "aMS", @progbits, 1; \
10002: .string file; \
10002: .ascii file "\0"; \
.popsection; \
\
.long 10002b - .; \
@ -20,39 +20,38 @@
#endif
#ifndef CONFIG_GENERIC_BUG
#define __BUG_ENTRY(flags)
#define __BUG_ENTRY(cond_str, flags)
#else
#define __BUG_ENTRY(flags) \
#define __BUG_ENTRY(cond_str, flags) \
.pushsection __bug_table, "aw"; \
.align 2; \
10000: .long 10001f - .; \
_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \
.short flags; \
_BUGVERBOSE_LOCATION(WARN_CONDITION_STR(cond_str) __FILE__, __LINE__) \
.short flags; \
.popsection; \
10001:
#endif
#define ASM_BUG_FLAGS(flags) \
__BUG_ENTRY(flags) \
#define ASM_BUG_FLAGS(cond_str, flags) \
__BUG_ENTRY(cond_str, flags) \
break BRK_BUG;
#define ASM_BUG() ASM_BUG_FLAGS(0)
#define ASM_BUG() ASM_BUG_FLAGS("", 0)
#define __BUG_FLAGS(flags, extra) \
asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)) \
extra);
#define __BUG_FLAGS(cond_str, flags, extra) \
asm_inline volatile (__stringify(ASM_BUG_FLAGS(cond_str, flags)) extra);
#define __WARN_FLAGS(flags) \
#define __WARN_FLAGS(cond_str, flags) \
do { \
instrumentation_begin(); \
__BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\
__BUG_FLAGS(cond_str, BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\
instrumentation_end(); \
} while (0)
#define BUG() \
do { \
instrumentation_begin(); \
__BUG_FLAGS(0, ""); \
__BUG_FLAGS("", 0, ""); \
unreachable(); \
} while (0)

View File

@ -19,7 +19,7 @@ ccflags-vdso := \
cflags-vdso := $(ccflags-vdso) \
-isystem $(shell $(CC) -print-file-name=include) \
$(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \
-std=gnu11 -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \
-std=gnu11 -fms-extensions -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \
-fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \
$(call cc-option, -fno-asynchronous-unwind-tables) \
$(call cc-option, -fno-stack-protector)

View File

@ -469,3 +469,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns

View File

@ -475,3 +475,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns

View File

@ -408,3 +408,4 @@
467 n32 open_tree_attr sys_open_tree_attr
468 n32 file_getattr sys_file_getattr
469 n32 file_setattr sys_file_setattr
470 n32 listns sys_listns

View File

@ -384,3 +384,4 @@
467 n64 open_tree_attr sys_open_tree_attr
468 n64 file_getattr sys_file_getattr
469 n64 file_setattr sys_file_setattr
470 n64 listns sys_listns

View File

@ -457,3 +457,4 @@
467 o32 open_tree_attr sys_open_tree_attr
468 o32 file_getattr sys_file_getattr
469 o32 file_setattr sys_file_setattr
470 o32 listns sys_listns

View File

@ -18,7 +18,7 @@ KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs -Os
ifndef CONFIG_64BIT
KBUILD_CFLAGS += -mfast-indirect-calls
endif
KBUILD_CFLAGS += -std=gnu11
KBUILD_CFLAGS += -std=gnu11 -fms-extensions
LDFLAGS_vmlinux := -X -e startup --as-needed -T
$(obj)/vmlinux: $(obj)/vmlinux.lds $(addprefix $(obj)/, $(OBJECTS)) $(LIBGCC) FORCE

View File

@ -50,7 +50,7 @@
#endif
#ifdef CONFIG_DEBUG_BUGVERBOSE
#define __WARN_FLAGS(flags) \
#define __WARN_FLAGS(cond_str, flags) \
do { \
asm volatile("\n" \
"1:\t" PARISC_BUG_BREAK_ASM "\n" \
@ -61,12 +61,12 @@
"\t.short %1, %2\n" \
"\t.blockz %3-2*4-2*2\n" \
"\t.popsection" \
: : "i" (__FILE__), "i" (__LINE__), \
: : "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \
"i" (BUGFLAG_WARNING|(flags)), \
"i" (sizeof(struct bug_entry)) ); \
} while(0)
#else
#define __WARN_FLAGS(flags) \
#define __WARN_FLAGS(cond_str, flags) \
do { \
asm volatile("\n" \
"1:\t" PARISC_BUG_BREAK_ASM "\n" \

View File

@ -468,3 +468,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns

View File

@ -70,7 +70,7 @@ BOOTCPPFLAGS := -nostdinc $(LINUXINCLUDE)
BOOTCPPFLAGS += -isystem $(shell $(BOOTCC) -print-file-name=include)
BOOTCFLAGS := $(BOOTTARGETFLAGS) \
-std=gnu11 \
-std=gnu11 -fms-extensions \
-Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-fno-strict-aliasing -O2 \
-msoft-float -mno-altivec -mno-vsx \
@ -86,6 +86,7 @@ BOOTARFLAGS := -crD
ifdef CONFIG_CC_IS_CLANG
BOOTCFLAGS += $(CLANG_FLAGS)
BOOTCFLAGS += -Wno-microsoft-anon-tag
BOOTAFLAGS += $(CLANG_FLAGS)
endif

View File

@ -51,11 +51,11 @@
".previous\n"
#endif
#define BUG_ENTRY(insn, flags, ...) \
#define BUG_ENTRY(cond_str, insn, flags, ...) \
__asm__ __volatile__( \
"1: " insn "\n" \
_EMIT_BUG_ENTRY \
: : "i" (__FILE__), "i" (__LINE__), \
: : "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \
"i" (flags), \
"i" (sizeof(struct bug_entry)), \
##__VA_ARGS__)
@ -67,12 +67,12 @@
*/
#define BUG() do { \
BUG_ENTRY("twi 31, 0, 0", 0); \
BUG_ENTRY("", "twi 31, 0, 0", 0); \
unreachable(); \
} while (0)
#define HAVE_ARCH_BUG
#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags))
#define __WARN_FLAGS(cond_str, flags) BUG_ENTRY(cond_str, "twi 31, 0, 0", BUGFLAG_WARNING | (flags))
#ifdef CONFIG_PPC64
#define BUG_ON(x) do { \
@ -80,7 +80,7 @@
if (x) \
BUG(); \
} else { \
BUG_ENTRY(PPC_TLNEI " %4, 0", 0, "r" ((__force long)(x))); \
BUG_ENTRY(#x, PPC_TLNEI " %4, 0", 0, "r" ((__force long)(x))); \
} \
} while (0)
@ -90,7 +90,7 @@
if (__ret_warn_on) \
__WARN(); \
} else { \
BUG_ENTRY(PPC_TLNEI " %4, 0", \
BUG_ENTRY(#x, PPC_TLNEI " %4, 0", \
BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \
"r" (__ret_warn_on)); \
} \

View File

@ -451,7 +451,7 @@ user_write_access_begin(const void __user *ptr, size_t len)
#define user_write_access_begin user_write_access_begin
#define user_write_access_end prevent_current_write_to_user
#define unsafe_get_user(x, p, e) do { \
#define arch_unsafe_get_user(x, p, e) do { \
__long_type(*(p)) __gu_val; \
__typeof__(*(p)) __user *__gu_addr = (p); \
\
@ -459,7 +459,7 @@ user_write_access_begin(const void __user *ptr, size_t len)
(x) = (__typeof__(*(p)))__gu_val; \
} while (0)
#define unsafe_put_user(x, p, e) \
#define arch_unsafe_put_user(x, p, e) \
__put_user_size_goto((__typeof__(*(p)))(x), (p), sizeof(*(p)), e)
#define unsafe_copy_from_user(d, s, l, e) \
@ -504,11 +504,11 @@ do { \
unsafe_put_user(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), e); \
} while (0)
#define __get_kernel_nofault(dst, src, type, err_label) \
#define arch_get_kernel_nofault(dst, src, type, err_label) \
__get_user_size_goto(*((type *)(dst)), \
(__force type __user *)(src), sizeof(type), err_label)
#define __put_kernel_nofault(dst, src, type, err_label) \
#define arch_put_kernel_nofault(dst, src, type, err_label) \
__put_user_size_goto(*((type *)(src)), \
(__force type __user *)(dst), sizeof(type), err_label)

View File

@ -560,3 +560,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns

View File

@ -267,22 +267,11 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
static int spufs_context_open(const struct path *path)
{
int ret;
struct file *filp;
ret = get_unused_fd_flags(0);
if (ret < 0)
return ret;
filp = dentry_open(path, O_RDONLY, current_cred());
if (IS_ERR(filp)) {
put_unused_fd(ret);
return PTR_ERR(filp);
}
filp->f_op = &spufs_context_fops;
fd_install(ret, filp);
return ret;
FD_PREPARE(fdf, 0, dentry_open(path, O_RDONLY, current_cred()));
if (fdf.err)
return fdf.err;
fd_prepare_file(fdf)->f_op = &spufs_context_fops;
return fd_publish(fdf);
}
static struct spu_context *
@ -508,26 +497,15 @@ static const struct file_operations spufs_gang_fops = {
static int spufs_gang_open(const struct path *path)
{
int ret;
struct file *filp;
ret = get_unused_fd_flags(0);
if (ret < 0)
return ret;
/*
* get references for dget and mntget, will be released
* in error path of *_open().
*/
filp = dentry_open(path, O_RDONLY, current_cred());
if (IS_ERR(filp)) {
put_unused_fd(ret);
return PTR_ERR(filp);
}
filp->f_op = &spufs_gang_fops;
fd_install(ret, filp);
return ret;
FD_PREPARE(fdf, 0, dentry_open(path, O_RDONLY, current_cred()));
if (fdf.err)
return fdf.err;
fd_prepare_file(fdf)->f_op = &spufs_gang_fops;
return fd_publish(fdf);
}
static int spufs_create_gang(struct inode *inode,

View File

@ -479,10 +479,7 @@ static const struct file_operations papr_hvpipe_handle_ops = {
static int papr_hvpipe_dev_create_handle(u32 srcID)
{
struct hvpipe_source_info *src_info;
struct file *file;
long err;
int fd;
struct hvpipe_source_info *src_info __free(kfree) = NULL;
spin_lock(&hvpipe_src_list_lock);
/*
@ -506,20 +503,13 @@ static int papr_hvpipe_dev_create_handle(u32 srcID)
src_info->tsk = current;
init_waitqueue_head(&src_info->recv_wqh);
fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
if (fd < 0) {
err = fd;
goto free_buf;
}
file = anon_inode_getfile("[papr-hvpipe]",
&papr_hvpipe_handle_ops, (void *)src_info,
O_RDWR);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto free_fd;
}
FD_PREPARE(fdf, O_RDONLY | O_CLOEXEC,
anon_inode_getfile("[papr-hvpipe]", &papr_hvpipe_handle_ops,
(void *)src_info, O_RDWR));
if (fdf.err)
return fdf.err;
retain_and_null_ptr(src_info);
spin_lock(&hvpipe_src_list_lock);
/*
* If two processes are executing ioctl() for the same
@ -528,22 +518,11 @@ static int papr_hvpipe_dev_create_handle(u32 srcID)
*/
if (hvpipe_find_source(srcID)) {
spin_unlock(&hvpipe_src_list_lock);
err = -EALREADY;
goto free_file;
return -EALREADY;
}
list_add(&src_info->list, &hvpipe_src_list);
spin_unlock(&hvpipe_src_list_lock);
fd_install(fd, file);
return fd;
free_file:
fput(file);
free_fd:
put_unused_fd(fd);
free_buf:
kfree(src_info);
return err;
return fd_publish(fdf);
}
/*

View File

@ -303,8 +303,6 @@ static long papr_platform_dump_create_handle(u64 dump_tag)
{
struct ibm_platform_dump_params *params;
u64 param_dump_tag;
struct file *file;
long err;
int fd;
/*
@ -334,34 +332,22 @@ static long papr_platform_dump_create_handle(u64 dump_tag)
params->dump_tag_lo = (u32)(dump_tag & 0x00000000ffffffffULL);
params->status = RTAS_IBM_PLATFORM_DUMP_START;
fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
fd = FD_ADD(O_RDONLY | O_CLOEXEC,
anon_inode_getfile_fmode("[papr-platform-dump]",
&papr_platform_dump_handle_ops,
(void *)params, O_RDONLY,
FMODE_LSEEK | FMODE_PREAD));
if (fd < 0) {
err = fd;
goto free_area;
rtas_work_area_free(params->work_area);
kfree(params);
return fd;
}
file = anon_inode_getfile_fmode("[papr-platform-dump]",
&papr_platform_dump_handle_ops,
(void *)params, O_RDONLY,
FMODE_LSEEK | FMODE_PREAD);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto put_fd;
}
fd_install(fd, file);
list_add(&params->list, &platform_dump_list);
pr_info("%s (%d) initiated platform dump for dump tag %llu\n",
current->comm, current->pid, dump_tag);
return fd;
put_fd:
put_unused_fd(fd);
free_area:
rtas_work_area_free(params->work_area);
kfree(params);
return err;
}
/*

View File

@ -205,35 +205,18 @@ long papr_rtas_setup_file_interface(struct papr_rtas_sequence *seq,
char *name)
{
const struct papr_rtas_blob *blob;
struct file *file;
long ret;
int fd;
blob = papr_rtas_retrieve(seq);
if (IS_ERR(blob))
return PTR_ERR(blob);
fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
if (fd < 0) {
ret = fd;
goto free_blob;
}
file = anon_inode_getfile_fmode(name, fops, (void *)blob,
O_RDONLY, FMODE_LSEEK | FMODE_PREAD);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
goto put_fd;
}
fd_install(fd, file);
fd = FD_ADD(O_RDONLY | O_CLOEXEC,
anon_inode_getfile_fmode(name, fops, (void *)blob, O_RDONLY,
FMODE_LSEEK | FMODE_PREAD));
if (fd < 0)
papr_rtas_blob_free(blob);
return fd;
put_fd:
put_unused_fd(fd);
free_blob:
papr_rtas_blob_free(blob);
return ret;
}
/*

View File

@ -60,28 +60,28 @@ typedef u32 bug_insn_t;
".org 2b + " size "\n\t" \
".popsection" \
#define __BUG_FLAGS(flags) \
#define __BUG_FLAGS(cond_str, flags) \
do { \
__asm__ __volatile__ ( \
ARCH_WARN_ASM("%0", "%1", "%2", "%3") \
: \
: "i" (__FILE__), "i" (__LINE__), \
: "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \
"i" (flags), \
"i" (sizeof(struct bug_entry))); \
} while (0)
#else /* CONFIG_GENERIC_BUG */
#define __BUG_FLAGS(flags) do { \
#define __BUG_FLAGS(cond_str, flags) do { \
__asm__ __volatile__ ("ebreak\n"); \
} while (0)
#endif /* CONFIG_GENERIC_BUG */
#define BUG() do { \
__BUG_FLAGS(0); \
__BUG_FLAGS("", 0); \
unreachable(); \
} while (0)
#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags))
#define __WARN_FLAGS(cond_str, flags) __BUG_FLAGS(cond_str, BUGFLAG_WARNING|(flags))
#define ARCH_WARN_REACHABLE

View File

@ -437,10 +437,10 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n)
__clear_user(untagged_addr(to), n) : n;
}
#define __get_kernel_nofault(dst, src, type, err_label) \
#define arch_get_kernel_nofault(dst, src, type, err_label) \
__get_user_nocheck(*((type *)(dst)), (__force __user type *)(src), err_label)
#define __put_kernel_nofault(dst, src, type, err_label) \
#define arch_put_kernel_nofault(dst, src, type, err_label) \
__put_user_nocheck(*((type *)(src)), (__force __user type *)(dst), err_label)
static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
@ -460,10 +460,10 @@ static inline void user_access_restore(unsigned long enabled) { }
* We want the unsafe accessors to always be inlined and use
* the error labels - thus the macro games.
*/
#define unsafe_put_user(x, ptr, label) \
#define arch_unsafe_put_user(x, ptr, label) \
__put_user_nocheck(x, (ptr), label)
#define unsafe_get_user(x, ptr, label) do { \
#define arch_unsafe_get_user(x, ptr, label) do { \
__inttype(*(ptr)) __gu_val; \
__get_user_nocheck(__gu_val, (ptr), label); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \

View File

@ -22,7 +22,7 @@ KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
ifndef CONFIG_AS_IS_LLVM
KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf))
endif
KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11
KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11 -fms-extensions
KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY
KBUILD_CFLAGS_DECOMPRESSOR += -D__DECOMPRESSOR
KBUILD_CFLAGS_DECOMPRESSOR += -Wno-pointer-sign
@ -35,6 +35,7 @@ KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-membe
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g)
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,))
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds)
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag)
UTS_MACHINE := s390x
STACK_SIZE := $(if $(CONFIG_KASAN),65536,$(if $(CONFIG_KMSAN),65536,16384))

View File

@ -2,69 +2,55 @@
#ifndef _ASM_S390_BUG_H
#define _ASM_S390_BUG_H
#include <linux/compiler.h>
#include <linux/stringify.h>
#ifdef CONFIG_BUG
#ifndef CONFIG_DEBUG_BUGVERBOSE
#define _BUGVERBOSE_LOCATION(file, line)
#else
#define __BUGVERBOSE_LOCATION(file, line) \
.pushsection .rodata.str, "aMS", @progbits, 1; \
10002: .ascii file "\0"; \
.popsection; \
\
.long 10002b - .; \
.short line;
#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line)
#endif
#ifdef CONFIG_DEBUG_BUGVERBOSE
#ifndef CONFIG_GENERIC_BUG
#define __BUG_ENTRY(cond_str, flags)
#else
#define __BUG_ENTRY(cond_str, flags) \
.pushsection __bug_table, "aw"; \
.align 4; \
10000: .long 10001f - .; \
_BUGVERBOSE_LOCATION(WARN_CONDITION_STR(cond_str) __FILE__, __LINE__) \
.short flags; \
.popsection; \
10001:
#endif
#define __EMIT_BUG(x) do { \
asm_inline volatile( \
"0: mc 0,0\n" \
".section .rodata.str,\"aMS\",@progbits,1\n" \
"1: .asciz \""__FILE__"\"\n" \
".previous\n" \
".section __bug_table,\"aw\"\n" \
"2: .long 0b-.\n" \
" .long 1b-.\n" \
" .short %0,%1\n" \
" .org 2b+%2\n" \
".previous\n" \
: : "i" (__LINE__), \
"i" (x), \
"i" (sizeof(struct bug_entry))); \
#define ASM_BUG_FLAGS(cond_str, flags) \
__BUG_ENTRY(cond_str, flags) \
mc 0,0
#define ASM_BUG() ASM_BUG_FLAGS("", 0)
#define __BUG_FLAGS(cond_str, flags) \
asm_inline volatile(__stringify(ASM_BUG_FLAGS(cond_str, flags)));
#define __WARN_FLAGS(cond_str, flags) \
do { \
__BUG_FLAGS(cond_str, BUGFLAG_WARNING|(flags)); \
} while (0)
#else /* CONFIG_DEBUG_BUGVERBOSE */
#define __EMIT_BUG(x) do { \
asm_inline volatile( \
"0: mc 0,0\n" \
".section __bug_table,\"aw\"\n" \
"1: .long 0b-.\n" \
" .short %0\n" \
" .org 1b+%1\n" \
".previous\n" \
: : "i" (x), \
"i" (sizeof(struct bug_entry))); \
#define BUG() \
do { \
__BUG_FLAGS("", 0); \
unreachable(); \
} while (0)
#endif /* CONFIG_DEBUG_BUGVERBOSE */
#define BUG() do { \
__EMIT_BUG(0); \
unreachable(); \
} while (0)
#define __WARN_FLAGS(flags) do { \
__EMIT_BUG(BUGFLAG_WARNING|(flags)); \
} while (0)
#define WARN_ON(x) ({ \
int __ret_warn_on = !!(x); \
if (__builtin_constant_p(__ret_warn_on)) { \
if (__ret_warn_on) \
__WARN(); \
} else { \
if (unlikely(__ret_warn_on)) \
__WARN(); \
} \
unlikely(__ret_warn_on); \
})
#define HAVE_ARCH_BUG
#define HAVE_ARCH_WARN_ON
#endif /* CONFIG_BUG */
#include <asm-generic/bug.h>

View File

@ -19,7 +19,7 @@
#ifdef CONFIG_EXPOLINE_EXTERN
SYM_CODE_START(\name)
#else
.pushsection .text.\name,"axG",@progbits,\name,comdat
.pushsection .text..\name,"axG",@progbits,\name,comdat
.globl \name
.hidden \name
.type \name,@function

View File

@ -468,8 +468,8 @@ do { \
#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT && CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */
#define __get_kernel_nofault __mvc_kernel_nofault
#define __put_kernel_nofault __mvc_kernel_nofault
#define arch_get_kernel_nofault __mvc_kernel_nofault
#define arch_put_kernel_nofault __mvc_kernel_nofault
void __cmpxchg_user_key_called_with_bad_pointer(void);

View File

@ -472,3 +472,4 @@
467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr sys_file_setattr
470 common listns sys_listns sys_listns

View File

@ -51,7 +51,7 @@ SECTIONS
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
FTRACE_HOTPATCH_TRAMPOLINES_TEXT
*(.text.*_indirect_*)
*(.text..*_indirect_*)
*(.gnu.warning)
. = ALIGN(PAGE_SIZE);
_etext = .; /* End of text section */

View File

@ -199,8 +199,7 @@ static void pfault_interrupt(struct ext_code ext_code,
* return to userspace schedule() to block.
*/
__set_current_state(TASK_UNINTERRUPTIBLE);
set_tsk_need_resched(tsk);
set_preempt_need_resched();
set_need_resched_current();
}
}
out:

View File

@ -13,7 +13,7 @@ CFLAGS_sha256.o := -D__NO_FORTIFY
$(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE
$(call if_changed_rule,as_o_S)
KBUILD_CFLAGS := -std=gnu11 -fno-strict-aliasing -Wall -Wstrict-prototypes
KBUILD_CFLAGS := -std=gnu11 -fms-extensions -fno-strict-aliasing -Wall -Wstrict-prototypes
KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare
KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding
KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common
@ -21,6 +21,7 @@ KBUILD_CFLAGS += -fno-stack-protector
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(CLANG_FLAGS)
KBUILD_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag)
KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS))
KBUILD_AFLAGS += -D__DISABLE_EXPORTS

View File

@ -52,14 +52,14 @@ do { \
unreachable(); \
} while (0)
#define __WARN_FLAGS(flags) \
#define __WARN_FLAGS(cond_str, flags) \
do { \
__asm__ __volatile__ ( \
"1:\t.short %O0\n" \
_EMIT_BUG_ENTRY \
: \
: "n" (TRAPA_BUG_OPCODE), \
"i" (__FILE__), \
"i" (WARN_CONDITION_STR(cond_str) __FILE__), \
"i" (__LINE__), \
"i" (BUGFLAG_WARNING|(flags)), \
"i" (sizeof(struct bug_entry))); \

View File

@ -473,3 +473,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns

View File

@ -515,3 +515,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns

View File

@ -261,6 +261,7 @@ config X86
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_KRETPROBES
select HAVE_RETHOOK
select HAVE_KLP_BUILD if X86_64
select HAVE_LIVEPATCH if X86_64
select HAVE_MIXED_BREAKPOINTS_REGS
select HAVE_MOD_ARCH_SPECIFIC
@ -297,6 +298,7 @@ config X86
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_UNWIND_USER_FP if X86_64
select HAVE_USER_RETURN_NOTIFIER
select HAVE_GENERIC_VDSO
select VDSO_GETRANDOM if X86_64
@ -379,7 +381,7 @@ config GENERIC_CSUM
config GENERIC_BUG
def_bool y
depends on BUG
select GENERIC_BUG_RELATIVE_POINTERS if X86_64
select GENERIC_BUG_RELATIVE_POINTERS
config GENERIC_BUG_RELATIVE_POINTERS
bool

View File

@ -48,7 +48,8 @@ endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
REALMODE_CFLAGS := -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
REALMODE_CFLAGS := -std=gnu11 -fms-extensions -m16 -g -Os \
-DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
@ -60,6 +61,7 @@ REALMODE_CFLAGS += $(cc_stack_align4)
REALMODE_CFLAGS += $(CLANG_FLAGS)
ifdef CONFIG_CC_IS_CLANG
REALMODE_CFLAGS += -Wno-gnu
REALMODE_CFLAGS += -Wno-microsoft-anon-tag
endif
export REALMODE_CFLAGS

View File

@ -135,29 +135,29 @@ int enable_a20(void)
(legacy free, etc.) */
if (a20_test_short())
return 0;
/* Next, try the BIOS (INT 0x15, AX=0x2401) */
enable_a20_bios();
if (a20_test_short())
return 0;
/* Try enabling A20 through the keyboard controller */
kbc_err = empty_8042();
if (a20_test_short())
return 0; /* BIOS worked, but with delayed reaction */
if (!kbc_err) {
enable_a20_kbc();
if (a20_test_long())
return 0;
}
/* Finally, try enabling the "fast A20 gate" */
enable_a20_fast();
if (a20_test_long())
return 0;
}
return -1;
}

View File

@ -193,8 +193,6 @@ static inline bool heap_free(size_t n)
void copy_to_fs(addr_t dst, void *src, size_t len);
void *copy_from_fs(void *dst, addr_t src, size_t len);
void copy_to_gs(addr_t dst, void *src, size_t len);
void *copy_from_gs(void *dst, addr_t src, size_t len);
/* a20.c */
int enable_a20(void);

View File

@ -25,7 +25,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
# avoid errors with '-march=i386', and future flags may depend on the target to
# be valid.
KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS)
KBUILD_CFLAGS += -std=gnu11
KBUILD_CFLAGS += -std=gnu11 -fms-extensions
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
KBUILD_CFLAGS += -Wundef
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
@ -36,7 +36,10 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse
KBUILD_CFLAGS += -ffreestanding -fshort-wchar
KBUILD_CFLAGS += -fno-stack-protector
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
ifdef CONFIG_CC_IS_CLANG
KBUILD_CFLAGS += -Wno-gnu
KBUILD_CFLAGS += -Wno-microsoft-anon-tag
endif
KBUILD_CFLAGS += -Wno-pointer-sign
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_CFLAGS += -D__DISABLE_EXPORTS

View File

@ -29,11 +29,10 @@
bool insn_has_rep_prefix(struct insn *insn)
{
insn_byte_t p;
int i;
insn_get_prefixes(insn);
for_each_insn_prefix(insn, i, p) {
for_each_insn_prefix(insn, p) {
if (p == 0xf2 || p == 0xf3)
return true;
}

View File

@ -36,7 +36,7 @@ $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y
# relocations, even if other objtool actions are being deferred.
#
$(pi-objs): objtool-enabled = 1
$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs
$(pi-objs): objtool-args = $(if $(delay-objtool),--dry-run,$(objtool-args-y)) --noabs
#
# Confine the startup code by prefixing all symbols with __pi_ (for position

View File

@ -32,6 +32,14 @@ SYM_FUNC_END(write_ibpb)
/* For KVM */
EXPORT_SYMBOL_GPL(write_ibpb);
SYM_FUNC_START(__WARN_trap)
ANNOTATE_NOENDBR
ANNOTATE_REACHABLE
ud1 (%edx), %_ASM_ARG1
RET
SYM_FUNC_END(__WARN_trap)
EXPORT_SYMBOL(__WARN_trap)
.popsection
/*

View File

@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
* fetch EBP before invoking any of the syscall entry work
* functions.
*/
syscall_enter_from_user_mode_prepare(regs);
enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
/* Fetch EBP from where the vDSO stashed it. */
if (IS_ENABLED(CONFIG_X86_64)) {
/*

View File

@ -475,3 +475,4 @@
467 i386 open_tree_attr sys_open_tree_attr
468 i386 file_getattr sys_file_getattr
469 i386 file_setattr sys_file_setattr
470 i386 listns sys_listns

View File

@ -394,6 +394,7 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
470 common listns sys_listns
#
# Due to a historical design error, certain syscalls are numbered differently

View File

@ -763,7 +763,12 @@ static void amd_pmu_enable_all(int added)
if (!test_bit(idx, cpuc->active_mask))
continue;
amd_pmu_enable_event(cpuc->events[idx]);
/*
* FIXME: cpuc->events[idx] can become NULL in a subtle race
* condition with NMI->throttle->x86_pmu_stop().
*/
if (cpuc->events[idx])
amd_pmu_enable_event(cpuc->events[idx]);
}
}

View File

@ -554,14 +554,22 @@ static inline int precise_br_compat(struct perf_event *event)
return m == b;
}
int x86_pmu_max_precise(void)
int x86_pmu_max_precise(struct pmu *pmu)
{
int precise = 0;
/* Support for constant skid */
if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
precise++;
/* arch PEBS */
if (x86_pmu.arch_pebs) {
precise = 2;
if (hybrid(pmu, arch_pebs_cap).pdists)
precise++;
return precise;
}
/* legacy PEBS - support for constant skid */
precise++;
/* Support for IP fixup */
if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
@ -569,13 +577,14 @@ int x86_pmu_max_precise(void)
if (x86_pmu.pebs_prec_dist)
precise++;
}
return precise;
}
int x86_pmu_hw_config(struct perf_event *event)
{
if (event->attr.precise_ip) {
int precise = x86_pmu_max_precise();
int precise = x86_pmu_max_precise(event->pmu);
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
@ -1344,6 +1353,7 @@ static void x86_pmu_enable(struct pmu *pmu)
hwc->state |= PERF_HES_ARCH;
x86_pmu_stop(event, PERF_EF_UPDATE);
cpuc->events[hwc->idx] = NULL;
}
/*
@ -1365,6 +1375,7 @@ static void x86_pmu_enable(struct pmu *pmu)
* if cpuc->enabled = 0, then no wrmsr as
* per x86_pmu_enable_event()
*/
cpuc->events[hwc->idx] = event;
x86_pmu_start(event, PERF_EF_RELOAD);
}
cpuc->n_added = 0;
@ -1531,7 +1542,6 @@ static void x86_pmu_start(struct perf_event *event, int flags)
event->hw.state = 0;
cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
static_call(x86_pmu_enable)(event);
perf_event_update_userpage(event);
@ -1610,7 +1620,6 @@ void x86_pmu_stop(struct perf_event *event, int flags)
if (test_bit(hwc->idx, cpuc->active_mask)) {
static_call(x86_pmu_disable)(event);
__clear_bit(hwc->idx, cpuc->active_mask);
cpuc->events[hwc->idx] = NULL;
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
@ -1648,6 +1657,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
* Not a TXN, therefore cleanup properly.
*/
x86_pmu_stop(event, PERF_EF_UPDATE);
cpuc->events[event->hw.idx] = NULL;
for (i = 0; i < cpuc->n_events; i++) {
if (event == cpuc->event_list[i])
@ -2629,7 +2639,9 @@ static ssize_t max_precise_show(struct device *cdev,
struct device_attribute *attr,
char *buf)
{
return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
struct pmu *pmu = dev_get_drvdata(cdev);
return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu));
}
static DEVICE_ATTR_RO(max_precise);
@ -2845,46 +2857,6 @@ static unsigned long get_segment_base(unsigned int segment)
return get_desc_base(desc);
}
#ifdef CONFIG_UPROBES
/*
* Heuristic-based check if uprobe is installed at the function entry.
*
* Under assumption of user code being compiled with frame pointers,
* `push %rbp/%ebp` is a good indicator that we indeed are.
*
* Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
* If we get this wrong, captured stack trace might have one extra bogus
* entry, but the rest of stack trace will still be meaningful.
*/
static bool is_uprobe_at_func_entry(struct pt_regs *regs)
{
struct arch_uprobe *auprobe;
if (!current->utask)
return false;
auprobe = current->utask->auprobe;
if (!auprobe)
return false;
/* push %rbp/%ebp */
if (auprobe->insn[0] == 0x55)
return true;
/* endbr64 (64-bit only) */
if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
return true;
return false;
}
#else
static bool is_uprobe_at_func_entry(struct pt_regs *regs)
{
return false;
}
#endif /* CONFIG_UPROBES */
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>

View File

@ -2563,6 +2563,44 @@ static void intel_pmu_disable_fixed(struct perf_event *event)
cpuc->fixed_ctrl_val &= ~mask;
}
static inline void __intel_pmu_update_event_ext(int idx, u64 ext)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
u32 msr;
if (idx < INTEL_PMC_IDX_FIXED) {
msr = MSR_IA32_PMC_V6_GP0_CFG_C +
x86_pmu.addr_offset(idx, false);
} else {
msr = MSR_IA32_PMC_V6_FX0_CFG_C +
x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false);
}
cpuc->cfg_c_val[idx] = ext;
wrmsrq(msr, ext);
}
static void intel_pmu_disable_event_ext(struct perf_event *event)
{
/*
* Only clear the CFG_C MSR for PEBS counter group events;
* this avoids the HW counter's value being added into
* other PEBS records incorrectly after PEBS counter
* group events are disabled.
*
* For other events, it's unnecessary to clear CFG_C MSRs
* since CFG_C doesn't take effect if counter is in
* disabled state. That helps to reduce the WRMSR overhead
* in context switches.
*/
if (!is_pebs_counter_event_group(event))
return;
__intel_pmu_update_event_ext(event->hw.idx, 0);
}
DEFINE_STATIC_CALL_NULL(intel_pmu_disable_event_ext, intel_pmu_disable_event_ext);
static void intel_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
@ -2571,9 +2609,12 @@ static void intel_pmu_disable_event(struct perf_event *event)
switch (idx) {
case 0 ... INTEL_PMC_IDX_FIXED - 1:
intel_clear_masks(event, idx);
static_call_cond(intel_pmu_disable_event_ext)(event);
x86_pmu_disable_event(event);
break;
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
static_call_cond(intel_pmu_disable_event_ext)(event);
fallthrough;
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
intel_pmu_disable_fixed(event);
break;
@ -2940,6 +2981,79 @@ static void intel_pmu_enable_acr(struct perf_event *event)
DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr);
static void intel_pmu_enable_event_ext(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
union arch_pebs_index old, new;
struct arch_pebs_cap cap;
u64 ext = 0;
cap = hybrid(cpuc->pmu, arch_pebs_cap);
if (event->attr.precise_ip) {
u64 pebs_data_cfg = intel_get_arch_pebs_data_config(event);
ext |= ARCH_PEBS_EN;
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD)
ext |= (-hwc->sample_period) & ARCH_PEBS_RELOAD;
if (pebs_data_cfg && cap.caps) {
if (pebs_data_cfg & PEBS_DATACFG_MEMINFO)
ext |= ARCH_PEBS_AUX & cap.caps;
if (pebs_data_cfg & PEBS_DATACFG_GP)
ext |= ARCH_PEBS_GPR & cap.caps;
if (pebs_data_cfg & PEBS_DATACFG_XMMS)
ext |= ARCH_PEBS_VECR_XMM & cap.caps;
if (pebs_data_cfg & PEBS_DATACFG_LBRS)
ext |= ARCH_PEBS_LBR & cap.caps;
if (pebs_data_cfg &
(PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT))
ext |= ARCH_PEBS_CNTR_GP & cap.caps;
if (pebs_data_cfg &
(PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT))
ext |= ARCH_PEBS_CNTR_FIXED & cap.caps;
if (pebs_data_cfg & PEBS_DATACFG_METRICS)
ext |= ARCH_PEBS_CNTR_METRICS & cap.caps;
}
if (cpuc->n_pebs == cpuc->n_large_pebs)
new.thresh = ARCH_PEBS_THRESH_MULTI;
else
new.thresh = ARCH_PEBS_THRESH_SINGLE;
rdmsrq(MSR_IA32_PEBS_INDEX, old.whole);
if (new.thresh != old.thresh || !old.en) {
if (old.thresh == ARCH_PEBS_THRESH_MULTI && old.wr > 0) {
/*
* Large PEBS was enabled.
* Drain PEBS buffer before applying the single PEBS.
*/
intel_pmu_drain_pebs_buffer();
} else {
new.wr = 0;
new.full = 0;
new.en = 1;
wrmsrq(MSR_IA32_PEBS_INDEX, new.whole);
}
}
}
if (is_pebs_counter_event_group(event))
ext |= ARCH_PEBS_CNTR_ALLOW;
if (cpuc->cfg_c_val[hwc->idx] != ext)
__intel_pmu_update_event_ext(hwc->idx, ext);
}
DEFINE_STATIC_CALL_NULL(intel_pmu_enable_event_ext, intel_pmu_enable_event_ext);
static void intel_pmu_enable_event(struct perf_event *event)
{
u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE;
@ -2955,10 +3069,12 @@ static void intel_pmu_enable_event(struct perf_event *event)
enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR;
intel_set_masks(event, idx);
static_call_cond(intel_pmu_enable_acr_event)(event);
static_call_cond(intel_pmu_enable_event_ext)(event);
__x86_pmu_enable_event(hwc, enable_mask);
break;
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
static_call_cond(intel_pmu_enable_acr_event)(event);
static_call_cond(intel_pmu_enable_event_ext)(event);
fallthrough;
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
intel_pmu_enable_fixed(event);
@ -3215,6 +3331,19 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
}
/*
* Arch PEBS sets bit 54 in the global status register
*/
if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT,
(unsigned long *)&status)) {
handled++;
static_call(x86_pmu_drain_pebs)(regs, &data);
if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] &&
is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS]))
status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
}
/*
* Intel PT
*/
@ -3269,7 +3398,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
* The PEBS buffer has to be drained before handling the A-PMI
*/
if (is_pebs_counter_event_group(event))
x86_pmu.drain_pebs(regs, &data);
static_call(x86_pmu_drain_pebs)(regs, &data);
last_period = event->hw.last_period;
@ -4029,7 +4158,9 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
if (!event->attr.exclude_kernel)
flags &= ~PERF_SAMPLE_REGS_USER;
if (event->attr.sample_regs_user & ~PEBS_GP_REGS)
flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
flags &= ~PERF_SAMPLE_REGS_USER;
if (event->attr.sample_regs_intr & ~PEBS_GP_REGS)
flags &= ~PERF_SAMPLE_REGS_INTR;
return flags;
}
@ -4204,6 +4335,20 @@ static bool intel_pmu_is_acr_group(struct perf_event *event)
return false;
}
static inline bool intel_pmu_has_pebs_counter_group(struct pmu *pmu)
{
u64 caps;
if (x86_pmu.intel_cap.pebs_format >= 6 && x86_pmu.intel_cap.pebs_baseline)
return true;
caps = hybrid(pmu, arch_pebs_cap).caps;
if (x86_pmu.arch_pebs && (caps & ARCH_PEBS_CNTR_MASK))
return true;
return false;
}
static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event,
u64 *cause_mask, int *num)
{
@ -4237,6 +4382,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
if (event->attr.precise_ip) {
struct arch_pebs_cap pebs_cap = hybrid(event->pmu, arch_pebs_cap);
if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT)
return -EINVAL;
@ -4250,6 +4397,15 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
if (x86_pmu.arch_pebs) {
u64 cntr_mask = hybrid(event->pmu, intel_ctrl) &
~GLOBAL_CTRL_EN_PERF_METRICS;
u64 pebs_mask = event->attr.precise_ip >= 3 ?
pebs_cap.pdists : pebs_cap.counters;
if (cntr_mask != pebs_mask)
event->hw.dyn_constraint &= pebs_mask;
}
}
if (needs_branch_stack(event)) {
@ -4341,8 +4497,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
if ((event->attr.sample_type & PERF_SAMPLE_READ) &&
(x86_pmu.intel_cap.pebs_format >= 6) &&
x86_pmu.intel_cap.pebs_baseline &&
intel_pmu_has_pebs_counter_group(event->pmu) &&
is_sampling_event(event) &&
event->attr.precise_ip)
event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR;
@ -5212,7 +5367,13 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
static int intel_pmu_cpu_prepare(int cpu)
{
return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
int ret;
ret = intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
if (ret)
return ret;
return alloc_arch_pebs_buf_on_cpu(cpu);
}
static void flip_smm_bit(void *data)
@ -5257,6 +5418,163 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con
u64 fixed_cntr_mask,
u64 intel_ctrl);
enum dyn_constr_type {
DYN_CONSTR_NONE,
DYN_CONSTR_BR_CNTR,
DYN_CONSTR_ACR_CNTR,
DYN_CONSTR_ACR_CAUSE,
DYN_CONSTR_PEBS,
DYN_CONSTR_PDIST,
DYN_CONSTR_MAX,
};
static const char * const dyn_constr_type_name[] = {
[DYN_CONSTR_NONE] = "a normal event",
[DYN_CONSTR_BR_CNTR] = "a branch counter logging event",
[DYN_CONSTR_ACR_CNTR] = "an auto-counter reload event",
[DYN_CONSTR_ACR_CAUSE] = "an auto-counter reload cause event",
[DYN_CONSTR_PEBS] = "a PEBS event",
[DYN_CONSTR_PDIST] = "a PEBS PDIST event",
};
static void __intel_pmu_check_dyn_constr(struct event_constraint *constr,
enum dyn_constr_type type, u64 mask)
{
struct event_constraint *c1, *c2;
int new_weight, check_weight;
u64 new_mask, check_mask;
for_each_event_constraint(c1, constr) {
new_mask = c1->idxmsk64 & mask;
new_weight = hweight64(new_mask);
/* ignore topdown perf metrics event */
if (c1->idxmsk64 & INTEL_PMC_MSK_TOPDOWN)
continue;
if (!new_weight && fls64(c1->idxmsk64) < INTEL_PMC_IDX_FIXED) {
pr_info("The event 0x%llx is not supported as %s.\n",
c1->code, dyn_constr_type_name[type]);
}
if (new_weight <= 1)
continue;
for_each_event_constraint(c2, c1 + 1) {
bool check_fail = false;
check_mask = c2->idxmsk64 & mask;
check_weight = hweight64(check_mask);
if (c2->idxmsk64 & INTEL_PMC_MSK_TOPDOWN ||
!check_weight)
continue;
/* The same constraints or no overlap */
if (new_mask == check_mask ||
(new_mask ^ check_mask) == (new_mask | check_mask))
continue;
/*
* A scheduler issue may be triggered in the following cases.
* - Two overlap constraints have the same weight.
* E.g., A constraints: 0x3, B constraints: 0x6
* event counter failure case
* B PMC[2:1] 1
* A PMC[1:0] 0
* A PMC[1:0] FAIL
* - Two overlap constraints have different weight.
* The constraint has a low weight, but has high last bit.
* E.g., A constraints: 0x7, B constraints: 0xC
* event counter failure case
* B PMC[3:2] 2
* A PMC[2:0] 0
* A PMC[2:0] 1
* A PMC[2:0] FAIL
*/
if (new_weight == check_weight) {
check_fail = true;
} else if (new_weight < check_weight) {
if ((new_mask | check_mask) != check_mask &&
fls64(new_mask) > fls64(check_mask))
check_fail = true;
} else {
if ((new_mask | check_mask) != new_mask &&
fls64(new_mask) < fls64(check_mask))
check_fail = true;
}
if (check_fail) {
pr_info("The two events 0x%llx and 0x%llx may not be "
"fully scheduled under some circumstances as "
"%s.\n",
c1->code, c2->code, dyn_constr_type_name[type]);
}
}
}
}
static void intel_pmu_check_dyn_constr(struct pmu *pmu,
struct event_constraint *constr,
u64 cntr_mask)
{
enum dyn_constr_type i;
u64 mask;
for (i = DYN_CONSTR_NONE; i < DYN_CONSTR_MAX; i++) {
mask = 0;
switch (i) {
case DYN_CONSTR_NONE:
mask = cntr_mask;
break;
case DYN_CONSTR_BR_CNTR:
if (x86_pmu.flags & PMU_FL_BR_CNTR)
mask = x86_pmu.lbr_counters;
break;
case DYN_CONSTR_ACR_CNTR:
mask = hybrid(pmu, acr_cntr_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0);
break;
case DYN_CONSTR_ACR_CAUSE:
if (hybrid(pmu, acr_cntr_mask64) == hybrid(pmu, acr_cause_mask64))
continue;
mask = hybrid(pmu, acr_cause_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0);
break;
case DYN_CONSTR_PEBS:
if (x86_pmu.arch_pebs)
mask = hybrid(pmu, arch_pebs_cap).counters;
break;
case DYN_CONSTR_PDIST:
if (x86_pmu.arch_pebs)
mask = hybrid(pmu, arch_pebs_cap).pdists;
break;
default:
pr_warn("Unsupported dynamic constraint type %d\n", i);
}
if (mask)
__intel_pmu_check_dyn_constr(constr, i, mask);
}
}
static void intel_pmu_check_event_constraints_all(struct pmu *pmu)
{
struct event_constraint *event_constraints = hybrid(pmu, event_constraints);
struct event_constraint *pebs_constraints = hybrid(pmu, pebs_constraints);
u64 cntr_mask = hybrid(pmu, cntr_mask64);
u64 fixed_cntr_mask = hybrid(pmu, fixed_cntr_mask64);
u64 intel_ctrl = hybrid(pmu, intel_ctrl);
intel_pmu_check_event_constraints(event_constraints, cntr_mask,
fixed_cntr_mask, intel_ctrl);
if (event_constraints)
intel_pmu_check_dyn_constr(pmu, event_constraints, cntr_mask);
if (pebs_constraints)
intel_pmu_check_dyn_constr(pmu, pebs_constraints, cntr_mask);
}
static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs);
static inline bool intel_pmu_broken_perf_cap(void)
@ -5269,34 +5587,89 @@ static inline bool intel_pmu_broken_perf_cap(void)
return false;
}
static inline void __intel_update_pmu_caps(struct pmu *pmu)
{
struct pmu *dest_pmu = pmu ? pmu : x86_get_pmu(smp_processor_id());
if (hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_XMM)
dest_pmu->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
}
static inline void __intel_update_large_pebs_flags(struct pmu *pmu)
{
u64 caps = hybrid(pmu, arch_pebs_cap).caps;
x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
if (caps & ARCH_PEBS_LBR)
x86_pmu.large_pebs_flags |= PERF_SAMPLE_BRANCH_STACK;
if (caps & ARCH_PEBS_CNTR_MASK)
x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
if (!(caps & ARCH_PEBS_AUX))
x86_pmu.large_pebs_flags &= ~PERF_SAMPLE_DATA_SRC;
if (!(caps & ARCH_PEBS_GPR)) {
x86_pmu.large_pebs_flags &=
~(PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER);
}
}
#define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED))
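A quick worked example of the helper above; INTEL_PMC_IDX_FIXED is 32 on x86, so the fixed-counter bits land in the upper half of the 64-bit mask (the counter counts are illustrative):

	/* e.g. 8 GP counters (EAX) and 4 fixed counters (EBX): */
	u64 mask = counter_mask(0xffULL, 0xfULL);	/* == 0x0000000f000000ffULL */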
static void update_pmu_cap(struct pmu *pmu)
{
unsigned int cntr, fixed_cntr, ecx, edx;
union cpuid35_eax eax;
union cpuid35_ebx ebx;
unsigned int eax, ebx, ecx, edx;
union cpuid35_eax eax_0;
union cpuid35_ebx ebx_0;
u64 cntrs_mask = 0;
u64 pebs_mask = 0;
u64 pdists_mask = 0;
cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx);
cpuid(ARCH_PERFMON_EXT_LEAF, &eax_0.full, &ebx_0.full, &ecx, &edx);
if (ebx.split.umask2)
if (ebx_0.split.umask2)
hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2;
if (ebx.split.eq)
if (ebx_0.split.eq)
hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ;
if (eax.split.cntr_subleaf) {
if (eax_0.split.cntr_subleaf) {
cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF,
&cntr, &fixed_cntr, &ecx, &edx);
hybrid(pmu, cntr_mask64) = cntr;
hybrid(pmu, fixed_cntr_mask64) = fixed_cntr;
&eax, &ebx, &ecx, &edx);
hybrid(pmu, cntr_mask64) = eax;
hybrid(pmu, fixed_cntr_mask64) = ebx;
cntrs_mask = counter_mask(eax, ebx);
}
if (eax.split.acr_subleaf) {
if (eax_0.split.acr_subleaf) {
cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF,
&cntr, &fixed_cntr, &ecx, &edx);
&eax, &ebx, &ecx, &edx);
/* The mask of the counters which can be reloaded */
hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED);
hybrid(pmu, acr_cntr_mask64) = counter_mask(eax, ebx);
/* The mask of the counters which can cause a reload of reloadable counters */
hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED);
hybrid(pmu, acr_cause_mask64) = counter_mask(ecx, edx);
}
/* Bits[5:4] should be set simultaneously if arch-PEBS is supported */
if (eax_0.split.pebs_caps_subleaf && eax_0.split.pebs_cnts_subleaf) {
cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_CAP_LEAF,
&eax, &ebx, &ecx, &edx);
hybrid(pmu, arch_pebs_cap).caps = (u64)ebx << 32;
cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_COUNTER_LEAF,
&eax, &ebx, &ecx, &edx);
pebs_mask = counter_mask(eax, ecx);
pdists_mask = counter_mask(ebx, edx);
hybrid(pmu, arch_pebs_cap).counters = pebs_mask;
hybrid(pmu, arch_pebs_cap).pdists = pdists_mask;
if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) {
x86_pmu.arch_pebs = 0;
} else {
__intel_update_pmu_caps(pmu);
__intel_update_large_pebs_flags(pmu);
}
} else {
WARN_ON(x86_pmu.arch_pebs == 1);
x86_pmu.arch_pebs = 0;
}
if (!intel_pmu_broken_perf_cap()) {
@ -5319,10 +5692,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
else
pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
intel_pmu_check_event_constraints(pmu->event_constraints,
pmu->cntr_mask64,
pmu->fixed_cntr_mask64,
pmu->intel_ctrl);
intel_pmu_check_event_constraints_all(&pmu->pmu);
intel_pmu_check_extra_regs(pmu->extra_regs);
}
@ -5418,6 +5788,7 @@ static void intel_pmu_cpu_starting(int cpu)
return;
init_debug_store_on_cpu(cpu);
init_arch_pebs_on_cpu(cpu);
/*
* Deal with CPUs that don't clear their LBRs on power-up, and that may
* even boot with LBRs enabled.
@ -5456,6 +5827,8 @@ static void intel_pmu_cpu_starting(int cpu)
}
}
__intel_update_pmu_caps(cpuc->pmu);
if (!cpuc->shared_regs)
return;
@ -5515,6 +5888,7 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc)
static void intel_pmu_cpu_dying(int cpu)
{
fini_debug_store_on_cpu(cpu);
fini_arch_pebs_on_cpu(cpu);
}
void intel_cpuc_finish(struct cpu_hw_events *cpuc)
@ -5535,6 +5909,7 @@ static void intel_pmu_cpu_dead(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
release_arch_pebs_buf_on_cpu(cpu);
intel_cpuc_finish(cpuc);
if (is_hybrid() && cpuc->pmu)
@ -6250,7 +6625,7 @@ tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i)
static umode_t
pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
return x86_pmu.ds_pebs ? attr->mode : 0;
return intel_pmu_has_pebs() ? attr->mode : 0;
}
static umode_t
@ -6940,8 +7315,11 @@ __init int intel_pmu_init(void)
* Many features on and after V6 require dynamic constraint,
* e.g., Arch PEBS, ACR.
*/
if (version >= 6)
if (version >= 6) {
x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT;
x86_pmu.late_setup = intel_pmu_late_setup;
}
/*
* Install the hw-cache-events table:
*/
@ -7727,6 +8105,14 @@ __init int intel_pmu_init(void)
if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT))
update_pmu_cap(NULL);
if (x86_pmu.arch_pebs) {
static_call_update(intel_pmu_disable_event_ext,
intel_pmu_disable_event_ext);
static_call_update(intel_pmu_enable_event_ext,
intel_pmu_enable_event_ext);
pr_cont("Architectural PEBS, ");
}
intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64,
&x86_pmu.fixed_cntr_mask64,
&x86_pmu.intel_ctrl);
@ -7735,10 +8121,8 @@ __init int intel_pmu_init(void)
if (x86_pmu.intel_cap.anythread_deprecated)
x86_pmu.format_attrs = intel_arch_formats_attr;
intel_pmu_check_event_constraints(x86_pmu.event_constraints,
x86_pmu.cntr_mask64,
x86_pmu.fixed_cntr_mask64,
x86_pmu.intel_ctrl);
intel_pmu_check_event_constraints_all(NULL);
/*
* Access LBR MSR may cause #GP under certain circumstances.
* Check all LBR MSR here.

View File

@ -41,7 +41,7 @@
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
* Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
* MTL,SRF,GRR,ARL,LNL
* MTL,SRF,GRR,ARL,LNL,PTL
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@ -53,31 +53,32 @@
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
* TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
* GRR,ARL,LNL
* GRR,ARL,LNL,PTL
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
* Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
* ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL
* ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL,
* PTL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
* KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
* RPL,SPR,MTL,ARL,LNL,SRF
* RPL,SPR,MTL,ARL,LNL,SRF,PTL
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
* GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
* ADL,RPL,MTL,ARL,LNL
* ADL,RPL,MTL,ARL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
* TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
* ARL,LNL
* ARL,LNL,PTL
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
@ -96,7 +97,7 @@
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
* TNT,RKL,ADL,RPL,MTL,ARL,LNL
* TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL
* Scope: Package (physical package)
* MSR_MODULE_C6_RES_MS: Module C6 Residency Counter.
* perf code: 0x00
@ -522,7 +523,6 @@ static const struct cstate_model lnl_cstates __initconst = {
BIT(PERF_CSTATE_CORE_C7_RES),
.pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
BIT(PERF_CSTATE_PKG_C3_RES) |
BIT(PERF_CSTATE_PKG_C6_RES) |
BIT(PERF_CSTATE_PKG_C10_RES),
};
@ -628,6 +628,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates),
X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates),
X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates),
X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &srf_cstates),
X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates),
X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates),
@ -652,6 +653,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_VFM(INTEL_ARROWLAKE_H, &adl_cstates),
X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates),
X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates),
X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);

View File

@ -626,13 +626,18 @@ static int alloc_pebs_buffer(int cpu)
int max, node = cpu_to_node(cpu);
void *buffer, *insn_buff, *cea;
if (!x86_pmu.ds_pebs)
if (!intel_pmu_has_pebs())
return 0;
buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
if (unlikely(!buffer))
return -ENOMEM;
if (x86_pmu.arch_pebs) {
hwev->pebs_vaddr = buffer;
return 0;
}
/*
* HSW+ already provides us the eventing ip; no need to allocate this
* buffer then.
@ -645,7 +650,7 @@ static int alloc_pebs_buffer(int cpu)
}
per_cpu(insn_buffer, cpu) = insn_buff;
}
hwev->ds_pebs_vaddr = buffer;
hwev->pebs_vaddr = buffer;
/* Update the cpu entry area mapping */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds->pebs_buffer_base = (unsigned long) cea;
@ -661,17 +666,20 @@ static void release_pebs_buffer(int cpu)
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
void *cea;
if (!x86_pmu.ds_pebs)
if (!intel_pmu_has_pebs())
return;
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
if (x86_pmu.ds_pebs) {
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
/* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
hwev->ds_pebs_vaddr = NULL;
/* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
}
dsfree_pages(hwev->pebs_vaddr, x86_pmu.pebs_buffer_size);
hwev->pebs_vaddr = NULL;
}
static int alloc_bts_buffer(int cpu)
@ -824,6 +832,56 @@ void reserve_ds_buffers(void)
}
}
inline int alloc_arch_pebs_buf_on_cpu(int cpu)
{
if (!x86_pmu.arch_pebs)
return 0;
return alloc_pebs_buffer(cpu);
}
inline void release_arch_pebs_buf_on_cpu(int cpu)
{
if (!x86_pmu.arch_pebs)
return;
release_pebs_buffer(cpu);
}
void init_arch_pebs_on_cpu(int cpu)
{
struct cpu_hw_events *cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
u64 arch_pebs_base;
if (!x86_pmu.arch_pebs)
return;
if (!cpuc->pebs_vaddr) {
WARN(1, "Fail to allocate PEBS buffer on CPU %d\n", cpu);
x86_pmu.pebs_active = 0;
return;
}
/*
* 4KB-aligned pointer to the output buffer
* (__alloc_pages_node() returns a page-aligned address).
* Buffer Size = 4KB * 2^SIZE, allocated as a contiguous
* physical buffer (__alloc_pages_node() with order).
*/
arch_pebs_base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT;
wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)arch_pebs_base,
(u32)(arch_pebs_base >> 32));
x86_pmu.pebs_active = 1;
}
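To make the encoding above concrete, a hedged worked example assuming 4 KiB pages and PEBS_BUFFER_SHIFT == 4 (as defined later in this series); the address is made up:

	u64 phys = 0x12340000ULL;		/* page-aligned buffer address          */
	u64 val  = phys | PEBS_BUFFER_SHIFT;	/* 0x12340004: 4 KiB * 2^4 = 64 KiB buf */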
inline void fini_arch_pebs_on_cpu(int cpu)
{
if (!x86_pmu.arch_pebs)
return;
wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0);
}
/*
* BTS
*/
@ -1471,6 +1529,25 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
}
}
u64 intel_get_arch_pebs_data_config(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
u64 pebs_data_cfg = 0;
u64 cntr_mask;
if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX))
return 0;
pebs_data_cfg |= pebs_update_adaptive_cfg(event);
cntr_mask = (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT) |
(PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT) |
PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS;
pebs_data_cfg |= cpuc->pebs_data_cfg & cntr_mask;
return pebs_data_cfg;
}
void intel_pmu_pebs_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@ -1532,6 +1609,15 @@ static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc)
intel_pmu_drain_pebs_buffer();
}
static void __intel_pmu_pebs_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
cpuc->pebs_enabled |= 1ULL << hwc->idx;
}
void intel_pmu_pebs_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@ -1540,9 +1626,7 @@ void intel_pmu_pebs_enable(struct perf_event *event)
struct debug_store *ds = cpuc->ds;
unsigned int idx = hwc->idx;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
cpuc->pebs_enabled |= 1ULL << hwc->idx;
__intel_pmu_pebs_enable(event);
if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5))
cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
@ -1604,14 +1688,22 @@ void intel_pmu_pebs_del(struct perf_event *event)
pebs_update_state(needed_cb, cpuc, event, false);
}
void intel_pmu_pebs_disable(struct perf_event *event)
static void __intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
intel_pmu_drain_large_pebs(cpuc);
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
}
void intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
__intel_pmu_pebs_disable(event);
if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) &&
(x86_pmu.version < 5))
@ -1623,8 +1715,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
if (cpuc->enabled)
wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
}
void intel_pmu_pebs_enable_all(void)
@ -2060,6 +2150,90 @@ static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc,
#define PEBS_LATENCY_MASK 0xffff
static inline void __setup_perf_sample_data(struct perf_event *event,
struct pt_regs *iregs,
struct perf_sample_data *data)
{
perf_sample_data_init(data, 0, event->hw.last_period);
/*
* We must however always use iregs for the unwinder to stay sane; the
* record BP,SP,IP can point into thin air when the record is from a
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
perf_sample_save_callchain(data, event, iregs);
}
static inline void __setup_pebs_basic_group(struct perf_event *event,
struct pt_regs *regs,
struct perf_sample_data *data,
u64 sample_type, u64 ip,
u64 tsc, u16 retire)
{
/* The ip in basic is EventingIP */
set_linear_ip(regs, ip);
regs->flags = PERF_EFLAGS_EXACT;
setup_pebs_time(event, data, tsc);
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
data->weight.var3_w = retire;
}
static inline void __setup_pebs_gpr_group(struct perf_event *event,
struct pt_regs *regs,
struct pebs_gprs *gprs,
u64 sample_type)
{
if (event->attr.precise_ip < 2) {
set_linear_ip(regs, gprs->ip);
regs->flags &= ~PERF_EFLAGS_EXACT;
}
if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER))
adaptive_pebs_save_regs(regs, gprs);
}
static inline void __setup_pebs_meminfo_group(struct perf_event *event,
struct perf_sample_data *data,
u64 sample_type, u64 latency,
u16 instr_latency, u64 address,
u64 aux, u64 tsx_tuning, u64 ax)
{
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
u64 tsx_latency = intel_get_tsx_weight(tsx_tuning);
data->weight.var2_w = instr_latency;
/*
* Although meminfo::latency is defined as a u64,
* only the lower 32 bits include the valid data
* in practice on Ice Lake and earlier platforms.
*/
if (sample_type & PERF_SAMPLE_WEIGHT)
data->weight.full = latency ?: tsx_latency;
else
data->weight.var1_dw = (u32)latency ?: tsx_latency;
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
}
if (sample_type & PERF_SAMPLE_DATA_SRC) {
data->data_src.val = get_data_src(event, aux);
data->sample_flags |= PERF_SAMPLE_DATA_SRC;
}
if (sample_type & PERF_SAMPLE_ADDR_TYPE) {
data->addr = address;
data->sample_flags |= PERF_SAMPLE_ADDR;
}
if (sample_type & PERF_SAMPLE_TRANSACTION) {
data->txn = intel_get_tsx_transaction(tsx_tuning, ax);
data->sample_flags |= PERF_SAMPLE_TRANSACTION;
}
}
/*
* With adaptive PEBS the layout depends on what fields are configured.
*/
@ -2069,12 +2243,14 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
u64 sample_type = event->attr.sample_type;
struct pebs_basic *basic = __pebs;
void *next_record = basic + 1;
u64 sample_type, format_group;
struct pebs_meminfo *meminfo = NULL;
struct pebs_gprs *gprs = NULL;
struct x86_perf_regs *perf_regs;
u64 format_group;
u16 retire;
if (basic == NULL)
return;
@ -2082,31 +2258,17 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
perf_regs = container_of(regs, struct x86_perf_regs, regs);
perf_regs->xmm_regs = NULL;
sample_type = event->attr.sample_type;
format_group = basic->format_group;
perf_sample_data_init(data, 0, event->hw.last_period);
setup_pebs_time(event, data, basic->tsc);
/*
* We must however always use iregs for the unwinder to stay sane; the
* record BP,SP,IP can point into thin air when the record is from a
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
perf_sample_save_callchain(data, event, iregs);
__setup_perf_sample_data(event, iregs, data);
*regs = *iregs;
/* The ip in basic is EventingIP */
set_linear_ip(regs, basic->ip);
regs->flags = PERF_EFLAGS_EXACT;
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
data->weight.var3_w = basic->retire_latency;
else
data->weight.var3_w = 0;
}
/* basic group */
retire = x86_pmu.flags & PMU_FL_RETIRE_LATENCY ?
basic->retire_latency : 0;
__setup_pebs_basic_group(event, regs, data, sample_type,
basic->ip, basic->tsc, retire);
/*
* The record for MEMINFO is in front of GP
@ -2122,54 +2284,20 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
gprs = next_record;
next_record = gprs + 1;
if (event->attr.precise_ip < 2) {
set_linear_ip(regs, gprs->ip);
regs->flags &= ~PERF_EFLAGS_EXACT;
}
if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER))
adaptive_pebs_save_regs(regs, gprs);
__setup_pebs_gpr_group(event, regs, gprs, sample_type);
}
if (format_group & PEBS_DATACFG_MEMINFO) {
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
meminfo->cache_latency : meminfo->mem_latency;
u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
meminfo->cache_latency : meminfo->mem_latency;
u64 instr_latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
meminfo->instr_latency : 0;
u64 ax = gprs ? gprs->ax : 0;
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY)
data->weight.var2_w = meminfo->instr_latency;
/*
* Although meminfo::latency is defined as a u64,
* only the lower 32 bits include the valid data
* in practice on Ice Lake and earlier platforms.
*/
if (sample_type & PERF_SAMPLE_WEIGHT) {
data->weight.full = latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning);
} else {
data->weight.var1_dw = (u32)latency ?:
intel_get_tsx_weight(meminfo->tsx_tuning);
}
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
}
if (sample_type & PERF_SAMPLE_DATA_SRC) {
data->data_src.val = get_data_src(event, meminfo->aux);
data->sample_flags |= PERF_SAMPLE_DATA_SRC;
}
if (sample_type & PERF_SAMPLE_ADDR_TYPE) {
data->addr = meminfo->address;
data->sample_flags |= PERF_SAMPLE_ADDR;
}
if (sample_type & PERF_SAMPLE_TRANSACTION) {
data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning,
gprs ? gprs->ax : 0);
data->sample_flags |= PERF_SAMPLE_TRANSACTION;
}
__setup_pebs_meminfo_group(event, data, sample_type, latency,
instr_latency, meminfo->address,
meminfo->aux, meminfo->tsx_tuning,
ax);
}
if (format_group & PEBS_DATACFG_XMMS) {
@ -2220,6 +2348,135 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
format_group);
}
static inline bool arch_pebs_record_continued(struct arch_pebs_header *header)
{
/* The continue bit or a null PEBS record indicates that a fragment follows. */
return header->cont || !(header->format & GENMASK_ULL(63, 16));
}
static void setup_arch_pebs_sample_data(struct perf_event *event,
struct pt_regs *iregs,
void *__pebs,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
u64 sample_type = event->attr.sample_type;
struct arch_pebs_header *header = NULL;
struct arch_pebs_aux *meminfo = NULL;
struct arch_pebs_gprs *gprs = NULL;
struct x86_perf_regs *perf_regs;
void *next_record;
void *at = __pebs;
if (at == NULL)
return;
perf_regs = container_of(regs, struct x86_perf_regs, regs);
perf_regs->xmm_regs = NULL;
__setup_perf_sample_data(event, iregs, data);
*regs = *iregs;
again:
header = at;
next_record = at + sizeof(struct arch_pebs_header);
if (header->basic) {
struct arch_pebs_basic *basic = next_record;
u16 retire = 0;
next_record = basic + 1;
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
retire = basic->valid ? basic->retire : 0;
__setup_pebs_basic_group(event, regs, data, sample_type,
basic->ip, basic->tsc, retire);
}
/*
* The record for MEMINFO is in front of GP
* But PERF_SAMPLE_TRANSACTION needs gprs->ax.
* Save the pointer here but process later.
*/
if (header->aux) {
meminfo = next_record;
next_record = meminfo + 1;
}
if (header->gpr) {
gprs = next_record;
next_record = gprs + 1;
__setup_pebs_gpr_group(event, regs,
(struct pebs_gprs *)gprs,
sample_type);
}
if (header->aux) {
u64 ax = gprs ? gprs->ax : 0;
__setup_pebs_meminfo_group(event, data, sample_type,
meminfo->cache_latency,
meminfo->instr_latency,
meminfo->address, meminfo->aux,
meminfo->tsx_tuning, ax);
}
if (header->xmm) {
struct pebs_xmm *xmm;
next_record += sizeof(struct arch_pebs_xer_header);
xmm = next_record;
perf_regs->xmm_regs = xmm->xmm;
next_record = xmm + 1;
}
if (header->lbr) {
struct arch_pebs_lbr_header *lbr_header = next_record;
struct lbr_entry *lbr;
int num_lbr;
next_record = lbr_header + 1;
lbr = next_record;
num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ?
lbr_header->depth :
header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES;
next_record += num_lbr * sizeof(struct lbr_entry);
if (has_branch_stack(event)) {
intel_pmu_store_pebs_lbrs(lbr);
intel_pmu_lbr_save_brstack(data, cpuc, event);
}
}
if (header->cntr) {
struct arch_pebs_cntr_header *cntr = next_record;
unsigned int nr;
next_record += sizeof(struct arch_pebs_cntr_header);
if (is_pebs_counter_event_group(event)) {
__setup_pebs_counter_group(cpuc, event,
(struct pebs_cntr_header *)cntr, next_record);
data->sample_flags |= PERF_SAMPLE_READ;
}
nr = hweight32(cntr->cntr) + hweight32(cntr->fixed);
if (cntr->metrics == INTEL_CNTR_METRICS)
nr += 2;
next_record += nr * sizeof(u64);
}
/* Parse the following fragments, if any. */
if (arch_pebs_record_continued(header)) {
at = at + header->size;
goto again;
}
}
static inline void *
get_next_pebs_record_by_bit(void *base, void *top, int bit)
{
@ -2602,6 +2859,57 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
}
}
static __always_inline void
__intel_pmu_handle_pebs_record(struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data,
void *at, u64 pebs_status,
short *counts, void **last,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *event;
int bit;
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
event = cpuc->events[bit];
if (WARN_ON_ONCE(!event) ||
WARN_ON_ONCE(!event->attr.precise_ip))
continue;
if (counts[bit]++) {
__intel_pmu_pebs_event(event, iregs, regs, data,
last[bit], setup_sample);
}
last[bit] = at;
}
}
static __always_inline void
__intel_pmu_handle_last_pebs_record(struct pt_regs *iregs,
struct pt_regs *regs,
struct perf_sample_data *data,
u64 mask, short *counts, void **last,
setup_fn setup_sample)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *event;
int bit;
for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
if (!counts[bit])
continue;
event = cpuc->events[bit];
__intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
counts[bit], setup_sample);
}
}
static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
@ -2611,9 +2919,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
struct pebs_basic *basic;
struct perf_event *event;
void *base, *at, *top;
int bit;
u64 mask;
if (!x86_pmu.pebs_active)
@ -2626,6 +2932,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
mask = hybrid(cpuc->pmu, pebs_events_mask) |
(hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED);
mask &= cpuc->pebs_enabled;
if (unlikely(base >= top)) {
intel_pmu_pebs_event_update_no_drain(cpuc, mask);
@ -2643,38 +2950,114 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
if (basic->format_size != cpuc->pebs_record_size)
continue;
pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
event = cpuc->events[bit];
if (WARN_ON_ONCE(!event) ||
WARN_ON_ONCE(!event->attr.precise_ip))
continue;
if (counts[bit]++) {
__intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
setup_pebs_adaptive_sample_data);
}
last[bit] = at;
}
pebs_status = mask & basic->applicable_counters;
__intel_pmu_handle_pebs_record(iregs, regs, data, at,
pebs_status, counts, last,
setup_pebs_adaptive_sample_data);
}
for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
if (!counts[bit])
__intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, counts, last,
setup_pebs_adaptive_sample_data);
}
static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs,
struct perf_sample_data *data)
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
union arch_pebs_index index;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *base, *at, *top;
u64 mask;
rdmsrq(MSR_IA32_PEBS_INDEX, index.whole);
if (unlikely(!index.wr)) {
intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX);
return;
}
base = cpuc->pebs_vaddr;
top = cpuc->pebs_vaddr + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT);
index.wr = 0;
index.full = 0;
index.en = 1;
if (cpuc->n_pebs == cpuc->n_large_pebs)
index.thresh = ARCH_PEBS_THRESH_MULTI;
else
index.thresh = ARCH_PEBS_THRESH_SINGLE;
wrmsrq(MSR_IA32_PEBS_INDEX, index.whole);
mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled;
if (!iregs)
iregs = &dummy_iregs;
/* Process all but the last event for each counter. */
for (at = base; at < top;) {
struct arch_pebs_header *header;
struct arch_pebs_basic *basic;
u64 pebs_status;
header = at;
if (WARN_ON_ONCE(!header->size))
break;
/* 1st fragment or single record must have basic group */
if (!header->basic) {
at += header->size;
continue;
}
event = cpuc->events[bit];
basic = at + sizeof(struct arch_pebs_header);
pebs_status = mask & basic->applicable_counters;
__intel_pmu_handle_pebs_record(iregs, regs, data, at,
pebs_status, counts, last,
setup_arch_pebs_sample_data);
__intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
counts[bit], setup_pebs_adaptive_sample_data);
/* Skip non-last fragments */
while (arch_pebs_record_continued(header)) {
if (!header->size)
break;
at += header->size;
header = at;
}
/* Skip last fragment or the single record */
at += header->size;
}
__intel_pmu_handle_last_pebs_record(iregs, regs, data, mask,
counts, last,
setup_arch_pebs_sample_data);
}
static void __init intel_arch_pebs_init(void)
{
/*
* On current hybrid platforms, either all core types support arch-PEBS
* or none do, so the x86_pmu.arch_pebs flag can be set directly
* if the boot CPU supports arch-PEBS.
*/
x86_pmu.arch_pebs = 1;
x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs;
x86_pmu.pebs_capable = ~0ULL;
x86_pmu.flags |= PMU_FL_PEBS_ALL;
x86_pmu.pebs_enable = __intel_pmu_pebs_enable;
x86_pmu.pebs_disable = __intel_pmu_pebs_disable;
}
/*
* PEBS probe and setup
*/
void __init intel_pebs_init(void)
static void __init intel_ds_pebs_init(void)
{
/*
* No support for 32bit formats
@ -2736,10 +3119,8 @@ void __init intel_pebs_init(void)
break;
case 6:
if (x86_pmu.intel_cap.pebs_baseline) {
if (x86_pmu.intel_cap.pebs_baseline)
x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ;
x86_pmu.late_setup = intel_pmu_late_setup;
}
fallthrough;
case 5:
x86_pmu.pebs_ept = 1;
@ -2789,6 +3170,14 @@ void __init intel_pebs_init(void)
}
}
void __init intel_pebs_init(void)
{
if (x86_pmu.intel_cap.pebs_format == 0xf)
intel_arch_pebs_init();
else
intel_ds_pebs_init();
}
void perf_restore_debug_store(void)
{
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);

View File

@ -283,8 +283,9 @@ struct cpu_hw_events {
* Intel DebugStore bits
*/
struct debug_store *ds;
void *ds_pebs_vaddr;
void *ds_bts_vaddr;
/* DS based PEBS or arch-PEBS buffer address */
void *pebs_vaddr;
u64 pebs_enabled;
int n_pebs;
int n_large_pebs;
@ -303,6 +304,8 @@ struct cpu_hw_events {
/* Intel ACR configuration */
u64 acr_cfg_b[X86_PMC_IDX_MAX];
u64 acr_cfg_c[X86_PMC_IDX_MAX];
/* Cached CFG_C values */
u64 cfg_c_val[X86_PMC_IDX_MAX];
/*
* Intel LBR bits
@ -708,6 +711,12 @@ enum hybrid_pmu_type {
hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny,
};
struct arch_pebs_cap {
u64 caps;
u64 counters;
u64 pdists;
};
struct x86_hybrid_pmu {
struct pmu pmu;
const char *name;
@ -752,6 +761,8 @@ struct x86_hybrid_pmu {
mid_ack :1,
enabled_ack :1;
struct arch_pebs_cap arch_pebs_cap;
u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX];
};
@ -906,7 +917,7 @@ struct x86_pmu {
union perf_capabilities intel_cap;
/*
* Intel DebugStore bits
* Intel DebugStore and PEBS bits
*/
unsigned int bts :1,
bts_active :1,
@ -917,7 +928,8 @@ struct x86_pmu {
pebs_no_tlb :1,
pebs_no_isolation :1,
pebs_block :1,
pebs_ept :1;
pebs_ept :1,
arch_pebs :1;
int pebs_record_size;
int pebs_buffer_size;
u64 pebs_events_mask;
@ -929,6 +941,11 @@ struct x86_pmu {
u64 rtm_abort_event;
u64 pebs_capable;
/*
* Intel Architectural PEBS
*/
struct arch_pebs_cap arch_pebs_cap;
/*
* Intel LBR
*/
@ -1124,7 +1141,6 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
.pmu_type = _pmu, \
}
int is_x86_event(struct perf_event *event);
struct pmu *x86_get_pmu(unsigned int cpu);
extern struct x86_pmu x86_pmu __read_mostly;
@ -1217,7 +1233,7 @@ int x86_reserve_hardware(void);
void x86_release_hardware(void);
int x86_pmu_max_precise(void);
int x86_pmu_max_precise(struct pmu *pmu);
void hw_perf_lbr_event_destroy(struct perf_event *event);
@ -1604,6 +1620,14 @@ extern void intel_cpuc_finish(struct cpu_hw_events *cpuc);
int intel_pmu_init(void);
int alloc_arch_pebs_buf_on_cpu(int cpu);
void release_arch_pebs_buf_on_cpu(int cpu);
void init_arch_pebs_on_cpu(int cpu);
void fini_arch_pebs_on_cpu(int cpu);
void init_debug_store_on_cpu(int cpu);
void fini_debug_store_on_cpu(int cpu);
@ -1760,6 +1784,8 @@ void intel_pmu_pebs_data_source_cmt(void);
void intel_pmu_pebs_data_source_lnl(void);
u64 intel_get_arch_pebs_data_config(struct perf_event *event);
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
@ -1792,6 +1818,11 @@ static inline int intel_pmu_max_num_pebs(struct pmu *pmu)
return fls((u32)hybrid(pmu, pebs_events_mask));
}
static inline bool intel_pmu_has_pebs(void)
{
return x86_pmu.ds_pebs || x86_pmu.arch_pebs;
}
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)

View File

@ -198,6 +198,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define ALTINSTR_ENTRY(ft_flags) \
".pushsection .altinstructions,\"a\"\n" \
ANNOTATE_DATA_SPECIAL \
" .long 771b - .\n" /* label */ \
" .long 774f - .\n" /* new instruction */ \
" .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \
@ -207,6 +208,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \
".pushsection .altinstr_replacement, \"ax\"\n" \
ANNOTATE_DATA_SPECIAL \
"# ALT: replacement\n" \
"774:\n\t" newinstr "\n775:\n" \
".popsection\n"
@ -337,6 +339,7 @@ void nop_func(void);
* instruction. See apply_alternatives().
*/
.macro altinstr_entry orig alt ft_flags orig_len alt_len
ANNOTATE_DATA_SPECIAL
.long \orig - .
.long \alt - .
.4byte \ft_flags
@ -365,6 +368,7 @@ void nop_func(void);
.popsection ; \
.pushsection .altinstr_replacement,"ax" ; \
743: \
ANNOTATE_DATA_SPECIAL ; \
newinst ; \
744: \
.popsection ;

View File

@ -2,6 +2,8 @@
#ifndef _ASM_X86_ASM_H
#define _ASM_X86_ASM_H
#include <linux/annotate.h>
#ifdef __ASSEMBLER__
# define __ASM_FORM(x, ...) x,## __VA_ARGS__
# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__
@ -132,6 +134,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
# define _ASM_EXTABLE_TYPE(from, to, type) \
.pushsection "__ex_table","a" ; \
.balign 4 ; \
ANNOTATE_DATA_SPECIAL ; \
.long (from) - . ; \
.long (to) - . ; \
.long type ; \
@ -179,6 +182,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
# define _ASM_EXTABLE_TYPE(from, to, type) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .balign 4\n" \
ANNOTATE_DATA_SPECIAL \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
" .long " __stringify(type) " \n" \
@ -187,6 +191,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
# define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .balign 4\n" \
ANNOTATE_DATA_SPECIAL \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
DEFINE_EXTABLE_TYPE_REG \

View File

@ -7,6 +7,11 @@
#include <linux/objtool.h>
#include <asm/asm.h>
#ifndef __ASSEMBLY__
struct bug_entry;
extern void __WARN_trap(struct bug_entry *bug, ...);
#endif
/*
* Despite that some emulators terminate on UD2, we use it for WARN().
*/
@ -31,52 +36,77 @@
#define BUG_UD2 0xfffe
#define BUG_UD1 0xfffd
#define BUG_UD1_UBSAN 0xfffc
#define BUG_UD1_WARN 0xfffb
#define BUG_UDB 0xffd6
#define BUG_LOCK 0xfff0
#ifdef CONFIG_GENERIC_BUG
#ifdef CONFIG_X86_32
# define __BUG_REL(val) ".long " val
#else
# define __BUG_REL(val) ".long " val " - ."
#endif
#ifdef CONFIG_DEBUG_BUGVERBOSE
#define __BUG_ENTRY(file, line, flags) \
"2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \
"\t" __BUG_REL(file) "\t# bug_entry::file\n" \
"\t.word " line "\t# bug_entry::line\n" \
"\t.word " flags "\t# bug_entry::flags\n"
#define __BUG_ENTRY_VERBOSE(file, line) \
"\t.long " file " - .\t# bug_entry::file\n" \
"\t.word " line "\t# bug_entry::line\n"
#else
#define __BUG_ENTRY(file, line, flags) \
"2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \
"\t.word " flags "\t# bug_entry::flags\n"
#define __BUG_ENTRY_VERBOSE(file, line)
#endif
#define _BUG_FLAGS_ASM(ins, file, line, flags, size, extra) \
"1:\t" ins "\n" \
".pushsection __bug_table,\"aw\"\n" \
__BUG_ENTRY(file, line, flags) \
#if defined(CONFIG_X86_64) || defined(CONFIG_DEBUG_BUGVERBOSE_DETAILED)
#define HAVE_ARCH_BUG_FORMAT
#define __BUG_ENTRY_FORMAT(format) \
"\t.long " format " - .\t# bug_entry::format\n"
#else
#define __BUG_ENTRY_FORMAT(format)
#endif
#ifdef CONFIG_X86_64
#define HAVE_ARCH_BUG_FORMAT_ARGS
#endif
#define __BUG_ENTRY(format, file, line, flags) \
"\t.long 1b - ." "\t# bug_entry::bug_addr\n" \
__BUG_ENTRY_FORMAT(format) \
__BUG_ENTRY_VERBOSE(file, line) \
"\t.word " flags "\t# bug_entry::flags\n"
#define _BUG_FLAGS_ASM(format, file, line, flags, size, extra) \
".pushsection __bug_table,\"aw\"\n\t" \
ANNOTATE_DATA_SPECIAL \
"2:\n\t" \
__BUG_ENTRY(format, file, line, flags) \
"\t.org 2b + " size "\n" \
".popsection\n" \
extra
#define _BUG_FLAGS(ins, flags, extra) \
#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED
#define WARN_CONDITION_STR(cond_str) cond_str
#else
#define WARN_CONDITION_STR(cond_str) ""
#endif
#define _BUG_FLAGS(cond_str, ins, flags, extra) \
do { \
asm_inline volatile(_BUG_FLAGS_ASM(ins, "%c0", \
"%c1", "%c2", "%c3", extra) \
: : "i" (__FILE__), "i" (__LINE__), \
"i" (flags), \
"i" (sizeof(struct bug_entry))); \
asm_inline volatile("1:\t" ins "\n" \
_BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \
"%c[line]", "%c[fl]", \
"%c[size]", extra) \
: : [fmt] "i" (WARN_CONDITION_STR(cond_str)), \
[file] "i" (__FILE__), \
[line] "i" (__LINE__), \
[fl] "i" (flags), \
[size] "i" (sizeof(struct bug_entry))); \
} while (0)
#define ARCH_WARN_ASM(file, line, flags, size) \
_BUG_FLAGS_ASM(ASM_UD2, file, line, flags, size, "")
".pushsection .rodata.str1.1, \"aMS\", @progbits, 1\n" \
"99:\n" \
"\t.string \"\"\n" \
".popsection\n" \
"1:\t " ASM_UD2 "\n" \
_BUG_FLAGS_ASM("99b", file, line, flags, size, "")
#else
#define _BUG_FLAGS(ins, flags, extra) asm volatile(ins)
#define _BUG_FLAGS(cond_str, ins, flags, extra) asm volatile(ins)
#endif /* CONFIG_GENERIC_BUG */
@ -84,7 +114,7 @@ do { \
#define BUG() \
do { \
instrumentation_begin(); \
_BUG_FLAGS(ASM_UD2, 0, ""); \
_BUG_FLAGS("", ASM_UD2, 0, ""); \
__builtin_unreachable(); \
} while (0)
@ -97,14 +127,69 @@ do { \
#define ARCH_WARN_REACHABLE ANNOTATE_REACHABLE(1b)
#define __WARN_FLAGS(flags) \
do { \
__auto_type __flags = BUGFLAG_WARNING|(flags); \
instrumentation_begin(); \
_BUG_FLAGS(ASM_UD2, __flags, ARCH_WARN_REACHABLE); \
instrumentation_end(); \
#define __WARN_FLAGS(cond_str, flags) \
do { \
__auto_type __flags = BUGFLAG_WARNING|(flags); \
instrumentation_begin(); \
_BUG_FLAGS(cond_str, ASM_UD2, __flags, ARCH_WARN_REACHABLE); \
instrumentation_end(); \
} while (0)
#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
#ifndef __ASSEMBLY__
#include <linux/static_call_types.h>
DECLARE_STATIC_CALL(WARN_trap, __WARN_trap);
struct pt_regs;
struct sysv_va_list { /* from AMD64 System V ABI */
unsigned int gp_offset;
unsigned int fp_offset;
void *overflow_arg_area;
void *reg_save_area;
};
struct arch_va_list {
unsigned long regs[6];
struct sysv_va_list args;
};
extern void *__warn_args(struct arch_va_list *args, struct pt_regs *regs);
#endif /* __ASSEMBLY__ */
#define __WARN_bug_entry(flags, format) ({ \
struct bug_entry *bug; \
asm_inline volatile("lea (2f)(%%rip), %[addr]\n1:\n" \
_BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \
"%c[line]", "%c[fl]", \
"%c[size]", "") \
: [addr] "=r" (bug) \
: [fmt] "i" (format), \
[file] "i" (__FILE__), \
[line] "i" (__LINE__), \
[fl] "i" (flags), \
[size] "i" (sizeof(struct bug_entry))); \
bug; })
#define __WARN_print_arg(flags, format, arg...) \
do { \
int __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_ARGS ; \
static_call_mod(WARN_trap)(__WARN_bug_entry(__flags, format), ## arg); \
asm (""); /* inhibit tail-call optimization */ \
} while (0)
#define __WARN_printf(taint, fmt, arg...) \
__WARN_print_arg(BUGFLAG_TAINT(taint), fmt, ## arg)
#define WARN_ONCE(cond, format, arg...) ({ \
int __ret_warn_on = !!(cond); \
if (unlikely(__ret_warn_on)) { \
__WARN_print_arg(BUGFLAG_ONCE|BUGFLAG_TAINT(TAINT_WARN),\
format, ## arg); \
} \
__ret_warn_on; \
})
#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */
#include <asm-generic/bug.h>
#endif /* _ASM_X86_BUG_H */
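The caller-facing macros keep their usual shape; what changes on x86-64 is how the format string and arguments reach the report path (via bug_entry and the __WARN_trap static call). A hypothetical caller, for illustration only — 'ret' and 'name' are made-up locals:

	if (WARN_ONCE(ret < 0, "probe of %s failed: %d\n", name, ret))
		return ret;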

View File

@ -101,6 +101,7 @@ static __always_inline bool _static_cpu_has(u16 bit)
asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
".pushsection .altinstr_aux,\"ax\"\n"
"6:\n"
ANNOTATE_DATA_SPECIAL
" testb %[bitnum], %a[cap_byte]\n"
" jnz %l[t_yes]\n"
" jmp %l[t_no]\n"

View File

@ -320,7 +320,7 @@
#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */
#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */
#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */
#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */
#define X86_FEATURE_LKGS (12*32+18) /* Like MOV_GS except MSR_KERNEL_GS_BASE = GS.base */
#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */
#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */
#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */
@ -500,6 +500,8 @@
#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */
#define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */
#define X86_FEATURE_SDCIAE (21*32+18) /* L3 Smart Data Cache Injection Allocation Enforcement */
/*
* BUG word(s)
*/

View File

@ -46,38 +46,31 @@ do { \
} while(0)
static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
u32 __user *uaddr)
u32 __user *uaddr)
{
if (can_do_masked_user_access())
uaddr = masked_user_access_begin(uaddr);
else if (!user_access_begin(uaddr, sizeof(u32)))
return -EFAULT;
switch (op) {
case FUTEX_OP_SET:
unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault);
break;
case FUTEX_OP_ADD:
unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval,
uaddr, oparg, Efault);
break;
case FUTEX_OP_OR:
unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault);
break;
case FUTEX_OP_ANDN:
unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault);
break;
case FUTEX_OP_XOR:
unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault);
break;
default:
user_access_end();
return -ENOSYS;
scoped_user_rw_access(uaddr, Efault) {
switch (op) {
case FUTEX_OP_SET:
unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault);
break;
case FUTEX_OP_ADD:
unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval, uaddr, oparg, Efault);
break;
case FUTEX_OP_OR:
unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault);
break;
case FUTEX_OP_ANDN:
unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault);
break;
case FUTEX_OP_XOR:
unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault);
break;
default:
return -ENOSYS;
}
}
user_access_end();
return 0;
Efault:
user_access_end();
return -EFAULT;
}
@ -86,21 +79,19 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
{
int ret = 0;
if (can_do_masked_user_access())
uaddr = masked_user_access_begin(uaddr);
else if (!user_access_begin(uaddr, sizeof(u32)))
return -EFAULT;
asm volatile("\n"
"1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
"2:\n"
_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) \
: "+r" (ret), "=a" (oldval), "+m" (*uaddr)
: "r" (newval), "1" (oldval)
: "memory"
);
user_access_end();
*uval = oldval;
scoped_user_rw_access(uaddr, Efault) {
asm_inline volatile("\n"
"1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
"2:\n"
_ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0)
: "+r" (ret), "=a" (oldval), "+m" (*uaddr)
: "r" (newval), "1" (oldval)
: "memory");
*uval = oldval;
}
return ret;
Efault:
return -EFAULT;
}
#endif
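The futex rewrite above replaces the explicit user_access_begin()/user_access_end() pairing with a scoped_user_rw_access() block, so the access is closed when the block is left instead of in every branch. Below is a minimal userspace sketch of the for-loop scoping trick such scoped_*() macros are commonly built on; access_begin(), access_end() and scoped_access() are stand-ins, and the real kernel macro additionally wires up the fault label, which this sketch does not attempt.

/* Sketch of a for-loop based scope macro: the "cleanup" expression runs once
 * when the block is left normally. Early exits (break/goto) need extra
 * handling, as the real kernel helper provides via its fault label. */
#include <stdio.h>

static void access_begin(void) { puts("begin access"); }
static void access_end(void)   { puts("end access"); }

#define scoped_access()							\
	for (int __done = (access_begin(), 0); !__done;			\
	     access_end(), __done = 1)

int main(void)
{
	scoped_access() {
		puts("...touch user memory here...");
	}
	return 0;
}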

View File

@ -393,7 +393,7 @@ static __always_inline void __##func(struct pt_regs *regs)
/**
* DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler
when raised from kernel mode
* when raised from kernel mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
@ -403,7 +403,7 @@ static __always_inline void __##func(struct pt_regs *regs)
/**
* DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler
when raised from user mode
* when raised from user mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE

View File

@ -44,4 +44,6 @@ enum insn_mmio_type {
enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
bool insn_is_nop(struct insn *insn);
#endif /* _ASM_X86_INSN_EVAL_H */

View File

@ -312,7 +312,6 @@ static inline int insn_offset_immediate(struct insn *insn)
/**
* for_each_insn_prefix() -- Iterate prefixes in the instruction
* @insn: Pointer to struct insn.
* @idx: Index storage.
* @prefix: Prefix byte.
*
* Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix
@ -321,8 +320,8 @@ static inline int insn_offset_immediate(struct insn *insn)
* Since prefixes.nbytes can be bigger than 4 if some prefixes
* are repeated, it cannot be used for looping over the prefixes.
*/
#define for_each_insn_prefix(insn, idx, prefix) \
for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
#define for_each_insn_prefix(insn, prefix) \
for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
#define POP_SS_OPCODE 0x1f
#define MOV_SREG_OPCODE 0x8e
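The new for_each_insn_prefix() drops the @idx parameter by declaring the index inside the for-statement. The same pattern on a plain byte array (names local to the example):

/* Sketch of the "loop variable declared inside the macro" pattern used by
 * the new for_each_insn_prefix(): callers no longer pass an index. */
#include <stdio.h>

#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

#define for_each_nonzero_byte(arr, b)					\
	for (unsigned int __i = 0;					\
	     __i < ARRAY_SIZE(arr) && ((b) = (arr)[__i]) != 0; __i++)

int main(void)
{
	unsigned char prefixes[4] = { 0x66, 0xf2, 0x00, 0x00 };
	unsigned char b;

	for_each_nonzero_byte(prefixes, b)
		printf("prefix byte: 0x%02x\n", b);
	return 0;
}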

View File

@ -4,7 +4,15 @@
#include <linux/percpu-defs.h>
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_BUFFER_SHIFT 4
#define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT)
/*
* The largest PEBS record could consume a page, so ensure that at least
* one record can still be written after the PMI triggers.
*/
#define ARCH_PEBS_THRESH_MULTI ((PEBS_BUFFER_SIZE - PAGE_SIZE) >> PEBS_BUFFER_SHIFT)
#define ARCH_PEBS_THRESH_SINGLE 1
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS_FMT4 8
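With the buffer size now derived from PEBS_BUFFER_SHIFT, ARCH_PEBS_THRESH_MULTI leaves one page of headroom so a record can still be written after the PMI fires. A quick arithmetic check, assuming the usual 4 KiB PAGE_SIZE (not defined in this hunk):

/* Sketch: evaluate the PEBS buffer constants from the hunk above,
 * assuming PAGE_SIZE = 4096. */
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PEBS_BUFFER_SHIFT	4
#define PEBS_BUFFER_SIZE	(PAGE_SIZE << PEBS_BUFFER_SHIFT)
#define ARCH_PEBS_THRESH_MULTI	((PEBS_BUFFER_SIZE - PAGE_SIZE) >> PEBS_BUFFER_SHIFT)

int main(void)
{
	printf("PEBS buffer:     %lu KiB\n", PEBS_BUFFER_SIZE / 1024);	/* 64 KiB */
	printf("multi threshold: %lu\n", ARCH_PEBS_THRESH_MULTI);	/* 3840 */
	return 0;
}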

View File

@ -15,6 +15,7 @@
#define JUMP_TABLE_ENTRY(key, label) \
".pushsection __jump_table, \"aw\" \n\t" \
_ASM_ALIGN "\n\t" \
ANNOTATE_DATA_SPECIAL \
".long 1b - . \n\t" \
".long " label " - . \n\t" \
_ASM_PTR " " key " - . \n\t" \

View File

@ -48,6 +48,7 @@
/* AMD-specific bits */
#define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */
#define MCI_STATUS_PADDRV BIT_ULL(54) /* Valid System Physical Address */
#define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */
#define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */
@ -62,6 +63,7 @@
*/
#define MCI_CONFIG_MCAX 0x1
#define MCI_CONFIG_FRUTEXT BIT_ULL(9)
#define MCI_CONFIG_PADDRV BIT_ULL(11)
#define MCI_IPID_MCATYPE 0xFFFF0000
#define MCI_IPID_HWID 0xFFF
@ -165,6 +167,12 @@
*/
#define MCE_IN_KERNEL_COPYIN BIT_ULL(7)
/*
* Indicates that handler should check and clear Deferred error registers
* rather than common ones.
*/
#define MCE_CHECK_DFR_REGS BIT_ULL(8)
/*
* This structure contains all data related to the MCE log. Also
* carries a signature to make it easier to find from external
@ -302,6 +310,12 @@ DECLARE_PER_CPU(struct mce, injectm);
/* Disable CMCI/polling for MCA bank claimed by firmware */
extern void mce_disable_bank(int bank);
#ifdef CONFIG_X86_MCE_THRESHOLD
void mce_save_apei_thr_limit(u32 thr_limit);
#else
static inline void mce_save_apei_thr_limit(u32 thr_limit) { }
#endif /* CONFIG_X86_MCE_THRESHOLD */
/*
* Exception handler
*/
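mce_save_apei_thr_limit() gets a real declaration under CONFIG_X86_MCE_THRESHOLD and a static inline no-op otherwise, so callers need no #ifdef. A generic sketch of that stub pattern with a made-up CONFIG_FEATURE_X option:

/* Sketch of the "real function or static inline stub" pattern used for
 * mce_save_apei_thr_limit() above; CONFIG_FEATURE_X is a stand-in option. */
#include <stdio.h>

#ifdef CONFIG_FEATURE_X
void feature_x_save_limit(unsigned int limit);	/* defined elsewhere */
#else
static inline void feature_x_save_limit(unsigned int limit) { }
#endif

int main(void)
{
	/* Compiles either way; with the option off, the call folds to nothing. */
	feature_x_save_limit(32);
	puts("caller needs no #ifdef");
	return 0;
}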

View File

@ -166,6 +166,10 @@
* Processor MMIO stale data
* vulnerabilities.
*/
#define ARCH_CAP_MCU_ENUM BIT(16) /*
* Indicates the presence of microcode update
* feature enumeration and status information.
*/
#define ARCH_CAP_FB_CLEAR BIT(17) /*
* VERW clears CPU fill buffer
* even on MDS_NO CPUs.
@ -327,6 +331,26 @@
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
PERF_CAP_PEBS_TIMING_INFO)
/* Arch PEBS */
#define MSR_IA32_PEBS_BASE 0x000003f4
#define MSR_IA32_PEBS_INDEX 0x000003f5
#define ARCH_PEBS_OFFSET_MASK 0x7fffff
#define ARCH_PEBS_INDEX_WR_SHIFT 4
#define ARCH_PEBS_RELOAD 0xffffffff
#define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35)
#define ARCH_PEBS_CNTR_GP BIT_ULL(36)
#define ARCH_PEBS_CNTR_FIXED BIT_ULL(37)
#define ARCH_PEBS_CNTR_METRICS BIT_ULL(38)
#define ARCH_PEBS_LBR_SHIFT 40
#define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT)
#define ARCH_PEBS_VECR_XMM BIT_ULL(49)
#define ARCH_PEBS_GPR BIT_ULL(61)
#define ARCH_PEBS_AUX BIT_ULL(62)
#define ARCH_PEBS_EN BIT_ULL(63)
#define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \
ARCH_PEBS_CNTR_METRICS)
#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)
#define RTIT_CTL_CYCLEACC BIT(1)
@ -929,6 +953,10 @@
#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
#define MSR_IA32_UCODE_WRITE 0x00000079
#define MSR_IA32_MCU_ENUMERATION 0x0000007b
#define MCU_STAGING BIT(4)
#define MSR_IA32_UCODE_REV 0x0000008b
/* Intel SGX Launch Enclave Public Key Hash MSRs */
@ -1226,6 +1254,8 @@
#define MSR_IA32_VMX_VMFUNC 0x00000491
#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492
#define MSR_IA32_MCU_STAGING_MBOX_ADDR 0x000007a5
/* Resctrl MSRs: */
/* - Intel: */
#define MSR_IA32_L3_QOS_CFG 0xc81
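The new ARCH_PEBS_* definitions describe one 64-bit control value: single-bit enables plus a two-bit LBR field at bit 40. A sketch that composes and decodes such a value using only the definitions shown above (BIT_ULL() re-created locally):

/* Sketch: compose/decode an arch-PEBS style 64-bit control word from the
 * bit definitions in the hunk above. */
#include <stdio.h>
#include <stdint.h>

#define BIT_ULL(n)		(1ULL << (n))
#define ARCH_PEBS_LBR_SHIFT	40
#define ARCH_PEBS_LBR		(0x3ULL << ARCH_PEBS_LBR_SHIFT)
#define ARCH_PEBS_GPR		BIT_ULL(61)
#define ARCH_PEBS_EN		BIT_ULL(63)

int main(void)
{
	uint64_t ctl = ARCH_PEBS_EN | ARCH_PEBS_GPR | (0x2ULL << ARCH_PEBS_LBR_SHIFT);

	printf("ctl       = %#llx\n", (unsigned long long)ctl);
	printf("LBR field = %llu\n",
	       (unsigned long long)((ctl & ARCH_PEBS_LBR) >> ARCH_PEBS_LBR_SHIFT));
	printf("enabled   = %d\n", !!(ctl & ARCH_PEBS_EN));
	return 0;
}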

View File

@ -23,6 +23,7 @@
#else /* !__ASSEMBLY__: */
#include <linux/args.h>
#include <linux/bits.h>
#include <linux/build_bug.h>
#include <linux/stringify.h>
#include <asm/asm.h>
@ -572,9 +573,9 @@ do { \
#define x86_this_cpu_constant_test_bit(_nr, _var) \
({ \
unsigned long __percpu *addr__ = \
(unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \
(unsigned long __percpu *)&(_var) + BIT_WORD(_nr); \
\
!!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__)); \
!!(BIT_MASK(_nr) & raw_cpu_read(*addr__)); \
})
#define x86_this_cpu_variable_test_bit(_nr, _var) \

View File

@ -141,16 +141,16 @@
#define ARCH_PERFMON_EVENTS_COUNT 7
#define PEBS_DATACFG_MEMINFO BIT_ULL(0)
#define PEBS_DATACFG_GP BIT_ULL(1)
#define PEBS_DATACFG_GP BIT_ULL(1)
#define PEBS_DATACFG_XMMS BIT_ULL(2)
#define PEBS_DATACFG_LBRS BIT_ULL(3)
#define PEBS_DATACFG_LBR_SHIFT 24
#define PEBS_DATACFG_CNTR BIT_ULL(4)
#define PEBS_DATACFG_METRICS BIT_ULL(5)
#define PEBS_DATACFG_LBR_SHIFT 24
#define PEBS_DATACFG_CNTR_SHIFT 32
#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0)
#define PEBS_DATACFG_FIX_SHIFT 48
#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0)
#define PEBS_DATACFG_METRICS BIT_ULL(5)
/* Steal the highest bit of pebs_data_cfg for SW usage */
#define PEBS_UPDATE_DS_SW BIT_ULL(63)
@ -200,6 +200,8 @@ union cpuid10_edx {
#define ARCH_PERFMON_EXT_LEAF 0x00000023
#define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1
#define ARCH_PERFMON_ACR_LEAF 0x2
#define ARCH_PERFMON_PEBS_CAP_LEAF 0x4
#define ARCH_PERFMON_PEBS_COUNTER_LEAF 0x5
union cpuid35_eax {
struct {
@ -210,7 +212,10 @@ union cpuid35_eax {
unsigned int acr_subleaf:1;
/* Events Sub-Leaf */
unsigned int events_subleaf:1;
unsigned int reserved:28;
/* arch-PEBS Sub-Leaves */
unsigned int pebs_caps_subleaf:1;
unsigned int pebs_cnts_subleaf:1;
unsigned int reserved:26;
} split;
unsigned int full;
};
@ -432,6 +437,8 @@ static inline bool is_topdown_idx(int idx)
#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT)
#define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55
#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54
#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT)
#define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48
#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48)
@ -502,6 +509,107 @@ struct pebs_cntr_header {
#define INTEL_CNTR_METRICS 0x3
/*
* Arch PEBS
*/
union arch_pebs_index {
struct {
u64 rsvd:4,
wr:23,
rsvd2:4,
full:1,
en:1,
rsvd3:3,
thresh:23,
rsvd4:5;
};
u64 whole;
};
struct arch_pebs_header {
union {
u64 format;
struct {
u64 size:16, /* Record size */
rsvd:14,
mode:1, /* 64BIT_MODE */
cont:1,
rsvd2:3,
cntr:5,
lbr:2,
rsvd3:7,
xmm:1,
ymmh:1,
rsvd4:2,
opmask:1,
zmmh:1,
h16zmm:1,
rsvd5:5,
gpr:1,
aux:1,
basic:1;
};
};
u64 rsvd6;
};
struct arch_pebs_basic {
u64 ip;
u64 applicable_counters;
u64 tsc;
u64 retire :16, /* Retire Latency */
valid :1,
rsvd :47;
u64 rsvd2;
u64 rsvd3;
};
struct arch_pebs_aux {
u64 address;
u64 rsvd;
u64 rsvd2;
u64 rsvd3;
u64 rsvd4;
u64 aux;
u64 instr_latency :16,
pad2 :16,
cache_latency :16,
pad3 :16;
u64 tsx_tuning;
};
struct arch_pebs_gprs {
u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di;
u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp;
u64 rsvd;
};
struct arch_pebs_xer_header {
u64 xstate;
u64 rsvd;
};
#define ARCH_PEBS_LBR_NAN 0x0
#define ARCH_PEBS_LBR_NUM_8 0x1
#define ARCH_PEBS_LBR_NUM_16 0x2
#define ARCH_PEBS_LBR_NUM_VAR 0x3
#define ARCH_PEBS_BASE_LBR_ENTRIES 8
struct arch_pebs_lbr_header {
u64 rsvd;
u64 ctl;
u64 depth;
u64 ler_from;
u64 ler_to;
u64 ler_info;
};
struct arch_pebs_cntr_header {
u32 cntr;
u32 fixed;
u32 metrics;
u32 reserved;
};
/*
* AMD Extended Performance Monitoring and Debug cpuid feature detection
*/
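union arch_pebs_index above views one 64-bit MSR image as named fields. A sketch of the same bitfield-union technique with the layout copied from the hunk; bitfield ordering is compiler-defined, and this example assumes the same little-endian, LSB-first allocation that the kernel relies on with GCC/Clang:

/* Sketch: the bitfield-union technique used by union arch_pebs_index above
 * to view one 64-bit MSR value as named fields. */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

union pebs_index {
	struct {
		uint64_t rsvd:4,
			 wr:23,
			 rsvd2:4,
			 full:1,
			 en:1,
			 rsvd3:3,
			 thresh:23,
			 rsvd4:5;
	};
	uint64_t whole;
};

static_assert(sizeof(union pebs_index) == sizeof(uint64_t), "must stay one MSR wide");

int main(void)
{
	union pebs_index idx = { .whole = 0 };

	idx.en = 1;
	idx.thresh = 3840;
	idx.wr = 16;
	printf("raw = %#llx (en=%u wr=%u thresh=%u)\n",
	       (unsigned long long)idx.whole, (unsigned)idx.en,
	       (unsigned)idx.wr, (unsigned)idx.thresh);
	return 0;
}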

View File

@ -187,12 +187,12 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code);
static inline unsigned long regs_return_value(struct pt_regs *regs)
static __always_inline unsigned long regs_return_value(struct pt_regs *regs)
{
return regs->ax;
}
static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
{
regs->ax = rc;
}
@ -277,34 +277,34 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
}
#endif
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
static inline unsigned long instruction_pointer(struct pt_regs *regs)
static __always_inline unsigned long instruction_pointer(struct pt_regs *regs)
{
return regs->ip;
}
static inline void instruction_pointer_set(struct pt_regs *regs,
unsigned long val)
static __always_inline
void instruction_pointer_set(struct pt_regs *regs, unsigned long val)
{
regs->ip = val;
}
static inline unsigned long frame_pointer(struct pt_regs *regs)
static __always_inline unsigned long frame_pointer(struct pt_regs *regs)
{
return regs->bp;
}
static inline unsigned long user_stack_pointer(struct pt_regs *regs)
static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
static inline void user_stack_pointer_set(struct pt_regs *regs,
unsigned long val)
static __always_inline
void user_stack_pointer_set(struct pt_regs *regs, unsigned long val)
{
regs->sp = val;
}

View File

@ -109,7 +109,7 @@ int common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_kick_ap(unsigned int cpu, struct task_struct *tidle);
int native_cpu_disable(void);
void __noreturn hlt_play_dead(void);
void native_play_dead(void);
void __noreturn native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
void wbinvd_on_all_cpus(void);

View File

@ -218,6 +218,12 @@ static inline unsigned int topology_amd_nodes_per_pkg(void)
return __amd_nodes_per_pkg;
}
#else /* CONFIG_SMP */
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_smt_threads(void) { return 1; }
static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; }
#endif /* !CONFIG_SMP */
extern struct cpumask __cpu_primary_thread_mask;
#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask)
@ -241,12 +247,6 @@ static inline bool topology_is_core_online(unsigned int cpu)
}
#define topology_is_core_online topology_is_core_online
#else /* CONFIG_SMP */
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_smt_threads(void) { return 1; }
static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; }
#endif /* !CONFIG_SMP */
static inline void arch_fix_phys_package_id(int num, u32 slot)
{
}
@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick
extern int arch_sched_node_distance(int from, int to);
#endif /* _ASM_X86_TOPOLOGY_H */

View File

@ -528,18 +528,18 @@ static __must_check __always_inline bool user_access_begin(const void __user *pt
#define user_access_save() smap_save()
#define user_access_restore(x) smap_restore(x)
#define unsafe_put_user(x, ptr, label) \
#define arch_unsafe_put_user(x, ptr, label) \
__put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label) \
#define arch_unsafe_get_user(x, ptr, err_label) \
do { \
__inttype(*(ptr)) __gu_val; \
__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
} while (0)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label) \
#define arch_unsafe_get_user(x, ptr, err_label) \
do { \
int __gu_err; \
__inttype(*(ptr)) __gu_val; \
@ -618,11 +618,11 @@ do { \
} while (0)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label) \
#define arch_get_kernel_nofault(dst, src, type, err_label) \
__get_user_size(*((type *)(dst)), (__force type __user *)(src), \
sizeof(type), err_label)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label) \
#define arch_get_kernel_nofault(dst, src, type, err_label) \
do { \
int __kr_err; \
\
@ -633,7 +633,7 @@ do { \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __put_kernel_nofault(dst, src, type, err_label) \
#define arch_put_kernel_nofault(dst, src, type, err_label) \
__put_user_size(*((type *)(src)), (__force type __user *)(dst), \
sizeof(type), err_label)

View File

@ -0,0 +1,41 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UNWIND_USER_H
#define _ASM_X86_UNWIND_USER_H
#ifdef CONFIG_HAVE_UNWIND_USER_FP
#include <asm/ptrace.h>
#include <asm/uprobes.h>
#define ARCH_INIT_USER_FP_FRAME(ws) \
.cfa_off = 2*(ws), \
.ra_off = -1*(ws), \
.fp_off = -2*(ws), \
.use_fp = true,
#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \
.cfa_off = 1*(ws), \
.ra_off = -1*(ws), \
.fp_off = 0, \
.use_fp = false,
static inline int unwind_user_word_size(struct pt_regs *regs)
{
/* We can't unwind VM86 stacks */
if (regs->flags & X86_VM_MASK)
return 0;
#ifdef CONFIG_X86_64
if (!user_64bit_mode(regs))
return sizeof(int);
#endif
return sizeof(long);
}
static inline bool unwind_user_at_function_start(struct pt_regs *regs)
{
return is_uprobe_at_func_entry(regs);
}
#endif /* CONFIG_HAVE_UNWIND_USER_FP */
#endif /* _ASM_X86_UNWIND_USER_H */
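ARCH_INIT_USER_FP_FRAME describes the standard x86-64 frame: with word size ws, the CFA sits 2*ws above the frame pointer, the return address at CFA - ws, and the previous frame pointer at CFA - 2*ws. Below is a sketch that walks those offsets over a synthetic stack image (ws fixed at 8; purely illustrative, not the kernel unwinder):

/* Sketch: walk a frame-pointer chain using the cfa/ra/fp offsets from
 * ARCH_INIT_USER_FP_FRAME above. The "stack" is a synthetic array. */
#include <stdio.h>
#include <stdint.h>

#define WS 8	/* word size */

int main(void)
{
	/* Two fake frames: fp points at the saved-fp slot, CFA = fp + 2*WS. */
	uint64_t stack[6] = { 0 };
	uintptr_t base = (uintptr_t)stack;

	stack[0] = base + 3 * WS;	/* inner saved fp -> outer frame */
	stack[1] = 0x401111;		/* inner return address */
	stack[3] = 0;			/* outer saved fp: end of chain */
	stack[4] = 0x402222;		/* outer return address */

	uintptr_t fp = base;		/* start at the innermost frame */
	while (fp) {
		uintptr_t cfa = fp + 2 * WS;			/* .cfa_off =  2*ws */
		uint64_t ra  = *(uint64_t *)(cfa - 1 * WS);	/* .ra_off  = -1*ws */
		uint64_t pfp = *(uint64_t *)(cfa - 2 * WS);	/* .fp_off  = -2*ws */

		printf("fp=%#lx ra=%#llx\n", (unsigned long)fp, (unsigned long long)ra);
		fp = (uintptr_t)pfp;
	}
	return 0;
}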

View File

@ -62,4 +62,13 @@ struct arch_uprobe_task {
unsigned int saved_tf;
};
#ifdef CONFIG_UPROBES
extern bool is_uprobe_at_func_entry(struct pt_regs *regs);
#else
static inline bool is_uprobe_at_func_entry(struct pt_regs *regs)
{
return false;
}
#endif /* CONFIG_UPROBES */
#endif /* _ASM_UPROBES_H */

View File

@ -19,6 +19,8 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
if (!cmc->enabled)
return 0;
mce_save_apei_thr_limit(cmc->notify.error_threshold_value);
/*
* We expect HEST to provide a list of MC banks that report errors
* in firmware first mode. Otherwise, return non-zero value to

View File

@ -9,6 +9,7 @@
#include <asm/text-patching.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/ibt.h>
#include <asm/set_memory.h>
#include <asm/nmi.h>
@ -345,25 +346,6 @@ static void add_nop(u8 *buf, unsigned int len)
*buf = INT3_INSN_OPCODE;
}
/*
* Matches NOP and NOPL, not any of the other possible NOPs.
*/
static bool insn_is_nop(struct insn *insn)
{
/* Anything NOP, but no REP NOP */
if (insn->opcode.bytes[0] == 0x90 &&
(!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
return true;
/* NOPL */
if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
return true;
/* TODO: more nops */
return false;
}
/*
* Find the offset of the first non-NOP instruction starting at @offset
* but no further than @len.
@ -559,7 +541,7 @@ EXPORT_SYMBOL(BUG_func);
* Rewrite the "call BUG_func" replacement to point to the target of the
* indirect pv_ops call "call *disp(%ip)".
*/
static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
{
void *target, *bug = &BUG_func;
s32 disp;
@ -643,7 +625,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
unsigned int insn_buff_sz = 0;
/*
* In case of nested ALTERNATIVE()s the outer alternative might
@ -683,11 +665,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
if (a->flags & ALT_FLAG_DIRECT_CALL) {
if (a->flags & ALT_FLAG_DIRECT_CALL)
insn_buff_sz = alt_replace_call(instr, insn_buff, a);
if (insn_buff_sz < 0)
continue;
}
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
@ -2244,21 +2223,34 @@ int alternatives_text_reserved(void *start, void *end)
* See entry_{32,64}.S for more details.
*/
/*
* We define the int3_magic() function in assembly to control the calling
* convention such that we can 'call' it from assembly.
*/
extern void int3_magic(unsigned int *ptr); /* defined in asm */
extern void int3_selftest_asm(unsigned int *ptr);
asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_magic, @function\n"
"int3_magic:\n"
" .type int3_selftest_asm, @function\n"
"int3_selftest_asm:\n"
ANNOTATE_NOENDBR
" movl $1, (%" _ASM_ARG1 ")\n"
/*
* INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
* exception, then the int3_exception_nb notifier emulates a call to
* int3_selftest_callee().
*/
" int3; nop; nop; nop; nop\n"
ASM_RET
" .size int3_magic, .-int3_magic\n"
" .size int3_selftest_asm, . - int3_selftest_asm\n"
" .popsection\n"
);
extern void int3_selftest_callee(unsigned int *ptr);
asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_selftest_callee, @function\n"
"int3_selftest_callee:\n"
ANNOTATE_NOENDBR
" movl $0x1234, (%" _ASM_ARG1 ")\n"
ASM_RET
" .size int3_selftest_callee, . - int3_selftest_callee\n"
" .popsection\n"
);
@ -2267,7 +2259,7 @@ extern void int3_selftest_ip(void); /* defined in asm below */
static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
unsigned long selftest = (unsigned long)&int3_selftest_ip;
unsigned long selftest = (unsigned long)&int3_selftest_asm;
struct die_args *args = data;
struct pt_regs *regs = args->regs;
@ -2282,7 +2274,7 @@ int3_exception_notify(struct notifier_block *self, unsigned long val, void *data
if (regs->ip - INT3_INSN_SIZE != selftest)
return NOTIFY_DONE;
int3_emulate_call(regs, (unsigned long)&int3_magic);
int3_emulate_call(regs, (unsigned long)&int3_selftest_callee);
return NOTIFY_STOP;
}
@ -2298,19 +2290,11 @@ static noinline void __init int3_selftest(void)
BUG_ON(register_die_notifier(&int3_exception_nb));
/*
* Basically: int3_magic(&val); but really complicated :-)
*
* INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
* notifier above will emulate CALL for us.
* Basically: int3_selftest_callee(&val); but really complicated :-)
*/
asm volatile ("int3_selftest_ip:\n\t"
ANNOTATE_NOENDBR
" int3; nop; nop; nop; nop\n\t"
: ASM_CALL_CONSTRAINT
: __ASM_SEL_RAW(a, D) (&val)
: "memory");
int3_selftest_asm(&val);
BUG_ON(val != 1);
BUG_ON(val != 0x1234);
unregister_die_notifier(&int3_exception_nb);
}
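The insn_is_nop() helper removed above (its declaration now lives in insn-eval.h) matched a bare 0x90 NOP without an F3/REP prefix plus the 0F 1F NOPL encodings. A standalone sketch of that opcode test on raw bytes; the real helper operates on struct insn, and the prefix handling here is reduced to a flag:

/* Sketch of the NOP test insn_is_nop() performs, reduced to raw opcode
 * bytes: 0x90 without an F3 (REP) prefix, or the 0F 1F NOPL family. */
#include <stdbool.h>
#include <stdio.h>
#include <stddef.h>

static bool bytes_are_nop(const unsigned char *op, size_t len, bool has_rep_prefix)
{
	if (len >= 1 && op[0] == 0x90 && !has_rep_prefix)	/* NOP, but not PAUSE */
		return true;
	if (len >= 2 && op[0] == 0x0f && op[1] == 0x1f)		/* NOPL */
		return true;
	return false;
}

int main(void)
{
	const unsigned char nop[]   = { 0x90 };
	const unsigned char pause[] = { 0x90 };	/* F3 prefix represented by the flag */
	const unsigned char nopl[]  = { 0x0f, 0x1f, 0x00 };

	printf("nop:   %d\n", bytes_are_nop(nop, sizeof(nop), false));
	printf("pause: %d\n", bytes_are_nop(pause, sizeof(pause), true));
	printf("nopl:  %d\n", bytes_are_nop(nopl, sizeof(nopl), false));
	return 0;
}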

View File

@ -173,6 +173,7 @@ static struct resource lapic_resource = {
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
/* Measured in ticks per HZ. */
unsigned int lapic_timer_period = 0;
static void apic_pm_activate(void);
@ -792,6 +793,7 @@ static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
u64 tsc_perj = 0, tsc_start = 0;
long delta_tsc_khz, bus_khz;
unsigned long jif_start;
unsigned long deltaj;
long delta, deltatsc;
@ -894,14 +896,15 @@ static int __init calibrate_APIC_clock(void)
apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period);
if (boot_cpu_has(X86_FEATURE_TSC)) {
apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n",
(deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
(deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS);
apic_pr_verbose("..... CPU clock speed is %ld.%03ld MHz.\n",
delta_tsc_khz / 1000, delta_tsc_khz % 1000);
}
apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n",
lapic_timer_period / (1000000 / HZ),
lapic_timer_period % (1000000 / HZ));
bus_khz = (long)lapic_timer_period * HZ / 1000;
apic_pr_verbose("..... host bus clock speed is %ld.%03ld MHz.\n",
bus_khz / 1000, bus_khz % 1000);
/*
* Do a sanity check on the APIC calibration result
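The calibration printout now goes through a kHz intermediate, so the fractional part is a true three-digit milli-MHz value rather than a HZ-dependent remainder. A quick check of the formula with made-up inputs (HZ, LAPIC_CAL_LOOPS and deltatsc below are illustrative only):

/* Sketch: the kHz-based formatting introduced above, with fabricated
 * calibration numbers just to exercise the arithmetic. */
#include <stdio.h>

int main(void)
{
	long long HZ = 250, LAPIC_CAL_LOOPS = 8;
	long long deltatsc = 96000000;	/* TSC ticks over the calibration window */

	long long delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS);
	printf("CPU clock speed is %lld.%03lld MHz\n",
	       delta_tsc_khz / 1000, delta_tsc_khz % 1000);	/* 3000.000 MHz */
	return 0;
}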

View File

@ -2864,7 +2864,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
ioapic = mp_irqdomain_ioapic_idx(domain);
pin = info->ioapic.pin;
if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
if (irq_resolve_mapping(domain, (irq_hw_number_t)pin))
return -EEXIST;
data = kzalloc(sizeof(*data), GFP_KERNEL);

View File

@ -72,6 +72,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL },
{ X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL },
{ X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
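The new table entry records that SDCIAE depends on CAT_L3; the consumer of cpuid_deps clears a dependent feature whenever its prerequisite is missing. A reduced sketch of that dependency walk, with feature IDs local to the example rather than the kernel's X86_FEATURE_* values:

/* Sketch: how a cpuid_deps-style table is consumed - if the prerequisite
 * feature is off, the dependent feature is cleared as well. */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

enum { FEAT_CAT_L3, FEAT_SDCIAE, FEAT_MAX };

struct dep { int feature; int depends; };

static const struct dep deps[] = {
	{ FEAT_SDCIAE, FEAT_CAT_L3 },
};

int main(void)
{
	bool have[FEAT_MAX] = { [FEAT_CAT_L3] = false, [FEAT_SDCIAE] = true };

	for (size_t i = 0; i < sizeof(deps) / sizeof(deps[0]); i++) {
		if (!have[deps[i].depends] && have[deps[i].feature]) {
			have[deps[i].feature] = false;
			printf("clearing feature %d (missing dependency %d)\n",
			       deps[i].feature, deps[i].depends);
		}
	}
	return 0;
}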

View File

@ -43,9 +43,6 @@
/* Deferred error settings */
#define MSR_CU_DEF_ERR 0xC0000410
#define MASK_DEF_LVTOFF 0x000000F0
#define MASK_DEF_INT_TYPE 0x00000006
#define DEF_LVT_OFF 0x2
#define DEF_INT_TYPE_APIC 0x2
/* Scalable MCA: */
@ -54,6 +51,17 @@
static bool thresholding_irq_en;
struct mce_amd_cpu_data {
mce_banks_t thr_intr_banks;
mce_banks_t dfr_intr_banks;
u32 thr_intr_en: 1,
dfr_intr_en: 1,
__resv: 30;
};
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data);
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@ -79,6 +87,8 @@ struct smca_bank {
const struct smca_hwid *hwid;
u32 id; /* Value of MCA_IPID[InstanceId]. */
u8 sysfs_id; /* Value used for sysfs name. */
u64 paddrv :1, /* Physical Address Valid bit in MCA_CONFIG */
__reserved :63;
};
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
@ -264,6 +274,7 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
static void smca_configure(unsigned int bank, unsigned int cpu)
{
struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
const struct smca_hwid *s_hwid;
unsigned int i, hwid_mcatype;
@ -294,11 +305,33 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
* APIC based interrupt. First, check that no interrupt has been
* set.
*/
if ((low & BIT(5)) && !((high >> 5) & 0x3))
if ((low & BIT(5)) && !((high >> 5) & 0x3) && data->dfr_intr_en) {
__set_bit(bank, data->dfr_intr_banks);
high |= BIT(5);
}
/*
* SMCA Corrected Error Interrupt
*
* MCA_CONFIG[IntPresent] is bit 10, and tells us if the bank can
* send an MCA Thresholding interrupt without the OS initializing
* this feature. This can be used if the threshold limit is managed
* by the platform.
*
* MCA_CONFIG[IntEn] is bit 40 (8 in the high portion of the MSR).
* The OS should set this to inform the platform that the OS is ready
* to handle the MCA Thresholding interrupt.
*/
if ((low & BIT(10)) && data->thr_intr_en) {
__set_bit(bank, data->thr_intr_banks);
high |= BIT(8);
}
this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
if (low & MCI_CONFIG_PADDRV)
this_cpu_ptr(smca_banks)[bank].paddrv = 1;
wrmsr(smca_config, low, high);
}
@ -368,6 +401,14 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
/*
* On SMCA CPUs, LVT offset is programmed at a different MSR, and
* the BIOS provides the value. The original field where LVT offset
* was set is reserved. Return early here:
*/
if (mce_flags.smca)
return false;
if (apic < 0) {
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
@ -376,14 +417,6 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
}
if (apic != msr) {
/*
* On SMCA CPUs, LVT offset is programmed at a different MSR, and
* the BIOS provides the value. The original field where LVT offset
* was set is reserved. Return early here:
*/
if (mce_flags.smca)
return false;
pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
@ -443,6 +476,36 @@ static void threshold_restart_block(void *_tr)
wrmsr(tr->b->address, lo, hi);
}
static void threshold_restart_bank(unsigned int bank, bool intr_en)
{
struct threshold_bank **thr_banks = this_cpu_read(threshold_banks);
struct threshold_block *block, *tmp;
struct thresh_restart tr;
if (!thr_banks || !thr_banks[bank])
return;
memset(&tr, 0, sizeof(tr));
list_for_each_entry_safe(block, tmp, &thr_banks[bank]->miscj, miscj) {
tr.b = block;
tr.b->interrupt_enable = intr_en;
threshold_restart_block(&tr);
}
}
/* Try to use the threshold limit reported through APEI. */
static u16 get_thr_limit(void)
{
u32 thr_limit = mce_get_apei_thr_limit();
/* Fallback to old default if APEI limit is not available. */
if (!thr_limit)
return THRESHOLD_MAX;
return min(thr_limit, THRESHOLD_MAX);
}
static void mce_threshold_block_init(struct threshold_block *b, int offset)
{
struct thresh_restart tr = {
@ -451,7 +514,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
.lvt_off = offset,
};
b->threshold_limit = THRESHOLD_MAX;
b->threshold_limit = get_thr_limit();
threshold_restart_block(&tr);
};
@ -464,41 +527,6 @@ static int setup_APIC_mce_threshold(int reserved, int new)
return reserved;
}
static int setup_APIC_deferred_error(int reserved, int new)
{
if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
APIC_EILVT_MSG_FIX, 0))
return new;
return reserved;
}
static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
{
u32 low = 0, high = 0;
int def_offset = -1, def_new;
if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
return;
def_new = (low & MASK_DEF_LVTOFF) >> 4;
if (!(low & MASK_DEF_LVTOFF)) {
pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
def_new = DEF_LVT_OFF;
low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
}
def_offset = setup_APIC_deferred_error(def_offset, def_new);
if ((def_offset == def_new) &&
(deferred_error_int_vector != amd_deferred_error_interrupt))
deferred_error_int_vector = amd_deferred_error_interrupt;
if (!mce_flags.smca)
low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
wrmsr(MSR_CU_DEF_ERR, low, high);
}
static u32 get_block_address(u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block,
unsigned int cpu)
@ -534,12 +562,10 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
return addr;
}
static int
prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
{
unsigned int cpu = smp_processor_id();
u32 smca_low, smca_high;
struct threshold_block b;
int new;
@ -556,20 +582,13 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
if (!b.interrupt_capable)
goto done;
__set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
b.interrupt_enable = 1;
if (!mce_flags.smca) {
new = (misc_high & MASK_LVTOFF_HI) >> 20;
goto set_offset;
}
if (mce_flags.smca)
goto done;
/* Gather LVT offset for thresholding: */
if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
goto out;
new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
set_offset:
new = (misc_high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce_threshold(offset, new);
if (offset == new)
thresholding_irq_en = true;
@ -577,7 +596,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
done:
mce_threshold_block_init(&b, offset);
out:
return offset;
}
@ -668,6 +686,32 @@ static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c)
mce_banks[0].ctl = 0;
}
/*
* Enable the APIC LVT interrupt vectors once per-CPU. This should be done before hardware is
* ready to send interrupts.
*
* Individual error sources are enabled later during per-bank init.
*/
static void smca_enable_interrupt_vectors(void)
{
struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
u64 mca_intr_cfg, offset;
if (!mce_flags.smca || !mce_flags.succor)
return;
if (rdmsrq_safe(MSR_CU_DEF_ERR, &mca_intr_cfg))
return;
offset = (mca_intr_cfg & SMCA_THR_LVT_OFF) >> 12;
if (!setup_APIC_eilvt(offset, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0))
data->thr_intr_en = 1;
offset = (mca_intr_cfg & MASK_DEF_LVTOFF) >> 4;
if (!setup_APIC_eilvt(offset, DEFERRED_ERROR_VECTOR, APIC_EILVT_MSG_FIX, 0))
data->dfr_intr_en = 1;
}
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@ -679,10 +723,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
mce_flags.amd_threshold = 1;
smca_enable_interrupt_vectors();
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (mce_flags.smca)
if (mce_flags.smca) {
smca_configure(bank, cpu);
if (!this_cpu_ptr(&mce_amd_data)->thr_intr_en)
continue;
}
disable_err_thresholding(c, bank);
for (block = 0; block < NR_BLOCKS; ++block) {
@ -703,9 +753,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
offset = prepare_threshold_block(bank, block, address, offset, high);
}
}
if (mce_flags.succor)
deferred_error_interrupt_enable(c);
}
void smca_bsp_init(void)
@ -748,9 +795,9 @@ bool amd_mce_is_memory_error(struct mce *m)
}
/*
* AMD systems do not have an explicit indicator that the value in MCA_ADDR is
* a system physical address. Therefore, individual cases need to be detected.
* Future cases and checks will be added as needed.
* Some AMD systems have an explicit indicator that the value in MCA_ADDR is a
* system physical address. Individual cases, though, need to be detected for
* other systems. Future cases will be added as needed.
*
* 1) General case
* a) Assume address is not usable.
@ -764,6 +811,8 @@ bool amd_mce_is_memory_error(struct mce *m)
* a) Reported in legacy bank 4 with extended error code (XEC) 8.
* b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
* this bit should not be checked.
* 4) MCI_STATUS_PADDRV is set
* a) Will provide a valid system physical address.
*
* NOTE: SMCA UMC memory errors fall into case #1.
*/
@ -777,6 +826,9 @@ bool amd_mce_usable_address(struct mce *m)
return false;
}
if (this_cpu_ptr(smca_banks)[m->bank].paddrv)
return m->status & MCI_STATUS_PADDRV;
/* Check poison bit for all other bank types. */
if (m->status & MCI_STATUS_POISON)
return true;
@ -785,37 +837,6 @@ bool amd_mce_usable_address(struct mce *m)
return false;
}
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
struct mce_hw_err err;
struct mce *m = &err.m;
mce_prep_record(&err);
m->status = status;
m->misc = misc;
m->bank = bank;
m->tsc = rdtsc();
if (m->status & MCI_STATUS_ADDRV) {
m->addr = addr;
smca_extract_err_addr(m);
}
if (mce_flags.smca) {
rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
if (m->status & MCI_STATUS_SYNDV) {
rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
}
}
mce_log(&err);
}
DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
{
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
@ -825,103 +846,20 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
apic_eoi();
}
/*
* Returns true if the logged error is deferred. False, otherwise.
*/
static inline bool
_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
{
u64 status, addr = 0;
rdmsrq(msr_stat, status);
if (!(status & MCI_STATUS_VAL))
return false;
if (status & MCI_STATUS_ADDRV)
rdmsrq(msr_addr, addr);
__log_error(bank, status, addr, misc);
wrmsrq(msr_stat, 0);
return status & MCI_STATUS_DEFERRED;
}
static bool _log_error_deferred(unsigned int bank, u32 misc)
{
if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
mca_msr_reg(bank, MCA_ADDR), misc))
return false;
/*
* Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
* Return true here to avoid accessing these registers.
*/
if (!mce_flags.smca)
return true;
/* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
return true;
}
/*
* We have three scenarios for checking for Deferred errors:
*
* 1) Non-SMCA systems check MCA_STATUS and log error if found.
* 2) SMCA systems check MCA_STATUS. If error is found then log it and also
* clear MCA_DESTAT.
* 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
* log it.
*/
static void log_error_deferred(unsigned int bank)
{
if (_log_error_deferred(bank, 0))
return;
/*
* Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
* for a valid error.
*/
_log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
}
/* APIC interrupt handler for deferred errors */
static void amd_deferred_error_interrupt(void)
{
unsigned int bank;
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank)
log_error_deferred(bank);
machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
}
static void log_error_thresholding(unsigned int bank, u64 misc)
void mce_amd_handle_storm(unsigned int bank, bool on)
{
_log_error_deferred(bank, misc);
threshold_restart_bank(bank, on);
}
static void log_and_reset_block(struct threshold_block *block)
static void amd_reset_thr_limit(unsigned int bank)
{
struct thresh_restart tr;
u32 low = 0, high = 0;
if (!block)
return;
if (rdmsr_safe(block->address, &low, &high))
return;
if (!(high & MASK_OVERFLOW_HI))
return;
/* Log the MCE which caused the threshold event. */
log_error_thresholding(block->bank, ((u64)high << 32) | low);
/* Reset threshold block after logging error. */
memset(&tr, 0, sizeof(tr));
tr.b = block;
threshold_restart_block(&tr);
threshold_restart_bank(bank, true);
}
/*
@ -930,33 +868,21 @@ static void log_and_reset_block(struct threshold_block *block)
*/
static void amd_threshold_interrupt(void)
{
struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank;
unsigned int bank, cpu = smp_processor_id();
struct threshold_block *block, *tmp;
/*
* Validate that the threshold bank has been initialized already. The
* handler is installed at boot time, but on a hotplug event the
* interrupt might fire before the data has been initialized.
*/
if (!bp)
return;
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
continue;
thr_bank = bp[bank];
if (!thr_bank)
continue;
list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj)
log_and_reset_block(block);
}
machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
}
void amd_clear_bank(struct mce *m)
{
amd_reset_thr_limit(m->bank);
/* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */
if (m->status & MCI_STATUS_DEFERRED)
mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
/* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
if (m->kflags & MCE_CHECK_DFR_REGS)
return;
mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
}
@ -1172,7 +1098,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
b->address = address;
b->interrupt_enable = 0;
b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
b->threshold_limit = get_thr_limit();
if (b->interrupt_capable) {
default_attrs[2] = &interrupt_enable.attr;
@ -1183,6 +1109,8 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
list_add(&b->miscj, &tb->miscj);
mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20);
err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
if (err)
goto out_free;
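get_thr_limit() above prefers the APEI-reported threshold, clamped to THRESHOLD_MAX, and falls back to the old default when APEI reports nothing. A sketch of that clamping with a faked APEI value; the 0xFFF THRESHOLD_MAX value is assumed here, since its definition sits outside the hunks shown:

/* Sketch of the get_thr_limit() clamping above: use the APEI-provided
 * threshold when present, capped at THRESHOLD_MAX, else the old default. */
#include <stdio.h>
#include <stdint.h>

#define THRESHOLD_MAX 0xFFF	/* assumed value for the demo */

static uint32_t fake_apei_thr_limit = 32;	/* 0 would mean "not provided" */

static uint16_t get_thr_limit(void)
{
	uint32_t thr_limit = fake_apei_thr_limit;

	if (!thr_limit)
		return THRESHOLD_MAX;
	return thr_limit < THRESHOLD_MAX ? thr_limit : THRESHOLD_MAX;
}

int main(void)
{
	printf("threshold limit = %u\n", get_thr_limit());
	return 0;
}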

Some files were not shown because too many files have changed in this diff