From 43169328c7b4623b54b7713ec68479cebda5465f Mon Sep 17 00:00:00 2001 From: Vivian Wang Date: Tue, 2 Dec 2025 13:25:07 +0800 Subject: [PATCH 1/7] lib/crypto: riscv/chacha: Avoid s0/fp register In chacha_zvkb, avoid using the s0 register, which is the frame pointer, by reallocating KEY0 to t5. This makes stack traces available if e.g. a crash happens in chacha_zvkb. No frame pointer maintenance is otherwise required since this is a leaf function. Signed-off-by: Vivian Wang Fixes: bb54668837a0 ("crypto: riscv - add vector crypto accelerated ChaCha20") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20251202-riscv-chacha_zvkb-fp-v2-1-7bd00098c9dc@iscas.ac.cn Signed-off-by: Eric Biggers --- lib/crypto/riscv/chacha-riscv64-zvkb.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/crypto/riscv/chacha-riscv64-zvkb.S b/lib/crypto/riscv/chacha-riscv64-zvkb.S index b777d0b4e379..3d183ec818f5 100644 --- a/lib/crypto/riscv/chacha-riscv64-zvkb.S +++ b/lib/crypto/riscv/chacha-riscv64-zvkb.S @@ -60,7 +60,8 @@ #define VL t2 #define STRIDE t3 #define ROUND_CTR t4 -#define KEY0 s0 +#define KEY0 t5 +// Avoid s0/fp to allow for unwinding #define KEY1 s1 #define KEY2 s2 #define KEY3 s3 @@ -143,7 +144,6 @@ // The updated 32-bit counter is written back to state->x[12] before returning. SYM_FUNC_START(chacha_zvkb) addi sp, sp, -96 - sd s0, 0(sp) sd s1, 8(sp) sd s2, 16(sp) sd s3, 24(sp) @@ -280,7 +280,6 @@ SYM_FUNC_START(chacha_zvkb) bnez NBLOCKS, .Lblock_loop sw COUNTER, 48(STATEP) - ld s0, 0(sp) ld s1, 8(sp) ld s2, 16(sp) ld s3, 24(sp) From 1cd5bb6e9e027bab33aafd58fe8340124869ba62 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Dec 2025 13:37:50 -0800 Subject: [PATCH 2/7] lib/crypto: riscv: Depend on RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS Replace the RISCV_ISA_V dependency of the RISC-V crypto code with RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS, which implies RISCV_ISA_V as well as vector unaligned accesses being efficient. This is necessary because this code assumes that vector unaligned accesses are supported and are efficient. (It does so to avoid having to use lots of extra vsetvli instructions to switch the element width back and forth between 8 and either 32 or 64.) This was omitted from the code originally just because the RISC-V kernel support for detecting this feature didn't exist yet. Support has now been added, but it's fragmented into per-CPU runtime detection, a command-line parameter, and a kconfig option. The kconfig option is the only reasonable way to do it, though, so let's just rely on that. 
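To make the unwinding rationale of the chacha_zvkb patch above concrete, here is a simplified frame-pointer walk in C. It is an illustration only, not the kernel's unwinder; the frame_record layout and the walk_frames() helper are assumptions based on the usual RISC-V frame-pointer convention, where each prologue saves the caller's fp and the return address just below the address held in s0/fp. A leaf routine that silently clobbers s0 leaves fp pointing at unrelated data (here, key material), so a walk like this produces garbage instead of a stack trace.

/*
 * Simplified sketch of frame-pointer unwinding on RISC-V.  The layout of
 * frame_record is the conventional one assumed here, not a kernel API.
 */
struct frame_record {
	unsigned long fp;	/* caller's frame pointer */
	unsigned long ra;	/* return address into the caller */
};

static int walk_frames(unsigned long fp, unsigned long *trace, int max)
{
	int n = 0;

	while (n < max && fp) {
		const struct frame_record *rec =
			(const struct frame_record *)(fp - sizeof(*rec));

		trace[n++] = rec->ra;	/* where this frame returns to */
		fp = rec->fp;		/* step to the caller's frame */
	}
	return n;
}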
Fixes: eb24af5d7a05 ("crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS}") Fixes: bb54668837a0 ("crypto: riscv - add vector crypto accelerated ChaCha20") Fixes: 600a3853dfa0 ("crypto: riscv - add vector crypto accelerated GHASH") Fixes: 8c8e40470ffe ("crypto: riscv - add vector crypto accelerated SHA-{256,224}") Fixes: b3415925a08b ("crypto: riscv - add vector crypto accelerated SHA-{512,384}") Fixes: 563a5255afa2 ("crypto: riscv - add vector crypto accelerated SM3") Fixes: b8d06352bbf3 ("crypto: riscv - add vector crypto accelerated SM4") Cc: stable@vger.kernel.org Reported-by: Vivian Wang Closes: https://lore.kernel.org/r/b3cfcdac-0337-4db0-a611-258f2868855f@iscas.ac.cn/ Reviewed-by: Jerry Shih Link: https://lore.kernel.org/r/20251206213750.81474-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/riscv/crypto/Kconfig | 12 ++++++++---- lib/crypto/Kconfig | 9 ++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index a75d6325607b..14c5acb935e9 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -4,7 +4,8 @@ menu "Accelerated Cryptographic Algorithms for CPU (riscv)" config CRYPTO_AES_RISCV64 tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_ALGAPI select CRYPTO_LIB_AES select CRYPTO_SKCIPHER @@ -20,7 +21,8 @@ config CRYPTO_AES_RISCV64 config CRYPTO_GHASH_RISCV64 tristate "Hash functions: GHASH" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_GCM help GCM GHASH function (NIST SP 800-38D) @@ -30,7 +32,8 @@ config CRYPTO_GHASH_RISCV64 config CRYPTO_SM3_RISCV64 tristate "Hash functions: SM3 (ShangMi 3)" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_HASH select CRYPTO_LIB_SM3 help @@ -42,7 +45,8 @@ config CRYPTO_SM3_RISCV64 config CRYPTO_SM4_RISCV64 tristate "Ciphers: SM4 (ShangMi 4)" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_ALGAPI select CRYPTO_SM4 help diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index a3647352bff6..6871a41e5069 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -61,7 +61,8 @@ config CRYPTO_LIB_CHACHA_ARCH default y if ARM64 && KERNEL_MODE_NEON default y if MIPS && CPU_MIPS32_R2 default y if PPC64 && CPU_LITTLE_ENDIAN && VSX - default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS default y if S390 default y if X86_64 @@ -184,7 +185,8 @@ config CRYPTO_LIB_SHA256_ARCH default y if ARM64 default y if MIPS && CPU_CAVIUM_OCTEON default y if PPC && SPE - default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS default y if S390 default y if SPARC64 default y if X86_64 @@ -202,7 +204,8 @@ config CRYPTO_LIB_SHA512_ARCH default y if ARM && !CPU_V7M default y if ARM64 default y if MIPS && CPU_CAVIUM_OCTEON - default y if RISCV && 64BIT && 
RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+		     RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
 	default y if S390
 	default y if SPARC64
 	default y if X86_64

From 2e8f7b170a085f0f5522f262bffe92d6ec911abb Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Thu, 4 Dec 2025 21:03:30 -0800
Subject: [PATCH 3/7] lib/crypto: blake2b: Roll up BLAKE2b round loop on 32-bit

BLAKE2b has a state of 16 64-bit words.  Add the message data in and
there are 32 64-bit words.  With the current code, where all the rounds
are unrolled to enable constant-folding of the blake2b_sigma values,
this results in a very large code size on 32-bit kernels, including a
recurring issue where gcc uses a large amount of stack.

There's just not much benefit to this unrolling when the code is
already so large.  Let's roll up the rounds when !CONFIG_64BIT.

To avoid having to duplicate the code, just write the code once using a
loop, and conditionally use 'unrolled_full' from <linux/unroll.h>.
Then, fold the now-unneeded ROUND() macro into the loop.  Finally, also
remove the now-unneeded override of the stack frame size warning.

Code size improvements for blake2b_compress_generic():

                  Size before (bytes)    Size after (bytes)
                  -------------------    ------------------
    i386, gcc            27584                  3632
    i386, clang          18208                  3248
    arm32, gcc           19912                  2860
    arm32, clang         21336                  3344

Running the BLAKE2b benchmark on a !CONFIG_64BIT kernel on an x86_64
processor shows a 16384B throughput change of 351 => 340 MB/s (gcc) or
442 MB/s => 375 MB/s (clang).  So clearly not much of a slowdown
either.  But that microbenchmark also effectively disregards cache
usage, which is important in practice and is far better with the
smaller code.

Note: If we rolled up the loop on x86_64 too, the change would be
7024 bytes => 1584 bytes and 1960 MB/s => 1396 MB/s (gcc), or
6848 bytes => 1696 bytes and 1920 MB/s => 1263 MB/s (clang).  Maybe
still worth it, though not quite as clearly beneficial.
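For readers unfamiliar with unrolled_full, the following stand-alone sketch shows the trade-off the changelog describes. UNROLL_FULL below is a local stand-in that merely asks the compiler to fully unroll the next loop; it is not the kernel's definition, and the sigma table is abbreviated. Once the loop is fully unrolled, r is a constant in each copy of the body, so the sigma[r][...] lookups fold to immediates and the per-round table loads disappear, at the cost of twelve copies of the body in the object code.

#include <stdint.h>

#if defined(__clang__)
#define UNROLL_FULL _Pragma("clang loop unroll(full)")
#else
#define UNROLL_FULL _Pragma("GCC unroll 12")
#endif

static const uint8_t sigma[12][4] = {
	{ 0, 1, 2, 3 }, { 4, 5, 6, 7 },	/* remaining rows abbreviated */
};

uint64_t mix(const uint64_t m[16])
{
	uint64_t acc = 0;

	UNROLL_FULL
	for (int r = 0; r < 12; r++)
		acc ^= m[sigma[r][r & 3]];	/* index is constant once unrolled */
	return acc;
}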
Fixes: 91d689337fe8 ("crypto: blake2b - add blake2b generic implementation") Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251205050330.89704-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/Makefile | 1 - lib/crypto/blake2b.c | 44 ++++++++++++++++++++------------------------ 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index b5346cebbb55..330ab65b29c4 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o libblake2b-y := blake2b.o -CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930 ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y) CFLAGS_blake2b.o += -I$(src)/$(SRCARCH) libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o diff --git a/lib/crypto/blake2b.c b/lib/crypto/blake2b.c index 09c6d65d8a6e..581b7f8486fa 100644 --- a/lib/crypto/blake2b.c +++ b/lib/crypto/blake2b.c @@ -14,6 +14,7 @@ #include #include #include +#include #include static const u8 blake2b_sigma[12][16] = { @@ -73,31 +74,26 @@ blake2b_compress_generic(struct blake2b_ctx *ctx, b = ror64(b ^ c, 63); \ } while (0) -#define ROUND(r) do { \ - G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ - G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ - G(r, 2, v[2], v[ 6], v[10], v[14]); \ - G(r, 3, v[3], v[ 7], v[11], v[15]); \ - G(r, 4, v[0], v[ 5], v[10], v[15]); \ - G(r, 5, v[1], v[ 6], v[11], v[12]); \ - G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ - G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ -} while (0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - ROUND(10); - ROUND(11); - +#ifdef CONFIG_64BIT + /* + * Unroll the rounds loop to enable constant-folding of the + * blake2b_sigma values. Seems worthwhile on 64-bit kernels. + * Not worthwhile on 32-bit kernels because the code size is + * already so large there due to BLAKE2b using 64-bit words. + */ + unrolled_full +#endif + for (int r = 0; r < 12; r++) { + G(r, 0, v[0], v[4], v[8], v[12]); + G(r, 1, v[1], v[5], v[9], v[13]); + G(r, 2, v[2], v[6], v[10], v[14]); + G(r, 3, v[3], v[7], v[11], v[15]); + G(r, 4, v[0], v[5], v[10], v[15]); + G(r, 5, v[1], v[6], v[11], v[12]); + G(r, 6, v[2], v[7], v[8], v[13]); + G(r, 7, v[3], v[4], v[9], v[14]); + } #undef G -#undef ROUND for (i = 0; i < 8; ++i) ctx->h[i] ^= v[i] ^ v[i + 8]; From 68b233b1d583f7d869fbb3afe2b0531138e001f7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Dec 2025 21:11:55 -0800 Subject: [PATCH 4/7] lib/crypto: blake2s: Replace manual unrolling with unrolled_full As we're doing in the BLAKE2b code, use unrolled_full to make the compiler handle the loop unrolling. This simplifies the code slightly. The generated object code is nearly the same with both gcc and clang. 
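For reference, the quarter-round that each G() invocation expands to is the standard BLAKE2s G from RFC 7693, shown below as a freestanding function rather than the kernel's macro; blake2s_g() and sigma_row are illustrative names, and the kernel already provides ror32() in its own headers. When the round index is a compile-time constant, the two sigma_row[] lookups are constants too, which is exactly the constant-folding the unrolled variants rely on.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

static inline void blake2s_g(const uint8_t sigma_row[16], const uint32_t m[16],
			     int i, uint32_t *a, uint32_t *b,
			     uint32_t *c, uint32_t *d)
{
	*a += *b + m[sigma_row[2 * i]];
	*d = ror32(*d ^ *a, 16);
	*c += *d;
	*b = ror32(*b ^ *c, 12);
	*a += *b + m[sigma_row[2 * i + 1]];
	*d = ror32(*d ^ *a, 8);
	*c += *d;
	*b = ror32(*b ^ *c, 7);
}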
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251205051155.25274-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/blake2s.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 6182c21ed943..71578a084742 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -14,6 +14,7 @@ #include #include #include +#include #include static const u8 blake2s_sigma[10][16] = { @@ -71,29 +72,22 @@ blake2s_compress_generic(struct blake2s_ctx *ctx, b = ror32(b ^ c, 7); \ } while (0) -#define ROUND(r) do { \ - G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ - G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ - G(r, 2, v[2], v[ 6], v[10], v[14]); \ - G(r, 3, v[3], v[ 7], v[11], v[15]); \ - G(r, 4, v[0], v[ 5], v[10], v[15]); \ - G(r, 5, v[1], v[ 6], v[11], v[12]); \ - G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ - G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ -} while (0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - + /* + * Unroll the rounds loop to enable constant-folding of the + * blake2s_sigma values. + */ + unrolled_full + for (int r = 0; r < 10; r++) { + G(r, 0, v[0], v[4], v[8], v[12]); + G(r, 1, v[1], v[5], v[9], v[13]); + G(r, 2, v[2], v[6], v[10], v[14]); + G(r, 3, v[3], v[7], v[11], v[15]); + G(r, 4, v[0], v[5], v[10], v[15]); + G(r, 5, v[1], v[6], v[11], v[12]); + G(r, 6, v[2], v[7], v[8], v[13]); + G(r, 7, v[3], v[4], v[9], v[14]); + } #undef G -#undef ROUND for (i = 0; i < 8; ++i) ctx->h[i] ^= v[i] ^ v[i + 8]; From a9a8b1a383254c9f4ed7fe23b56937f8ad3ad3ab Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Dec 2025 17:38:05 +0100 Subject: [PATCH 5/7] crypto/arm64: aes/xts - Use single ksimd scope to reduce stack bloat The ciphertext stealing logic in the AES-XTS implementation creates a separate ksimd scope to call into the FP/SIMD core routines, and in some cases (CONFIG_KASAN_STACK is one, but there might be others), the 528 byte kernel mode FP/SIMD buffer that is allocated inside this scope is not shared with the preceding ksimd scope, resulting in unnecessary stack bloat. Considering that a) the XTS ciphertext stealing logic is never called for block encryption use cases, and XTS is rarely used for anything else, b) in the vast majority of cases, the entire input block is processed during the first iteration of the loop, we can combine both ksimd scopes into a single one with no practical impact on how often/how long FP/SIMD is en/disabled, allowing us to reuse the same stack slot for both FP/SIMD routine calls. 
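The stack-bloat mechanism is easiest to see in a reduced form. In the sketch below, simd_begin()/simd_end() and the explicit 528-byte save area are hypothetical stand-ins for whatever the ksimd scope allocates; this is not the arm64 implementation. With stack instrumentation such as CONFIG_KASAN_STACK, the compiler may give each scope's buffer its own red-zoned slot, so the two-scope shape can cost roughly twice the stack of a merged scope.

#include <string.h>

#define SIMD_STATE_SIZE 528	/* size quoted in the changelog */

static void simd_begin(void *state) { memset(state, 0, SIMD_STATE_SIZE); }
static void simd_end(void *state)   { (void)state; }

static void xts_two_scopes(int tail)
{
	{			/* bulk blocks: first 528-byte slot */
		char st[SIMD_STATE_SIZE];
		simd_begin(st);
		/* ... process full blocks ... */
		simd_end(st);
	}
	if (tail) {		/* ciphertext stealing: possibly a second slot */
		char st[SIMD_STATE_SIZE];
		simd_begin(st);
		/* ... process the final partial block ... */
		simd_end(st);
	}
}

Merging the scopes, as this patch does for AES-XTS, lets both call sites share one buffer whose lifetime covers the whole function.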
Fixes: ba3c1b3b5ac9 ("crypto/arm64: aes-blk - Switch to 'ksimd' scoped guard API") Signed-off-by: Ard Biesheuvel Tested-by: Arnd Bergmann Link: https://lore.kernel.org/r/20251203163803.157541-5-ardb@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/aes-glue.c | 75 ++++++++++++++--------------- arch/arm64/crypto/aes-neonbs-glue.c | 44 ++++++++--------- 2 files changed, 57 insertions(+), 62 deletions(-) diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index b087b900d279..c51d4487e9e9 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -549,38 +549,37 @@ static int __maybe_unused xts_encrypt(struct skcipher_request *req) tail = 0; } - for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { - int nbytes = walk.nbytes; + scoped_ksimd() { + for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { + int nbytes = walk.nbytes; - if (walk.nbytes < walk.total) - nbytes &= ~(AES_BLOCK_SIZE - 1); + if (walk.nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); - scoped_ksimd() aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, ctx->key1.key_enc, rounds, nbytes, ctx->key2.key_enc, walk.iv, first); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } - if (err || likely(!tail)) - return err; + if (err || likely(!tail)) + return err; - dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); - skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, - req->iv); + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); - err = skcipher_walk_virt(&walk, &subreq, false); - if (err) - return err; + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; - scoped_ksimd() aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, ctx->key1.key_enc, rounds, walk.nbytes, ctx->key2.key_enc, walk.iv, first); - + } return skcipher_walk_done(&walk, 0); } @@ -619,39 +618,37 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req) tail = 0; } - for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { - int nbytes = walk.nbytes; + scoped_ksimd() { + for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { + int nbytes = walk.nbytes; - if (walk.nbytes < walk.total) - nbytes &= ~(AES_BLOCK_SIZE - 1); + if (walk.nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); - scoped_ksimd() aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, ctx->key1.key_dec, rounds, nbytes, ctx->key2.key_enc, walk.iv, first); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } - if (err || likely(!tail)) - return err; + if (err || likely(!tail)) + return err; - dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); - skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, - req->iv); + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); - err = skcipher_walk_virt(&walk, &subreq, false); - if (err) - return err; + err = skcipher_walk_virt(&walk, 
&subreq, false); + if (err) + return err; - - scoped_ksimd() aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, ctx->key1.key_dec, rounds, walk.nbytes, ctx->key2.key_enc, walk.iv, first); - + } return skcipher_walk_done(&walk, 0); } diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index d496effb0a5b..cb87c8fc66b3 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -312,13 +312,13 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, if (err) return err; - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; - out = walk.dst.virt.addr; - in = walk.src.virt.addr; - nbytes = walk.nbytes; + scoped_ksimd() { + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; + out = walk.dst.virt.addr; + in = walk.src.virt.addr; + nbytes = walk.nbytes; - scoped_ksimd() { if (blocks >= 8) { if (first == 1) neon_aes_ecb_encrypt(walk.iv, walk.iv, @@ -344,30 +344,28 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, ctx->twkey, walk.iv, first); nbytes = first = 0; } + err = skcipher_walk_done(&walk, nbytes); } - err = skcipher_walk_done(&walk, nbytes); - } - if (err || likely(!tail)) - return err; + if (err || likely(!tail)) + return err; - /* handle ciphertext stealing */ - dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + /* handle ciphertext stealing */ + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); - skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, - req->iv); + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); - err = skcipher_walk_virt(&walk, req, false); - if (err) - return err; + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; - out = walk.dst.virt.addr; - in = walk.src.virt.addr; - nbytes = walk.nbytes; + out = walk.dst.virt.addr; + in = walk.src.virt.addr; + nbytes = walk.nbytes; - scoped_ksimd() { if (encrypt) neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds, nbytes, ctx->twkey, From 6f7d9481920e1bc06ff21c1e6a84fdea49c6ec3d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Dec 2025 17:38:06 +0100 Subject: [PATCH 6/7] crypto/arm64: sm4/xts - Merge ksimd scopes to reduce stack bloat Merge the two ksimd scopes in the implementation of SM4-XTS to prevent stack bloat in cases where the compiler fails to combine the stack slots for the kernel mode FP/SIMD buffers. 
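Structurally, both arm64 XTS patches make the same change, sketched below with an illustrative one-iteration for-loop guard; this is not the definition of scoped_ksimd(), and begin_fpsimd()/end_fpsimd() are hypothetical stand-ins for enabling and disabling kernel-mode FP/SIMD. The point is only the shape: one scope spans the bulk loop and the ciphertext-stealing tail, so a single FP/SIMD save region is live. The kernel's scoped-guard helpers typically also rely on the cleanup attribute so early returns are handled; this simplified macro does not.

static void begin_fpsimd(void) { }
static void end_fpsimd(void) { }

#define scoped_fpsimd() \
	for (int _once = (begin_fpsimd(), 1); _once; end_fpsimd(), _once = 0)

static void xts_crypt_shape(int nblocks, int tail)
{
	scoped_fpsimd() {
		while (nblocks-- > 0)
			;	/* bulk: process full blocks */
		if (tail)
			;	/* tail: ciphertext stealing */
	}
}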
Signed-off-by: Ard Biesheuvel Tested-by: Arnd Bergmann Link: https://lore.kernel.org/r/20251203163803.157541-6-ardb@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/sm4-ce-glue.c | 46 ++++++++++++++++----------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 5569cece5a0b..0eeabfa9ef25 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -346,11 +346,11 @@ static int sm4_xts_crypt(struct skcipher_request *req, bool encrypt) tail = 0; } - while ((nbytes = walk.nbytes) >= SM4_BLOCK_SIZE) { - if (nbytes < walk.total) - nbytes &= ~(SM4_BLOCK_SIZE - 1); + scoped_ksimd() { + while ((nbytes = walk.nbytes) >= SM4_BLOCK_SIZE) { + if (nbytes < walk.total) + nbytes &= ~(SM4_BLOCK_SIZE - 1); - scoped_ksimd() { if (encrypt) sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, walk.src.virt.addr, walk.iv, nbytes, @@ -359,32 +359,30 @@ static int sm4_xts_crypt(struct skcipher_request *req, bool encrypt) sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr, walk.src.virt.addr, walk.iv, nbytes, rkey2_enc); + + rkey2_enc = NULL; + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + if (err) + return err; } - rkey2_enc = NULL; + if (likely(tail == 0)) + return 0; - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + /* handle ciphertext stealing */ + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, subreq.cryptlen); + + skcipher_request_set_crypt(&subreq, src, dst, + SM4_BLOCK_SIZE + tail, req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); if (err) return err; - } - if (likely(tail == 0)) - return 0; - - /* handle ciphertext stealing */ - - dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, subreq.cryptlen); - - skcipher_request_set_crypt(&subreq, src, dst, SM4_BLOCK_SIZE + tail, - req->iv); - - err = skcipher_walk_virt(&walk, &subreq, false); - if (err) - return err; - - scoped_ksimd() { if (encrypt) sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, walk.src.virt.addr, walk.iv, walk.nbytes, From f6a458746f905adb7d70e50e8b9383dc9e3fd75f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 9 Dec 2025 14:34:17 -0800 Subject: [PATCH 7/7] crypto: arm64/ghash - Fix incorrect output from ghash-neon Commit 9a7c987fb92b ("crypto: arm64/ghash - Use API partial block handling") made ghash_finup() pass the wrong buffer to ghash_do_simd_update(). As a result, ghash-neon now produces incorrect outputs when the message length isn't divisible by 16 bytes. Fix this. (I didn't notice this earlier because this code is reached only on CPUs that support NEON but not PMULL. I haven't yet found a way to get qemu-system-aarch64 to emulate that configuration.) 
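The pattern the fix restores can be reduced to a few lines. In the sketch below, process_block() is a stub standing in for ghash_do_simd_update(), and finup_tail() is an illustrative helper rather than the driver's code; the essential point is that the final short chunk must be hashed from the zero-padded local buffer, not from src, which holds fewer than 16 valid bytes at that point. Passing src (the bug) mixes the wrong bytes into the digest.

#include <string.h>

#define GHASH_BLOCK_SIZE 16

static void process_block(unsigned char digest[GHASH_BLOCK_SIZE],
			  const unsigned char block[GHASH_BLOCK_SIZE])
{
	for (int i = 0; i < GHASH_BLOCK_SIZE; i++)
		digest[i] ^= block[i];	/* placeholder for the real GF(2^128) update */
}

static void finup_tail(unsigned char digest[GHASH_BLOCK_SIZE],
		       const unsigned char *src, size_t len)
{
	if (len) {			/* len < GHASH_BLOCK_SIZE here */
		unsigned char buf[GHASH_BLOCK_SIZE] = { 0 };

		memcpy(buf, src, len);
		process_block(digest, buf);	/* must be 'buf', not 'src' */
		memset(buf, 0, sizeof(buf));	/* scrub the padded copy */
	}
}

The real function scrubs the copy with memzero_explicit(), as the diff context shows, so the compiler cannot optimize the clearing away; plain memset() is used above only to keep the sketch self-contained.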
Fixes: 9a7c987fb92b ("crypto: arm64/ghash - Use API partial block handling") Cc: stable@vger.kernel.org Reported-by: Diederik de Haas Closes: https://lore.kernel.org/linux-crypto/DETXT7QI62KE.F3CGH2VWX1SC@cknow-tech.com/ Tested-by: Diederik de Haas Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20251209223417.112294-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/ghash-ce-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 7951557a285a..ef249d06c92c 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -133,7 +133,7 @@ static int ghash_finup(struct shash_desc *desc, const u8 *src, u8 buf[GHASH_BLOCK_SIZE] = {}; memcpy(buf, src, len); - ghash_do_simd_update(1, ctx->digest, src, key, NULL, + ghash_do_simd_update(1, ctx->digest, buf, key, NULL, pmull_ghash_update_p8); memzero_explicit(buf, sizeof(buf)); }