From 43169328c7b4623b54b7713ec68479cebda5465f Mon Sep 17 00:00:00 2001 From: Vivian Wang Date: Tue, 2 Dec 2025 13:25:07 +0800 Subject: [PATCH 1/7] lib/crypto: riscv/chacha: Avoid s0/fp register In chacha_zvkb, avoid using the s0 register, which is the frame pointer, by reallocating KEY0 to t5. This makes stack traces available if e.g. a crash happens in chacha_zvkb. No frame pointer maintenance is otherwise required since this is a leaf function. Signed-off-by: Vivian Wang Fixes: bb54668837a0 ("crypto: riscv - add vector crypto accelerated ChaCha20") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20251202-riscv-chacha_zvkb-fp-v2-1-7bd00098c9dc@iscas.ac.cn Signed-off-by: Eric Biggers --- lib/crypto/riscv/chacha-riscv64-zvkb.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/crypto/riscv/chacha-riscv64-zvkb.S b/lib/crypto/riscv/chacha-riscv64-zvkb.S index b777d0b4e379..3d183ec818f5 100644 --- a/lib/crypto/riscv/chacha-riscv64-zvkb.S +++ b/lib/crypto/riscv/chacha-riscv64-zvkb.S @@ -60,7 +60,8 @@ #define VL t2 #define STRIDE t3 #define ROUND_CTR t4 -#define KEY0 s0 +#define KEY0 t5 +// Avoid s0/fp to allow for unwinding #define KEY1 s1 #define KEY2 s2 #define KEY3 s3 @@ -143,7 +144,6 @@ // The updated 32-bit counter is written back to state->x[12] before returning. SYM_FUNC_START(chacha_zvkb) addi sp, sp, -96 - sd s0, 0(sp) sd s1, 8(sp) sd s2, 16(sp) sd s3, 24(sp) @@ -280,7 +280,6 @@ SYM_FUNC_START(chacha_zvkb) bnez NBLOCKS, .Lblock_loop sw COUNTER, 48(STATEP) - ld s0, 0(sp) ld s1, 8(sp) ld s2, 16(sp) ld s3, 24(sp) From 1cd5bb6e9e027bab33aafd58fe8340124869ba62 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Dec 2025 13:37:50 -0800 Subject: [PATCH 2/7] lib/crypto: riscv: Depend on RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS Replace the RISCV_ISA_V dependency of the RISC-V crypto code with RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS, which implies RISCV_ISA_V as well as vector unaligned accesses being efficient. This is necessary because this code assumes that vector unaligned accesses are supported and are efficient. (It does so to avoid having to use lots of extra vsetvli instructions to switch the element width back and forth between 8 and either 32 or 64.) This was omitted from the code originally just because the RISC-V kernel support for detecting this feature didn't exist yet. Support has now been added, but it's fragmented into per-CPU runtime detection, a command-line parameter, and a kconfig option. The kconfig option is the only reasonable way to do it, though, so let's just rely on that. 
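To make the unwinding rationale of the chacha_zvkb patch above concrete, here is a simplified frame-pointer walk in C. It is an illustration only, not the kernel's unwinder; the frame_record layout and the walk_frames() helper are assumptions based on the usual RISC-V frame-pointer convention, where each prologue saves the caller's fp and the return address just below the address held in s0/fp. A leaf routine that silently clobbers s0 leaves fp pointing at unrelated data (here, key material), so a walk like this produces garbage instead of a stack trace.

/*
 * Simplified sketch of frame-pointer unwinding on RISC-V.  The layout of
 * frame_record is the conventional one assumed here, not a kernel API.
 */
struct frame_record {
	unsigned long fp;	/* caller's frame pointer */
	unsigned long ra;	/* return address into the caller */
};

static int walk_frames(unsigned long fp, unsigned long *trace, int max)
{
	int n = 0;

	while (n < max && fp) {
		const struct frame_record *rec =
			(const struct frame_record *)(fp - sizeof(*rec));

		trace[n++] = rec->ra;	/* where this frame returns to */
		fp = rec->fp;		/* step to the caller's frame */
	}
	return n;
}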
Fixes: eb24af5d7a05 ("crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS}") Fixes: bb54668837a0 ("crypto: riscv - add vector crypto accelerated ChaCha20") Fixes: 600a3853dfa0 ("crypto: riscv - add vector crypto accelerated GHASH") Fixes: 8c8e40470ffe ("crypto: riscv - add vector crypto accelerated SHA-{256,224}") Fixes: b3415925a08b ("crypto: riscv - add vector crypto accelerated SHA-{512,384}") Fixes: 563a5255afa2 ("crypto: riscv - add vector crypto accelerated SM3") Fixes: b8d06352bbf3 ("crypto: riscv - add vector crypto accelerated SM4") Cc: stable@vger.kernel.org Reported-by: Vivian Wang Closes: https://lore.kernel.org/r/b3cfcdac-0337-4db0-a611-258f2868855f@iscas.ac.cn/ Reviewed-by: Jerry Shih Link: https://lore.kernel.org/r/20251206213750.81474-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/riscv/crypto/Kconfig | 12 ++++++++---- lib/crypto/Kconfig | 9 ++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index a75d6325607b..14c5acb935e9 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -4,7 +4,8 @@ menu "Accelerated Cryptographic Algorithms for CPU (riscv)" config CRYPTO_AES_RISCV64 tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_ALGAPI select CRYPTO_LIB_AES select CRYPTO_SKCIPHER @@ -20,7 +21,8 @@ config CRYPTO_AES_RISCV64 config CRYPTO_GHASH_RISCV64 tristate "Hash functions: GHASH" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_GCM help GCM GHASH function (NIST SP 800-38D) @@ -30,7 +32,8 @@ config CRYPTO_GHASH_RISCV64 config CRYPTO_SM3_RISCV64 tristate "Hash functions: SM3 (ShangMi 3)" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_HASH select CRYPTO_LIB_SM3 help @@ -42,7 +45,8 @@ config CRYPTO_SM3_RISCV64 config CRYPTO_SM4_RISCV64 tristate "Ciphers: SM4 (ShangMi 4)" - depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + depends on 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS select CRYPTO_ALGAPI select CRYPTO_SM4 help diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index a3647352bff6..6871a41e5069 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -61,7 +61,8 @@ config CRYPTO_LIB_CHACHA_ARCH default y if ARM64 && KERNEL_MODE_NEON default y if MIPS && CPU_MIPS32_R2 default y if PPC64 && CPU_LITTLE_ENDIAN && VSX - default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS default y if S390 default y if X86_64 @@ -184,7 +185,8 @@ config CRYPTO_LIB_SHA256_ARCH default y if ARM64 default y if MIPS && CPU_CAVIUM_OCTEON default y if PPC && SPE - default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \ + RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS default y if S390 default y if SPARC64 default y if X86_64 @@ -202,7 +204,8 @@ config CRYPTO_LIB_SHA512_ARCH default y if ARM && !CPU_V7M default y if ARM64 default y if MIPS && CPU_CAVIUM_OCTEON - default y if RISCV && 64BIT && 
RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
+		     RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
 	default y if S390
 	default y if SPARC64
 	default y if X86_64

From 2e8f7b170a085f0f5522f262bffe92d6ec911abb Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Thu, 4 Dec 2025 21:03:30 -0800
Subject: [PATCH 3/7] lib/crypto: blake2b: Roll up BLAKE2b round loop on 32-bit

BLAKE2b has a state of 16 64-bit words.  Add the message data in and
there are 32 64-bit words.  With the current code, where all the rounds
are unrolled to enable constant-folding of the blake2b_sigma values,
this results in a very large code size on 32-bit kernels, including a
recurring issue where gcc uses a large amount of stack.

There's just not much benefit to this unrolling when the code is
already so large.  Let's roll up the rounds when !CONFIG_64BIT.

To avoid having to duplicate the code, just write the code once using a
loop, and conditionally use 'unrolled_full' from <linux/unroll.h>.
Then, fold the now-unneeded ROUND() macro into the loop.  Finally, also
remove the now-unneeded override of the stack frame size warning.

Code size improvements for blake2b_compress_generic():

                  Size before (bytes)    Size after (bytes)
                  -------------------    ------------------
    i386, gcc            27584                  3632
    i386, clang          18208                  3248
    arm32, gcc           19912                  2860
    arm32, clang         21336                  3344

Running the BLAKE2b benchmark on a !CONFIG_64BIT kernel on an x86_64
processor shows a 16384B throughput change of 351 => 340 MB/s (gcc) or
442 MB/s => 375 MB/s (clang).  So clearly not much of a slowdown
either.  But that microbenchmark also effectively disregards cache
usage, which is important in practice and is far better with the
smaller code.

Note: If we rolled up the loop on x86_64 too, the change would be
7024 bytes => 1584 bytes and 1960 MB/s => 1396 MB/s (gcc), or
6848 bytes => 1696 bytes and 1920 MB/s => 1263 MB/s (clang).  Maybe
still worth it, though not quite as clearly beneficial.
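For readers unfamiliar with unrolled_full, the following stand-alone sketch shows the trade-off the changelog describes. UNROLL_FULL below is a local stand-in that merely asks the compiler to fully unroll the next loop; it is not the kernel's definition, and the sigma table is abbreviated. Once the loop is fully unrolled, r is a constant in each copy of the body, so the sigma[r][...] lookups fold to immediates and the per-round table loads disappear, at the cost of twelve copies of the body in the object code.

#include <stdint.h>

#if defined(__clang__)
#define UNROLL_FULL _Pragma("clang loop unroll(full)")
#else
#define UNROLL_FULL _Pragma("GCC unroll 12")
#endif

static const uint8_t sigma[12][4] = {
	{ 0, 1, 2, 3 }, { 4, 5, 6, 7 },	/* remaining rows abbreviated */
};

uint64_t mix(const uint64_t m[16])
{
	uint64_t acc = 0;

	UNROLL_FULL
	for (int r = 0; r < 12; r++)
		acc ^= m[sigma[r][r & 3]];	/* index is constant once unrolled */
	return acc;
}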
Fixes: 91d689337fe8 ("crypto: blake2b - add blake2b generic implementation") Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251205050330.89704-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/Makefile | 1 - lib/crypto/blake2b.c | 44 ++++++++++++++++++++------------------------ 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index b5346cebbb55..330ab65b29c4 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o libblake2b-y := blake2b.o -CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930 ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y) CFLAGS_blake2b.o += -I$(src)/$(SRCARCH) libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o diff --git a/lib/crypto/blake2b.c b/lib/crypto/blake2b.c index 09c6d65d8a6e..581b7f8486fa 100644 --- a/lib/crypto/blake2b.c +++ b/lib/crypto/blake2b.c @@ -14,6 +14,7 @@ #include #include #include +#include #include static const u8 blake2b_sigma[12][16] = { @@ -73,31 +74,26 @@ blake2b_compress_generic(struct blake2b_ctx *ctx, b = ror64(b ^ c, 63); \ } while (0) -#define ROUND(r) do { \ - G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ - G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ - G(r, 2, v[2], v[ 6], v[10], v[14]); \ - G(r, 3, v[3], v[ 7], v[11], v[15]); \ - G(r, 4, v[0], v[ 5], v[10], v[15]); \ - G(r, 5, v[1], v[ 6], v[11], v[12]); \ - G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ - G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ -} while (0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - ROUND(10); - ROUND(11); - +#ifdef CONFIG_64BIT + /* + * Unroll the rounds loop to enable constant-folding of the + * blake2b_sigma values. Seems worthwhile on 64-bit kernels. + * Not worthwhile on 32-bit kernels because the code size is + * already so large there due to BLAKE2b using 64-bit words. + */ + unrolled_full +#endif + for (int r = 0; r < 12; r++) { + G(r, 0, v[0], v[4], v[8], v[12]); + G(r, 1, v[1], v[5], v[9], v[13]); + G(r, 2, v[2], v[6], v[10], v[14]); + G(r, 3, v[3], v[7], v[11], v[15]); + G(r, 4, v[0], v[5], v[10], v[15]); + G(r, 5, v[1], v[6], v[11], v[12]); + G(r, 6, v[2], v[7], v[8], v[13]); + G(r, 7, v[3], v[4], v[9], v[14]); + } #undef G -#undef ROUND for (i = 0; i < 8; ++i) ctx->h[i] ^= v[i] ^ v[i + 8]; From 68b233b1d583f7d869fbb3afe2b0531138e001f7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Dec 2025 21:11:55 -0800 Subject: [PATCH 4/7] lib/crypto: blake2s: Replace manual unrolling with unrolled_full As we're doing in the BLAKE2b code, use unrolled_full to make the compiler handle the loop unrolling. This simplifies the code slightly. The generated object code is nearly the same with both gcc and clang. 
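For reference, the quarter-round that each G() invocation expands to is the standard BLAKE2s G from RFC 7693, shown below as a freestanding function rather than the kernel's macro; blake2s_g() and sigma_row are illustrative names, and the kernel already provides ror32() in its own headers. When the round index is a compile-time constant, the two sigma_row[] lookups are constants too, which is exactly the constant-folding the unrolled variants rely on.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

static inline void blake2s_g(const uint8_t sigma_row[16], const uint32_t m[16],
			     int i, uint32_t *a, uint32_t *b,
			     uint32_t *c, uint32_t *d)
{
	*a += *b + m[sigma_row[2 * i]];
	*d = ror32(*d ^ *a, 16);
	*c += *d;
	*b = ror32(*b ^ *c, 12);
	*a += *b + m[sigma_row[2 * i + 1]];
	*d = ror32(*d ^ *a, 8);
	*c += *d;
	*b = ror32(*b ^ *c, 7);
}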
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251205051155.25274-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/blake2s.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 6182c21ed943..71578a084742 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -14,6 +14,7 @@ #include #include #include +#include #include static const u8 blake2s_sigma[10][16] = { @@ -71,29 +72,22 @@ blake2s_compress_generic(struct blake2s_ctx *ctx, b = ror32(b ^ c, 7); \ } while (0) -#define ROUND(r) do { \ - G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ - G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ - G(r, 2, v[2], v[ 6], v[10], v[14]); \ - G(r, 3, v[3], v[ 7], v[11], v[15]); \ - G(r, 4, v[0], v[ 5], v[10], v[15]); \ - G(r, 5, v[1], v[ 6], v[11], v[12]); \ - G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ - G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ -} while (0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - + /* + * Unroll the rounds loop to enable constant-folding of the + * blake2s_sigma values. + */ + unrolled_full + for (int r = 0; r < 10; r++) { + G(r, 0, v[0], v[4], v[8], v[12]); + G(r, 1, v[1], v[5], v[9], v[13]); + G(r, 2, v[2], v[6], v[10], v[14]); + G(r, 3, v[3], v[7], v[11], v[15]); + G(r, 4, v[0], v[5], v[10], v[15]); + G(r, 5, v[1], v[6], v[11], v[12]); + G(r, 6, v[2], v[7], v[8], v[13]); + G(r, 7, v[3], v[4], v[9], v[14]); + } #undef G -#undef ROUND for (i = 0; i < 8; ++i) ctx->h[i] ^= v[i] ^ v[i + 8]; From a9a8b1a383254c9f4ed7fe23b56937f8ad3ad3ab Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Dec 2025 17:38:05 +0100 Subject: [PATCH 5/7] crypto/arm64: aes/xts - Use single ksimd scope to reduce stack bloat The ciphertext stealing logic in the AES-XTS implementation creates a separate ksimd scope to call into the FP/SIMD core routines, and in some cases (CONFIG_KASAN_STACK is one, but there might be others), the 528 byte kernel mode FP/SIMD buffer that is allocated inside this scope is not shared with the preceding ksimd scope, resulting in unnecessary stack bloat. Considering that a) the XTS ciphertext stealing logic is never called for block encryption use cases, and XTS is rarely used for anything else, b) in the vast majority of cases, the entire input block is processed during the first iteration of the loop, we can combine both ksimd scopes into a single one with no practical impact on how often/how long FP/SIMD is en/disabled, allowing us to reuse the same stack slot for both FP/SIMD routine calls. 
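The stack-bloat mechanism is easiest to see in a reduced form. In the sketch below, simd_begin()/simd_end() and the explicit 528-byte save area are hypothetical stand-ins for whatever the ksimd scope allocates; this is not the arm64 implementation. With stack instrumentation such as CONFIG_KASAN_STACK, the compiler may give each scope's buffer its own red-zoned slot, so the two-scope shape can cost roughly twice the stack of a merged scope.

#include <string.h>

#define SIMD_STATE_SIZE 528	/* size quoted in the changelog */

static void simd_begin(void *state) { memset(state, 0, SIMD_STATE_SIZE); }
static void simd_end(void *state)   { (void)state; }

static void xts_two_scopes(int tail)
{
	{			/* bulk blocks: first 528-byte slot */
		char st[SIMD_STATE_SIZE];
		simd_begin(st);
		/* ... process full blocks ... */
		simd_end(st);
	}
	if (tail) {		/* ciphertext stealing: possibly a second slot */
		char st[SIMD_STATE_SIZE];
		simd_begin(st);
		/* ... process the final partial block ... */
		simd_end(st);
	}
}

Merging the scopes, as this patch does for AES-XTS, lets both call sites share one buffer whose lifetime covers the whole function.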
Fixes: ba3c1b3b5ac9 ("crypto/arm64: aes-blk - Switch to 'ksimd' scoped guard API") Signed-off-by: Ard Biesheuvel Tested-by: Arnd Bergmann Link: https://lore.kernel.org/r/20251203163803.157541-5-ardb@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/aes-glue.c | 75 ++++++++++++++--------------- arch/arm64/crypto/aes-neonbs-glue.c | 44 ++++++++--------- 2 files changed, 57 insertions(+), 62 deletions(-) diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index b087b900d279..c51d4487e9e9 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -549,38 +549,37 @@ static int __maybe_unused xts_encrypt(struct skcipher_request *req) tail = 0; } - for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { - int nbytes = walk.nbytes; + scoped_ksimd() { + for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { + int nbytes = walk.nbytes; - if (walk.nbytes < walk.total) - nbytes &= ~(AES_BLOCK_SIZE - 1); + if (walk.nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); - scoped_ksimd() aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, ctx->key1.key_enc, rounds, nbytes, ctx->key2.key_enc, walk.iv, first); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } - if (err || likely(!tail)) - return err; + if (err || likely(!tail)) + return err; - dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); - skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, - req->iv); + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); - err = skcipher_walk_virt(&walk, &subreq, false); - if (err) - return err; + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; - scoped_ksimd() aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, ctx->key1.key_enc, rounds, walk.nbytes, ctx->key2.key_enc, walk.iv, first); - + } return skcipher_walk_done(&walk, 0); } @@ -619,39 +618,37 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req) tail = 0; } - for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { - int nbytes = walk.nbytes; + scoped_ksimd() { + for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) { + int nbytes = walk.nbytes; - if (walk.nbytes < walk.total) - nbytes &= ~(AES_BLOCK_SIZE - 1); + if (walk.nbytes < walk.total) + nbytes &= ~(AES_BLOCK_SIZE - 1); - scoped_ksimd() aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, ctx->key1.key_dec, rounds, nbytes, ctx->key2.key_enc, walk.iv, first); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } - if (err || likely(!tail)) - return err; + if (err || likely(!tail)) + return err; - dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); - skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, - req->iv); + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); - err = skcipher_walk_virt(&walk, &subreq, false); - if (err) - return err; + err = skcipher_walk_virt(&walk, 
&subreq, false); + if (err) + return err; - - scoped_ksimd() aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, ctx->key1.key_dec, rounds, walk.nbytes, ctx->key2.key_enc, walk.iv, first); - + } return skcipher_walk_done(&walk, 0); } diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index d496effb0a5b..cb87c8fc66b3 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -312,13 +312,13 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, if (err) return err; - while (walk.nbytes >= AES_BLOCK_SIZE) { - int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; - out = walk.dst.virt.addr; - in = walk.src.virt.addr; - nbytes = walk.nbytes; + scoped_ksimd() { + while (walk.nbytes >= AES_BLOCK_SIZE) { + int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7; + out = walk.dst.virt.addr; + in = walk.src.virt.addr; + nbytes = walk.nbytes; - scoped_ksimd() { if (blocks >= 8) { if (first == 1) neon_aes_ecb_encrypt(walk.iv, walk.iv, @@ -344,30 +344,28 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt, ctx->twkey, walk.iv, first); nbytes = first = 0; } + err = skcipher_walk_done(&walk, nbytes); } - err = skcipher_walk_done(&walk, nbytes); - } - if (err || likely(!tail)) - return err; + if (err || likely(!tail)) + return err; - /* handle ciphertext stealing */ - dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + /* handle ciphertext stealing */ + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); - skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, - req->iv); + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); - err = skcipher_walk_virt(&walk, req, false); - if (err) - return err; + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; - out = walk.dst.virt.addr; - in = walk.src.virt.addr; - nbytes = walk.nbytes; + out = walk.dst.virt.addr; + in = walk.src.virt.addr; + nbytes = walk.nbytes; - scoped_ksimd() { if (encrypt) neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds, nbytes, ctx->twkey, From 6f7d9481920e1bc06ff21c1e6a84fdea49c6ec3d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 3 Dec 2025 17:38:06 +0100 Subject: [PATCH 6/7] crypto/arm64: sm4/xts - Merge ksimd scopes to reduce stack bloat Merge the two ksimd scopes in the implementation of SM4-XTS to prevent stack bloat in cases where the compiler fails to combine the stack slots for the kernel mode FP/SIMD buffers. 
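Structurally, both arm64 XTS patches make the same change, sketched below with an illustrative one-iteration for-loop guard; this is not the definition of scoped_ksimd(), and begin_fpsimd()/end_fpsimd() are hypothetical stand-ins for enabling and disabling kernel-mode FP/SIMD. The point is only the shape: one scope spans the bulk loop and the ciphertext-stealing tail, so a single FP/SIMD save region is live. The kernel's scoped-guard helpers typically also rely on the cleanup attribute so early returns are handled; this simplified macro does not.

static void begin_fpsimd(void) { }
static void end_fpsimd(void) { }

#define scoped_fpsimd() \
	for (int _once = (begin_fpsimd(), 1); _once; end_fpsimd(), _once = 0)

static void xts_crypt_shape(int nblocks, int tail)
{
	scoped_fpsimd() {
		while (nblocks-- > 0)
			;	/* bulk: process full blocks */
		if (tail)
			;	/* tail: ciphertext stealing */
	}
}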
Signed-off-by: Ard Biesheuvel Tested-by: Arnd Bergmann Link: https://lore.kernel.org/r/20251203163803.157541-6-ardb@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/sm4-ce-glue.c | 46 ++++++++++++++++----------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 5569cece5a0b..0eeabfa9ef25 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -346,11 +346,11 @@ static int sm4_xts_crypt(struct skcipher_request *req, bool encrypt) tail = 0; } - while ((nbytes = walk.nbytes) >= SM4_BLOCK_SIZE) { - if (nbytes < walk.total) - nbytes &= ~(SM4_BLOCK_SIZE - 1); + scoped_ksimd() { + while ((nbytes = walk.nbytes) >= SM4_BLOCK_SIZE) { + if (nbytes < walk.total) + nbytes &= ~(SM4_BLOCK_SIZE - 1); - scoped_ksimd() { if (encrypt) sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, walk.src.virt.addr, walk.iv, nbytes, @@ -359,32 +359,30 @@ static int sm4_xts_crypt(struct skcipher_request *req, bool encrypt) sm4_ce_xts_dec(ctx->key1.rkey_dec, walk.dst.virt.addr, walk.src.virt.addr, walk.iv, nbytes, rkey2_enc); + + rkey2_enc = NULL; + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + if (err) + return err; } - rkey2_enc = NULL; + if (likely(tail == 0)) + return 0; - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + /* handle ciphertext stealing */ + + dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, subreq.cryptlen); + + skcipher_request_set_crypt(&subreq, src, dst, + SM4_BLOCK_SIZE + tail, req->iv); + + err = skcipher_walk_virt(&walk, &subreq, false); if (err) return err; - } - if (likely(tail == 0)) - return 0; - - /* handle ciphertext stealing */ - - dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen); - if (req->dst != req->src) - dst = scatterwalk_ffwd(sg_dst, req->dst, subreq.cryptlen); - - skcipher_request_set_crypt(&subreq, src, dst, SM4_BLOCK_SIZE + tail, - req->iv); - - err = skcipher_walk_virt(&walk, &subreq, false); - if (err) - return err; - - scoped_ksimd() { if (encrypt) sm4_ce_xts_enc(ctx->key1.rkey_enc, walk.dst.virt.addr, walk.src.virt.addr, walk.iv, walk.nbytes, From f6a458746f905adb7d70e50e8b9383dc9e3fd75f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 9 Dec 2025 14:34:17 -0800 Subject: [PATCH 7/7] crypto: arm64/ghash - Fix incorrect output from ghash-neon Commit 9a7c987fb92b ("crypto: arm64/ghash - Use API partial block handling") made ghash_finup() pass the wrong buffer to ghash_do_simd_update(). As a result, ghash-neon now produces incorrect outputs when the message length isn't divisible by 16 bytes. Fix this. (I didn't notice this earlier because this code is reached only on CPUs that support NEON but not PMULL. I haven't yet found a way to get qemu-system-aarch64 to emulate that configuration.) 
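The pattern the fix restores can be reduced to a few lines. In the sketch below, process_block() is a stub standing in for ghash_do_simd_update(), and finup_tail() is an illustrative helper rather than the driver's code; the essential point is that the final short chunk must be hashed from the zero-padded local buffer, not from src, which holds fewer than 16 valid bytes at that point. Passing src (the bug) mixes the wrong bytes into the digest.

#include <string.h>

#define GHASH_BLOCK_SIZE 16

static void process_block(unsigned char digest[GHASH_BLOCK_SIZE],
			  const unsigned char block[GHASH_BLOCK_SIZE])
{
	for (int i = 0; i < GHASH_BLOCK_SIZE; i++)
		digest[i] ^= block[i];	/* placeholder for the real GF(2^128) update */
}

static void finup_tail(unsigned char digest[GHASH_BLOCK_SIZE],
		       const unsigned char *src, size_t len)
{
	if (len) {			/* len < GHASH_BLOCK_SIZE here */
		unsigned char buf[GHASH_BLOCK_SIZE] = { 0 };

		memcpy(buf, src, len);
		process_block(digest, buf);	/* must be 'buf', not 'src' */
		memset(buf, 0, sizeof(buf));	/* scrub the padded copy */
	}
}

The real function scrubs the copy with memzero_explicit(), as the diff context shows, so the compiler cannot optimize the clearing away; plain memset() is used above only to keep the sketch self-contained.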
Fixes: 9a7c987fb92b ("crypto: arm64/ghash - Use API partial block handling") Cc: stable@vger.kernel.org Reported-by: Diederik de Haas Closes: https://lore.kernel.org/linux-crypto/DETXT7QI62KE.F3CGH2VWX1SC@cknow-tech.com/ Tested-by: Diederik de Haas Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20251209223417.112294-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/ghash-ce-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 7951557a285a..ef249d06c92c 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -133,7 +133,7 @@ static int ghash_finup(struct shash_desc *desc, const u8 *src, u8 buf[GHASH_BLOCK_SIZE] = {}; memcpy(buf, src, len); - ghash_do_simd_update(1, ctx->digest, src, key, NULL, + ghash_do_simd_update(1, ctx->digest, buf, key, NULL, pmull_ghash_update_p8); memzero_explicit(buf, sizeof(buf)); }