lib/crypto: arm/blake2b: Migrate optimized code into library

Migrate the arm-optimized BLAKE2b code from arch/arm/crypto/ to
lib/crypto/arm/.  This makes the BLAKE2b library able to use it, and it
also simplifies the code because it's easier to integrate with the
library than crypto_shash.

This temporarily makes the arm-optimized BLAKE2b code unavailable via
crypto_shash.  A later commit reimplements the blake2b-* crypto_shash
algorithms on top of the BLAKE2b library API, making it available again.
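
For orientation, a minimal sketch of what a caller of the library API looks
like (struct blake2b_ctx and BLAKE2B_256_HASH_SIZE appear in this commit's
diff; the init/update/final entry-point names, and the data/data_len
placeholders, are assumptions by analogy with other lib/crypto hashes):

	struct blake2b_ctx ctx;
	u8 digest[BLAKE2B_256_HASH_SIZE];

	blake2b_init(&ctx, sizeof(digest));	/* unkeyed, 32-byte digest */
	blake2b_update(&ctx, data, data_len);
	blake2b_final(&ctx, digest);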

Note that as per the lib/crypto/ convention, the optimized code is now
enabled by default.  So, this also fixes the longstanding issue where
the optimized BLAKE2b code was not enabled by default.

To see the diff from arch/arm/crypto/blake2b-neon-glue.c to
lib/crypto/arm/blake2b.h, view this commit with 'git show -M10'.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251018043106.375964-8-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Author: Eric Biggers <ebiggers@kernel.org>
Date:   2025-10-17 21:31:03 -07:00
Parent: 23a16c9533
Commit: ba6617bd47

7 changed files with 59 additions and 135 deletions

arch/arm/crypto/Kconfig

@@ -33,22 +33,6 @@ config CRYPTO_NHPOLY1305_NEON
 	  Architecture: arm using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_BLAKE2B_NEON
-	tristate "Hash functions: BLAKE2b (NEON)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_BLAKE2B
-	help
-	  BLAKE2b cryptographic hash function (RFC 7693)
-
-	  Architecture: arm using
-	  - NEON (Advanced SIMD) extensions
-
-	  BLAKE2b digest algorithm optimized with ARM NEON instructions.
-	  On ARM processors that have NEON support but not the ARMv8
-	  Crypto Extensions, typically this BLAKE2b implementation is
-	  much faster than the SHA-2 family and slightly faster than
-	  SHA-1.
-
 config CRYPTO_AES_ARM
 	tristate "Ciphers: AES"
 	select CRYPTO_ALGAPI

arch/arm/crypto/Makefile

@@ -5,7 +5,6 @@
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
-obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
@@ -13,7 +12,6 @@ obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
 aes-arm-y := aes-cipher-core.o aes-cipher-glue.o
 aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
 aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o

arch/arm/crypto/blake2b-neon-glue.c (deleted)

@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * BLAKE2b digest algorithm, NEON accelerated
- *
- * Copyright 2020 Google LLC
- */
-
-#include <crypto/internal/blake2b.h>
-#include <crypto/internal/hash.h>
-
-#include <linux/module.h>
-#include <linux/sizes.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void blake2b_compress_neon(struct blake2b_state *state,
-				      const u8 *block, size_t nblocks, u32 inc);
-
-static void blake2b_compress_arch(struct blake2b_state *state,
-				  const u8 *block, size_t nblocks, u32 inc)
-{
-	do {
-		const size_t blocks = min_t(size_t, nblocks,
-					    SZ_4K / BLAKE2B_BLOCK_SIZE);
-
-		kernel_neon_begin();
-		blake2b_compress_neon(state, block, blocks, inc);
-		kernel_neon_end();
-
-		nblocks -= blocks;
-		block += blocks * BLAKE2B_BLOCK_SIZE;
-	} while (nblocks);
-}
-
-static int crypto_blake2b_update_neon(struct shash_desc *desc,
-				      const u8 *in, unsigned int inlen)
-{
-	return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch);
-}
-
-static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in,
-				     unsigned int inlen, u8 *out)
-{
-	return crypto_blake2b_finup(desc, in, inlen, out,
-				    blake2b_compress_arch);
-}
-
-#define BLAKE2B_ALG(name, driver_name, digest_size)			\
-	{								\
-		.base.cra_name		= name,				\
-		.base.cra_driver_name	= driver_name,			\
-		.base.cra_priority	= 200,				\
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY |	\
-					  CRYPTO_AHASH_ALG_BLOCK_ONLY |	\
-					  CRYPTO_AHASH_ALG_FINAL_NONZERO, \
-		.base.cra_blocksize	= BLAKE2B_BLOCK_SIZE,		\
-		.base.cra_ctxsize	= sizeof(struct blake2b_tfm_ctx), \
-		.base.cra_module	= THIS_MODULE,			\
-		.digestsize		= digest_size,			\
-		.setkey			= crypto_blake2b_setkey,	\
-		.init			= crypto_blake2b_init,		\
-		.update			= crypto_blake2b_update_neon,	\
-		.finup			= crypto_blake2b_finup_neon,	\
-		.descsize		= sizeof(struct blake2b_state),	\
-		.statesize		= BLAKE2B_STATE_SIZE,		\
-	}
-
-static struct shash_alg blake2b_neon_algs[] = {
-	BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE),
-};
-
-static int __init blake2b_neon_mod_init(void)
-{
-	if (!(elf_hwcap & HWCAP_NEON))
-		return -ENODEV;
-
-	return crypto_register_shashes(blake2b_neon_algs,
-				       ARRAY_SIZE(blake2b_neon_algs));
-}
-
-static void __exit blake2b_neon_mod_exit(void)
-{
-	crypto_unregister_shashes(blake2b_neon_algs,
-				  ARRAY_SIZE(blake2b_neon_algs));
-}
-
-module_init(blake2b_neon_mod_init);
-module_exit(blake2b_neon_mod_exit);
-
-MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("blake2b-160");
-MODULE_ALIAS_CRYPTO("blake2b-160-neon");
-MODULE_ALIAS_CRYPTO("blake2b-256");
-MODULE_ALIAS_CRYPTO("blake2b-256-neon");
-MODULE_ALIAS_CRYPTO("blake2b-384");
-MODULE_ALIAS_CRYPTO("blake2b-384-neon");
-MODULE_ALIAS_CRYPTO("blake2b-512");
-MODULE_ALIAS_CRYPTO("blake2b-512-neon");

lib/crypto/Kconfig

@@ -37,6 +37,7 @@ config CRYPTO_LIB_BLAKE2B
 config CRYPTO_LIB_BLAKE2B_ARCH
 	bool
 	depends on CRYPTO_LIB_BLAKE2B && !UML
+	default y if ARM && KERNEL_MODE_NEON
 
 # BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option.

lib/crypto/Makefile

@@ -36,6 +36,7 @@ libblake2b-y := blake2b.o
 CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930
 ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y)
 CFLAGS_blake2b.o += -I$(src)/$(SRCARCH)
+libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o
 endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
 
 ################################################################################
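
The -I$(src)/$(SRCARCH) flag above is what makes the new arch header
reachable: the generic lib/crypto/blake2b.c includes it by its bare name
whenever CRYPTO_LIB_BLAKE2B_ARCH is enabled.  Roughly, per the lib/crypto
convention (a sketch by analogy with the other hashes; not part of this
diff):

	/* in lib/crypto/blake2b.c */
	#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
	#include "blake2b.h"	/* -> lib/crypto/arm/blake2b.h on ARM */
	#else
	#define blake2b_compress blake2b_compress_generic
	#endif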

lib/crypto/arm/blake2b-neon-core.S (renamed from arch/arm/crypto/blake2b-neon-core.S)

@@ -1,6 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
- * BLAKE2b digest algorithm, NEON accelerated
+ * BLAKE2b digest algorithm optimized with ARM NEON instructions.  On ARM
+ * processors that have NEON support but not the ARMv8 Crypto Extensions,
+ * typically this BLAKE2b implementation is much faster than the SHA-2 family
+ * and slightly faster than SHA-1.
  *
  * Copyright 2020 Google LLC
  *
@@ -13,8 +16,8 @@
 	.fpu		neon
 
 	// The arguments to blake2b_compress_neon()
-	STATE		.req	r0
-	BLOCK		.req	r1
+	CTX		.req	r0
+	DATA		.req	r1
 	NBLOCKS		.req	r2
 	INC		.req	r3
@@ -234,10 +237,10 @@
 .endm
 
 //
-// void blake2b_compress_neon(struct blake2b_state *state,
-//			      const u8 *block, size_t nblocks, u32 inc);
+// void blake2b_compress_neon(struct blake2b_ctx *ctx,
+//			      const u8 *data, size_t nblocks, u32 inc);
 //
-// Only the first three fields of struct blake2b_state are used:
+// Only the first three fields of struct blake2b_ctx are used:
 //	u64 h[8];	(inout)
 //	u64 t[2];	(inout)
 //	u64 f[2];	(in)
@@ -255,7 +258,7 @@ ENTRY(blake2b_compress_neon)
 	adr		ROR24_TABLE, .Lror24_table
 	adr		ROR16_TABLE, .Lror16_table
 
-	mov		ip, STATE
+	mov		ip, CTX
 	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
 	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
 .Lnext_block:
@@ -281,14 +284,14 @@ ENTRY(blake2b_compress_neon)
 	// (q8-q9) in an aligned buffer on the stack so that they can be
 	// reloaded when needed.  (We could just reload directly from the
 	// message buffer, but it's faster to use aligned loads.)
-	vld1.8		{q8-q9}, [BLOCK]!
+	vld1.8		{q8-q9}, [DATA]!
 	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
-	vld1.8		{q10-q11}, [BLOCK]!
+	vld1.8		{q10-q11}, [DATA]!
 	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
-	vld1.8		{q12-q13}, [BLOCK]!
+	vld1.8		{q12-q13}, [DATA]!
 	vst1.8		{q8-q9}, [sp, :256]
-	mov		ip, STATE
-	vld1.8		{q14-q15}, [BLOCK]!
+	mov		ip, CTX
+	vld1.8		{q14-q15}, [DATA]!
 
 	// Execute the rounds.  Each round is provided the order in which it
 	// needs to use the message words.
@@ -319,7 +322,7 @@ ENTRY(blake2b_compress_neon)
 	veor		q3, q3, q7	// v[6..7] ^= v[14..15]
 	veor		q0, q0, q8	// v[0..1] ^= h[0..1]
 	veor		q1, q1, q9	// v[2..3] ^= h[2..3]
-	mov		ip, STATE
+	mov		ip, CTX
 	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
 	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
 	veor		q2, q2, q10	// v[4..5] ^= h[4..5]
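
The comment block above pins down the only part of struct blake2b_ctx that
the assembly depends on.  As a C sketch (field names and in/out roles taken
from that comment; any fields past f[] are omitted since the assembly never
touches them):

	struct blake2b_ctx {
		u64 h[8];	/* chaining value (inout) */
		u64 t[2];	/* 128-bit message byte counter (inout) */
		u64 f[2];	/* finalization flags (in) */
		/* ... remaining fields unused by blake2b_compress_neon() */
	};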

lib/crypto/arm/blake2b.h (new file, 41 lines)

@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm, NEON accelerated
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx,
+				      const u8 *data, size_t nblocks, u32 inc);
+
+static void blake2b_compress(struct blake2b_ctx *ctx,
+			     const u8 *data, size_t nblocks, u32 inc)
+{
+	if (!static_branch_likely(&have_neon) || !may_use_simd()) {
+		blake2b_compress_generic(ctx, data, nblocks, inc);
+		return;
+	}
+
+	do {
+		const size_t blocks = min_t(size_t, nblocks,
+					    SZ_4K / BLAKE2B_BLOCK_SIZE);
+
+		kernel_neon_begin();
+		blake2b_compress_neon(ctx, data, blocks, inc);
+		kernel_neon_end();
+
+		data += blocks * BLAKE2B_BLOCK_SIZE;
+		nblocks -= blocks;
+	} while (nblocks);
+}
+
+#define blake2b_mod_init_arch blake2b_mod_init_arch
+static void blake2b_mod_init_arch(void)
+{
+	if (elf_hwcap & HWCAP_NEON)
+		static_branch_enable(&have_neon);
+}
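
Two details of this glue are worth spelling out.  First, the compress loop
wraps kernel_neon_begin()/kernel_neon_end() around at most
SZ_4K / BLAKE2B_BLOCK_SIZE = 4096 / 128 = 32 blocks at a time, bounding how
long preemption stays disabled when hashing large inputs.  Second, defining
blake2b_mod_init_arch as a macro that expands to itself lets the generic
module probe for the hook with #ifdef; the expected call site, sketched by
analogy with other lib/crypto hashes (not part of this diff), is roughly:

	static int __init blake2b_mod_init(void)
	{
	#ifdef blake2b_mod_init_arch
		blake2b_mod_init_arch();	/* flips the have_neon static key */
	#endif
		return 0;
	}
	module_init(blake2b_mod_init);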