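// Listing metadata (not part of the source):
// mirror of https://github.com/torvalds/linux.git; 292 lines, 8.3 KiB;
// tagged "ArmAsm" by the listing, although the code below uses x86-64
// registers and AT&T operand order.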
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>
|
|
|
|
// iv[0..7], stored as two little-endian 128-bit values: iv[0] is the least
// significant 32 bits of the first .octa, iv[4] of the second.  The functions
// below load the first .octa as v[8..11] and XOR the second with [t, f] to
// form v[12..15].
.section .rodata.cst32.iv, "aM", @progbits, 32
.align 32
.Liv:
	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
|
|
|
|
// pshufb control mask: within every 32-bit word, destination bytes 0,1,2,3
// take source bytes 2,3,0,1, i.e. each word is rotated right by 16 bits.
.section .rodata.cst16.ror16, "aM", @progbits, 16
.align 16
.Lror16:
	.octa 0x0D0C0F0E09080B0A0504070601000302
|
|
|
|
// pshufb control mask: within every 32-bit word, destination bytes 0,1,2,3
// take source bytes 1,2,3,0, i.e. each word is rotated right by 8 bits.
.section .rodata.cst16.ror8, "aM", @progbits, 16
.align 16
.Lror8:
	.octa 0x0C0F0E0D080B0A090407060500030201
|
|
|
|
// Message word schedule for blake2s_compress_ssse3: 10 rows of 16 byte-sized
// word indices, one row per round.  Each row is laid out in the order the
// round loop consumes it: bytes 0-3 and 4-7 select the words added in the two
// half-steps before the lane rotation, bytes 8-11 and 12-15 the words added in
// the two half-steps after it.  The loop ends when its row pointer reaches
// .Lsigma+160.
.section .rodata.cst64.sigma, "aM", @progbits, 160
.align 64
.Lsigma:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
|
|
|
|
// Message word schedule for blake2s_compress_avx512: 10 rows of 16 byte-sized
// indices, one row per round, used as vpermi2d indices into the 16 message
// words held in two %ymm registers.  That loop replaces the message registers
// with each round's permuted words, so every row after the first indexes the
// message as the previous round left it, not the original block.  The first
// row is identical to the first row of .Lsigma.
.section .rodata.cst64.sigma2, "aM", @progbits, 160
.align 64
.Lsigma2:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.byte 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.byte 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.byte  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.byte  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.byte  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.byte  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.byte 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.byte  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
|
|
|
|
// The four function arguments, in prototype order (ctx, data, nblocks, inc).
// INC lives in %ecx, and both functions later overwrite %rcx / %cl (the
// .Lsigma row pointer in the SSSE3 code, the round counter in the AVX-512
// code), so each copies INC into a vector register first.
#define CTX	%rdi
#define DATA	%rsi
#define NBLOCKS	%rdx
#define INC	%ecx
|
|
|
|
.text
|
|
//
// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
//			       const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Processes nblocks consecutive 64-byte blocks starting at data, adding inc
// to the 64-bit counter t before each one.  If nblocks is 0 the function
// returns without reading data or touching *ctx.
//
// Register roles (a, b, c, d name the four rows of the working state):
//	%xmm0		a = v[0..3]   (also carries h[0..3] between blocks)
//	%xmm1		b = v[4..7]   (also carries h[4..7] between blocks)
//	%xmm2		c = v[8..11]
//	%xmm3		d = v[12..15]
//	%xmm4-%xmm7	message words gathered for the current half-step
//	%xmm8		scratch for the rotates by 12 and 7
//	%xmm10, %xmm11	h[0..7] as it was when the current block started
//	%xmm12, %xmm13	pshufb masks .Lror16 and .Lror8
//	%xmm14		[t, f]
//	%xmm15		inc
//	%rcx		current row of .Lsigma (overwrites INC once it is in %xmm15)
//	%r8		end of .Lsigma
//	%eax		message word index read from .Lsigma
//
SYM_FUNC_START(blake2s_compress_ssse3)
	// The block loop below is do-while shaped: with nblocks == 0 the decq
	// would wrap around and the loop would keep going past the input.
	testq		NBLOCKS,NBLOCKS
	jz		.Lssse3_done

	movdqu		(CTX),%xmm0		// Load h[0..3]
	movdqu		16(CTX),%xmm1		// Load h[4..7]
	movdqa		.Lror16(%rip),%xmm12
	movdqa		.Lror8(%rip),%xmm13
	movdqu		32(CTX),%xmm14		// Load t and f
	movd		INC,%xmm15		// Load inc
	leaq		.Lsigma+160(%rip),%r8	// End of the 10 x 16-byte table
	jmp		.Lssse3_mainloop	// Skip the alignment padding

	.align		32
.Lssse3_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	movdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	movdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	paddq		%xmm15,%xmm14		// t += inc (64-bit addition)
	movdqa		.Liv(%rip),%xmm2	// v[8..11] = iv[0..3]
	movdqa		%xmm14,%xmm3
	pxor		.Liv+16(%rip),%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	leaq		.Lsigma(%rip),%rcx	// Start at the first round's row

.Lssse3_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).

	// Half-step 1: gather the words selected by row bytes 0..3 into %xmm4.
	movzbl		(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		1(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		2(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		3(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0		// a += m
	paddd		%xmm1,%xmm0		// a += b
	pxor		%xmm0,%xmm3		// d ^= a
	pshufb		%xmm12,%xmm3		// d = ror32(d, 16)
	paddd		%xmm3,%xmm2		// c += d
	pxor		%xmm2,%xmm1		// b ^= c
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1
	pslld		$20,%xmm8
	por		%xmm8,%xmm1		// b = ror32(b, 12)

	// Half-step 2: gather the words selected by row bytes 4..7 into %xmm5.
	movzbl		4(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		5(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		6(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		7(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0		// a += m
	paddd		%xmm1,%xmm0		// a += b
	pxor		%xmm0,%xmm3		// d ^= a
	pshufb		%xmm13,%xmm3		// d = ror32(d, 8)
	paddd		%xmm3,%xmm2		// c += d
	pxor		%xmm2,%xmm1		// b ^= c
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1
	pslld		$25,%xmm8
	por		%xmm8,%xmm1		// b = ror32(b, 7)

	// Rotate the lanes of a, d and c (b stays put) so that the next two
	// half-steps operate on the diagonals of the state.
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2

	// Half-step 3: gather the words selected by row bytes 8..11 into %xmm6.
	movzbl		8(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	movzbl		9(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		10(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		11(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0		// a += m
	paddd		%xmm1,%xmm0		// a += b
	pxor		%xmm0,%xmm3		// d ^= a
	pshufb		%xmm12,%xmm3		// d = ror32(d, 16)
	paddd		%xmm3,%xmm2		// c += d
	pxor		%xmm2,%xmm1		// b ^= c
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1
	pslld		$20,%xmm8
	por		%xmm8,%xmm1		// b = ror32(b, 12)

	// Half-step 4: gather the words selected by row bytes 12..15 into %xmm7.
	movzbl		12(%rcx),%eax
	movd		(DATA,%rax,4),%xmm7
	movzbl		13(%rcx),%eax
	movd		(DATA,%rax,4),%xmm4
	movzbl		14(%rcx),%eax
	movd		(DATA,%rax,4),%xmm5
	movzbl		15(%rcx),%eax
	movd		(DATA,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0		// a += m
	paddd		%xmm1,%xmm0		// a += b
	pxor		%xmm0,%xmm3		// d ^= a
	pshufb		%xmm13,%xmm3		// d = ror32(d, 8)
	paddd		%xmm3,%xmm2		// c += d
	pxor		%xmm2,%xmm1		// b ^= c
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1
	pslld		$25,%xmm8
	por		%xmm8,%xmm1		// b = ror32(b, 7)

	// Undo the lane rotation applied before half-step 3.
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2

	addq		$16,%rcx		// Next row of .Lsigma
	cmpq		%r8,%rcx
	jnz		.Lssse3_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$64,DATA
	decq		NBLOCKS
	jnz		.Lssse3_mainloop

	movdqu		%xmm0,(CTX)		// Store new h[0..3]
	movdqu		%xmm1,16(CTX)		// Store new h[4..7]
	movq		%xmm14,32(CTX)		// Store new t (f is unchanged)
.Lssse3_done:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
|
|
|
|
//
// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
//				const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Processes nblocks consecutive 64-byte blocks starting at data, adding inc
// to the 64-bit counter t before each one.  If nblocks is 0 the function
// returns without reading data or touching *ctx.
//
// Register roles (a, b, c, d name the four rows of the working state):
//	%xmm0		a = v[0..3]   (also carries h[0..3] between blocks)
//	%xmm1		b = v[4..7]   (also carries h[4..7] between blocks)
//	%xmm2		c = v[8..11]
//	%xmm3		d = v[12..15]
//	%xmm4		[t, f]
//	%xmm5		inc
//	%ymm6, %ymm7	the 16 message words, in the order the latest round left them
//	%ymm8, %ymm9	.Lsigma2 indices, replaced by this round's permuted words
//	%xmm10, %xmm11	h[0..7] as it was when the current block started
//	%xmm14, %xmm15	iv[0..3], iv[4..7]
//	%rax		current row of .Lsigma2
//	%cl		rounds remaining (overwrites INC once it is in %xmm5)
//
SYM_FUNC_START(blake2s_compress_avx512)
	// The block loop below is do-while shaped: with nblocks == 0 the decq
	// would wrap around and the loop would keep going past the input.
	// No vector register has been written yet, so return directly.
	testq		NBLOCKS,NBLOCKS
	jz		.Lavx512_done

	vmovdqu		(CTX),%xmm0		// Load h[0..3]
	vmovdqu		16(CTX),%xmm1		// Load h[4..7]
	vmovdqu		32(CTX),%xmm4		// Load t and f
	vmovd		INC,%xmm5		// Load inc
	vmovdqa		.Liv(%rip),%xmm14	// Load iv[0..3]
	vmovdqa		.Liv+16(%rip),%xmm15	// Load iv[4..7]
	jmp		.Lavx512_mainloop	// Skip the alignment padding

	.align		32
.Lavx512_mainloop:
	// Main loop: each iteration processes one 64-byte block.
	vmovdqa		%xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
	vmovdqa		%xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
	vpaddq		%xmm5,%xmm4,%xmm4	// t += inc (64-bit addition)
	vmovdqa		%xmm14,%xmm2		// v[8..11] = iv[0..3]
	vpxor		%xmm15,%xmm4,%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
	vmovdqu		(DATA),%ymm6		// Load first 8 data words
	vmovdqu		32(DATA),%ymm7		// Load second 8 data words
	addq		$64,DATA
	leaq		.Lsigma2(%rip),%rax	// Start at the first round's row
	movb		$10,%cl			// Set num rounds remaining

.Lavx512_roundloop:
	// Round loop: each iteration does 1 round (of 10 rounds total).

	// Widen this round's 16 index bytes to dwords, then permute the
	// message through them.  vpermi2d overwrites its index register with
	// the selected words, and those become the message for the next round.
	vpmovzxbd	(%rax),%ymm8
	vpmovzxbd	8(%rax),%ymm9
	addq		$16,%rax
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7

	// Half-step 1: message words from the low half of %ymm8.
	vpaddd		%xmm8,%xmm0,%xmm0	// a += m
	vpaddd		%xmm1,%xmm0,%xmm0	// a += b
	vpxor		%xmm0,%xmm3,%xmm3	// d ^= a
	vprord		$16,%xmm3,%xmm3		// d = ror32(d, 16)
	vpaddd		%xmm3,%xmm2,%xmm2	// c += d
	vpxor		%xmm2,%xmm1,%xmm1	// b ^= c
	vprord		$12,%xmm1,%xmm1		// b = ror32(b, 12)

	// Half-step 2: message words from the high half of %ymm8.
	vextracti128	$1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0	// a += m
	vpaddd		%xmm1,%xmm0,%xmm0	// a += b
	vpxor		%xmm0,%xmm3,%xmm3	// d ^= a
	vprord		$8,%xmm3,%xmm3		// d = ror32(d, 8)
	vpaddd		%xmm3,%xmm2,%xmm2	// c += d
	vpxor		%xmm2,%xmm1,%xmm1	// b ^= c
	vprord		$7,%xmm1,%xmm1		// b = ror32(b, 7)

	// Rotate the lanes of a, d and c (b stays put) so that the next two
	// half-steps operate on the diagonals of the state.
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2

	// Half-step 3: message words from the low half of %ymm9.
	vpaddd		%xmm9,%xmm0,%xmm0	// a += m
	vpaddd		%xmm1,%xmm0,%xmm0	// a += b
	vpxor		%xmm0,%xmm3,%xmm3	// d ^= a
	vprord		$16,%xmm3,%xmm3		// d = ror32(d, 16)
	vpaddd		%xmm3,%xmm2,%xmm2	// c += d
	vpxor		%xmm2,%xmm1,%xmm1	// b ^= c
	vprord		$12,%xmm1,%xmm1		// b = ror32(b, 12)

	// Half-step 4: message words from the high half of %ymm9.
	vextracti128	$1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0	// a += m
	vpaddd		%xmm1,%xmm0,%xmm0	// a += b
	vpxor		%xmm0,%xmm3,%xmm3	// d ^= a
	vprord		$8,%xmm3,%xmm3		// d = ror32(d, 8)
	vpaddd		%xmm3,%xmm2,%xmm2	// c += d
	vpxor		%xmm2,%xmm1,%xmm1	// b ^= c
	vprord		$7,%xmm1,%xmm1		// b = ror32(b, 7)

	// Undo the lane rotation applied before half-step 3.
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2

	decb		%cl
	jne		.Lavx512_roundloop

	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
	// (imm8 0x96 selects the three-input XOR of destination and sources)
	vpternlogd	$0x96,%xmm10,%xmm2,%xmm0
	vpternlogd	$0x96,%xmm11,%xmm3,%xmm1
	decq		NBLOCKS
	jne		.Lavx512_mainloop

	vmovdqu		%xmm0,(CTX)		// Store new h[0..3]
	vmovdqu		%xmm1,16(CTX)		// Store new h[4..7]
	vmovq		%xmm4,32(CTX)		// Store new t (f is unchanged)
	vzeroupper
.Lavx512_done:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
|