From e4e72063d3c0ee9ba10faeb5645dcdaae2d733e9 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:25 +0100 Subject: crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant Add a length argument to the single block function for SSSE3, so the block function may XOR only a partial length of the full block. Given that the setup code is rather cheap, the function does not process more than one block; this allows us to keep the block function selection in the C glue code. The required branching does not negatively affect performance for full block sizes. The partial XORing uses simple "rep movsb" to copy the data before and after doing XOR in SSE. This is rather efficient on modern processors; movsw can be slightly faster, but the additional complexity is probably not worth it. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-ssse3-x86_64.S | 74 ++++++++++++++++++++++++++------- arch/x86/crypto/chacha20_glue.c | 11 ++--- 2 files changed, 63 insertions(+), 22 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 512a2b500fd1..98d130b5e4ab 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -25,12 +25,13 @@ CTRINC: .octa 0x00000003000000020000000100000000 ENTRY(chacha20_block_xor_ssse3) # %rdi: Input state matrix, s - # %rsi: 1 data block output, o - # %rdx: 1 data block input, i + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes # This function encrypts one ChaCha20 block by loading the state matrix # in four SSE registers. It performs matrix operation on four words in - # parallel, but requireds shuffling to rearrange the words after each + # parallel, but requires shuffling to rearrange the words after each # round. 8/16-bit word rotation is done with the slightly better # performing SSSE3 byte shuffling, 7/12-bit word rotation uses # traditional shift+OR. 
@@ -48,7 +49,8 @@ ENTRY(chacha20_block_xor_ssse3) movdqa ROT8(%rip),%xmm4 movdqa ROT16(%rip),%xmm5 - mov $10,%ecx + mov %rcx,%rax + mov $10,%ecx .Ldoubleround: @@ -122,27 +124,69 @@ ENTRY(chacha20_block_xor_ssse3) jnz .Ldoubleround # o0 = i0 ^ (x0 + s0) - movdqu 0x00(%rdx),%xmm4 paddd %xmm8,%xmm0 + cmp $0x10,%rax + jl .Lxorpart + movdqu 0x00(%rdx),%xmm4 pxor %xmm4,%xmm0 movdqu %xmm0,0x00(%rsi) # o1 = i1 ^ (x1 + s1) - movdqu 0x10(%rdx),%xmm5 paddd %xmm9,%xmm1 - pxor %xmm5,%xmm1 - movdqu %xmm1,0x10(%rsi) + movdqa %xmm1,%xmm0 + cmp $0x20,%rax + jl .Lxorpart + movdqu 0x10(%rdx),%xmm0 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) # o2 = i2 ^ (x2 + s2) - movdqu 0x20(%rdx),%xmm6 paddd %xmm10,%xmm2 - pxor %xmm6,%xmm2 - movdqu %xmm2,0x20(%rsi) + movdqa %xmm2,%xmm0 + cmp $0x30,%rax + jl .Lxorpart + movdqu 0x20(%rdx),%xmm0 + pxor %xmm2,%xmm0 + movdqu %xmm0,0x20(%rsi) # o3 = i3 ^ (x3 + s3) - movdqu 0x30(%rdx),%xmm7 paddd %xmm11,%xmm3 - pxor %xmm7,%xmm3 - movdqu %xmm3,0x30(%rsi) - + movdqa %xmm3,%xmm0 + cmp $0x40,%rax + jl .Lxorpart + movdqu 0x30(%rdx),%xmm0 + pxor %xmm3,%xmm0 + movdqu %xmm0,0x30(%rsi) + +.Ldone: ret + +.Lxorpart: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone + ENDPROC(chacha20_block_xor_ssse3) ENTRY(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index dce7c5d39c2f..cc4571736ce8 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -19,7 +19,8 @@ #define CHACHA20_STATE_ALIGN 16 -asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); #ifdef CONFIG_AS_AVX2 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); @@ -29,8 +30,6 @@ static bool chacha20_use_avx2; static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { - u8 buf[CHACHA20_BLOCK_SIZE]; - #ifdef CONFIG_AS_AVX2 if (chacha20_use_avx2) { while (bytes >= CHACHA20_BLOCK_SIZE * 8) { @@ -50,16 +49,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += 4; } while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block_xor_ssse3(state, dst, src); + chacha20_block_xor_ssse3(state, dst, src, bytes); bytes -= CHACHA20_BLOCK_SIZE; src += CHACHA20_BLOCK_SIZE; dst += CHACHA20_BLOCK_SIZE; state[12]++; } if (bytes) { - memcpy(buf, src, bytes); - chacha20_block_xor_ssse3(state, buf, buf); - memcpy(dst, buf, bytes); + chacha20_block_xor_ssse3(state, dst, src, bytes); } } -- cgit v1.2.3 From db8e15a24957904d10f784a9adc4ea4824ee996c Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:26 +0100 Subject: crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant Add a length argument to the quad block function for SSSE3, so the block function may XOR only a partial length of four blocks. As we already have the stack set up, the partial XORing does not need to. This gives a slightly different function trailer, so we keep that separate from the 1-block function. 
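For readers who do not want to trace the SSE code, the .Lxorpart trailers added in this and the previous patch behave roughly like the following C sketch (illustrative only: the helper name is made up, the local buffer stands in for the aligned stack slot and the two "rep movsb" copies, and the keystream argument stands in for the lane held in %xmm0):

static void chacha_xor_partial_lane(u8 *dst, const u8 *src,
				    const u8 keystream[16], unsigned int len)
{
	unsigned int offset = len & ~0xfu;	/* start of the partial 16-byte lane */
	unsigned int rem = len & 0xfu;		/* 1..15 bytes left in that lane     */
	u8 buf[16] __aligned(16);
	unsigned int i;

	if (!rem)
		return;

	memcpy(buf, src + offset, rem);		/* "rep movsb" into the stack buffer  */
	for (i = 0; i < 16; i++)		/* full-register XOR, as done by pxor */
		buf[i] ^= keystream[i];
	memcpy(dst + offset, buf, rem);		/* "rep movsb" back to the output     */
}

The bytes beyond rem in buf hold keystream-XOR garbage after the pxor, but they are never copied out, which is why the partial path can always operate on a full 16-byte register.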
Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-ssse3-x86_64.S | 163 ++++++++++++++++++++++++-------- arch/x86/crypto/chacha20_glue.c | 5 +- 2 files changed, 128 insertions(+), 40 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 98d130b5e4ab..d8ac75bb448f 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -191,8 +191,9 @@ ENDPROC(chacha20_block_xor_ssse3) ENTRY(chacha20_4block_xor_ssse3) # %rdi: Input state matrix, s - # %rsi: 4 data blocks output, o - # %rdx: 4 data blocks input, i + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes # This function encrypts four consecutive ChaCha20 blocks by loading the # the state matrix in SSE registers four times. As we need some scratch @@ -207,6 +208,7 @@ ENTRY(chacha20_4block_xor_ssse3) lea 8(%rsp),%r10 sub $0x80,%rsp and $~63,%rsp + mov %rcx,%rax # x0..15[0-3] = s0..3[0..3] movq 0x00(%rdi),%xmm1 @@ -617,58 +619,143 @@ ENTRY(chacha20_4block_xor_ssse3) # xor with corresponding input, write to output movdqa 0x00(%rsp),%xmm0 + cmp $0x10,%rax + jl .Lxorpart4 movdqu 0x00(%rdx),%xmm1 pxor %xmm1,%xmm0 movdqu %xmm0,0x00(%rsi) - movdqa 0x10(%rsp),%xmm0 - movdqu 0x80(%rdx),%xmm1 + + movdqu %xmm4,%xmm0 + cmp $0x20,%rax + jl .Lxorpart4 + movdqu 0x10(%rdx),%xmm1 pxor %xmm1,%xmm0 - movdqu %xmm0,0x80(%rsi) + movdqu %xmm0,0x10(%rsi) + + movdqu %xmm8,%xmm0 + cmp $0x30,%rax + jl .Lxorpart4 + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x20(%rsi) + + movdqu %xmm12,%xmm0 + cmp $0x40,%rax + jl .Lxorpart4 + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x30(%rsi) + movdqa 0x20(%rsp),%xmm0 + cmp $0x50,%rax + jl .Lxorpart4 movdqu 0x40(%rdx),%xmm1 pxor %xmm1,%xmm0 movdqu %xmm0,0x40(%rsi) + + movdqu %xmm6,%xmm0 + cmp $0x60,%rax + jl .Lxorpart4 + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x50(%rsi) + + movdqu %xmm10,%xmm0 + cmp $0x70,%rax + jl .Lxorpart4 + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x60(%rsi) + + movdqu %xmm14,%xmm0 + cmp $0x80,%rax + jl .Lxorpart4 + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x70(%rsi) + + movdqa 0x10(%rsp),%xmm0 + cmp $0x90,%rax + jl .Lxorpart4 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + + movdqu %xmm5,%xmm0 + cmp $0xa0,%rax + jl .Lxorpart4 + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x90(%rsi) + + movdqu %xmm9,%xmm0 + cmp $0xb0,%rax + jl .Lxorpart4 + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xa0(%rsi) + + movdqu %xmm13,%xmm0 + cmp $0xc0,%rax + jl .Lxorpart4 + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xb0(%rsi) + movdqa 0x30(%rsp),%xmm0 + cmp $0xd0,%rax + jl .Lxorpart4 movdqu 0xc0(%rdx),%xmm1 pxor %xmm1,%xmm0 movdqu %xmm0,0xc0(%rsi) - movdqu 0x10(%rdx),%xmm1 - pxor %xmm1,%xmm4 - movdqu %xmm4,0x10(%rsi) - movdqu 0x90(%rdx),%xmm1 - pxor %xmm1,%xmm5 - movdqu %xmm5,0x90(%rsi) - movdqu 0x50(%rdx),%xmm1 - pxor %xmm1,%xmm6 - movdqu %xmm6,0x50(%rsi) + + movdqu %xmm7,%xmm0 + cmp $0xe0,%rax + jl .Lxorpart4 movdqu 0xd0(%rdx),%xmm1 - pxor %xmm1,%xmm7 - movdqu %xmm7,0xd0(%rsi) - movdqu 0x20(%rdx),%xmm1 - pxor %xmm1,%xmm8 - movdqu %xmm8,0x20(%rsi) - movdqu 0xa0(%rdx),%xmm1 - pxor %xmm1,%xmm9 - movdqu %xmm9,0xa0(%rsi) - movdqu 0x60(%rdx),%xmm1 - pxor %xmm1,%xmm10 - movdqu %xmm10,0x60(%rsi) + pxor %xmm1,%xmm0 + movdqu %xmm0,0xd0(%rsi) + + movdqu %xmm11,%xmm0 + cmp 
$0xf0,%rax + jl .Lxorpart4 movdqu 0xe0(%rdx),%xmm1 - pxor %xmm1,%xmm11 - movdqu %xmm11,0xe0(%rsi) - movdqu 0x30(%rdx),%xmm1 - pxor %xmm1,%xmm12 - movdqu %xmm12,0x30(%rsi) - movdqu 0xb0(%rdx),%xmm1 - pxor %xmm1,%xmm13 - movdqu %xmm13,0xb0(%rsi) - movdqu 0x70(%rdx),%xmm1 - pxor %xmm1,%xmm14 - movdqu %xmm14,0x70(%rsi) + pxor %xmm1,%xmm0 + movdqu %xmm0,0xe0(%rsi) + + movdqu %xmm15,%xmm0 + cmp $0x100,%rax + jl .Lxorpart4 movdqu 0xf0(%rdx),%xmm1 - pxor %xmm1,%xmm15 - movdqu %xmm15,0xf0(%rsi) + pxor %xmm1,%xmm0 + movdqu %xmm0,0xf0(%rsi) +.Ldone4: lea -8(%r10),%rsp ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone4 + ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index cc4571736ce8..8f1ef1a9ce5c 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -21,7 +21,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); -asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len); #ifdef CONFIG_AS_AVX2 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); static bool chacha20_use_avx2; @@ -42,7 +43,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, } #endif while (bytes >= CHACHA20_BLOCK_SIZE * 4) { - chacha20_4block_xor_ssse3(state, dst, src); + chacha20_4block_xor_ssse3(state, dst, src, bytes); bytes -= CHACHA20_BLOCK_SIZE * 4; src += CHACHA20_BLOCK_SIZE * 4; dst += CHACHA20_BLOCK_SIZE * 4; -- cgit v1.2.3 From c3b734dd325dadc73c2f5b4d187208730bf21df5 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:27 +0100 Subject: crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant Add a length argument to the eight block function for AVX2, so the block function may XOR only a partial length of eight blocks. To avoid unnecessary operations, we integrate XORing of the first four blocks in the final lane interleaving; this also avoids some work in the partial lengths path. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx2-x86_64.S | 189 +++++++++++++++++++++++---------- arch/x86/crypto/chacha20_glue.c | 5 +- 2 files changed, 133 insertions(+), 61 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index f3cd26f48332..7b62d55bee3d 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -30,8 +30,9 @@ CTRINC: .octa 0x00000003000000020000000100000000 ENTRY(chacha20_8block_xor_avx2) # %rdi: Input state matrix, s - # %rsi: 8 data blocks output, o - # %rdx: 8 data blocks input, i + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes # This function encrypts eight consecutive ChaCha20 blocks by loading # the state matrix in AVX registers eight times. 
As we need some @@ -48,6 +49,7 @@ ENTRY(chacha20_8block_xor_avx2) lea 8(%rsp),%r10 and $~31, %rsp sub $0x80, %rsp + mov %rcx,%rax # x0..15[0-7] = s[0..15] vpbroadcastd 0x00(%rdi),%ymm0 @@ -375,74 +377,143 @@ ENTRY(chacha20_8block_xor_avx2) vpunpckhqdq %ymm15,%ymm0,%ymm15 # interleave 128-bit words in state n, n+4 - vmovdqa 0x00(%rsp),%ymm0 - vperm2i128 $0x20,%ymm4,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vperm2i128 $0x20,%ymm5,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm5,%ymm0,%ymm5 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vperm2i128 $0x20,%ymm6,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm6,%ymm0,%ymm6 - vmovdqa %ymm1,0x40(%rsp) - vmovdqa 0x60(%rsp),%ymm0 - vperm2i128 $0x20,%ymm7,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm7,%ymm0,%ymm7 - vmovdqa %ymm1,0x60(%rsp) + # xor/write first four blocks + vmovdqa 0x00(%rsp),%ymm1 + vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 + cmp $0x0020,%rax + jl .Lxorpart8 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rax + jl .Lxorpart8 + vpxor 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0020(%rsi) vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - vmovdqa %ymm0,%ymm8 - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - vmovdqa %ymm0,%ymm9 + + vmovdqa 0x40(%rsp),%ymm1 + vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 + cmp $0x0060,%rax + jl .Lxorpart8 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rax + jl .Lxorpart8 + vpxor 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0060(%rsi) vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - vmovdqa %ymm0,%ymm10 - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - vmovdqa %ymm0,%ymm11 - # xor with corresponding input, write to output - vmovdqa 0x00(%rsp),%ymm0 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rax + jl .Lxorpart8 vpxor 0x0080(%rdx),%ymm0,%ymm0 vmovdqu %ymm0,0x0080(%rsi) - vmovdqa 0x40(%rsp),%ymm0 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vmovdqa 0x60(%rsp),%ymm0 + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rax + jl .Lxorpart8 + vpxor 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vmovdqa 0x60(%rsp),%ymm1 + vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 + cmp $0x00e0,%rax + jl .Lxorpart8 vpxor 0x00c0(%rdx),%ymm0,%ymm0 vmovdqu %ymm0,0x00c0(%rsi) - vpxor 0x0100(%rdx),%ymm4,%ymm4 - vmovdqu %ymm4,0x0100(%rsi) - vpxor 0x0180(%rdx),%ymm5,%ymm5 - vmovdqu %ymm5,0x00180(%rsi) - vpxor 0x0140(%rdx),%ymm6,%ymm6 - vmovdqu %ymm6,0x0140(%rsi) - vpxor 0x01c0(%rdx),%ymm7,%ymm7 - vmovdqu %ymm7,0x01c0(%rsi) - vpxor 0x0020(%rdx),%ymm8,%ymm8 - vmovdqu %ymm8,0x0020(%rsi) - vpxor 0x00a0(%rdx),%ymm9,%ymm9 - vmovdqu %ymm9,0x00a0(%rsi) - vpxor 0x0060(%rdx),%ymm10,%ymm10 - vmovdqu %ymm10,0x0060(%rsi) - vpxor 0x00e0(%rdx),%ymm11,%ymm11 - vmovdqu %ymm11,0x00e0(%rsi) - vpxor 0x0120(%rdx),%ymm12,%ymm12 - vmovdqu %ymm12,0x0120(%rsi) - vpxor 0x01a0(%rdx),%ymm13,%ymm13 - vmovdqu %ymm13,0x01a0(%rsi) - vpxor 0x0160(%rdx),%ymm14,%ymm14 - vmovdqu %ymm14,0x0160(%rsi) - vpxor 0x01e0(%rdx),%ymm15,%ymm15 - vmovdqu %ymm15,0x01e0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rax + jl .Lxorpart8 + vpxor 
0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa %ymm4,%ymm0 + cmp $0x0120,%rax + jl .Lxorpart8 + vpxor 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0100(%rsi) + vmovdqa %ymm12,%ymm0 + cmp $0x0140,%rax + jl .Lxorpart8 + vpxor 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0120(%rsi) + + vmovdqa %ymm6,%ymm0 + cmp $0x0160,%rax + jl .Lxorpart8 + vpxor 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0140(%rsi) + + vmovdqa %ymm14,%ymm0 + cmp $0x0180,%rax + jl .Lxorpart8 + vpxor 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0160(%rsi) + + vmovdqa %ymm5,%ymm0 + cmp $0x01a0,%rax + jl .Lxorpart8 + vpxor 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0180(%rsi) + + vmovdqa %ymm13,%ymm0 + cmp $0x01c0,%rax + jl .Lxorpart8 + vpxor 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01a0(%rsi) + + vmovdqa %ymm7,%ymm0 + cmp $0x01e0,%rax + jl .Lxorpart8 + vpxor 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01c0(%rsi) + + vmovdqa %ymm15,%ymm0 + cmp $0x0200,%rax + jl .Lxorpart8 + vpxor 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01e0(%rsi) + +.Ldone8: vzeroupper lea -8(%r10),%rsp ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x1f,%r9 + jz .Ldone8 + and $~0x1f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone8 + ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 8f1ef1a9ce5c..882e8bf5965a 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -24,7 +24,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); #ifdef CONFIG_AS_AVX2 -asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len); static bool chacha20_use_avx2; #endif @@ -34,7 +35,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, #ifdef CONFIG_AS_AVX2 if (chacha20_use_avx2) { while (bytes >= CHACHA20_BLOCK_SIZE * 8) { - chacha20_8block_xor_avx2(state, dst, src); + chacha20_8block_xor_avx2(state, dst, src, bytes); bytes -= CHACHA20_BLOCK_SIZE * 8; src += CHACHA20_BLOCK_SIZE * 8; dst += CHACHA20_BLOCK_SIZE * 8; -- cgit v1.2.3 From 9b17608f15b940babe2e32522ea29787abd10af2 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:28 +0100 Subject: crypto: x86/chacha20 - Use larger block functions more aggressively Now that all block functions support partial lengths, engage the wider block sizes more aggressively. This prevents using smaller block functions multiple times, where the next larger block function would have been faster. 
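To make the new dispatch concrete, here is a worked example of the counter bookkeeping done by the chacha20_advance() helper added below (an illustrative comment block only, using the existing 64-byte CHACHA20_BLOCK_SIZE):

/*
 * A 300-byte request is handed to the 8-block AVX2 function in one call.
 * It emits ceil(300 / 64) = 5 blocks of keystream, so the block counter
 * in state[12] must advance by 5 even though the fifth block is only
 * partially consumed:
 *
 *   chacha20_advance(300, 8)
 *     = round_up(min(300, 8 * 64), 64) / 64
 *     = round_up(300, 64) / 64
 *     = 320 / 64
 *     = 5
 */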
Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20_glue.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 882e8bf5965a..b541da71f11e 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -29,6 +29,12 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, static bool chacha20_use_avx2; #endif +static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA20_BLOCK_SIZE); + return round_up(len, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE; +} + static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { @@ -41,6 +47,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, dst += CHACHA20_BLOCK_SIZE * 8; state[12] += 8; } + if (bytes > CHACHA20_BLOCK_SIZE * 4) { + chacha20_8block_xor_avx2(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 8); + return; + } } #endif while (bytes >= CHACHA20_BLOCK_SIZE * 4) { @@ -50,15 +61,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, dst += CHACHA20_BLOCK_SIZE * 4; state[12] += 4; } - while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block_xor_ssse3(state, dst, src, bytes); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; - state[12]++; + if (bytes > CHACHA20_BLOCK_SIZE) { + chacha20_4block_xor_ssse3(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 4); + return; } if (bytes) { chacha20_block_xor_ssse3(state, dst, src, bytes); + state[12]++; } } @@ -82,17 +92,16 @@ static int chacha20_simd(struct skcipher_request *req) kernel_fpu_begin(); - while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); - err = skcipher_walk_done(&walk, - walk.nbytes % CHACHA20_BLOCK_SIZE); - } + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); - if (walk.nbytes) { chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes); - err = skcipher_walk_done(&walk, 0); + nbytes); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } kernel_fpu_end(); -- cgit v1.2.3 From a5dd97f86211e91219807db607d740f9896b8e0b Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:29 +0100 Subject: crypto: x86/chacha20 - Add a 2-block AVX2 variant This variant uses the same principle as the single block SSSE3 variant by shuffling the state matrix after each round. With the wider AVX registers, we can do two blocks in parallel, though. This function can increase performance and efficiency significantly for lengths that would otherwise require a 4-block function. 
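The "shuffle after each round" principle shared by the 1-block SSSE3 code and the new 2-block AVX2 variant can be written in plain C as follows (a sketch for illustration: each state row is a 4-word array standing in for one register lane group, and rol32() is the usual 32-bit left rotate):

static void rotate_row(u32 row[4], unsigned int n)	/* what (v)pshufd does */
{
	u32 tmp[4] = { row[n & 3], row[(n + 1) & 3],
		       row[(n + 2) & 3], row[(n + 3) & 3] };

	row[0] = tmp[0]; row[1] = tmp[1]; row[2] = tmp[2]; row[3] = tmp[3];
}

static void chacha_doubleround_rows(u32 x[4][4])
{
	int i;

	for (i = 0; i < 4; i++) {	/* column round, four columns in parallel */
		x[0][i] += x[1][i]; x[3][i] = rol32(x[3][i] ^ x[0][i], 16);
		x[2][i] += x[3][i]; x[1][i] = rol32(x[1][i] ^ x[2][i], 12);
		x[0][i] += x[1][i]; x[3][i] = rol32(x[3][i] ^ x[0][i], 8);
		x[2][i] += x[3][i]; x[1][i] = rol32(x[1][i] ^ x[2][i], 7);
	}

	rotate_row(x[1], 1);		/* MASK(0, 3, 2, 1) */
	rotate_row(x[2], 2);		/* MASK(1, 0, 3, 2) */
	rotate_row(x[3], 3);		/* MASK(2, 1, 0, 3) */

	for (i = 0; i < 4; i++) {	/* diagonal round, same lane-wise operations */
		x[0][i] += x[1][i]; x[3][i] = rol32(x[3][i] ^ x[0][i], 16);
		x[2][i] += x[3][i]; x[1][i] = rol32(x[1][i] ^ x[2][i], 12);
		x[0][i] += x[1][i]; x[3][i] = rol32(x[3][i] ^ x[0][i], 8);
		x[2][i] += x[3][i]; x[1][i] = rol32(x[1][i] ^ x[2][i], 7);
	}

	rotate_row(x[1], 3);		/* undo the shuffles for the next round */
	rotate_row(x[2], 2);
	rotate_row(x[3], 1);
}

The 2-block variant simply keeps two such 4x4 states side by side in the 256-bit ymm registers, so the very same instruction sequence advances both blocks at once.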
Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx2-x86_64.S | 197 +++++++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 7 ++ 2 files changed, 204 insertions(+) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index 7b62d55bee3d..8247076b0ba7 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -26,8 +26,205 @@ ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 CTRINC: .octa 0x00000003000000020000000100000000 .octa 0x00000007000000060000000500000004 +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + .text +ENTRY(chacha20_2block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts two ChaCha20 blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + vmovdqa ROT8(%rip),%ymm4 + vmovdqa ROT16(%rip),%ymm5 + + mov %rcx,%rax + mov $10,%ecx + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + dec %ecx + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rax + jl .Lxorpart2 + vpxor 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd 
%ymm9,%ymm1,%ymm7 + cmp $0x20,%rax + jl .Lxorpart2 + vpxor 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rax + jl .Lxorpart2 + vpxor 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rax + jl .Lxorpart2 + vpxor 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rax + jl .Lxorpart2 + vpxor 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rax + jl .Lxorpart2 + vpxor 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rax + jl .Lxorpart2 + vpxor 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rax + jl .Lxorpart2 + vpxor 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone2 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm7,%xmm7 + vmovdqa %xmm7,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone2 + +ENDPROC(chacha20_2block_xor_avx2) + ENTRY(chacha20_8block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index b541da71f11e..82e46589a189 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -24,6 +24,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); #ifdef CONFIG_AS_AVX2 +asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx2; @@ -52,6 +54,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += chacha20_advance(bytes, 8); return; } + if (bytes > CHACHA20_BLOCK_SIZE) { + chacha20_2block_xor_avx2(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 2); + return; + } } #endif while (bytes >= CHACHA20_BLOCK_SIZE * 4) { -- cgit v1.2.3 From 8a5a79d5556b822143b4403fc46068d4eef2e4e2 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Sun, 11 Nov 2018 10:36:30 +0100 Subject: crypto: x86/chacha20 - Add a 4-block AVX2 variant This variant builds upon the idea of the 2-block AVX2 variant that shuffles words after each round. The shuffling has a rather high latency, so the arithmetic units are not optimally used. Given that we have plenty of registers in AVX, this version parallelizes the 2-block variant to do four blocks. While the first two blocks are shuffling, the CPU can do the XORing on the second two blocks and vice-versa, which makes this version much faster than the SSSE3 variant for four blocks. The latter is now mostly for systems that do not have AVX2, but there it is the work-horse, so we keep it in place. The partial XORing function trailer is very similar to the AVX2 2-block variant. 
While it could be shared, that code segment is rather short; profiling is also easier with the trailer integrated, so we keep it per function. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx2-x86_64.S | 310 +++++++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 7 + 2 files changed, 317 insertions(+) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index 8247076b0ba7..b6ab082be657 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -31,6 +31,11 @@ CTRINC: .octa 0x00000003000000020000000100000000 CTR2BL: .octa 0x00000000000000000000000000000000 .octa 0x00000000000000000000000000000001 +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + .text ENTRY(chacha20_2block_xor_avx2) @@ -225,6 +230,311 @@ ENTRY(chacha20_2block_xor_avx2) ENDPROC(chacha20_2block_xor_avx2) +ENTRY(chacha20_4block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts four ChaCha20 block by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + vmovdqa ROT8(%rip),%ymm8 + vmovdqa ROT16(%rip),%ymm9 + + mov %rcx,%rax + mov $10,%ecx + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + 
vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + dec %ecx + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rax + jl .Lxorpart4 + vpxor 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rax + jl .Lxorpart4 + vpxor 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rax + jl .Lxorpart4 + vpxor 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rax + jl .Lxorpart4 + vpxor 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rax + jl .Lxorpart4 + vpxor 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rax + jl .Lxorpart4 + vpxor 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rax + jl .Lxorpart4 + vpxor 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rax + jl .Lxorpart4 + vpxor 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rax + jl .Lxorpart4 + vpxor 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rax + jl .Lxorpart4 + vpxor 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rax + jl .Lxorpart4 + vpxor 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rax + jl .Lxorpart4 + vpxor 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa 
%xmm4,%xmm10 + cmp $0xd0,%rax + jl .Lxorpart4 + vpxor 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rax + jl .Lxorpart4 + vpxor 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rax + jl .Lxorpart4 + vpxor 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rax + jl .Lxorpart4 + vpxor 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm10,%xmm10 + vmovdqa %xmm10,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone4 + +ENDPROC(chacha20_4block_xor_avx2) + ENTRY(chacha20_8block_xor_avx2) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 82e46589a189..9fd84fe6ec09 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -26,6 +26,8 @@ asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, #ifdef CONFIG_AS_AVX2 asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); +asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx2; @@ -54,6 +56,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += chacha20_advance(bytes, 8); return; } + if (bytes > CHACHA20_BLOCK_SIZE * 2) { + chacha20_4block_xor_avx2(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 4); + return; + } if (bytes > CHACHA20_BLOCK_SIZE) { chacha20_2block_xor_avx2(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 2); -- cgit v1.2.3 From 1ca1b917940c24ca3d1f490118c5474168622953 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:21 -0800 Subject: crypto: chacha20-generic - refactor to allow varying number of rounds In preparation for adding XChaCha12 support, rename/refactor chacha20-generic to support different numbers of rounds. The justification for needing XChaCha12 support is explained in more detail in the patch "crypto: chacha - add XChaCha12 support". The only difference between ChaCha{8,12,20} are the number of rounds itself; all other parts of the algorithm are the same. Therefore, remove the "20" from all definitions, structures, functions, files, etc. that will be shared by all ChaCha versions. Also make ->setkey() store the round count in the chacha_ctx (previously chacha20_ctx). The generic code then passes the round count through to chacha_block(). There will be a ->setkey() function for each explicitly allowed round count; the encrypt/decrypt functions will be the same. I decided not to do it the opposite way (same ->setkey() function for all round counts, with different encrypt/decrypt functions) because that would have required more boilerplate code in architecture-specific implementations of ChaCha and XChaCha. 
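As a rough sketch of what the refactor amounts to on the generic side (names and helpers here are illustrative, not copied from the patch):

struct chacha_ctx {			/* previously chacha20_ctx */
	u32 key[8];
	int nrounds;			/* 20 for ChaCha20, 12 for ChaCha12 */
};

static int chacha_setkey_sketch(struct crypto_skcipher *tfm, const u8 *key,
				unsigned int keysize, int nrounds)
{
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
	int i;

	if (keysize != CHACHA_KEY_SIZE)
		return -EINVAL;

	for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
		ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));

	ctx->nrounds = nrounds;
	return 0;
}

static int chacha20_setkey_sketch(struct crypto_skcipher *tfm, const u8 *key,
				  unsigned int keysize)
{
	return chacha_setkey_sketch(tfm, key, keysize, 20);
}

The encrypt/decrypt paths stay identical and simply pass ctx->nrounds down to chacha_block(), which is what allows reduced-round variants to be added later without duplicating the skcipher walk logic.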
Reviewed-by: Ard Biesheuvel Acked-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20_glue.c | 48 ++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 9fd84fe6ec09..1e9e66509226 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include #include #include @@ -35,8 +35,8 @@ static bool chacha20_use_avx2; static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks) { - len = min(len, maxblocks * CHACHA20_BLOCK_SIZE); - return round_up(len, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE; + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; } static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, @@ -44,38 +44,38 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, { #ifdef CONFIG_AS_AVX2 if (chacha20_use_avx2) { - while (bytes >= CHACHA20_BLOCK_SIZE * 8) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha20_8block_xor_avx2(state, dst, src, bytes); - bytes -= CHACHA20_BLOCK_SIZE * 8; - src += CHACHA20_BLOCK_SIZE * 8; - dst += CHACHA20_BLOCK_SIZE * 8; + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; state[12] += 8; } - if (bytes > CHACHA20_BLOCK_SIZE * 4) { + if (bytes > CHACHA_BLOCK_SIZE * 4) { chacha20_8block_xor_avx2(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 8); return; } - if (bytes > CHACHA20_BLOCK_SIZE * 2) { + if (bytes > CHACHA_BLOCK_SIZE * 2) { chacha20_4block_xor_avx2(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 4); return; } - if (bytes > CHACHA20_BLOCK_SIZE) { + if (bytes > CHACHA_BLOCK_SIZE) { chacha20_2block_xor_avx2(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 2); return; } } #endif - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + while (bytes >= CHACHA_BLOCK_SIZE * 4) { chacha20_4block_xor_ssse3(state, dst, src, bytes); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; state[12] += 4; } - if (bytes > CHACHA20_BLOCK_SIZE) { + if (bytes > CHACHA_BLOCK_SIZE) { chacha20_4block_xor_ssse3(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 4); return; @@ -89,7 +89,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, static int chacha20_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); u32 *state, state_buf[16 + 2] __aligned(8); struct skcipher_walk walk; int err; @@ -97,12 +97,12 @@ static int chacha20_simd(struct skcipher_request *req) BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); - if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha20_crypt(req); + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha_crypt(req); err = skcipher_walk_virt(&walk, req, true); - crypto_chacha20_init(state, ctx, walk.iv); + crypto_chacha_init(state, ctx, walk.iv); kernel_fpu_begin(); @@ -128,13 +128,13 @@ static struct skcipher_alg alg = { .base.cra_driver_name = "chacha20-simd", 
.base.cra_priority = 300, .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .chunksize = CHACHA20_BLOCK_SIZE, + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, .setkey = crypto_chacha20_setkey, .encrypt = chacha20_simd, .decrypt = chacha20_simd, -- cgit v1.2.3 From 878afc35cd28bcd93cd3c5e1985ef39a104a4d45 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Nov 2018 17:26:27 -0800 Subject: crypto: poly1305 - use structures for key and accumulator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for exposing a low-level Poly1305 API which implements the ε-almost-∆-universal (εA∆U) hash function underlying the Poly1305 MAC and supports block-aligned inputs only, create structures poly1305_key and poly1305_state which hold the limbs of the Poly1305 "r" key and accumulator, respectively. These structures could actually have the same type (e.g. poly1305_val), but different types are preferable, to prevent misuse. Acked-by: Martin Willi Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/x86/crypto/poly1305_glue.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index f012b7e28ad1..88cc01506c84 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -83,35 +83,37 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { if (unlikely(!sctx->wset)) { if (!sctx->uset) { - memcpy(sctx->u, dctx->r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r); + memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r.r); sctx->uset = true; } memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 5, dctx->r); + poly1305_simd_mult(sctx->u + 5, dctx->r.r); memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 10, dctx->r); + poly1305_simd_mult(sctx->u + 10, dctx->r.r); sctx->wset = true; } blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u); + poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks, + sctx->u); src += POLY1305_BLOCK_SIZE * 4 * blocks; srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; } #endif if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { if (unlikely(!sctx->uset)) { - memcpy(sctx->u, dctx->r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r); + memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r.r); sctx->uset = true; } blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u); + poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks, + sctx->u); src += POLY1305_BLOCK_SIZE * 2 * blocks; srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; } if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_block_sse2(dctx->h, src, dctx->r, 1); + poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1); srclen -= POLY1305_BLOCK_SIZE; } return srclen; -- cgit v1.2.3 From cee7a36ecb5bafef8c87fb2c10641e6125044154 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 20 Nov 2018 17:30:48 +0100 
Subject: crypto: x86/chacha20 - Add a 8-block AVX-512VL variant This variant is similar to the AVX2 version, but benefits from the AVX-512 rotate instructions and the additional registers, so it can operate without any data on the stack. It uses ymm registers only to avoid the massive core throttling on Skylake-X platforms. Nontheless does it bring a ~30% speed improvement compared to the AVX2 variant for random encryption lengths. The AVX2 version uses "rep movsb" for partial block XORing via the stack. With AVX-512, the new "vmovdqu8" can do this much more efficiently. The associated "kmov" instructions to work with dynamic masks is not part of the AVX-512VL instruction set, hence we depend on AVX-512BW as well. Given that the major AVX-512VL architectures provide AVX-512BW and this extension does not affect core clocking, this seems to be no problem at least for now. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 5 + arch/x86/crypto/chacha20-avx512vl-x86_64.S | 396 +++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 26 ++ 3 files changed, 427 insertions(+) create mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index a4b0007a54e1..ce4e43642984 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) +avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no) sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) @@ -103,6 +104,10 @@ ifeq ($(avx2_supported),yes) morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o endif +ifeq ($(avx512_supported),yes) + chacha20-x86_64-y += chacha20-avx512vl-x86_64.o +endif + aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S new file mode 100644 index 000000000000..e1877afcaa73 --- /dev/null +++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S @@ -0,0 +1,396 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions + * + * Copyright (C) 2018 Martin Willi + */ + +#include + +.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 +.align 32 +CTR8BL: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +ENTRY(chacha20_8block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts eight consecutive ChaCha20 blocks by loading + # the state matrix in AVX registers eight times. Compared to AVX2, this + # mostly benefits from the new rotate instructions in VL and the + # additional registers. 
+ + vzeroupper + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + + # x12 += counter values 0-3 + vpaddd CTR8BL(%rip),%ymm12,%ymm12 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + + mov $10,%eax + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + + # x10 += x15, x5 
= rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + + dec %eax + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + # interleave 32-bit words in state n, n+1 + vpunpckldq %ymm1,%ymm0,%ymm16 + vpunpckhdq %ymm1,%ymm0,%ymm17 + vpunpckldq %ymm3,%ymm2,%ymm18 + vpunpckhdq %ymm3,%ymm2,%ymm19 + vpunpckldq %ymm5,%ymm4,%ymm20 + vpunpckhdq %ymm5,%ymm4,%ymm21 + vpunpckldq %ymm7,%ymm6,%ymm22 + vpunpckhdq %ymm7,%ymm6,%ymm23 + vpunpckldq %ymm9,%ymm8,%ymm24 + vpunpckhdq %ymm9,%ymm8,%ymm25 + vpunpckldq %ymm11,%ymm10,%ymm26 + vpunpckhdq %ymm11,%ymm10,%ymm27 + vpunpckldq %ymm13,%ymm12,%ymm28 + vpunpckhdq %ymm13,%ymm12,%ymm29 + vpunpckldq %ymm15,%ymm14,%ymm30 + vpunpckhdq %ymm15,%ymm14,%ymm31 + + # interleave 64-bit words in state n, n+2 + vpunpcklqdq %ymm18,%ymm16,%ymm0 + vpunpcklqdq %ymm19,%ymm17,%ymm1 + vpunpckhqdq %ymm18,%ymm16,%ymm2 + vpunpckhqdq %ymm19,%ymm17,%ymm3 + vpunpcklqdq %ymm22,%ymm20,%ymm4 + vpunpcklqdq %ymm23,%ymm21,%ymm5 + vpunpckhqdq %ymm22,%ymm20,%ymm6 + vpunpckhqdq %ymm23,%ymm21,%ymm7 + vpunpcklqdq %ymm26,%ymm24,%ymm8 + vpunpcklqdq %ymm27,%ymm25,%ymm9 + vpunpckhqdq %ymm26,%ymm24,%ymm10 + vpunpckhqdq %ymm27,%ymm25,%ymm11 + vpunpcklqdq %ymm30,%ymm28,%ymm12 + vpunpcklqdq %ymm31,%ymm29,%ymm13 + vpunpckhqdq %ymm30,%ymm28,%ymm14 + vpunpckhqdq %ymm31,%ymm29,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa64 %ymm0,%ymm16 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 + cmp $0x0020,%rcx + jl .Lxorpart8 + vpxord 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0000(%rsi) + vmovdqa64 %ymm16,%ymm0 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rcx + jl .Lxorpart8 + vpxord 0x0020(%rdx),%ymm0,%ymm0 + 
vmovdqu64 %ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 + cmp $0x0060,%rcx + jl .Lxorpart8 + vpxord 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rcx + jl .Lxorpart8 + vpxord 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rcx + jl .Lxorpart8 + vpxord 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rcx + jl .Lxorpart8 + vpxord 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 + cmp $0x00e0,%rcx + jl .Lxorpart8 + vpxord 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rcx + jl .Lxorpart8 + vpxord 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa64 %ymm4,%ymm0 + cmp $0x0120,%rcx + jl .Lxorpart8 + vpxord 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0100(%rsi) + + vmovdqa64 %ymm12,%ymm0 + cmp $0x0140,%rcx + jl .Lxorpart8 + vpxord 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0120(%rsi) + + vmovdqa64 %ymm6,%ymm0 + cmp $0x0160,%rcx + jl .Lxorpart8 + vpxord 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0140(%rsi) + + vmovdqa64 %ymm14,%ymm0 + cmp $0x0180,%rcx + jl .Lxorpart8 + vpxord 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0160(%rsi) + + vmovdqa64 %ymm5,%ymm0 + cmp $0x01a0,%rcx + jl .Lxorpart8 + vpxord 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0180(%rsi) + + vmovdqa64 %ymm13,%ymm0 + cmp $0x01c0,%rcx + jl .Lxorpart8 + vpxord 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01a0(%rsi) + + vmovdqa64 %ymm7,%ymm0 + cmp $0x01e0,%rcx + jl .Lxorpart8 + vpxord 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01c0(%rsi) + + vmovdqa64 %ymm15,%ymm0 + cmp $0x0200,%rcx + jl .Lxorpart8 + vpxord 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0x1f,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0x1f,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} + vpxord %ymm0,%ymm1,%ymm1 + vmovdqu8 %ymm1,(%rsi,%r9){%k1} + + jmp .Ldone8 + +ENDPROC(chacha20_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 1e9e66509226..6a67e70bc82a 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -31,6 +31,11 @@ asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx2; +#ifdef CONFIG_AS_AVX512 +asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len); +static bool chacha20_use_avx512vl; +#endif #endif static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks) @@ -43,6 +48,22 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { #ifdef CONFIG_AS_AVX2 +#ifdef CONFIG_AS_AVX512 + if (chacha20_use_avx512vl) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha20_8block_xor_avx512vl(state, dst, src, bytes); 
+ bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha20_8block_xor_avx512vl(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 8); + return; + } + } +#endif if (chacha20_use_avx2) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha20_8block_xor_avx2(state, dst, src, bytes); @@ -149,6 +170,11 @@ static int __init chacha20_simd_mod_init(void) chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); +#ifdef CONFIG_AS_AVX512 + chacha20_use_avx512vl = chacha20_use_avx2 && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ +#endif #endif return crypto_register_skcipher(&alg); } -- cgit v1.2.3 From 29a47b54e030efe308aa90e6c26a9ce7f5f84ed8 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 20 Nov 2018 17:30:49 +0100 Subject: crypto: x86/chacha20 - Add a 2-block AVX-512VL variant This version uses the same principle as the AVX2 version. It benefits from the AVX-512VL rotate instructions and the more efficient partial block handling using "vmovdqu8", resulting in a speedup of ~20%. Unlike the AVX2 version, it is faster than the single block SSSE3 version to process a single block. Hence we engage that function for (partial) single block lengths as well. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx512vl-x86_64.S | 171 +++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 7 ++ 2 files changed, 178 insertions(+) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S index e1877afcaa73..261097578715 100644 --- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S +++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S @@ -7,6 +7,11 @@ #include +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + .section .rodata.cst32.CTR8BL, "aM", @progbits, 32 .align 32 CTR8BL: .octa 0x00000003000000020000000100000000 @@ -14,6 +19,172 @@ CTR8BL: .octa 0x00000003000000020000000100000000 .text +ENTRY(chacha20_2block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts two ChaCha20 blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. 
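+	#
+	# For illustration, one add/xor/rotate step below needs only three
+	# instructions thanks to the AVX-512VL rotate:
+	#	vpaddd	%ymm1,%ymm0,%ymm0
+	#	vpxord	%ymm0,%ymm3,%ymm3
+	#	vprold	$16,%ymm3,%ymm3
+	# whereas the AVX2 variant has to emulate the rotate with vpshufb
+	# (16/8-bit) or shift+or (12/7-bit).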
+ + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + mov $10,%rax + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + dec %rax + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rcx + jl .Lxorpart2 + vpxord 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rcx + jl .Lxorpart2 + vpxord 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rcx + jl .Lxorpart2 + vpxord 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rcx + jl .Lxorpart2 + vpxord 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rcx + jl .Lxorpart2 + vpxord 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rcx + jl .Lxorpart2 + vpxord 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rcx + jl .Lxorpart2 + vpxord 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rcx + jl .Lxorpart2 + vpxord 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm7,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone2 + +ENDPROC(chacha20_2block_xor_avx512vl) + ENTRY(chacha20_8block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 6a67e70bc82a..d6a95a6a324e 
100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -32,6 +32,8 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx2; #ifdef CONFIG_AS_AVX512 +asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx512vl; @@ -62,6 +64,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += chacha20_advance(bytes, 8); return; } + if (bytes) { + chacha20_2block_xor_avx512vl(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 2); + return; + } } #endif if (chacha20_use_avx2) { -- cgit v1.2.3 From 180def6c4ad139ae6f97953ae810092ace295d5b Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 20 Nov 2018 17:30:50 +0100 Subject: crypto: x86/chacha20 - Add a 4-block AVX-512VL variant This version uses the same principle as the AVX2 version by scheduling the operations for two block pairs in parallel. It benefits from the AVX-512VL rotate instructions and the more efficient partial block handling using "vmovdqu8", resulting in a speedup of the raw block function of ~20%. Signed-off-by: Martin Willi Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-avx512vl-x86_64.S | 272 +++++++++++++++++++++++++++++ arch/x86/crypto/chacha20_glue.c | 7 + 2 files changed, 279 insertions(+) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S index 261097578715..55d34de29e3e 100644 --- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S +++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S @@ -12,6 +12,11 @@ CTR2BL: .octa 0x00000000000000000000000000000000 .octa 0x00000000000000000000000000000001 +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + .section .rodata.cst32.CTR8BL, "aM", @progbits, 32 .align 32 CTR8BL: .octa 0x00000003000000020000000100000000 @@ -185,6 +190,273 @@ ENTRY(chacha20_2block_xor_avx512vl) ENDPROC(chacha20_2block_xor_avx512vl) +ENTRY(chacha20_4block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + + # This function encrypts four ChaCha20 block by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. 
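+	#
+	# Register layout used below: blocks 0-1 are kept in %ymm0-3 (block
+	# counters from CTR2BL), blocks 2-3 in %ymm4-7 (counters from CTR4BL).
+	# Each quarter-round step is issued for both matrix pairs back to
+	# back, so the shuffle latency of one pair overlaps with work on the
+	# other.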
+ + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + mov $10,%rax + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + dec %rax + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rcx + jl .Lxorpart4 + vpxord 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rcx + jl .Lxorpart4 + vpxord 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rcx + jl .Lxorpart4 + vpxord 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rcx + jl .Lxorpart4 + vpxord 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rcx + jl 
.Lxorpart4 + vpxord 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rcx + jl .Lxorpart4 + vpxord 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rcx + jl .Lxorpart4 + vpxord 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rcx + jl .Lxorpart4 + vpxord 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rcx + jl .Lxorpart4 + vpxord 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rcx + jl .Lxorpart4 + vpxord 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rcx + jl .Lxorpart4 + vpxord 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rcx + jl .Lxorpart4 + vpxord 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rcx + jl .Lxorpart4 + vpxord 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rcx + jl .Lxorpart4 + vpxord 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rcx + jl .Lxorpart4 + vpxord 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rcx + jl .Lxorpart4 + vpxord 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm10,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone4 + +ENDPROC(chacha20_4block_xor_avx512vl) + ENTRY(chacha20_8block_xor_avx512vl) # %rdi: Input state matrix, s # %rsi: up to 8 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index d6a95a6a324e..773d075a1483 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -34,6 +34,8 @@ static bool chacha20_use_avx2; #ifdef CONFIG_AS_AVX512 asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len); +asmlinkage void chacha20_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len); asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len); static bool chacha20_use_avx512vl; @@ -64,6 +66,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, state[12] += chacha20_advance(bytes, 8); return; } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha20_4block_xor_avx512vl(state, dst, src, bytes); + state[12] += chacha20_advance(bytes, 4); + return; + } if (bytes) { chacha20_2block_xor_avx512vl(state, dst, src, bytes); state[12] += chacha20_advance(bytes, 2); -- cgit v1.2.3 From 012c82388c032cd4a9821e11bae336cf4a014822 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:00 -0800 Subject: crypto: x86/nhpoly1305 - add SSE2 accelerated NHPoly1305 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 64-bit SSE2 implementation of NHPoly1305, an 
ε-almost-∆-universal hash function used in the Adiantum encryption mode. For now, only the NH portion is actually SSE2-accelerated; the Poly1305 part is less performance-critical so is just implemented in C. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 4 ++ arch/x86/crypto/nh-sse2-x86_64.S | 123 +++++++++++++++++++++++++++++++++ arch/x86/crypto/nhpoly1305-sse2-glue.c | 76 ++++++++++++++++++++ 3 files changed, 203 insertions(+) create mode 100644 arch/x86/crypto/nh-sse2-x86_64.S create mode 100644 arch/x86/crypto/nhpoly1305-sse2-glue.c (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index ce4e43642984..2a6acb4de373 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -47,6 +47,8 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o + # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ @@ -85,6 +87,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o +nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o + ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ camellia_aesni_avx_glue.o diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S new file mode 100644 index 000000000000..51f52d4ab4bb --- /dev/null +++ b/arch/x86/crypto/nh-sse2-x86_64.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers + */ + +#include + +#define PASS0_SUMS %xmm0 +#define PASS1_SUMS %xmm1 +#define PASS2_SUMS %xmm2 +#define PASS3_SUMS %xmm3 +#define K0 %xmm4 +#define K1 %xmm5 +#define K2 %xmm6 +#define K3 %xmm7 +#define T0 %xmm8 +#define T1 %xmm9 +#define T2 %xmm10 +#define T3 %xmm11 +#define T4 %xmm12 +#define T5 %xmm13 +#define T6 %xmm14 +#define T7 %xmm15 +#define KEY %rdi +#define MESSAGE %rsi +#define MESSAGE_LEN %rdx +#define HASH %rcx + +.macro _nh_stride k0, k1, k2, k3, offset + + // Load next message stride + movdqu \offset(MESSAGE), T1 + + // Load next key stride + movdqu \offset(KEY), \k3 + + // Add message words to key words + movdqa T1, T2 + movdqa T1, T3 + paddd T1, \k0 // reuse k0 to avoid a move + paddd \k1, T1 + paddd \k2, T2 + paddd \k3, T3 + + // Multiply 32x32 => 64 and accumulate + pshufd $0x10, \k0, T4 + pshufd $0x32, \k0, \k0 + pshufd $0x10, T1, T5 + pshufd $0x32, T1, T1 + pshufd $0x10, T2, T6 + pshufd $0x32, T2, T2 + pshufd $0x10, T3, T7 + pshufd $0x32, T3, T3 + pmuludq T4, \k0 + pmuludq T5, T1 + pmuludq T6, T2 + pmuludq T7, T3 + paddq \k0, PASS0_SUMS + paddq T1, PASS1_SUMS + paddq T2, PASS2_SUMS + paddq T3, PASS3_SUMS +.endm + +/* + * void nh_sse2(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. 
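+ *
+ * In other words, the caller always passes whole 16-byte strides: the main
+ * loop below consumes four strides (64 bytes) per iteration and the tail
+ * handles a remainder of one to three strides.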
+ */ +ENTRY(nh_sse2) + + movdqu 0x00(KEY), K0 + movdqu 0x10(KEY), K1 + movdqu 0x20(KEY), K2 + add $0x30, KEY + pxor PASS0_SUMS, PASS0_SUMS + pxor PASS1_SUMS, PASS1_SUMS + pxor PASS2_SUMS, PASS2_SUMS + pxor PASS3_SUMS, PASS3_SUMS + + sub $0x40, MESSAGE_LEN + jl .Lloop4_done +.Lloop4: + _nh_stride K0, K1, K2, K3, 0x00 + _nh_stride K1, K2, K3, K0, 0x10 + _nh_stride K2, K3, K0, K1, 0x20 + _nh_stride K3, K0, K1, K2, 0x30 + add $0x40, KEY + add $0x40, MESSAGE + sub $0x40, MESSAGE_LEN + jge .Lloop4 + +.Lloop4_done: + and $0x3f, MESSAGE_LEN + jz .Ldone + _nh_stride K0, K1, K2, K3, 0x00 + + sub $0x10, MESSAGE_LEN + jz .Ldone + _nh_stride K1, K2, K3, K0, 0x10 + + sub $0x10, MESSAGE_LEN + jz .Ldone + _nh_stride K2, K3, K0, K1, 0x20 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + movdqa PASS0_SUMS, T0 + movdqa PASS2_SUMS, T1 + punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A) + punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A) + punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B) + punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B) + paddq PASS0_SUMS, T0 + paddq PASS2_SUMS, T1 + movdqu T0, 0x00(HASH) + movdqu T1, 0x10(HASH) + ret +ENDPROC(nh_sse2) diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c new file mode 100644 index 000000000000..ed68d164ce14 --- /dev/null +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * (SSE2 accelerated version) + * + * Copyright 2018 Google LLC + */ + +#include +#include +#include +#include + +asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, + u8 hash[NH_HASH_BYTES]); + +/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ +static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + nh_sse2(key, message, message_len, (u8 *)hash); +} + +static int nhpoly1305_sse2_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + if (srclen < 64 || !irq_fpu_usable()) + return crypto_nhpoly1305_update(desc, src, srclen); + + do { + unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + + kernel_fpu_begin(); + crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2); + kernel_fpu_end(); + src += n; + srclen -= n; + } while (srclen); + return 0; +} + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-sse2", + .base.cra_priority = 200, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = nhpoly1305_sse2_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_XMM2)) + return -ENODEV; + + return crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-sse2"); -- cgit v1.2.3 From 0f961f9f670e7c07690bfde2f533b93c653569cc Mon 
Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:01 -0800 Subject: crypto: x86/nhpoly1305 - add AVX2 accelerated NHPoly1305 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 64-bit AVX2 implementation of NHPoly1305, an ε-almost-∆-universal hash function used in the Adiantum encryption mode. For now, only the NH portion is actually AVX2-accelerated; the Poly1305 part is less performance-critical so is just implemented in C. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 3 + arch/x86/crypto/nh-avx2-x86_64.S | 157 +++++++++++++++++++++++++++++++++ arch/x86/crypto/nhpoly1305-avx2-glue.c | 77 ++++++++++++++++ 3 files changed, 237 insertions(+) create mode 100644 arch/x86/crypto/nh-avx2-x86_64.S create mode 100644 arch/x86/crypto/nhpoly1305-avx2-glue.c (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 2a6acb4de373..0b31b16f49d8 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -106,6 +107,8 @@ ifeq ($(avx2_supported),yes) serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o + + nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o endif ifeq ($(avx512_supported),yes) diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S new file mode 100644 index 000000000000..f7946ea1b704 --- /dev/null +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers + */ + +#include + +#define PASS0_SUMS %ymm0 +#define PASS1_SUMS %ymm1 +#define PASS2_SUMS %ymm2 +#define PASS3_SUMS %ymm3 +#define K0 %ymm4 +#define K0_XMM %xmm4 +#define K1 %ymm5 +#define K1_XMM %xmm5 +#define K2 %ymm6 +#define K2_XMM %xmm6 +#define K3 %ymm7 +#define K3_XMM %xmm7 +#define T0 %ymm8 +#define T1 %ymm9 +#define T2 %ymm10 +#define T2_XMM %xmm10 +#define T3 %ymm11 +#define T3_XMM %xmm11 +#define T4 %ymm12 +#define T5 %ymm13 +#define T6 %ymm14 +#define T7 %ymm15 +#define KEY %rdi +#define MESSAGE %rsi +#define MESSAGE_LEN %rdx +#define HASH %rcx + +.macro _nh_2xstride k0, k1, k2, k3 + + // Add message words to key words + vpaddd \k0, T3, T0 + vpaddd \k1, T3, T1 + vpaddd \k2, T3, T2 + vpaddd \k3, T3, T3 + + // Multiply 32x32 => 64 and accumulate + vpshufd $0x10, T0, T4 + vpshufd $0x32, T0, T0 + vpshufd $0x10, T1, T5 + vpshufd $0x32, T1, T1 + vpshufd $0x10, T2, T6 + vpshufd $0x32, T2, T2 + vpshufd $0x10, T3, T7 + vpshufd $0x32, T3, T3 + vpmuludq T4, T0, T0 + vpmuludq T5, T1, T1 + vpmuludq T6, T2, T2 + vpmuludq T7, T3, T3 + vpaddq T0, PASS0_SUMS, PASS0_SUMS + vpaddq T1, PASS1_SUMS, PASS1_SUMS + vpaddq T2, PASS2_SUMS, PASS2_SUMS + vpaddq T3, PASS3_SUMS, PASS3_SUMS +.endm + +/* + * void nh_avx2(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. 
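+ *
+ * As in the SSE2 version, the caller passes whole 16-byte strides. The main
+ * loop consumes four strides (64 bytes) per iteration, two strides per
+ * 256-bit register; a final odd stride is handled by zeroing the upper
+ * 128 bits of the message and key registers (see .Llast below).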
+ */ +ENTRY(nh_avx2) + + vmovdqu 0x00(KEY), K0 + vmovdqu 0x10(KEY), K1 + add $0x20, KEY + vpxor PASS0_SUMS, PASS0_SUMS, PASS0_SUMS + vpxor PASS1_SUMS, PASS1_SUMS, PASS1_SUMS + vpxor PASS2_SUMS, PASS2_SUMS, PASS2_SUMS + vpxor PASS3_SUMS, PASS3_SUMS, PASS3_SUMS + + sub $0x40, MESSAGE_LEN + jl .Lloop4_done +.Lloop4: + vmovdqu (MESSAGE), T3 + vmovdqu 0x00(KEY), K2 + vmovdqu 0x10(KEY), K3 + _nh_2xstride K0, K1, K2, K3 + + vmovdqu 0x20(MESSAGE), T3 + vmovdqu 0x20(KEY), K0 + vmovdqu 0x30(KEY), K1 + _nh_2xstride K2, K3, K0, K1 + + add $0x40, MESSAGE + add $0x40, KEY + sub $0x40, MESSAGE_LEN + jge .Lloop4 + +.Lloop4_done: + and $0x3f, MESSAGE_LEN + jz .Ldone + + cmp $0x20, MESSAGE_LEN + jl .Llast + + // 2 or 3 strides remain; do 2 more. + vmovdqu (MESSAGE), T3 + vmovdqu 0x00(KEY), K2 + vmovdqu 0x10(KEY), K3 + _nh_2xstride K0, K1, K2, K3 + add $0x20, MESSAGE + add $0x20, KEY + sub $0x20, MESSAGE_LEN + jz .Ldone + vmovdqa K2, K0 + vmovdqa K3, K1 +.Llast: + // Last stride. Zero the high 128 bits of the message and keys so they + // don't affect the result when processing them like 2 strides. + vmovdqu (MESSAGE), T3_XMM + vmovdqa K0_XMM, K0_XMM + vmovdqa K1_XMM, K1_XMM + vmovdqu 0x00(KEY), K2_XMM + vmovdqu 0x10(KEY), K3_XMM + _nh_2xstride K0, K1, K2, K3 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + + // PASS0_SUMS is (0A 0B 0C 0D) + // PASS1_SUMS is (1A 1B 1C 1D) + // PASS2_SUMS is (2A 2B 2C 2D) + // PASS3_SUMS is (3A 3B 3C 3D) + // We need the horizontal sums: + // (0A + 0B + 0C + 0D, + // 1A + 1B + 1C + 1D, + // 2A + 2B + 2C + 2D, + // 3A + 3B + 3C + 3D) + // + + vpunpcklqdq PASS1_SUMS, PASS0_SUMS, T0 // T0 = (0A 1A 0C 1C) + vpunpckhqdq PASS1_SUMS, PASS0_SUMS, T1 // T1 = (0B 1B 0D 1D) + vpunpcklqdq PASS3_SUMS, PASS2_SUMS, T2 // T2 = (2A 3A 2C 3C) + vpunpckhqdq PASS3_SUMS, PASS2_SUMS, T3 // T3 = (2B 3B 2D 3D) + + vinserti128 $0x1, T2_XMM, T0, T4 // T4 = (0A 1A 2A 3A) + vinserti128 $0x1, T3_XMM, T1, T5 // T5 = (0B 1B 2B 3B) + vperm2i128 $0x31, T2, T0, T0 // T0 = (0C 1C 2C 3C) + vperm2i128 $0x31, T3, T1, T1 // T1 = (0D 1D 2D 3D) + + vpaddq T5, T4, T4 + vpaddq T1, T0, T0 + vpaddq T4, T0, T0 + vmovdqu T0, (HASH) + ret +ENDPROC(nh_avx2) diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c new file mode 100644 index 000000000000..20d815ea4b6a --- /dev/null +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * (AVX2 accelerated version) + * + * Copyright 2018 Google LLC + */ + +#include +#include +#include +#include + +asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, + u8 hash[NH_HASH_BYTES]); + +/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ +static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + nh_avx2(key, message, message_len, (u8 *)hash); +} + +static int nhpoly1305_avx2_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + if (srclen < 64 || !irq_fpu_usable()) + return crypto_nhpoly1305_update(desc, src, srclen); + + do { + unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + + kernel_fpu_begin(); + crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2); + kernel_fpu_end(); + src += n; + srclen -= n; + } while (srclen); + return 0; +} + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-avx2", + 
.base.cra_priority = 300, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = nhpoly1305_avx2_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) + return -ENODEV; + + return crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers "); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-avx2"); -- cgit v1.2.3 From 4af78261870a7d36dd222af8dad9688b705e365e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:02 -0800 Subject: crypto: x86/chacha20 - add XChaCha20 support Add an XChaCha20 implementation that is hooked up to the x86_64 SIMD implementations of ChaCha20. This can be used by Adiantum. An SSSE3 implementation of single-block HChaCha20 is also added so that XChaCha20 can use it rather than the generic implementation. This required refactoring the ChaCha permutation into its own function. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha20-ssse3-x86_64.S | 81 ++++++++++++++++-------- arch/x86/crypto/chacha20_glue.c | 108 ++++++++++++++++++++++++-------- 2 files changed, 138 insertions(+), 51 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index d8ac75bb448f..f6792789f875 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -10,6 +10,7 @@ */ #include +#include .section .rodata.cst16.ROT8, "aM", @progbits, 16 .align 16 @@ -23,37 +24,24 @@ CTRINC: .octa 0x00000003000000020000000100000000 .text -ENTRY(chacha20_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - - # This function encrypts one ChaCha20 block by loading the state matrix - # in four SSE registers. It performs matrix operation on four words in - # parallel, but requires shuffling to rearrange the words after each - # round. 8/16-bit word rotation is done with the slightly better - # performing SSSE3 byte shuffling, 7/12-bit word rotation uses - # traditional shift+OR. - - # x0..3 = s0..3 - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 +/* + * chacha20_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. 
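+ *
+ * Both chacha20_block_xor_ssse3 (which adds back the saved input state and
+ * XORs with the data) and the new hchacha20_block_ssse3 (which stores words
+ * 0..3 and 12..15) call this helper on the matrix held in %xmm0-%xmm3.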
+ * + * Clobbers: %ecx, %xmm4-%xmm7 + */ +chacha20_permute: movdqa ROT8(%rip),%xmm4 movdqa ROT16(%rip),%xmm5 - - mov %rcx,%rax mov $10,%ecx .Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 @@ -123,6 +111,29 @@ ENTRY(chacha20_block_xor_ssse3) dec %ecx jnz .Ldoubleround + ret +ENDPROC(chacha20_permute) + +ENTRY(chacha20_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + FRAME_BEGIN + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + call chacha20_permute + # o0 = i0 ^ (x0 + s0) paddd %xmm8,%xmm0 cmp $0x10,%rax @@ -156,6 +167,7 @@ ENTRY(chacha20_block_xor_ssse3) movdqu %xmm0,0x30(%rsi) .Ldone: + FRAME_END ret .Lxorpart: @@ -189,6 +201,25 @@ ENTRY(chacha20_block_xor_ssse3) ENDPROC(chacha20_block_xor_ssse3) +ENTRY(hchacha20_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + FRAME_BEGIN + + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + + call chacha20_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + ret +ENDPROC(hchacha20_block_ssse3) + ENTRY(chacha20_4block_xor_ssse3) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 773d075a1483..70d388e4a3a2 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -23,6 +23,7 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len); +asmlinkage void hchacha20_block_ssse3(const u32 *state, u32 *out); #ifdef CONFIG_AS_AVX2 asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len); @@ -121,10 +122,9 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha20_simd(struct skcipher_request *req) +static int chacha20_simd_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) { - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); u32 *state, state_buf[16 + 2] __aligned(8); struct skcipher_walk walk; int err; @@ -132,14 +132,9 @@ static int chacha20_simd(struct skcipher_request *req) BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha_crypt(req); - err = skcipher_walk_virt(&walk, req, true); - crypto_chacha_init(state, ctx, walk.iv); - - kernel_fpu_begin(); + crypto_chacha_init(state, ctx, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; @@ -153,26 +148,85 @@ static int chacha20_simd(struct skcipher_request *req) err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } + return err; +} + +static int chacha20_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_chacha_crypt(req); + + kernel_fpu_begin(); + err = chacha20_simd_stream_xor(req, ctx, req->iv); + kernel_fpu_end(); + 
return err; +} + +static int xchacha20_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 *state, state_buf[16 + 2] __aligned(8); + u8 real_iv[16]; + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_xchacha_crypt(req); + + BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); + crypto_chacha_init(state, ctx, req->iv); + + kernel_fpu_begin(); + + hchacha20_block_ssse3(state, subctx.key); + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + err = chacha20_simd_stream_xor(req, &subctx, real_iv); + kernel_fpu_end(); return err; } -static struct skcipher_alg alg = { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_simd, - .decrypt = chacha20_simd, +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha20_simd, + .decrypt = xchacha20_simd, + }, }; static int __init chacha20_simd_mod_init(void) @@ -190,12 +244,12 @@ static int __init chacha20_simd_mod_init(void) boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ #endif #endif - return crypto_register_skcipher(&alg); + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha20_simd_mod_fini(void) { - crypto_unregister_skcipher(&alg); + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha20_simd_mod_init); @@ -206,3 +260,5 @@ MODULE_AUTHOR("Martin Willi "); MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-simd"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-simd"); -- cgit v1.2.3 From 8b65f34c5821e7361488dc668d21195ea4c9f14d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:03 -0800 Subject: crypto: x86/chacha20 - refactor to allow varying number of rounds In preparation for adding XChaCha12 support, rename/refactor the x86_64 SIMD implementations of ChaCha20 to support different numbers of rounds. 
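Concretely, each asm entry point now takes the round count as an extra argument (visible as "# %r8d: nrounds" in the new chacha-*-x86_64.S files below), and the double-round loop counts it down instead of iterating a fixed ten times. As a minimal C sketch of the same parameterisation (illustrative only; the names chacha_permute, QR and ROTL32 are assumptions, not the kernel's helpers):

	#include <stdint.h>

	#define ROTL32(v, n)	(((v) << (n)) | ((v) >> (32 - (n))))

	/* One ChaCha quarter-round on state words a, b, c, d. */
	#define QR(a, b, c, d) do {					\
		x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], 16);		\
		x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c], 12);		\
		x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], 8);		\
		x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c], 7);		\
	} while (0)

	/* The same double-round body serves any even number of rounds. */
	static void chacha_permute(uint32_t x[16], int nrounds)
	{
		int i;

		for (i = 0; i < nrounds; i += 2) {
			/* column round */
			QR(0, 4, 8, 12); QR(1, 5, 9, 13);
			QR(2, 6, 10, 14); QR(3, 7, 11, 15);
			/* diagonal round */
			QR(0, 5, 10, 15); QR(1, 6, 11, 12);
			QR(2, 7, 8, 13); QR(3, 4, 9, 14);
		}
	}

With that shape, ChaCha20 is chacha_permute(x, 20) and a 12-round variant only changes the count passed down.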
Reviewed-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 8 +- arch/x86/crypto/chacha-avx2-x86_64.S | 1025 +++++++++++++++++++++++++++ arch/x86/crypto/chacha-avx512vl-x86_64.S | 836 +++++++++++++++++++++++ arch/x86/crypto/chacha-ssse3-x86_64.S | 795 +++++++++++++++++++++ arch/x86/crypto/chacha20-avx2-x86_64.S | 1026 ---------------------------- arch/x86/crypto/chacha20-avx512vl-x86_64.S | 839 ----------------------- arch/x86/crypto/chacha20-ssse3-x86_64.S | 792 --------------------- arch/x86/crypto/chacha20_glue.c | 264 ------- arch/x86/crypto/chacha_glue.c | 270 ++++++++ 9 files changed, 2930 insertions(+), 2925 deletions(-) create mode 100644 arch/x86/crypto/chacha-avx2-x86_64.S create mode 100644 arch/x86/crypto/chacha-avx512vl-x86_64.S create mode 100644 arch/x86/crypto/chacha-ssse3-x86_64.S delete mode 100644 arch/x86/crypto/chacha20-avx2-x86_64.S delete mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S delete mode 100644 arch/x86/crypto/chacha20-ssse3-x86_64.S delete mode 100644 arch/x86/crypto/chacha20_glue.c create mode 100644 arch/x86/crypto/chacha_glue.c (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 0b31b16f49d8..45734e1cf967 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -78,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o +chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o @@ -103,7 +103,7 @@ endif ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o - chacha20-x86_64-y += chacha20-avx2-x86_64.o + chacha-x86_64-y += chacha-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o @@ -112,7 +112,7 @@ ifeq ($(avx2_supported),yes) endif ifeq ($(avx512_supported),yes) - chacha20-x86_64-y += chacha20-avx512vl-x86_64.o + chacha-x86_64-y += chacha-avx512vl-x86_64.o endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S new file mode 100644 index 000000000000..32903fd450af --- /dev/null +++ b/arch/x86/crypto/chacha-avx2-x86_64.S @@ -0,0 +1,1025 @@ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * 
(at your option) any later version. + */ + +#include + +.section .rodata.cst32.ROT8, "aM", @progbits, 32 +.align 32 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 + +.section .rodata.cst32.ROT16, "aM", @progbits, 32 +.align 32 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 + +.section .rodata.cst32.CTRINC, "aM", @progbits, 32 +.align 32 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.text + +ENTRY(chacha_2block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + vmovdqa ROT8(%rip),%ymm4 + vmovdqa ROT16(%rip),%ymm5 + + mov %rcx,%rax + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rax + jl .Lxorpart2 + vpxor 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + 
vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rax + jl .Lxorpart2 + vpxor 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rax + jl .Lxorpart2 + vpxor 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rax + jl .Lxorpart2 + vpxor 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rax + jl .Lxorpart2 + vpxor 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rax + jl .Lxorpart2 + vpxor 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rax + jl .Lxorpart2 + vpxor 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rax + jl .Lxorpart2 + vpxor 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone2 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm7,%xmm7 + vmovdqa %xmm7,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone2 + +ENDPROC(chacha_2block_xor_avx2) + +ENTRY(chacha_4block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. 
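+	#
+	# Blocks 0-1 are tracked in %ymm0-3 and blocks 2-3 in %ymm4-7, with
+	# block counters taken from CTR2BL and CTR4BL; the saved copies in
+	# %ymm11-%ymm15 are added back after the rounds to form the keystream.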
+ + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + vmovdqa ROT8(%rip),%ymm8 + vmovdqa ROT16(%rip),%ymm9 + + mov %rcx,%rax + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rax + jl .Lxorpart4 + vpxor 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + 
vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rax + jl .Lxorpart4 + vpxor 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rax + jl .Lxorpart4 + vpxor 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rax + jl .Lxorpart4 + vpxor 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rax + jl .Lxorpart4 + vpxor 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rax + jl .Lxorpart4 + vpxor 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rax + jl .Lxorpart4 + vpxor 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rax + jl .Lxorpart4 + vpxor 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rax + jl .Lxorpart4 + vpxor 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rax + jl .Lxorpart4 + vpxor 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rax + jl .Lxorpart4 + vpxor 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rax + jl .Lxorpart4 + vpxor 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rax + jl .Lxorpart4 + vpxor 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rax + jl .Lxorpart4 + vpxor 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rax + jl .Lxorpart4 + vpxor 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rax + jl .Lxorpart4 + vpxor 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm10,%xmm10 + vmovdqa %xmm10,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone4 + +ENDPROC(chacha_4block_xor_avx2) + +ENTRY(chacha_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 
8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + lea 8(%rsp),%r10 + and $~31, %rsp + sub $0x80, %rsp + mov %rcx,%rax + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor 
%ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + 
vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa 0x00(%rsp),%ymm1 + vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 + cmp $0x0020,%rax + jl .Lxorpart8 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rax + jl .Lxorpart8 + vpxor 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vmovdqa 0x40(%rsp),%ymm1 + vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 + cmp $0x0060,%rax + jl .Lxorpart8 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rax + jl .Lxorpart8 + vpxor 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vmovdqa 0x20(%rsp),%ymm1 + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rax + jl .Lxorpart8 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rax + jl .Lxorpart8 + vpxor 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vmovdqa 0x60(%rsp),%ymm1 + 
vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 + cmp $0x00e0,%rax + jl .Lxorpart8 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rax + jl .Lxorpart8 + vpxor 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa %ymm4,%ymm0 + cmp $0x0120,%rax + jl .Lxorpart8 + vpxor 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0100(%rsi) + + vmovdqa %ymm12,%ymm0 + cmp $0x0140,%rax + jl .Lxorpart8 + vpxor 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0120(%rsi) + + vmovdqa %ymm6,%ymm0 + cmp $0x0160,%rax + jl .Lxorpart8 + vpxor 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0140(%rsi) + + vmovdqa %ymm14,%ymm0 + cmp $0x0180,%rax + jl .Lxorpart8 + vpxor 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0160(%rsi) + + vmovdqa %ymm5,%ymm0 + cmp $0x01a0,%rax + jl .Lxorpart8 + vpxor 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0180(%rsi) + + vmovdqa %ymm13,%ymm0 + cmp $0x01c0,%rax + jl .Lxorpart8 + vpxor 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01a0(%rsi) + + vmovdqa %ymm7,%ymm0 + cmp $0x01e0,%rax + jl .Lxorpart8 + vpxor 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01c0(%rsi) + + vmovdqa %ymm15,%ymm0 + cmp $0x0200,%rax + jl .Lxorpart8 + vpxor 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + lea -8(%r10),%rsp + ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x1f,%r9 + jz .Ldone8 + and $~0x1f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone8 + +ENDPROC(chacha_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S new file mode 100644 index 000000000000..848f9c75fd4f --- /dev/null +++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S @@ -0,0 +1,836 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions + * + * Copyright (C) 2018 Martin Willi + */ + +#include + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 +.align 32 +CTR8BL: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +ENTRY(chacha_2block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. 
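
For reference, the operations spelled out in the round comments throughout these routines are the standard ChaCha quarter round; below is a minimal scalar sketch of one double round. The names are illustrative and not part of the patch. The vector code performs the diagonal half by rotating rows x1..x3 with pshufd/vpshufd rather than re-indexing words, the "sub $2,%r8d; jnz .Ldoubleround" loop runs this nrounds/2 times (so nrounds must be even and non-zero), and with AVX-512VL each rotl32() becomes a single vprold where the SSSE3/AVX2 variants use the ROT8/ROT16 byte shuffles plus shift+OR.

    #include <stdint.h>

    /* Scalar reference only; helper names are illustrative, not kernel code. */
    static inline uint32_t rotl32(uint32_t v, int n)
    {
            return (v << n) | (v >> (32 - n));
    }

    /* One quarter round, as in the "# x0 += x1, x3 = rotl32(x3 ^ x0, 16)" comments. */
    static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
            *a += *b; *d = rotl32(*d ^ *a, 16);
            *c += *d; *b = rotl32(*b ^ *c, 12);
            *a += *b; *d = rotl32(*d ^ *a, 8);
            *c += *d; *b = rotl32(*b ^ *c, 7);
    }

    /* One double round: four column rounds, then four diagonal rounds. */
    static void chacha_doubleround(uint32_t x[16])
    {
            chacha_qr(&x[0], &x[4], &x[8],  &x[12]);
            chacha_qr(&x[1], &x[5], &x[9],  &x[13]);
            chacha_qr(&x[2], &x[6], &x[10], &x[14]);
            chacha_qr(&x[3], &x[7], &x[11], &x[15]);
            chacha_qr(&x[0], &x[5], &x[10], &x[15]);
            chacha_qr(&x[1], &x[6], &x[11], &x[12]);
            chacha_qr(&x[2], &x[7], &x[8],  &x[13]);
            chacha_qr(&x[3], &x[4], &x[9],  &x[14]);
    }
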
+ + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rcx + jl .Lxorpart2 + vpxord 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rcx + jl .Lxorpart2 + vpxord 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rcx + jl .Lxorpart2 + vpxord 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rcx + jl .Lxorpart2 + vpxord 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rcx + jl .Lxorpart2 + vpxord 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rcx + jl .Lxorpart2 + vpxord 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rcx + jl .Lxorpart2 + vpxord 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rcx + jl .Lxorpart2 + vpxord 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm7,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone2 + +ENDPROC(chacha_2block_xor_avx512vl) + +ENTRY(chacha_4block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts 
four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rcx + jl .Lxorpart4 + vpxord 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rcx + jl .Lxorpart4 + vpxord 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rcx + jl .Lxorpart4 + 
vpxord 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rcx + jl .Lxorpart4 + vpxord 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rcx + jl .Lxorpart4 + vpxord 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rcx + jl .Lxorpart4 + vpxord 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rcx + jl .Lxorpart4 + vpxord 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rcx + jl .Lxorpart4 + vpxord 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rcx + jl .Lxorpart4 + vpxord 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rcx + jl .Lxorpart4 + vpxord 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rcx + jl .Lxorpart4 + vpxord 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rcx + jl .Lxorpart4 + vpxord 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rcx + jl .Lxorpart4 + vpxord 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rcx + jl .Lxorpart4 + vpxord 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rcx + jl .Lxorpart4 + vpxord 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rcx + jl .Lxorpart4 + vpxord 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm10,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone4 + +ENDPROC(chacha_4block_xor_avx512vl) + +ENTRY(chacha_8block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. Compared to AVX2, this + # mostly benefits from the new rotate instructions in VL and the + # additional registers. 
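
The .Lxorpart2 and .Lxorpart4 tails above (and .Lxorpart8 below) handle a trailing partial chunk with an AVX-512 byte mask: "mov $1,%rax; shld %cl,%rax,%rax; sub $1,%rax; kmovq %rax,%k1" leaves the low len-mod-chunk bits set in %k1, and the masked vmovdqu8/vpxord/vmovdqu8 sequence then loads, XORs and stores only those bytes, never touching memory past the requested length. A scalar sketch of the same computation follows; the names are illustrative, and chunk is 16 for the xmm tails or 32 for the ymm tail.

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Illustrative helper, not kernel code.  'ks' is the leftover keystream
     * chunk still sitting in the partial vector register; 'chunk' is a power
     * of two, 16 or 32 here, so the mask shift fits in 64 bits.
     */
    static void xor_tail_masked(uint8_t *dst, const uint8_t *src,
                                const uint8_t *ks, size_t len, size_t chunk)
    {
            size_t full = len & ~(chunk - 1);       /* bytes already written */
            size_t rem  = len & (chunk - 1);        /* trailing byte count   */
            uint64_t mask = ((uint64_t)1 << rem) - 1;  /* rem low bits set   */

            for (size_t i = 0; i < chunk; i++)
                    if (mask & ((uint64_t)1 << i))
                            dst[full + i] = src[full + i] ^ ks[i];
    }
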
+ + vzeroupper + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + + # x12 += counter values 0-3 + vpaddd CTR8BL(%rip),%ymm12,%ymm12 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ 
x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + # interleave 32-bit words in state n, n+1 + vpunpckldq %ymm1,%ymm0,%ymm16 + vpunpckhdq %ymm1,%ymm0,%ymm17 + vpunpckldq %ymm3,%ymm2,%ymm18 + vpunpckhdq %ymm3,%ymm2,%ymm19 + vpunpckldq %ymm5,%ymm4,%ymm20 + vpunpckhdq %ymm5,%ymm4,%ymm21 + vpunpckldq %ymm7,%ymm6,%ymm22 + vpunpckhdq %ymm7,%ymm6,%ymm23 + vpunpckldq %ymm9,%ymm8,%ymm24 + vpunpckhdq %ymm9,%ymm8,%ymm25 + vpunpckldq %ymm11,%ymm10,%ymm26 + vpunpckhdq %ymm11,%ymm10,%ymm27 + vpunpckldq %ymm13,%ymm12,%ymm28 + vpunpckhdq %ymm13,%ymm12,%ymm29 + vpunpckldq %ymm15,%ymm14,%ymm30 + vpunpckhdq %ymm15,%ymm14,%ymm31 + + # interleave 64-bit words in state n, n+2 + vpunpcklqdq %ymm18,%ymm16,%ymm0 + vpunpcklqdq %ymm19,%ymm17,%ymm1 + vpunpckhqdq %ymm18,%ymm16,%ymm2 + vpunpckhqdq %ymm19,%ymm17,%ymm3 + vpunpcklqdq %ymm22,%ymm20,%ymm4 + vpunpcklqdq %ymm23,%ymm21,%ymm5 + vpunpckhqdq %ymm22,%ymm20,%ymm6 + vpunpckhqdq %ymm23,%ymm21,%ymm7 + vpunpcklqdq %ymm26,%ymm24,%ymm8 + vpunpcklqdq %ymm27,%ymm25,%ymm9 + vpunpckhqdq %ymm26,%ymm24,%ymm10 + vpunpckhqdq %ymm27,%ymm25,%ymm11 + vpunpcklqdq %ymm30,%ymm28,%ymm12 + vpunpcklqdq %ymm31,%ymm29,%ymm13 + vpunpckhqdq %ymm30,%ymm28,%ymm14 + vpunpckhqdq %ymm31,%ymm29,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa64 %ymm0,%ymm16 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 + cmp $0x0020,%rcx + jl .Lxorpart8 + vpxord 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0000(%rsi) + vmovdqa64 %ymm16,%ymm0 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rcx + jl .Lxorpart8 + vpxord 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu64 
%ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 + cmp $0x0060,%rcx + jl .Lxorpart8 + vpxord 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rcx + jl .Lxorpart8 + vpxord 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rcx + jl .Lxorpart8 + vpxord 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rcx + jl .Lxorpart8 + vpxord 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 + cmp $0x00e0,%rcx + jl .Lxorpart8 + vpxord 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rcx + jl .Lxorpart8 + vpxord 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa64 %ymm4,%ymm0 + cmp $0x0120,%rcx + jl .Lxorpart8 + vpxord 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0100(%rsi) + + vmovdqa64 %ymm12,%ymm0 + cmp $0x0140,%rcx + jl .Lxorpart8 + vpxord 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0120(%rsi) + + vmovdqa64 %ymm6,%ymm0 + cmp $0x0160,%rcx + jl .Lxorpart8 + vpxord 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0140(%rsi) + + vmovdqa64 %ymm14,%ymm0 + cmp $0x0180,%rcx + jl .Lxorpart8 + vpxord 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0160(%rsi) + + vmovdqa64 %ymm5,%ymm0 + cmp $0x01a0,%rcx + jl .Lxorpart8 + vpxord 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0180(%rsi) + + vmovdqa64 %ymm13,%ymm0 + cmp $0x01c0,%rcx + jl .Lxorpart8 + vpxord 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01a0(%rsi) + + vmovdqa64 %ymm7,%ymm0 + cmp $0x01e0,%rcx + jl .Lxorpart8 + vpxord 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01c0(%rsi) + + vmovdqa64 %ymm15,%ymm0 + cmp $0x0200,%rcx + jl .Lxorpart8 + vpxord 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0x1f,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0x1f,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} + vpxord %ymm0,%ymm1,%ymm1 + vmovdqu8 %ymm1,(%rsi,%r9){%k1} + + jmp .Ldone8 + +ENDPROC(chacha_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S new file mode 100644 index 000000000000..c05a7a963dc3 --- /dev/null +++ b/arch/x86/crypto/chacha-ssse3-x86_64.S @@ -0,0 +1,795 @@ +/* + * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include + +.section .rodata.cst16.ROT8, "aM", @progbits, 16 +.align 16 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +.section .rodata.cst16.CTRINC, "aM", @progbits, 16 +.align 16 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. + * + * The round count is given in %r8d. + * + * Clobbers: %r8d, %xmm4-%xmm7 + */ +chacha_permute: + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + +.Ldoubleround: + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + sub $2,%r8d + jnz .Ldoubleround + + ret +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + FRAME_BEGIN + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + call chacha_permute + + # o0 = i0 ^ (x0 + s0) + paddd %xmm8,%xmm0 + cmp $0x10,%rax + jl .Lxorpart + movdqu 0x00(%rdx),%xmm4 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + paddd %xmm9,%xmm1 + movdqa %xmm1,%xmm0 + cmp $0x20,%rax + jl .Lxorpart + movdqu 0x10(%rdx),%xmm0 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + paddd %xmm10,%xmm2 + movdqa %xmm2,%xmm0 + cmp $0x30,%rax + jl .Lxorpart + movdqu 0x20(%rdx),%xmm0 + pxor %xmm2,%xmm0 + movdqu %xmm0,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + paddd %xmm11,%xmm3 + movdqa %xmm3,%xmm0 + cmp $0x40,%rax + jl .Lxorpart + movdqu 0x30(%rdx),%xmm0 + pxor %xmm3,%xmm0 + movdqu 
%xmm0,0x30(%rsi) + +.Ldone: + FRAME_END + ret + +.Lxorpart: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone + +ENDPROC(chacha_block_xor_ssse3) + +ENTRY(hchacha_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + # %edx: nrounds + FRAME_BEGIN + + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + + mov %edx,%r8d + call chacha_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + ret +ENDPROC(hchacha_block_ssse3) + +ENTRY(chacha_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four consecutive ChaCha blocks by loading the + # the state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + lea 8(%rsp),%r10 + sub $0x80,%rsp + and $~63,%rsp + mov %rcx,%rax + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + 
paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + + # x10 += 
x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + sub $2,%r8d + jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 + paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 + punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq %xmm6,%xmm0 + movdqa 
%xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq %xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + cmp $0x10,%rax + jl .Lxorpart4 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + + movdqu %xmm4,%xmm0 + cmp $0x20,%rax + jl .Lxorpart4 + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + + movdqu %xmm8,%xmm0 + cmp $0x30,%rax + jl .Lxorpart4 + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x20(%rsi) + + movdqu %xmm12,%xmm0 + cmp $0x40,%rax + jl .Lxorpart4 + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x30(%rsi) + + movdqa 0x20(%rsp),%xmm0 + cmp $0x50,%rax + jl .Lxorpart4 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + + movdqu %xmm6,%xmm0 + cmp $0x60,%rax + jl .Lxorpart4 + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x50(%rsi) + + movdqu %xmm10,%xmm0 + cmp $0x70,%rax + jl .Lxorpart4 + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x60(%rsi) + + movdqu %xmm14,%xmm0 + cmp $0x80,%rax + jl .Lxorpart4 + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x70(%rsi) + + movdqa 0x10(%rsp),%xmm0 + cmp $0x90,%rax + jl .Lxorpart4 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + + movdqu %xmm5,%xmm0 + cmp $0xa0,%rax + jl .Lxorpart4 + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x90(%rsi) + + movdqu %xmm9,%xmm0 + cmp $0xb0,%rax + jl .Lxorpart4 + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xa0(%rsi) + + movdqu %xmm13,%xmm0 + cmp $0xc0,%rax + jl .Lxorpart4 + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xb0(%rsi) + + movdqa 0x30(%rsp),%xmm0 + cmp $0xd0,%rax + jl .Lxorpart4 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + + movdqu %xmm7,%xmm0 + cmp $0xe0,%rax + jl .Lxorpart4 + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xd0(%rsi) + + movdqu %xmm11,%xmm0 + cmp $0xf0,%rax + jl .Lxorpart4 + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xe0(%rsi) + + movdqu %xmm15,%xmm0 + cmp $0x100,%rax + jl .Lxorpart4 + movdqu 0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xf0(%rsi) + +.Ldone4: + lea -8(%r10),%rsp + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone4 + +ENDPROC(chacha_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S deleted file mode 100644 index b6ab082be657..000000000000 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ /dev/null @@ -1,1026 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; 
either version 2 of the License, or - * (at your option) any later version. - */ - -#include - -.section .rodata.cst32.ROT8, "aM", @progbits, 32 -.align 32 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 - .octa 0x0e0d0c0f0a09080b0605040702010003 - -.section .rodata.cst32.ROT16, "aM", @progbits, 32 -.align 32 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 - .octa 0x0d0c0f0e09080b0a0504070601000302 - -.section .rodata.cst32.CTRINC, "aM", @progbits, 32 -.align 32 -CTRINC: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.text - -ENTRY(chacha20_2block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts two ChaCha20 blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - - vmovdqa ROT8(%rip),%ymm4 - vmovdqa ROT16(%rip),%ymm5 - - mov %rcx,%rax - mov $10,%ecx - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - dec %ecx - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rax - jl .Lxorpart2 - vpxor 0x00(%rdx),%xmm7,%xmm6 
- vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rax - jl .Lxorpart2 - vpxor 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rax - jl .Lxorpart2 - vpxor 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rax - jl .Lxorpart2 - vpxor 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rax - jl .Lxorpart2 - vpxor 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rax - jl .Lxorpart2 - vpxor 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rax - jl .Lxorpart2 - vpxor 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rax - jl .Lxorpart2 - vpxor 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - ret - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone2 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm7,%xmm7 - vmovdqa %xmm7,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone2 - -ENDPROC(chacha20_2block_xor_avx2) - -ENTRY(chacha20_4block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts four ChaCha20 block by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. 
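
The chacha20_* routines being removed here, like the SSSE3 and AVX2 chacha_* replacements added above, take the other approach to a trailing partial chunk: the remaining input bytes are copied to an aligned stack slot with "rep movsb", XORed there against the leftover keystream register, and copied back out with another "rep movsb". The removed routines also hard-code ten double rounds ("mov $10,%ecx" ... "dec %ecx"), whereas the new entry points take the round count in %r8d. A scalar model of that bounce-buffer tail follows, with illustrative names and the 16-byte chunk of the xmm paths (the AVX2 8-block tail works on 32-byte chunks).

    #include <stddef.h>
    #include <string.h>

    /*
     * Illustrative model, not kernel code.  'rem' is the 1..15 trailing
     * bytes; the assembly XORs the full 16-byte register, but only 'rem'
     * bytes are ever copied back out, so the extra lanes do not matter.
     */
    static void xor_tail_bounce(unsigned char *dst, const unsigned char *src,
                                const unsigned char ks[16], size_t rem)
    {
            unsigned char buf[16] = { 0 };

            memcpy(buf, src, rem);                  /* rep movsb, in  */
            for (size_t i = 0; i < 16; i++)         /* pxor / vpxor   */
                    buf[i] ^= ks[i];
            memcpy(dst, buf, rem);                  /* rep movsb, out */
    }
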
- - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - - vmovdqa ROT8(%rip),%ymm8 - vmovdqa ROT16(%rip),%ymm9 - - mov %rcx,%rax - mov $10,%ecx - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - dec %ecx - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rax - jl .Lxorpart4 - vpxor 0x00(%rdx),%xmm10,%xmm9 - vmovdqu 
%xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rax - jl .Lxorpart4 - vpxor 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rax - jl .Lxorpart4 - vpxor 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rax - jl .Lxorpart4 - vpxor 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rax - jl .Lxorpart4 - vpxor 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rax - jl .Lxorpart4 - vpxor 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rax - jl .Lxorpart4 - vpxor 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rax - jl .Lxorpart4 - vpxor 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rax - jl .Lxorpart4 - vpxor 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rax - jl .Lxorpart4 - vpxor 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rax - jl .Lxorpart4 - vpxor 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rax - jl .Lxorpart4 - vpxor 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rax - jl .Lxorpart4 - vpxor 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rax - jl .Lxorpart4 - vpxor 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rax - jl .Lxorpart4 - vpxor 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rax - jl .Lxorpart4 - vpxor 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - ret - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm10,%xmm10 - vmovdqa %xmm10,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone4 - -ENDPROC(chacha20_4block_xor_avx2) - -ENTRY(chacha20_8block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts eight consecutive ChaCha20 blocks by loading - # the state matrix in AVX registers eight times. As we need some - # scratch registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32-, 64- and then 128-bit - # words, which allows us to do XOR in AVX registers. 
8/16-bit word - # rotation is done with the slightly better performing byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - vzeroupper - # 4 * 32 byte stack, 32-byte aligned - lea 8(%rsp),%r10 - and $~31, %rsp - sub $0x80, %rsp - mov %rcx,%rax - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - # x0..3 on stack - vmovdqa %ymm0,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm3,0x60(%rsp) - - vmovdqa CTRINC(%rip),%ymm1 - vmovdqa ROT8(%rip),%ymm2 - vmovdqa ROT16(%rip),%ymm3 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - mov $10,%ecx - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 
- vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - dec %ecx - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpaddd 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpbroadcastd 0x04(%rdi),%ymm0 - vpaddd 0x20(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpbroadcastd 0x08(%rdi),%ymm0 - vpaddd 0x40(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpbroadcastd 0x0c(%rdi),%ymm0 - vpaddd 0x60(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpbroadcastd 0x10(%rdi),%ymm0 - vpaddd %ymm0,%ymm4,%ymm4 - vpbroadcastd 0x14(%rdi),%ymm0 - vpaddd %ymm0,%ymm5,%ymm5 - vpbroadcastd 0x18(%rdi),%ymm0 - vpaddd %ymm0,%ymm6,%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm0 - vpaddd %ymm0,%ymm7,%ymm7 - vpbroadcastd 0x20(%rdi),%ymm0 - vpaddd 
%ymm0,%ymm8,%ymm8 - vpbroadcastd 0x24(%rdi),%ymm0 - vpaddd %ymm0,%ymm9,%ymm9 - vpbroadcastd 0x28(%rdi),%ymm0 - vpaddd %ymm0,%ymm10,%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm0 - vpaddd %ymm0,%ymm11,%ymm11 - vpbroadcastd 0x30(%rdi),%ymm0 - vpaddd %ymm0,%ymm12,%ymm12 - vpbroadcastd 0x34(%rdi),%ymm0 - vpaddd %ymm0,%ymm13,%ymm13 - vpbroadcastd 0x38(%rdi),%ymm0 - vpaddd %ymm0,%ymm14,%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm0 - vpaddd %ymm0,%ymm15,%ymm15 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - # interleave 32-bit words in state n, n+1 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x20(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm1,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpckldq %ymm5,%ymm0,%ymm4 - vpunpckhdq %ymm5,%ymm0,%ymm5 - vmovdqa %ymm6,%ymm0 - vpunpckldq %ymm7,%ymm0,%ymm6 - vpunpckhdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpckldq %ymm9,%ymm0,%ymm8 - vpunpckhdq %ymm9,%ymm0,%ymm9 - vmovdqa %ymm10,%ymm0 - vpunpckldq %ymm11,%ymm0,%ymm10 - vpunpckhdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpckldq %ymm13,%ymm0,%ymm12 - vpunpckhdq %ymm13,%ymm0,%ymm13 - vmovdqa %ymm14,%ymm0 - vpunpckldq %ymm15,%ymm0,%ymm14 - vpunpckhdq %ymm15,%ymm0,%ymm15 - - # interleave 64-bit words in state n, n+2 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x40(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpcklqdq %ymm6,%ymm0,%ymm4 - vpunpckhqdq %ymm6,%ymm0,%ymm6 - vmovdqa %ymm5,%ymm0 - vpunpcklqdq %ymm7,%ymm0,%ymm5 - vpunpckhqdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpcklqdq %ymm10,%ymm0,%ymm8 - vpunpckhqdq %ymm10,%ymm0,%ymm10 - vmovdqa %ymm9,%ymm0 - vpunpcklqdq %ymm11,%ymm0,%ymm9 - vpunpckhqdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpcklqdq %ymm14,%ymm0,%ymm12 - vpunpckhqdq %ymm14,%ymm0,%ymm14 - vmovdqa %ymm13,%ymm0 - vpunpcklqdq %ymm15,%ymm0,%ymm13 - vpunpckhqdq %ymm15,%ymm0,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa 0x00(%rsp),%ymm1 - vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 - cmp $0x0020,%rax - jl .Lxorpart8 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rax - jl .Lxorpart8 - vpxor 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vmovdqa 0x40(%rsp),%ymm1 - vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 - cmp $0x0060,%rax - jl .Lxorpart8 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rax - jl .Lxorpart8 - vpxor 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vmovdqa 0x20(%rsp),%ymm1 - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rax - jl .Lxorpart8 - vpxor 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rax - jl .Lxorpart8 - vpxor 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vmovdqa 
0x60(%rsp),%ymm1 - vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 - cmp $0x00e0,%rax - jl .Lxorpart8 - vpxor 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rax - jl .Lxorpart8 - vpxor 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa %ymm4,%ymm0 - cmp $0x0120,%rax - jl .Lxorpart8 - vpxor 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0100(%rsi) - - vmovdqa %ymm12,%ymm0 - cmp $0x0140,%rax - jl .Lxorpart8 - vpxor 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0120(%rsi) - - vmovdqa %ymm6,%ymm0 - cmp $0x0160,%rax - jl .Lxorpart8 - vpxor 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0140(%rsi) - - vmovdqa %ymm14,%ymm0 - cmp $0x0180,%rax - jl .Lxorpart8 - vpxor 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0160(%rsi) - - vmovdqa %ymm5,%ymm0 - cmp $0x01a0,%rax - jl .Lxorpart8 - vpxor 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0180(%rsi) - - vmovdqa %ymm13,%ymm0 - cmp $0x01c0,%rax - jl .Lxorpart8 - vpxor 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01a0(%rsi) - - vmovdqa %ymm7,%ymm0 - cmp $0x01e0,%rax - jl .Lxorpart8 - vpxor 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01c0(%rsi) - - vmovdqa %ymm15,%ymm0 - cmp $0x0200,%rax - jl .Lxorpart8 - vpxor 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - lea -8(%r10),%rsp - ret - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x1f,%r9 - jz .Ldone8 - and $~0x1f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone8 - -ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S deleted file mode 100644 index 55d34de29e3e..000000000000 --- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S +++ /dev/null @@ -1,839 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions - * - * Copyright (C) 2018 Martin Willi - */ - -#include - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 -.align 32 -CTR8BL: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.text - -ENTRY(chacha20_2block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts two ChaCha20 blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. 
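The AVX-512VL trailers below avoid the stack bounce entirely: the remainder is turned into a byte mask of (1 << remainder) - 1 (built with shld/sub and moved into %k1), and vmovdqu8 then performs a zero-masked load and a masked store, so no byte past the requested length is touched. Roughly, in C (a sketch of the semantics only, with hypothetical names, not the kernel code):

#include <stddef.h>
#include <stdint.h>

/*
 * Model of .Lxorpart8 in the AVX-512VL code: one bit per remaining byte
 * selects which lanes of the 32-byte vector are loaded, XORed and stored.
 * 'rem' is 1..31 here, so the shift cannot overflow the 64-bit mask.
 */
void xor_tail_masked(uint8_t *dst, const uint8_t *src,
                     const uint8_t keystream[32], size_t rem)
{
        uint64_t mask = (UINT64_C(1) << rem) - 1;   /* shld/sub, then kmovq to %k1 */
        size_t i;

        for (i = 0; i < 32; i++) {
                if (mask & (UINT64_C(1) << i))
                        dst[i] = src[i] ^ keystream[i];
                /* masked-off bytes are neither loaded nor stored */
        }
}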
- - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - - mov $10,%rax - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - dec %rax - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rcx - jl .Lxorpart2 - vpxord 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rcx - jl .Lxorpart2 - vpxord 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rcx - jl .Lxorpart2 - vpxord 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rcx - jl .Lxorpart2 - vpxord 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rcx - jl .Lxorpart2 - vpxord 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rcx - jl .Lxorpart2 - vpxord 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rcx - jl .Lxorpart2 - vpxord 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rcx - jl .Lxorpart2 - vpxord 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - ret - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm7,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone2 - -ENDPROC(chacha20_2block_xor_avx512vl) - -ENTRY(chacha20_4block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts 
four ChaCha20 block by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - - mov $10,%rax - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - dec %rax - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rcx - jl .Lxorpart4 - vpxord 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rcx - jl .Lxorpart4 - vpxord 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rcx - jl 
.Lxorpart4 - vpxord 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rcx - jl .Lxorpart4 - vpxord 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rcx - jl .Lxorpart4 - vpxord 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rcx - jl .Lxorpart4 - vpxord 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rcx - jl .Lxorpart4 - vpxord 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rcx - jl .Lxorpart4 - vpxord 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rcx - jl .Lxorpart4 - vpxord 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rcx - jl .Lxorpart4 - vpxord 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rcx - jl .Lxorpart4 - vpxord 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rcx - jl .Lxorpart4 - vpxord 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rcx - jl .Lxorpart4 - vpxord 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rcx - jl .Lxorpart4 - vpxord 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rcx - jl .Lxorpart4 - vpxord 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rcx - jl .Lxorpart4 - vpxord 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - ret - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm10,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone4 - -ENDPROC(chacha20_4block_xor_avx512vl) - -ENTRY(chacha20_8block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts eight consecutive ChaCha20 blocks by loading - # the state matrix in AVX registers eight times. Compared to AVX2, this - # mostly benefits from the new rotate instructions in VL and the - # additional registers. 
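The SSSE3, AVX2 and AVX-512VL variants all compute the same ChaCha20 quarter-round; they differ only in how the 32-bit rotate is expressed: SSSE3/AVX2 use a pshufb byte shuffle for the 16- and 8-bit rotates and shift+OR for 12 and 7, while AVX-512VL uses a single vprold for all four. For reference, the scalar form that the per-line comments (e.g. "x12 = rotl32(x12 ^ x0, 16)") describe, with illustrative helper names:

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));      /* vprold, or pshufb / shift+OR */
}

/* One ChaCha20 quarter-round on four 32-bit state words. */
static inline void chacha_quarterround(uint32_t *a, uint32_t *b,
                                       uint32_t *c, uint32_t *d)
{
        *a += *b; *d = rotl32(*d ^ *a, 16);
        *c += *d; *b = rotl32(*b ^ *c, 12);
        *a += *b; *d = rotl32(*d ^ *a, 8);
        *c += *d; *b = rotl32(*b ^ *c, 7);
}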
- - vzeroupper - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - - # x12 += counter values 0-3 - vpaddd CTR8BL(%rip),%ymm12,%ymm12 - - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - vmovdqa64 %ymm4,%ymm20 - vmovdqa64 %ymm5,%ymm21 - vmovdqa64 %ymm6,%ymm22 - vmovdqa64 %ymm7,%ymm23 - vmovdqa64 %ymm8,%ymm24 - vmovdqa64 %ymm9,%ymm25 - vmovdqa64 %ymm10,%ymm26 - vmovdqa64 %ymm11,%ymm27 - vmovdqa64 %ymm12,%ymm28 - vmovdqa64 %ymm13,%ymm29 - vmovdqa64 %ymm14,%ymm30 - vmovdqa64 %ymm15,%ymm31 - - mov $10,%eax - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - - # x10 += x15, x5 
= rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - - dec %eax - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - vpaddd %ymm20,%ymm4,%ymm4 - vpaddd %ymm21,%ymm5,%ymm5 - vpaddd %ymm22,%ymm6,%ymm6 - vpaddd %ymm23,%ymm7,%ymm7 - vpaddd %ymm24,%ymm8,%ymm8 - vpaddd %ymm25,%ymm9,%ymm9 - vpaddd %ymm26,%ymm10,%ymm10 - vpaddd %ymm27,%ymm11,%ymm11 - vpaddd %ymm28,%ymm12,%ymm12 - vpaddd %ymm29,%ymm13,%ymm13 - vpaddd %ymm30,%ymm14,%ymm14 - vpaddd %ymm31,%ymm15,%ymm15 - - # interleave 32-bit words in state n, n+1 - vpunpckldq %ymm1,%ymm0,%ymm16 - vpunpckhdq %ymm1,%ymm0,%ymm17 - vpunpckldq %ymm3,%ymm2,%ymm18 - vpunpckhdq %ymm3,%ymm2,%ymm19 - vpunpckldq %ymm5,%ymm4,%ymm20 - vpunpckhdq %ymm5,%ymm4,%ymm21 - vpunpckldq %ymm7,%ymm6,%ymm22 - vpunpckhdq %ymm7,%ymm6,%ymm23 - vpunpckldq %ymm9,%ymm8,%ymm24 - vpunpckhdq %ymm9,%ymm8,%ymm25 - vpunpckldq %ymm11,%ymm10,%ymm26 - vpunpckhdq %ymm11,%ymm10,%ymm27 - vpunpckldq %ymm13,%ymm12,%ymm28 - vpunpckhdq %ymm13,%ymm12,%ymm29 - vpunpckldq %ymm15,%ymm14,%ymm30 - vpunpckhdq %ymm15,%ymm14,%ymm31 - - # interleave 64-bit words in state n, n+2 - vpunpcklqdq %ymm18,%ymm16,%ymm0 - vpunpcklqdq %ymm19,%ymm17,%ymm1 - vpunpckhqdq %ymm18,%ymm16,%ymm2 - vpunpckhqdq %ymm19,%ymm17,%ymm3 - vpunpcklqdq %ymm22,%ymm20,%ymm4 - vpunpcklqdq %ymm23,%ymm21,%ymm5 - vpunpckhqdq %ymm22,%ymm20,%ymm6 - vpunpckhqdq %ymm23,%ymm21,%ymm7 - vpunpcklqdq %ymm26,%ymm24,%ymm8 - vpunpcklqdq %ymm27,%ymm25,%ymm9 - vpunpckhqdq %ymm26,%ymm24,%ymm10 - vpunpckhqdq %ymm27,%ymm25,%ymm11 - vpunpcklqdq %ymm30,%ymm28,%ymm12 - vpunpcklqdq %ymm31,%ymm29,%ymm13 - vpunpckhqdq %ymm30,%ymm28,%ymm14 - vpunpckhqdq %ymm31,%ymm29,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa64 %ymm0,%ymm16 - vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 - cmp $0x0020,%rcx - jl .Lxorpart8 - vpxord 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0000(%rsi) - vmovdqa64 %ymm16,%ymm0 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rcx - jl .Lxorpart8 - vpxord 0x0020(%rdx),%ymm0,%ymm0 - 
vmovdqu64 %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 - cmp $0x0060,%rcx - jl .Lxorpart8 - vpxord 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rcx - jl .Lxorpart8 - vpxord 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rcx - jl .Lxorpart8 - vpxord 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rcx - jl .Lxorpart8 - vpxord 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 - cmp $0x00e0,%rcx - jl .Lxorpart8 - vpxord 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rcx - jl .Lxorpart8 - vpxord 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa64 %ymm4,%ymm0 - cmp $0x0120,%rcx - jl .Lxorpart8 - vpxord 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0100(%rsi) - - vmovdqa64 %ymm12,%ymm0 - cmp $0x0140,%rcx - jl .Lxorpart8 - vpxord 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0120(%rsi) - - vmovdqa64 %ymm6,%ymm0 - cmp $0x0160,%rcx - jl .Lxorpart8 - vpxord 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0140(%rsi) - - vmovdqa64 %ymm14,%ymm0 - cmp $0x0180,%rcx - jl .Lxorpart8 - vpxord 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0160(%rsi) - - vmovdqa64 %ymm5,%ymm0 - cmp $0x01a0,%rcx - jl .Lxorpart8 - vpxord 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0180(%rsi) - - vmovdqa64 %ymm13,%ymm0 - cmp $0x01c0,%rcx - jl .Lxorpart8 - vpxord 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01a0(%rsi) - - vmovdqa64 %ymm7,%ymm0 - cmp $0x01e0,%rcx - jl .Lxorpart8 - vpxord 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01c0(%rsi) - - vmovdqa64 %ymm15,%ymm0 - cmp $0x0200,%rcx - jl .Lxorpart8 - vpxord 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - ret - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0x1f,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0x1f,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} - vpxord %ymm0,%ymm1,%ymm1 - vmovdqu8 %ymm1,(%rsi,%r9){%k1} - - jmp .Ldone8 - -ENDPROC(chacha20_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S deleted file mode 100644 index f6792789f875..000000000000 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ /dev/null @@ -1,792 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#include -#include - -.section .rodata.cst16.ROT8, "aM", @progbits, 16 -.align 16 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 -.section .rodata.cst16.CTRINC, "aM", @progbits, 16 -.align 16 -CTRINC: .octa 0x00000003000000020000000100000000 - -.text - -/* - * chacha20_permute - permute one block - * - * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This - * function performs matrix operations on four words in parallel, but requires - * shuffling to rearrange the words after each round. 8/16-bit word rotation is - * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word - * rotation uses traditional shift+OR. - * - * Clobbers: %ecx, %xmm4-%xmm7 - */ -chacha20_permute: - - movdqa ROT8(%rip),%xmm4 - movdqa ROT16(%rip),%xmm5 - mov $10,%ecx - -.Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm3,%xmm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm3,%xmm3 - - dec %ecx - jnz .Ldoubleround - - ret -ENDPROC(chacha20_permute) - -ENTRY(chacha20_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - FRAME_BEGIN - - # x0..3 = s0..3 - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 - - mov %rcx,%rax - call chacha20_permute - - # o0 = i0 ^ (x0 + s0) - paddd %xmm8,%xmm0 - cmp $0x10,%rax - jl .Lxorpart - movdqu 0x00(%rdx),%xmm4 - pxor %xmm4,%xmm0 - movdqu %xmm0,0x00(%rsi) - # o1 = i1 ^ (x1 + s1) - paddd %xmm9,%xmm1 - movdqa %xmm1,%xmm0 - cmp $0x20,%rax - jl .Lxorpart - movdqu 0x10(%rdx),%xmm0 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - # o2 = i2 ^ (x2 + s2) - paddd %xmm10,%xmm2 - movdqa %xmm2,%xmm0 - cmp $0x30,%rax - jl .Lxorpart - movdqu 0x20(%rdx),%xmm0 - pxor %xmm2,%xmm0 - movdqu %xmm0,0x20(%rsi) - # o3 = i3 ^ (x3 + s3) - paddd %xmm11,%xmm3 - movdqa %xmm3,%xmm0 - cmp $0x40,%rax - jl .Lxorpart - movdqu 0x30(%rdx),%xmm0 - pxor %xmm3,%xmm0 - movdqu %xmm0,0x30(%rsi) - -.Ldone: - FRAME_END - 
ret - -.Lxorpart: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone - -ENDPROC(chacha20_block_xor_ssse3) - -ENTRY(hchacha20_block_ssse3) - # %rdi: Input state matrix, s - # %rsi: output (8 32-bit words) - FRAME_BEGIN - - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 - - call chacha20_permute - - movdqu %xmm0,0x00(%rsi) - movdqu %xmm3,0x10(%rsi) - - FRAME_END - ret -ENDPROC(hchacha20_block_ssse3) - -ENTRY(chacha20_4block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - - # This function encrypts four consecutive ChaCha20 blocks by loading the - # the state matrix in SSE registers four times. As we need some scratch - # registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32- and then 64-bit words, - # which allows us to do XOR in SSE registers. 8/16-bit word rotation is - # done with the slightly better performing SSSE3 byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - lea 8(%rsp),%r10 - sub $0x80,%rsp - and $~63,%rsp - mov %rcx,%rax - - # x0..15[0-3] = s0..3[0..3] - movq 0x00(%rdi),%xmm1 - pshufd $0x00,%xmm1,%xmm0 - pshufd $0x55,%xmm1,%xmm1 - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - movq 0x10(%rdi),%xmm5 - pshufd $0x00,%xmm5,%xmm4 - pshufd $0x55,%xmm5,%xmm5 - movq 0x18(%rdi),%xmm7 - pshufd $0x00,%xmm7,%xmm6 - pshufd $0x55,%xmm7,%xmm7 - movq 0x20(%rdi),%xmm9 - pshufd $0x00,%xmm9,%xmm8 - pshufd $0x55,%xmm9,%xmm9 - movq 0x28(%rdi),%xmm11 - pshufd $0x00,%xmm11,%xmm10 - pshufd $0x55,%xmm11,%xmm11 - movq 0x30(%rdi),%xmm13 - pshufd $0x00,%xmm13,%xmm12 - pshufd $0x55,%xmm13,%xmm13 - movq 0x38(%rdi),%xmm15 - pshufd $0x00,%xmm15,%xmm14 - pshufd $0x55,%xmm15,%xmm15 - # x0..3 on stack - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - - movdqa CTRINC(%rip),%xmm1 - movdqa ROT8(%rip),%xmm2 - movdqa ROT16(%rip),%xmm3 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - - mov $10,%ecx - -.Ldoubleround4: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - 
pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - paddd %xmm15,%xmm10 - pxor 
%xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - - dec %ecx - jnz .Ldoubleround4 - - # x0[0-3] += s0[0] - # x1[0-3] += s0[1] - movq 0x00(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x00(%rsp),%xmm2 - movdqa %xmm2,0x00(%rsp) - paddd 0x10(%rsp),%xmm3 - movdqa %xmm3,0x10(%rsp) - # x2[0-3] += s0[2] - # x3[0-3] += s0[3] - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x20(%rsp),%xmm2 - movdqa %xmm2,0x20(%rsp) - paddd 0x30(%rsp),%xmm3 - movdqa %xmm3,0x30(%rsp) - - # x4[0-3] += s1[0] - # x5[0-3] += s1[1] - movq 0x10(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - # x6[0-3] += s1[2] - # x7[0-3] += s1[3] - movq 0x18(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - - # x8[0-3] += s2[0] - # x9[0-3] += s2[1] - movq 0x20(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm8 - paddd %xmm3,%xmm9 - # x10[0-3] += s2[2] - # x11[0-3] += s2[3] - movq 0x28(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm10 - paddd %xmm3,%xmm11 - - # x12[0-3] += s3[0] - # x13[0-3] += s3[1] - movq 0x30(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm12 - paddd %xmm3,%xmm13 - # x14[0-3] += s3[2] - # x15[0-3] += s3[3] - movq 0x38(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm14 - paddd %xmm3,%xmm15 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - - # interleave 32-bit words in state n, n+1 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x10(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x10(%rsp) - movdqa 0x20(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x20(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpckldq %xmm5,%xmm4 - punpckhdq %xmm5,%xmm0 - movdqa %xmm0,%xmm5 - movdqa %xmm6,%xmm0 - punpckldq %xmm7,%xmm6 - punpckhdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm9,%xmm0 - movdqa %xmm0,%xmm9 - movdqa %xmm10,%xmm0 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpckldq %xmm13,%xmm12 - punpckhdq %xmm13,%xmm0 - movdqa %xmm0,%xmm13 - movdqa %xmm14,%xmm0 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # interleave 64-bit words in state n, n+2 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x20(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x20(%rsp) - movdqa 0x10(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x10(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpcklqdq %xmm6,%xmm4 - punpckhqdq %xmm6,%xmm0 - movdqa %xmm0,%xmm6 - movdqa %xmm5,%xmm0 - punpcklqdq %xmm7,%xmm5 - punpckhqdq 
%xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpcklqdq %xmm10,%xmm8 - punpckhqdq %xmm10,%xmm0 - movdqa %xmm0,%xmm10 - movdqa %xmm9,%xmm0 - punpcklqdq %xmm11,%xmm9 - punpckhqdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpcklqdq %xmm14,%xmm12 - punpckhqdq %xmm14,%xmm0 - movdqa %xmm0,%xmm14 - movdqa %xmm13,%xmm0 - punpcklqdq %xmm15,%xmm13 - punpckhqdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # xor with corresponding input, write to output - movdqa 0x00(%rsp),%xmm0 - cmp $0x10,%rax - jl .Lxorpart4 - movdqu 0x00(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x00(%rsi) - - movdqu %xmm4,%xmm0 - cmp $0x20,%rax - jl .Lxorpart4 - movdqu 0x10(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - - movdqu %xmm8,%xmm0 - cmp $0x30,%rax - jl .Lxorpart4 - movdqu 0x20(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x20(%rsi) - - movdqu %xmm12,%xmm0 - cmp $0x40,%rax - jl .Lxorpart4 - movdqu 0x30(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x30(%rsi) - - movdqa 0x20(%rsp),%xmm0 - cmp $0x50,%rax - jl .Lxorpart4 - movdqu 0x40(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x40(%rsi) - - movdqu %xmm6,%xmm0 - cmp $0x60,%rax - jl .Lxorpart4 - movdqu 0x50(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x50(%rsi) - - movdqu %xmm10,%xmm0 - cmp $0x70,%rax - jl .Lxorpart4 - movdqu 0x60(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x60(%rsi) - - movdqu %xmm14,%xmm0 - cmp $0x80,%rax - jl .Lxorpart4 - movdqu 0x70(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x70(%rsi) - - movdqa 0x10(%rsp),%xmm0 - cmp $0x90,%rax - jl .Lxorpart4 - movdqu 0x80(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x80(%rsi) - - movdqu %xmm5,%xmm0 - cmp $0xa0,%rax - jl .Lxorpart4 - movdqu 0x90(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x90(%rsi) - - movdqu %xmm9,%xmm0 - cmp $0xb0,%rax - jl .Lxorpart4 - movdqu 0xa0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xa0(%rsi) - - movdqu %xmm13,%xmm0 - cmp $0xc0,%rax - jl .Lxorpart4 - movdqu 0xb0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xb0(%rsi) - - movdqa 0x30(%rsp),%xmm0 - cmp $0xd0,%rax - jl .Lxorpart4 - movdqu 0xc0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xc0(%rsi) - - movdqu %xmm7,%xmm0 - cmp $0xe0,%rax - jl .Lxorpart4 - movdqu 0xd0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xd0(%rsi) - - movdqu %xmm11,%xmm0 - cmp $0xf0,%rax - jl .Lxorpart4 - movdqu 0xe0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xe0(%rsi) - - movdqu %xmm15,%xmm0 - cmp $0x100,%rax - jl .Lxorpart4 - movdqu 0xf0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xf0(%rsi) - -.Ldone4: - lea -8(%r10),%rsp - ret - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone4 - -ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c deleted file mode 100644 index 70d388e4a3a2..000000000000 --- a/arch/x86/crypto/chacha20_glue.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#define CHACHA20_STATE_ALIGN 16 - -asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void hchacha20_block_ssse3(const u32 *state, u32 *out); -#ifdef CONFIG_AS_AVX2 -asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -static bool chacha20_use_avx2; -#ifdef CONFIG_AS_AVX512 -asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len); -static bool chacha20_use_avx512vl; -#endif -#endif - -static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks) -{ - len = min(len, maxblocks * CHACHA_BLOCK_SIZE); - return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ -#ifdef CONFIG_AS_AVX2 -#ifdef CONFIG_AS_AVX512 - if (chacha20_use_avx512vl) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha20_8block_xor_avx512vl(state, dst, src, bytes); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha20_8block_xor_avx512vl(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha20_4block_xor_avx512vl(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 4); - return; - } - if (bytes) { - chacha20_2block_xor_avx512vl(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 2); - return; - } - } -#endif - if (chacha20_use_avx2) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha20_8block_xor_avx2(state, dst, src, bytes); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha20_8block_xor_avx2(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha20_4block_xor_avx2(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 4); - return; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha20_2block_xor_avx2(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 2); - return; - } - } -#endif - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha20_4block_xor_ssse3(state, dst, src, bytes); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha20_4block_xor_ssse3(state, dst, src, bytes); - state[12] += chacha20_advance(bytes, 4); - return; - } - if (bytes) { - chacha20_block_xor_ssse3(state, dst, src, bytes); - state[12]++; - } -} - -static int chacha20_simd_stream_xor(struct skcipher_request *req, - struct chacha_ctx *ctx, u8 *iv) -{ - u32 *state, state_buf[16 + 2] __aligned(8); - struct skcipher_walk walk; - int err; - - BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); - state = 
PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); - - err = skcipher_walk_virt(&walk, req, true); - - crypto_chacha_init(state, ctx, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha20_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) - return crypto_chacha_crypt(req); - - kernel_fpu_begin(); - err = chacha20_simd_stream_xor(req, ctx, req->iv); - kernel_fpu_end(); - return err; -} - -static int xchacha20_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha_ctx subctx; - u32 *state, state_buf[16 + 2] __aligned(8); - u8 real_iv[16]; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) - return crypto_xchacha_crypt(req); - - BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); - state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); - crypto_chacha_init(state, ctx, req->iv); - - kernel_fpu_begin(); - - hchacha20_block_ssse3(state, subctx.key); - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - err = chacha20_simd_stream_xor(req, &subctx, real_iv); - - kernel_fpu_end(); - - return err; -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_simd, - .decrypt = chacha20_simd, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = xchacha20_simd, - .decrypt = xchacha20_simd, - }, -}; - -static int __init chacha20_simd_mod_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); -#ifdef CONFIG_AS_AVX512 - chacha20_use_avx512vl = chacha20_use_avx2 && - boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ -#endif -#endif - return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); -} - -static void __exit chacha20_simd_mod_fini(void) -{ - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha20_simd_mod_init); -module_exit(chacha20_simd_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi "); -MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-simd"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-simd"); diff 
--git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c new file mode 100644 index 000000000000..35fd02b50d27 --- /dev/null +++ b/arch/x86/crypto/chacha_glue.c @@ -0,0 +1,270 @@ +/* + * x64 SIMD accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define CHACHA_STATE_ALIGN 16 + +asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); +#ifdef CONFIG_AS_AVX2 +asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +static bool chacha_use_avx2; +#ifdef CONFIG_AS_AVX512 +asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, + unsigned int len, int nrounds); +static bool chacha_use_avx512vl; +#endif +#endif + +static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) +{ + len = min(len, maxblocks * CHACHA_BLOCK_SIZE); + return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; +} + +static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ +#ifdef CONFIG_AS_AVX2 +#ifdef CONFIG_AS_AVX512 + if (chacha_use_avx512vl) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_2block_xor_avx512vl(state, dst, src, bytes, + nrounds); + state[12] += chacha_advance(bytes, 2); + return; + } + } +#endif + if (chacha_use_avx2) { + while (bytes >= CHACHA_BLOCK_SIZE * 8) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 8; + src += CHACHA_BLOCK_SIZE * 8; + dst += CHACHA_BLOCK_SIZE * 8; + state[12] += 8; + } + if (bytes > CHACHA_BLOCK_SIZE * 4) { + chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 8); + return; + } + if (bytes > CHACHA_BLOCK_SIZE * 2) { + chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 4); + return; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); + state[12] += 
chacha_advance(bytes, 2); + return; + } + } +#endif + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state[12] += 4; + } + if (bytes > CHACHA_BLOCK_SIZE) { + chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); + state[12] += chacha_advance(bytes, 4); + return; + } + if (bytes) { + chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); + state[12]++; + } +} + +static int chacha_simd_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) +{ + u32 *state, state_buf[16 + 2] __aligned(8); + struct skcipher_walk walk; + int err; + + BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); + + err = skcipher_walk_virt(&walk, req, true); + + crypto_chacha_init(state, ctx, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int chacha_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_chacha_crypt(req); + + kernel_fpu_begin(); + err = chacha_simd_stream_xor(req, ctx, req->iv); + kernel_fpu_end(); + return err; +} + +static int xchacha_simd(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 *state, state_buf[16 + 2] __aligned(8); + u8 real_iv[16]; + int err; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + return crypto_xchacha_crypt(req); + + BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); + state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); + crypto_chacha_init(state, ctx, req->iv); + + kernel_fpu_begin(); + + hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + err = chacha_simd_stream_xor(req, &subctx, real_iv); + + kernel_fpu_end(); + + return err; +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha_simd, + .decrypt = chacha_simd, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha_simd, + .decrypt = xchacha_simd, + }, +}; + +static int __init chacha_simd_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + 
boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); +#ifdef CONFIG_AS_AVX512 + chacha_use_avx512vl = chacha_use_avx2 && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ +#endif +#endif + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_simd_mod_fini(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi "); +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-simd"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-simd"); -- cgit v1.2.3 From 7a507d62258afd514583fadf1482451079fa0e4d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:04 -0800 Subject: crypto: x86/chacha - add XChaCha12 support Now that the x86_64 SIMD implementations of ChaCha20 and XChaCha20 have been refactored to support varying the number of rounds, add support for XChaCha12. This is identical to XChaCha20 except for the number of rounds, which is 12 instead of 20. This can be used by Adiantum. Reviewed-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha_glue.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 35fd02b50d27..d19c2908be90 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -232,6 +232,21 @@ static struct skcipher_alg algs[] = { .setkey = crypto_chacha20_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-simd", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha12_setkey, + .encrypt = xchacha_simd, + .decrypt = xchacha_simd, }, }; @@ -268,3 +283,5 @@ MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-simd"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-simd"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-simd"); -- cgit v1.2.3 From a033aed5a84eb93a32929b6862602cb283d39e82 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Dec 2018 22:20:05 -0800 Subject: crypto: x86/chacha - yield the FPU occasionally To improve responsiveness, yield the FPU (temporarily re-enabling preemption) every 4 KiB encrypted/decrypted, rather than keeping preemption disabled during the entire encryption/decryption operation. Alternatively we could do this for every skcipher_walk step, but steps may be small in some cases, and yielding the FPU is expensive on x86. 
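A minimal sketch of the pattern, for illustration only: the stub functions
below stand in for kernel_fpu_begin()/kernel_fpu_end() and for the real SIMD
routine (chacha_dosimd() in this patch), and the 4096-byte threshold simply
mirrors the value chosen here. It is not the patch itself, just the control
flow it describes.

	/* Illustrative userspace sketch of "yield the FPU every 4 KiB". */
	#include <stddef.h>

	/* Stubs: the real calls save/restore FPU state and toggle preemption. */
	static void kernel_fpu_begin(void) { }
	static void kernel_fpu_end(void)   { }

	/* Stand-in for the SIMD encrypt/decrypt routine. */
	static void simd_crypt(unsigned char *dst, const unsigned char *src,
			       size_t n)
	{
		(void)dst; (void)src; (void)n;
	}

	void stream_xor_with_yield(unsigned char *dst, const unsigned char *src,
				   size_t total)
	{
		int next_yield = 4096;	/* bytes until the next FPU yield */
		size_t done = 0;

		kernel_fpu_begin();
		while (done < total) {
			size_t step = (total - done > 4096) ? 4096
							    : total - done;

			simd_crypt(dst + done, src + done, step);
			done += step;
			next_yield -= (int)step;

			if (next_yield <= 0 && done < total) {
				/* temporarily allow preemption between chunks */
				kernel_fpu_end();
				kernel_fpu_begin();
				next_yield = 4096;
			}
		}
		kernel_fpu_end();
	}

The actual change keeps the accounting inside the skcipher_walk loop, as the
hunk below shows, so full-size walk steps still run without extra yields.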
Suggested-by: Martin Willi Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha_glue.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index d19c2908be90..9b1d3fac4943 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -132,6 +132,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, { u32 *state, state_buf[16 + 2] __aligned(8); struct skcipher_walk walk; + int next_yield = 4096; /* bytes until next FPU yield */ int err; BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); @@ -144,12 +145,21 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; - if (nbytes < walk.total) + if (nbytes < walk.total) { nbytes = round_down(nbytes, walk.stride); + next_yield -= nbytes; + } chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); + if (next_yield <= 0) { + /* temporarily allow preemption */ + kernel_fpu_end(); + kernel_fpu_begin(); + next_yield = 4096; + } + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } -- cgit v1.2.3 From f9b1d64678607e132051d232c7a2127f32947d64 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:56:45 +0000 Subject: crypto: aesni - Merge GCM_ENC_DEC The GCM_ENC_DEC routines for AVX and AVX2 are identical, except they call separate sub-macros. Pass the macros as arguments, and merge them. This facilitates additional refactoring, by requiring changes in only one place. The GCM_ENC_DEC macro was moved above the CONFIG_AS_AVX* ifdefs, since it will be used by both AVX and AVX2. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 2481 +++++++++++++----------------- 1 file changed, 1083 insertions(+), 1398 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 1985ea0b551b..318135a77975 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -280,1252 +280,1250 @@ VARIABLE_OFFSET = 16*8 vaesenclast 16*10(arg1), \XMM0, \XMM0 .endm -#ifdef CONFIG_AS_AVX -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
-############################################################################### -.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 +# combined for GCM encrypt and decrypt functions +# clobbering all xmm registers +# clobbering r10, r11, r12, r13, r14, r15 +.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC - vpshufd $0b01001110, \GH, \T2 - vpshufd $0b01001110, \HK, \T3 - vpxor \GH , \T2, \T2 # T2 = (a1+a0) - vpxor \HK , \T3, \T3 # T3 = (b1+b0) + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 - vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 - vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 - vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) - vpxor \GH, \T2,\T2 - vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 + mov %rsp, %r14 - vpslldq $8, \T2,\T3 # shift-L T3 2 DWs - vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs - vpxor \T3, \GH, \GH - vpxor \T2, \T1, \T1 # = GH x HK - #first phase of the reduction - vpslld $31, \GH, \T2 # packed right shifting << 31 - vpslld $30, \GH, \T3 # packed right shifting shift << 30 - vpslld $25, \GH, \T4 # packed right shifting shift << 25 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - vpsrldq $4, \T2, \T5 # shift-R T5 1 DW + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \GH, \GH # first phase of the reduction complete - #second phase of the reduction + vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey - vpsrld $1,\GH, \T2 # packed left shifting >> 1 - vpsrld $2,\GH, \T3 # packed left shifting >> 2 - vpsrld $7,\GH, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 + mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # r13 = r13 - (r13 mod 16) - vpxor \T5, \T2, \T2 - vpxor \T2, \GH, \GH - vpxor \T1, \GH, \GH # the result is in GH + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0\@ + cmp $7, %r12 + je _initial_num_blocks_is_7\@ + cmp $6, %r12 + je _initial_num_blocks_is_6\@ + cmp $5, %r12 + je _initial_num_blocks_is_5\@ + cmp $4, %r12 + je _initial_num_blocks_is_4\@ + cmp $3, %r12 + je _initial_num_blocks_is_3\@ + cmp $2, %r12 + je _initial_num_blocks_is_2\@ -.endm + jmp _initial_num_blocks_is_1\@ -.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 +_initial_num_blocks_is_7\@: + \INITIAL_BLOCKS 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*7, %r13 + jmp _initial_blocks_encrypted\@ - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 +_initial_num_blocks_is_6\@: + \INITIAL_BLOCKS 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*6, %r13 + jmp _initial_blocks_encrypted\@ - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_k(arg1) +_initial_num_blocks_is_5\@: + \INITIAL_BLOCKS 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*5, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_2_k(arg1) +_initial_num_blocks_is_4\@: + \INITIAL_BLOCKS 4, %xmm12, %xmm13, 
%xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*4, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqa \T5, HashKey_3(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_3_k(arg1) +_initial_num_blocks_is_3\@: + \INITIAL_BLOCKS 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*3, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqa \T5, HashKey_4(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_4_k(arg1) +_initial_num_blocks_is_2\@: + \INITIAL_BLOCKS 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*2, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqa \T5, HashKey_5(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_5_k(arg1) +_initial_num_blocks_is_1\@: + \INITIAL_BLOCKS 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*1, %r13 + jmp _initial_blocks_encrypted\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqa \T5, HashKey_6(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_6_k(arg1) +_initial_num_blocks_is_0\@: + \INITIAL_BLOCKS 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqa \T5, HashKey_7(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_7_k(arg1) - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqa \T5, HashKey_8(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_8_k(arg1) +_initial_blocks_encrypted\@: + cmp $0, %r13 + je _zero_cipher_left\@ -.endm + sub $128, %r13 + je _eight_cipher_left\@ -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, r14 are used as a pointer only, not modified -.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC - i = (8-\num_initial_blocks) - j = 0 - setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen + vmovd %xmm9, %r15d + and $255, %r15d + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - mov %r12, %r11 - vpxor reg_j, reg_j, reg_j - vpxor reg_i, reg_i, reg_i - cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: - vmovdqu (%r10), reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge _get_AAD_blocks\@ - vmovdqu reg_j, reg_i - cmp $0, %r11 - je _get_AAD_done\@ +_encrypt_by_8_new\@: + cmp $(255-8), %r15d + jg _encrypt_by_8\@ - vpxor reg_i, reg_i, reg_i - /* read the last <16B of AAD. 
since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: - cmp $4, %r11 - jle _get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, reg_i, reg_i - vpxor \T1, reg_i, reg_i - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: - cmp $0, %r11 - jle _get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i -_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, reg_i, reg_i -_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_j, reg_i, reg_i - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 -_get_AAD_done\@: - # initialize the data pointer offset as zero - xor %r11d, %r11d + add $8, %r15b + \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ - # start AES for num_initial_blocks blocks - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), \CTR # CTR = Y0 - vpshufb SHUF_MASK(%rip), \CTR, \CTR + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + jmp _eight_cipher_left\@ +_encrypt_by_8\@: + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $8, %r15b + \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - j = 1 - setreg -.rep 9 - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - j = (j+1) - setreg -.endr - - - vmovdqa 16*10(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg3, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr +_eight_cipher_left\@: + \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg +_zero_cipher_left\@: + cmp $16, arg4 + jl _only_less_than_16\@ -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here + mov arg4, %r13 + 
and $15, %r13 # r13 = (arg4 mod 16) - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 + je _multiple_of_16_bytes\@ - cmp $128, %r13 - jl _initial_blocks_done\@ # no need for precomputed constants + # handle the last <16 Byte block seperately -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + sub $16, %r11 + add %r13, %r11 + vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes + jmp _final_ghash_mul\@ - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +_only_less_than_16\@: + # check for 0 length + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + je _multiple_of_16_bytes\@ - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + # handle the last <16 Byte block separately - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, \XMM8, \XMM8 + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) - i = 1 - setreg -.rep 9 # do 9 rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, \XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 +_get_last_16_byte_loop\@: + movb (arg3, %r11), %al + movb %al, TMP1 (%rsp , %r11) + add $1, %r11 + cmp %r13, %r11 + jne _get_last_16_byte_loop\@ - vmovdqu (arg3, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - 
vmovdqu \XMM1, (arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif + vmovdqu TMP1(%rsp), %xmm1 - vmovdqu 16*1(arg3, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif + sub $16, %r11 - vmovdqu 16*2(arg3, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif +_final_ghash_mul\@: + .if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm2, %xmm2 + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + .else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext + .endif - vmovdqu 16*3(arg3, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - vmovdqu 16*4(arg3, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif + ############################# + # output r13 Bytes + vmovq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left\@ - vmovdqu 16*5(arg3, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif + mov %rax, (arg2 , %r11) + add $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + vmovq %xmm9, %rax + sub $8, %r13 - vmovdqu 16*6(arg3, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif +_less_than_8_bytes_left\@: + movb %al, (arg2 , %r11) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left\@ + ############################# - vmovdqu 16*7(arg3, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif +_multiple_of_16_bytes\@: + mov arg7, %r12 # r12 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + vmovd %r12d, %xmm15 # len(A) in xmm15 - add $128, %r11 + shl $3, arg4 # len(C) in bits (*128) + vmovq arg4, %xmm1 + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 
16Byte swap + vpxor %xmm15, %xmm14, %xmm14 + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap -############################################################################### + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), %xmm9 # xmm9 = Y0 -_initial_blocks_done\@: + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) -.endm + vpxor %xmm14, %xmm9, %xmm9 -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR +_return_T\@: + mov arg8, %r10 # r10 = authTag + mov arg9, %r11 # r11 = auth_tag_len - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif + cmp $16, %r11 + je _T_16\@ + cmp $8, %r11 + jl _T_4\@ - ####################################################################### +_T_8\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: + vmovd %xmm9, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) + jmp _return_T_done\@ - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 +_T_16\@: + vmovdqu %xmm9, (%r10) - ####################################################################### +_return_T_done\@: + mov %r14, %rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm +#ifdef CONFIG_AS_AVX +############################################################################### +# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +# Input: A and B (128-bits each, bit-reflected) +# Output: C = A*B*x mod poly, (i.e. 
>>1 ) +# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +############################################################################### +.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 + vpshufd $0b01001110, \GH, \T2 + vpshufd $0b01001110, \HK, \T3 + vpxor \GH , \T2, \T2 # T2 = (a1+a0) + vpxor \HK , \T3, \T3 # T3 = (b1+b0) + vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 + vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 + vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) + vpxor \GH, \T2,\T2 + vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + vpslldq $8, \T2,\T3 # shift-L T3 2 DWs + vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs + vpxor \T3, \GH, \GH + vpxor \T2, \T1, \T1 # = GH x HK - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + #first phase of the reduction + vpslld $31, \GH, \T2 # packed right shifting << 31 + vpslld $30, \GH, \T3 # packed right shifting shift << 30 + vpslld $25, \GH, \T4 # packed right shifting shift << 25 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 - ####################################################################### + vpsrldq $4, \T2, \T5 # shift-R T5 1 DW - vmovdqa HashKey_8(arg1), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \GH, \GH # first phase of the reduction complete - vpshufd $0b01001110, \T2, \T6 - vpxor \T2, \T6, \T6 + #second phase of the reduction - vmovdqa HashKey_8_k(arg1), \T5 - vpclmulqdq $0x00, \T5, \T6, \T6 + vpsrld $1,\GH, \T2 # packed left shifting >> 1 + vpsrld $2,\GH, \T3 # packed left shifting >> 2 + vpsrld $7,\GH, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + vpxor \T5, \T2, \T2 + vpxor \T2, \GH, \GH + vpxor \T1, \GH, \GH # the result is in GH - vmovdqa TMP2(%rsp), \T1 - vmovdqa HashKey_7(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_7_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 +.endm - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 +.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 - ####################################################################### + # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa \HK, \T5 - vmovdqa TMP3(%rsp), \T1 - vmovdqa HashKey_6(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + 
vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_k(arg1) - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_6_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly + vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_2_k(arg1) - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly + vmovdqa \T5, HashKey_3(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_3_k(arg1) - vmovdqa TMP4(%rsp), \T1 - vmovdqa HashKey_5(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly + vmovdqa \T5, HashKey_4(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_4_k(arg1) - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_5_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly + vmovdqa \T5, HashKey_5(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_5_k(arg1) - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly + vmovdqa \T5, HashKey_6(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_6_k(arg1) + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly + vmovdqa \T5, HashKey_7(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_7_k(arg1) - vmovdqa TMP5(%rsp), \T1 - vmovdqa HashKey_4(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly + vmovdqa \T5, HashKey_8(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_8_k(arg1) - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_4_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 +.endm - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 +## if a = number of total plaintext bytes +## b = floor(a/16) +## num_initial_blocks = b mod 4# +## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +## r10, r11, r12, rax are clobbered +## arg1, arg2, arg3, r14 are used as a pointer only, not modified - vmovdqa TMP6(%rsp), \T1 - vmovdqa HashKey_3(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 +.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 
XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC + i = (8-\num_initial_blocks) + j = 0 + setreg - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_3_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + mov %r12, %r11 - vmovdqa TMP7(%rsp), \T1 - vmovdqa HashKey_2(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_2_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + vpxor reg_i, reg_i, reg_i - ####################################################################### + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. 
since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 +_get_AAD_done\@: + # initialize the data pointer offset as zero + xor %r11d, %r11d - vmovdqa TMP8(%rsp), \T1 - vmovdqa HashKey(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + # start AES for num_initial_blocks blocks + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), \CTR # CTR = Y0 + vpshufb SHUF_MASK(%rip), \CTR, \CTR - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - vpxor \T4, \T6, \T6 - vpxor \T7, \T6, \T6 + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap + i = (i+1) + setreg +.endr - vmovdqu 16*10(arg1), \T5 + vmovdqa (arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpxor \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr - i = 0 j = 1 setreg -.rep 8 - vpxor 16*i(arg3, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg3, %r11), reg_j - vmovdqu \T3, 16*i(arg2, %r11) - .endif +.rep 9 + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenc \T_key, reg_i, reg_i i = (i+1) + setreg +.endr + j = (j+1) setreg .endr - ####################################################################### - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 + vmovdqa 16*10(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenclast \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vmovdqu (arg3, %r11), \T1 + vpxor \T1, reg_i, reg_i + vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks + add $16, %r11 +.if \ENC_DEC == DEC + vmovdqa \T1, reg_i +.endif + vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations + i = (i+1) + setreg +.endr - ####################################################################### - #first phase of the reduction - ####################################################################### - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 + i = (8-\num_initial_blocks) + j = (9-\num_initial_blocks) + setreg - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 +.rep \num_initial_blocks + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks + i = (i+1) + j = (j+1) + setreg +.endr + # XMM8 has the combined result here - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + vmovdqa \XMM8, TMP1(%rsp) + vmovdqa \XMM8, 
\T3 - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer - .endif + cmp $128, %r13 + jl _initial_blocks_done\@ # no need for precomputed constants - ####################################################################### - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 +############################################################################### +# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - ####################################################################### + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM2 + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM3 + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM4 + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpxor \T6, \XMM1, \XMM1 + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM5 + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM6 + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM7 + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap -.endm + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM8 + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + vmovdqa (arg1), \T_key + vpxor \T_key, \XMM1, \XMM1 + vpxor \T_key, \XMM2, \XMM2 + vpxor \T_key, \XMM3, \XMM3 + vpxor \T_key, \XMM4, \XMM4 + vpxor \T_key, \XMM5, \XMM5 + vpxor \T_key, \XMM6, \XMM6 + vpxor \T_key, \XMM7, \XMM7 + vpxor \T_key, \XMM8, \XMM8 -# GHASH the last 4 ciphertext blocks. 
-.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 + i = 1 + setreg +.rep 9 # do 9 rounds + vmovdqa 16*i(arg1), \T_key + vaesenc \T_key, \XMM1, \XMM1 + vaesenc \T_key, \XMM2, \XMM2 + vaesenc \T_key, \XMM3, \XMM3 + vaesenc \T_key, \XMM4, \XMM4 + vaesenc \T_key, \XMM5, \XMM5 + vaesenc \T_key, \XMM6, \XMM6 + vaesenc \T_key, \XMM7, \XMM7 + vaesenc \T_key, \XMM8, \XMM8 + i = (i+1) + setreg +.endr - ## Karatsuba Method + vmovdqa 16*i(arg1), \T_key + vaesenclast \T_key, \XMM1, \XMM1 + vaesenclast \T_key, \XMM2, \XMM2 + vaesenclast \T_key, \XMM3, \XMM3 + vaesenclast \T_key, \XMM4, \XMM4 + vaesenclast \T_key, \XMM5, \XMM5 + vaesenclast \T_key, \XMM6, \XMM6 + vaesenclast \T_key, \XMM7, \XMM7 + vaesenclast \T_key, \XMM8, \XMM8 - vpshufd $0b01001110, \XMM1, \T2 - vpxor \XMM1, \T2, \T2 - vmovdqa HashKey_8(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 + vmovdqu (arg3, %r11), \T1 + vpxor \T1, \XMM1, \XMM1 + vmovdqu \XMM1, (arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM1 + .endif - vmovdqa HashKey_8_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \XMM1 + vmovdqu 16*1(arg3, %r11), \T1 + vpxor \T1, \XMM2, \XMM2 + vmovdqu \XMM2, 16*1(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM2 + .endif - ###################### + vmovdqu 16*2(arg3, %r11), \T1 + vpxor \T1, \XMM3, \XMM3 + vmovdqu \XMM3, 16*2(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM3 + .endif - vpshufd $0b01001110, \XMM2, \T2 - vpxor \XMM2, \T2, \T2 - vmovdqa HashKey_7(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 + vmovdqu 16*3(arg3, %r11), \T1 + vpxor \T1, \XMM4, \XMM4 + vmovdqu \XMM4, 16*3(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM4 + .endif - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 + vmovdqu 16*4(arg3, %r11), \T1 + vpxor \T1, \XMM5, \XMM5 + vmovdqu \XMM5, 16*4(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM5 + .endif - vmovdqa HashKey_7_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vmovdqu 16*5(arg3, %r11), \T1 + vpxor \T1, \XMM6, \XMM6 + vmovdqu \XMM6, 16*5(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM6 + .endif - ###################### + vmovdqu 16*6(arg3, %r11), \T1 + vpxor \T1, \XMM7, \XMM7 + vmovdqu \XMM7, 16*6(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM7 + .endif - vpshufd $0b01001110, \XMM3, \T2 - vpxor \XMM3, \T2, \T2 - vmovdqa HashKey_6(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 + vmovdqu 16*7(arg3, %r11), \T1 + vpxor \T1, \XMM8, \XMM8 + vmovdqu \XMM8, 16*7(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM8 + .endif - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 + add $128, %r11 - vmovdqa HashKey_6_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - ###################### +############################################################################### - vpshufd $0b01001110, \XMM4, \T2 - 
vpxor \XMM4, \T2, \T2 - vmovdqa HashKey_5(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 +_initial_blocks_done\@: - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 +.endm - vmovdqa HashKey_5_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 +# encrypt 8 blocks at a time +# ghash the 8 previously encrypted ciphertext blocks +# arg1, arg2, arg3 are used as pointers only, not modified +# r11 is the data offset value +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - ###################### + vmovdqa \XMM1, \T2 + vmovdqa \XMM2, TMP2(%rsp) + vmovdqa \XMM3, TMP3(%rsp) + vmovdqa \XMM4, TMP4(%rsp) + vmovdqa \XMM5, TMP5(%rsp) + vmovdqa \XMM6, TMP6(%rsp) + vmovdqa \XMM7, TMP7(%rsp) + vmovdqa \XMM8, TMP8(%rsp) - vpshufd $0b01001110, \XMM5, \T2 - vpxor \XMM5, \T2, \T2 - vmovdqa HashKey_4(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 +.if \loop_idx == in_order + vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONE(%rip), \XMM1, \XMM2 + vpaddd ONE(%rip), \XMM2, \XMM3 + vpaddd ONE(%rip), \XMM3, \XMM4 + vpaddd ONE(%rip), \XMM4, \XMM5 + vpaddd ONE(%rip), \XMM5, \XMM6 + vpaddd ONE(%rip), \XMM6, \XMM7 + vpaddd ONE(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +.else + vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONEf(%rip), \XMM1, \XMM2 + vpaddd ONEf(%rip), \XMM2, \XMM3 + vpaddd ONEf(%rip), \XMM3, \XMM4 + vpaddd ONEf(%rip), \XMM4, \XMM5 + vpaddd ONEf(%rip), \XMM5, \XMM6 + vpaddd ONEf(%rip), \XMM6, \XMM7 + vpaddd ONEf(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR +.endif - vmovdqa HashKey_4_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - ###################### + ####################################################################### - vpshufd $0b01001110, \XMM6, \T2 - vpxor \XMM6, \T2, \T2 - vmovdqa HashKey_3(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 + vmovdqu (arg1), \T1 + vpxor \T1, \XMM1, \XMM1 + vpxor \T1, \XMM2, \XMM2 + vpxor \T1, \XMM3, \XMM3 + vpxor \T1, \XMM4, \XMM4 + vpxor \T1, \XMM5, \XMM5 + vpxor \T1, \XMM6, \XMM6 + vpxor \T1, \XMM7, \XMM7 + vpxor \T1, \XMM8, \XMM8 - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 + ####################################################################### - vmovdqa HashKey_3_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - ###################### - vpshufd $0b01001110, \XMM7, \T2 - vpxor \XMM7, \T2, \T2 - vmovdqa HashKey_2(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - vmovdqa HashKey_2_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vmovdqu 16*1(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, 
\XMM8 - ###################### + vmovdqu 16*2(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - vpshufd $0b01001110, \XMM8, \T2 - vpxor \XMM8, \T2, \T2 - vmovdqa HashKey(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 + ####################################################################### - vmovdqa HashKey_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 + vpshufd $0b01001110, \T2, \T6 + vpxor \T2, \T6, \T6 + vmovdqa HashKey_8_k(arg1), \T5 + vpclmulqdq $0x00, \T5, \T6, \T6 + vmovdqu 16*3(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + vmovdqa TMP2(%rsp), \T1 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_7_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # holds the result of - # the accumulated carry-less multiplications + vmovdqu 16*4(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - ####################################################################### - #first phase of the reduction - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 + ####################################################################### - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 + vmovdqa TMP3(%rsp), \T1 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_6_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + vmovdqa TMP4(%rsp), \T1 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 + vpshufd 
$0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_5_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 -.endm + vmovdqa TMP5(%rsp), \T1 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r14, r15 -.macro GCM_ENC_DEC_AVX ENC_DEC + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_4_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - mov %rsp, %r14 + vmovdqa TMP6(%rsp), \T1 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_3_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + vmovdqa TMP7(%rsp), \T1 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_2_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + ####################################################################### - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - (r13 mod 16) + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz _initial_num_blocks_is_0\@ + vmovdqa TMP8(%rsp), \T1 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - cmp $7, %r12 - je _initial_num_blocks_is_7\@ - cmp $6, %r12 - je _initial_num_blocks_is_6\@ - cmp $5, %r12 - je _initial_num_blocks_is_5\@ - cmp $4, %r12 - je _initial_num_blocks_is_4\@ - cmp $3, %r12 - je _initial_num_blocks_is_3\@ - cmp $2, %r12 - je _initial_num_blocks_is_2\@ + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - jmp _initial_num_blocks_is_1\@ + vpxor \T4, \T6, \T6 + vpxor \T7, \T6, \T6 -_initial_num_blocks_is_7\@: - INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, 
%xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp _initial_blocks_encrypted\@ + vmovdqu 16*10(arg1), \T5 -_initial_num_blocks_is_6\@: - INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp _initial_blocks_encrypted\@ + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg3, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg3, %r11), reg_j + vmovdqu \T3, 16*i(arg2, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### -_initial_num_blocks_is_5\@: - INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp _initial_blocks_encrypted\@ -_initial_num_blocks_is_4\@: - INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp _initial_blocks_encrypted\@ + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 -_initial_num_blocks_is_3\@: - INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp _initial_blocks_encrypted\@ -_initial_num_blocks_is_2\@: - INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp _initial_blocks_encrypted\@ -_initial_num_blocks_is_1\@: - INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp _initial_blocks_encrypted\@ + ####################################################################### + #first phase of the reduction + ####################################################################### + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 -_initial_num_blocks_is_0\@: - INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW -_initial_blocks_encrypted\@: - cmp $0, %r13 - je _zero_cipher_left\@ + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + .endif - sub 
$128, %r13 - je _eight_cipher_left\@ + ####################################################################### + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + ####################################################################### + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor \T6, \XMM1, \XMM1 -_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg _encrypt_by_8\@ +.endm - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ +# GHASH the last 4 ciphertext blocks. +.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp _eight_cipher_left\@ + ## Karatsuba Method -_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpshufd $0b01001110, \XMM1, \T2 + vpxor \XMM1, \T2, \T2 + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 + vmovdqa HashKey_8_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \XMM1 + ###################### + vpshufd $0b01001110, \XMM2, \T2 + vpxor \XMM2, \T2, \T2 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 -_eight_cipher_left\@: - GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + vmovdqa HashKey_7_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 -_zero_cipher_left\@: - cmp $16, arg4 - jl _only_less_than_16\@ + ###################### - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + vpshufd $0b01001110, \XMM3, \T2 + vpxor \XMM3, \T2, \T2 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 - je _multiple_of_16_bytes\@ + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 - # handle the last <16 Byte block seperately + vmovdqa HashKey_6_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + ###################### - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + vpshufd $0b01001110, \XMM4, \T2 + vpxor \XMM4, \T2, \T2 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, 
\T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 - sub $16, %r11 - add %r13, %r11 - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes - jmp _final_ghash_mul\@ + vmovdqa HashKey_5_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 -_only_less_than_16\@: - # check for 0 length - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + ###################### - je _multiple_of_16_bytes\@ + vpshufd $0b01001110, \XMM5, \T2 + vpxor \XMM5, \T2, \T2 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 - # handle the last <16 Byte block seperately + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 + vmovdqa HashKey_4_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + ###################### + vpshufd $0b01001110, \XMM6, \T2 + vpxor \XMM6, \T2, \T2 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 -_get_last_16_byte_loop\@: - movb (arg3, %r11), %al - movb %al, TMP1 (%rsp , %r11) - add $1, %r11 - cmp %r13, %r11 - jne _get_last_16_byte_loop\@ + vmovdqa HashKey_3_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - vmovdqu TMP1(%rsp), %xmm1 + ###################### - sub $16, %r11 + vpshufd $0b01001110, \XMM7, \T2 + vpxor \XMM7, \T2, \T2 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 -_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 + vmovdqa HashKey_2_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle _less_than_8_bytes_left\@ + ###################### - mov %rax, (arg2 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, 
%rax - sub $8, %r13 + vpshufd $0b01001110, \XMM8, \T2 + vpxor \XMM8, \T2, \T2 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 -_less_than_8_bytes_left\@: - movb %al, (arg2 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne _less_than_8_bytes_left\@ - ############################# + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 -_multiple_of_16_bytes\@: - mov arg7, %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # len(A) in xmm15 + vmovdqa HashKey_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 - shl $3, arg4 # len(C) in bits (*128) - vmovq arg4, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 - vpxor %xmm15, %xmm14, %xmm14 - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), %xmm9 # xmm9 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) - vpxor %xmm14, %xmm9, %xmm9 + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # holds the result of + # the accumulated carry-less multiplications + ####################################################################### + #first phase of the reduction + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 -_return_T\@: - mov arg8, %r10 # r10 = authTag - mov arg9, %r11 # r11 = auth_tag_len + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 - cmp $16, %r11 - je _T_16\@ + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - cmp $8, %r11 - jl _T_4\@ + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### -_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl _T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je _return_T_done\@ - add $2, %r10 - sar $16, %eax -_T_1\@: - mov %al, (%r10) - jmp _return_T_done\@ -_T_16\@: - vmovdqu %xmm9, (%r10) + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 -_return_T_done\@: - mov %r14, %rsp + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 - pop %r15 - pop %r14 - pop %r13 - pop %r12 .endm - ############################################################# #void aesni_gcm_precomp_avx_gen2 # (gcm_data *my_ctx_data, @@ -1593,7 +1591,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) # Valid values are 16 (most likely), 12 or 8. 
*/ ############################################################################### ENTRY(aesni_gcm_enc_avx_gen2) - GCM_ENC_DEC_AVX ENC + GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC ret ENDPROC(aesni_gcm_enc_avx_gen2) @@ -1614,7 +1612,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_dec_avx_gen2) - GCM_ENC_DEC_AVX DEC + GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC ret ENDPROC(aesni_gcm_dec_avx_gen2) #endif /* CONFIG_AS_AVX */ @@ -2378,477 +2376,164 @@ _initial_blocks_done\@: vmovdqa HashKey_7(arg1), \T5 vpshufd $0b01001110, \XMM2, \T2 vpshufd $0b01001110, \T5, \T3 - vpxor \XMM2, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_6(arg1), \T5 - vpshufd $0b01001110, \XMM3, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM3, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_5(arg1), \T5 - vpshufd $0b01001110, \XMM4, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM4, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_4(arg1), \T5 - vpshufd $0b01001110, \XMM5, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM5, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_3(arg1), \T5 - vpshufd $0b01001110, \XMM6, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM6, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_2(arg1), \T5 - vpshufd $0b01001110, \XMM7, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM7, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey(arg1), \T5 - vpshufd $0b01001110, \XMM8, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM8, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # holds the result of the - # accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, 
\T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T6, \T6 # the result is in T6 -.endm - - - -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r14, r15 -.macro GCM_ENC_DEC_AVX2 ENC_DEC - - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - - mov %rsp, %r14 - - - - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes - - - vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey - - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - (r13 mod 16) - - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz _initial_num_blocks_is_0\@ - - cmp $7, %r12 - je _initial_num_blocks_is_7\@ - cmp $6, %r12 - je _initial_num_blocks_is_6\@ - cmp $5, %r12 - je _initial_num_blocks_is_5\@ - cmp $4, %r12 - je _initial_num_blocks_is_4\@ - cmp $3, %r12 - je _initial_num_blocks_is_3\@ - cmp $2, %r12 - je _initial_num_blocks_is_2\@ - - jmp _initial_num_blocks_is_1\@ - -_initial_num_blocks_is_7\@: - INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_6\@: - INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_5\@: - INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_4\@: - INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_3\@: - INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_2\@: - INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_1\@: - INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp _initial_blocks_encrypted\@ - -_initial_num_blocks_is_0\@: - INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - - -_initial_blocks_encrypted\@: - cmp $0, %r13 - je _zero_cipher_left\@ - - sub $128, %r13 - je _eight_cipher_left\@ - - - + vpxor 
\XMM2, \T2, \T2 + vpxor \T5, \T3, \T3 - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 -_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg _encrypt_by_8\@ + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + ###################### - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ + vmovdqa HashKey_6(arg1), \T5 + vpshufd $0b01001110, \XMM3, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM3, \T2, \T2 + vpxor \T5, \T3, \T3 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp _eight_cipher_left\@ + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 -_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + ###################### + vmovdqa HashKey_5(arg1), \T5 + vpshufd $0b01001110, \XMM4, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM4, \T2, \T2 + vpxor \T5, \T3, \T3 -_eight_cipher_left\@: - GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 -_zero_cipher_left\@: - cmp $16, arg4 - jl _only_less_than_16\@ + vpclmulqdq $0x00, \T3, \T2, \T2 - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + vpxor \T2, \XMM1, \XMM1 - je _multiple_of_16_bytes\@ + ###################### - # handle the last <16 Byte block seperately + vmovdqa HashKey_4(arg1), \T5 + vpshufd $0b01001110, \XMM5, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM5, \T2, \T2 + vpxor \T5, \T3, \T3 + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 - sub $16, %r11 - add %r13, %r11 - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + vpclmulqdq $0x00, \T3, \T2, \T2 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer - # to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes - jmp _final_ghash_mul\@ + vpxor \T2, \XMM1, \XMM1 -_only_less_than_16\@: - # check for 0 length - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + ###################### - je _multiple_of_16_bytes\@ + vmovdqa HashKey_3(arg1), \T5 + vpshufd $0b01001110, \XMM6, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM6, \T2, \T2 + vpxor \T5, \T3, \T3 - # handle the last <16 Byte block seperately + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - 
ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) + ###################### -_get_last_16_byte_loop\@: - movb (arg3, %r11), %al - movb %al, TMP1 (%rsp , %r11) - add $1, %r11 - cmp %r13, %r11 - jne _get_last_16_byte_loop\@ + vmovdqa HashKey_2(arg1), \T5 + vpshufd $0b01001110, \XMM7, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM7, \T2, \T2 + vpxor \T5, \T3, \T3 - vmovdqu TMP1(%rsp), %xmm1 + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 - sub $16, %r11 + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 -_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle _less_than_8_bytes_left\@ + ###################### - mov %rax, (arg2 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, %rax - sub $8, %r13 + vmovdqa HashKey(arg1), \T5 + vpshufd $0b01001110, \XMM8, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM8, \T2, \T2 + vpxor \T5, \T3, \T3 -_less_than_8_bytes_left\@: - movb %al, (arg2 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne _less_than_8_bytes_left\@ - ############################# + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 -_multiple_of_16_bytes\@: - mov arg7, %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # len(A) in xmm15 + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 - shl $3, arg4 # len(C) in bits (*128) - vmovq arg4, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor %xmm15, %xmm14, %xmm14 - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), %xmm9 # xmm9 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) - vpxor %xmm14, %xmm9, %xmm9 + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # holds the result of the + # accumulated carry-less multiplications -_return_T\@: - mov arg8, %r10 # r10 = authTag - mov arg9, %r11 # 
r11 = auth_tag_len + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 - cmp $16, %r11 - je _T_16\@ + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - cmp $8, %r11 - jl _T_4\@ + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### -_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl _T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je _return_T_done\@ - add $2, %r10 - sar $16, %eax -_T_1\@: - mov %al, (%r10) - jmp _return_T_done\@ -_T_16\@: - vmovdqu %xmm9, (%r10) + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) -_return_T_done\@: - mov %r14, %rsp + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - pop %r15 - pop %r14 - pop %r13 - pop %r12 + vpxor \T2, \T4, \T4 # second phase of the reduction complete + ####################################################################### + vpxor \T4, \T6, \T6 # the result is in T6 .endm + ############################################################# #void aesni_gcm_precomp_avx_gen4 # (gcm_data *my_ctx_data, @@ -2918,7 +2603,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_enc_avx_gen4) - GCM_ENC_DEC_AVX2 ENC + GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC ret ENDPROC(aesni_gcm_enc_avx_gen4) @@ -2939,7 +2624,7 @@ ENDPROC(aesni_gcm_enc_avx_gen4) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_dec_avx_gen4) - GCM_ENC_DEC_AVX2 DEC + GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC ret ENDPROC(aesni_gcm_dec_avx_gen4) -- cgit v1.2.3 From de85fc46b1037099c78d2fec08a662079e568d62 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:57:00 +0000 Subject: crypto: aesni - Introduce gcm_context_data Add the gcm_context_data structure to the avx asm routines. This will be necessary to support both 256 bit keys and scatter/gather. The pre-computed HashKeys are now stored in the gcm_context_data struct, which is expanded to hold the greater number of hashkeys necessary for avx. 
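For reference, a minimal C sketch of a context layout that is consistent with the offsets the assembly below relies on: the precomputed hash keys start at byte 16*6 into the context pointed to by arg2, and HashKey through HashKey_8_k occupy sixteen consecutive 16-byte slots (16*6 through 16*21). The actual structure is defined on the C side (aesni-intel_glue.c, not shown in this hunk); the field names ahead of the hash-key area here are illustrative assumptions, not the kernel's exact definition.

    #include <stdint.h>
    #include <stddef.h>

    #define GCM_BLOCK_LEN 16

    /*
     * Illustrative layout only: field names before hash_keys are assumed.
     * The point is that the hash-key area begins at byte offset 16*6 = 96,
     * matching the HashKey .. HashKey_8_k offsets used by the asm.
     */
    struct gcm_context_sketch {
            uint8_t  aad_hash[GCM_BLOCK_LEN];              /* running GHASH of the AAD      */
            uint64_t aad_length;                           /* bytes of AAD processed so far */
            uint64_t in_length;                            /* bytes of text processed       */
            uint8_t  partial_block_enc_key[GCM_BLOCK_LEN];
            uint8_t  orig_IV[GCM_BLOCK_LEN];
            uint8_t  current_counter[GCM_BLOCK_LEN];
            uint64_t partial_block_len;
            uint64_t unused;
            /* HashKey^1..^8 plus the eight Karatsuba XOR values: 16 slots */
            uint8_t  hash_keys[GCM_BLOCK_LEN * 16];
    };

    /* The asm addresses the key material at fixed offsets from the context. */
    _Static_assert(offsetof(struct gcm_context_sketch, hash_keys) == 16 * 6,
                   "HashKey must sit at 16*6 bytes into the context");
    _Static_assert(sizeof(struct gcm_context_sketch) == 16 * 22,
                   "HashKey_8_k (16*21) must be the last 16-byte slot");

With such a layout the hash-key area is 16*16 bytes long, which is what allows the AVX code to keep HashKey^1..^8 and their Karatsuba helper values in the context instead of in the expanded-key buffer, as the HashKey offset constants in the diff below show.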
Loads and stores to the new struct are always done unlaligned to avoid compiler issues, see e5b954e8 "Use unaligned loads from gcm_context_data" Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 378 +++++++++++++++---------------- arch/x86/crypto/aesni-intel_glue.c | 58 +++-- 2 files changed, 215 insertions(+), 221 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 318135a77975..284f1b8b88fc 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -182,43 +182,22 @@ aad_shift_arr: .text -##define the fields of the gcm aes context -#{ -# u8 expanded_keys[16*11] store expanded keys -# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here -# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here -# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here -# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here -# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here -# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here -# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here -# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here -# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) -#} gcm_ctx# - -HashKey = 16*11 # store HashKey <<1 mod poly here -HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here -HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here -HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here -HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here -HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here -HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here -HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here -HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) -HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) -HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) -HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) -HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) -HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) -HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) -HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +HashKey = 16*6 # store HashKey <<1 mod poly here +HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here +HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here +HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here +HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here +HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here +HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here +HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly 
here +HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) +HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) #define arg1 %rdi #define arg2 %rsi @@ -229,6 +208,7 @@ HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu #define arg7 STACK_OFFSET+8*1(%r14) #define arg8 STACK_OFFSET+8*2(%r14) #define arg9 STACK_OFFSET+8*3(%r14) +#define arg10 STACK_OFFSET+8*4(%r14) i = 0 j = 0 @@ -300,9 +280,9 @@ VARIABLE_OFFSET = 16*8 and $~63, %rsp # align rsp to 64 bytes - vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + mov arg5, %r13 # save the number of bytes of plaintext/ciphertext and $-16, %r13 # r13 = r13 - (r13 mod 16) mov %r13, %r12 @@ -413,11 +393,11 @@ _eight_cipher_left\@: _zero_cipher_left\@: - cmp $16, arg4 + cmp $16, arg5 jl _only_less_than_16\@ - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + mov arg5, %r13 + and $15, %r13 # r13 = (arg5 mod 16) je _multiple_of_16_bytes\@ @@ -430,7 +410,7 @@ _zero_cipher_left\@: sub $16, %r11 add %r13, %r11 - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + vmovdqu (arg4, %r11), %xmm1 # receive the last <16 Byte block lea SHIFT_MASK+16(%rip), %r12 sub %r13, %r12 # adjust the shuffle mask pointer to be @@ -442,8 +422,8 @@ _zero_cipher_left\@: _only_less_than_16\@: # check for 0 length - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + mov arg5, %r13 + and $15, %r13 # r13 = (arg5 mod 16) je _multiple_of_16_bytes\@ @@ -461,7 +441,7 @@ _only_less_than_16\@: # number of bytes in plaintext mod 16) _get_last_16_byte_loop\@: - movb (arg3, %r11), %al + movb (arg4, %r11), %al movb %al, TMP1 (%rsp , %r11) add $1, %r11 cmp %r13, %r11 @@ -506,14 +486,14 @@ _final_ghash_mul\@: cmp $8, %r13 jle _less_than_8_bytes_left\@ - mov %rax, (arg2 , %r11) + mov %rax, (arg3 , %r11) add $8, %r11 vpsrldq $8, %xmm9, %xmm9 vmovq %xmm9, %rax sub $8, %r13 _less_than_8_bytes_left\@: - movb %al, (arg2 , %r11) + movb %al, (arg3 , %r11) add $1, %r11 shr $8, %rax sub $1, %r13 @@ -521,12 +501,12 @@ _less_than_8_bytes_left\@: ############################# _multiple_of_16_bytes\@: - mov arg7, %r12 # r12 = aadLen (number of bytes) + mov arg8, %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 - shl $3, arg4 # len(C) in bits (*128) - vmovq arg4, %xmm1 + shl $3, arg5 # len(C) in bits (*128) + vmovq arg5, %xmm1 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) @@ -534,7 +514,7 @@ _multiple_of_16_bytes\@: \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - mov arg5, %rax # rax = *Y0 + mov arg6, %rax # rax = *Y0 vmovdqu (%rax), %xmm9 # xmm9 = Y0 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) @@ -544,8 +524,8 @@ 
_multiple_of_16_bytes\@: _return_T\@: - mov arg8, %r10 # r10 = authTag - mov arg9, %r11 # r11 = auth_tag_len + mov arg9, %r10 # r10 = authTag + mov arg10, %r11 # r11 = auth_tag_len cmp $16, %r11 je _T_16\@ @@ -655,49 +635,49 @@ _return_T_done\@: vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_k(arg1) + vmovdqu \T1, HashKey_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_2_k(arg1) + vmovdqu \T1, HashKey_2_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqa \T5, HashKey_3(arg1) + vmovdqu \T5, HashKey_3(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_3_k(arg1) + vmovdqu \T1, HashKey_3_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqa \T5, HashKey_4(arg1) + vmovdqu \T5, HashKey_4(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_4_k(arg1) + vmovdqu \T1, HashKey_4_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqa \T5, HashKey_5(arg1) + vmovdqu \T5, HashKey_5(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_5_k(arg1) + vmovdqu \T1, HashKey_5_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqa \T5, HashKey_6(arg1) + vmovdqu \T5, HashKey_6(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_6_k(arg1) + vmovdqu \T1, HashKey_6_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqa \T5, HashKey_7(arg1) + vmovdqu \T5, HashKey_7(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_7_k(arg1) + vmovdqu \T1, HashKey_7_k(arg2) GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqa \T5, HashKey_8(arg1) + vmovdqu \T5, HashKey_8(arg2) vpshufd $0b01001110, \T5, \T1 vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_8_k(arg1) + vmovdqu \T1, HashKey_8_k(arg2) .endm @@ -706,15 +686,15 @@ _return_T_done\@: ## num_initial_blocks = b mod 4# ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, r14 are used as a pointer only, not modified +## arg1, arg3, arg4, r14 are used as a pointer only, not modified .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen + mov arg7, %r10 # r10 = AAD + mov arg8, %r12 # r12 = aadLen mov %r12, %r11 @@ -780,7 +760,7 @@ _get_AAD_done\@: xor %r11d, %r11d # start AES for num_initial_blocks blocks - mov arg5, %rax # rax = *Y0 + mov arg6, %rax # rax = *Y0 vmovdqu (%rax), \CTR # CTR = Y0 vpshufb SHUF_MASK(%rip), \CTR, \CTR @@ -833,9 +813,9 @@ _get_AAD_done\@: i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks + vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks add $16, %r11 .if \ENC_DEC == DEC vmovdqa \T1, reg_i @@ -936,58 +916,58 @@ _get_AAD_done\@: vaesenclast \T_key, \XMM7, \XMM7 
vaesenclast \T_key, \XMM8, \XMM8 - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg2 , %r11) + vmovdqu \XMM1, (arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM1 .endif - vmovdqu 16*1(arg3, %r11), \T1 + vmovdqu 16*1(arg4, %r11), \T1 vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg2 , %r11) + vmovdqu \XMM2, 16*1(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM2 .endif - vmovdqu 16*2(arg3, %r11), \T1 + vmovdqu 16*2(arg4, %r11), \T1 vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg2 , %r11) + vmovdqu \XMM3, 16*2(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM3 .endif - vmovdqu 16*3(arg3, %r11), \T1 + vmovdqu 16*3(arg4, %r11), \T1 vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg2 , %r11) + vmovdqu \XMM4, 16*3(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM4 .endif - vmovdqu 16*4(arg3, %r11), \T1 + vmovdqu 16*4(arg4, %r11), \T1 vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg2 , %r11) + vmovdqu \XMM5, 16*4(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM5 .endif - vmovdqu 16*5(arg3, %r11), \T1 + vmovdqu 16*5(arg4, %r11), \T1 vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg2 , %r11) + vmovdqu \XMM6, 16*5(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM6 .endif - vmovdqu 16*6(arg3, %r11), \T1 + vmovdqu 16*6(arg4, %r11), \T1 vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg2 , %r11) + vmovdqu \XMM7, 16*6(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM7 .endif - vmovdqu 16*7(arg3, %r11), \T1 + vmovdqu 16*7(arg4, %r11), \T1 vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg2 , %r11) + vmovdqu \XMM8, 16*7(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM8 .endif @@ -1012,7 +992,7 @@ _initial_blocks_done\@: # encrypt 8 blocks at a time # ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3 are used as pointers only, not modified +# arg1, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC @@ -1098,14 +1078,14 @@ _initial_blocks_done\@: ####################################################################### - vmovdqa HashKey_8(arg1), \T5 + vmovdqu HashKey_8(arg2), \T5 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 vpshufd $0b01001110, \T2, \T6 vpxor \T2, \T6, \T6 - vmovdqa HashKey_8_k(arg1), \T5 + vmovdqu HashKey_8_k(arg2), \T5 vpclmulqdq $0x00, \T5, \T6, \T6 vmovdqu 16*3(arg1), \T1 @@ -1119,7 +1099,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP2(%rsp), \T1 - vmovdqa HashKey_7(arg1), \T5 + vmovdqu HashKey_7(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1127,7 +1107,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_7_k(arg1), \T5 + vmovdqu HashKey_7_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1144,7 +1124,7 @@ _initial_blocks_done\@: ####################################################################### vmovdqa TMP3(%rsp), \T1 - vmovdqa HashKey_6(arg1), \T5 + vmovdqu HashKey_6(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1152,7 +1132,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_6_k(arg1), \T5 + vmovdqu HashKey_6_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1167,7 +1147,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP4(%rsp), \T1 - vmovdqa 
HashKey_5(arg1), \T5 + vmovdqu HashKey_5(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1175,7 +1155,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_5_k(arg1), \T5 + vmovdqu HashKey_5_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1191,7 +1171,7 @@ _initial_blocks_done\@: vmovdqa TMP5(%rsp), \T1 - vmovdqa HashKey_4(arg1), \T5 + vmovdqu HashKey_4(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1199,7 +1179,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_4_k(arg1), \T5 + vmovdqu HashKey_4_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1214,7 +1194,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP6(%rsp), \T1 - vmovdqa HashKey_3(arg1), \T5 + vmovdqu HashKey_3(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1222,7 +1202,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_3_k(arg1), \T5 + vmovdqu HashKey_3_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1238,7 +1218,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP7(%rsp), \T1 - vmovdqa HashKey_2(arg1), \T5 + vmovdqu HashKey_2(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1246,7 +1226,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_2_k(arg1), \T5 + vmovdqu HashKey_2_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1263,7 +1243,7 @@ _initial_blocks_done\@: vaesenc \T5, \XMM8, \XMM8 vmovdqa TMP8(%rsp), \T1 - vmovdqa HashKey(arg1), \T5 + vmovdqu HashKey(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 @@ -1271,7 +1251,7 @@ _initial_blocks_done\@: vpshufd $0b01001110, \T1, \T3 vpxor \T1, \T3, \T3 - vmovdqa HashKey_k(arg1), \T5 + vmovdqu HashKey_k(arg2), \T5 vpclmulqdq $0x10, \T5, \T3, \T3 vpxor \T3, \T6, \T6 @@ -1284,13 +1264,13 @@ _initial_blocks_done\@: j = 1 setreg .rep 8 - vpxor 16*i(arg3, %r11), \T5, \T2 + vpxor 16*i(arg4, %r11), \T5, \T2 .if \ENC_DEC == ENC vaesenclast \T2, reg_j, reg_j .else vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg3, %r11), reg_j - vmovdqu \T3, 16*i(arg2, %r11) + vmovdqu 16*i(arg4, %r11), reg_j + vmovdqu \T3, 16*i(arg3, %r11) .endif i = (i+1) j = (j+1) @@ -1322,14 +1302,14 @@ _initial_blocks_done\@: vpxor \T2, \T7, \T7 # first phase of the reduction complete ####################################################################### .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg3,%r11) # Write 
to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer .endif ####################################################################### @@ -1370,25 +1350,25 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM1, \T2 vpxor \XMM1, \T2, \T2 - vmovdqa HashKey_8(arg1), \T5 + vmovdqu HashKey_8(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM1, \T6 vpclmulqdq $0x00, \T5, \XMM1, \T7 - vmovdqa HashKey_8_k(arg1), \T3 + vmovdqu HashKey_8_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \XMM1 ###################### vpshufd $0b01001110, \XMM2, \T2 vpxor \XMM2, \T2, \T2 - vmovdqa HashKey_7(arg1), \T5 + vmovdqu HashKey_7(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM2, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM2, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_7_k(arg1), \T3 + vmovdqu HashKey_7_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1396,14 +1376,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM3, \T2 vpxor \XMM3, \T2, \T2 - vmovdqa HashKey_6(arg1), \T5 + vmovdqu HashKey_6(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM3, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM3, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_6_k(arg1), \T3 + vmovdqu HashKey_6_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1411,14 +1391,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM4, \T2 vpxor \XMM4, \T2, \T2 - vmovdqa HashKey_5(arg1), \T5 + vmovdqu HashKey_5(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM4, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM4, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_5_k(arg1), \T3 + vmovdqu HashKey_5_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1426,14 +1406,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM5, \T2 vpxor \XMM5, \T2, \T2 - vmovdqa HashKey_4(arg1), \T5 + vmovdqu HashKey_4(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM5, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM5, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_4_k(arg1), \T3 + vmovdqu HashKey_4_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1441,14 +1421,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM6, \T2 vpxor \XMM6, \T2, \T2 - vmovdqa HashKey_3(arg1), \T5 + vmovdqu HashKey_3(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM6, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM6, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_3_k(arg1), \T3 + vmovdqu HashKey_3_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1456,14 +1436,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM7, \T2 vpxor \XMM7, \T2, \T2 - vmovdqa HashKey_2(arg1), \T5 + vmovdqu HashKey_2(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM7, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM7, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_2_k(arg1), \T3 + vmovdqu HashKey_2_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1471,14 +1451,14 @@ _initial_blocks_done\@: vpshufd $0b01001110, \XMM8, \T2 vpxor \XMM8, \T2, \T2 - vmovdqa HashKey(arg1), \T5 + vmovdqu HashKey(arg2), \T5 vpclmulqdq $0x11, \T5, \XMM8, \T4 vpxor \T4, \T6, \T6 vpclmulqdq $0x00, \T5, \XMM8, \T4 vpxor \T4, \T7, \T7 - vmovdqa HashKey_k(arg1), \T3 + vmovdqu HashKey_k(arg2), \T3 vpclmulqdq $0x00, \T3, \T2, \T2 vpxor \T2, \XMM1, \XMM1 @@ -1527,6 +1507,7 @@ _initial_blocks_done\@: ############################################################# #void aesni_gcm_precomp_avx_gen2 # (gcm_data *my_ctx_data, +# gcm_context_data *data, # 
u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen2) @@ -1543,7 +1524,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2) sub $VARIABLE_OFFSET, %rsp and $~63, %rsp # align rsp to 64 bytes - vmovdqu (arg2), %xmm6 # xmm6 = HashKey + vmovdqu (arg3), %xmm6 # xmm6 = HashKey vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey @@ -1560,7 +1541,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2) vpand POLY(%rip), %xmm2, %xmm2 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly ####################################################################### - vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 @@ -1577,6 +1558,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) ############################################################################### #void aesni_gcm_enc_avx_gen2( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # const u8 *in, /* Plaintext input */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */ @@ -1598,6 +1580,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2) ############################################################################### #void aesni_gcm_dec_avx_gen2( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */ @@ -1668,25 +1651,25 @@ ENDPROC(aesni_gcm_dec_avx_gen2) # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i vmovdqa \HK, \T5 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqa \T5, HashKey_3(arg1) + vmovdqu \T5, HashKey_3(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqa \T5, HashKey_4(arg1) + vmovdqu \T5, HashKey_4(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqa \T5, HashKey_5(arg1) + vmovdqu \T5, HashKey_5(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqa \T5, HashKey_6(arg1) + vmovdqu \T5, HashKey_6(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqa \T5, HashKey_7(arg1) + vmovdqu \T5, HashKey_7(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqa \T5, HashKey_8(arg1) + vmovdqu \T5, HashKey_8(arg2) .endm @@ -1696,15 +1679,15 @@ ENDPROC(aesni_gcm_dec_avx_gen2) ## num_initial_blocks = b mod 4# ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, r14 are used as a pointer only, not modified +## arg1, arg3, arg4, r14 are used as a pointer only, not modified .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen + mov arg7, %r10 # r10 = AAD + mov arg8, %r12 # 
r12 = aadLen mov %r12, %r11 @@ -1771,7 +1754,7 @@ _get_AAD_done\@: xor %r11d, %r11d # start AES for num_initial_blocks blocks - mov arg5, %rax # rax = *Y0 + mov arg6, %rax # rax = *Y0 vmovdqu (%rax), \CTR # CTR = Y0 vpshufb SHUF_MASK(%rip), \CTR, \CTR @@ -1824,9 +1807,9 @@ _get_AAD_done\@: i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for + vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for # num_initial_blocks blocks add $16, %r11 .if \ENC_DEC == DEC @@ -1928,58 +1911,58 @@ _get_AAD_done\@: vaesenclast \T_key, \XMM7, \XMM7 vaesenclast \T_key, \XMM8, \XMM8 - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg2 , %r11) + vmovdqu \XMM1, (arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM1 .endif - vmovdqu 16*1(arg3, %r11), \T1 + vmovdqu 16*1(arg4, %r11), \T1 vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg2 , %r11) + vmovdqu \XMM2, 16*1(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM2 .endif - vmovdqu 16*2(arg3, %r11), \T1 + vmovdqu 16*2(arg4, %r11), \T1 vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg2 , %r11) + vmovdqu \XMM3, 16*2(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM3 .endif - vmovdqu 16*3(arg3, %r11), \T1 + vmovdqu 16*3(arg4, %r11), \T1 vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg2 , %r11) + vmovdqu \XMM4, 16*3(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM4 .endif - vmovdqu 16*4(arg3, %r11), \T1 + vmovdqu 16*4(arg4, %r11), \T1 vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg2 , %r11) + vmovdqu \XMM5, 16*4(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM5 .endif - vmovdqu 16*5(arg3, %r11), \T1 + vmovdqu 16*5(arg4, %r11), \T1 vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg2 , %r11) + vmovdqu \XMM6, 16*5(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM6 .endif - vmovdqu 16*6(arg3, %r11), \T1 + vmovdqu 16*6(arg4, %r11), \T1 vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg2 , %r11) + vmovdqu \XMM7, 16*6(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM7 .endif - vmovdqu 16*7(arg3, %r11), \T1 + vmovdqu 16*7(arg4, %r11), \T1 vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg2 , %r11) + vmovdqu \XMM8, 16*7(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM8 .endif @@ -2008,7 +1991,7 @@ _initial_blocks_done\@: # encrypt 8 blocks at a time # ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3 are used as pointers only, not modified +# arg1, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC @@ -2094,7 +2077,7 @@ _initial_blocks_done\@: ####################################################################### - vmovdqa HashKey_8(arg1), \T5 + vmovdqu HashKey_8(arg2), \T5 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 @@ -2112,7 +2095,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP2(%rsp), \T1 - vmovdqa HashKey_7(arg1), \T5 + vmovdqu HashKey_7(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2138,7 +2121,7 @@ _initial_blocks_done\@: ####################################################################### vmovdqa TMP3(%rsp), \T1 - vmovdqa HashKey_6(arg1), \T5 + vmovdqu HashKey_6(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2162,7 +2145,7 @@ 
_initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP4(%rsp), \T1 - vmovdqa HashKey_5(arg1), \T5 + vmovdqu HashKey_5(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2187,7 +2170,7 @@ _initial_blocks_done\@: vmovdqa TMP5(%rsp), \T1 - vmovdqa HashKey_4(arg1), \T5 + vmovdqu HashKey_4(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2211,7 +2194,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP6(%rsp), \T1 - vmovdqa HashKey_3(arg1), \T5 + vmovdqu HashKey_3(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2235,7 +2218,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 vmovdqa TMP7(%rsp), \T1 - vmovdqa HashKey_2(arg1), \T5 + vmovdqu HashKey_2(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2262,7 +2245,7 @@ _initial_blocks_done\@: vaesenc \T5, \XMM8, \XMM8 vmovdqa TMP8(%rsp), \T1 - vmovdqa HashKey(arg1), \T5 + vmovdqu HashKey(arg2), \T5 vpclmulqdq $0x00, \T5, \T1, \T3 vpxor \T3, \T7, \T7 @@ -2283,13 +2266,13 @@ _initial_blocks_done\@: j = 1 setreg .rep 8 - vpxor 16*i(arg3, %r11), \T5, \T2 + vpxor 16*i(arg4, %r11), \T5, \T2 .if \ENC_DEC == ENC vaesenclast \T2, reg_j, reg_j .else vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg3, %r11), reg_j - vmovdqu \T3, 16*i(arg2, %r11) + vmovdqu 16*i(arg4, %r11), reg_j + vmovdqu \T3, 16*i(arg3, %r11) .endif i = (i+1) j = (j+1) @@ -2315,14 +2298,14 @@ _initial_blocks_done\@: vpxor \T2, \T7, \T7 # first phase of the reduction complete ####################################################################### .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer .endif ####################################################################### @@ -2359,7 +2342,7 @@ _initial_blocks_done\@: ## Karatsuba Method - vmovdqa HashKey_8(arg1), \T5 + vmovdqu HashKey_8(arg2), \T5 vpshufd $0b01001110, \XMM1, \T2 vpshufd $0b01001110, \T5, \T3 @@ -2373,7 +2356,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_7(arg1), \T5 + vmovdqu HashKey_7(arg2), \T5 vpshufd $0b01001110, \XMM2, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM2, \T2, \T2 @@ -2391,7 +2374,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_6(arg1), \T5 + vmovdqu HashKey_6(arg2), \T5 vpshufd $0b01001110, \XMM3, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM3, \T2, \T2 @@ -2409,7 +2392,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_5(arg1), \T5 + vmovdqu HashKey_5(arg2), \T5 vpshufd $0b01001110, \XMM4, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM4, \T2, \T2 @@ 
-2427,7 +2410,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_4(arg1), \T5 + vmovdqu HashKey_4(arg2), \T5 vpshufd $0b01001110, \XMM5, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM5, \T2, \T2 @@ -2445,7 +2428,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_3(arg1), \T5 + vmovdqu HashKey_3(arg2), \T5 vpshufd $0b01001110, \XMM6, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM6, \T2, \T2 @@ -2463,7 +2446,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey_2(arg1), \T5 + vmovdqu HashKey_2(arg2), \T5 vpshufd $0b01001110, \XMM7, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM7, \T2, \T2 @@ -2481,7 +2464,7 @@ _initial_blocks_done\@: ###################### - vmovdqa HashKey(arg1), \T5 + vmovdqu HashKey(arg2), \T5 vpshufd $0b01001110, \XMM8, \T2 vpshufd $0b01001110, \T5, \T3 vpxor \XMM8, \T2, \T2 @@ -2537,6 +2520,7 @@ _initial_blocks_done\@: ############################################################# #void aesni_gcm_precomp_avx_gen4 # (gcm_data *my_ctx_data, +# gcm_context_data *data, # u8 *hash_subkey)# /* H, the Hash sub key input. # Data starts on a 16-byte boundary. */ ############################################################# @@ -2554,7 +2538,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) sub $VARIABLE_OFFSET, %rsp and $~63, %rsp # align rsp to 64 bytes - vmovdqu (arg2), %xmm6 # xmm6 = HashKey + vmovdqu (arg3), %xmm6 # xmm6 = HashKey vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey @@ -2571,7 +2555,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) vpand POLY(%rip), %xmm2, %xmm2 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly ####################################################################### - vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 @@ -2589,6 +2573,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) ############################################################################### #void aesni_gcm_enc_avx_gen4( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # const u8 *in, /* Plaintext input */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */ @@ -2610,6 +2595,7 @@ ENDPROC(aesni_gcm_enc_avx_gen4) ############################################################################### #void aesni_gcm_dec_avx_gen4( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 661f7daf43da..d8b8cb33f608 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -84,7 +84,7 @@ struct gcm_context_data { u8 current_counter[GCM_BLOCK_LEN]; u64 partial_block_len; u64 unused; - u8 hash_keys[GCM_BLOCK_LEN * 8]; + u8 hash_keys[GCM_BLOCK_LEN * 16]; }; asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, @@ -187,14 +187,18 @@ asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
*/ -asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey); +asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *hash_subkey); -asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, +asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long plaintext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out, +asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long ciphertext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); @@ -211,9 +215,9 @@ static void aesni_gcm_enc_avx(void *ctx, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } } @@ -229,9 +233,9 @@ static void aesni_gcm_dec_avx(void *ctx, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } } #endif @@ -242,14 +246,18 @@ static void aesni_gcm_dec_avx(void *ctx, * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
*/ -asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey); +asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *hash_subkey); -asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out, +asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long plaintext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out, +asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long ciphertext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); @@ -266,13 +274,13 @@ static void aesni_gcm_enc_avx2(void *ctx, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (plaintext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); - aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey); + aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } } @@ -288,13 +296,13 @@ static void aesni_gcm_dec_avx2(void *ctx, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); - aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); + aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey); + aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv, + aad, aad_len, auth_tag, auth_tag_len); } } #endif -- cgit v1.2.3 From 2426f64bc51fc86951b735d2247d6eb89259d580 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:57:12 +0000 Subject: crypto: aesni - Macro-ify func save/restore Macro-ify function save and restore. These will be used in new functions added for scatter/gather update operations. 
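With the prologue and epilogue factored out, each entry point reduces to the shape below (a sketch based on the aesni_gcm_enc_avx_gen2 hunk in this patch; the dec and precomp entry points follow the same pattern, and the comments here only summarize what FUNC_SAVE/FUNC_RESTORE do):

ENTRY(aesni_gcm_enc_avx_gen2)
        FUNC_SAVE       # push %r12-%r15, save %rsp in %r14, align %rsp to 64 bytes
        GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC
        FUNC_RESTORE    # restore %rsp from %r14, then pop %r15-%r12
        ret
ENDPROC(aesni_gcm_enc_avx_gen2)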
Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 94 ++++++++++++-------------------- 1 file changed, 36 insertions(+), 58 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 284f1b8b88fc..dd895f69399b 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -247,6 +247,30 @@ VARIABLE_OFFSET = 16*8 # Utility Macros ################################ +.macro FUNC_SAVE + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes +.endm + +.macro FUNC_RESTORE + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm + # Encryption of a single block .macro ENCRYPT_SINGLE_BLOCK XMM0 vpxor (arg1), \XMM0, \XMM0 @@ -264,22 +288,6 @@ VARIABLE_OFFSET = 16*8 # clobbering all xmm registers # clobbering r10, r11, r12, r13, r14, r15 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC - - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - - mov %rsp, %r14 - - - - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes - - vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey mov arg5, %r13 # save the number of bytes of plaintext/ciphertext @@ -566,12 +574,6 @@ _T_16\@: vmovdqu %xmm9, (%r10) _return_T_done\@: - mov %r14, %rsp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 .endm #ifdef CONFIG_AS_AVX @@ -1511,18 +1513,7 @@ _initial_blocks_done\@: # u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen2) - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - - mov %rsp, %r14 - - - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + FUNC_SAVE vmovdqu (arg3), %xmm6 # xmm6 = HashKey @@ -1546,12 +1537,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2) PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - mov %r14, %rsp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 + FUNC_RESTORE ret ENDPROC(aesni_gcm_precomp_avx_gen2) @@ -1573,7 +1559,9 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_enc_avx_gen2) + FUNC_SAVE GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC + FUNC_RESTORE ret ENDPROC(aesni_gcm_enc_avx_gen2) @@ -1595,7 +1583,9 @@ ENDPROC(aesni_gcm_enc_avx_gen2) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_dec_avx_gen2) + FUNC_SAVE GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC + FUNC_RESTORE ret ENDPROC(aesni_gcm_dec_avx_gen2) #endif /* CONFIG_AS_AVX */ @@ -2525,18 +2515,7 @@ _initial_blocks_done\@: # Data starts on a 16-byte boundary. 
*/ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen4) - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - - mov %rsp, %r14 - - - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + FUNC_SAVE vmovdqu (arg3), %xmm6 # xmm6 = HashKey @@ -2560,12 +2539,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - mov %r14, %rsp - - pop %r15 - pop %r14 - pop %r13 - pop %r12 + FUNC_RESTORE ret ENDPROC(aesni_gcm_precomp_avx_gen4) @@ -2588,7 +2562,9 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_enc_avx_gen4) + FUNC_SAVE GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC + FUNC_RESTORE ret ENDPROC(aesni_gcm_enc_avx_gen4) @@ -2610,7 +2586,9 @@ ENDPROC(aesni_gcm_enc_avx_gen4) # Valid values are 16 (most likely), 12 or 8. */ ############################################################################### ENTRY(aesni_gcm_dec_avx_gen4) + FUNC_SAVE GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC + FUNC_RESTORE ret ENDPROC(aesni_gcm_dec_avx_gen4) -- cgit v1.2.3 From 5350b0f563433dacc134214a452fd316b36251d6 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:57:36 +0000 Subject: crypto: aesni - support 256 byte keys in avx asm Add support for 192/256-bit keys using the avx gcm/aes routines. The sse routines were previously updated in e31ac32d3b (Add support for 192 & 256 bit keys to AESNI RFC4106). Instead of adding an additional loop in the hot path as in e31ac32d3b, this diff generates separate versions of the code using macros, and the entry routines choose the appropriate version once. This results in a 5% performance improvement vs. adding a loop to the hot path. This is the same strategy chosen by the Intel isa-l_crypto library. The key size checks are removed from the C code where appropriate. Note that this diff depends on using gcm_context_data - 256-bit keys require 16 HashKeys + 15 expanded keys, which is larger than struct crypto_aes_ctx, so they are stored in struct gcm_context_data.
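Each AVX entry point now reads the stored key length and runs a GCM_ENC_DEC variant expanded with the matching AES round count. REP counts only the middle vaesenc rounds (the initial whitening XOR and the final vaesenclast are emitted separately), so REP = 9/11/13 corresponds to AES-128/192/256 with 10/12/14 rounds. A condensed sketch of the dispatch, mirroring the enc_avx_gen2 hunk below (comments added here for illustration only):

ENTRY(aesni_gcm_enc_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax   # key length in bytes, at offset 2*15*16 into the key context
        cmp     $32, %eax
        je      key_256_enc     # 32-byte key -> REP = 13
        cmp     $16, %eax
        je      key_128_enc     # 16-byte key -> REP = 9
        # must be 192, i.e. REP = 11
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
        FUNC_RESTORE
        ret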
Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 188 +++++++++++++++++++++++-------- arch/x86/crypto/aesni-intel_glue.c | 18 +-- 2 files changed, 145 insertions(+), 61 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index dd895f69399b..2aa11c503bb9 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -209,6 +209,7 @@ HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu #define arg8 STACK_OFFSET+8*2(%r14) #define arg9 STACK_OFFSET+8*3(%r14) #define arg10 STACK_OFFSET+8*4(%r14) +#define keysize 2*15*16(arg1) i = 0 j = 0 @@ -272,22 +273,22 @@ VARIABLE_OFFSET = 16*8 .endm # Encryption of a single block -.macro ENCRYPT_SINGLE_BLOCK XMM0 +.macro ENCRYPT_SINGLE_BLOCK REP XMM0 vpxor (arg1), \XMM0, \XMM0 - i = 1 - setreg -.rep 9 + i = 1 + setreg +.rep \REP vaesenc 16*i(arg1), \XMM0, \XMM0 - i = (i+1) - setreg + i = (i+1) + setreg .endr - vaesenclast 16*10(arg1), \XMM0, \XMM0 + vaesenclast 16*i(arg1), \XMM0, \XMM0 .endm # combined for GCM encrypt and decrypt functions # clobbering all xmm registers # clobbering r10, r11, r12, r13, r14, r15 -.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC +.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey mov arg5, %r13 # save the number of bytes of plaintext/ciphertext @@ -314,42 +315,42 @@ VARIABLE_OFFSET = 16*8 jmp _initial_num_blocks_is_1\@ _initial_num_blocks_is_7\@: - \INITIAL_BLOCKS 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*7, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_6\@: - \INITIAL_BLOCKS 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*6, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_5\@: - \INITIAL_BLOCKS 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*5, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_4\@: - \INITIAL_BLOCKS 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*4, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_3\@: - \INITIAL_BLOCKS 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*3, %r13 jmp _initial_blocks_encrypted\@ 
_initial_num_blocks_is_2\@: - \INITIAL_BLOCKS 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*2, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_1\@: - \INITIAL_BLOCKS 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC sub $16*1, %r13 jmp _initial_blocks_encrypted\@ _initial_num_blocks_is_0\@: - \INITIAL_BLOCKS 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC _initial_blocks_encrypted\@: @@ -374,7 +375,7 @@ _encrypt_by_8_new\@: add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC add $128, %r11 sub $128, %r13 jne _encrypt_by_8_new\@ @@ -385,7 +386,7 @@ _encrypt_by_8_new\@: _encrypt_by_8\@: vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 add $128, %r11 sub $128, %r13 @@ -414,7 +415,7 @@ _zero_cipher_left\@: vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) sub $16, %r11 add %r13, %r11 @@ -440,7 +441,7 @@ _only_less_than_16\@: vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) lea SHIFT_MASK+16(%rip), %r12 @@ -525,7 +526,7 @@ _multiple_of_16_bytes\@: mov arg6, %rax # rax = *Y0 vmovdqu (%rax), %xmm9 # xmm9 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) vpxor %xmm14, %xmm9, %xmm9 @@ -690,7 +691,7 @@ _return_T_done\@: ## r10, r11, r12, rax are clobbered ## arg1, arg3, arg4, r14 are used as a pointer only, not modified -.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC +.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) j = 0 setreg @@ -786,10 +787,10 @@ _get_AAD_done\@: setreg .endr - j = 1 - setreg -.rep 9 - vmovdqa 16*j(arg1), \T_key + j = 1 + setreg +.rep \REP + vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks @@ -798,12 +799,11 @@ _get_AAD_done\@: setreg .endr - j = (j+1) - setreg + j = (j+1) + setreg .endr - - 
vmovdqa 16*10(arg1), \T_key + vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks @@ -891,9 +891,9 @@ _get_AAD_done\@: vpxor \T_key, \XMM7, \XMM7 vpxor \T_key, \XMM8, \XMM8 - i = 1 - setreg -.rep 9 # do 9 rounds + i = 1 + setreg +.rep \REP # do REP rounds vmovdqa 16*i(arg1), \T_key vaesenc \T_key, \XMM1, \XMM1 vaesenc \T_key, \XMM2, \XMM2 @@ -903,11 +903,10 @@ _get_AAD_done\@: vaesenc \T_key, \XMM6, \XMM6 vaesenc \T_key, \XMM7, \XMM7 vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg + i = (i+1) + setreg .endr - vmovdqa 16*i(arg1), \T_key vaesenclast \T_key, \XMM1, \XMM1 vaesenclast \T_key, \XMM2, \XMM2 @@ -996,7 +995,7 @@ _initial_blocks_done\@: # ghash the 8 previously encrypted ciphertext blocks # arg1, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC vmovdqa \XMM1, \T2 vmovdqa \XMM2, TMP2(%rsp) @@ -1262,6 +1261,24 @@ _initial_blocks_done\@: vmovdqu 16*10(arg1), \T5 + i = 11 + setreg +.rep (\REP-9) + + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqu 16*i(arg1), \T5 + i = i + 1 + setreg +.endr + i = 0 j = 1 setreg @@ -1560,9 +1577,23 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) ############################################################################### ENTRY(aesni_gcm_enc_avx_gen2) FUNC_SAVE - GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC + mov keysize, %eax + cmp $32, %eax + je key_256_enc + cmp $16, %eax + je key_128_enc + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 FUNC_RESTORE - ret + ret +key_128_enc: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 + FUNC_RESTORE + ret +key_256_enc: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 + FUNC_RESTORE + ret ENDPROC(aesni_gcm_enc_avx_gen2) ############################################################################### @@ -1584,9 +1615,23 @@ ENDPROC(aesni_gcm_enc_avx_gen2) ############################################################################### ENTRY(aesni_gcm_dec_avx_gen2) FUNC_SAVE - GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC + mov keysize,%eax + cmp $32, %eax + je key_256_dec + cmp $16, %eax + je key_128_dec + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 FUNC_RESTORE - ret + ret +key_128_dec: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 + FUNC_RESTORE + ret +key_256_dec: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 + FUNC_RESTORE + ret ENDPROC(aesni_gcm_dec_avx_gen2) #endif /* CONFIG_AS_AVX */ @@ -1671,7 +1716,7 @@ ENDPROC(aesni_gcm_dec_avx_gen2) ## r10, r11, r12, rax are clobbered ## arg1, arg3, arg4, r14 are used as a pointer only, not modified -.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 
T_key ENC_DEC VER +.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) j = 0 setreg @@ -1770,7 +1815,7 @@ _get_AAD_done\@: j = 1 setreg -.rep 9 +.rep \REP vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg @@ -1785,7 +1830,7 @@ _get_AAD_done\@: .endr - vmovdqa 16*10(arg1), \T_key + vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks @@ -1876,7 +1921,7 @@ _get_AAD_done\@: i = 1 setreg -.rep 9 # do 9 rounds +.rep \REP # do REP rounds vmovdqa 16*i(arg1), \T_key vaesenc \T_key, \XMM1, \XMM1 vaesenc \T_key, \XMM2, \XMM2 @@ -1983,7 +2028,7 @@ _initial_blocks_done\@: # ghash the 8 previously encrypted ciphertext blocks # arg1, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC vmovdqa \XMM1, \T2 vmovdqa \XMM2, TMP2(%rsp) @@ -2252,6 +2297,23 @@ _initial_blocks_done\@: vmovdqu 16*10(arg1), \T5 + i = 11 + setreg +.rep (\REP-9) + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqu 16*i(arg1), \T5 + i = i + 1 + setreg +.endr + i = 0 j = 1 setreg @@ -2563,7 +2625,21 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) ############################################################################### ENTRY(aesni_gcm_enc_avx_gen4) FUNC_SAVE - GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC + mov keysize,%eax + cmp $32, %eax + je key_256_enc4 + cmp $16, %eax + je key_128_enc4 + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 + FUNC_RESTORE + ret +key_128_enc4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 + FUNC_RESTORE + ret +key_256_enc4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 FUNC_RESTORE ret ENDPROC(aesni_gcm_enc_avx_gen4) @@ -2587,9 +2663,23 @@ ENDPROC(aesni_gcm_enc_avx_gen4) ############################################################################### ENTRY(aesni_gcm_dec_avx_gen4) FUNC_SAVE - GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC + mov keysize,%eax + cmp $32, %eax + je key_256_dec4 + cmp $16, %eax + je key_128_dec4 + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 FUNC_RESTORE - ret + ret +key_128_dec4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 + FUNC_RESTORE + ret +key_256_dec4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 + FUNC_RESTORE + ret ENDPROC(aesni_gcm_dec_avx_gen4) #endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index d8b8cb33f608..7d1259feb0f9 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -209,8 +209,7 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *hash_subkey, 
const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){ + if (plaintext_len < AVX_GEN2_OPTSIZE) { aesni_gcm_enc(ctx, data, out, in, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); @@ -227,8 +226,7 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { + if (ciphertext_len < AVX_GEN2_OPTSIZE) { aesni_gcm_dec(ctx, data, out, in, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); @@ -268,8 +266,7 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { + if (plaintext_len < AVX_GEN2_OPTSIZE) { aesni_gcm_enc(ctx, data, out, in, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); @@ -290,8 +287,7 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { + if (ciphertext_len < AVX_GEN2_OPTSIZE) { aesni_gcm_dec(ctx, data, out, in, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); @@ -928,8 +924,7 @@ static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, struct scatter_walk dst_sg_walk = {}; struct gcm_context_data data AESNI_ALIGN_ATTR; - if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 || - aesni_gcm_enc_tfm == aesni_gcm_enc || + if (aesni_gcm_enc_tfm == aesni_gcm_enc || req->cryptlen < AVX_GEN2_OPTSIZE) { return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx); @@ -1000,8 +995,7 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, struct gcm_context_data data AESNI_ALIGN_ATTR; int retval = 0; - if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 || - aesni_gcm_enc_tfm == aesni_gcm_enc || + if (aesni_gcm_enc_tfm == aesni_gcm_enc || req->cryptlen < AVX_GEN2_OPTSIZE) { return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx); -- cgit v1.2.3 From e377bedb09d6970ad27d7714b0a6365ee7e4d732 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:57:49 +0000 Subject: crypto: aesni - Add GCM_COMPLETE macro Merge encode and decode tag calculations in GCM_COMPLETE macro. Scatter/gather routines will call this once at the end of encryption or decryption. 
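With the tag calculation factored out, the tail of GCM_ENC_DEC becomes a single call that both the encrypt and decrypt paths share; a sketch of the new call site from the hunk below (the comment paraphrases the macro's header):

_multiple_of_16_bytes\@:
        GCM_COMPLETE \GHASH_MUL \REP    # hash in len(A)||len(C), encrypt Y0 and
                                        # emit the authentication tag
.endm

Per its header comment, GCM_COMPLETE clobbers rax, r10-r12 and xmm0, xmm1, xmm5-xmm15, so callers cannot keep live state in those registers across it.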
Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 2aa11c503bb9..8e9ae4b26118 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -510,6 +510,14 @@ _less_than_8_bytes_left\@: ############################# _multiple_of_16_bytes\@: + GCM_COMPLETE \GHASH_MUL \REP +.endm + + +# GCM_COMPLETE Finishes update of tag of last partial block +# Output: Authorization Tag (AUTH_TAG) +# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 +.macro GCM_COMPLETE GHASH_MUL REP mov arg8, %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 -- cgit v1.2.3 From 38003cd26c9f59da77d98927fb9af58732da207a Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:58:19 +0000 Subject: crypto: aesni - Split AAD hash calculation to separate macro AAD hash only needs to be calculated once for each scatter/gather operation. Move it to its own macro, and call it from GCM_INIT instead of INITIAL_BLOCKS. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 228 +++++++++++++------------------ arch/x86/crypto/aesni-intel_glue.c | 28 ++-- 2 files changed, 115 insertions(+), 141 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 8e9ae4b26118..305abece93ad 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -182,6 +182,14 @@ aad_shift_arr: .text +#define AadHash 16*0 +#define AadLen 16*1 +#define InLen (16*1)+8 +#define PBlockEncKey 16*2 +#define OrigIV 16*3 +#define CurCount 16*4 +#define PBlockLen 16*5 + HashKey = 16*6 # store HashKey <<1 mod poly here HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here @@ -585,6 +593,74 @@ _T_16\@: _return_T_done\@: .endm +.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 + + mov \AAD, %r10 # r10 = AAD + mov \AADLEN, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor \T8, \T8, \T8 + vpxor \T7, \T7, \T7 + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), \T7 + vpshufb SHUF_MASK(%rip), \T7, \T7 + vpxor \T7, \T8, \T8 + \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu \T8, \T7 + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor \T7, \T7, \T7 + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, \T7, \T7 + vpxor \T1, \T7, \T7 + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, \T7, \T7 + vpxor \T1, \T7, \T7 +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. 
since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + vmovdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, \T7, \T7 +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), \T7, \T7 + vpxor \T8, \T7, \T7 + \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: + vmovdqu \T7, AadHash(arg2) +.endm + #ifdef CONFIG_AS_AVX ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) @@ -701,72 +777,9 @@ _return_T_done\@: .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) - j = 0 setreg + vmovdqu AadHash(arg2), reg_i - mov arg7, %r10 # r10 = AAD - mov arg8, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_j, reg_j, reg_j - vpxor reg_i, reg_i, reg_i - cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: - vmovdqu (%r10), reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge _get_AAD_blocks\@ - vmovdqu reg_j, reg_i - cmp $0, %r11 - je _get_AAD_done\@ - - vpxor reg_i, reg_i, reg_i - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: - cmp $4, %r11 - jle _get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, reg_i, reg_i - vpxor \T1, reg_i, reg_i - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: - cmp $0, %r11 - jle _get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i -_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, reg_i, reg_i -_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_j, reg_i, reg_i - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 - -_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11d, %r11d @@ -1535,7 +1548,13 @@ _initial_blocks_done\@: #void aesni_gcm_precomp_avx_gen2 # (gcm_data *my_ctx_data, # gcm_context_data *data, -# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len) /* Length of AAD in bytes. 
With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen2) FUNC_SAVE @@ -1560,6 +1579,8 @@ ENTRY(aesni_gcm_precomp_avx_gen2) vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly + CALC_AAD_HASH GHASH_MUL_AVX, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 + PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 FUNC_RESTORE @@ -1716,7 +1737,6 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .endm - ## if a = number of total plaintext bytes ## b = floor(a/16) ## num_initial_blocks = b mod 4# @@ -1726,73 +1746,9 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) - j = 0 setreg + vmovdqu AadHash(arg2), reg_i - mov arg7, %r10 # r10 = AAD - mov arg8, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_j, reg_j, reg_j - vpxor reg_i, reg_i, reg_i - - cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: - vmovdqu (%r10), reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge _get_AAD_blocks\@ - vmovdqu reg_j, reg_i - cmp $0, %r11 - je _get_AAD_done\@ - - vpxor reg_i, reg_i, reg_i - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: - cmp $4, %r11 - jle _get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, reg_i, reg_i - vpxor \T1, reg_i, reg_i - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: - cmp $0, %r11 - jle _get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i -_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, reg_i, reg_i -_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_j, reg_i, reg_i - GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 - -_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11d, %r11d @@ -2581,8 +2537,13 @@ _initial_blocks_done\@: #void aesni_gcm_precomp_avx_gen4 # (gcm_data *my_ctx_data, # gcm_context_data *data, -# u8 *hash_subkey)# /* H, the Hash sub key input. -# Data starts on a 16-byte boundary. */ +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len) /* Length of AAD in bytes. 
With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# ENTRY(aesni_gcm_precomp_avx_gen4) FUNC_SAVE @@ -2606,6 +2567,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) ####################################################################### vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly + CALC_AAD_HASH GHASH_MUL_AVX2, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 7d1259feb0f9..2648842f1c3f 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -189,7 +189,10 @@ asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, */ asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, struct gcm_context_data *gdata, - u8 *hash_subkey); + u8 *hash_subkey, + u8 *iv, + const u8 *aad, + unsigned long aad_len); asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, struct gcm_context_data *gdata, u8 *out, @@ -214,7 +217,8 @@ static void aesni_gcm_enc_avx(void *ctx, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } @@ -231,7 +235,8 @@ static void aesni_gcm_dec_avx(void *ctx, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } @@ -246,7 +251,10 @@ static void aesni_gcm_dec_avx(void *ctx, */ asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, struct gcm_context_data *gdata, - u8 *hash_subkey); + u8 *hash_subkey, + u8 *iv, + const u8 *aad, + unsigned long aad_len); asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, struct gcm_context_data *gdata, u8 *out, @@ -271,11 +279,13 @@ static void aesni_gcm_enc_avx2(void *ctx, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (plaintext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } @@ -292,11 +302,13 @@ static void aesni_gcm_dec_avx2(void *ctx, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } else { - aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey); + aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv, + aad, aad_len); aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv, aad, aad_len, auth_tag, auth_tag_len); } -- cgit v1.2.3 From 1cb1bcbb567d10bdd9de9d03964c5e2958f0d111 Mon Sep 17 00:00:00 2001 
From: Dave Watson Date: Mon, 10 Dec 2018 19:58:38 +0000 Subject: crypto: aesni - Merge avx precompute functions The precompute functions differ only by the sub-macros they call, merge them to a single macro. Later diffs add more code to fill in the gcm_context_data structure, this allows changes in a single place. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 76 ++++++++++++-------------------- 1 file changed, 27 insertions(+), 49 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 305abece93ad..e347ba61db65 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -661,6 +661,31 @@ _get_AAD_done\@: vmovdqu \T7, AadHash(arg2) .endm +.macro INIT GHASH_MUL PRECOMPUTE + vmovdqu (arg3), %xmm6 # xmm6 = HashKey + + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 + ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + vmovdqa %xmm6, %xmm2 + vpsllq $1, %xmm6, %xmm6 + vpsrlq $63, %xmm2, %xmm2 + vmovdqa %xmm2, %xmm1 + vpslldq $8, %xmm2, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + #reduction + vpshufd $0b00100100, %xmm1, %xmm2 + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 + vpand POLY(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + ####################################################################### + vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly + + CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 + + \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 +.endm + #ifdef CONFIG_AS_AVX ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) @@ -1558,31 +1583,7 @@ _initial_blocks_done\@: ############################################################# ENTRY(aesni_gcm_precomp_avx_gen2) FUNC_SAVE - - vmovdqu (arg3), %xmm6 # xmm6 = HashKey - - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, %xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly - ####################################################################### - vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly - - - CALC_AAD_HASH GHASH_MUL_AVX, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 - - PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - + INIT GHASH_MUL_AVX, PRECOMPUTE_AVX FUNC_RESTORE ret ENDPROC(aesni_gcm_precomp_avx_gen2) @@ -2547,30 +2548,7 @@ _initial_blocks_done\@: ############################################################# ENTRY(aesni_gcm_precomp_avx_gen4) FUNC_SAVE - - vmovdqu (arg3), %xmm6 # xmm6 = HashKey - - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, %xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor 
%xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly - ####################################################################### - vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly - - CALC_AAD_HASH GHASH_MUL_AVX2, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 - - PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - + INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 FUNC_RESTORE ret ENDPROC(aesni_gcm_precomp_avx_gen4) -- cgit v1.2.3 From a44b419fe5aee3bfe12c88698a023cb5c6067985 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:58:56 +0000 Subject: crypto: aesni - Fill in new context data structures Fill in aadhash, aadlen, pblocklen, curcount with appropriate values. pblocklen, aadhash, and pblockenckey are also updated at the end of each scatter/gather operation, to be carried over to the next operation. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 51 +++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 14 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index e347ba61db65..0a9cdcfdd987 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -297,7 +297,9 @@ VARIABLE_OFFSET = 16*8 # clobbering all xmm registers # clobbering r10, r11, r12, r13, r14, r15 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP + vmovdqu AadHash(arg2), %xmm8 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey + add arg5, InLen(arg2) mov arg5, %r13 # save the number of bytes of plaintext/ciphertext and $-16, %r13 # r13 = r13 - (r13 mod 16) @@ -410,6 +412,9 @@ _eight_cipher_left\@: _zero_cipher_left\@: + vmovdqu %xmm14, AadHash(arg2) + vmovdqu %xmm9, CurCount(arg2) + cmp $16, arg5 jl _only_less_than_16\@ @@ -420,10 +425,14 @@ _zero_cipher_left\@: # handle the last <16 Byte block seperately + mov %r13, PBlockLen(arg2) vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vmovdqu %xmm9, CurCount(arg2) vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) + vmovdqu %xmm9, PBlockEncKey(arg2) sub $16, %r11 add %r13, %r11 @@ -451,6 +460,7 @@ _only_less_than_16\@: vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) + vmovdqu %xmm9, PBlockEncKey(arg2) lea SHIFT_MASK+16(%rip), %r12 sub %r13, %r12 # adjust the shuffle mask pointer to be @@ -480,6 +490,7 @@ _final_ghash_mul\@: vpxor %xmm2, %xmm14, %xmm14 #GHASH computation for the last <16 Byte block \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + vmovdqu %xmm14, AadHash(arg2) sub %r13, %r11 add $16, %r11 .else @@ -491,6 +502,7 @@ _final_ghash_mul\@: vpxor %xmm9, %xmm14, %xmm14 #GHASH computation for the last <16 Byte block \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + vmovdqu %xmm14, AadHash(arg2) sub %r13, %r11 add $16, %r11 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext @@ -526,12 +538,16 @@ _multiple_of_16_bytes\@: # Output: Authorization Tag (AUTH_TAG) # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 .macro GCM_COMPLETE GHASH_MUL REP - mov arg8, %r12 # r12 = aadLen (number of bytes) + vmovdqu AadHash(arg2), %xmm14 + vmovdqu HashKey(arg2), %xmm13 + + mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 - shl $3, arg5 # len(C) in bits (*128) - vmovq arg5, %xmm1 + 
mov InLen(arg2), %r12 + shl $3, %r12 # len(C) in bits (*128) + vmovq %r12, %xmm1 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) @@ -539,8 +555,7 @@ _multiple_of_16_bytes\@: \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - mov arg6, %rax # rax = *Y0 - vmovdqu (%rax), %xmm9 # xmm9 = Y0 + vmovdqu OrigIV(arg2), %xmm9 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) @@ -662,6 +677,20 @@ _get_AAD_done\@: .endm .macro INIT GHASH_MUL PRECOMPUTE + mov arg6, %r11 + mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length + xor %r11d, %r11d + mov %r11, InLen(arg2) # ctx_data.in_length = 0 + + mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 + mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 + mov arg4, %rax + movdqu (%rax), %xmm0 + movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv + + vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 + movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv + vmovdqu (arg3), %xmm6 # xmm6 = HashKey vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 @@ -809,10 +838,7 @@ _get_AAD_done\@: xor %r11d, %r11d # start AES for num_initial_blocks blocks - mov arg6, %rax # rax = *Y0 - vmovdqu (%rax), \CTR # CTR = Y0 - vpshufb SHUF_MASK(%rip), \CTR, \CTR - + vmovdqu CurCount(arg2), \CTR i = (9-\num_initial_blocks) setreg @@ -1748,16 +1774,13 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) setreg - vmovdqu AadHash(arg2), reg_i + vmovdqu AadHash(arg2), reg_i # initialize the data pointer offset as zero xor %r11d, %r11d # start AES for num_initial_blocks blocks - mov arg6, %rax # rax = *Y0 - vmovdqu (%rax), \CTR # CTR = Y0 - vpshufb SHUF_MASK(%rip), \CTR, \CTR - + vmovdqu CurCount(arg2), \CTR i = (9-\num_initial_blocks) setreg -- cgit v1.2.3 From 517a448e09846732d46e983a2195002d05857919 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:59:11 +0000 Subject: crypto: aesni - Move ghash_mul to GCM_COMPLETE Prepare to handle partial blocks between scatter/gather calls. For the last partial block, we only want to calculate the aadhash in GCM_COMPLETE, and a new partial block macro will handle both aadhash update and encrypting partial blocks between calls. 
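The combined effect of this and the following patches can be modelled in a few lines of ordinary C. The sketch below is purely illustrative -- the names are hypothetical and a trivial mixing step stands in for GHASH_MUL -- but it shows the structural point: bytes of a trailing partial block are mixed into the running hash as soon as they arrive, while the per-block multiply runs only once the block is complete, or at finalize time for the very last block.

#include <stdint.h>

struct ghash_model {
        uint8_t hash[16];       /* plays the role of AadHash   */
        unsigned int plen;      /* plays the role of PBlockLen */
};

static void mul_step(uint8_t hash[16])
{
        int i;

        /* stand-in for GHASH_MUL: any per-block mixing would do here */
        for (i = 0; i < 16; i++)
                hash[i] = (uint8_t)(hash[i] * 7 + 1);
}

static void model_update(struct ghash_model *s, const uint8_t *in,
                         unsigned int len)
{
        while (len) {
                unsigned int i, take = 16 - s->plen;

                if (take > len)
                        take = len;
                for (i = 0; i < take; i++)      /* mix bytes immediately */
                        s->hash[s->plen + i] ^= in[i];
                s->plen += take;
                in += take;
                len -= take;
                if (s->plen == 16) {            /* block complete:       */
                        mul_step(s->hash);      /* multiply now          */
                        s->plen = 0;
                }
        }
}

static void model_finalize(struct ghash_model *s)
{
        if (s->plen) {          /* deferred multiply for the last block */
                mul_step(s->hash);
                s->plen = 0;
        }
}

int main(void)
{
        static const uint8_t msg[21];   /* one full block + 5-byte tail */
        struct ghash_model s = { { 0 }, 0 };

        model_update(&s, msg, 21);      /* tail mixed, multiply deferred */
        model_finalize(&s);             /* deferred multiply happens here */
        return 0;
}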
Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 0a9cdcfdd987..44a4a8b43ca4 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -488,8 +488,7 @@ _final_ghash_mul\@: vpand %xmm1, %xmm2, %xmm2 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 vpxor %xmm2, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + vmovdqu %xmm14, AadHash(arg2) sub %r13, %r11 add $16, %r11 @@ -500,8 +499,7 @@ _final_ghash_mul\@: vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 vpxor %xmm9, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + vmovdqu %xmm14, AadHash(arg2) sub %r13, %r11 add $16, %r11 @@ -541,6 +539,14 @@ _multiple_of_16_bytes\@: vmovdqu AadHash(arg2), %xmm14 vmovdqu HashKey(arg2), %xmm13 + mov PBlockLen(arg2), %r12 + cmp $0, %r12 + je _partial_done\@ + + #GHASH computation for the last <16 Byte block + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + +_partial_done\@: mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) shl $3, %r12 # convert into number of bits vmovd %r12d, %xmm15 # len(A) in xmm15 -- cgit v1.2.3 From ec8c02d9a30b8324d7aae9e4e7a08973a8eaa8b4 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:59:26 +0000 Subject: crypto: aesni - Introduce READ_PARTIAL_BLOCK macro Introduce READ_PARTIAL_BLOCK macro, and use it in the two existing partial block cases: AAD and the end of ENC_DEC. In particular, the ENC_DEC case should be faster, since we read by 8/4 bytes if possible. This macro will also be used to read partial blocks between enc_update and dec_update calls. 
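The new macro is easiest to understand through a C model of what it computes. On little-endian x86 the result is the same as copying len bytes into a zeroed 16-byte block; the point of the asm is that it never loads memory past src + len (one 8-byte load where possible, then single bytes), so it is safe at the very end of a buffer. This is only an illustrative model, not the macro itself:

#include <stdint.h>
#include <string.h>

/* C model of READ_PARTIAL_BLOCK for 0 < len < 16 (illustrative only) */
static void read_partial_block(const uint8_t *src, unsigned int len,
                               uint8_t block[16])
{
        uint64_t lo = 0, hi = 0;
        unsigned int i;

        if (len >= 8) {
                memcpy(&lo, src, 8);            /* one quadword load     */
                for (i = 8; i < len; i++)       /* remaining tail bytes  */
                        hi |= (uint64_t)src[i] << (8 * (i - 8));
        } else {
                for (i = 0; i < len; i++)       /* fewer than 8 bytes    */
                        lo |= (uint64_t)src[i] << (8 * i);
        }
        memcpy(block, &lo, 8);                  /* low half of XMMDst    */
        memcpy(block + 8, &hi, 8);              /* high half of XMMDst   */
}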
Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 102 ++++++++++++++++++------------- 1 file changed, 59 insertions(+), 43 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 44a4a8b43ca4..ff00ad19064d 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -415,68 +415,56 @@ _zero_cipher_left\@: vmovdqu %xmm14, AadHash(arg2) vmovdqu %xmm9, CurCount(arg2) - cmp $16, arg5 - jl _only_less_than_16\@ - + # check for 0 length mov arg5, %r13 and $15, %r13 # r13 = (arg5 mod 16) je _multiple_of_16_bytes\@ - # handle the last <16 Byte block seperately + # handle the last <16 Byte block separately mov %r13, PBlockLen(arg2) - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn vmovdqu %xmm9, CurCount(arg2) vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) vmovdqu %xmm9, PBlockEncKey(arg2) - sub $16, %r11 - add %r13, %r11 - vmovdqu (arg4, %r11), %xmm1 # receive the last <16 Byte block - - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes - jmp _final_ghash_mul\@ - -_only_less_than_16\@: - # check for 0 length - mov arg5, %r13 - and $15, %r13 # r13 = (arg5 mod 16) + cmp $16, arg5 + jge _large_enough_update\@ - je _multiple_of_16_bytes\@ + lea (arg4,%r11,1), %r10 + mov %r13, %r12 - # handle the last <16 Byte block separately - - - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) - - vmovdqu %xmm9, PBlockEncKey(arg2) + READ_PARTIAL_BLOCK %r10 %r12 %xmm1 lea SHIFT_MASK+16(%rip), %r12 sub %r13, %r12 # adjust the shuffle mask pointer to be # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) + # number of bytes in plaintext mod 16) -_get_last_16_byte_loop\@: - movb (arg4, %r11), %al - movb %al, TMP1 (%rsp , %r11) - add $1, %r11 - cmp %r13, %r11 - jne _get_last_16_byte_loop\@ + jmp _final_ghash_mul\@ + +_large_enough_update\@: + sub $16, %r11 + add %r13, %r11 + + # receive the last <16 Byte block + vmovdqu (arg4, %r11, 1), %xmm1 - vmovdqu TMP1(%rsp), %xmm1 + sub %r13, %r11 + add $16, %r11 - sub $16, %r11 + lea SHIFT_MASK+16(%rip), %r12 + # adjust the shuffle mask pointer to be able to shift 16-r13 bytes + # (r13 is the number of bytes in plaintext mod 16) + sub %r13, %r12 + # get the appropriate shuffle mask + vmovdqu (%r12), %xmm2 + # shift right 16-r13 bytes + vpshufb %xmm2, %xmm1, %xmm1 _final_ghash_mul\@: .if \ENC_DEC == DEC @@ -490,8 +478,6 @@ _final_ghash_mul\@: vpxor %xmm2, %xmm14, %xmm14 vmovdqu %xmm14, AadHash(arg2) - sub %r13, %r11 - add $16, %r11 .else vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to @@ -501,8 +487,6 @@ _final_ghash_mul\@: vpxor %xmm9, %xmm14, %xmm14 vmovdqu %xmm14, AadHash(arg2) - sub %r13, %r11 - add $16, %r11 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext .endif @@ -721,6 +705,38 @@ _get_AAD_done\@: \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 .endm + +# Reads DLEN bytes starting at DPTR and stores in XMMDst +# where 0 < DLEN < 16 +# 
Clobbers %rax, DLEN +.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst + vpxor \XMMDst, \XMMDst, \XMMDst + + cmp $8, \DLEN + jl _read_lt8_\@ + mov (\DPTR), %rax + vpinsrq $0, %rax, \XMMDst, \XMMDst + sub $8, \DLEN + jz _done_read_partial_block_\@ + xor %eax, %eax +_read_next_byte_\@: + shl $8, %rax + mov 7(\DPTR, \DLEN, 1), %al + dec \DLEN + jnz _read_next_byte_\@ + vpinsrq $1, %rax, \XMMDst, \XMMDst + jmp _done_read_partial_block_\@ +_read_lt8_\@: + xor %eax, %eax +_read_next_byte_lt8_\@: + shl $8, %rax + mov -1(\DPTR, \DLEN, 1), %al + dec \DLEN + jnz _read_next_byte_lt8_\@ + vpinsrq $0, %rax, \XMMDst, \XMMDst +_done_read_partial_block_\@: +.endm + #ifdef CONFIG_AS_AVX ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -- cgit v1.2.3 From e044d5056396029cc12ed5354aa2a073b747195a Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:59:38 +0000 Subject: crypto: aesni - Introduce partial block macro Before this diff, multiple calls to GCM_ENC_DEC will succeed, but only if all calls are a multiple of 16 bytes. Handle partial blocks at the start of GCM_ENC_DEC, and update aadhash as appropriate. The data offset %r11 is also updated after the partial block. Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 156 +++++++++++++++++++++++++++++-- 1 file changed, 150 insertions(+), 6 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index ff00ad19064d..af45fc57db90 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -301,6 +301,12 @@ VARIABLE_OFFSET = 16*8 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey add arg5, InLen(arg2) + # initialize the data pointer offset as zero + xor %r11d, %r11d + + PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC + sub %r11, arg5 + mov arg5, %r13 # save the number of bytes of plaintext/ciphertext and $-16, %r13 # r13 = r13 - (r13 mod 16) @@ -737,6 +743,150 @@ _read_next_byte_lt8_\@: _done_read_partial_block_\@: .endm +# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks +# between update calls. 
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK +# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context +# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 +.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ + AAD_HASH ENC_DEC + mov PBlockLen(arg2), %r13 + cmp $0, %r13 + je _partial_block_done_\@ # Leave Macro if no partial blocks + # Read in input data without over reading + cmp $16, \PLAIN_CYPH_LEN + jl _fewer_than_16_bytes_\@ + vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm + jmp _data_read_\@ + +_fewer_than_16_bytes_\@: + lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 + mov \PLAIN_CYPH_LEN, %r12 + READ_PARTIAL_BLOCK %r10 %r12 %xmm1 + + mov PBlockLen(arg2), %r13 + +_data_read_\@: # Finished reading in data + + vmovdqu PBlockEncKey(arg2), %xmm9 + vmovdqu HashKey(arg2), %xmm13 + + lea SHIFT_MASK(%rip), %r12 + + # adjust the shuffle mask pointer to be able to shift r13 bytes + # r16-r13 is the number of bytes in plaintext mod 16) + add %r13, %r12 + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes + +.if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm3 + pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r10 + add %r13, %r10 + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling + sub $16, %r10 + # Determine if if partial block is not being filled and + # shift mask accordingly + jge _no_extra_mask_1_\@ + sub %r10, %r12 +_no_extra_mask_1_\@: + + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 + + vpand %xmm1, %xmm3, %xmm3 + vmovdqa SHUF_MASK(%rip), %xmm10 + vpshufb %xmm10, %xmm3, %xmm3 + vpshufb %xmm2, %xmm3, %xmm3 + vpxor %xmm3, \AAD_HASH, \AAD_HASH + + cmp $0, %r10 + jl _partial_incomplete_1_\@ + + # GHASH computation for the last <16 Byte block + \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + xor %eax,%eax + + mov %rax, PBlockLen(arg2) + jmp _dec_done_\@ +_partial_incomplete_1_\@: + add \PLAIN_CYPH_LEN, PBlockLen(arg2) +_dec_done_\@: + vmovdqu \AAD_HASH, AadHash(arg2) +.else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r10 + add %r13, %r10 + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling + sub $16, %r10 + # Determine if if partial block is not being filled and + # shift mask accordingly + jge _no_extra_mask_2_\@ + sub %r10, %r12 +_no_extra_mask_2_\@: + + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 + + vmovdqa SHUF_MASK(%rip), %xmm1 + vpshufb %xmm1, %xmm9, %xmm9 + vpshufb %xmm2, %xmm9, %xmm9 + vpxor %xmm9, \AAD_HASH, \AAD_HASH + + cmp $0, %r10 + jl _partial_incomplete_2_\@ + + # GHASH computation for the last <16 Byte block + \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + xor %eax,%eax + + mov %rax, PBlockLen(arg2) + jmp _encode_done_\@ +_partial_incomplete_2_\@: + add \PLAIN_CYPH_LEN, PBlockLen(arg2) +_encode_done_\@: + vmovdqu \AAD_HASH, AadHash(arg2) + + vmovdqa SHUF_MASK(%rip), %xmm10 + # shuffle xmm9 back to output as ciphertext + vpshufb %xmm10, %xmm9, %xmm9 + vpshufb %xmm2, %xmm9, %xmm9 +.endif + # output encrypted Bytes + cmp $0, %r10 + jl _partial_fill_\@ + mov %r13, %r12 + mov $16, %r13 + # Set r13 to be the number of bytes to write out + sub %r12, %r13 + jmp _count_set_\@ 
+_partial_fill_\@: + mov \PLAIN_CYPH_LEN, %r13 +_count_set_\@: + vmovdqa %xmm9, %xmm0 + vmovq %xmm0, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_\@ + + mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) + add $8, \DATA_OFFSET + psrldq $8, %xmm0 + vmovq %xmm0, %rax + sub $8, %r13 +_less_than_8_bytes_left_\@: + movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) + add $1, \DATA_OFFSET + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_\@ +_partial_block_done_\@: +.endm # PARTIAL_BLOCK + #ifdef CONFIG_AS_AVX ############################################################################### # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) @@ -856,9 +1006,6 @@ _done_read_partial_block_\@: setreg vmovdqu AadHash(arg2), reg_i - # initialize the data pointer offset as zero - xor %r11d, %r11d - # start AES for num_initial_blocks blocks vmovdqu CurCount(arg2), \CTR @@ -1798,9 +1945,6 @@ ENDPROC(aesni_gcm_dec_avx_gen2) setreg vmovdqu AadHash(arg2), reg_i - # initialize the data pointer offset as zero - xor %r11d, %r11d - # start AES for num_initial_blocks blocks vmovdqu CurCount(arg2), \CTR -- cgit v1.2.3 From 603f8c3b0dbbe21fabb7e005f57883b21aaadd82 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Mon, 10 Dec 2018 19:59:59 +0000 Subject: crypto: aesni - Add scatter/gather avx stubs, and use them in C Add the appropriate scatter/gather stubs to the avx asm. In the C code, we can now always use crypt_by_sg, since both sse and asm code now support scatter/gather. Introduce a new struct, aesni_gcm_tfm, that is initialized on startup to point to either the SSE, AVX, or AVX2 versions of the four necessary encryption/decryption routines. GENX_OPTSIZE is still checked at the start of crypt_by_sg. The total size of the data is checked, since the additional overhead is in the init function, calculating additional HashKeys. 
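The aesni_gcm_tfm indirection introduced here is a plain function-pointer table: it is selected once at module init and every caller then drives it the same way -- init, any number of enc_update/dec_update calls, then finalize. A minimal standalone model of that calling pattern (hypothetical names and bodies, not the kernel structures) looks like this:

#include <stdio.h>

struct gcm_ops {                        /* models aesni_gcm_tfm_s */
        void (*init)(void);
        void (*enc_update)(const unsigned char *in, unsigned long len);
        void (*finalize)(void);
};

static void sse_init(void) { puts("init (SSE path)"); }
static void sse_enc_update(const unsigned char *in, unsigned long len)
{
        (void)in;
        printf("encrypt %lu bytes (SSE path)\n", len);
}
static void sse_finalize(void) { puts("finalize (SSE path)"); }

static const struct gcm_ops gcm_ops_sse = {
        .init           = sse_init,
        .enc_update     = sse_enc_update,
        .finalize       = sse_finalize,
};

/* chosen once, e.g. from CPU feature detection at module init */
static const struct gcm_ops *gcm_ops = &gcm_ops_sse;

int main(void)
{
        static const unsigned char chunk[32];

        gcm_ops->init();
        gcm_ops->enc_update(chunk, 16);      /* one call per sg chunk */
        gcm_ops->enc_update(chunk + 16, 16);
        gcm_ops->finalize();
        return 0;
}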
Signed-off-by: Dave Watson Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_avx-x86_64.S | 181 +++++++++------- arch/x86/crypto/aesni-intel_glue.c | 349 +++++++++---------------------- 2 files changed, 198 insertions(+), 332 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index af45fc57db90..91c039ab5699 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -518,14 +518,13 @@ _less_than_8_bytes_left\@: ############################# _multiple_of_16_bytes\@: - GCM_COMPLETE \GHASH_MUL \REP .endm # GCM_COMPLETE Finishes update of tag of last partial block # Output: Authorization Tag (AUTH_TAG) # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE GHASH_MUL REP +.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN vmovdqu AadHash(arg2), %xmm14 vmovdqu HashKey(arg2), %xmm13 @@ -560,8 +559,8 @@ _partial_done\@: _return_T\@: - mov arg9, %r10 # r10 = authTag - mov arg10, %r11 # r11 = auth_tag_len + mov \AUTH_TAG, %r10 # r10 = authTag + mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len cmp $16, %r11 je _T_16\@ @@ -680,14 +679,14 @@ _get_AAD_done\@: mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 - mov arg4, %rax + mov arg3, %rax movdqu (%rax), %xmm0 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv - vmovdqu (arg3), %xmm6 # xmm6 = HashKey + vmovdqu (arg4), %xmm6 # xmm6 = HashKey vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey @@ -1776,88 +1775,100 @@ _initial_blocks_done\@: # const u8 *aad, /* Additional Authentication Data (AAD)*/ # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# -ENTRY(aesni_gcm_precomp_avx_gen2) +ENTRY(aesni_gcm_init_avx_gen2) FUNC_SAVE INIT GHASH_MUL_AVX, PRECOMPUTE_AVX FUNC_RESTORE ret -ENDPROC(aesni_gcm_precomp_avx_gen2) +ENDPROC(aesni_gcm_init_avx_gen2) ############################################################################### -#void aesni_gcm_enc_avx_gen2( +#void aesni_gcm_enc_update_avx_gen2( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_context_data *data, # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # const u8 *in, /* Plaintext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ ############################################################################### -ENTRY(aesni_gcm_enc_avx_gen2) +ENTRY(aesni_gcm_enc_update_avx_gen2) FUNC_SAVE mov keysize, %eax cmp $32, %eax - je key_256_enc + je key_256_enc_update cmp $16, %eax - je key_128_enc + je key_128_enc_update # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 FUNC_RESTORE ret -key_128_enc: +key_128_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 FUNC_RESTORE ret -key_256_enc: +key_256_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_avx_gen2) +ENDPROC(aesni_gcm_enc_update_avx_gen2) ############################################################################### -#void aesni_gcm_dec_avx_gen2( +#void aesni_gcm_dec_update_avx_gen2( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_dec_avx_gen2) +ENTRY(aesni_gcm_dec_update_avx_gen2) FUNC_SAVE mov keysize,%eax cmp $32, %eax - je key_256_dec + je key_256_dec_update cmp $16, %eax - je key_128_dec + je key_128_dec_update # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 FUNC_RESTORE ret -key_128_dec: +key_128_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 FUNC_RESTORE ret -key_256_dec: +key_256_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_avx_gen2) +ENDPROC(aesni_gcm_dec_update_avx_gen2) + +############################################################################### +#void aesni_gcm_finalize_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. 
*/ +############################################################################### +ENTRY(aesni_gcm_finalize_avx_gen2) + FUNC_SAVE + mov keysize,%eax + cmp $32, %eax + je key_256_finalize + cmp $16, %eax + je key_128_finalize + # must be 192 + GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 + FUNC_RESTORE + ret +key_128_finalize: + GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 + FUNC_RESTORE + ret +key_256_finalize: + GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_finalize_avx_gen2) + #endif /* CONFIG_AS_AVX */ #ifdef CONFIG_AS_AVX2 @@ -2724,24 +2735,23 @@ _initial_blocks_done\@: ############################################################# -#void aesni_gcm_precomp_avx_gen4 +#void aesni_gcm_init_avx_gen4 # (gcm_data *my_ctx_data, # gcm_context_data *data, -# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ # u8 *iv, /* Pre-counter block j0: 4 byte salt # (from Security Association) concatenated with 8 byte # Initialisation Vector (from IPSec ESP Payload) # concatenated with 0x00000001. 16-byte aligned pointer. */ +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ # const u8 *aad, /* Additional Authentication Data (AAD)*/ # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ ############################################################# -ENTRY(aesni_gcm_precomp_avx_gen4) +ENTRY(aesni_gcm_init_avx_gen4) FUNC_SAVE INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 FUNC_RESTORE ret -ENDPROC(aesni_gcm_precomp_avx_gen4) - +ENDPROC(aesni_gcm_init_avx_gen4) ############################################################################### #void aesni_gcm_enc_avx_gen4( @@ -2749,74 +2759,85 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) # gcm_context_data *data, # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # const u8 *in, /* Plaintext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ ############################################################################### -ENTRY(aesni_gcm_enc_avx_gen4) +ENTRY(aesni_gcm_enc_update_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax - je key_256_enc4 + je key_256_enc_update4 cmp $16, %eax - je key_128_enc4 + je key_128_enc_update4 # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 FUNC_RESTORE ret -key_128_enc4: +key_128_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 FUNC_RESTORE ret -key_256_enc4: +key_256_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_avx_gen4) +ENDPROC(aesni_gcm_enc_update_avx_gen4) ############################################################################### -#void aesni_gcm_dec_avx_gen4( +#void aesni_gcm_dec_update_avx_gen4( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_dec_avx_gen4) +ENTRY(aesni_gcm_dec_update_avx_gen4) FUNC_SAVE mov keysize,%eax cmp $32, %eax - je key_256_dec4 + je key_256_dec_update4 cmp $16, %eax - je key_128_dec4 + je key_128_dec_update4 # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 FUNC_RESTORE ret -key_128_dec4: +key_128_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 FUNC_RESTORE ret -key_256_dec4: +key_256_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 FUNC_RESTORE ret -ENDPROC(aesni_gcm_dec_avx_gen4) +ENDPROC(aesni_gcm_dec_update_avx_gen4) + +############################################################################### +#void aesni_gcm_finalize_avx_gen4( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. 
*/ +############################################################################### +ENTRY(aesni_gcm_finalize_avx_gen4) + FUNC_SAVE + mov keysize,%eax + cmp $32, %eax + je key_256_finalize4 + cmp $16, %eax + je key_128_finalize4 + # must be 192 + GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 + FUNC_RESTORE + ret +key_128_finalize4: + GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 + FUNC_RESTORE + ret +key_256_finalize4: + GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_finalize_avx_gen4) #endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 2648842f1c3f..1321700d6647 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -175,6 +175,32 @@ asmlinkage void aesni_gcm_finalize(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); +static struct aesni_gcm_tfm_s { +void (*init)(void *ctx, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, const u8 *aad, + unsigned long aad_len); +void (*enc_update)(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long plaintext_len); +void (*dec_update)(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +void (*finalize)(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); +} *aesni_gcm_tfm; + +struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = { + .init = &aesni_gcm_init, + .enc_update = &aesni_gcm_enc_update, + .dec_update = &aesni_gcm_dec_update, + .finalize = &aesni_gcm_finalize, +}; + #ifdef CONFIG_AS_AVX asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); @@ -183,16 +209,27 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); /* - * asmlinkage void aesni_gcm_precomp_avx_gen2() + * asmlinkage void aesni_gcm_init_avx_gen2() * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
*/ -asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *hash_subkey, - u8 *iv, - const u8 *aad, - unsigned long aad_len); +asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, + const u8 *aad, + unsigned long aad_len); + +asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, unsigned long plaintext_len); +asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, struct gcm_context_data *gdata, u8 *out, @@ -206,55 +243,38 @@ asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -static void aesni_gcm_enc_avx(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - if (plaintext_len < AVX_GEN2_OPTSIZE) { - aesni_gcm_enc(ctx, data, out, in, - plaintext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } -} +struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = { + .init = &aesni_gcm_init_avx_gen2, + .enc_update = &aesni_gcm_enc_update_avx_gen2, + .dec_update = &aesni_gcm_dec_update_avx_gen2, + .finalize = &aesni_gcm_finalize_avx_gen2, +}; -static void aesni_gcm_dec_avx(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - if (ciphertext_len < AVX_GEN2_OPTSIZE) { - aesni_gcm_dec(ctx, data, out, in, - ciphertext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } -} #endif #ifdef CONFIG_AS_AVX2 /* - * asmlinkage void aesni_gcm_precomp_avx_gen4() + * asmlinkage void aesni_gcm_init_avx_gen4() * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
*/ -asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *hash_subkey, - u8 *iv, - const u8 *aad, - unsigned long aad_len); +asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, + const u8 *aad, + unsigned long aad_len); + +asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, unsigned long plaintext_len); +asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, struct gcm_context_data *gdata, u8 *out, @@ -268,67 +288,15 @@ asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -static void aesni_gcm_enc_avx2(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - if (plaintext_len < AVX_GEN2_OPTSIZE) { - aesni_gcm_enc(ctx, data, out, in, - plaintext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); - } else if (plaintext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } -} +struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = { + .init = &aesni_gcm_init_avx_gen4, + .enc_update = &aesni_gcm_enc_update_avx_gen4, + .dec_update = &aesni_gcm_dec_update_avx_gen4, + .finalize = &aesni_gcm_finalize_avx_gen4, +}; -static void aesni_gcm_dec_avx2(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - if (ciphertext_len < AVX_GEN2_OPTSIZE) { - aesni_gcm_dec(ctx, data, out, in, - ciphertext_len, iv, hash_subkey, - aad, aad_len, auth_tag, auth_tag_len); - } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey, iv, - aad, aad_len); - aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv, - aad, aad_len, auth_tag, auth_tag_len); - } -} #endif -static void (*aesni_gcm_enc_tfm)(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long plaintext_len, - u8 *iv, u8 *hash_subkey, const u8 *aad, - unsigned long aad_len, u8 *auth_tag, - unsigned long auth_tag_len); - -static void (*aesni_gcm_dec_tfm)(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long ciphertext_len, - u8 *iv, u8 *hash_subkey, const u8 *aad, - unsigned long aad_len, u8 *auth_tag, - unsigned long auth_tag_len); - static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { @@ -810,6 +778,7 @@ static int gcmaes_crypt_by_sg(bool enc, 
struct aead_request *req, { struct crypto_aead *tfm = crypto_aead_reqtfm(req); unsigned long auth_tag_len = crypto_aead_authsize(tfm); + struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm; struct gcm_context_data data AESNI_ALIGN_ATTR; struct scatter_walk dst_sg_walk = {}; unsigned long left = req->cryptlen; @@ -827,6 +796,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, if (!enc) left -= auth_tag_len; +#ifdef CONFIG_AS_AVX2 + if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4) + gcm_tfm = &aesni_gcm_tfm_avx_gen2; +#endif +#ifdef CONFIG_AS_AVX + if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2) + gcm_tfm = &aesni_gcm_tfm_sse; +#endif + /* Linearize assoc, if not already linear */ if (req->src->length >= assoclen && req->src->length && (!PageHighMem(sg_page(req->src)) || @@ -851,7 +829,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, } kernel_fpu_begin(); - aesni_gcm_init(aes_ctx, &data, iv, + gcm_tfm->init(aes_ctx, &data, iv, hash_subkey, assoc, assoclen); if (req->src != req->dst) { while (left) { @@ -862,10 +840,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, len = min(srclen, dstlen); if (len) { if (enc) - aesni_gcm_enc_update(aes_ctx, &data, + gcm_tfm->enc_update(aes_ctx, &data, dst, src, len); else - aesni_gcm_dec_update(aes_ctx, &data, + gcm_tfm->dec_update(aes_ctx, &data, dst, src, len); } left -= len; @@ -883,10 +861,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, len = scatterwalk_clamp(&src_sg_walk, left); if (len) { if (enc) - aesni_gcm_enc_update(aes_ctx, &data, + gcm_tfm->enc_update(aes_ctx, &data, src, src, len); else - aesni_gcm_dec_update(aes_ctx, &data, + gcm_tfm->dec_update(aes_ctx, &data, src, src, len); } left -= len; @@ -895,7 +873,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, scatterwalk_done(&src_sg_walk, 1, left); } } - aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len); + gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len); kernel_fpu_end(); if (!assocmem) @@ -928,145 +906,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - u8 one_entry_in_sg = 0; - u8 *src, *dst, *assoc; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - struct scatter_walk src_sg_walk; - struct scatter_walk dst_sg_walk = {}; - struct gcm_context_data data AESNI_ALIGN_ATTR; - - if (aesni_gcm_enc_tfm == aesni_gcm_enc || - req->cryptlen < AVX_GEN2_OPTSIZE) { - return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, - aes_ctx); - } - if (sg_is_last(req->src) && - (!PageHighMem(sg_page(req->src)) || - req->src->offset + req->src->length <= PAGE_SIZE) && - sg_is_last(req->dst) && - (!PageHighMem(sg_page(req->dst)) || - req->dst->offset + req->dst->length <= PAGE_SIZE)) { - one_entry_in_sg = 1; - scatterwalk_start(&src_sg_walk, req->src); - assoc = scatterwalk_map(&src_sg_walk); - src = assoc + req->assoclen; - dst = src; - if (unlikely(req->src != req->dst)) { - scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; - } - } else { - /* Allocate memory for src, dst, assoc */ - assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, - GFP_ATOMIC); - if (unlikely(!assoc)) - return -ENOMEM; - scatterwalk_map_and_copy(assoc, req->src, 0, - req->assoclen + req->cryptlen, 0); - src = assoc + 
req->assoclen; - dst = src; - } - - kernel_fpu_begin(); - aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv, - hash_subkey, assoc, assoclen, - dst + req->cryptlen, auth_tag_len); - kernel_fpu_end(); - - /* The authTag (aka the Integrity Check Value) needs to be written - * back to the packet. */ - if (one_entry_in_sg) { - if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst - req->assoclen); - scatterwalk_advance(&dst_sg_walk, req->dst->length); - scatterwalk_done(&dst_sg_walk, 1, 0); - } - scatterwalk_unmap(assoc); - scatterwalk_advance(&src_sg_walk, req->src->length); - scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); - } else { - scatterwalk_map_and_copy(dst, req->dst, req->assoclen, - req->cryptlen + auth_tag_len, 1); - kfree(assoc); - } - return 0; + return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, + aes_ctx); } static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - u8 one_entry_in_sg = 0; - u8 *src, *dst, *assoc; - unsigned long tempCipherLen = 0; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 authTag[16]; - struct scatter_walk src_sg_walk; - struct scatter_walk dst_sg_walk = {}; - struct gcm_context_data data AESNI_ALIGN_ATTR; - int retval = 0; - - if (aesni_gcm_enc_tfm == aesni_gcm_enc || - req->cryptlen < AVX_GEN2_OPTSIZE) { - return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, - aes_ctx); - } - tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); - - if (sg_is_last(req->src) && - (!PageHighMem(sg_page(req->src)) || - req->src->offset + req->src->length <= PAGE_SIZE) && - sg_is_last(req->dst) && req->dst->length && - (!PageHighMem(sg_page(req->dst)) || - req->dst->offset + req->dst->length <= PAGE_SIZE)) { - one_entry_in_sg = 1; - scatterwalk_start(&src_sg_walk, req->src); - assoc = scatterwalk_map(&src_sg_walk); - src = assoc + req->assoclen; - dst = src; - if (unlikely(req->src != req->dst)) { - scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; - } - } else { - /* Allocate memory for src, dst, assoc */ - assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); - if (!assoc) - return -ENOMEM; - scatterwalk_map_and_copy(assoc, req->src, 0, - req->assoclen + req->cryptlen, 0); - src = assoc + req->assoclen; - dst = src; - } - - - kernel_fpu_begin(); - aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv, - hash_subkey, assoc, assoclen, - authTag, auth_tag_len); - kernel_fpu_end(); - - /* Compare generated tag with passed in tag. */ - retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ? 
- -EBADMSG : 0; - - if (one_entry_in_sg) { - if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst - req->assoclen); - scatterwalk_advance(&dst_sg_walk, req->dst->length); - scatterwalk_done(&dst_sg_walk, 1, 0); - } - scatterwalk_unmap(assoc); - scatterwalk_advance(&src_sg_walk, req->src->length); - scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); - } else { - scatterwalk_map_and_copy(dst, req->dst, req->assoclen, - tempCipherLen, 1); - kfree(assoc); - } - return retval; - + return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, + aes_ctx); } static int helper_rfc4106_encrypt(struct aead_request *req) @@ -1434,21 +1282,18 @@ static int __init aesni_init(void) #ifdef CONFIG_AS_AVX2 if (boot_cpu_has(X86_FEATURE_AVX2)) { pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - aesni_gcm_enc_tfm = aesni_gcm_enc_avx2; - aesni_gcm_dec_tfm = aesni_gcm_dec_avx2; + aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4; } else #endif #ifdef CONFIG_AS_AVX if (boot_cpu_has(X86_FEATURE_AVX)) { pr_info("AVX version of gcm_enc/dec engaged.\n"); - aesni_gcm_enc_tfm = aesni_gcm_enc_avx; - aesni_gcm_dec_tfm = aesni_gcm_dec_avx; + aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2; } else #endif { pr_info("SSE version of gcm_enc/dec engaged.\n"); - aesni_gcm_enc_tfm = aesni_gcm_enc; - aesni_gcm_dec_tfm = aesni_gcm_dec; + aesni_gcm_tfm = &aesni_gcm_tfm_sse; } aesni_ctr_enc_tfm = aesni_ctr_enc; #ifdef CONFIG_AS_AVX -- cgit v1.2.3 From f9c9bdb5131eee60dc3b92e5126d4c0e291703e2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Dec 2018 12:40:17 -0800 Subject: crypto: x86/chacha - avoid sleeping under kernel_fpu_begin() Passing atomic=true to skcipher_walk_virt() only makes the later skcipher_walk_done() calls use atomic memory allocations, not skcipher_walk_virt() itself. Thus, we have to move it outside of the preemption-disabled region (kernel_fpu_begin()/kernel_fpu_end()). (skcipher_walk_virt() only allocates memory for certain layouts of the input scatterlist, hence why I didn't notice this earlier...) 
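The resulting rule is simply an ordering constraint. Sketched with the same function names the patch below uses inside chacha_simd()/xchacha_simd(), the fixed call sequence is:

        err = skcipher_walk_virt(&walk, req, true);  /* may allocate, may sleep */
        if (err)
                return err;

        kernel_fpu_begin();          /* no sleeping until kernel_fpu_end() */
        err = chacha_simd_stream_xor(&walk, ctx, req->iv);
        kernel_fpu_end();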
Reported-by: syzbot+9bf843c33f782d73ae7d@syzkaller.appspotmail.com Fixes: 4af78261870a ("crypto: x86/chacha20 - add XChaCha20 support") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/chacha_glue.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/x86/crypto') diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 9b1d3fac4943..45c1c4143176 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -127,30 +127,27 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha_simd_stream_xor(struct skcipher_request *req, +static int chacha_simd_stream_xor(struct skcipher_walk *walk, struct chacha_ctx *ctx, u8 *iv) { u32 *state, state_buf[16 + 2] __aligned(8); - struct skcipher_walk walk; int next_yield = 4096; /* bytes until next FPU yield */ - int err; + int err = 0; BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - err = skcipher_walk_virt(&walk, req, true); - crypto_chacha_init(state, ctx, iv); - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; + while (walk->nbytes > 0) { + unsigned int nbytes = walk->nbytes; - if (nbytes < walk.total) { - nbytes = round_down(nbytes, walk.stride); + if (nbytes < walk->total) { + nbytes = round_down(nbytes, walk->stride); next_yield -= nbytes; } - chacha_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr, nbytes, ctx->nrounds); if (next_yield <= 0) { @@ -160,7 +157,7 @@ static int chacha_simd_stream_xor(struct skcipher_request *req, next_yield = 4096; } - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + err = skcipher_walk_done(walk, walk->nbytes - nbytes); } return err; @@ -170,13 +167,18 @@ static int chacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; int err; if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) return crypto_chacha_crypt(req); + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; + kernel_fpu_begin(); - err = chacha_simd_stream_xor(req, ctx, req->iv); + err = chacha_simd_stream_xor(&walk, ctx, req->iv); kernel_fpu_end(); return err; } @@ -185,6 +187,7 @@ static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; struct chacha_ctx subctx; u32 *state, state_buf[16 + 2] __aligned(8); u8 real_iv[16]; @@ -193,6 +196,10 @@ static int xchacha_simd(struct skcipher_request *req) if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) return crypto_xchacha_crypt(req); + err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; + BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); crypto_chacha_init(state, ctx, req->iv); @@ -204,7 +211,7 @@ static int xchacha_simd(struct skcipher_request *req) memcpy(&real_iv[0], req->iv + 24, 8); memcpy(&real_iv[8], req->iv + 16, 8); - err = chacha_simd_stream_xor(req, &subctx, real_iv); + err = chacha_simd_stream_xor(&walk, &subctx, real_iv); kernel_fpu_end(); -- cgit v1.2.3