arch/arm64/crypto/aes-cipher-core.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Scalar AES core transform
 *
 * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text

	rk		.req	x0
	out		.req	x1
	in		.req	x2
	rounds		.req	x3
	tt		.req	x2

	.macro		__pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift
	.ifc		\op\shift, b0
	ubfiz		\reg0, \in0, #2, #8
	ubfiz		\reg1, \in1e, #2, #8
	.else
	ubfx		\reg0, \in0, #\shift, #8
	ubfx		\reg1, \in1e, #\shift, #8
	.endif

	/*
	 * AArch64 cannot do byte size indexed loads from a table containing
	 * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a
	 * valid instruction. So perform the shift explicitly first for the
	 * high bytes (the low byte is shifted implicitly by using ubfiz rather
	 * than ubfx above)
	 */
	.ifnc		\op, b
	ldr		\reg0, [tt, \reg0, uxtw #2]
	ldr		\reg1, [tt, \reg1, uxtw #2]
	.else
	.if		\shift > 0
	lsl		\reg0, \reg0, #2
	lsl		\reg1, \reg1, #2
	.endif
	ldrb		\reg0, [tt, \reg0, uxtw]
	ldrb		\reg1, [tt, \reg1, uxtw]
	.endif
	.endm

	.macro		__pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift
	ubfx		\reg0, \in0, #\shift, #8
	ubfx		\reg1, \in1d, #\shift, #8
	ldr\op		\reg0, [tt, \reg0, uxtw #\sz]
	ldr\op		\reg1, [tt, \reg1, uxtw #\sz]
	.endm

	.macro		__hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op
	ldp		\out0, \out1, [rk], #8

	__pair\enc	\sz, \op, w12, w13, \in0, \in1, \in3, 0
	__pair\enc	\sz, \op, w14, w15, \in1, \in2, \in0, 8
	__pair\enc	\sz, \op, w16, w17, \in2, \in3, \in1, 16
	__pair\enc	\sz, \op, \t0, \t1, \in3, \in0, \in2, 24

	eor		\out0, \out0, w12
	eor		\out1, \out1, w13
	eor		\out0, \out0, w14, ror #24
	eor		\out1, \out1, w15, ror #24
	eor		\out0, \out0, w16, ror #16
	eor		\out1, \out1, w17, ror #16
	eor		\out0, \out0, \t0, ror #8
	eor		\out1, \out1, \t1, ror #8
	.endm

	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
	.endm

	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
	.endm

	.macro		do_crypt, round, ttab, ltab, bsz
	ldp		w4, w5, [in]
	ldp		w6, w7, [in, #8]
	ldp		w8, w9, [rk], #16
	ldp		w10, w11, [rk, #-8]

CPU_BE(	rev		w4, w4		)
CPU_BE(	rev		w5, w5		)
CPU_BE(	rev		w6, w6		)
CPU_BE(	rev		w7, w7		)

	eor		w4, w4, w8
	eor		w5, w5, w9
	eor		w6, w6, w10
	eor		w7, w7, w11

	adr_l		tt, \ttab

	tbnz		rounds, #1, 1f

0:	\round		w8, w9, w10, w11, w4, w5, w6, w7
	\round		w4, w5, w6, w7, w8, w9, w10, w11

1:	subs		rounds, rounds, #4
	\round		w8, w9, w10, w11, w4, w5, w6, w7
	b.ls		3f
2:	\round		w4, w5, w6, w7, w8, w9, w10, w11
	b		0b
3:	adr_l		tt, \ltab
	\round		w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b

CPU_BE(	rev		w4, w4		)
CPU_BE(	rev		w5, w5		)
CPU_BE(	rev		w6, w6		)
CPU_BE(	rev		w7, w7		)

	stp		w4, w5, [out]
	stp		w6, w7, [out, #8]
	ret
	.endm

SYM_FUNC_START(__aes_arm64_encrypt)
	do_crypt	fround, crypto_ft_tab, crypto_ft_tab + 1, 2
SYM_FUNC_END(__aes_arm64_encrypt)

	.align		5
SYM_FUNC_START(__aes_arm64_decrypt)
	do_crypt	iround, crypto_it_tab, crypto_aes_inv_sbox, 0
SYM_FUNC_END(__aes_arm64_decrypt)