/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// AES block cipher using AES-NI instructions
//
// Copyright 2026 Google LLC
//
// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
// AVX. It does use up to SSE4.1, which all CPUs with AES-NI have.
#include <linux/linkage.h>
.section .rodata
// On x86_64, static data must be addressed RIP-relative (required for the
// kernel's position-independent code model); on i386, plain absolute
// addressing is used.  This macro hides the difference.
#ifdef __x86_64__
#define RODATA(label) label(%rip)
#else
#define RODATA(label) label
#endif
// A mask for pshufb that extracts the last dword, rotates it right by 8
// bits, and copies the result to all four dwords.
// (Each destination dword is built from source bytes [13, 14, 15, 12],
// i.e. ror32(src_dword3, 8), since pshufb does out[i] = in[mask[i]].)
.p2align 4
.Lmask:
.byte 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12
// The AES round constants, used during key expansion
.Lrcon:
.long 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
.text
// Transform four dwords [a0, a1, a2, a3] in \a into
// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]. \tmp is a temporary xmm register.
//
// (pslldq shifts the whole register left by bytes, i.e. toward the more
// significant end, so pslldq $4 moves dword i into slot i+1 and fills
// slot 0 with zeroes.  Two shift+xor steps give the full prefix sum.)
//
// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
// if the temporary register were zero-initialized ahead of time. We instead do
// it in an easier-to-understand way that doesn't require zero-initialization
// and avoids the unusual shufps instruction. movdqa is usually "free" anyway.
.macro _prefix_sum a, tmp
movdqa \a, \tmp // [a0, a1, a2, a3]
pslldq $4, \a // [0, a0, a1, a2]
pxor \tmp, \a // [a0, a0^a1, a1^a2, a2^a3]
movdqa \a, \tmp // save the partial prefix sums
pslldq $8, \a // [0, 0, a0, a0^a1]
pxor \tmp, \a // [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
.endm
// Generate the next round key and store it to (RNDKEYS), also leaving it
// in \a.  On entry, \a holds the round key whose dwords get prefix-summed
// (the previous round key for AES-128, or the first key of the previous
// pair for AES-256), and \b holds the immediately preceding round key,
// whose last dword receives the rotate+SubBytes+rcon treatment.
// Reads RCON, MASK, and RNDKEYS; clobbers %xmm2 and %xmm3.
.macro _gen_round_key a, b
// Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
// the last dword of the previous round key (given in \b).
//
// 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
// It is used here solely for the SubBytes and the XOR. The ShiftRows
// is a no-op because all four columns are the same here.
//
// Don't use the 'aeskeygenassist' instruction, since:
// - On most Intel CPUs it is microcoded, making it have a much higher
// latency and use more execution ports than 'aesenclast'.
// - It cannot be used in a loop, since it requires an immediate.
// - It doesn't do much more than 'aesenclast' in the first place.
movdqa \b, %xmm2
pshufb MASK, %xmm2 // broadcast ror32(last dword of \b, 8)
aesenclast RCON, %xmm2
// XOR in the prefix sum of the four dwords of \a, which is the
// previous round key (AES-128) or the first round key in the previous
// pair of round keys (AES-256). The result is the next round key.
_prefix_sum \a, tmp=%xmm3
pxor %xmm2, \a
// Store the next round key to memory. Also leave it in \a.
movdqu \a, (RNDKEYS)
.endm
// Expand a raw AES key into the standard round keys and, optionally, the
// round keys for the Equivalent Inverse Cipher (used for decryption).
//
// Register conventions (see the .set aliases below):
//   RNDKEYS     - out: standard round keys, 16 bytes each
//                 (11 keys for AES-128, 15 for AES-256)
//   INV_RNDKEYS - out: inverse-cipher round keys, or NULL to skip them
//   IN_KEY      - in:  the raw key, 16 bytes if \is_aes128 else 32
//   RCON        - %xmm6: four copies of the current round constant
//   MASK        - %xmm7: pshufb control implementing ror32 of last dword
// Clobbers %xmm0-%xmm3 and, in the AES-256 path, %xmm5.  On i386 the
// callee-saved %ebx and %esi are saved and restored around their use.
.macro _aes_expandkey_aesni is_aes128
#ifdef __x86_64__
// Arguments
.set RNDKEYS, %rdi
.set INV_RNDKEYS, %rsi
.set IN_KEY, %rdx
// Other local variables
.set RCON_PTR, %rcx
.set COUNTER, %eax
#else
// Arguments, assuming -mregparm=3
.set RNDKEYS, %eax
.set INV_RNDKEYS, %edx
.set IN_KEY, %ecx
// Other local variables
.set RCON_PTR, %ebx
.set COUNTER, %esi
#endif
.set RCON, %xmm6
.set MASK, %xmm7
#ifdef __i386__
// %ebx and %esi are callee-saved in the i386 ABI.
push %ebx
push %esi
#endif
.if \is_aes128
// AES-128: the first round key is simply a copy of the raw key.
movdqu (IN_KEY), %xmm0
movdqu %xmm0, (RNDKEYS)
.else
// AES-256: the first two round keys are simply a copy of the raw key.
movdqu (IN_KEY), %xmm0
movdqu %xmm0, (RNDKEYS)
movdqu 16(IN_KEY), %xmm1
movdqu %xmm1, 16(RNDKEYS)
add $32, RNDKEYS
.endif
// Generate the remaining round keys.
movdqa RODATA(.Lmask), MASK
.if \is_aes128
// AES-128: generate round keys 1 through 10, consuming one round
// constant from .Lrcon per key.
lea RODATA(.Lrcon), RCON_PTR
mov $10, COUNTER
.Lgen_next_aes128_round_key:
add $16, RNDKEYS // advance to the slot for the key being generated
movd (RCON_PTR), RCON
pshufd $0x00, RCON, RCON // broadcast rcon[i] to all four dwords
add $4, RCON_PTR
_gen_round_key %xmm0, %xmm0
dec COUNTER
jnz .Lgen_next_aes128_round_key
.else
// AES-256: only the first 7 round constants are needed, so instead of
// loading each one from memory, just start by loading [1, 1, 1, 1] and
// then generate the rest by doubling.
pshufd $0x00, RODATA(.Lrcon), RCON
pxor %xmm5, %xmm5 // All-zeroes
// 7 iterations: 6 full pairs (keys 2-13) plus the final key 14.
mov $7, COUNTER
.Lgen_next_aes256_round_key_pair:
// Generate the next AES-256 round key: either the first of a pair of
// two, or the last one.
_gen_round_key %xmm0, %xmm1
dec COUNTER
jz .Lgen_aes256_round_keys_done
// Generate the second AES-256 round key of the pair. Compared to the
// first, there's no rotation and no XOR of a round constant.
pshufd $0xff, %xmm0, %xmm2 // Get four copies of last dword
aesenclast %xmm5, %xmm2 // Just does SubBytes
_prefix_sum %xmm1, tmp=%xmm3
pxor %xmm2, %xmm1
movdqu %xmm1, 16(RNDKEYS)
add $32, RNDKEYS
paddd RCON, RCON // RCON <<= 1
jmp .Lgen_next_aes256_round_key_pair
.Lgen_aes256_round_keys_done:
.endif
// If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
// Inverse Cipher to it. To do that, reverse the standard round keys,
// and apply aesimc (InvMixColumn) to each except the first and last.
// RNDKEYS currently points at the last standard round key.
test INV_RNDKEYS, INV_RNDKEYS
jz .Ldone\@
movdqu (RNDKEYS), %xmm0 // Last standard round key
movdqu %xmm0, (INV_RNDKEYS) // => First inverse round key
// COUNTER = number of "middle" round keys, which get aesimc applied:
// 9 for AES-128 (keys 1-9), 13 for AES-256 (keys 1-13).
.if \is_aes128
mov $9, COUNTER
.else
mov $13, COUNTER
.endif
.Lgen_next_inv_round_key\@:
sub $16, RNDKEYS
add $16, INV_RNDKEYS
movdqu (RNDKEYS), %xmm0
aesimc %xmm0, %xmm0
movdqu %xmm0, (INV_RNDKEYS)
dec COUNTER
jnz .Lgen_next_inv_round_key\@
movdqu -16(RNDKEYS), %xmm0 // First standard round key
movdqu %xmm0, 16(INV_RNDKEYS) // => Last inverse round key
.Ldone\@:
#ifdef __i386__
pop %esi
pop %ebx
#endif
RET
.endm
// Expand a 128-bit AES key.  inv_rndkeys may be NULL if the decryption
// (Equivalent Inverse Cipher) round keys are not needed.
//
// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
// const u8 in_key[AES_KEYSIZE_128]);
SYM_FUNC_START(aes128_expandkey_aesni)
_aes_expandkey_aesni 1
SYM_FUNC_END(aes128_expandkey_aesni)
// Expand a 256-bit AES key.  inv_rndkeys may be NULL if the decryption
// (Equivalent Inverse Cipher) round keys are not needed.
//
// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
// const u8 in_key[AES_KEYSIZE_256]);
SYM_FUNC_START(aes256_expandkey_aesni)
_aes_expandkey_aesni 0
SYM_FUNC_END(aes256_expandkey_aesni)
// Encrypt (\enc=1) or decrypt (\enc=0) a single 16-byte block.
//
// Register conventions (see the .set aliases below):
//   RNDKEYS - in: round keys (standard order for encryption, Equivalent
//             Inverse Cipher order for decryption), 16 bytes each
//   NROUNDS - in: number of AES rounds, e.g. 10 for AES-128 or 14 for
//             AES-256; there are NROUNDS+1 round keys
//   OUT, IN - out/in: 16-byte blocks; unaligned pointers are fine, since
//             all loads/stores use movdqu
// Clobbers %xmm0 and %xmm1.  On i386 the callee-saved %ebx is saved and
// restored around its use for the stack-passed fourth argument.
.macro _aes_crypt_aesni enc
#ifdef __x86_64__
.set RNDKEYS, %rdi
.set NROUNDS, %esi
.set OUT, %rdx
.set IN, %rcx
#else
// Assuming -mregparm=3
.set RNDKEYS, %eax
.set NROUNDS, %edx
.set OUT, %ecx
.set IN, %ebx // Passed on stack
#endif
#ifdef __i386__
push %ebx
// After the push: 0(%esp) = saved %ebx, 4(%esp) = return address,
// 8(%esp) = the stack-passed fourth argument (IN).
mov 8(%esp), %ebx
#endif
// Zero-th round
movdqu (IN), %xmm0
movdqu (RNDKEYS), %xmm1
pxor %xmm1, %xmm0
// Normal rounds
add $16, RNDKEYS
dec NROUNDS // NROUNDS-1 normal rounds, then the last round below
.Lnext_round\@:
movdqu (RNDKEYS), %xmm1
.if \enc
aesenc %xmm1, %xmm0
.else
aesdec %xmm1, %xmm0
.endif
add $16, RNDKEYS
dec NROUNDS
jne .Lnext_round\@
// Last round
movdqu (RNDKEYS), %xmm1
.if \enc
aesenclast %xmm1, %xmm0
.else
aesdeclast %xmm1, %xmm0
.endif
movdqu %xmm0, (OUT)
#ifdef __i386__
pop %ebx
#endif
RET
.endm
// Encrypt one block using the standard round keys.
//
// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
// u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_encrypt_aesni)
_aes_crypt_aesni 1
SYM_FUNC_END(aes_encrypt_aesni)
// Decrypt one block using the Equivalent Inverse Cipher round keys
// (as produced via the inv_rndkeys argument of the expandkey functions).
//
// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
// u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_decrypt_aesni)
_aes_crypt_aesni 0
SYM_FUNC_END(aes_decrypt_aesni)