/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// AES block cipher using AES-NI instructions
//
// Copyright 2026 Google LLC
//
// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
// AVX. It does use up to SSE4.1, which all CPUs with AES-NI have.
#include <linux/linkage.h>
.section .rodata
// On x86_64, static data must be addressed RIP-relative (required for the
// kernel's position-independent code model); on i386, plain absolute
// addressing is used.  This macro hides the difference.
#ifdef __x86_64__
#define RODATA(label) label(%rip)
#else
#define RODATA(label) label
#endif
// A mask for pshufb that extracts the last dword, rotates it right by 8
// bits, and copies the result to all four dwords.
// (Each destination dword is built from source bytes [13, 14, 15, 12],
// i.e. ror32(src_dword3, 8), since pshufb does out[i] = in[mask[i]].)
.p2align 4
.Lmask:
.byte 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12
// The AES round constants, used during key expansion
.Lrcon:
.long 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
.text
// Transform four dwords [a0, a1, a2, a3] in \a into
// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]. \tmp is a temporary xmm register.
//
// (pslldq shifts the whole register left by bytes, i.e. toward the more
// significant end, so pslldq $4 moves dword i into slot i+1 and fills
// slot 0 with zeroes.  Two shift+xor steps give the full prefix sum.)
//
// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
// if the temporary register were zero-initialized ahead of time. We instead do
// it in an easier-to-understand way that doesn't require zero-initialization
// and avoids the unusual shufps instruction. movdqa is usually "free" anyway.
.macro _prefix_sum a, tmp
movdqa \a, \tmp // [a0, a1, a2, a3]
pslldq $4, \a // [0, a0, a1, a2]
pxor \tmp, \a // [a0, a0^a1, a1^a2, a2^a3]
movdqa \a, \tmp // save the partial prefix sums
pslldq $8, \a // [0, 0, a0, a0^a1]
pxor \tmp, \a // [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
.endm
// Generate the next round key and store it to (RNDKEYS), also leaving it
// in \a.  On entry, \a holds the round key whose dwords get prefix-summed
// (the previous round key for AES-128, or the first key of the previous
// pair for AES-256), and \b holds the immediately preceding round key,
// whose last dword receives the rotate+SubBytes+rcon treatment.
// Reads RCON, MASK, and RNDKEYS; clobbers %xmm2 and %xmm3.
.macro _gen_round_key a, b
// Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
// the last dword of the previous round key (given in \b).
//
// 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
// It is used here solely for the SubBytes and the XOR. The ShiftRows
// is a no-op because all four columns are the same here.
//
// Don't use the 'aeskeygenassist' instruction, since:
// - On most Intel CPUs it is microcoded, making it have a much higher
// latency and use more execution ports than 'aesenclast'.
// - It cannot be used in a loop, since it requires an immediate.
// - It doesn't do much more than 'aesenclast' in the first place.
movdqa \b, %xmm2
pshufb MASK, %xmm2 // broadcast ror32(last dword of \b, 8)
aesenclast RCON, %xmm2
// XOR in the prefix sum of the four dwords of \a, which is the
// previous round key (AES-128) or the first round key in the previous
// pair of round keys (AES-256). The result is the next round key.
_prefix_sum \a, tmp=%xmm3
pxor %xmm2, \a
// Store the next round key to memory. Also leave it in \a.
movdqu \a, (RNDKEYS)
.endm
// Expand a raw AES key into the standard round keys and, optionally, the
// round keys for the Equivalent Inverse Cipher (used for decryption).
//
// Register conventions (see the .set aliases below):
//   RNDKEYS     - out: standard round keys, 16 bytes each
//                 (11 keys for AES-128, 15 for AES-256)
//   INV_RNDKEYS - out: inverse-cipher round keys, or NULL to skip them
//   IN_KEY      - in:  the raw key, 16 bytes if \is_aes128 else 32
//   RCON        - %xmm6: four copies of the current round constant
//   MASK        - %xmm7: pshufb control implementing ror32 of last dword
// Clobbers %xmm0-%xmm3 and, in the AES-256 path, %xmm5.  On i386 the
// callee-saved %ebx and %esi are saved and restored around their use.
.macro _aes_expandkey_aesni is_aes128
#ifdef __x86_64__
// Arguments
.set RNDKEYS, %rdi
.set INV_RNDKEYS, %rsi
.set IN_KEY, %rdx
// Other local variables
.set RCON_PTR, %rcx
.set COUNTER, %eax
#else
// Arguments, assuming -mregparm=3
.set RNDKEYS, %eax
.set INV_RNDKEYS, %edx
.set IN_KEY, %ecx
// Other local variables
.set RCON_PTR, %ebx
.set COUNTER, %esi
#endif
.set RCON, %xmm6
.set MASK, %xmm7
#ifdef __i386__
// %ebx and %esi are callee-saved in the i386 ABI.
push %ebx
push %esi
#endif
.if \is_aes128
// AES-128: the first round key is simply a copy of the raw key.
movdqu (IN_KEY), %xmm0
movdqu %xmm0, (RNDKEYS)
.else
// AES-256: the first two round keys are simply a copy of the raw key.
movdqu (IN_KEY), %xmm0
movdqu %xmm0, (RNDKEYS)
movdqu 16(IN_KEY), %xmm1
movdqu %xmm1, 16(RNDKEYS)
add $32, RNDKEYS
.endif
// Generate the remaining round keys.
movdqa RODATA(.Lmask), MASK
.if \is_aes128
// AES-128: generate round keys 1 through 10, consuming one round
// constant from .Lrcon per key.
lea RODATA(.Lrcon), RCON_PTR
mov $10, COUNTER
.Lgen_next_aes128_round_key:
add $16, RNDKEYS // advance to the slot for the key being generated
movd (RCON_PTR), RCON
pshufd $0x00, RCON, RCON // broadcast rcon[i] to all four dwords
add $4, RCON_PTR
_gen_round_key %xmm0, %xmm0
dec COUNTER
jnz .Lgen_next_aes128_round_key
.else
// AES-256: only the first 7 round constants are needed, so instead of
// loading each one from memory, just start by loading [1, 1, 1, 1] and
// then generate the rest by doubling.
pshufd $0x00, RODATA(.Lrcon), RCON
pxor %xmm5, %xmm5 // All-zeroes
// 7 iterations: 6 full pairs (keys 2-13) plus the final key 14.
mov $7, COUNTER
.Lgen_next_aes256_round_key_pair:
// Generate the next AES-256 round key: either the first of a pair of
// two, or the last one.
_gen_round_key %xmm0, %xmm1
dec COUNTER
jz .Lgen_aes256_round_keys_done
// Generate the second AES-256 round key of the pair. Compared to the
// first, there's no rotation and no XOR of a round constant.
pshufd $0xff, %xmm0, %xmm2 // Get four copies of last dword
aesenclast %xmm5, %xmm2 // Just does SubBytes
_prefix_sum %xmm1, tmp=%xmm3
pxor %xmm2, %xmm1
movdqu %xmm1, 16(RNDKEYS)
add $32, RNDKEYS
paddd RCON, RCON // RCON <<= 1
jmp .Lgen_next_aes256_round_key_pair
.Lgen_aes256_round_keys_done:
.endif
// If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
// Inverse Cipher to it. To do that, reverse the standard round keys,
// and apply aesimc (InvMixColumn) to each except the first and last.
// RNDKEYS currently points at the last standard round key.
test INV_RNDKEYS, INV_RNDKEYS
jz .Ldone\@
movdqu (RNDKEYS), %xmm0 // Last standard round key
movdqu %xmm0, (INV_RNDKEYS) // => First inverse round key
// COUNTER = number of "middle" round keys, which get aesimc applied:
// 9 for AES-128 (keys 1-9), 13 for AES-256 (keys 1-13).
.if \is_aes128
mov $9, COUNTER
.else
mov $13, COUNTER
.endif
.Lgen_next_inv_round_key\@:
sub $16, RNDKEYS
add $16, INV_RNDKEYS
movdqu (RNDKEYS), %xmm0
aesimc %xmm0, %xmm0
movdqu %xmm0, (INV_RNDKEYS)
dec COUNTER
jnz .Lgen_next_inv_round_key\@
movdqu -16(RNDKEYS), %xmm0 // First standard round key
movdqu %xmm0, 16(INV_RNDKEYS) // => Last inverse round key
.Ldone\@:
#ifdef __i386__
pop %esi
pop %ebx
#endif
RET
.endm
// Expand a 128-bit AES key.  inv_rndkeys may be NULL if the decryption
// (Equivalent Inverse Cipher) round keys are not needed.
//
// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
// const u8 in_key[AES_KEYSIZE_128]);
SYM_FUNC_START(aes128_expandkey_aesni)
_aes_expandkey_aesni 1
SYM_FUNC_END(aes128_expandkey_aesni)
// Expand a 256-bit AES key.  inv_rndkeys may be NULL if the decryption
// (Equivalent Inverse Cipher) round keys are not needed.
//
// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
// const u8 in_key[AES_KEYSIZE_256]);
SYM_FUNC_START(aes256_expandkey_aesni)
_aes_expandkey_aesni 0
SYM_FUNC_END(aes256_expandkey_aesni)
// Encrypt (\enc=1) or decrypt (\enc=0) a single 16-byte block.
//
// Register conventions (see the .set aliases below):
//   RNDKEYS - in: round keys (standard order for encryption, Equivalent
//             Inverse Cipher order for decryption), 16 bytes each
//   NROUNDS - in: number of AES rounds, e.g. 10 for AES-128 or 14 for
//             AES-256; there are NROUNDS+1 round keys
//   OUT, IN - out/in: 16-byte blocks; unaligned pointers are fine, since
//             all loads/stores use movdqu
// Clobbers %xmm0 and %xmm1.  On i386 the callee-saved %ebx is saved and
// restored around its use for the stack-passed fourth argument.
.macro _aes_crypt_aesni enc
#ifdef __x86_64__
.set RNDKEYS, %rdi
.set NROUNDS, %esi
.set OUT, %rdx
.set IN, %rcx
#else
// Assuming -mregparm=3
.set RNDKEYS, %eax
.set NROUNDS, %edx
.set OUT, %ecx
.set IN, %ebx // Passed on stack
#endif
#ifdef __i386__
push %ebx
// After the push: 0(%esp) = saved %ebx, 4(%esp) = return address,
// 8(%esp) = the stack-passed fourth argument (IN).
mov 8(%esp), %ebx
#endif
// Zero-th round
movdqu (IN), %xmm0
movdqu (RNDKEYS), %xmm1
pxor %xmm1, %xmm0
// Normal rounds
add $16, RNDKEYS
dec NROUNDS // NROUNDS-1 normal rounds, then the last round below
.Lnext_round\@:
movdqu (RNDKEYS), %xmm1
.if \enc
aesenc %xmm1, %xmm0
.else
aesdec %xmm1, %xmm0
.endif
add $16, RNDKEYS
dec NROUNDS
jne .Lnext_round\@
// Last round
movdqu (RNDKEYS), %xmm1
.if \enc
aesenclast %xmm1, %xmm0
.else
aesdeclast %xmm1, %xmm0
.endif
movdqu %xmm0, (OUT)
#ifdef __i386__
pop %ebx
#endif
RET
.endm
// Encrypt one block using the standard round keys.
//
// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
// u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_encrypt_aesni)
_aes_crypt_aesni 1
SYM_FUNC_END(aes_encrypt_aesni)
// Decrypt one block using the Equivalent Inverse Cipher round keys
// (as produced via the inv_rndkeys argument of the expandkey functions).
//
// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
// u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_decrypt_aesni)
_aes_crypt_aesni 0
SYM_FUNC_END(aes_decrypt_aesni)