1 files changed, 153 insertions, 200 deletions
diff --git a/arch/x86/crypto/aes-gcm-vaes-avx512.S b/arch/x86/crypto/aes-gcm-vaes-avx512.S
index be5c14d33acc..3edf829c2ce0 100644
--- a/arch/x86/crypto/aes-gcm-vaes-avx512.S
+++ b/arch/x86/crypto/aes-gcm-vaes-avx512.S
@@ -70,16 +70,14 @@
 .Lgfpoly_and_internal_carrybit:
 	.octa	0xc2000000000000010000000000000001
 
-	// The below constants are used for incrementing the counter blocks.
-	// ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
-	// inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
-	// 4.  Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+	// Values needed to prepare the initial vector of counter blocks.
 .Lctr_pattern:
 	.octa	0
 	.octa	1
-.Linc_2blocks:
 	.octa	2
 	.octa	3
+
+	// The number of AES blocks per vector, as a 128-bit value.
 .Linc_4blocks:
 	.octa	4
 
@@ -96,29 +94,13 @@
 // Offset to end of hash key powers array in the key struct.
 //
 // This is immediately followed by three zeroized padding blocks, which are
-// included so that partial vectors can be handled more easily.  E.g. if VL=64
-// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0].  The most
-// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
+// included so that partial vectors can be handled more easily.  E.g. if two
+// blocks remain, we load the 4 values [H^2, H^1, 0, 0].  The most padding
+// blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
 #define OFFSETOFEND_H_POWERS	(OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
 
 .text
 
-// Set the vector length in bytes.  This sets the VL variable and defines
-// register aliases V0-V31 that map to the ymm or zmm registers.
-.macro	_set_veclen	vl
-	.set	VL,	\vl
-.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
-	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-.if VL == 32
-	.set	V\i,	%ymm\i
-.elseif VL == 64
-	.set	V\i,	%zmm\i
-.else
-	.error "Unsupported vector length"
-.endif
-.endr
-.endm
-
 // The _ghash_mul_step macro does one step of GHASH multiplication of the
 // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the
 // reduced products in \dst.  \t0, \t1, and \t2 are temporary registers of the
@@ -286,31 +268,27 @@
 // The number of key powers initialized is NUM_H_POWERS, and they are stored in
 // the order H^NUM_H_POWERS to H^1.  The zeroized padding blocks after the key
 // powers themselves are also initialized.
-//
-// This macro supports both VL=32 and VL=64.  _set_veclen must have been invoked
-// with the desired length.  In the VL=32 case, the function computes twice as
-// many key powers than are actually used by the VL=32 GCM update functions.
-// This is done to keep the key format the same regardless of vector length.
 .macro	_aes_gcm_precompute
 
 	// Function arguments
 	.set	KEY,		%rdi
 
-	// Additional local variables.  V0-V2 and %rax are used as temporaries.
+	// Additional local variables.
+	// %zmm[0-2] and %rax are used as temporaries.
 	.set	POWERS_PTR,	%rsi
 	.set	RNDKEYLAST_PTR,	%rdx
-	.set	H_CUR,		V3
+	.set	H_CUR,		%zmm3
 	.set	H_CUR_YMM,	%ymm3
 	.set	H_CUR_XMM,	%xmm3
-	.set	H_INC,		V4
+	.set	H_INC,		%zmm4
 	.set	H_INC_YMM,	%ymm4
 	.set	H_INC_XMM,	%xmm4
-	.set	GFPOLY,		V5
+	.set	GFPOLY,		%zmm5
 	.set	GFPOLY_YMM,	%ymm5
 	.set	GFPOLY_XMM,	%xmm5
 
 	// Get pointer to lowest set of key powers (located at end of array).
-	lea		OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR
+	lea		OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR
 
 	// Encrypt an all-zeroes block to get the raw hash subkey.
 	movl		OFFSETOF_AESKEYLEN(KEY), %eax
@@ -329,8 +307,8 @@
 
 	// Zeroize the padding blocks.
 	vpxor		%xmm0, %xmm0, %xmm0
-	vmovdqu		%ymm0, VL(POWERS_PTR)
-	vmovdqu		%xmm0, VL+2*16(POWERS_PTR)
+	vmovdqu		%ymm0, 64(POWERS_PTR)
+	vmovdqu		%xmm0, 64+2*16(POWERS_PTR)
 
 	// Finish preprocessing the first key power, H^1.  Since this GHASH
 	// implementation operates directly on values with the backwards bit
@@ -370,25 +348,22 @@
 	vinserti128	$1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM
 	vinserti128	$1, H_INC_XMM, H_INC_YMM, H_INC_YMM
 
-.if VL == 64
 	// Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
 	_ghash_mul	H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \
 			%ymm0, %ymm1, %ymm2
 	vinserti64x4	$1, H_CUR_YMM, H_INC, H_CUR
 	vshufi64x2	$0, H_INC, H_INC, H_INC
-.endif
 
 	// Store the lowest set of key powers.
 	vmovdqu8	H_CUR, (POWERS_PTR)
 
-	// Compute and store the remaining key powers.  With VL=32, repeatedly
-	// multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
-	// With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+	// Compute and store the remaining key powers.
+	// Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
 	// [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
-	mov		$(NUM_H_POWERS*16/VL) - 1, %eax
+	mov		$3, %eax
 .Lprecompute_next\@:
-	sub		$VL, POWERS_PTR
-	_ghash_mul	H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2
+	sub		$64, POWERS_PTR
+	_ghash_mul	H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2
 	vmovdqu8	H_CUR, (POWERS_PTR)
 	dec		%eax
 	jnz		.Lprecompute_next\@
@@ -401,16 +376,10 @@
 // the result in \dst_xmm.  This implicitly zeroizes the other lanes of dst.
 .macro	_horizontal_xor	src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm
 	vextracti32x4	$1, \src, \t0_xmm
-.if VL == 32
-	vpxord		\t0_xmm, \src_xmm, \dst_xmm
-.elseif VL == 64
 	vextracti32x4	$2, \src, \t1_xmm
 	vextracti32x4	$3, \src, \t2_xmm
 	vpxord		\t0_xmm, \src_xmm, \dst_xmm
 	vpternlogd	$0x96, \t1_xmm, \t2_xmm, \dst_xmm
-.else
-	.error "Unsupported vector length"
-.endif
 .endm
 
 // Do one step of the GHASH update of the data blocks given in the vector
@@ -424,25 +393,21 @@
 //
 // The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
 // H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
-// operations are vectorized operations on vectors of 16-byte blocks.  E.g.,
-// with VL=32 there are 2 blocks per vector and the vectorized terms correspond
-// to the following non-vectorized terms:
-//
-//	H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
-//	H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
-//	H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
-//	H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
+// operations are vectorized operations on 512-bit vectors of 128-bit blocks.
+// The vectorized terms correspond to the following non-vectorized terms:
 //
-// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
+//       H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM),
+//              H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0)
+//       H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7
+//       H_POW2*GHASHDATA2 => H^8*blk8,  H^7*blk9,  H^6*blk10, and H^5*blk11
+//       H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15
 //
 // More concretely, this code does:
 //   - Do vectorized "schoolbook" multiplications to compute the intermediate
 //     256-bit product of each block and its corresponding hash key power.
-//     There are 4*VL/16 of these intermediate products.
-//   - Sum (XOR) the intermediate 256-bit products across vectors.  This leaves
-//     VL/16 256-bit intermediate values.
+//   - Sum (XOR) the intermediate 256-bit products across vectors.
 //   - Do a vectorized reduction of these 256-bit intermediate values to
-//     128-bits each.  This leaves VL/16 128-bit intermediate values.
+//     128-bits each.
 //   - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
 //
 // See _ghash_mul_step for the full explanation of the operations performed for
@@ -498,60 +463,60 @@
 .endif
 .endm
 
-// Do one non-last round of AES encryption on the counter blocks in V0-V3 using
-// the round key that has been broadcast to all 128-bit lanes of \round_key.
+// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the
+// round key that has been broadcast to all 128-bit lanes of \round_key.
 .macro	_vaesenc_4x	round_key
-	vaesenc		\round_key, V0, V0
-	vaesenc		\round_key, V1, V1
-	vaesenc		\round_key, V2, V2
-	vaesenc		\round_key, V3, V3
+	vaesenc		\round_key, %zmm0, %zmm0
+	vaesenc		\round_key, %zmm1, %zmm1
+	vaesenc		\round_key, %zmm2, %zmm2
+	vaesenc		\round_key, %zmm3, %zmm3
 .endm
 
 // Start the AES encryption of four vectors of counter blocks.
 .macro	_ctr_begin_4x
 
 	// Increment LE_CTR four times to generate four vectors of little-endian
-	// counter blocks, swap each to big-endian, and store them in V0-V3.
-	vpshufb		BSWAP_MASK, LE_CTR, V0
+	// counter blocks, swap each to big-endian, and store them in %zmm[0-3].
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm0
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
-	vpshufb		BSWAP_MASK, LE_CTR, V1
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm1
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
-	vpshufb		BSWAP_MASK, LE_CTR, V2
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm2
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
-	vpshufb		BSWAP_MASK, LE_CTR, V3
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm3
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
 
 	// AES "round zero": XOR in the zero-th round key.
-	vpxord		RNDKEY0, V0, V0
-	vpxord		RNDKEY0, V1, V1
-	vpxord		RNDKEY0, V2, V2
-	vpxord		RNDKEY0, V3, V3
+	vpxord		RNDKEY0, %zmm0, %zmm0
+	vpxord		RNDKEY0, %zmm1, %zmm1
+	vpxord		RNDKEY0, %zmm2, %zmm2
+	vpxord		RNDKEY0, %zmm3, %zmm3
 .endm
 
-// Do the last AES round for four vectors of counter blocks V0-V3, XOR source
-// data with the resulting keystream, and write the result to DST and
+// Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR
+// source data with the resulting keystream, and write the result to DST and
 // GHASHDATA[0-3].  (Implementation differs slightly, but has the same effect.)
 .macro	_aesenclast_and_xor_4x
 	// XOR the source data with the last round key, saving the result in
 	// GHASHDATA[0-3].  This reduces latency by taking advantage of the
 	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
-	vpxord		0*VL(SRC), RNDKEYLAST, GHASHDATA0
-	vpxord		1*VL(SRC), RNDKEYLAST, GHASHDATA1
-	vpxord		2*VL(SRC), RNDKEYLAST, GHASHDATA2
-	vpxord		3*VL(SRC), RNDKEYLAST, GHASHDATA3
+	vpxord		0*64(SRC), RNDKEYLAST, GHASHDATA0
+	vpxord		1*64(SRC), RNDKEYLAST, GHASHDATA1
+	vpxord		2*64(SRC), RNDKEYLAST, GHASHDATA2
+	vpxord		3*64(SRC), RNDKEYLAST, GHASHDATA3
 
 	// Do the last AES round.  This handles the XOR with the source data
 	// too, as per the optimization described above.
-	vaesenclast	GHASHDATA0, V0, GHASHDATA0
-	vaesenclast	GHASHDATA1, V1, GHASHDATA1
-	vaesenclast	GHASHDATA2, V2, GHASHDATA2
-	vaesenclast	GHASHDATA3, V3, GHASHDATA3
+	vaesenclast	GHASHDATA0, %zmm0, GHASHDATA0
+	vaesenclast	GHASHDATA1, %zmm1, GHASHDATA1
+	vaesenclast	GHASHDATA2, %zmm2, GHASHDATA2
+	vaesenclast	GHASHDATA3, %zmm3, GHASHDATA3
 
 	// Store the en/decrypted data to DST.
-	vmovdqu8	GHASHDATA0, 0*VL(DST)
-	vmovdqu8	GHASHDATA1, 1*VL(DST)
-	vmovdqu8	GHASHDATA2, 2*VL(DST)
-	vmovdqu8	GHASHDATA3, 3*VL(DST)
+	vmovdqu8	GHASHDATA0, 0*64(DST)
+	vmovdqu8	GHASHDATA1, 1*64(DST)
+	vmovdqu8	GHASHDATA2, 2*64(DST)
+	vmovdqu8	GHASHDATA3, 3*64(DST)
 .endm
 
 // void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key,
@@ -559,13 +524,11 @@
 //					     const u8 *src, u8 *dst, int datalen);
 //
 // This macro generates a GCM encryption or decryption update function with the
-// above prototype (with \enc selecting which one).  This macro supports both
-// VL=32 and VL=64.  _set_veclen must have been invoked with the desired length.
-//
-// This function computes the next portion of the CTR keystream, XOR's it with
-// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
-// data to |dst|.  It also updates the GHASH accumulator |ghash_acc| using the
-// next |datalen| ciphertext bytes.
+// above prototype (with \enc selecting which one).  The function computes the
+// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|,
+// and writes the resulting encrypted or decrypted data to |dst|.  It also
+// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext
+// bytes.
 //
 // |datalen| must be a multiple of 16, except on the last call where it can be
 // any length.  The caller must do any buffering needed to ensure this.  Both
@@ -600,69 +563,69 @@
 	// Pointer to the last AES round key for the chosen AES variant
 	.set	RNDKEYLAST_PTR,	%r11
 
-	// In the main loop, V0-V3 are used as AES input and output.  Elsewhere
-	// they are used as temporary registers.
+	// In the main loop, %zmm[0-3] are used as AES input and output.
+	// Elsewhere they are used as temporary registers.
 
 	// GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
-	.set	GHASHDATA0,	V4
+	.set	GHASHDATA0,	%zmm4
 	.set	GHASHDATA0_XMM,	%xmm4
-	.set	GHASHDATA1,	V5
+	.set	GHASHDATA1,	%zmm5
 	.set	GHASHDATA1_XMM,	%xmm5
-	.set	GHASHDATA2,	V6
+	.set	GHASHDATA2,	%zmm6
 	.set	GHASHDATA2_XMM,	%xmm6
-	.set	GHASHDATA3,	V7
+	.set	GHASHDATA3,	%zmm7
 
 	// BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
 	// using vpshufb, copied to all 128-bit lanes.
-	.set	BSWAP_MASK,	V8
+	.set	BSWAP_MASK,	%zmm8
 
 	// RNDKEY temporarily holds the next AES round key.
-	.set	RNDKEY,		V9
+	.set	RNDKEY,		%zmm9
 
 	// GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
 	// only the lowest 128-bit lane can be nonzero.  When not fully reduced,
 	// more than one lane may be used, and they need to be XOR'd together.
-	.set	GHASH_ACC,	V10
+	.set	GHASH_ACC,	%zmm10
 	.set	GHASH_ACC_XMM,	%xmm10
 
 	// LE_CTR_INC is the vector of 32-bit words that need to be added to a
 	// vector of little-endian counter blocks to advance it forwards.
-	.set	LE_CTR_INC,	V11
+	.set	LE_CTR_INC,	%zmm11
 
 	// LE_CTR contains the next set of little-endian counter blocks.
-	.set	LE_CTR,		V12
+	.set	LE_CTR,		%zmm12
 
 	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
 	// copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
 	// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
-	.set	RNDKEY0,	V13
-	.set	RNDKEYLAST,	V14
-	.set	RNDKEY_M9,	V15
-	.set	RNDKEY_M8,	V16
-	.set	RNDKEY_M7,	V17
-	.set	RNDKEY_M6,	V18
-	.set	RNDKEY_M5,	V19
-	.set	RNDKEY_M4,	V20
-	.set	RNDKEY_M3,	V21
-	.set	RNDKEY_M2,	V22
-	.set	RNDKEY_M1,	V23
+	.set	RNDKEY0,	%zmm13
+	.set	RNDKEYLAST,	%zmm14
+	.set	RNDKEY_M9,	%zmm15
+	.set	RNDKEY_M8,	%zmm16
+	.set	RNDKEY_M7,	%zmm17
+	.set	RNDKEY_M6,	%zmm18
+	.set	RNDKEY_M5,	%zmm19
+	.set	RNDKEY_M4,	%zmm20
+	.set	RNDKEY_M3,	%zmm21
+	.set	RNDKEY_M2,	%zmm22
+	.set	RNDKEY_M1,	%zmm23
 
 	// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
 	// cannot coincide with anything used for AES encryption, since for
 	// performance reasons GHASH and AES encryption are interleaved.
-	.set	GHASHTMP0,	V24
-	.set	GHASHTMP1,	V25
-	.set	GHASHTMP2,	V26
+	.set	GHASHTMP0,	%zmm24
+	.set	GHASHTMP1,	%zmm25
+	.set	GHASHTMP2,	%zmm26
 
-	// H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1.  The
+	// H_POW[4-1] contain the powers of the hash key H^16...H^1.  The
 	// descending numbering reflects the order of the key powers.
-	.set	H_POW4,		V27
-	.set	H_POW3,		V28
-	.set	H_POW2,		V29
-	.set	H_POW1,		V30
+	.set	H_POW4,		%zmm27
+	.set	H_POW3,		%zmm28
+	.set	H_POW2,		%zmm29
+	.set	H_POW1,		%zmm30
 
 	// GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
-	.set	GFPOLY,		V31
+	.set	GFPOLY,		%zmm31
 
 	// Load some constants.
 	vbroadcasti32x4	.Lbswap_mask(%rip), BSWAP_MASK
@@ -685,29 +648,23 @@
 	// Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
 	vpaddd		.Lctr_pattern(%rip), LE_CTR, LE_CTR
 
-	// Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
-.if VL == 32
-	vbroadcasti32x4	.Linc_2blocks(%rip), LE_CTR_INC
-.elseif VL == 64
+	// Load 4 into all 128-bit lanes of LE_CTR_INC.
 	vbroadcasti32x4	.Linc_4blocks(%rip), LE_CTR_INC
-.else
-	.error "Unsupported vector length"
-.endif
 
-	// If there are at least 4*VL bytes of data, then continue into the loop
-	// that processes 4*VL bytes of data at a time.  Otherwise skip it.
+	// If there are at least 256 bytes of data, then continue into the loop
+	// that processes 256 bytes of data at a time.  Otherwise skip it.
 	//
-	// Pre-subtracting 4*VL from DATALEN saves an instruction from the main
+	// Pre-subtracting 256 from DATALEN saves an instruction from the main
 	// loop and also ensures that at least one write always occurs to
 	// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
-	add		$-4*VL, DATALEN  // shorter than 'sub 4*VL' when VL=32
+	sub		$256, DATALEN
 	jl		.Lcrypt_loop_4x_done\@
 
 	// Load powers of the hash key.
-	vmovdqu8	OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4
-	vmovdqu8	OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3
-	vmovdqu8	OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2
-	vmovdqu8	OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1
+	vmovdqu8	OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4
+	vmovdqu8	OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3
+	vmovdqu8	OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2
+	vmovdqu8	OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1
 
 	// Main loop: en/decrypt and hash 4 vectors at a time.
 	//
@@ -736,9 +693,9 @@
 	cmp		%rax, RNDKEYLAST_PTR
 	jne		1b
 	_aesenclast_and_xor_4x
-	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
-	sub		$-4*VL, DST
-	add		$-4*VL, DATALEN
+	add		$256, SRC
+	add		$256, DST
+	sub		$256, DATALEN
 	jl		.Lghash_last_ciphertext_4x\@
 .endif
 
@@ -752,10 +709,10 @@
 	// If decrypting, load more ciphertext blocks into GHASHDATA[0-3].  If
 	// encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
 .if !\enc
-	vmovdqu8	0*VL(SRC), GHASHDATA0
-	vmovdqu8	1*VL(SRC), GHASHDATA1
-	vmovdqu8	2*VL(SRC), GHASHDATA2
-	vmovdqu8	3*VL(SRC), GHASHDATA3
+	vmovdqu8	0*64(SRC), GHASHDATA0
+	vmovdqu8	1*64(SRC), GHASHDATA1
+	vmovdqu8	2*64(SRC), GHASHDATA2
+	vmovdqu8	3*64(SRC), GHASHDATA3
 .endif
 
 	// Start the AES encryption of the counter blocks.
@@ -775,17 +732,18 @@
 	_vaesenc_4x	RNDKEY
 128:
 
-	// Finish the AES encryption of the counter blocks in V0-V3, interleaved
-	// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
+	// Finish the AES encryption of the counter blocks in %zmm[0-3],
+	// interleaved with the GHASH update of the ciphertext blocks in
+	// GHASHDATA[0-3].
 .irp i, 9,8,7,6,5,4,3,2,1
 	_ghash_step_4x  (9 - \i)
 	_vaesenc_4x	RNDKEY_M\i
 .endr
 	_ghash_step_4x	9
 	_aesenclast_and_xor_4x
-	sub		$-4*VL, SRC  // shorter than 'add 4*VL' when VL=32
-	sub		$-4*VL, DST
-	add		$-4*VL, DATALEN
+	add		$256, SRC
+	add		$256, DST
+	sub		$256, DATALEN
 	jge		.Lcrypt_loop_4x\@
 
 .if \enc
@@ -798,21 +756,22 @@
 
 .Lcrypt_loop_4x_done\@:
 
-	// Undo the extra subtraction by 4*VL and check whether data remains.
-	sub		$-4*VL, DATALEN  // shorter than 'add 4*VL' when VL=32
+	// Undo the extra subtraction by 256 and check whether data remains.
+	add		$256, DATALEN
 	jz		.Ldone\@
 
-	// The data length isn't a multiple of 4*VL.  Process the remaining data
-	// of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
-	// Going one vector at a time may seem inefficient compared to having
-	// separate code paths for each possible number of vectors remaining.
-	// However, using a loop keeps the code size down, and it performs
-	// surprising well; modern CPUs will start executing the next iteration
-	// before the previous one finishes and also predict the number of loop
-	// iterations.  For a similar reason, we roll up the AES rounds.
+	// The data length isn't a multiple of 256 bytes.  Process the remaining
+	// data of length 1 <= DATALEN < 256, up to one 64-byte vector at a
+	// time.  Going one vector at a time may seem inefficient compared to
+	// having separate code paths for each possible number of vectors
+	// remaining.  However, using a loop keeps the code size down, and it
+	// performs surprising well; modern CPUs will start executing the next
+	// iteration before the previous one finishes and also predict the
+	// number of loop iterations.  For a similar reason, we roll up the AES
+	// rounds.
 	//
-	// On the last iteration, the remaining length may be less than VL.
-	// Handle this using masking.
+	// On the last iteration, the remaining length may be less than 64
+	// bytes.  Handle this using masking.
 	//
 	// Since there are enough key powers available for all remaining data,
 	// there is no need to do a GHASH reduction after each iteration.
@@ -841,65 +800,60 @@
 .Lcrypt_loop_1x\@:
 
 	// Select the appropriate mask for this iteration: all 1's if
-	// DATALEN >= VL, otherwise DATALEN 1's.  Do this branchlessly using the
+	// DATALEN >= 64, otherwise DATALEN 1's.  Do this branchlessly using the
 	// bzhi instruction from BMI2.  (This relies on DATALEN <= 255.)
-.if VL < 64
-	mov		$-1, %eax
-	bzhi		DATALEN, %eax, %eax
-	kmovd		%eax, %k1
-.else
 	mov		$-1, %rax
 	bzhi		DATALEN64, %rax, %rax
 	kmovq		%rax, %k1
-.endif
 
 	// Encrypt a vector of counter blocks.  This does not need to be masked.
-	vpshufb		BSWAP_MASK, LE_CTR, V0
+	vpshufb		BSWAP_MASK, LE_CTR, %zmm0
 	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
-	vpxord		RNDKEY0, V0, V0
+	vpxord		RNDKEY0, %zmm0, %zmm0
 	lea		16(KEY), %rax
 1:
 	vbroadcasti32x4	(%rax), RNDKEY
-	vaesenc		RNDKEY, V0, V0
+	vaesenc		RNDKEY, %zmm0, %zmm0
 	add		$16, %rax
 	cmp		%rax, RNDKEYLAST_PTR
 	jne		1b
-	vaesenclast	RNDKEYLAST, V0, V0
+	vaesenclast	RNDKEYLAST, %zmm0, %zmm0
 
 	// XOR the data with the appropriate number of keystream bytes.
-	vmovdqu8	(SRC), V1{%k1}{z}
-	vpxord		V1, V0, V0
-	vmovdqu8	V0, (DST){%k1}
+	vmovdqu8	(SRC), %zmm1{%k1}{z}
+	vpxord		%zmm1, %zmm0, %zmm0
+	vmovdqu8	%zmm0, (DST){%k1}
 
 	// Update GHASH with the ciphertext block(s), without reducing.
 	//
-	// In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
-	// (If decrypting, it's done by the above masked load.  If encrypting,
-	// it's done by the below masked register-to-register move.)  Note that
-	// if DATALEN <= VL - 16, there will be additional padding beyond the
-	// padding of the last block specified by GHASH itself; i.e., there may
-	// be whole block(s) that get processed by the GHASH multiplication and
-	// reduction instructions but should not actually be included in the
+	// In the case of DATALEN < 64, the ciphertext is zero-padded to 64
+	// bytes.  (If decrypting, it's done by the above masked load.  If
+	// encrypting, it's done by the below masked register-to-register move.)
+	// Note that if DATALEN <= 48, there will be additional padding beyond
+	// the padding of the last block specified by GHASH itself; i.e., there
+	// may be whole block(s) that get processed by the GHASH multiplication
+	// and reduction instructions but should not actually be included in the
 	// GHASH.  However, any such blocks are all-zeroes, and the values that
 	// they're multiplied with are also all-zeroes.  Therefore they just add
 	// 0 * 0 = 0 to the final GHASH result, which makes no difference.
 	vmovdqu8	(POWERS_PTR), H_POW1
 .if \enc
-	vmovdqu8	V0, V1{%k1}{z}
+	vmovdqu8	%zmm0, %zmm1{%k1}{z}
 .endif
-	vpshufb		BSWAP_MASK, V1, V0
-	vpxord		GHASH_ACC, V0, V0
-	_ghash_mul_noreduce	H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3
+	vpshufb		BSWAP_MASK, %zmm1, %zmm0
+	vpxord		GHASH_ACC, %zmm0, %zmm0
+	_ghash_mul_noreduce	H_POW1, %zmm0, LO, MI, HI, \
+				GHASHDATA3, %zmm1, %zmm2, %zmm3
 	vpxor		GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
 
-	add		$VL, POWERS_PTR
-	add		$VL, SRC
-	add		$VL, DST
-	sub		$VL, DATALEN
+	add		$64, POWERS_PTR
+	add		$64, SRC
+	add		$64, DST
+	sub		$64, DATALEN
 	jg		.Lcrypt_loop_1x\@
 
 	// Finally, do the GHASH reduction.
-	_ghash_reduce	LO, MI, HI, GFPOLY, V0
+	_ghash_reduce	LO, MI, HI, GFPOLY, %zmm0
 	_horizontal_xor	HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2
 
 .Ldone\@:
@@ -1047,7 +1001,6 @@
 	RET
 .endm
 
-_set_veclen 64
 SYM_FUNC_START(aes_gcm_precompute_vaes_avx512)
 	_aes_gcm_precompute
 SYM_FUNC_END(aes_gcm_precompute_vaes_avx512)