diff options
Diffstat (limited to 'arch/x86/crypto/poly1305-sse2-x86_64.S')
| -rw-r--r-- | arch/x86/crypto/poly1305-sse2-x86_64.S | 22 | 
1 file changed, 14 insertions, 8 deletions
| diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index e6add74d78a5..6f0be7a86964 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2)  	# h0 += (d4 >> 26) * 5  	mov		d4,%rax  	shr		$26,%rax -	lea		(%eax,%eax,4),%eax -	add		%eax,%ebx +	lea		(%rax,%rax,4),%rax +	add		%rax,%rbx  	# h4 = d4 & 0x3ffffff  	mov		d4,%rax  	and		$0x3ffffff,%eax  	mov		%eax,h4  	# h1 += h0 >> 26 -	mov		%ebx,%eax -	shr		$26,%eax +	mov		%rbx,%rax +	shr		$26,%rax  	add		%eax,h1  	# h0 = h0 & 0x3ffffff  	andl		$0x3ffffff,%ebx @@ -524,6 +524,12 @@ ENTRY(poly1305_2block_sse2)  	paddq		t2,t1  	movq		t1,d4 +	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> +	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small +	# amount.  Careful: we must not assume the carry bits 'd0 >> 26', +	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit +	# integers.  It's true in a single-block implementation, but not here. +  	# d1 += d0 >> 26  	mov		d0,%rax  	shr		$26,%rax @@ -562,16 +568,16 @@ ENTRY(poly1305_2block_sse2)  	# h0 += (d4 >> 26) * 5  	mov		d4,%rax  	shr		$26,%rax -	lea		(%eax,%eax,4),%eax -	add		%eax,%ebx +	lea		(%rax,%rax,4),%rax +	add		%rax,%rbx  	# h4 = d4 & 0x3ffffff  	mov		d4,%rax  	and		$0x3ffffff,%eax  	mov		%eax,h4  	# h1 += h0 >> 26 -	mov		%ebx,%eax -	shr		$26,%eax +	mov		%rbx,%rax +	shr		$26,%rax  	add		%eax,h1  	# h0 = h0 & 0x3ffffff  	andl		$0x3ffffff,%ebx | 
