1 files changed, 120 insertions, 56 deletions
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
index 07d1d2152ba..fbe6701dbe4 100644
--- a/arch/riscv/lib/memmove.S
+++ b/arch/riscv/lib/memmove.S
@@ -5,60 +5,124 @@
 
 ENTRY(__memmove)
 WEAK(memmove)
-        move    t0, a0
-        move    t1, a1
-
-        beq     a0, a1, exit_memcpy
-        beqz    a2, exit_memcpy
-        srli    t2, a2, 0x2
-
-        slt     t3, a0, a1
-        beqz    t3, do_reverse
-
-        andi    a2, a2, 0x3
-        li      t4, 1
-        beqz    t2, byte_copy
-
-word_copy:
-        lw      t3, 0(a1)
-        addi    t2, t2, -1
-        addi    a1, a1, 4
-        sw      t3, 0(a0)
-        addi    a0, a0, 4
-        bnez    t2, word_copy
-        beqz    a2, exit_memcpy
-        j       byte_copy
-
-do_reverse:
-        add     a0, a0, a2
-        add     a1, a1, a2
-        andi    a2, a2, 0x3
-        li      t4, -1
-        beqz    t2, reverse_byte_copy
-
-reverse_word_copy:
-        addi    a1, a1, -4
-        addi    t2, t2, -1
-        lw      t3, 0(a1)
-        addi    a0, a0, -4
-        sw      t3, 0(a0)
-        bnez    t2, reverse_word_copy
-        beqz    a2, exit_memcpy
-
-reverse_byte_copy:
-        addi    a0, a0, -1
-        addi    a1, a1, -1
-
-byte_copy:
-        lb      t3, 0(a1)
-        addi    a2, a2, -1
-        sb      t3, 0(a0)
-        add     a1, a1, t4
-        add     a0, a0, t4
-        bnez    a2, byte_copy
-
-exit_memcpy:
-        move a0, t0
-        move a1, t1
-        ret
+	/*
+	 * Here we determine if forward copy is possible. Forward copy is
+	 * preferred to backward copy as it is more cache friendly.
+	 *
+	 * If a0 >= a1, t0 gives their distance, if t0 >= a2 then we can
+	 *   copy forward.
+	 * If a0 < a1, we can always copy forward. This will make t0 negative,
+	 *   so a *unsigned* comparison will always have t0 >= a2.
+	 *
+	 * For forward copy we just delegate the task to memcpy.
+	 */
+	sub	t0, a0, a1
+	bltu	t0, a2, 1f
+	tail	__memcpy
+1:
+
+	/*
+	 * Register allocation for code below:
+	 * a0 - end of uncopied dst
+	 * a1 - end of uncopied src
+	 * t0 - start of uncopied dst
+	 */
+	mv	t0, a0
+	add	a0, a0, a2
+	add	a1, a1, a2
+
+	/*
+	 * Use bytewise copy if too small.
+	 *
+	 * This threshold must be at least 2*SZREG to ensure at least one
+	 * wordwise copy is performed. It is chosen to be 16 because it will
+	 * save at least 7 iterations of bytewise copy, which pays off the
+	 * fixed overhead.
+	 */
+	li	a3, 16
+	bltu	a2, a3, .Lbyte_copy_tail
+
+	/*
+	 * Bytewise copy first to align t0 to word boundary.
+	 */
+	andi	a2, a0, ~(SZREG-1)
+	beq	a0, a2, 2f
+1:
+	addi	a1, a1, -1
+	lb	a5, 0(a1)
+	addi	a0, a0, -1
+	sb	a5, 0(a0)
+	bne	a0, a2, 1b
+2:
+
+	/*
+	 * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+	 * aligned word-wise copy. Otherwise we need to perform misaligned
+	 * word-wise copy.
+	 */
+	andi	a3, a1, SZREG-1
+	bnez	a3, .Lmisaligned_word_copy
+
+	/* Wordwise copy */
+	addi	t0, t0, SZREG-1
+	bleu	a0, t0, 2f
+1:
+	addi	a1, a1, -SZREG
+	REG_L	a5, 0(a1)
+	addi	a0, a0, -SZREG
+	REG_S	a5, 0(a0)
+	bgtu	a0, t0, 1b
+2:
+	addi	t0, t0, -(SZREG-1)
+
+.Lbyte_copy_tail:
+	/*
+	 * Bytewise copy anything left.
+	 */
+	beq	a0, t0, 2f
+1:
+	addi	a1, a1, -1
+	lb	a5, 0(a1)
+	addi	a0, a0, -1
+	sb	a5, 0(a0)
+	bne	a0, t0, 1b
+2:
+
+	mv	a0, t0
+	ret
+
+.Lmisaligned_word_copy:
+	/*
+	 * Misaligned word-wise copy.
+	 * For misaligned copy we still perform word-wise copy, but we need to
+	 * use the value fetched from the previous iteration and do some shifts.
+	 * This is safe because we wouldn't access more words than necessary.
+	 */
+
+	/* Calculate shifts */
+	slli	t3, a3, 3
+	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+	/* Load the initial value and align a1 */
+	andi	a1, a1, ~(SZREG-1)
+	REG_L	a5, 0(a1)
+
+	addi	t0, t0, SZREG-1
+	/* At least one iteration will be executed here, no check */
+1:
+	sll	a4, a5, t4
+	addi	a1, a1, -SZREG
+	REG_L	a5, 0(a1)
+	srl	a2, a5, t3
+	or	a2, a2, a4
+	addi	a0, a0, -SZREG
+	REG_S	a2, 0(a0)
+	bgtu	a0, t0, 1b
+
+	/* Update pointers to correct value */
+	addi	t0, t0, -(SZREG-1)
+	add	a1, a1, a3
+
+	j	.Lbyte_copy_tail
+
 END(__memmove)