summaryrefslogtreecommitdiff
path: root/arch/riscv/lib/memmove.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/riscv/lib/memmove.S')
-rw-r--r--arch/riscv/lib/memmove.S176
1 files changed, 120 insertions, 56 deletions
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
index 07d1d2152ba..fbe6701dbe4 100644
--- a/arch/riscv/lib/memmove.S
+++ b/arch/riscv/lib/memmove.S
@@ -5,60 +5,124 @@
ENTRY(__memmove)
WEAK(memmove)
- move t0, a0
- move t1, a1
-
- beq a0, a1, exit_memcpy
- beqz a2, exit_memcpy
- srli t2, a2, 0x2
-
- slt t3, a0, a1
- beqz t3, do_reverse
-
- andi a2, a2, 0x3
- li t4, 1
- beqz t2, byte_copy
-
-word_copy:
- lw t3, 0(a1)
- addi t2, t2, -1
- addi a1, a1, 4
- sw t3, 0(a0)
- addi a0, a0, 4
- bnez t2, word_copy
- beqz a2, exit_memcpy
- j byte_copy
-
-do_reverse:
- add a0, a0, a2
- add a1, a1, a2
- andi a2, a2, 0x3
- li t4, -1
- beqz t2, reverse_byte_copy
-
-reverse_word_copy:
- addi a1, a1, -4
- addi t2, t2, -1
- lw t3, 0(a1)
- addi a0, a0, -4
- sw t3, 0(a0)
- bnez t2, reverse_word_copy
- beqz a2, exit_memcpy
-
-reverse_byte_copy:
- addi a0, a0, -1
- addi a1, a1, -1
-
-byte_copy:
- lb t3, 0(a1)
- addi a2, a2, -1
- sb t3, 0(a0)
- add a1, a1, t4
- add a0, a0, t4
- bnez a2, byte_copy
-
-exit_memcpy:
- move a0, t0
- move a1, t1
- ret
+ /*
+ * Here we determine if forward copy is possible. Forward copy is
+ * preferred to backward copy as it is more cache friendly.
+ *
+ * If a0 >= a1, t0 gives their distance, if t0 >= a2 then we can
+ * copy forward.
+ * If a0 < a1, we can always copy forward. This will make t0 negative,
+ * so a *unsigned* comparison will always have t0 >= a2.
+ *
+ * For forward copy we just delegate the task to memcpy.
+ */
+ sub t0, a0, a1
+ bltu t0, a2, 1f
+ tail __memcpy
+1:
+
+ /*
+ * Register allocation for code below:
+ * a0 - end of uncopied dst
+ * a1 - end of uncopied src
+ * t0 - start of uncopied dst
+ */
+ mv t0, a0
+ add a0, a0, a2
+ add a1, a1, a2
+
+ /*
+ * Use bytewise copy if too small.
+ *
+ * This threshold must be at least 2*SZREG to ensure at least one
+ * wordwise copy is performed. It is chosen to be 16 because it will
+ * save at least 7 iterations of bytewise copy, which pays off the
+ * fixed overhead.
+ */
+ li a3, 16
+ bltu a2, a3, .Lbyte_copy_tail
+
+ /*
+ * Bytewise copy first to align t0 to word boundary.
+ */
+ andi a2, a0, ~(SZREG-1)
+ beq a0, a2, 2f
+1:
+ addi a1, a1, -1
+ lb a5, 0(a1)
+ addi a0, a0, -1
+ sb a5, 0(a0)
+ bne a0, a2, 1b
+2:
+
+ /*
+ * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+ * aligned word-wise copy. Otherwise we need to perform misaligned
+ * word-wise copy.
+ */
+ andi a3, a1, SZREG-1
+ bnez a3, .Lmisaligned_word_copy
+
+ /* Wordwise copy */
+ addi t0, t0, SZREG-1
+ bleu a0, t0, 2f
+1:
+ addi a1, a1, -SZREG
+ REG_L a5, 0(a1)
+ addi a0, a0, -SZREG
+ REG_S a5, 0(a0)
+ bgtu a0, t0, 1b
+2:
+ addi t0, t0, -(SZREG-1)
+
+.Lbyte_copy_tail:
+ /*
+ * Bytewise copy anything left.
+ */
+ beq a0, t0, 2f
+1:
+ addi a1, a1, -1
+ lb a5, 0(a1)
+ addi a0, a0, -1
+ sb a5, 0(a0)
+ bne a0, t0, 1b
+2:
+
+ mv a0, t0
+ ret
+
+.Lmisaligned_word_copy:
+ /*
+ * Misaligned word-wise copy.
+ * For misaligned copy we still perform word-wise copy, but we need to
+ * use the value fetched from the previous iteration and do some shifts.
+ * This is safe because we wouldn't access more words than necessary.
+ */
+
+ /* Calculate shifts */
+ slli t3, a3, 3
+ sub t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+ /* Load the initial value and align a1 */
+ andi a1, a1, ~(SZREG-1)
+ REG_L a5, 0(a1)
+
+ addi t0, t0, SZREG-1
+ /* At least one iteration will be executed here, no check */
+1:
+ sll a4, a5, t4
+ addi a1, a1, -SZREG
+ REG_L a5, 0(a1)
+ srl a2, a5, t3
+ or a2, a2, a4
+ addi a0, a0, -SZREG
+ REG_S a2, 0(a0)
+ bgtu a0, t0, 1b
+
+ /* Update pointers to correct value */
+ addi t0, t0, -(SZREG-1)
+ add a1, a1, a3
+
+ j .Lbyte_copy_tail
+
END(__memmove)