path: root/arch/riscv/lib/memmove.S
blob: b2c1c7367130164f8d32ea947cfd022c7cae281e (plain)
/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/linkage.h>
#include <asm/asm.h>

ENTRY(__memmove)
WEAK(memmove)
	/*
	 * Here we determine if forward copy is possible. Forward copy is
	 * preferred to backward copy as it is more cache friendly.
	 *
	 * If a0 >= a1, t0 gives their distance; if t0 >= a2 then we can
	 *   copy forward.
	 * If a0 < a1, we can always copy forward. This will make t0 negative,
	 *   so an *unsigned* comparison will always have t0 >= a2.
	 *
	 * For forward copy we just delegate the task to memcpy.
	 */
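	/*
	 * Roughly equivalent C sketch (illustrative only; dest, src and count
	 * are hypothetical names for the original arguments, not registers
	 * used in this file):
	 *
	 *	if ((unsigned long)(dest - src) >= count)
	 *		return memcpy(dest, src, count);
	 */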
	sub	t0, a0, a1
	bltu	t0, a2, 1f
	tail	__memcpy
1:

	/*
	 * Register allocation for code below:
	 * a0 - end of uncopied dst
	 * a1 - end of uncopied src
	 * t0 - start of uncopied dst
	 */
	mv	t0, a0
	add	a0, a0, a2
	add	a1, a1, a2
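	/*
	 * After the setup above, a0 and a1 point one past the last byte of
	 * dst and src respectively, and t0 keeps the original a0 so it can
	 * be returned at the end.
	 */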

	/*
	 * Use bytewise copy if too small.
	 *
	 * This threshold must be at least 2*SZREG to ensure at least one
	 * wordwise copy is performed. It is chosen to be 16 because it will
	 * save at least 7 iterations of bytewise copy, which pays off the
	 * fixed overhead.
	 */
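	/*
	 * Note: SZREG is the register width in bytes (4 on rv32, 8 on rv64),
	 * so 16 is at least 2*SZREG in either configuration.
	 */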
	li	a3, 16
	bltu	a2, a3, .Lbyte_copy_tail

	/*
	 * Bytewise copy first to align a0 to a word boundary.
	 */
	andi	a2, a0, ~(SZREG-1)
	beq	a0, a2, 2f
1:
	addi	a1, a1, -1
	lb	a5, 0(a1)
	addi	a0, a0, -1
	sb	a5, 0(a0)
	bne	a0, a2, 1b
2:

	/*
	 * Now a0 is word-aligned. If a1 is also word-aligned, we can perform
	 * an aligned word-wise copy. Otherwise we need to perform a
	 * misaligned word-wise copy.
	 */
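	/*
	 * Illustrative sketch of the check below, where src_end stands for
	 * the end-of-uncopied-source pointer currently held in a1:
	 *
	 *	if ((unsigned long)src_end & (SZREG - 1))
	 *		goto misaligned_word_copy;
	 */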
	andi	a3, a1, SZREG-1
	bnez	a3, .Lmisaligned_word_copy

	/* Wordwise copy */
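	/*
	 * The addi below temporarily biases t0 by SZREG-1 so that the word
	 * loop only runs while at least SZREG bytes of destination remain;
	 * the bias is removed again after the loop.
	 */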
	addi	t0, t0, SZREG-1
	bleu	a0, t0, 2f
1:
	addi	a1, a1, -SZREG
	REG_L	a5, 0(a1)
	addi	a0, a0, -SZREG
	REG_S	a5, 0(a0)
	bgtu	a0, t0, 1b
2:
	addi	t0, t0, -(SZREG-1)

.Lbyte_copy_tail:
	/*
	 * Bytewise copy anything left.
	 */
	beq	a0, t0, 2f
1:
	addi	a1, a1, -1
	lb	a5, 0(a1)
	addi	a0, a0, -1
	sb	a5, 0(a0)
	bne	a0, t0, 1b
2:

	mv	a0, t0
	ret

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define M_SLL sll
#define M_SRL srl
#else
#define M_SLL srl
#define M_SRL sll
#endif
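
/*
 * Whether the byte at the lowest address of a word ends up in the least or
 * the most significant lane of a register depends on byte order, so the
 * shift direction needed to discard or keep the low-address bytes flips
 * between little- and big-endian. The M_SLL/M_SRL macros above hide that
 * difference.
 */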

.Lmisaligned_word_copy:
	/*
	 * Misaligned word-wise copy.
	 * For a misaligned copy we still copy a word at a time, but each
	 * stored word has to be assembled from the value fetched in the
	 * previous iteration and the newly loaded word using shifts.
	 * This is safe because we never access more words than necessary.
	 */
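	/*
	 * For example, on little-endian rv64 (SZREG == 8) with a3 == 2, each
	 * stored word combines the upper six bytes of the newly loaded source
	 * word, shifted into the low lanes, with the two low-address bytes of
	 * the previously loaded word, shifted into the high lanes.
	 */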

	/* Calculate shifts */
	slli	t3, a3, 3
	sub	t4, x0, t3 /* negating is okay: shifts only use the low bits */

	/* Align a1 and load the initial value */
	andi	a1, a1, ~(SZREG-1)
	REG_L	a5, 0(a1)

	addi	t0, t0, SZREG-1
	/*
	 * At least one iteration will be executed here, no check needed:
	 * the count was at least 16 and the head alignment loop consumed at
	 * most SZREG-1 bytes, so at least SZREG bytes remain.
	 */
1:
	M_SLL	a4, a5, t4
	addi	a1, a1, -SZREG
	REG_L	a5, 0(a1)
	M_SRL	a2, a5, t3
	or	a2, a2, a4
	addi	a0, a0, -SZREG
	REG_S	a2, 0(a0)
	bgtu	a0, t0, 1b

	/*
	 * Update pointers to their correct values: remove the SZREG-1 bias
	 * from t0 and undo the word alignment of a1.
	 */
	addi	t0, t0, -(SZREG-1)
	add	a1, a1, a3

	j	.Lbyte_copy_tail

END(__memmove)