/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/linkage.h>
#include <asm/asm.h>

ENTRY(__memmove)
WEAK(memmove)
        /*
         * Here we determine if forward copy is possible. Forward copy is
         * preferred to backward copy as it is more cache friendly.
         *
         * If a0 >= a1, t0 gives their distance; if t0 >= a2 then we can
         * copy forward.
         * If a0 < a1, we can always copy forward. The subtraction then
         * wraps around, so the *unsigned* comparison below still sees
         * t0 >= a2 for any valid length.
         *
         * For forward copy we just delegate the task to memcpy.
         */
        sub     t0, a0, a1
        bltu    t0, a2, 1f
        tail    __memcpy
1:
        /*
         * Register allocation for code below:
         * a0 - end of uncopied dst
         * a1 - end of uncopied src
         * t0 - start of uncopied dst
         */
        mv      t0, a0
        add     a0, a0, a2
        add     a1, a1, a2
        /*
         * Use bytewise copy if too small.
         *
         * This threshold must be at least 2*SZREG to ensure at least one
         * wordwise copy is performed. It is chosen to be 16 because it will
         * save at least 7 iterations of bytewise copy, which pays off the
         * fixed overhead.
         */
        li      a3, 16
        bltu    a2, a3, .Lbyte_copy_tail
        /*
         * Bytewise copy first to align a0 to word boundary.
         * The byte count in a2 is not needed any more after this point, so
         * a2 is reused to hold a0 rounded down to a word boundary.
         */
        andi    a2, a0, ~(SZREG-1)
        beq     a0, a2, 2f
1:
        addi    a1, a1, -1
        lb      a5, 0(a1)
        addi    a0, a0, -1
        sb      a5, 0(a0)
        bne     a0, a2, 1b
2:
        /*
         * Now a0 is word-aligned. If a1 is also word-aligned, we can perform
         * an aligned word-wise copy. Otherwise we need to perform a
         * misaligned word-wise copy.
         */
        andi    a3, a1, SZREG-1
        bnez    a3, .Lmisaligned_word_copy

        /* Wordwise copy */
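        /*
         * t0 is biased by SZREG-1 so that the loop below exits as soon as
         * fewer than SZREG bytes remain to be copied; the bias is removed
         * again at label 2 before the bytewise tail.
         */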
        addi    t0, t0, SZREG-1
        bleu    a0, t0, 2f
1:
        addi    a1, a1, -SZREG
        REG_L   a5, 0(a1)
        addi    a0, a0, -SZREG
        REG_S   a5, 0(a0)
        bgtu    a0, t0, 1b
2:
        addi    t0, t0, -(SZREG-1)
.Lbyte_copy_tail:
        /*
         * Bytewise copy anything left.
         */
        beq     a0, t0, 2f
1:
        addi    a1, a1, -1
        lb      a5, 0(a1)
        addi    a0, a0, -1
        sb      a5, 0(a0)
        bne     a0, t0, 1b
2:
        mv      a0, t0
        ret
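
/*
 * The misaligned copy below assembles each aligned destination word from two
 * adjacent source words. On little-endian targets the lower-addressed source
 * word supplies the less significant bytes, so it is shifted right while the
 * higher-addressed word is shifted left; on big-endian targets the shift
 * directions swap. The M_SLL/M_SRL macros hide this difference.
 */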
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define M_SLL sll
#define M_SRL srl
#else
#define M_SLL srl
#define M_SRL sll
#endif
.Lmisaligned_word_copy:
        /*
         * Misaligned word-wise copy.
         * For misaligned copy we still perform word-wise copy, but we need
         * to use the value fetched from the previous iteration and do some
         * shifts. This is safe because every word accessed this way contains
         * at least one byte of the source buffer, and an aligned word access
         * never crosses a page boundary.
         */
        /* Calculate shifts */
        slli    t3, a3, 3
        sub     t4, x0, t3 /* negate is okay as shift will only look at LSBs */
        /* Load the initial value and align a1 */
        andi    a1, a1, ~(SZREG-1)
        REG_L   a5, 0(a1)
        /* Bias t0 as in the aligned word-wise copy above */
        addi    t0, t0, SZREG-1
        /* At least one iteration will be executed here, no check */
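        /*
         * Each iteration keeps the previously loaded source word in a5: its
         * bytes that belong to the current destination word are shifted into
         * position (M_SLL), the next lower source word is loaded, its
         * contribution is shifted into the remaining positions (M_SRL), and
         * the two parts are OR-ed together before being stored.
         */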
1:
        M_SLL   a4, a5, t4
        addi    a1, a1, -SZREG
        REG_L   a5, 0(a1)
        M_SRL   a2, a5, t3
        or      a2, a2, a4
        addi    a0, a0, -SZREG
        REG_S   a2, 0(a0)
        bgtu    a0, t0, 1b
        /* Update pointers to their correct values */
        addi    t0, t0, -(SZREG-1)
        add     a1, a1, a3
        j       .Lbyte_copy_tail
END(__memmove)