1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
|
/* SPDX-License-Identifier: GPL-2.0 */
/*
*
* Optimized version of the standard memcpy() function
*
* Inputs:
* in0: destination address
* in1: source address
* in2: number of bytes to copy
* Output:
* r8: the destination address (copy of in0), as for the C memcpy()
*
* Copyright (C) 2000-2001 Hewlett-Packard Co
* Stephane Eranian <eranian@hpl.hp.com>
* David Mosberger-Tang <davidm@hpl.hp.com>
*/
#include <asm/asmmacro.h>
#include <asm/export.h>
GLOBAL_ENTRY(memcpy)
/*
* memcpy: copy in2 bytes from the buffer at in1 to the buffer at in0 and
* return in0 in r8 (retval).
*
* Paths selected by the entry code below:
* - in2 == 0: return immediately.
* - in2 < 16: byte-at-a-time loop at .memcpy_short.
* - (in0 | in1 | in2) & 7 != 0: general routine at .memcpy_long.
* - otherwise: the 8-byte load/store loop that directly follows the
* entry code.
*
* Register roles, as used in the code below:
* dst, src running destination/source pointers (copies of in0/in1)
* retval return value, set to in0
* saved_pfs, saved_lc, saved_pr
* entry values of ar.pfs, ar.lc and pr, restored before return
* cnt loop count / byte count scratch
* src2 8-byte-aligned source pointer (long path)
* t0..t4 temporaries
* src_end address of the last 8-byte word of the source (long path)
*/
# define MEM_LAT 21 /* latency to memory */
# define dst r2
# define src r3
# define retval r8
# define saved_pfs r9
# define saved_lc r10
# define saved_pr r11
# define cnt r16
# define src2 r17
# define t0 r18
# define t1 r19
# define t2 r20
# define t3 r21
# define t4 r22
# define src_end r23
/*
* N is the number of software-pipeline stages (it is loaded into ar.ec and
* sizes the rotating val[]/p[] arrays); Nrot is N rounded up to a multiple
* of 8 and is used as the rotating-register count in alloc.
*/
# define N (MEM_LAT + 4)
# define Nrot ((N + 7) & ~7)
/*
* First, check if everything (src, dst, len) is a multiple of eight. If
* so, we handle everything with no taken branches (other than the loop
* itself) and a small icache footprint. Otherwise, we jump off to
* the more general copy routine handling arbitrary
* sizes/alignment etc.
*
* Note that the length test comes first: any copy of fewer than 16 bytes
* takes the .memcpy_short branch, whatever its alignment.
*/
.prologue
.save ar.pfs, saved_pfs
alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
.save ar.lc, saved_lc
mov saved_lc=ar.lc
or t0=in0,in1 // start building t0 = dst | src | len
;;
or t0=t0,in2 // t0 = dst | src | len (low 3 bits tested below)
.save pr, saved_pr
mov saved_pr=pr
.body
cmp.eq p6,p0=in2,r0 // zero length?
mov retval=in0 // return dst
(p6) br.ret.spnt.many rp // zero length, return immediately
;;
mov dst=in0 // copy because of rotation
shr.u cnt=in2,3 // number of 8-byte words to copy
mov pr.rot=1<<16 // set p16 (p[0]), clear the other rotating predicates
;;
adds cnt=-1,cnt // br.ctop is repeat/until
cmp.gtu p7,p0=16,in2 // copying less than 16 bytes?
mov ar.ec=N // epilogue count = number of pipeline stages
;;
and t0=0x7,t0 // t0 = (dst | src | len) & 7
mov ar.lc=cnt // loop count for the aligned loop (reloaded on the other paths)
;;
cmp.ne p6,p0=t0,r0 // p6 = dst, src or len is not a multiple of 8
mov src=in1 // copy because of rotation
(p7) br.cond.spnt.few .memcpy_short
(p6) br.cond.spnt.few .memcpy_long
;;
// NOTE(review): the nops below look like padding ahead of the aligned loop;
// the file does not state why they are needed.
nop.m 0
;;
nop.m 0
nop.i 0
;;
nop.m 0
;;
.rotr val[N]
.rotp p[N]
.align 32
/*
* Aligned loop: stage 0 loads an 8-byte word, stage N-1 stores it.
* br.ctop rotates the registers, so val[0] loaded in one iteration is
* val[N-1] when its store stage is reached.
*/
1: { .mib
(p[0]) ld8 val[0]=[src],8
nop.i 0
brp.loop.imp 1b, 2f
}
2: { .mfb
(p[N-1])st8 [dst]=val[N-1],8
nop.f 0
br.ctop.dptk.few 1b
}
;;
// Restore the saved loop count, predicates and ar.pfs, then return.
mov ar.lc=saved_lc
mov pr=saved_pr,-1
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
/*
* Small (<16 bytes) copying is done via a simple byte-at-a-time
* copy loop. This performs relatively poorly on Itanium, but it doesn't
* get used very often (gcc inlines small copies) and due to atomicity
* issues, we want to avoid read-modify-write of entire words.
*/
.align 32
.memcpy_short:
adds cnt=-1,in2 // br.ctop is repeat/until
mov ar.ec=MEM_LAT // this loop has MEM_LAT stages
brp.loop.imp 1f, 2f
;;
mov ar.lc=cnt // loop count = len - 1
;;
nop.m 0
;;
nop.m 0
nop.i 0
;;
nop.m 0
;;
nop.m 0
;;
/*
* It is faster to put a stop bit in the loop here because it makes
* the pipeline shorter (and latency is what matters on short copies).
*/
.align 32
// Byte loop: stage 0 loads one byte, stage MEM_LAT-1 stores it.
1: { .mib
(p[0]) ld1 val[0]=[src],1
nop.i 0
brp.loop.imp 1b, 2f
} ;;
2: { .mfb
(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
nop.f 0
br.ctop.dptk.few 1b
} ;;
// Restore the saved loop count, predicates and ar.pfs, then return.
mov ar.lc=saved_lc
mov pr=saved_pr,-1
mov ar.pfs=saved_pfs
br.ret.sptk.many rp
/*
* Large (>= 16 bytes) copying is done in a fancy way. Latency isn't
* an overriding concern here, but throughput is. We first do
* sub-word copying until the destination is aligned, then we check
* if the source is also aligned. If so, we do a simple load/store-loop
* until there are less than 8 bytes left over and then we do the tail,
* by storing the last few bytes using sub-word copying. If the source
* is not aligned, we branch off to the non-congruent loop.
*
* stage: op:
* 0 ld
* :
* MEM_LAT+3 shrp
* MEM_LAT+4 st
*
* On Itanium, the pipeline itself runs without stalls. However, br.ctop
* seems to introduce an unavoidable bubble in the pipeline so the overall
* latency is 2 cycles/iteration. This gives us a _copy_ throughput
* of 4 byte/cycle. Still not bad.
*/
# undef N
# undef Nrot
# define N (MEM_LAT + 5) /* number of stages */
# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */
/* log2 of the code size of one COPY() expansion; used to index .memcpy_loops */
#define LOG_LOOP_SIZE 6
.memcpy_long:
alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame
and t0=-8,src // t0 = src & ~7
and t2=7,src // t2 = src & 7
;;
ld8 t0=[t0] // t0 = 1st source word
adds src2=7,src // src2 = (src + 7)
sub t4=r0,dst // t4 = -dst
;;
and src2=-8,src2 // src2 = (src + 7) & ~7
shl t2=t2,3 // t2 = 8*(src & 7)
shl t4=t4,3 // t4 = -dst << 3, so bits 3-5 hold (-dst) & 7
;;
ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
sub t3=64,t2 // t3 = 64-8*(src & 7)
shr.u t0=t0,t2 // drop the bytes of the 1st word that precede src
;;
add src_end=src,in2 // src_end = src + len
shl t1=t1,t3 // move the bytes of the next word above those kept in t0
mov pr=t4,0x38 // (p5,p4,p3) = (-dst) & 7 = bytes needed to 8-byte-align dst
;;
or t0=t0,t1 // t0 = first 8 source bytes starting at src
mov cnt=r0 // cnt = bytes copied so far
adds src_end=-1,src_end // src_end = address of the last source byte
;;
// Store 1, 2 and/or 4 leading bytes so that dst becomes 8-byte aligned.
(p3) st1 [dst]=t0,1
(p3) shr.u t0=t0,8
(p3) adds cnt=1,cnt
;;
(p4) st2 [dst]=t0,2
(p4) shr.u t0=t0,16
(p4) adds cnt=2,cnt
;;
(p5) st4 [dst]=t0,4
(p5) adds cnt=4,cnt
and src_end=-8,src_end // src_end = last word of source buffer
;;
// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
1:{ add src=cnt,src // make src point to remainder of source buffer
sub cnt=in2,cnt // cnt = number of bytes left to copy
mov t4=ip // t4 = address of this bundle (label 1)
} ;;
and src2=-8,src // align source pointer
adds t4=.memcpy_loops-1b,t4 // t4 = address of .memcpy_loops
mov ar.ec=N
and t0=7,src // t0 = src & 7
shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy
shl cnt=cnt,3 // move bits 0-2 to 3-5
;;
.rotr val[N+1], w[2]
.rotp p[N]
cmp.ne p6,p0=t0,r0 // is src aligned, too?
shl t0=t0,LOG_LOOP_SIZE // t0 = (src & 7) << LOG_LOOP_SIZE = offset of the matching COPY variant
adds t2=-1,t2 // br.ctop is repeat/until
;;
add t4=t0,t4 // t4 = address of the COPY variant for this src & 7
mov pr=cnt,0x38 // set (p5,p4,p3) to the number of last-word bytes to copy
mov ar.lc=t2
;;
nop.m 0
;;
nop.m 0
nop.i 0
;;
nop.m 0
;;
(p6) ld8 val[1]=[src2],8 // prime the pump...
mov b6=t4
br.sptk.few b6 // jump to the selected COPY variant
;;
.memcpy_tail:
// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
// less than 8) and t0 contains the last few bytes of the src buffer:
// The restores of ar.lc, ar.pfs and pr are interleaved with the tail stores.
(p5) st4 [dst]=t0,4
(p5) shr.u t0=t0,32
mov ar.lc=saved_lc
;;
(p4) st2 [dst]=t0,2
(p4) shr.u t0=t0,16
mov ar.pfs=saved_pfs
;;
(p3) st1 [dst]=t0
mov pr=saved_pr,-1
br.ret.sptk.many rp
///////////////////////////////////////////////////////
.align 64
/*
* COPY(shift,index) expands to one copy loop plus its exit code:
* - stage 0 loads a source word, stage MEM_LAT+3 uses shrp to combine two
* loaded words with a right shift of "shift" bits, and stage MEM_LAT+4
* stores the combined word;
* - after the loop, the last source word is loaded from src_end, shrp
* forms the remaining tail bytes in t0, and control goes to .memcpy_tail.
*
* The dispatch code adds (src & 7) << LOG_LOOP_SIZE to .memcpy_loops, so it
* relies on every expansion occupying exactly 1 << LOG_LOOP_SIZE bytes and
* on the expansions below staying in order of increasing shift.
*/
#define COPY(shift,index) \
1: { .mib \
(p[0]) ld8 val[0]=[src2],8; \
(p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \
brp.loop.imp 1b, 2f \
}; \
2: { .mfb \
(p[MEM_LAT+4]) st8 [dst]=w[1],8; \
nop.f 0; \
br.ctop.dptk.few 1b; \
}; \
;; \
ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \
;; \
shrp t0=val[N-1],val[N-index],shift; \
br .memcpy_tail
/* Jump table of copy loops: entry k handles src & 7 == k (shift = 8*k bits). */
.memcpy_loops:
COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
COPY(8, 0)
COPY(16, 0)
COPY(24, 0)
COPY(32, 0)
COPY(40, 0)
COPY(48, 0)
COPY(56, 0)
END(memcpy)
EXPORT_SYMBOL(memcpy)
|