/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
	ALIGN
/*
 * copy_page_c - copy 4096 bytes with a single string-move instruction.
 *
 * In:    %rdi = destination, %rsi = source (implicit operands of movsq)
 * Clobb: %rcx (counts down to 0), %rsi and %rdi (advanced by the copy)
 *
 * NOTE(review): relies on the direction flag being clear on entry so that
 * movsq walks upwards - confirm against the caller's ABI.
 */
copy_page_c:
	CFI_STARTPROC
	/* %ecx = number of 8-byte words in 4096 bytes */
	movl $4096/8,%ecx
	/* copy %rcx quadwords from (%rsi) to (%rdi) */
	rep movsq
	ret
	CFI_ENDPROC
ENDPROC(copy_page_c)
/* Don't use streaming store because it's better when the target
   ends up in cache. */
	    
/* Could vary the prefetch distance based on SMP/UP */
/*
 * copy_page - copy 4096 bytes, 64 bytes per loop iteration.
 *
 * In:    %rdi = destination, %rsi = source
 * Out:   nothing; %rsi and %rdi are left advanced by 4096, %rcx is 0
 * Clobb: %rax, %rcx, %rdx, %r8, %r9, %r10, %r11, %rsi, %rdi, flags
 * Saved: %rbx, %r12 (used as copy temporaries, spilled and reloaded here)
 * Stack: 2*8 bytes for the two saved registers; no calls are made
 *
 * The copy is split into two loops.  The first (4096/64)-5 iterations
 * prefetch 5*64 bytes ahead of the source pointer; the final 5 iterations
 * do not prefetch, so no prefetch address falls beyond the 4096 bytes
 * being read.
 */
ENTRY(copy_page)
	CFI_STARTPROC
	subq	$2*8,%rsp
	CFI_ADJUST_CFA_OFFSET 2*8
	movq	%rbx,(%rsp)
	CFI_REL_OFFSET rbx, 0
	movq	%r12,1*8(%rsp)
	CFI_REL_OFFSET r12, 1*8

	/* %ecx = number of 64-byte chunks copied with prefetch */
	movl	$(4096/64)-5,%ecx
	.p2align 4
.Loop64:
	/*
	 * Decrement first: none of the movq/prefetcht0/leaq instructions
	 * below modify flags, so ZF is still valid at the jnz.
	 */
	decl	%ecx

	/* load one 64-byte chunk into eight registers */
	movq	  (%rsi), %rax
	movq	 8(%rsi), %rbx
	movq	16(%rsi), %rdx
	movq	24(%rsi), %r8
	movq	32(%rsi), %r9
	movq	40(%rsi), %r10
	movq	48(%rsi), %r11
	movq	56(%rsi), %r12

	prefetcht0 5*64(%rsi)

	/* store the chunk */
	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	%rdx, 16(%rdi)
	movq	%r8,  24(%rdi)
	movq	%r9,  32(%rdi)
	movq	%r10, 40(%rdi)
	movq	%r11, 48(%rdi)
	movq	%r12, 56(%rdi)

	/* leaq advances the pointers without touching flags */
	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi
	jnz	.Loop64

	/* %ecx = remaining 64-byte chunks, copied without prefetch */
	movl	$5,%ecx
	.p2align 4
.Loop2:
	decl	%ecx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %rbx
	movq	16(%rsi), %rdx
	movq	24(%rsi), %r8
	movq	32(%rsi), %r9
	movq	40(%rsi), %r10
	movq	48(%rsi), %r11
	movq	56(%rsi), %r12

	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	%rdx, 16(%rdi)
	movq	%r8,  24(%rdi)
	movq	%r9,  32(%rdi)
	movq	%r10, 40(%rdi)
	movq	%r11, 48(%rdi)
	movq	%r12, 56(%rdi)

	leaq	64(%rdi),%rdi
	leaq	64(%rsi),%rsi
	jnz	.Loop2

	/* reload the saved registers and release the frame */
	movq	(%rsp),%rbx
	CFI_RESTORE rbx
	movq	1*8(%rsp),%r12
	CFI_RESTORE r12
	addq	$2*8,%rsp
	CFI_ADJUST_CFA_OFFSET -2*8
	ret
	/* end marker: .Lcopy_page_end-copy_page gives the routine's length */
.Lcopy_page_end:
	CFI_ENDPROC
ENDPROC(copy_page)
	/* Some CPUs run faster using the string copy instructions.
	   It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
	.section .altinstr_replacement,"ax"
	/*
	 * Replacement code: a 2-byte short jump (opcode 0xeb + 8-bit
	 * displacement).  The displacement is computed as if the jump sat at
	 * the start of copy_page: target minus the address just past the
	 * jump, i.e. (copy_page_c - copy_page) - (length of this jump).
	 * Because the displacement is a single byte, copy_page_c has to stay
	 * within short-jump range of copy_page.
	 */
1:	.byte 0xeb					/* jmp <disp8> */
	.byte (copy_page_c - copy_page) - (2f - 1b)	/* offset */
2:
	.previous
	.section .altinstructions,"a"
	/*
	 * Patch record.  Arguments as passed here: original code address
	 * (copy_page), replacement address (1b), CPU feature bit
	 * (X86_FEATURE_REP_GOOD), length of the original code
	 * (.Lcopy_page_end-copy_page) and length of the replacement (2b-1b).
	 * NOTE(review): the macro is defined outside this file; presumably
	 * the jump is patched over the start of copy_page when the feature
	 * is present - confirm against asm/alternative-asm.h.
	 */
	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
		.Lcopy_page_end-copy_page, 2b-1b
	.previous
 |