File: /usr/src/linux/arch/i386/lib/mmx.c

1     #include <linux/config.h>
2     #include <linux/types.h>
3     #include <linux/string.h>
4     #include <linux/sched.h>
5     
6     #include <asm/i387.h>
7     #include <asm/hardirq.h> 
8     
9     
10     /*
11      *	MMX 3DNow! library helper functions
12      *
13      *	To do:
14      *	We can use MMX just for prefetch in IRQ's. This may be a win. 
15      *		(reported so on K6-III)
16      *	We should use a better code neutral filler for the short jump
17      *		leal ebx, [ebx] is apparently best for K6-2, but Cyrix ??
18      *	We also want to clobber the filler register so we don't get any
19      *		register forwarding stalls on the filler. 
20      *
21      *	Add *user handling. Checksums are not a win with MMX on any CPU
22      *	tested so far for any MMX solution figured.
23      *
24      *	22/09/2000 - Arjan van de Ven 
25      *		Improved for non-engineering-sample Athlons 
26      *
27      */
28      
/*
 *	_mmx_memcpy - copy len bytes from 'from' to 'to' through MMX registers.
 *
 *	Returns the original value of 'to'.
 *
 *	When in_interrupt() is true the copy is handed straight to __memcpy()
 *	and no MMX register is touched. Otherwise the bulk of the buffer is
 *	moved in 64-byte blocks between kernel_fpu_begin()/kernel_fpu_end(),
 *	and the remaining len&63 bytes are finished with __memcpy().
 *
 *	Each prefetch sequence has an __ex_table entry pairing label 1 with
 *	the fixup code at label 3. The fixup stores a two-byte short jump
 *	(0xEB, displacement) over the instruction at label 1, so the prefetch
 *	is skipped from then on, and then resumes at label 2.
 */
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (in_interrupt())
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

	/*
	 * Warm up the first 320 bytes of the source. This sequence is
	 * issued regardless of len.
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from) );

	/*
	 * Main loop: only run while more than five blocks remain, so that
	 * the prefetch of from+320 always lands inside the source buffer
	 * (at least 6*64 = 384 bytes are still to be copied here).
	 */
	for(; i>5; i--)
	{
		__asm__ __volatile__ (
		"1:  prefetch 320(%0)\n"
		"2:  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}

	/*
	 * Last (up to five) blocks: same copy, but without the prefetch,
	 * which would otherwise reach past the end of the source.
	 */
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		"  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/*
	 *	Now do the tail of the block
	 */
	__memcpy(to, from, len&63);
	kernel_fpu_end();
	return p;
}
99     
100     #ifdef CONFIG_MK7
101     
102     /*
103      *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
104      *	other MMX using processors do not.
105      */
106     
/*
 *	Zero one 4096-byte page using non-temporal (movntq) MMX stores.
 *	MMX use is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 */
static void fast_clear_page(void *page)
{
	int blocks;

	kernel_fpu_begin();

	/* mm0 := 0; it is the source for every store below. */
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	/* 64 blocks of 64 bytes, eight 8-byte stores per block. */
	for (blocks = 4096/64; blocks > 0; blocks--) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * movntq stores are weakly ordered; the sfence makes them ordered
	 * again before the MMX state is handed back.
	 */
	__asm__ __volatile__ (
		"  sfence \n" : :
	);
	kernel_fpu_end();
}
139     
/*
 *	Copy one 4096-byte page from 'from' to 'to', loading with movq and
 *	storing with non-temporal movntq, 64 bytes per iteration.
 *
 *	The prefetch sequences carry an __ex_table entry pairing label 1
 *	with the fixup at label 3; the fixup overwrites the instruction at
 *	label 1 with a short jump past the prefetch and resumes at label 2.
 */
static void fast_copy_page(void *to, void *from)
{
	int blocks;

	kernel_fpu_begin();

	/*
	 * Original author's note (-AV): the prefetch might be better placed
	 * before the expensive fnsave, but that is left for later.
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from) );

	/*
	 * First (4096-320)/64 blocks: copy while prefetching 320 bytes
	 * ahead of the current source position.
	 */
	for (blocks = (4096-320)/64; blocks > 0; blocks--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}

	/*
	 * Remaining blocks of the page: identical copy, no prefetch, so
	 * nothing beyond the source page is touched.
	 */
	for (blocks = 4096/64 - (4096-320)/64; blocks > 0; blocks--) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}

	/*
	 * movntq stores are weakly ordered; the sfence makes them ordered
	 * again.
	 */
	__asm__ __volatile__ (
		"  sfence \n" : :
	);
	kernel_fpu_end();
}
229     
230     #else
231     
232     /*
233      *	Generic MMX implementation without K7 specific streaming
234      */
235      
/*
 *	Zero one 4096-byte page with ordinary MMX movq stores, 128 bytes
 *	per iteration. MMX use is bracketed by kernel_fpu_begin() and
 *	kernel_fpu_end().
 */
static void fast_clear_page(void *page)
{
	int chunks;

	kernel_fpu_begin();

	/* mm0 := 0; it is the source for every store below. */
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	/* 32 chunks of 128 bytes, sixteen 8-byte stores per chunk. */
	for (chunks = 4096/128; chunks > 0; chunks--) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}
271     
/*
 *	Copy one 4096-byte page from 'from' to 'to' through MMX registers,
 *	64 bytes per iteration, between kernel_fpu_begin()/kernel_fpu_end().
 *
 *	The prefetch sequences carry an __ex_table entry pairing label 1
 *	with the fixup at label 3; the fixup overwrites the instruction at
 *	label 1 with a short jump past the prefetch and resumes at label 2.
 *
 *	The loop is split the same way as in the CONFIG_MK7 variant: the
 *	last 320 bytes are copied without "prefetch 320(%0)", so no prefetch
 *	is issued for addresses beyond the source page.
 */
static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from) );

	/* Blocks whose 320-byte look-ahead still falls inside the page. */
	for(i=0; i<(4096-320)/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}

	/* Final blocks of the page: same copy, no prefetch. */
	for(i=(4096-320)/64; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	kernel_fpu_end();
}
330     
331     
332     #endif
333     
334     /*
335      *	Favour MMX for page clear and copy. 
336      */
337     
/*
 *	Zero a page without touching MMX state: "rep stosl" stores the
 *	value 0 (from eax) 1024 times, i.e. 1024 32-bit words, starting
 *	at 'page'.
 */
static void slow_zero_page(void * page)
{
	/* Dummy outputs: tell the compiler that ecx and edi are modified. */
	int count_out, dest_out;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (count_out), "=&D" (dest_out)
		: "a" (0), "1" (page), "0" (1024)
		: "memory");
}
348      
/*
 *	Clear one page. The MMX version is used unless in_interrupt() is
 *	true, in which case the string-instruction fallback is used
 *	instead (presumably because the MMX path needs
 *	kernel_fpu_begin()/kernel_fpu_end() - confirm against i387.h).
 */
void mmx_clear_page(void * page)
{
	if (!in_interrupt()) {
		fast_clear_page(page);
		return;
	}

	slow_zero_page(page);
}
356     
/*
 *	Copy a page without touching MMX state: "rep movsl" moves 1024
 *	32-bit words from 'from' to 'to'.
 */
static void slow_copy_page(void *to, void *from)
{
	/* Dummy outputs: tell the compiler ecx, edi and esi are modified. */
	int count_out, dest_out, src_out;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (count_out), "=&D" (dest_out), "=&S" (src_out)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}
367       
368     
/*
 *	Copy one page from 'from' to 'to'. The MMX version is used unless
 *	in_interrupt() is true, in which case the string-instruction
 *	fallback is used instead.
 */
void mmx_copy_page(void *to, void *from)
{
	if (!in_interrupt()) {
		fast_copy_page(to, from);
		return;
	}

	slow_copy_page(to, from);
}
376