File: /usr/src/linux/include/asm/xor.h

1     /*
2      * include/asm-i386/xor.h
3      *
4      * Optimized RAID-5 checksumming functions for MMX and SSE.
5      *
6      * This program is free software; you can redistribute it and/or modify
7      * it under the terms of the GNU General Public License as published by
8      * the Free Software Foundation; either version 2, or (at your option)
9      * any later version.
10      *
11      * You should have received a copy of the GNU General Public License
12      * (for example /usr/src/linux/COPYING); if not, write to the Free
13      * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14      */
15     
16     /*
17      * High-speed RAID5 checksumming functions utilizing MMX instructions.
18      * Copyright (C) 1998 Ingo Molnar.
19      */
20     
       /*
        * FPU_SAVE - preserve the FPU/MMX state before the MMX loops run.
        *
        * Must be expanded inside a function that declares a local buffer named
        * fpu_save (the users in this file declare char fpu_save[108]).
        * If the current task does not have PF_USEDFPU set, clts is executed
        * first (clts clears CR0.TS); fsave then writes the FPU state into
        * fpu_save and is followed by fwait.
        */
21     #define FPU_SAVE							\
22       do {									\
23     	if (!(current->flags & PF_USEDFPU))				\
24     		__asm__ __volatile__ (" clts;\n");			\
25     	__asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0]));	\
26       } while (0)
27     
       /*
        * FPU_RESTORE - counterpart of FPU_SAVE.
        *
        * Reloads the FPU state from the local fpu_save buffer with frstor.
        * If the current task does not have PF_USEDFPU set, stts() is then
        * called (defined elsewhere; presumably it sets CR0.TS again, undoing
        * the clts done by FPU_SAVE -- confirm against its definition).
        */
28     #define FPU_RESTORE							\
29       do {									\
30     	__asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0]));		\
31     	if (!(current->flags & PF_USEDFPU))				\
32     		stts();							\
33       } while (0)
34     
       /*
        * Assembly text fragments for the pII MMX loops.  x is a quadword index
        * (the byte offset is 8*x) and y is an MMX register number.
        *   LD(x,y)  - load quadword x of the buffer in operand %1 into %mm<y>
        *   ST(x,y)  - store %mm<y> to quadword x of the buffer in operand %1
        *   XOn(x,y) - pxor quadword x of the buffer in operand %<n+1> into %mm<y>
        */
35     #define LD(x,y)		"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
36     #define ST(x,y)		"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
37     #define XO1(x,y)	"       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
38     #define XO2(x,y)	"       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
39     #define XO3(x,y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
40     #define XO4(x,y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
41     
42     
       /*
        * xor_pII_mmx_2 - XOR the buffer at p2 into the buffer at p1 using MMX.
        *
        * @bytes: length of each buffer in bytes.  One loop pass handles 128
        *         bytes (lines = bytes >> 7).  The counter is decremented before
        *         it is tested, so bytes must be at least 128; any remainder
        *         below 128 is not processed.
        * @p1:    destination; read, XORed with p2 and written back in place.
        * @p2:    source; only read.
        *
        * Uses %mm0-%mm3; the FPU state is saved before and restored after.
        */
43     static void
44     xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
45     {
46     	unsigned long lines = bytes >> 7;
47     	char fpu_save[108];
48     
49     	FPU_SAVE;
50     
       	/*
       	 * The loop advances %1/%2 and counts %0 down, so all of them are
       	 * read-write ("+r") operands.  Declaring them as plain inputs
       	 * would let the compiler assume the registers still hold their
       	 * original values after the asm.
       	 */
51     	__asm__ __volatile__ (
52     #undef BLOCK
53     #define BLOCK(i) \
54     	LD(i,0)					\
55     		LD(i+1,1)			\
56     			LD(i+2,2)		\
57     				LD(i+3,3)	\
58     	XO1(i,0)				\
59     	ST(i,0)					\
60     		XO1(i+1,1)			\
61     		ST(i+1,1)			\
62     			XO1(i+2,2)		\
63     			ST(i+2,2)		\
64     				XO1(i+3,3)	\
65     				ST(i+3,3)
66     
67     	" .align 32			;\n"
68       	" 1:                            ;\n"
69     
70     	BLOCK(0)
71     	BLOCK(4)
72     	BLOCK(8)
73     	BLOCK(12)
74     
75     	"       addl $128, %1         ;\n"
76     	"       addl $128, %2         ;\n"
77     	"       decl %0               ;\n"
78     	"       jnz 1b                ;\n"
79            	: "+r" (lines),
80     	  "+r" (p1), "+r" (p2)
81     	:
82     	: "memory");
83     
84     	FPU_RESTORE;
85     }
86     
       /*
        * xor_pII_mmx_3 - XOR the buffers at p2 and p3 into the buffer at p1
        * using MMX.
        *
        * @bytes: length of each buffer in bytes.  One loop pass handles 128
        *         bytes (lines = bytes >> 7).  The counter is decremented before
        *         it is tested, so bytes must be at least 128; any remainder
        *         below 128 is not processed.
        * @p1:    destination; read, XORed with the sources, written in place.
        * @p2:    source; only read.
        * @p3:    source; only read.
        *
        * Uses %mm0-%mm3; the FPU state is saved before and restored after.
        */
87     static void
88     xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
89     	      unsigned long *p3)
90     {
91     	unsigned long lines = bytes >> 7;
92     	char fpu_save[108];
93     
94     	FPU_SAVE;
95     
       	/*
       	 * The loop advances %1-%3 and counts %0 down, so all of them are
       	 * read-write ("+r") operands rather than plain inputs.
       	 */
96     	__asm__ __volatile__ (
97     #undef BLOCK
98     #define BLOCK(i) \
99     	LD(i,0)					\
100     		LD(i+1,1)			\
101     			LD(i+2,2)		\
102     				LD(i+3,3)	\
103     	XO1(i,0)				\
104     		XO1(i+1,1)			\
105     			XO1(i+2,2)		\
106     				XO1(i+3,3)	\
107     	XO2(i,0)				\
108     	ST(i,0)					\
109     		XO2(i+1,1)			\
110     		ST(i+1,1)			\
111     			XO2(i+2,2)		\
112     			ST(i+2,2)		\
113     				XO2(i+3,3)	\
114     				ST(i+3,3)
115     
116     	" .align 32			;\n"
117     	" 1:                            ;\n"
118     
119     	BLOCK(0)
120     	BLOCK(4)
121     	BLOCK(8)
122     	BLOCK(12)
123     
124     	"       addl $128, %1         ;\n"
125     	"       addl $128, %2         ;\n"
126     	"       addl $128, %3         ;\n"
127     	"       decl %0               ;\n"
128     	"       jnz 1b                ;\n"
129            	: "+r" (lines),
130     	  "+r" (p1), "+r" (p2), "+r" (p3)
131     	:
132     	: "memory");
133     
134     	FPU_RESTORE;
135     }
136     
        /*
         * xor_pII_mmx_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
         * using MMX.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 128
         *         bytes (lines = bytes >> 7).  The counter is decremented before
         *         it is tested, so bytes must be at least 128; any remainder
         *         below 128 is not processed.
         * @p1:    destination; read, XORed with the sources, written in place.
         * @p2:    source; only read.
         * @p3:    source; only read.
         * @p4:    source; only read.
         *
         * Uses %mm0-%mm3; the FPU state is saved before and restored after.
         */
137     static void
138     xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
139     	      unsigned long *p3, unsigned long *p4)
140     {
141     	unsigned long lines = bytes >> 7;
142     	char fpu_save[108];
143     
144     	FPU_SAVE;
145     
        	/*
        	 * The loop advances %1-%4 and counts %0 down, so all of them are
        	 * read-write ("+r") operands rather than plain inputs.
        	 */
146     	__asm__ __volatile__ (
147     #undef BLOCK
148     #define BLOCK(i) \
149     	LD(i,0)					\
150     		LD(i+1,1)			\
151     			LD(i+2,2)		\
152     				LD(i+3,3)	\
153     	XO1(i,0)				\
154     		XO1(i+1,1)			\
155     			XO1(i+2,2)		\
156     				XO1(i+3,3)	\
157     	XO2(i,0)				\
158     		XO2(i+1,1)			\
159     			XO2(i+2,2)		\
160     				XO2(i+3,3)	\
161     	XO3(i,0)				\
162     	ST(i,0)					\
163     		XO3(i+1,1)			\
164     		ST(i+1,1)			\
165     			XO3(i+2,2)		\
166     			ST(i+2,2)		\
167     				XO3(i+3,3)	\
168     				ST(i+3,3)
169     
170     	" .align 32			;\n"
171     	" 1:                            ;\n"
172     
173     	BLOCK(0)
174     	BLOCK(4)
175     	BLOCK(8)
176     	BLOCK(12)
177     
178     	"       addl $128, %1         ;\n"
179     	"       addl $128, %2         ;\n"
180     	"       addl $128, %3         ;\n"
181     	"       addl $128, %4         ;\n"
182     	"       decl %0               ;\n"
183     	"       jnz 1b                ;\n"
184            	: "+r" (lines),
185     	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
186     	:
187     	: "memory");
188     
189     	FPU_RESTORE;
190     }
191     
        /*
         * xor_pII_mmx_5 - XOR the buffers at p2, p3, p4 and p5 into the buffer
         * at p1 using MMX.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 128
         *         bytes (lines = bytes >> 7).  The counter is decremented before
         *         it is tested, so bytes must be at least 128; any remainder
         *         below 128 is not processed.
         * @p1:    destination; read, XORed with the sources, written in place.
         * @p2:    source; only read.
         * @p3:    source; only read.
         * @p4:    source; only read.
         * @p5:    source; only read.
         *
         * Uses %mm0-%mm3; the FPU state is saved before and restored after.
         */
192     static void
193     xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
194     	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
195     {
196     	unsigned long lines = bytes >> 7;
197     	char fpu_save[108];
198     
199     	FPU_SAVE;
200     
        	/*
        	 * Make GCC forget what it knows about p4 and p5 so that the loop
        	 * below is not handed a register shared with another variable.
        	 * The loop modifies both, but marking them read-write as well
        	 * would exceed the 10-operand asm limit of GCC < 3.1 (a "+"
        	 * operand counts twice).
        	 */
        	__asm__ ("" : "+r" (p4), "+r" (p5));
        
        	/*
        	 * %0-%3 are modified by the loop and therefore read-write.  The
        	 * counter uses "+g" so it may live in a register or in memory; a
        	 * plain "g" input also allowed an immediate, which decl cannot
        	 * take.
        	 */
201     	__asm__ __volatile__ (
202     #undef BLOCK
203     #define BLOCK(i) \
204     	LD(i,0)					\
205     		LD(i+1,1)			\
206     			LD(i+2,2)		\
207     				LD(i+3,3)	\
208     	XO1(i,0)				\
209     		XO1(i+1,1)			\
210     			XO1(i+2,2)		\
211     				XO1(i+3,3)	\
212     	XO2(i,0)				\
213     		XO2(i+1,1)			\
214     			XO2(i+2,2)		\
215     				XO2(i+3,3)	\
216     	XO3(i,0)				\
217     		XO3(i+1,1)			\
218     			XO3(i+2,2)		\
219     				XO3(i+3,3)	\
220     	XO4(i,0)				\
221     	ST(i,0)					\
222     		XO4(i+1,1)			\
223     		ST(i+1,1)			\
224     			XO4(i+2,2)		\
225     			ST(i+2,2)		\
226     				XO4(i+3,3)	\
227     				ST(i+3,3)
228     
229     	" .align 32			;\n"
230     	" 1:                            ;\n"
231     
232     	BLOCK(0)
233     	BLOCK(4)
234     	BLOCK(8)
235     	BLOCK(12)
236     
237     	"       addl $128, %1         ;\n"
238     	"       addl $128, %2         ;\n"
239     	"       addl $128, %3         ;\n"
240     	"       addl $128, %4         ;\n"
241     	"       addl $128, %5         ;\n"
242     	"       decl %0               ;\n"
243     	"       jnz 1b                ;\n"
244            	: "+g" (lines),
245     	  "+r" (p1), "+r" (p2), "+r" (p3)
246     	: "r" (p4), "r" (p5)
247     	: "memory");
248     
        	/* p4 and p5 were advanced by the loop and are dead from here on. */
        	__asm__ ("" : "=r" (p4), "=r" (p5));
        
249     	FPU_RESTORE;
250     }
251     
252     #undef LD
253     #undef XO1
254     #undef XO2
255     #undef XO3
256     #undef XO4
257     #undef ST
258     #undef BLOCK
259     
        /*
         * xor_p5_mmx_2 - XOR the buffer at p2 into the buffer at p1 using a
         * hand-interleaved MMX loop.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 64
         *         bytes (lines = bytes >> 6).  The counter is decremented before
         *         it is tested, so bytes must be at least 64; any remainder
         *         below 64 is not processed.
         * @p1:    destination; read, XORed with p2 and written back in place.
         * @p2:    source; only read.
         *
         * Uses %mm0-%mm7; the FPU state is saved before and restored after.
         */
260     static void
261     xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
262     {
263     	unsigned long lines = bytes >> 6;
264     	char fpu_save[108];
265     
266     	FPU_SAVE;
267     
        	/*
        	 * The loop advances %1/%2 and counts %0 down, so all of them are
        	 * read-write ("+r") operands rather than plain inputs.
        	 */
268     	__asm__ __volatile__ (
269     	" .align 32	             ;\n"
270     	" 1:                         ;\n"
271     	"       movq   (%1), %%mm0   ;\n"
272     	"       movq  8(%1), %%mm1   ;\n"
273     	"       pxor   (%2), %%mm0   ;\n"
274     	"       movq 16(%1), %%mm2   ;\n"
275     	"       movq %%mm0,   (%1)   ;\n"
276     	"       pxor  8(%2), %%mm1   ;\n"
277     	"       movq 24(%1), %%mm3   ;\n"
278     	"       movq %%mm1,  8(%1)   ;\n"
279     	"       pxor 16(%2), %%mm2   ;\n"
280     	"       movq 32(%1), %%mm4   ;\n"
281     	"       movq %%mm2, 16(%1)   ;\n"
282     	"       pxor 24(%2), %%mm3   ;\n"
283     	"       movq 40(%1), %%mm5   ;\n"
284     	"       movq %%mm3, 24(%1)   ;\n"
285     	"       pxor 32(%2), %%mm4   ;\n"
286     	"       movq 48(%1), %%mm6   ;\n"
287     	"       movq %%mm4, 32(%1)   ;\n"
288     	"       pxor 40(%2), %%mm5   ;\n"
289     	"       movq 56(%1), %%mm7   ;\n"
290     	"       movq %%mm5, 40(%1)   ;\n"
291     	"       pxor 48(%2), %%mm6   ;\n"
292     	"       pxor 56(%2), %%mm7   ;\n"
293     	"       movq %%mm6, 48(%1)   ;\n"
294     	"       movq %%mm7, 56(%1)   ;\n"
295     	
296     	"       addl $64, %1         ;\n"
297     	"       addl $64, %2         ;\n"
298     	"       decl %0              ;\n"
299     	"       jnz 1b               ;\n"
300     	: "+r" (lines),
301     	  "+r" (p1), "+r" (p2)
302     	:
303     	: "memory");
304     
305     	FPU_RESTORE;
306     }
307     
        /*
         * xor_p5_mmx_3 - XOR the buffers at p2 and p3 into the buffer at p1
         * using a hand-interleaved MMX loop.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 64
         *         bytes (lines = bytes >> 6).  The counter is decremented before
         *         it is tested, so bytes must be at least 64; any remainder
         *         below 64 is not processed.
         * @p1:    destination; read, XORed with the sources, written in place.
         * @p2:    source; only read.
         * @p3:    source; only read.
         *
         * Uses %mm0-%mm7; the FPU state is saved before and restored after.
         */
308     static void
309     xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
310     	     unsigned long *p3)
311     {
312     	unsigned long lines = bytes >> 6;
313     	char fpu_save[108];
314     
315     	FPU_SAVE;
316     
        	/*
        	 * The loop advances %1-%3 and counts %0 down, so all of them are
        	 * read-write ("+r") operands rather than plain inputs.
        	 */
317     	__asm__ __volatile__ (
318     	" .align 32,0x90             ;\n"
319     	" 1:                         ;\n"
320     	"       movq   (%1), %%mm0   ;\n"
321     	"       movq  8(%1), %%mm1   ;\n"
322     	"       pxor   (%2), %%mm0   ;\n"
323     	"       movq 16(%1), %%mm2   ;\n"
324     	"       pxor  8(%2), %%mm1   ;\n"
325     	"       pxor   (%3), %%mm0   ;\n"
326     	"       pxor 16(%2), %%mm2   ;\n"
327     	"       movq %%mm0,   (%1)   ;\n"
328     	"       pxor  8(%3), %%mm1   ;\n"
329     	"       pxor 16(%3), %%mm2   ;\n"
330     	"       movq 24(%1), %%mm3   ;\n"
331     	"       movq %%mm1,  8(%1)   ;\n"
332     	"       movq 32(%1), %%mm4   ;\n"
333     	"       movq 40(%1), %%mm5   ;\n"
334     	"       pxor 24(%2), %%mm3   ;\n"
335     	"       movq %%mm2, 16(%1)   ;\n"
336     	"       pxor 32(%2), %%mm4   ;\n"
337     	"       pxor 24(%3), %%mm3   ;\n"
338     	"       pxor 40(%2), %%mm5   ;\n"
339     	"       movq %%mm3, 24(%1)   ;\n"
340     	"       pxor 32(%3), %%mm4   ;\n"
341     	"       pxor 40(%3), %%mm5   ;\n"
342     	"       movq 48(%1), %%mm6   ;\n"
343     	"       movq %%mm4, 32(%1)   ;\n"
344     	"       movq 56(%1), %%mm7   ;\n"
345     	"       pxor 48(%2), %%mm6   ;\n"
346     	"       movq %%mm5, 40(%1)   ;\n"
347     	"       pxor 56(%2), %%mm7   ;\n"
348     	"       pxor 48(%3), %%mm6   ;\n"
349     	"       pxor 56(%3), %%mm7   ;\n"
350     	"       movq %%mm6, 48(%1)   ;\n"
351     	"       movq %%mm7, 56(%1)   ;\n"
352           
353     	"       addl $64, %1         ;\n"
354     	"       addl $64, %2         ;\n"
355     	"       addl $64, %3         ;\n"
356     	"       decl %0              ;\n"
357     	"       jnz 1b               ;\n"
358     	: "+r" (lines),
359     	  "+r" (p1), "+r" (p2), "+r" (p3)
360     	:
361     	: "memory" );
362     
363     	FPU_RESTORE;
364     }
365     
        /*
         * xor_p5_mmx_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
         * using a hand-interleaved MMX loop.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 64
         *         bytes (lines = bytes >> 6).  The counter is decremented before
         *         it is tested, so bytes must be at least 64; any remainder
         *         below 64 is not processed.
         * @p1:    destination; read, XORed with the sources, written in place.
         * @p2:    source; only read.
         * @p3:    source; only read.
         * @p4:    source; only read.
         *
         * Uses %mm0-%mm7; the FPU state is saved before and restored after.
         */
366     static void
367     xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
368     	     unsigned long *p3, unsigned long *p4)
369     {
370     	unsigned long lines = bytes >> 6;
371     	char fpu_save[108];
372     
373     	FPU_SAVE;
374     
        	/*
        	 * The loop advances %1-%4 and counts %0 down, so all of them are
        	 * read-write ("+r") operands rather than plain inputs.
        	 */
375     	__asm__ __volatile__ (
376     	" .align 32,0x90             ;\n"
377     	" 1:                         ;\n"
378     	"       movq   (%1), %%mm0   ;\n"
379     	"       movq  8(%1), %%mm1   ;\n"
380     	"       pxor   (%2), %%mm0   ;\n"
381     	"       movq 16(%1), %%mm2   ;\n"
382     	"       pxor  8(%2), %%mm1   ;\n"
383     	"       pxor   (%3), %%mm0   ;\n"
384     	"       pxor 16(%2), %%mm2   ;\n"
385     	"       pxor  8(%3), %%mm1   ;\n"
386     	"       pxor   (%4), %%mm0   ;\n"
387     	"       movq 24(%1), %%mm3   ;\n"
388     	"       pxor 16(%3), %%mm2   ;\n"
389     	"       pxor  8(%4), %%mm1   ;\n"
390     	"       movq %%mm0,   (%1)   ;\n"
391     	"       movq 32(%1), %%mm4   ;\n"
392     	"       pxor 24(%2), %%mm3   ;\n"
393     	"       pxor 16(%4), %%mm2   ;\n"
394     	"       movq %%mm1,  8(%1)   ;\n"
395     	"       movq 40(%1), %%mm5   ;\n"
396     	"       pxor 32(%2), %%mm4   ;\n"
397     	"       pxor 24(%3), %%mm3   ;\n"
398     	"       movq %%mm2, 16(%1)   ;\n"
399     	"       pxor 40(%2), %%mm5   ;\n"
400     	"       pxor 32(%3), %%mm4   ;\n"
401     	"       pxor 24(%4), %%mm3   ;\n"
402     	"       movq %%mm3, 24(%1)   ;\n"
403     	"       movq 56(%1), %%mm7   ;\n"
404     	"       movq 48(%1), %%mm6   ;\n"
405     	"       pxor 40(%3), %%mm5   ;\n"
406     	"       pxor 32(%4), %%mm4   ;\n"
407     	"       pxor 48(%2), %%mm6   ;\n"
408     	"       movq %%mm4, 32(%1)   ;\n"
409     	"       pxor 56(%2), %%mm7   ;\n"
410     	"       pxor 40(%4), %%mm5   ;\n"
411     	"       pxor 48(%3), %%mm6   ;\n"
412     	"       pxor 56(%3), %%mm7   ;\n"
413     	"       movq %%mm5, 40(%1)   ;\n"
414     	"       pxor 48(%4), %%mm6   ;\n"
415     	"       pxor 56(%4), %%mm7   ;\n"
416     	"       movq %%mm6, 48(%1)   ;\n"
417     	"       movq %%mm7, 56(%1)   ;\n"
418           
419     	"       addl $64, %1         ;\n"
420     	"       addl $64, %2         ;\n"
421     	"       addl $64, %3         ;\n"
422     	"       addl $64, %4         ;\n"
423     	"       decl %0              ;\n"
424     	"       jnz 1b               ;\n"
425     	: "+r" (lines),
426     	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
427     	:
428     	: "memory");
429     
430     	FPU_RESTORE;
431     }
432     
        /*
         * xor_p5_mmx_5 - XOR the buffers at p2, p3, p4 and p5 into the buffer
         * at p1 using a hand-interleaved MMX loop.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 64
         *         bytes (lines = bytes >> 6).  The counter is decremented before
         *         it is tested, so bytes must be at least 64; any remainder
         *         below 64 is not processed.
         * @p1:    destination; read, XORed with the sources, written in place.
         * @p2:    source; only read.
         * @p3:    source; only read.
         * @p4:    source; only read.
         * @p5:    source; only read.
         *
         * Uses %mm0-%mm7; the FPU state is saved before and restored after.
         */
433     static void
434     xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
435     	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
436     {
437     	unsigned long lines = bytes >> 6;
438     	char fpu_save[108];
439     
440     	FPU_SAVE;
441     
        	/*
        	 * Make GCC forget what it knows about p4 and p5 so that the loop
        	 * below is not handed a register shared with another variable.
        	 * The loop modifies both, but marking them read-write as well
        	 * would exceed the 10-operand asm limit of GCC < 3.1 (a "+"
        	 * operand counts twice).
        	 */
        	__asm__ ("" : "+r" (p4), "+r" (p5));
        
        	/*
        	 * %0-%3 are modified by the loop and therefore read-write.  The
        	 * counter uses "+g" so it may live in a register or in memory; a
        	 * plain "g" input also allowed an immediate, which decl cannot
        	 * take.
        	 */
442     	__asm__ __volatile__ (
443     	" .align 32,0x90             ;\n"
444     	" 1:                         ;\n"
445     	"       movq   (%1), %%mm0   ;\n"
446     	"       movq  8(%1), %%mm1   ;\n"
447     	"       pxor   (%2), %%mm0   ;\n"
448     	"       pxor  8(%2), %%mm1   ;\n"
449     	"       movq 16(%1), %%mm2   ;\n"
450     	"       pxor   (%3), %%mm0   ;\n"
451     	"       pxor  8(%3), %%mm1   ;\n"
452     	"       pxor 16(%2), %%mm2   ;\n"
453     	"       pxor   (%4), %%mm0   ;\n"
454     	"       pxor  8(%4), %%mm1   ;\n"
455     	"       pxor 16(%3), %%mm2   ;\n"
456     	"       movq 24(%1), %%mm3   ;\n"
457     	"       pxor   (%5), %%mm0   ;\n"
458     	"       pxor  8(%5), %%mm1   ;\n"
459     	"       movq %%mm0,   (%1)   ;\n"
460     	"       pxor 16(%4), %%mm2   ;\n"
461     	"       pxor 24(%2), %%mm3   ;\n"
462     	"       movq %%mm1,  8(%1)   ;\n"
463     	"       pxor 16(%5), %%mm2   ;\n"
464     	"       pxor 24(%3), %%mm3   ;\n"
465     	"       movq 32(%1), %%mm4   ;\n"
466     	"       movq %%mm2, 16(%1)   ;\n"
467     	"       pxor 24(%4), %%mm3   ;\n"
468     	"       pxor 32(%2), %%mm4   ;\n"
469     	"       movq 40(%1), %%mm5   ;\n"
470     	"       pxor 24(%5), %%mm3   ;\n"
471     	"       pxor 32(%3), %%mm4   ;\n"
472     	"       pxor 40(%2), %%mm5   ;\n"
473     	"       movq %%mm3, 24(%1)   ;\n"
474     	"       pxor 32(%4), %%mm4   ;\n"
475     	"       pxor 40(%3), %%mm5   ;\n"
476     	"       movq 48(%1), %%mm6   ;\n"
477     	"       movq 56(%1), %%mm7   ;\n"
478     	"       pxor 32(%5), %%mm4   ;\n"
479     	"       pxor 40(%4), %%mm5   ;\n"
480     	"       pxor 48(%2), %%mm6   ;\n"
481     	"       pxor 56(%2), %%mm7   ;\n"
482     	"       movq %%mm4, 32(%1)   ;\n"
483     	"       pxor 48(%3), %%mm6   ;\n"
484     	"       pxor 56(%3), %%mm7   ;\n"
485     	"       pxor 40(%5), %%mm5   ;\n"
486     	"       pxor 48(%4), %%mm6   ;\n"
487     	"       pxor 56(%4), %%mm7   ;\n"
488     	"       movq %%mm5, 40(%1)   ;\n"
489     	"       pxor 48(%5), %%mm6   ;\n"
490     	"       pxor 56(%5), %%mm7   ;\n"
491     	"       movq %%mm6, 48(%1)   ;\n"
492     	"       movq %%mm7, 56(%1)   ;\n"
493           
494     	"       addl $64, %1         ;\n"
495     	"       addl $64, %2         ;\n"
496     	"       addl $64, %3         ;\n"
497     	"       addl $64, %4         ;\n"
498     	"       addl $64, %5         ;\n"
499     	"       decl %0              ;\n"
500     	"       jnz 1b               ;\n"
501     	: "+g" (lines),
502     	  "+r" (p1), "+r" (p2), "+r" (p3)
503     	: "r" (p4), "r" (p5)
504     	: "memory");
505     
        	/* p4 and p5 were advanced by the loop and are dead from here on. */
        	__asm__ ("" : "=r" (p4), "=r" (p5));
        
506     	FPU_RESTORE;
507     }
508     
        /*
         * Template grouping the xor_pII_mmx_* routines (128 bytes per loop pass)
         * under the name "pII_mmx".  Uses ".field =" designated initializers in
         * place of the obsolete GNU "field:" form.
         */
509     static struct xor_block_template xor_block_pII_mmx = {
510     	.name = "pII_mmx",
511     	.do_2 = xor_pII_mmx_2,
512     	.do_3 = xor_pII_mmx_3,
513     	.do_4 = xor_pII_mmx_4,
514     	.do_5 = xor_pII_mmx_5,
515     };
516     
        /*
         * Template grouping the xor_p5_mmx_* routines (64 bytes per loop pass)
         * under the name "p5_mmx".  Uses ".field =" designated initializers in
         * place of the obsolete GNU "field:" form.
         */
517     static struct xor_block_template xor_block_p5_mmx = {
518     	.name = "p5_mmx",
519     	.do_2 = xor_p5_mmx_2,
520     	.do_3 = xor_p5_mmx_3,
521     	.do_4 = xor_p5_mmx_4,
522     	.do_5 = xor_p5_mmx_5,
523     };
524     
525     #undef FPU_SAVE
526     #undef FPU_RESTORE
527     
528     /*
529      * Cache avoiding checksumming functions utilizing KNI instructions
530      * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
531      */
532     
        /*
         * XMMS_SAVE - make %xmm0-%xmm3 usable and preserve their contents.
         *
         * Must be expanded inside a function that declares locals named cr0 and
         * xmm_save (at least 64 bytes).  Copies CR0 into cr0, executes clts, then
         * stores %xmm0-%xmm3 into xmm_save with unaligned moves.
         *
         * cr0 is an early-clobber output ("=&r"): it is written by the first
         * instruction while the xmm_save address in %1 is still needed, so the
         * two must never be given the same register.
         */
533     #define XMMS_SAVE				\
534     	__asm__ __volatile__ ( 			\
535     		"movl %%cr0,%0		;\n\t"	\
536     		"clts			;\n\t"	\
537     		"movups %%xmm0,(%1)	;\n\t"	\
538     		"movups %%xmm1,0x10(%1)	;\n\t"	\
539     		"movups %%xmm2,0x20(%1)	;\n\t"	\
540     		"movups %%xmm3,0x30(%1)	;\n\t"	\
541     		: "=&r" (cr0)			\
542     		: "r" (xmm_save) 		\
543     		: "memory")
544     
        /*
         * XMMS_RESTORE - counterpart of XMMS_SAVE.
         *
         * Executes sfence, reloads %xmm0-%xmm3 from the local xmm_save buffer and
         * finally writes the local cr0 value back into CR0.
         */
545     #define XMMS_RESTORE				\
546     	__asm__ __volatile__ ( 			\
547     		"sfence			;\n\t"	\
548     		"movups (%1),%%xmm0	;\n\t"	\
549     		"movups 0x10(%1),%%xmm1	;\n\t"	\
550     		"movups 0x20(%1),%%xmm2	;\n\t"	\
551     		"movups 0x30(%1),%%xmm3	;\n\t"	\
552     		"movl 	%0,%%cr0	;\n\t"	\
553     		:				\
554     		: "r" (cr0), "r" (xmm_save)	\
555     		: "memory")
556     
        /*
         * Assembly text fragments for the SSE loops.  x is a 16-byte line index
         * and y is an XMM register number.
         *   OFFS(x)    - byte offset 16*x
         *   PF_OFFS(x) - byte offset 256+16*x, used for prefetching ahead
         *   PF0(x)     - prefetchnta on the buffer in operand %1
         *   PFn(x)     - prefetchnta on the buffer in operand %<n+1>
         *   LD(x,y)    - movaps line x of the buffer in operand %1 into %xmm<y>
         *   ST(x,y)    - movaps %xmm<y> to line x of the buffer in operand %1
         *   XOn(x,y)   - xorps line x of the buffer in operand %<n+1> into %xmm<y>
         * PF5 and XO5 are not used by the routines visible in this file.
         */
557     #define OFFS(x)		"16*("#x")"
558     #define PF_OFFS(x)	"256+16*("#x")"
559     #define	PF0(x)		"	prefetchnta "PF_OFFS(x)"(%1)		;\n"
560     #define LD(x,y)		"       movaps   "OFFS(x)"(%1), %%xmm"#y"	;\n"
561     #define ST(x,y)		"       movaps %%xmm"#y",   "OFFS(x)"(%1)	;\n"
562     #define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%2)		;\n"
563     #define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%3)		;\n"
564     #define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%4)		;\n"
565     #define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%5)		;\n"
566     #define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%6)		;\n"
567     #define XO1(x,y)	"       xorps   "OFFS(x)"(%2), %%xmm"#y"	;\n"
568     #define XO2(x,y)	"       xorps   "OFFS(x)"(%3), %%xmm"#y"	;\n"
569     #define XO3(x,y)	"       xorps   "OFFS(x)"(%4), %%xmm"#y"	;\n"
570     #define XO4(x,y)	"       xorps   "OFFS(x)"(%5), %%xmm"#y"	;\n"
571     #define XO5(x,y)	"       xorps   "OFFS(x)"(%6), %%xmm"#y"	;\n"
572     
573     
        /*
         * xor_sse_2 - XOR the buffer at p2 into the buffer at p1 using SSE.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 256
         *         bytes (lines = bytes >> 8).  The counter is decremented before
         *         it is tested, so bytes must be at least 256; any remainder
         *         below 256 is not processed.
         * @p1:    destination; read, XORed with p2 and written back in place.
         * @p2:    source; only read.
         *
         * The buffers are accessed with movaps/xorps memory operands, which
         * require 16-byte alignment.  %xmm0-%xmm3 and CR0 are saved before and
         * restored after the loop.
         */
574     static void
575     xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
576     {
577             unsigned long lines = bytes >> 8;
578     	char xmm_save[16*4];
579     	int cr0;
580     
581     	XMMS_SAVE;
582     
        	/*
        	 * The loop advances %1/%2 and counts %0 down, so all of them are
        	 * read-write ("+r") operands rather than plain inputs.
        	 */
583             __asm__ __volatile__ (
584     #undef BLOCK
585     #define BLOCK(i) \
586     		LD(i,0)					\
587     			LD(i+1,1)			\
588     		PF1(i)					\
589     				PF1(i+2)		\
590     				LD(i+2,2)		\
591     					LD(i+3,3)	\
592     		PF0(i+4)				\
593     				PF0(i+6)		\
594     		XO1(i,0)				\
595     			XO1(i+1,1)			\
596     				XO1(i+2,2)		\
597     					XO1(i+3,3)	\
598     		ST(i,0)					\
599     			ST(i+1,1)			\
600     				ST(i+2,2)		\
601     					ST(i+3,3)	\
602     
603     
604     		PF0(0)
605     				PF0(2)
606     
607     	" .align 32			;\n"
608             " 1:                            ;\n"
609     
610     		BLOCK(0)
611     		BLOCK(4)
612     		BLOCK(8)
613     		BLOCK(12)
614     
615             "       addl $256, %1           ;\n"
616             "       addl $256, %2           ;\n"
617             "       decl %0                 ;\n"
618             "       jnz 1b                  ;\n"
619     	: "+r" (lines),
620     	  "+r" (p1), "+r" (p2)
621     	:
622             : "memory");
623     
624     	XMMS_RESTORE;
625     }
626     
        /*
         * xor_sse_3 - XOR the buffers at p2 and p3 into the buffer at p1 using
         * SSE.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 256
         *         bytes (lines = bytes >> 8).  The counter is decremented before
         *         it is tested, so bytes must be at least 256; any remainder
         *         below 256 is not processed.
         * @p1:    destination; read, XORed with the sources, written in place.
         * @p2:    source; only read.
         * @p3:    source; only read.
         *
         * The buffers are accessed with movaps/xorps memory operands, which
         * require 16-byte alignment.  %xmm0-%xmm3 and CR0 are saved before and
         * restored after the loop.
         */
627     static void
628     xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
629     	  unsigned long *p3)
630     {
631             unsigned long lines = bytes >> 8;
632     	char xmm_save[16*4];
633     	int cr0;
634     
635     	XMMS_SAVE;
636     
        	/*
        	 * The loop advances %1-%3 and counts %0 down, so all of them are
        	 * read-write ("+r") operands rather than plain inputs.
        	 */
637             __asm__ __volatile__ (
638     #undef BLOCK
639     #define BLOCK(i) \
640     		PF1(i)					\
641     				PF1(i+2)		\
642     		LD(i,0)					\
643     			LD(i+1,1)			\
644     				LD(i+2,2)		\
645     					LD(i+3,3)	\
646     		PF2(i)					\
647     				PF2(i+2)		\
648     		PF0(i+4)				\
649     				PF0(i+6)		\
650     		XO1(i,0)				\
651     			XO1(i+1,1)			\
652     				XO1(i+2,2)		\
653     					XO1(i+3,3)	\
654     		XO2(i,0)				\
655     			XO2(i+1,1)			\
656     				XO2(i+2,2)		\
657     					XO2(i+3,3)	\
658     		ST(i,0)					\
659     			ST(i+1,1)			\
660     				ST(i+2,2)		\
661     					ST(i+3,3)	\
662     
663     
664     		PF0(0)
665     				PF0(2)
666     
667     	" .align 32			;\n"
668             " 1:                            ;\n"
669     
670     		BLOCK(0)
671     		BLOCK(4)
672     		BLOCK(8)
673     		BLOCK(12)
674     
675             "       addl $256, %1           ;\n"
676             "       addl $256, %2           ;\n"
677             "       addl $256, %3           ;\n"
678             "       decl %0                 ;\n"
679             "       jnz 1b                  ;\n"
680     	: "+r" (lines),
681     	  "+r" (p1), "+r"(p2), "+r"(p3)
682     	:
683             : "memory" );
684     
685     	XMMS_RESTORE;
686     }
687     
        /*
         * xor_sse_4 - XOR the buffers at p2, p3 and p4 into the buffer at p1
         * using SSE.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 256
         *         bytes (lines = bytes >> 8).  The counter is decremented before
         *         it is tested, so bytes must be at least 256; any remainder
         *         below 256 is not processed.
         * @p1:    destination; read, XORed with the sources, written in place.
         * @p2:    source; only read.
         * @p3:    source; only read.
         * @p4:    source; only read.
         *
         * The buffers are accessed with movaps/xorps memory operands, which
         * require 16-byte alignment.  %xmm0-%xmm3 and CR0 are saved before and
         * restored after the loop.
         */
688     static void
689     xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
690     	  unsigned long *p3, unsigned long *p4)
691     {
692             unsigned long lines = bytes >> 8;
693     	char xmm_save[16*4];
694     	int cr0;
695     
696     	XMMS_SAVE;
697     
        	/*
        	 * The loop advances %1-%4 and counts %0 down, so all of them are
        	 * read-write ("+r") operands rather than plain inputs.
        	 */
698             __asm__ __volatile__ (
699     #undef BLOCK
700     #define BLOCK(i) \
701     		PF1(i)					\
702     				PF1(i+2)		\
703     		LD(i,0)					\
704     			LD(i+1,1)			\
705     				LD(i+2,2)		\
706     					LD(i+3,3)	\
707     		PF2(i)					\
708     				PF2(i+2)		\
709     		XO1(i,0)				\
710     			XO1(i+1,1)			\
711     				XO1(i+2,2)		\
712     					XO1(i+3,3)	\
713     		PF3(i)					\
714     				PF3(i+2)		\
715     		PF0(i+4)				\
716     				PF0(i+6)		\
717     		XO2(i,0)				\
718     			XO2(i+1,1)			\
719     				XO2(i+2,2)		\
720     					XO2(i+3,3)	\
721     		XO3(i,0)				\
722     			XO3(i+1,1)			\
723     				XO3(i+2,2)		\
724     					XO3(i+3,3)	\
725     		ST(i,0)					\
726     			ST(i+1,1)			\
727     				ST(i+2,2)		\
728     					ST(i+3,3)	\
729     
730     
731     		PF0(0)
732     				PF0(2)
733     
734     	" .align 32			;\n"
735             " 1:                            ;\n"
736     
737     		BLOCK(0)
738     		BLOCK(4)
739     		BLOCK(8)
740     		BLOCK(12)
741     
742             "       addl $256, %1           ;\n"
743             "       addl $256, %2           ;\n"
744             "       addl $256, %3           ;\n"
745             "       addl $256, %4           ;\n"
746             "       decl %0                 ;\n"
747             "       jnz 1b                  ;\n"
748     	: "+r" (lines),
749     	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
750     	:
751             : "memory" );
752     
753     	XMMS_RESTORE;
754     }
755     
        /*
         * xor_sse_5 - XOR the buffers at p2, p3, p4 and p5 into the buffer at p1
         * using SSE.
         *
         * @bytes: length of each buffer in bytes.  One loop pass handles 256
         *         bytes (lines = bytes >> 8).  The counter is decremented before
         *         it is tested, so bytes must be at least 256; any remainder
         *         below 256 is not processed.
         * @p1:    destination; read, XORed with the sources, written in place.
         * @p2:    source; only read.
         * @p3:    source; only read.
         * @p4:    source; only read.
         * @p5:    source; only read.
         *
         * The buffers are accessed with movaps/xorps memory operands, which
         * require 16-byte alignment.  %xmm0-%xmm3 and CR0 are saved before and
         * restored after the loop.
         */
756     static void
757     xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
758     	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
759     {
760             unsigned long lines = bytes >> 8;
761     	char xmm_save[16*4];
762     	int cr0;
763     
764     	XMMS_SAVE;
765     
        	/*
        	 * Make GCC forget what it knows about p4 and p5 so that the loop
        	 * below is not handed a register shared with another variable.
        	 * The loop modifies both, but marking them read-write as well
        	 * would exceed the 10-operand asm limit of GCC < 3.1 (a "+"
        	 * operand counts twice).
        	 */
        	__asm__ ("" : "+r" (p4), "+r" (p5));
        
        	/* %0-%3 are modified by the loop and therefore read-write. */
766             __asm__ __volatile__ (
767     #undef BLOCK
768     #define BLOCK(i) \
769     		PF1(i)					\
770     				PF1(i+2)		\
771     		LD(i,0)					\
772     			LD(i+1,1)			\
773     				LD(i+2,2)		\
774     					LD(i+3,3)	\
775     		PF2(i)					\
776     				PF2(i+2)		\
777     		XO1(i,0)				\
778     			XO1(i+1,1)			\
779     				XO1(i+2,2)		\
780     					XO1(i+3,3)	\
781     		PF3(i)					\
782     				PF3(i+2)		\
783     		XO2(i,0)				\
784     			XO2(i+1,1)			\
785     				XO2(i+2,2)		\
786     					XO2(i+3,3)	\
787     		PF4(i)					\
788     				PF4(i+2)		\
789     		PF0(i+4)				\
790     				PF0(i+6)		\
791     		XO3(i,0)				\
792     			XO3(i+1,1)			\
793     				XO3(i+2,2)		\
794     					XO3(i+3,3)	\
795     		XO4(i,0)				\
796     			XO4(i+1,1)			\
797     				XO4(i+2,2)		\
798     					XO4(i+3,3)	\
799     		ST(i,0)					\
800     			ST(i+1,1)			\
801     				ST(i+2,2)		\
802     					ST(i+3,3)	\
803     
804     
805     		PF0(0)
806     				PF0(2)
807     
808     	" .align 32			;\n"
809             " 1:                            ;\n"
810     
811     		BLOCK(0)
812     		BLOCK(4)
813     		BLOCK(8)
814     		BLOCK(12)
815     
816             "       addl $256, %1           ;\n"
817             "       addl $256, %2           ;\n"
818             "       addl $256, %3           ;\n"
819             "       addl $256, %4           ;\n"
820             "       addl $256, %5           ;\n"
821             "       decl %0                 ;\n"
822             "       jnz 1b                  ;\n"
823     	: "+r" (lines),
824     	  "+r" (p1), "+r" (p2), "+r" (p3)
825     	: "r" (p4), "r" (p5)
826     	: "memory");
827     
        	/* p4 and p5 were advanced by the loop and are dead from here on. */
        	__asm__ ("" : "=r" (p4), "=r" (p5));
        
828     	XMMS_RESTORE;
829     }
830     
        /*
         * Template grouping the xor_sse_* routines (256 bytes per loop pass)
         * under the name "pIII_sse".  Uses ".field =" designated initializers in
         * place of the obsolete GNU "field:" form.
         */
831     static struct xor_block_template xor_block_pIII_sse = {
832             .name = "pIII_sse",
833             .do_2 = xor_sse_2,
834             .do_3 = xor_sse_3,
835             .do_4 = xor_sse_4,
836             .do_5 = xor_sse_5,
837     };
838     
839     /* Also try the generic routines.  */
840     #include <asm-generic/xor.h>
841     
        /*
         * XOR_TRY_TEMPLATES - hand each candidate implementation to xor_speed().
         *
         * Any earlier definition of the macro is dropped first.  xor_block_8regs
         * and xor_block_32regs (presumably from <asm-generic/xor.h>) are always
         * tried; the SSE template only when cpu_has_xmm is true, and both MMX
         * templates only when md_cpu_has_mmx() returns true.
         */
842     #undef XOR_TRY_TEMPLATES
843     #define XOR_TRY_TEMPLATES				\
844     	do {						\
845     		xor_speed(&xor_block_8regs);		\
846     		xor_speed(&xor_block_32regs);		\
847     	        if (cpu_has_xmm)			\
848     			xor_speed(&xor_block_pIII_sse);	\
849     	        if (md_cpu_has_mmx()) {			\
850     	                xor_speed(&xor_block_pII_mmx);	\
851     	                xor_speed(&xor_block_p5_mmx);	\
852     	        }					\
853     	} while (0)
854     
855     /* We force the use of the SSE xor block because it can write around L2.
856        We may also be able to load into the L1 only depending on how the cpu
857        deals with a load to a line that is being prefetched.
           The macro therefore evaluates to &xor_block_pIII_sse whenever
           cpu_has_xmm is true, and to its FASTEST argument otherwise.  */
858     #define XOR_SELECT_TEMPLATE(FASTEST) \
859     	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
860