File: /usr/src/linux/include/asm/xor.h
1 /*
2 * include/asm-i386/xor.h
3 *
4 * Optimized RAID-5 checksumming functions for MMX and SSE.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2, or (at your option)
9 * any later version.
10 *
11 * You should have received a copy of the GNU General Public License
12 * (for example /usr/src/linux/COPYING); if not, write to the Free
13 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14 */
15
16 /*
17 * High-speed RAID5 checksumming functions utilizing MMX instructions.
18 * Copyright (C) 1998 Ingo Molnar.
19 */
20
/*
 * FPU_SAVE / FPU_RESTORE bracket the MMX loops in this file.  Both expect a
 * local `char fpu_save[108]` in the function that uses them.
 *
 * FPU_SAVE executes clts when the current task does not have PF_USEDFPU
 * set, then writes the FPU state into fpu_save with fsave.
 * FPU_RESTORE reloads that image with frstor and calls stts() for tasks
 * that do not have PF_USEDFPU set.
 *
 * The "m" operand names only fpu_save[0] although fsave/frstor access the
 * whole buffer, so a "memory" clobber is added to tell the compiler that
 * more than that single byte is touched.
 */
#define FPU_SAVE							\
	do {								\
		if (!(current->flags & PF_USEDFPU))			\
			__asm__ __volatile__ (" clts;\n");		\
		__asm__ __volatile__ ("fsave %0; fwait"			\
				      : "=m" (fpu_save[0])		\
				      :					\
				      : "memory");			\
	} while (0)

#define FPU_RESTORE							\
	do {								\
		__asm__ __volatile__ ("frstor %0"			\
				      :					\
				      : "m" (fpu_save[0])		\
				      : "memory");			\
		if (!(current->flags & PF_USEDFPU))			\
			stts();						\
	} while (0)
34
/*
 * Assembly-text builders for the MMX routines.  `x` is a quadword index
 * (scaled by 8 in the addressing expression), `y` is the MMX register number.
 *
 *   LD  - load quadword x of operand %1 into mm<y>
 *   ST  - store mm<y> back to quadword x of operand %1
 *   XOn - pxor quadword x of operand %(n+1) into mm<y>
 */
#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
#define LD(x, y)  " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x, y)  " movq %%mm"#y", 8*("#x")(%1) ;\n"
41
42
43 static void
44 xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
45 {
46 unsigned long lines = bytes >> 7;
47 char fpu_save[108];
48
49 FPU_SAVE;
50
51 __asm__ __volatile__ (
52 #undef BLOCK
53 #define BLOCK(i) \
54 LD(i,0) \
55 LD(i+1,1) \
56 LD(i+2,2) \
57 LD(i+3,3) \
58 XO1(i,0) \
59 ST(i,0) \
60 XO1(i+1,1) \
61 ST(i+1,1) \
62 XO1(i+2,2) \
63 ST(i+2,2) \
64 XO1(i+3,3) \
65 ST(i+3,3)
66
67 " .align 32 ;\n"
68 " 1: ;\n"
69
70 BLOCK(0)
71 BLOCK(4)
72 BLOCK(8)
73 BLOCK(12)
74
75 " addl $128, %1 ;\n"
76 " addl $128, %2 ;\n"
77 " decl %0 ;\n"
78 " jnz 1b ;\n"
79 :
80 : "r" (lines),
81 "r" (p1), "r" (p2)
82 : "memory");
83
84 FPU_RESTORE;
85 }
86
87 static void
88 xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
89 unsigned long *p3)
90 {
91 unsigned long lines = bytes >> 7;
92 char fpu_save[108];
93
94 FPU_SAVE;
95
96 __asm__ __volatile__ (
97 #undef BLOCK
98 #define BLOCK(i) \
99 LD(i,0) \
100 LD(i+1,1) \
101 LD(i+2,2) \
102 LD(i+3,3) \
103 XO1(i,0) \
104 XO1(i+1,1) \
105 XO1(i+2,2) \
106 XO1(i+3,3) \
107 XO2(i,0) \
108 ST(i,0) \
109 XO2(i+1,1) \
110 ST(i+1,1) \
111 XO2(i+2,2) \
112 ST(i+2,2) \
113 XO2(i+3,3) \
114 ST(i+3,3)
115
116 " .align 32 ;\n"
117 " 1: ;\n"
118
119 BLOCK(0)
120 BLOCK(4)
121 BLOCK(8)
122 BLOCK(12)
123
124 " addl $128, %1 ;\n"
125 " addl $128, %2 ;\n"
126 " addl $128, %3 ;\n"
127 " decl %0 ;\n"
128 " jnz 1b ;\n"
129 :
130 : "r" (lines),
131 "r" (p1), "r" (p2), "r" (p3)
132 : "memory");
133
134 FPU_RESTORE;
135 }
136
137 static void
138 xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
139 unsigned long *p3, unsigned long *p4)
140 {
141 unsigned long lines = bytes >> 7;
142 char fpu_save[108];
143
144 FPU_SAVE;
145
146 __asm__ __volatile__ (
147 #undef BLOCK
148 #define BLOCK(i) \
149 LD(i,0) \
150 LD(i+1,1) \
151 LD(i+2,2) \
152 LD(i+3,3) \
153 XO1(i,0) \
154 XO1(i+1,1) \
155 XO1(i+2,2) \
156 XO1(i+3,3) \
157 XO2(i,0) \
158 XO2(i+1,1) \
159 XO2(i+2,2) \
160 XO2(i+3,3) \
161 XO3(i,0) \
162 ST(i,0) \
163 XO3(i+1,1) \
164 ST(i+1,1) \
165 XO3(i+2,2) \
166 ST(i+2,2) \
167 XO3(i+3,3) \
168 ST(i+3,3)
169
170 " .align 32 ;\n"
171 " 1: ;\n"
172
173 BLOCK(0)
174 BLOCK(4)
175 BLOCK(8)
176 BLOCK(12)
177
178 " addl $128, %1 ;\n"
179 " addl $128, %2 ;\n"
180 " addl $128, %3 ;\n"
181 " addl $128, %4 ;\n"
182 " decl %0 ;\n"
183 " jnz 1b ;\n"
184 :
185 : "r" (lines),
186 "r" (p1), "r" (p2), "r" (p3), "r" (p4)
187 : "memory");
188
189 FPU_RESTORE;
190 }
191
192 static void
193 xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
194 unsigned long *p3, unsigned long *p4, unsigned long *p5)
195 {
196 unsigned long lines = bytes >> 7;
197 char fpu_save[108];
198
199 FPU_SAVE;
200
201 __asm__ __volatile__ (
202 #undef BLOCK
203 #define BLOCK(i) \
204 LD(i,0) \
205 LD(i+1,1) \
206 LD(i+2,2) \
207 LD(i+3,3) \
208 XO1(i,0) \
209 XO1(i+1,1) \
210 XO1(i+2,2) \
211 XO1(i+3,3) \
212 XO2(i,0) \
213 XO2(i+1,1) \
214 XO2(i+2,2) \
215 XO2(i+3,3) \
216 XO3(i,0) \
217 XO3(i+1,1) \
218 XO3(i+2,2) \
219 XO3(i+3,3) \
220 XO4(i,0) \
221 ST(i,0) \
222 XO4(i+1,1) \
223 ST(i+1,1) \
224 XO4(i+2,2) \
225 ST(i+2,2) \
226 XO4(i+3,3) \
227 ST(i+3,3)
228
229 " .align 32 ;\n"
230 " 1: ;\n"
231
232 BLOCK(0)
233 BLOCK(4)
234 BLOCK(8)
235 BLOCK(12)
236
237 " addl $128, %1 ;\n"
238 " addl $128, %2 ;\n"
239 " addl $128, %3 ;\n"
240 " addl $128, %4 ;\n"
241 " addl $128, %5 ;\n"
242 " decl %0 ;\n"
243 " jnz 1b ;\n"
244 :
245 : "g" (lines),
246 "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
247 : "memory");
248
249 FPU_RESTORE;
250 }
251
/* Drop the MMX assembly-text helpers; the names are redefined further down. */
#undef BLOCK
#undef ST
#undef XO4
#undef XO3
#undef XO2
#undef XO1
#undef LD
259
260 static void
261 xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
262 {
263 unsigned long lines = bytes >> 6;
264 char fpu_save[108];
265
266 FPU_SAVE;
267
268 __asm__ __volatile__ (
269 " .align 32 ;\n"
270 " 1: ;\n"
271 " movq (%1), %%mm0 ;\n"
272 " movq 8(%1), %%mm1 ;\n"
273 " pxor (%2), %%mm0 ;\n"
274 " movq 16(%1), %%mm2 ;\n"
275 " movq %%mm0, (%1) ;\n"
276 " pxor 8(%2), %%mm1 ;\n"
277 " movq 24(%1), %%mm3 ;\n"
278 " movq %%mm1, 8(%1) ;\n"
279 " pxor 16(%2), %%mm2 ;\n"
280 " movq 32(%1), %%mm4 ;\n"
281 " movq %%mm2, 16(%1) ;\n"
282 " pxor 24(%2), %%mm3 ;\n"
283 " movq 40(%1), %%mm5 ;\n"
284 " movq %%mm3, 24(%1) ;\n"
285 " pxor 32(%2), %%mm4 ;\n"
286 " movq 48(%1), %%mm6 ;\n"
287 " movq %%mm4, 32(%1) ;\n"
288 " pxor 40(%2), %%mm5 ;\n"
289 " movq 56(%1), %%mm7 ;\n"
290 " movq %%mm5, 40(%1) ;\n"
291 " pxor 48(%2), %%mm6 ;\n"
292 " pxor 56(%2), %%mm7 ;\n"
293 " movq %%mm6, 48(%1) ;\n"
294 " movq %%mm7, 56(%1) ;\n"
295
296 " addl $64, %1 ;\n"
297 " addl $64, %2 ;\n"
298 " decl %0 ;\n"
299 " jnz 1b ;\n"
300 :
301 : "r" (lines),
302 "r" (p1), "r" (p2)
303 : "memory");
304
305 FPU_RESTORE;
306 }
307
308 static void
309 xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
310 unsigned long *p3)
311 {
312 unsigned long lines = bytes >> 6;
313 char fpu_save[108];
314
315 FPU_SAVE;
316
317 __asm__ __volatile__ (
318 " .align 32,0x90 ;\n"
319 " 1: ;\n"
320 " movq (%1), %%mm0 ;\n"
321 " movq 8(%1), %%mm1 ;\n"
322 " pxor (%2), %%mm0 ;\n"
323 " movq 16(%1), %%mm2 ;\n"
324 " pxor 8(%2), %%mm1 ;\n"
325 " pxor (%3), %%mm0 ;\n"
326 " pxor 16(%2), %%mm2 ;\n"
327 " movq %%mm0, (%1) ;\n"
328 " pxor 8(%3), %%mm1 ;\n"
329 " pxor 16(%3), %%mm2 ;\n"
330 " movq 24(%1), %%mm3 ;\n"
331 " movq %%mm1, 8(%1) ;\n"
332 " movq 32(%1), %%mm4 ;\n"
333 " movq 40(%1), %%mm5 ;\n"
334 " pxor 24(%2), %%mm3 ;\n"
335 " movq %%mm2, 16(%1) ;\n"
336 " pxor 32(%2), %%mm4 ;\n"
337 " pxor 24(%3), %%mm3 ;\n"
338 " pxor 40(%2), %%mm5 ;\n"
339 " movq %%mm3, 24(%1) ;\n"
340 " pxor 32(%3), %%mm4 ;\n"
341 " pxor 40(%3), %%mm5 ;\n"
342 " movq 48(%1), %%mm6 ;\n"
343 " movq %%mm4, 32(%1) ;\n"
344 " movq 56(%1), %%mm7 ;\n"
345 " pxor 48(%2), %%mm6 ;\n"
346 " movq %%mm5, 40(%1) ;\n"
347 " pxor 56(%2), %%mm7 ;\n"
348 " pxor 48(%3), %%mm6 ;\n"
349 " pxor 56(%3), %%mm7 ;\n"
350 " movq %%mm6, 48(%1) ;\n"
351 " movq %%mm7, 56(%1) ;\n"
352
353 " addl $64, %1 ;\n"
354 " addl $64, %2 ;\n"
355 " addl $64, %3 ;\n"
356 " decl %0 ;\n"
357 " jnz 1b ;\n"
358 :
359 : "r" (lines),
360 "r" (p1), "r" (p2), "r" (p3)
361 : "memory" );
362
363 FPU_RESTORE;
364 }
365
366 static void
367 xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
368 unsigned long *p3, unsigned long *p4)
369 {
370 unsigned long lines = bytes >> 6;
371 char fpu_save[108];
372
373 FPU_SAVE;
374
375 __asm__ __volatile__ (
376 " .align 32,0x90 ;\n"
377 " 1: ;\n"
378 " movq (%1), %%mm0 ;\n"
379 " movq 8(%1), %%mm1 ;\n"
380 " pxor (%2), %%mm0 ;\n"
381 " movq 16(%1), %%mm2 ;\n"
382 " pxor 8(%2), %%mm1 ;\n"
383 " pxor (%3), %%mm0 ;\n"
384 " pxor 16(%2), %%mm2 ;\n"
385 " pxor 8(%3), %%mm1 ;\n"
386 " pxor (%4), %%mm0 ;\n"
387 " movq 24(%1), %%mm3 ;\n"
388 " pxor 16(%3), %%mm2 ;\n"
389 " pxor 8(%4), %%mm1 ;\n"
390 " movq %%mm0, (%1) ;\n"
391 " movq 32(%1), %%mm4 ;\n"
392 " pxor 24(%2), %%mm3 ;\n"
393 " pxor 16(%4), %%mm2 ;\n"
394 " movq %%mm1, 8(%1) ;\n"
395 " movq 40(%1), %%mm5 ;\n"
396 " pxor 32(%2), %%mm4 ;\n"
397 " pxor 24(%3), %%mm3 ;\n"
398 " movq %%mm2, 16(%1) ;\n"
399 " pxor 40(%2), %%mm5 ;\n"
400 " pxor 32(%3), %%mm4 ;\n"
401 " pxor 24(%4), %%mm3 ;\n"
402 " movq %%mm3, 24(%1) ;\n"
403 " movq 56(%1), %%mm7 ;\n"
404 " movq 48(%1), %%mm6 ;\n"
405 " pxor 40(%3), %%mm5 ;\n"
406 " pxor 32(%4), %%mm4 ;\n"
407 " pxor 48(%2), %%mm6 ;\n"
408 " movq %%mm4, 32(%1) ;\n"
409 " pxor 56(%2), %%mm7 ;\n"
410 " pxor 40(%4), %%mm5 ;\n"
411 " pxor 48(%3), %%mm6 ;\n"
412 " pxor 56(%3), %%mm7 ;\n"
413 " movq %%mm5, 40(%1) ;\n"
414 " pxor 48(%4), %%mm6 ;\n"
415 " pxor 56(%4), %%mm7 ;\n"
416 " movq %%mm6, 48(%1) ;\n"
417 " movq %%mm7, 56(%1) ;\n"
418
419 " addl $64, %1 ;\n"
420 " addl $64, %2 ;\n"
421 " addl $64, %3 ;\n"
422 " addl $64, %4 ;\n"
423 " decl %0 ;\n"
424 " jnz 1b ;\n"
425 :
426 : "r" (lines),
427 "r" (p1), "r" (p2), "r" (p3), "r" (p4)
428 : "memory");
429
430 FPU_RESTORE;
431 }
432
433 static void
434 xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
435 unsigned long *p3, unsigned long *p4, unsigned long *p5)
436 {
437 unsigned long lines = bytes >> 6;
438 char fpu_save[108];
439
440 FPU_SAVE;
441
442 __asm__ __volatile__ (
443 " .align 32,0x90 ;\n"
444 " 1: ;\n"
445 " movq (%1), %%mm0 ;\n"
446 " movq 8(%1), %%mm1 ;\n"
447 " pxor (%2), %%mm0 ;\n"
448 " pxor 8(%2), %%mm1 ;\n"
449 " movq 16(%1), %%mm2 ;\n"
450 " pxor (%3), %%mm0 ;\n"
451 " pxor 8(%3), %%mm1 ;\n"
452 " pxor 16(%2), %%mm2 ;\n"
453 " pxor (%4), %%mm0 ;\n"
454 " pxor 8(%4), %%mm1 ;\n"
455 " pxor 16(%3), %%mm2 ;\n"
456 " movq 24(%1), %%mm3 ;\n"
457 " pxor (%5), %%mm0 ;\n"
458 " pxor 8(%5), %%mm1 ;\n"
459 " movq %%mm0, (%1) ;\n"
460 " pxor 16(%4), %%mm2 ;\n"
461 " pxor 24(%2), %%mm3 ;\n"
462 " movq %%mm1, 8(%1) ;\n"
463 " pxor 16(%5), %%mm2 ;\n"
464 " pxor 24(%3), %%mm3 ;\n"
465 " movq 32(%1), %%mm4 ;\n"
466 " movq %%mm2, 16(%1) ;\n"
467 " pxor 24(%4), %%mm3 ;\n"
468 " pxor 32(%2), %%mm4 ;\n"
469 " movq 40(%1), %%mm5 ;\n"
470 " pxor 24(%5), %%mm3 ;\n"
471 " pxor 32(%3), %%mm4 ;\n"
472 " pxor 40(%2), %%mm5 ;\n"
473 " movq %%mm3, 24(%1) ;\n"
474 " pxor 32(%4), %%mm4 ;\n"
475 " pxor 40(%3), %%mm5 ;\n"
476 " movq 48(%1), %%mm6 ;\n"
477 " movq 56(%1), %%mm7 ;\n"
478 " pxor 32(%5), %%mm4 ;\n"
479 " pxor 40(%4), %%mm5 ;\n"
480 " pxor 48(%2), %%mm6 ;\n"
481 " pxor 56(%2), %%mm7 ;\n"
482 " movq %%mm4, 32(%1) ;\n"
483 " pxor 48(%3), %%mm6 ;\n"
484 " pxor 56(%3), %%mm7 ;\n"
485 " pxor 40(%5), %%mm5 ;\n"
486 " pxor 48(%4), %%mm6 ;\n"
487 " pxor 56(%4), %%mm7 ;\n"
488 " movq %%mm5, 40(%1) ;\n"
489 " pxor 48(%5), %%mm6 ;\n"
490 " pxor 56(%5), %%mm7 ;\n"
491 " movq %%mm6, 48(%1) ;\n"
492 " movq %%mm7, 56(%1) ;\n"
493
494 " addl $64, %1 ;\n"
495 " addl $64, %2 ;\n"
496 " addl $64, %3 ;\n"
497 " addl $64, %4 ;\n"
498 " addl $64, %5 ;\n"
499 " decl %0 ;\n"
500 " jnz 1b ;\n"
501 :
502 : "g" (lines),
503 "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
504 : "memory");
505
506 FPU_RESTORE;
507 }
508
509 static struct xor_block_template xor_block_pII_mmx = {
510 name: "pII_mmx",
511 do_2: xor_pII_mmx_2,
512 do_3: xor_pII_mmx_3,
513 do_4: xor_pII_mmx_4,
514 do_5: xor_pII_mmx_5,
515 };
516
517 static struct xor_block_template xor_block_p5_mmx = {
518 name: "p5_mmx",
519 do_2: xor_p5_mmx_2,
520 do_3: xor_p5_mmx_3,
521 do_4: xor_p5_mmx_4,
522 do_5: xor_p5_mmx_5,
523 };
524
/* The FPU state helpers are only meant for the MMX routines above. */
#undef FPU_RESTORE
#undef FPU_SAVE
527
528 /*
529 * Cache avoiding checksumming functions utilizing KNI instructions
530 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
531 */
532
/*
 * XMMS_SAVE / XMMS_RESTORE bracket the SSE loops below.  Both expect the
 * locals `char xmm_save[16*4]` and `int cr0` in the function using them.
 *
 * XMMS_SAVE copies %cr0 into `cr0`, executes clts and stores xmm0-xmm3
 * into xmm_save.  XMMS_RESTORE issues an sfence, reloads xmm0-xmm3 from
 * xmm_save and writes the saved value back to %cr0.
 *
 * `cr0` is written by the first instruction while the xmm_save pointer is
 * still needed by the following ones, so it must be an earlyclobber
 * ("=&r") output; a plain "=r" would allow the compiler to put both
 * operands in the same register.
 */
#define XMMS_SAVE				\
	__asm__ __volatile__ (			\
		"movl %%cr0,%0 ;\n\t"		\
		"clts ;\n\t"			\
		"movups %%xmm0,(%1) ;\n\t"	\
		"movups %%xmm1,0x10(%1) ;\n\t"	\
		"movups %%xmm2,0x20(%1) ;\n\t"	\
		"movups %%xmm3,0x30(%1) ;\n\t"	\
		: "=&r" (cr0)			\
		: "r" (xmm_save)		\
		: "memory")

#define XMMS_RESTORE				\
	__asm__ __volatile__ (			\
		"sfence ;\n\t"			\
		"movups (%1),%%xmm0 ;\n\t"	\
		"movups 0x10(%1),%%xmm1 ;\n\t"	\
		"movups 0x20(%1),%%xmm2 ;\n\t"	\
		"movups 0x30(%1),%%xmm3 ;\n\t"	\
		"movl %0,%%cr0 ;\n\t"		\
		:				\
		: "r" (cr0), "r" (xmm_save)	\
		: "memory")
556
/*
 * Assembly-text builders for the SSE routines.  `x` is a 16-byte slot
 * index, `y` is the XMM register number.
 *
 *   OFFS    - byte offset of slot x
 *   PF_OFFS - the same offset, 256 bytes further on (prefetch distance)
 *   LD / ST - movaps between slot x of operand %1 and xmm<y>
 *   PFn     - prefetchnta at PF_OFFS(x) from operand %(n+1)
 *   XOn     - xorps slot x of operand %(n+1) into xmm<y>
 */
#define OFFS(x)    "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"

#define LD(x, y)   " movaps " OFFS(x) "(%1), %%xmm"#y" ;\n"
#define ST(x, y)   " movaps %%xmm"#y", " OFFS(x) "(%1) ;\n"

#define PF0(x)     " prefetchnta " PF_OFFS(x) "(%1) ;\n"
#define PF1(x)     " prefetchnta " PF_OFFS(x) "(%2) ;\n"
#define PF2(x)     " prefetchnta " PF_OFFS(x) "(%3) ;\n"
#define PF3(x)     " prefetchnta " PF_OFFS(x) "(%4) ;\n"
#define PF4(x)     " prefetchnta " PF_OFFS(x) "(%5) ;\n"
#define PF5(x)     " prefetchnta " PF_OFFS(x) "(%6) ;\n"

#define XO1(x, y)  " xorps " OFFS(x) "(%2), %%xmm"#y" ;\n"
#define XO2(x, y)  " xorps " OFFS(x) "(%3), %%xmm"#y" ;\n"
#define XO3(x, y)  " xorps " OFFS(x) "(%4), %%xmm"#y" ;\n"
#define XO4(x, y)  " xorps " OFFS(x) "(%5), %%xmm"#y" ;\n"
#define XO5(x, y)  " xorps " OFFS(x) "(%6), %%xmm"#y" ;\n"
572
573
574 static void
575 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
576 {
577 unsigned long lines = bytes >> 8;
578 char xmm_save[16*4];
579 int cr0;
580
581 XMMS_SAVE;
582
583 __asm__ __volatile__ (
584 #undef BLOCK
585 #define BLOCK(i) \
586 LD(i,0) \
587 LD(i+1,1) \
588 PF1(i) \
589 PF1(i+2) \
590 LD(i+2,2) \
591 LD(i+3,3) \
592 PF0(i+4) \
593 PF0(i+6) \
594 XO1(i,0) \
595 XO1(i+1,1) \
596 XO1(i+2,2) \
597 XO1(i+3,3) \
598 ST(i,0) \
599 ST(i+1,1) \
600 ST(i+2,2) \
601 ST(i+3,3) \
602
603
604 PF0(0)
605 PF0(2)
606
607 " .align 32 ;\n"
608 " 1: ;\n"
609
610 BLOCK(0)
611 BLOCK(4)
612 BLOCK(8)
613 BLOCK(12)
614
615 " addl $256, %1 ;\n"
616 " addl $256, %2 ;\n"
617 " decl %0 ;\n"
618 " jnz 1b ;\n"
619 :
620 : "r" (lines),
621 "r" (p1), "r" (p2)
622 : "memory");
623
624 XMMS_RESTORE;
625 }
626
627 static void
628 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
629 unsigned long *p3)
630 {
631 unsigned long lines = bytes >> 8;
632 char xmm_save[16*4];
633 int cr0;
634
635 XMMS_SAVE;
636
637 __asm__ __volatile__ (
638 #undef BLOCK
639 #define BLOCK(i) \
640 PF1(i) \
641 PF1(i+2) \
642 LD(i,0) \
643 LD(i+1,1) \
644 LD(i+2,2) \
645 LD(i+3,3) \
646 PF2(i) \
647 PF2(i+2) \
648 PF0(i+4) \
649 PF0(i+6) \
650 XO1(i,0) \
651 XO1(i+1,1) \
652 XO1(i+2,2) \
653 XO1(i+3,3) \
654 XO2(i,0) \
655 XO2(i+1,1) \
656 XO2(i+2,2) \
657 XO2(i+3,3) \
658 ST(i,0) \
659 ST(i+1,1) \
660 ST(i+2,2) \
661 ST(i+3,3) \
662
663
664 PF0(0)
665 PF0(2)
666
667 " .align 32 ;\n"
668 " 1: ;\n"
669
670 BLOCK(0)
671 BLOCK(4)
672 BLOCK(8)
673 BLOCK(12)
674
675 " addl $256, %1 ;\n"
676 " addl $256, %2 ;\n"
677 " addl $256, %3 ;\n"
678 " decl %0 ;\n"
679 " jnz 1b ;\n"
680 :
681 : "r" (lines),
682 "r" (p1), "r"(p2), "r"(p3)
683 : "memory" );
684
685 XMMS_RESTORE;
686 }
687
688 static void
689 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
690 unsigned long *p3, unsigned long *p4)
691 {
692 unsigned long lines = bytes >> 8;
693 char xmm_save[16*4];
694 int cr0;
695
696 XMMS_SAVE;
697
698 __asm__ __volatile__ (
699 #undef BLOCK
700 #define BLOCK(i) \
701 PF1(i) \
702 PF1(i+2) \
703 LD(i,0) \
704 LD(i+1,1) \
705 LD(i+2,2) \
706 LD(i+3,3) \
707 PF2(i) \
708 PF2(i+2) \
709 XO1(i,0) \
710 XO1(i+1,1) \
711 XO1(i+2,2) \
712 XO1(i+3,3) \
713 PF3(i) \
714 PF3(i+2) \
715 PF0(i+4) \
716 PF0(i+6) \
717 XO2(i,0) \
718 XO2(i+1,1) \
719 XO2(i+2,2) \
720 XO2(i+3,3) \
721 XO3(i,0) \
722 XO3(i+1,1) \
723 XO3(i+2,2) \
724 XO3(i+3,3) \
725 ST(i,0) \
726 ST(i+1,1) \
727 ST(i+2,2) \
728 ST(i+3,3) \
729
730
731 PF0(0)
732 PF0(2)
733
734 " .align 32 ;\n"
735 " 1: ;\n"
736
737 BLOCK(0)
738 BLOCK(4)
739 BLOCK(8)
740 BLOCK(12)
741
742 " addl $256, %1 ;\n"
743 " addl $256, %2 ;\n"
744 " addl $256, %3 ;\n"
745 " addl $256, %4 ;\n"
746 " decl %0 ;\n"
747 " jnz 1b ;\n"
748 :
749 : "r" (lines),
750 "r" (p1), "r" (p2), "r" (p3), "r" (p4)
751 : "memory" );
752
753 XMMS_RESTORE;
754 }
755
756 static void
757 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
758 unsigned long *p3, unsigned long *p4, unsigned long *p5)
759 {
760 unsigned long lines = bytes >> 8;
761 char xmm_save[16*4];
762 int cr0;
763
764 XMMS_SAVE;
765
766 __asm__ __volatile__ (
767 #undef BLOCK
768 #define BLOCK(i) \
769 PF1(i) \
770 PF1(i+2) \
771 LD(i,0) \
772 LD(i+1,1) \
773 LD(i+2,2) \
774 LD(i+3,3) \
775 PF2(i) \
776 PF2(i+2) \
777 XO1(i,0) \
778 XO1(i+1,1) \
779 XO1(i+2,2) \
780 XO1(i+3,3) \
781 PF3(i) \
782 PF3(i+2) \
783 XO2(i,0) \
784 XO2(i+1,1) \
785 XO2(i+2,2) \
786 XO2(i+3,3) \
787 PF4(i) \
788 PF4(i+2) \
789 PF0(i+4) \
790 PF0(i+6) \
791 XO3(i,0) \
792 XO3(i+1,1) \
793 XO3(i+2,2) \
794 XO3(i+3,3) \
795 XO4(i,0) \
796 XO4(i+1,1) \
797 XO4(i+2,2) \
798 XO4(i+3,3) \
799 ST(i,0) \
800 ST(i+1,1) \
801 ST(i+2,2) \
802 ST(i+3,3) \
803
804
805 PF0(0)
806 PF0(2)
807
808 " .align 32 ;\n"
809 " 1: ;\n"
810
811 BLOCK(0)
812 BLOCK(4)
813 BLOCK(8)
814 BLOCK(12)
815
816 " addl $256, %1 ;\n"
817 " addl $256, %2 ;\n"
818 " addl $256, %3 ;\n"
819 " addl $256, %4 ;\n"
820 " addl $256, %5 ;\n"
821 " decl %0 ;\n"
822 " jnz 1b ;\n"
823 :
824 : "r" (lines),
825 "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
826 : "memory");
827
828 XMMS_RESTORE;
829 }
830
831 static struct xor_block_template xor_block_pIII_sse = {
832 name: "pIII_sse",
833 do_2: xor_sse_2,
834 do_3: xor_sse_3,
835 do_4: xor_sse_4,
836 do_5: xor_sse_5,
837 };
838
839 /* Also try the generic routines. */
840 #include <asm-generic/xor.h>
841
/*
 * Pass each candidate template to xor_speed(): the two generic register
 * variants always, the SSE template when cpu_has_xmm is true, and both MMX
 * templates when md_cpu_has_mmx() is true.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES					\
	do {							\
		xor_speed(&xor_block_8regs);			\
		xor_speed(&xor_block_32regs);			\
		if (cpu_has_xmm) {				\
			xor_speed(&xor_block_pIII_sse);		\
		}						\
		if (md_cpu_has_mmx()) {				\
			xor_speed(&xor_block_pII_mmx);		\
			xor_speed(&xor_block_p5_mmx);		\
		}						\
	} while (0)
854
/*
 * The SSE xor block is forced whenever the CPU has it, because it can write
 * around L2.  It may also be able to load into the L1 only, depending on how
 * the CPU deals with a load to a line that is being prefetched.  Without
 * SSE, FASTEST is used.
 */
#define XOR_SELECT_TEMPLATE(FASTEST)				\
	(cpu_has_xmm ? &xor_block_pIII_sse : (FASTEST))
860