File: /usr/src/linux/arch/i386/lib/mmx.c
1 #include <linux/config.h>
2 #include <linux/types.h>
3 #include <linux/string.h>
4 #include <linux/sched.h>
5
6 #include <asm/i387.h>
7 #include <asm/hardirq.h>
8
9
10 /*
11 * MMX 3DNow! library helper functions
12 *
13 * To do:
14 * We can use MMX just for prefetch in IRQ's. This may be a win.
15 * (reported so on K6-III)
16 * We should use a better code neutral filler for the short jump
17 * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
18 * We also want to clobber the filler register so we don't get any
19 * register forwarding stalls on the filler.
20 *
21 * Add *user handling. Checksums are not a win with MMX on any CPU
22 * tested so far for any MMX solution figured.
23 *
24 * 22/09/2000 - Arjan van de Ven
25 * Improved for non-engineering-sample Athlons
26 *
27 */
28
/*
 * _mmx_memcpy - copy len bytes from 'from' to 'to' using MMX registers.
 *
 * to:   destination buffer
 * from: source buffer
 * len:  number of bytes to copy
 *
 * When in_interrupt() is true the copy is handed to __memcpy() and its
 * return value is passed through. Otherwise the data is moved in 64-byte
 * chunks through %mm0-%mm3, the remaining (len & 63) bytes are copied with
 * __memcpy(), and the original 'to' pointer is returned.
 *
 * The MMX section is bracketed by kernel_fpu_begin()/kernel_fpu_end();
 * presumably these save and restore the FPU/MMX context -- confirm in
 * <asm/i387.h>.
 *
 * NOTE(review): 'i' is an int while 'len' is a size_t, so len >> 6 is
 * narrowed on assignment.
 * NOTE(review): 'from' and 'to' are void pointers advanced with += 64,
 * which relies on GNU C arithmetic on void *.
 */
29 void *_mmx_memcpy(void *to, const void *from, size_t len)
30 {
31 void *p;
32 int i;
33
/* Fall back to the plain copy instead of touching MMX state here. */
34 if (in_interrupt())
35 return __memcpy(to, from, len);
36
37 p = to; /* kept so the original destination can be returned */
38 i = len >> 6; /* len/64 */
39
40 kernel_fpu_begin();
41
/*
 * Warm-up: prefetch the source at offsets 0, 64, 128, 192 and 256.
 * The __ex_table entry pairs label 1 with the fixup at label 3. The fixup
 * stores the 16-bit value 0x1AEB at label 1 (per the comment below, a jmp
 * over the remaining 26 bytes of the 28-byte prefetch set) and resumes at
 * label 2. Presumably the fixup runs when the instruction at 1 faults --
 * confirm against the exception-table handling code.
 */
42 __asm__ __volatile__ (
43 "1: prefetch (%0)\n" /* This set is 28 bytes */
44 " prefetch 64(%0)\n"
45 " prefetch 128(%0)\n"
46 " prefetch 192(%0)\n"
47 " prefetch 256(%0)\n"
48 "2: \n"
49 ".section .fixup, \"ax\"\n"
50 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
51 " jmp 2b\n"
52 ".previous\n"
53 ".section __ex_table,\"a\"\n"
54 " .align 4\n"
55 " .long 1b, 3b\n"
56 ".previous"
57 : : "r" (from) );
58
59
/*
 * Main loop: one 64-byte chunk per iteration, copied as two 32-byte
 * halves through %mm0-%mm3, while prefetching 320 bytes ahead of the
 * current source position. The fixup here overwrites the prefetch at
 * label 1 with 0x05EB (a jmp over 5 bytes, per the comment below).
 *
 * NOTE(review): the prefetch is issued on every iteration, so the last
 * iterations prefetch addresses beyond from + len.
 */
60 for(; i>0; i--)
61 {
62 __asm__ __volatile__ (
63 "1: prefetch 320(%0)\n"
64 "2: movq (%0), %%mm0\n"
65 " movq 8(%0), %%mm1\n"
66 " movq 16(%0), %%mm2\n"
67 " movq 24(%0), %%mm3\n"
68 " movq %%mm0, (%1)\n"
69 " movq %%mm1, 8(%1)\n"
70 " movq %%mm2, 16(%1)\n"
71 " movq %%mm3, 24(%1)\n"
72 " movq 32(%0), %%mm0\n"
73 " movq 40(%0), %%mm1\n"
74 " movq 48(%0), %%mm2\n"
75 " movq 56(%0), %%mm3\n"
76 " movq %%mm0, 32(%1)\n"
77 " movq %%mm1, 40(%1)\n"
78 " movq %%mm2, 48(%1)\n"
79 " movq %%mm3, 56(%1)\n"
80 ".section .fixup, \"ax\"\n"
81 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
82 " jmp 2b\n"
83 ".previous\n"
84 ".section __ex_table,\"a\"\n"
85 " .align 4\n"
86 " .long 1b, 3b\n"
87 ".previous"
88 : : "r" (from), "r" (to) : "memory");
89 from+=64;
90 to+=64;
91 }
92 /*
93 * Now do the tail of the block
94 */
95 __memcpy(to, from, len&63); /* fewer than 64 bytes remain */
96 kernel_fpu_end();
97 return p;
98 }
99
100 #ifdef CONFIG_MK7
101
102 /*
103 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
104 * other MMX using processors do not.
105 */
106
/*
 * fast_clear_page (CONFIG_MK7 variant) - zero 4096 bytes starting at 'page'.
 *
 * %mm0 is cleared with pxor and then written with movntq eight times per
 * iteration (64 bytes), for 4096/64 = 64 iterations. An sfence follows the
 * loop because, as the comment below says, movntq is weakly ordered.
 * The page size of 4096 bytes is hard-coded.
 * The work is bracketed by kernel_fpu_begin()/kernel_fpu_end(); presumably
 * these save and restore the FPU/MMX context -- confirm in <asm/i387.h>.
 */
107 static void fast_clear_page(void *page)
108 {
109 int i;
110
111 kernel_fpu_begin();
112
/* mm0 = 0; it is the only data register used by the store loop. */
113 __asm__ __volatile__ (
114 " pxor %%mm0, %%mm0\n" : :
115 );
116
117 for(i=0;i<4096/64;i++)
118 {
119 __asm__ __volatile__ (
120 " movntq %%mm0, (%0)\n"
121 " movntq %%mm0, 8(%0)\n"
122 " movntq %%mm0, 16(%0)\n"
123 " movntq %%mm0, 24(%0)\n"
124 " movntq %%mm0, 32(%0)\n"
125 " movntq %%mm0, 40(%0)\n"
126 " movntq %%mm0, 48(%0)\n"
127 " movntq %%mm0, 56(%0)\n"
128 : : "r" (page) : "memory");
129 page+=64; /* advance to the next 64-byte chunk (GNU C void * arithmetic) */
130 }
131 /* since movntq is weakly-ordered, a "sfence" is needed to become
132 * ordered again.
133 */
134 __asm__ __volatile__ (
135 " sfence \n" : :
136 );
137 kernel_fpu_end();
138 }
139
/*
 * fast_copy_page (CONFIG_MK7 variant) - copy 4096 bytes from 'from' to 'to'.
 *
 * Data is loaded with movq into %mm0-%mm7 and stored with movntq, 64 bytes
 * per iteration. The copy is split in two loops:
 *   - the first (4096-320)/64 = 59 iterations also prefetch 320 bytes ahead
 *     of the current source position;
 *   - the final 5 iterations copy without prefetching, so no prefetch is
 *     issued for an address at or past from + 4096.
 * An sfence follows because movntq is weakly ordered (see comment below).
 * The page size of 4096 bytes is hard-coded.
 */
140 static void fast_copy_page(void *to, void *from)
141 {
142 int i;
143
144 kernel_fpu_begin();
145
146 /* maybe the prefetch stuff can go before the expensive fnsave...
147 * but that is for later. -AV
148 */
/*
 * Warm-up prefetch of source offsets 0..256. The __ex_table entry pairs
 * label 1 with the fixup at label 3, which stores 0x1AEB at label 1
 * (a jmp over 26 bytes, per the comment below) and resumes at label 2.
 * Presumably the fixup runs if the instruction at 1 faults -- confirm
 * against the exception-table handling code.
 */
149 __asm__ __volatile__ (
150 "1: prefetch (%0)\n"
151 " prefetch 64(%0)\n"
152 " prefetch 128(%0)\n"
153 " prefetch 192(%0)\n"
154 " prefetch 256(%0)\n"
155 "2: \n"
156 ".section .fixup, \"ax\"\n"
157 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
158 " jmp 2b\n"
159 ".previous\n"
160 ".section __ex_table,\"a\"\n"
161 " .align 4\n"
162 " .long 1b, 3b\n"
163 ".previous"
164 : : "r" (from) );
165
/* Chunks whose +320 prefetch target still lies inside the source page. */
166 for(i=0; i<(4096-320)/64; i++)
167 {
168 __asm__ __volatile__ (
169 "1: prefetch 320(%0)\n"
170 "2: movq (%0), %%mm0\n"
171 " movntq %%mm0, (%1)\n"
172 " movq 8(%0), %%mm1\n"
173 " movntq %%mm1, 8(%1)\n"
174 " movq 16(%0), %%mm2\n"
175 " movntq %%mm2, 16(%1)\n"
176 " movq 24(%0), %%mm3\n"
177 " movntq %%mm3, 24(%1)\n"
178 " movq 32(%0), %%mm4\n"
179 " movntq %%mm4, 32(%1)\n"
180 " movq 40(%0), %%mm5\n"
181 " movntq %%mm5, 40(%1)\n"
182 " movq 48(%0), %%mm6\n"
183 " movntq %%mm6, 48(%1)\n"
184 " movq 56(%0), %%mm7\n"
185 " movntq %%mm7, 56(%1)\n"
186 ".section .fixup, \"ax\"\n"
187 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
188 " jmp 2b\n"
189 ".previous\n"
190 ".section __ex_table,\"a\"\n"
191 " .align 4\n"
192 " .long 1b, 3b\n"
193 ".previous"
194 : : "r" (from), "r" (to) : "memory");
195 from+=64;
196 to+=64;
197 }
/* Last 320 bytes: same copy sequence, but without the prefetch/fixup. */
198 for(i=(4096-320)/64; i<4096/64; i++)
199 {
200 __asm__ __volatile__ (
201 "2: movq (%0), %%mm0\n"
202 " movntq %%mm0, (%1)\n"
203 " movq 8(%0), %%mm1\n"
204 " movntq %%mm1, 8(%1)\n"
205 " movq 16(%0), %%mm2\n"
206 " movntq %%mm2, 16(%1)\n"
207 " movq 24(%0), %%mm3\n"
208 " movntq %%mm3, 24(%1)\n"
209 " movq 32(%0), %%mm4\n"
210 " movntq %%mm4, 32(%1)\n"
211 " movq 40(%0), %%mm5\n"
212 " movntq %%mm5, 40(%1)\n"
213 " movq 48(%0), %%mm6\n"
214 " movntq %%mm6, 48(%1)\n"
215 " movq 56(%0), %%mm7\n"
216 " movntq %%mm7, 56(%1)\n"
217 : : "r" (from), "r" (to) : "memory");
218 from+=64;
219 to+=64;
220 }
221 /* since movntq is weakly-ordered, a "sfence" is needed to become
222 * ordered again.
223 */
224 __asm__ __volatile__ (
225 " sfence \n" : :
226 );
227 kernel_fpu_end();
228 }
229
230 #else
231
232 /*
233 * Generic MMX implementation without K7 specific streaming
234 */
235
/*
 * fast_clear_page (generic MMX variant) - zero 4096 bytes starting at 'page'.
 *
 * %mm0 is cleared with pxor and then stored with ordinary movq sixteen
 * times per iteration (128 bytes), for 4096/128 = 32 iterations.
 * The page size of 4096 bytes is hard-coded.
 * The work is bracketed by kernel_fpu_begin()/kernel_fpu_end(); presumably
 * these save and restore the FPU/MMX context -- confirm in <asm/i387.h>.
 */
236 static void fast_clear_page(void *page)
237 {
238 int i;
239
240 kernel_fpu_begin();
241
/* mm0 = 0; it is the only data register used by the store loop. */
242 __asm__ __volatile__ (
243 " pxor %%mm0, %%mm0\n" : :
244 );
245
246 for(i=0;i<4096/128;i++)
247 {
248 __asm__ __volatile__ (
249 " movq %%mm0, (%0)\n"
250 " movq %%mm0, 8(%0)\n"
251 " movq %%mm0, 16(%0)\n"
252 " movq %%mm0, 24(%0)\n"
253 " movq %%mm0, 32(%0)\n"
254 " movq %%mm0, 40(%0)\n"
255 " movq %%mm0, 48(%0)\n"
256 " movq %%mm0, 56(%0)\n"
257 " movq %%mm0, 64(%0)\n"
258 " movq %%mm0, 72(%0)\n"
259 " movq %%mm0, 80(%0)\n"
260 " movq %%mm0, 88(%0)\n"
261 " movq %%mm0, 96(%0)\n"
262 " movq %%mm0, 104(%0)\n"
263 " movq %%mm0, 112(%0)\n"
264 " movq %%mm0, 120(%0)\n"
265 : : "r" (page) : "memory");
266 page+=128; /* advance to the next 128-byte chunk (GNU C void * arithmetic) */
267 }
268
269 kernel_fpu_end();
270 }
271
/*
 * fast_copy_page (generic MMX variant) - copy 4096 bytes from 'from' to 'to'.
 *
 * After a warm-up prefetch of the first 320 source bytes, each of the
 * 4096/64 = 64 iterations prefetches 320 bytes ahead and copies 64 bytes
 * as two 32-byte halves through %mm0-%mm3 using plain movq loads/stores.
 * The page size of 4096 bytes is hard-coded.
 *
 * NOTE(review): the prefetch runs on all 64 iterations, so for i >= 59 the
 * prefetch target (i*64 + 320) is at or beyond the end of the source page.
 * The CONFIG_MK7 variant of this function splits its loop to avoid that --
 * confirm whether the same split is wanted here.
 */
272 static void fast_copy_page(void *to, void *from)
273 {
274 int i;
275
276
277 kernel_fpu_begin();
278
/*
 * Warm-up prefetch of source offsets 0..256. The __ex_table entry pairs
 * label 1 with the fixup at label 3, which stores 0x1AEB at label 1
 * (a jmp over 26 bytes, per the comment below) and resumes at label 2.
 * Presumably the fixup runs if the instruction at 1 faults -- confirm
 * against the exception-table handling code.
 */
279 __asm__ __volatile__ (
280 "1: prefetch (%0)\n"
281 " prefetch 64(%0)\n"
282 " prefetch 128(%0)\n"
283 " prefetch 192(%0)\n"
284 " prefetch 256(%0)\n"
285 "2: \n"
286 ".section .fixup, \"ax\"\n"
287 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
288 " jmp 2b\n"
289 ".previous\n"
290 ".section __ex_table,\"a\"\n"
291 " .align 4\n"
292 " .long 1b, 3b\n"
293 ".previous"
294 : : "r" (from) );
295
296 for(i=0; i<4096/64; i++)
297 {
298 __asm__ __volatile__ (
299 "1: prefetch 320(%0)\n"
300 "2: movq (%0), %%mm0\n"
301 " movq 8(%0), %%mm1\n"
302 " movq 16(%0), %%mm2\n"
303 " movq 24(%0), %%mm3\n"
304 " movq %%mm0, (%1)\n"
305 " movq %%mm1, 8(%1)\n"
306 " movq %%mm2, 16(%1)\n"
307 " movq %%mm3, 24(%1)\n"
308 " movq 32(%0), %%mm0\n"
309 " movq 40(%0), %%mm1\n"
310 " movq 48(%0), %%mm2\n"
311 " movq 56(%0), %%mm3\n"
312 " movq %%mm0, 32(%1)\n"
313 " movq %%mm1, 40(%1)\n"
314 " movq %%mm2, 48(%1)\n"
315 " movq %%mm3, 56(%1)\n"
316 ".section .fixup, \"ax\"\n"
317 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
318 " jmp 2b\n"
319 ".previous\n"
320 ".section __ex_table,\"a\"\n"
321 " .align 4\n"
322 " .long 1b, 3b\n"
323 ".previous"
324 : : "r" (from), "r" (to) : "memory");
325 from+=64;
326 to+=64;
327 }
328 kernel_fpu_end();
329 }
330
331
332 #endif
333
334 /*
335 * Favour MMX for page clear and copy.
336 */
337
/*
 * slow_zero_page - zero 4096 bytes at 'page' without using MMX.
 *
 * Executes "rep ; stosl" with eax = 0, ecx = 1024 and edi = page, i.e.
 * 1024 four-byte stores. d0 and d1 are dummy outputs tied to ecx and edi
 * so the compiler knows both registers are modified by the instruction.
 */
338 static void slow_zero_page(void * page)
339 {
340 int d0, d1;
341 __asm__ __volatile__(
342 "cld\n\t" /* clear the direction flag so stosl moves forward */
343 "rep ; stosl"
344 : "=&c" (d0), "=&D" (d1)
345 :"a" (0),"1" (page),"0" (1024)
346 :"memory");
347 }
348
/*
 * mmx_clear_page - zero one page, choosing the implementation by context.
 *
 * Uses slow_zero_page() when in_interrupt() is true and the MMX-based
 * fast_clear_page() otherwise. Presumably the MMX path is avoided in
 * interrupt context because of the FPU state handling it requires --
 * confirm against kernel_fpu_begin().
 */
349 void mmx_clear_page(void * page)
350 {
351 if(in_interrupt())
352 slow_zero_page(page);
353 else
354 fast_clear_page(page);
355 }
356
/*
 * slow_copy_page - copy 4096 bytes from 'from' to 'to' without using MMX.
 *
 * Executes "rep ; movsl" with ecx = 1024, edi = to and esi = from, i.e.
 * 1024 four-byte moves. d0, d1 and d2 are dummy outputs tied to ecx, edi
 * and esi so the compiler knows all three registers are modified.
 */
357 static void slow_copy_page(void *to, void *from)
358 {
359 int d0, d1, d2;
360 __asm__ __volatile__(
361 "cld\n\t" /* clear the direction flag so movsl moves forward */
362 "rep ; movsl"
363 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
364 : "0" (1024),"1" ((long) to),"2" ((long) from)
365 : "memory");
366 }
367
368
/*
 * mmx_copy_page - copy one page, choosing the implementation by context.
 *
 * Uses slow_copy_page() when in_interrupt() is true and the MMX-based
 * fast_copy_page() otherwise. Presumably the MMX path is avoided in
 * interrupt context because of the FPU state handling it requires --
 * confirm against kernel_fpu_begin().
 */
369 void mmx_copy_page(void *to, void *from)
370 {
371 if(in_interrupt())
372 slow_copy_page(to, from);
373 else
374 fast_copy_page(to, from);
375 }
376