/*
 * Micro-benchmark: times a hand-written MMX block copy against the C
 * library's memcpy() on two statically allocated, 32-byte aligned buffers,
 * and prints the elapsed clock() ticks for each.
 */
#include <stdio.h>
#include <string.h>
#include <time.h>

enum {
    BUF_SIZE   = 4000,   /* bytes copied per call (a multiple of 32) */
    ITERATIONS = 200000  /* copies performed per timed run */
};

/*
 * Copy len bytes from src to dest; the regions must not overlap.
 *
 * On x86 with a GCC-compatible compiler, whole 32-byte blocks are moved
 * through MMX registers mm0-mm3 and any remaining tail (len % 32) is
 * finished with memcpy().  On every other target the whole copy falls back
 * to memcpy() so the program still builds and runs.
 */
static inline void mmx_copy_blocks(void *dest, const void *src, size_t len)
{
    unsigned char *d = dest;
    const unsigned char *s = src;
    size_t blocks = len >> 5;           /* number of whole 32-byte blocks */
    const size_t tail = len & 31;       /* bytes left after the last block */

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    /* A zero count must skip the loop: dec/jnz would otherwise wrap. */
    if (blocks != 0) {
        __asm__ __volatile__ (
            "1:\n\t"
            "movq   (%[s]), %%mm0\n\t"
            "movq  8(%[s]), %%mm1\n\t"
            "movq 16(%[s]), %%mm2\n\t"
            "movq 24(%[s]), %%mm3\n\t"
            /* Load from the destination and discard the value; presumably
             * meant to pull the destination line into cache before it is
             * overwritten -- TODO confirm it actually helps. */
            "movq   (%[d]), %%mm4\n\t"
            "movq %%mm0,   (%[d])\n\t"
            "movq %%mm1,  8(%[d])\n\t"
            "movq %%mm2, 16(%[d])\n\t"
            "movq %%mm3, 24(%[d])\n\t"
            "add  $32, %[s]\n\t"
            "add  $32, %[d]\n\t"
            "dec  %[n]\n\t"
            "jnz  1b\n\t"
            /* Leave MMX mode so later x87 floating-point code works. */
            "emms\n\t"
            /* The pointers and the counter are modified by the loop, so
             * they are read-write operands rather than inputs. */
            : [d] "+r" (d), [s] "+r" (s), [n] "+r" (blocks)
            :
            : "mm0", "mm1", "mm2", "mm3", "mm4", "memory", "cc");
    }
#else
    memcpy(d, s, blocks * 32);
    d += blocks * 32;
    s += blocks * 32;
#endif

    if (tail != 0) {
        memcpy(d, s, tail);
    }
}

/* Kept as a macro so existing call sites `mmxcpy(dest, src, len);` still work. */
#define mmxcpy(dest, src, len) mmx_copy_blocks((dest), (src), (len))

static unsigned char a[BUF_SIZE] __attribute__ ((aligned(32)));
static unsigned char b[BUF_SIZE] __attribute__ ((aligned(32)));

/*
 * Spin until the value returned by clock() changes, then return a fresh
 * reading, so that a timed run starts right after a tick boundary.
 */
static clock_t wait_for_tick(void)
{
    const clock_t start = clock();

    while (start == clock()) {
        /* busy-wait */
    }
    return clock();
}

/*
 * Run both copy routines ITERATIONS times over BUF_SIZE bytes (b -> a) and
 * print the elapsed clock() ticks of each run.
 */
int main(void)
{
    clock_t t1, t2;
    int i;

    t1 = wait_for_tick();
    for (i = 0; i < ITERATIONS; i++) {
        mmxcpy(a, b, BUF_SIZE);
    }
    t2 = clock();
    /* clock_t has no printf conversion of its own; print it as a long. */
    printf("mmxcpy: %ld\n", (long)(t2 - t1));

    t1 = wait_for_tick();
    for (i = 0; i < ITERATIONS; i++) {
        memcpy(a, b, BUF_SIZE);
    }
    t2 = clock();
    printf("memcpy: %ld\n", (long)(t2 - t1));

    return 0;
}