/*
 * Code-alignment micro-benchmark for 32-bit x86 (GNU C, AT&T inline asm).
 *
 * Times a fixed assembly loop ten times with RDTSC and prints the cycle
 * count of every run. The loop is preceded by a block of nops; according
 * to the original author's note, removing four of them makes the loop
 * more than 100% slower.
 *
 * The assembly uses 32-bit absolute addressing of the file-scope variables,
 * so this is meant to be built as a non-PIC i386 executable.
 */
#include <stdio.h>

/* Clock rate (MHz) assumed when converting cycle counts to milliseconds. */
#define CPU_MHZ 166.0

/* Number of timed runs of the assembly loop. */
enum { RUNS = 10 };

/*
 * Low 32 bits of the time-stamp counter.
 * (Not called `time`, which is the name of a C library function.)
 */
typedef unsigned long tsc_count;
typedef unsigned long long stamp64;

/*
 * Read the time-stamp counter.
 *
 * Returns the low 32 bits (EAX) of RDTSC; the high half (EDX) is discarded
 * and therefore declared as clobbered.
 */
static inline tsc_count stamp(void)
{
    tsc_count tsc;

    __asm__ __volatile__("rdtsc" : "=a" (tsc) : : "edx");
    return tsc;
}

/*
 * Cycles elapsed since `t`, where `t` is a value returned by stamp().
 *
 * The difference is taken modulo 2^32, so the result stays correct when the
 * low 32 bits of the counter wrap once between the two readings. The mask
 * keeps this true where unsigned long is wider than 32 bits.
 */
static inline tsc_count measure(tsc_count t)
{
    return (stamp() - t) & 0xFFFFFFFFUL;
}

int c1 = 12345678;
int d1;

/*
 * Operands of the timed loop. They are referenced only from inline
 * assembly, which the compiler cannot see, so they are marked `used` to
 * keep them from being discarded as unreferenced.
 */
static long long var1 __attribute__ ((used, aligned(32)));
static long long var2 __attribute__ ((used));
#if 1
static long long dummy1 __attribute__ ((used)); /* These two make var3 and var4 be outside */
static long long dummy2 __attribute__ ((used)); /* the cache line when reading var1 */
#endif
static long long var3 __attribute__ ((used));
static long long var4 __attribute__ ((used));

int x[64];

/*
 * Run the timed loop RUNS times, then print one line per run with the
 * cycle count and the equivalent time in milliseconds at CPU_MHZ.
 */
int main(void)
{
    tsc_count s;
    tsc_count si[RUNS];
    int i;
    long double *p;

    /*
     * Store a long double at an 8-byte aligned address inside x[].
     * NOTE(review): this writes a long double through storage declared as
     * int[], and nothing reads x afterwards; kept as in the original.
     */
    p = (void *)&x;
    while ((unsigned long)p & 7)
        p++;
    p++;
    *p = 1234.567E123;

    /*
     * Debug aid: uncomment to see where the variables ended up.
     *
     * printf("p=%p\n", (void *)p);
     * printf("var1: %p\nvar2: %p\nvar3: %p\nvar4: %p\n",
     *        (void *)&var1, (void *)&var2, (void *)&var3, (void *)&var4);
     */

    for (i = 0; i < RUNS; i++) {
        int a;
        int c[10];

        for (a = 0; a < 10; a++)
            c[a] = -5;

        s = stamp();
        __asm__ __volatile__(
            "    not   %%esi\n"
            "    nop\n"
            "    nop\n"
            "    .align 16\n"
            "    nop\n"
            "    nop\n"
            "    nop\n"
            "    nop\n"
            /*
             * Comment these four nops out, and suddenly the loop is
             * > 100% slower!
             */
            "    nop\n"
            "    nop\n"
            "    nop\n"
            "    nop\n"
            "    movl  $30000, %%ecx\n"     /* Loop this many times */
            "    .align 16\n"
            /*
             * Here is the loop (just some nonsense code). A numeric local
             * label is used so the template can be emitted more than once
             * without a duplicate-symbol error.
             */
            "1:\n"
            "    movl  var1, %%edx\n"
            "    movl  var1+4, %%eax\n"
            "    movl  var2, %%ebx\n"
            "    andl  %%edx, %%ebx\n"
            "    movl  var2+4, %%esi\n"
            "    andl  %%eax, %%esi\n"
            "    movl  %%ebx, var3\n"
            "    movl  %%esi, var3+4\n"
            "    movl  %%edx, %%eax\n"
            "    addl  $1, %%eax\n"
            "    movl  %%eax, var4\n"
            "    movl  %%ebx, %%edx\n"
            "    adcl  $0, %%eax\n"
            "    movl  %%eax, var4+4\n"
            "    decl  %%ecx\n"
            "    jnz   1b\n"
            "    nop\n"
            "    nop\n"
            "    not   %%esi\n"
            : /* no outputs */
            : /* no inputs */
            /* "memory": the loop stores to var3 and var4 behind the compiler's back. */
            : "eax", "ebx", "edx", "ecx", "esi", "edi", "cc", "memory"
        );
        si[i] = measure(s);
    }

    for (i = 0; i < RUNS; i++) {
        /* cycles / (CPU_MHZ * 1000) gives milliseconds. */
        printf("%5lu cycles %5f ms\n", si[i],
               ((float)si[i] * (1000.0 / CPU_MHZ)) / 1000000.0);
    }

    return 0;
}