Subj : Re: CMPXCHG timing
To   : comp.programming.threads
From : Michael Pryhodko
Date : Mon Apr 04 2005 07:43 pm

> > Somehow I could get to the "we should not be here" line on my
> > P4 2.8GHz HT -- which is totally confusing to me: only the current
> > thread ever writes this specific unique id into the m_lock variable,
> > and since we are at the beginning of lock(), m_lock should be !=
> > curr_thread_id.

It looks like I was simply blind. I have already found the source of my
problems: if CMPXCHG fails, it still writes the old value back to its
destination operand -- the comparison only decides which value gets
stored, not whether a store happens.

> I haven't totally studied your idea, but from what I can immediately
> gather it seems like you're going for a simple spinlock without using
> the LOCK prefix. Is that about it?

Yes, and your simplified version of the lock operation is correct.

> And your unlock function has:
>
> store 0 into shared
> ( store/store )

Right.

> So, the lock function has 3 barriers ( two of which are expensive
> store/loads ) and 2 non-atomic CAS ( one is to shared mem ) on the
> fast-path, and the unlock function has one store to shared and 1
> barrier on the fast-path. Are you sure this is vastly more efficient
> than just using the simple LOCK prefix? I have never tried messing
> around with cmpxchg on an SMP/HT system without using the lock
> prefix...

I have already expressed my thoughts on how this algorithm compares
with a simple LOCK-based lock:

1. LOCK may lock the system bus -- this can significantly affect the
   whole system, and the more devices (e.g. processors) use the bus,
   the worse the effect becomes. SFENCE does not have this disadvantage
   (AFAIK).

2. AFAIK the cost of SFENCE depends on the size of the store buffer,
   which means that in

     sfence
     mov mem_addr, smth
     sfence

   the second SFENCE will complete much faster than the first, since by
   then the store buffer holds only a single pending store.

3. The sfence in unlock() could be omitted.

4. The first sfence in lock() is there only to make the execution time
   of the second one more predictable (this is the only "fragile" part
   of the x86 implementation of this algorithm -- the part I am not
   100% sure about).

Now, about the "vast" improvement in speed: I wrote a small test
program (see below) that counts the number of lock/unlock pairs
completed in 3 million processor clocks.

NOTE: I replaced cmpxchg with 'if (lock == 0) lock = Pi' (Pi being the
calling thread's unique id), since I think it is impossible to
implement this algorithm on x86 due to the redundant store in cmpxchg.
The probability that the OS will preempt the thread right between the
'compare' and the 'store' is very low (but it did happen once or
twice).

RESULTS: I did 5 runs of each case on my single-processor P4 2.8GHz HT
(i.e. two logical processors):

  LOCK-based lock                          -- 1493 on average
  my lock                                  -- 2039 on average
  my lock with cmpxchg and with 'unlock'
  augmented to work on a 2-processor PC    -- 1045 on average

CONCLUSION: my lock is potentially 25-27% faster than the usual
LOCK-based lock (but unfortunately it cannot be implemented on x86 due
to the cmpxchg write-back); my augmented lock for a 2-processor PC
performs 2 times slower.
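To make the cmpxchg problem concrete, here is the interleaving that was
producing my original "we should not be here" assert -- a
reconstruction on my part, not a captured trace. T1 and T2 run on
different logical processors, and no instruction carries a LOCK prefix:

  // m_lock == id1: thread T1 currently owns the lock
  //
  // T2 (in lock()):   cmpxchg [m_lock], id2
  //                     - reads id1, comparison with EAX == 0 fails
  //                     - but the instruction still issues its store,
  //                       writing the old value id1 back to m_lock
  // T1 (in unlock()): mov [m_lock], 0      // releases the lock
  //
  // Without LOCK the read and the write-back of cmpxchg are separate
  // bus transactions, so T1's store of 0 can land between them. T2's
  // write-back then resurrects id1 -- and when T1 re-enters lock() it
  // finds its own id already in m_lock.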
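For reference, here is the MOV-based stand-in expressed with compiler
intrinsics instead of inline asm. This is only a sketch of the idea --
the code I actually measured is the __asm version below, and here I
have folded the dummy CAS into a plain MFENCE. _mm_sfence / _mm_mfence
/ _mm_pause (from <emmintrin.h>) are the intrinsic spellings of SFENCE,
MFENCE and PAUSE:

--------------------------------------------------------------
#include <emmintrin.h>  // _mm_sfence / _mm_mfence / _mm_pause

volatile long m_lock = 0;

void lock_sketch(long dwId)
{
    _mm_mfence();               // drain the store buffer up front so
                                // the SFENCE below stays cheap
    for (;;)
    {
        while (m_lock != 0)     // spin until the lock looks free
            _mm_pause();

        m_lock = dwId;          // compare and store are two separate
                                // steps -- the non-atomic window the
                                // test deliberately accepts
        _mm_sfence();           // push our id out to memory

        _mm_mfence();           // stand-in for the dummy CAS + MFENCE
                                // of the original

        if (m_lock == dwId)     // did our id survive the race window?
            return;             // yes -- the lock is ours
                                // no -- someone overwrote it, retry
    }
}

void unlock_sketch(long /*dwId*/)
{
    m_lock = 0;                 // release the lock
    _mm_sfence();               // could be omitted (point 3 above)
}
--------------------------------------------------------------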
Well, as I already told David, it could potentially execute infinitely :))

Here is the code:
--------------------------------------------------------------
#include <windows.h>
#include <process.h>
#include <iostream>
#include <tchar.h>
#include "tick_cnt.h"

#define THREAD_COUNT (4)

//////////////////////////////////////////////////
// mtlock class
class mtlock
{
    __declspec(align(32)) LONG m_lock;

public:
    mtlock() throw() : m_lock(0) {}
    ~mtlock() throw() {}

    void lock(LONG dwId) throw()
    {
        LONG dummy = 5;
        __asm
        {
            mov edx, dwId
            mov ebx, this

            mfence

            //////////////////////////////////////////////////////////
            //
            /*
            // CAS(&m_lock, 0, dwId)
        ENTER_LOCK:
            xor eax, eax
            cmpxchg [ebx]this.m_lock, edx
            sfence

            // if (successful)
            je PROCEED
            pause
            jmp ENTER_LOCK

        PROCEED:
            */

            xor eax, eax
        ENTER_LOCK:
            cmp eax, [ebx]this.m_lock
            je PUT_VAL
            pause
            jmp ENTER_LOCK

        PUT_VAL:
            mov [ebx]this.m_lock, edx
            sfence
            //
            //////////////////////////////////////////////////////////

            // CAS(&dummy, dummy, ~dummy)
            mov ecx, dummy
            mov eax, ecx
            neg ecx
            cmpxchg dummy, ecx

            mfence

            // if (m_lock == dwId)
            cmp [ebx]this.m_lock, edx
            je EXIT_LOCK
            pause
            jmp ENTER_LOCK

        EXIT_LOCK:
        }
    }

    void unlock(LONG dwId) throw()
    {
        // LONG dummy = 5;
        __asm
        {
            mov ebx, this
            mov [ebx]this.m_lock, 0
            sfence

            /*
            mov ebx, this
            mov edx, dwId

        ENTER_UNLOCK:
            mov [ebx]this.m_lock, 0
            sfence

            // CAS(&dummy, dummy, ~dummy)
            mov ecx, dummy
            mov eax, ecx
            neg ecx
            cmpxchg dummy, ecx

            mfence

            // if (m_lock == dwId)
            cmp [ebx]this.m_lock, edx
            jne EXIT_UNLOCK
            pause
            jmp ENTER_UNLOCK

        EXIT_UNLOCK:
            */
        }
    }
};

//////////////////////////////////////////////////
// mtlock_old class -- LOCK-based version
class mtlock_old
{
    __declspec(align(32)) LONG m_lock;

public:
    mtlock_old() throw() : m_lock(0) {}
    ~mtlock_old() throw() {}

    void lock(LONG dwId) throw()
    {
        __asm
        {
            mov ebx, this
            mov ecx, 1

        ENTER_LOCK:
            xor eax, eax
            lock cmpxchg [ebx]this.m_lock, ecx

            // if (successful)
            je PROCEED
            pause
            jmp ENTER_LOCK

        PROCEED:
        }
    }

    void unlock(LONG dwId) throw()
    {
        __asm
        {
            mov ebx, this
            mov [ebx]this.m_lock, 0
            sfence
        }
    }
};

//////////////////////////////////////////////////
// global variables
//mtlock_old lock;
mtlock lock;

LONG volatile flag = 0;                     // used to check lock validity
LONG volatile g_stat[THREAD_COUNT] = {0};   // enter/leave stats
HANDLE hStartEvent = NULL;

#define ASSERT(x) if (!(x)) { __asm { int 3 }; }

void __cdecl thread_proc(void* pData)
{
    LONG dwThreadId = GetCurrentThreadId();
    LONG idx = (LONG)pData;

    ASSERT( WaitForSingleObject(hStartEvent, INFINITE) == WAIT_OBJECT_0 );
    for(;;)
    {
        lock.lock(dwThreadId);

        ASSERT(flag == 0);
        ASSERT( InterlockedIncrement(&flag) == 1 );
        ASSERT(flag == 1);
        ASSERT( InterlockedDecrement(&flag) == 0 );
        ASSERT(flag == 0);
        ++g_stat[idx];

        lock.unlock(dwThreadId);
        // Sleep(0);
    }
}

int _tmain(int argc, _TCHAR* argv[])
{
    hStartEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
    ASSERT(hStartEvent != NULL);

    for(int i = 0; i < THREAD_COUNT; ++i)
    {
        _beginthread(&thread_proc, 0, (void*)i);
    }

    Sleep(1000);   // wait for all threads to enter WaitForSingleObject

    CTickCounter timer;                  // start measurements
    ASSERT( SetEvent(hStartEvent) );     // signal to start working
    Sleep(1000);                         // working time
    lock.lock(GetCurrentThreadId());     // stop work
    ULONG time_used = timer.Stop();      // stop measurements

    UINT64 unCount = 0;
    for(int i = 0; i < THREAD_COUNT; ++i)
    {
        std::cout << i << " " << g_stat[i] << std::endl;
        unCount += g_stat[i];
    }
    lock.unlock(GetCurrentThreadId());

    std::cout << "Total: " << unCount << std::endl;
    std::cout << "Performance: " << double(unCount)/time_used << std::endl;
    return 0;
}

------------------- tick_cnt.h -------------------------------
#pragma once
#ifndef TICK_CNT_H__5_04_2005__9_53_50__AM
#define TICK_CNT_H__5_04_2005__9_53_50__AM

#pragma pack(push, 1)
#pragma warning(push)
#pragma warning(disable: 4537)

class CTickCounter
{
    ULONG m_dwLow, m_dwHigh;

public:
    CTickCounter() throw()
    { Start(); }

    void Start() throw()
    {
        __asm
        {
            CPUID                       // serialize before reading TSC
            RDTSC
            mov EBX, [this].m_dwLow     // m_dwLow is at offset 0, so
                                        // this loads the object address
            mov dword ptr [EBX], EAX
            add EBX, 4
            mov dword ptr [EBX], EDX
        }
    }

    // returns elapsed ticks divided by dwMHZ -- with the default of
    // 3*1000*1000 the result is in units of 3 million processor clocks
    ULONG Stop(ULONG dwMHZ = 3*1000*1000 /* = GetProcFrequency()*/) throw()
    {
        __asm
        {
            CPUID
            RDTSC
            mov EBX, [this].m_dwLow
            mov ECX, EBX
            add ECX, 4
            sub EAX, dword ptr [EBX]
            sbb EDX, dword ptr [ECX]
            div dwMHZ                   // result is returned in EAX
        }
    }
};

#pragma warning(pop)
#pragma pack(pop)

#endif //TICK_CNT_H__5_04_2005__9_53_50__AM
--------------------------------------------------------------------

Bye.
Sincerely yours, Michael.