Bonita Montero
2024-09-13 16:45:41 UTC
#include <atomic>
#include <barrier>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>
#if defined(_WIN32)
#include <intrin.h>
#elif defined(__linux__)
#include <x86intrin.h>
#endif
using namespace std;
int main()
{
unsigned hc = thread::hardware_concurrency();
barrier bar( hc );
atomic_uint synch( hc );
atomic_uint64_t zero( 0 );
atomic_int64_t diffs( 0 );
auto thr = [&]()
{
int64_t sum = 0;
for( unsigned t = 1'000; t; --t )
{
bar.arrive_and_wait();
if( synch.fetch_sub( 1, memory_order_relaxed ) > 1 )
while( synch.load( memory_order_relaxed ) );
uint64_t tsc = __rdtsc(), expected = 0;
if( !zero.compare_exchange_weak( expected, tsc, memory_order_relaxed,
memory_order_relaxed ) )
sum += abs( (int64_t)(expected - tsc) );
bar.arrive_and_wait();
synch.store( hc );
zero.store( 0, memory_order_relaxed );
}
diffs.fetch_add( sum, memory_order_relaxed );
};
vector<jthread> threads;
threads.reserve( hc - 1 );
for( unsigned t = hc - 1; t; --t )
threads.emplace_back( thr );
thr();
threads.resize( 0 );
cout << (double)diffs.load( memory_order_relaxed ) / (1'000.0 * hc) <<
endl;
}
My PC is an AMD 7950X 16-core system.
#include <barrier>
#include <thread>
#include <vector>
#if defined(_WIN32)
#include <intrin.h>
#elif defined(__linux__)
#include <x86intrin.h>
#endif
using namespace std;
int main()
{
unsigned hc = thread::hardware_concurrency();
barrier bar( hc );
atomic_uint synch( hc );
atomic_uint64_t zero( 0 );
atomic_int64_t diffs( 0 );
auto thr = [&]()
{
int64_t sum = 0;
for( unsigned t = 1'000; t; --t )
{
bar.arrive_and_wait();
if( synch.fetch_sub( 1, memory_order_relaxed ) > 1 )
while( synch.load( memory_order_relaxed ) );
uint64_t tsc = __rdtsc(), expected = 0;
if( !zero.compare_exchange_weak( expected, tsc, memory_order_relaxed,
memory_order_relaxed ) )
sum += abs( (int64_t)(expected - tsc) );
bar.arrive_and_wait();
synch.store( hc );
zero.store( 0, memory_order_relaxed );
}
diffs.fetch_add( sum, memory_order_relaxed );
};
vector<jthread> threads;
threads.reserve( hc - 1 );
for( unsigned t = hc - 1; t; --t )
threads.emplace_back( thr );
thr();
threads.resize( 0 );
cout << (double)diffs.load( memory_order_relaxed ) / (1'000.0 * hc) <<
endl;
}
My PC is an AMD 7950X 16-core system.