OK è giunto alla conclusione che non era idea di cercare di essere 'intelligente', io in panchina:
il costruito nel popcount intrinseca: _mm_popcnt_u64
bmi2: __tzcnt_u64(~_pext_u64(data[i],data[i]));
contro tre funzioni assembler
popcnt, bmi2 e avx2.
Tutti corrono alla velocità è possibile spostare la memoria dentro e fuori del mio:
cat /proc/cpuinfo
con Intel (R) Xeon (R) CPU E3-1275 v3 @ 3.50GHz
FYI:
principale.c:
// Hamming weight bench
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <smmintrin.h>
#include <immintrin.h>
#include <x86intrin.h>
#include <math.h>
#define DISPLAY_HEIGHT 4
#define DISPLAY_WIDTH 32
#define NUM_DATA_OBJECTS 40000000
#define ITTERATIONS 20
// The source data (+32 to avoid the quantization out of memory problem)
__attribute__ ((aligned(32))) static long long unsigned data[NUM_DATA_OBJECTS+32]={};
__attribute__ ((aligned(32))) static long long unsigned data_out[NUM_DATA_OBJECTS+32]={};
__attribute__ ((aligned(32))) static unsigned char k1[32*3]={
0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03,0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};
extern "C" {
void popcnt_popcnt(long long unsigned[],unsigned int,long long unsigned[]);
void popcnt_bmi2(long long unsigned[],unsigned int,long long unsigned[]);
void popcnt_avx2(long long unsigned[],unsigned int,long long unsigned[],unsigned char[]);
}
void populate_data()
{
for(unsigned int i = 0; i < NUM_DATA_OBJECTS; i++)
{
data[i] = rand();
}
}
void display_source_data()
{
printf ("\r\nData in(start):\r\n");
for (unsigned int j = 0; j < DISPLAY_HEIGHT; j++)
{
for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
{
printf ("0x%02llux,",data[i+(j*DISPLAY_WIDTH)]);
}
printf ("\r\n");
}
}
void bench_popcnt()
{
for(unsigned int i = 0; i < NUM_DATA_OBJECTS; i++)
{
data_out[i] = _mm_popcnt_u64(data[i]);
}
}
void bench_move_data_memcpy()
{
memcpy(data_out,data,NUM_DATA_OBJECTS*8);
}
// __tzcnt64 ??
void bench_bmi2()
{
for(unsigned int i = 0; i < NUM_DATA_OBJECTS; i++)
{
data_out[i]=__tzcnt_u64(~_pext_u64(data[i],data[i]));
}
}
void display_dest_data()
{
printf ("\r\nData out:\r\n");
for (unsigned int j = 0; j < DISPLAY_HEIGHT; j++)
{
for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
{
printf ("0x%02llux,",data_out[i+(j*DISPLAY_WIDTH)]);
}
printf ("\r\n");
}
}
int main() {
struct timeval t0;
struct timeval t1;
long elapsed[ITTERATIONS]={0};
long avrg=0;
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_move_data_memcpy();
gettimeofday(&t1, 0);
elapsed[i]= (((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000);
printf ("Time_to_move_data_without_processing: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average time_to_move_data: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_popcnt();
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("popcnt: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average popcnt: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
bench_bmi2();
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("bmi2: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average bmi2: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
printf ("Now test the assembler functions\n");
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_popcnt(data,NUM_DATA_OBJECTS,data_out);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("popcnt_asm: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average popcnt_asm: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_bmi2(data,NUM_DATA_OBJECTS,data_out);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("bmi2_asm: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average bmi2_asm: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
for (unsigned int i = 0; i < ITTERATIONS; i++)
{
populate_data();
// display_source_data();
gettimeofday(&t0, 0);
popcnt_avx2(data,(unsigned int)ceil((NUM_DATA_OBJECTS*8)/32.0),data_out,k1);
gettimeofday(&t1, 0);
elapsed[i] = ((t1.tv_sec-t0.tv_sec)*1000000 + t1.tv_usec-t0.tv_usec)/1000;
printf ("avx2_asm: %ld\n",elapsed[i]);
}
avrg=0;
for (unsigned int i = 1; i < ITTERATIONS; i++){
avrg+=elapsed[i];
}
printf ("Average avx2_asm: %ld\n",avrg/(ITTERATIONS-1));
//display_dest_data();
return 0;
}
I engine.s
//
// avx2_bmi2_popcnt bench
//
.global popcnt_bmi2 , popcnt_avx2, popcnt_popcnt
.align 2
//64-bit popcnt using the built-in popcnt instruction
popcnt_popcnt:
popcntq (%rdi), %r11
mov %r11,(%rdx)
add $8,%rdi
add $8,%rdx
dec %rsi
jnz popcnt_popcnt
ret
//64-bit popcnt using BMI2
popcnt_bmi2:
mov (%rdi),%r11
pextq %r11,%r11,%r11
not %r11
tzcnt %r11,%r11
mov %r11,(%rdx)
add $8,%rdi
add $8,%rdx
dec %rsi
jnz popcnt_bmi2
ret
//64-bit popcnt using AVX2
popcnt_avx2:
vmovdqa (%rcx),%ymm2
add $0x20,%rcx
vmovdqa (%rcx),%ymm3
add $0x20,%rcx
vmovdqa (%rcx),%ymm4
popcnt_avx2_loop:
vmovdqa (%rdi),%ymm0
vpand %ymm0, %ymm2, %ymm1
vpandn %ymm0, %ymm2, %ymm0
vpsrld $4,%ymm0, %ymm0
vpshufb %ymm1, %ymm3, %ymm1
vpshufb %ymm0, %ymm3, %ymm0
vpaddb %ymm1,%ymm0,%ymm0
vpsadbw %ymm0,%ymm4,%ymm0
vmovdqa %ymm0,(%rdx)
add $0x20,%rdi
add $0x20,%rdx
dec %rsi
jnz popcnt_avx2_loop
ret
compilare le sorgenti:
g++ -march=native -mavx -mpopcnt -O3 main.c engine.s
impostare la CPU di prestazioni:
cpufreq-set -g performance
Run banco:
sudo chrt -r 10 ./a.out
risultati:
time_to_move_data medio: 61
POPCNT medio: 61
bmi2 medio: 61
Ora testare le funzioni assembler
popcnt_asm medio: 61
bmi2_asm medio: 61
avx2_asm medio: 61
Credo che la vera domanda è: perché si blocca 'popcnt'? Il tuo processore ce l'ha. È disabilitato tramite alcune configurazioni VM o BIOS? – Mysticial
Arresta in modo anomalo se si utilizzano i build incorporati anziché l'assembly gestito manualmente? Ad esempio GCC offre '__builtin_popcountll'. – peppe
@peppe che si compila comunque in un 'popcnt', quindi qual è la differenza? – harold