Come normalizzare le colonne matriciali in CUDA con le massime prestazioni?

Come normalizzare efficacemente le colonne di matrice in CUDA?Come normalizzare le colonne matriciali in CUDA con le massime prestazioni?

La mia matrice è memorizzata in colonna maggiore e la dimensione tipica è 2000x200.

L'operazione può essere rappresentata nel seguente codice MATLAB.

A = rand(2000,200); 

A = exp(A); 
A = A./repmat(sum(A,1), [size(A,1) 1]);

Questo può essere fatto efficacemente da Thrust, cuBLAS e/o cuNPP?

Un'implementazione rapida che include 4 kernel è indicata come segue.

Chiedere se è possibile eseguire questi in 1 o 2 kernel per migliorare le prestazioni, in particolare per il passaggio di sommazione delle colonne implementato da cublasDgemv().

#include <cuda.h> 
#include <curand.h> 
#include <cublas_v2.h> 
#include <thrust/device_vector.h> 
#include <thrust/device_ptr.h> 
#include <thrust/transform.h> 
#include <thrust/iterator/constant_iterator.h> 
#include <math.h> 

struct Exp 
{ 
    __host__ __device__ void operator()(double& x) 
    { 
     x = exp(x); 
    } 
}; 

struct Inv 
{ 
    __host__ __device__ void operator()(double& x) 
    { 
     x = (double) 1.0/x; 
    } 
}; 

int main() 
{ 
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); 
    cublasHandle_t hd; 
    curandGenerator_t rng; 
    cublasCreate(&hd); 
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT); 

    const size_t m = 2000, n = 200; 
    const double c1 = 1.0; 
    const double c0 = 0.0; 

    thrust::device_vector<double> A(m * n); 
    thrust::device_vector<double> sum(1 * n); 
    thrust::device_vector<double> one(m * n, 1.0); 

    double* pA = thrust::raw_pointer_cast(&A[0]); 
    double* pSum = thrust::raw_pointer_cast(&sum[0]); 
    double* pOne = thrust::raw_pointer_cast(&one[0]); 

    for (int i = 0; i < 100; i++) 
    { 
     curandGenerateUniformDouble(rng, pA, A.size()); 


     thrust::for_each(A.begin(), A.end(), Exp()); 

     cublasDgemv(hd, CUBLAS_OP_T, m, n, 
       &c1, pA, m, pOne, 1, &c0, pSum, 1); 

     thrust::for_each(sum.begin(), sum.end(), Inv()); 

     cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m); 
    } 

    curandDestroyGenerator(rng); 
    cublasDestroy(hd); 

    return 0; 
}

fonte

2013-01-08 kangshiyin

Sì, può essere fatto efficacemente con CUDA. Mostra un codice CUDA che hai scritto per ottenere ciò che desideri. – sgarizvi

codice aggiunto. cercare il miglioramento delle prestazioni – kangshiyin

Ho confrontato le prestazioni di 3 approcci su M2090 con CUDA 5.0.

[173,179 us] cublas attuazione come indicato in questione
[733,734 us] attuazione puro spinta con thrust::reduce_by_key da @talonmies
[1.508 ms] implementazione di spinta pura con thrust::inclusive_scan_by_key

$Performance on A_{2,000 x 200}$

Si può notare che,

cublas ha prestazioni più elevate in questo caso;
entrambi thrust::reduce_by_key & thrust::inclusive_scan_by_key avviare più kernel, che portano a overhead aggiuntivo;
thrust::inclusive_scan_by_key scrive molto più dati su DRAM rispetto a thrust::reduce_by_key, che può essere uno dei motivi per un tempo di kernel più lungo;
la differenza di prestazione principale tra cublas e approccio di spinta è la somma della colonna di matrice. la spinta è più lenta forse perché lo thrust::reduce_by_key è progettato per ridurre i segmenti con una lunghezza variante, ma cublas_gemv() può essere applicato solo ai segmenti a lunghezza fissa (riga/colonna).

Quando la matrice A è abbastanza grande da ignorare il sovraccarico di avvio del kernel, il cublas appoach funziona ancora meglio. Il risultato del profilo su A_ {20.000 x 2.000} viene mostrato come segue.

$Performance on A_{20,000 x 2,000}$

Fondendo la prima operazione for_each con la chiamata cublasSgemv come indicato dalla @talonmies può migliorare ulteriormente le prestazioni, ma credo che il kernel scritto a mano deve essere usato al posto di thrust::reduce_by_key.

Il codice per i 3 approcci è mostrato come segue.

#include <cuda.h> 
#include <curand.h> 
#include <cublas_v2.h> 
#include <thrust/device_vector.h> 
#include <thrust/device_ptr.h> 
#include <thrust/transform.h> 
#include <thrust/reduce.h> 
#include <thrust/scan.h> 
#include <thrust/iterator/counting_iterator.h> 
#include <thrust/iterator/transform_iterator.h> 
#include <thrust/iterator/discard_iterator.h> 
#include <thrust/iterator/permutation_iterator.h> 
#include <math.h> 

struct Exp: public thrust::unary_function<double, double> 
{ 
    __host__ __device__ double operator()(double x) 
    { 
     return exp(x); 
    } 
}; 

struct Inv: public thrust::unary_function<double, double> 
{ 
    __host__ __device__ double operator()(double x) 
    { 
     return (double) 1.0/x; 
    } 
}; 

template<typename T> 
struct MulC: public thrust::unary_function<T, T> 
{ 
    T C; 
    __host__ __device__ MulC(T c) : 
     C(c) 
    { 
    } 
    __host__ __device__ T operator()(T x) 
    { 
     return x * C; 
    } 
}; 

template<typename T> 
struct line2col: public thrust::unary_function<T, T> 
{ 
    T C; 
    __host__ __device__ line2col(T C) : 
      C(C) 
    { 
    } 

    __host__ __device__ T operator()(T i) 
    { 
     return i/C; 
    } 
}; 

int main() 
{ 
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); 
    cublasHandle_t hd; 
    curandGenerator_t rng; 
    cublasCreate(&hd); 
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT); 

    const size_t m = 2000, n = 200; 
    const double c1 = 1.0; 
    const double c0 = 0.0; 

    thrust::device_vector<double> A(m * n); 
    thrust::device_vector<double> B(m * n); 
    thrust::device_vector<double> C(m * n); 
    thrust::device_vector<double> sum1(1 * n); 
    thrust::device_vector<double> sum2(1 * n); 
    thrust::device_vector<double> one(m * n, 1); 

    double* pA = thrust::raw_pointer_cast(&A[0]); 
    double* pB = thrust::raw_pointer_cast(&B[0]); 
    double* pSum1 = thrust::raw_pointer_cast(&sum1[0]); 
    double* pSum2 = thrust::raw_pointer_cast(&sum2[0]); 
    double* pOne = thrust::raw_pointer_cast(&one[0]); 

    curandGenerateUniformDouble(rng, pA, A.size()); 

    const int count = 2; 

    for (int i = 0; i < count; i++) 
    { 
     thrust::transform(A.begin(), A.end(), B.begin(), Exp()); 
     cublasDgemv(hd, CUBLAS_OP_T, m, n, &c1, pB, m, pOne, 1, &c0, pSum1, 1); 
     thrust::transform(sum1.begin(), sum1.end(), sum1.begin(), Inv()); 
     cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pB, m, pSum2, 1, pB, m); 
    } 

    for (int i = 0; i < count; i++) 
    { 
     thrust::reduce_by_key(
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)), 
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(), 
       thrust::make_transform_iterator(A.begin(), Exp()), 
       thrust::make_discard_iterator(), 
       sum2.begin()); 
     thrust::transform(
       A.begin(), A.end(), 
       thrust::make_permutation_iterator(
         sum2.begin(), 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))), 
       C.begin(), 
       thrust::divides<double>()); 
    } 

    for (int i = 0; i < count; i++) 
    { 
     thrust::inclusive_scan_by_key(
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)), 
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(), 
       thrust::make_transform_iterator(A.begin(), Exp()), 
       C.begin()); 
     thrust::copy(
       thrust::make_permutation_iterator(
         C.begin() + m - 1, 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))), 
       thrust::make_permutation_iterator(
         C.begin() + m - 1, 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))) + n, 
       sum2.begin()); 
     thrust::transform(
       A.begin(), A.end(), 
       thrust::make_permutation_iterator(
         sum2.begin(), 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))), 
       C.begin(), 
       thrust::divides<double>()); 
    } 

    curandDestroyGenerator(rng); 
    cublasDestroy(hd); 

    return 0; 
}

fonte

2013-01-09 08:41:15 kangshiyin

Si potrebbe utilizzare ArrayFire nel modo seguente

array A = randu(2000, 2000); 
A = exp(A); 
A /= tile(sum(A, 0), A.dims(0), 1);

Si potrebbe fare questo in spinta pure. Ma se hai intenzione di lavorare con le matrici (al contrario dei semplici vettori), dovresti farlo in un ciclo for che non sarebbe altrettanto efficiente.

DISCLAIMER Sono uno sviluppatore presso Accelereyes, lavorando su arrayfire.

EDIT Sto lavorando alla generazione di nuovi benchmark come richiesto.

EDIT Abbiamo rilevato bug di prestazioni per exp nel nostro codice a causa di questo benchmark. Stiamo rivedendo e risolvendolo.

fonte

2013-01-08 23:24:59

Grazie! È impressionante che il codice possa essere semplice come Matlab. Potresti anche confrontare le prestazioni del tuo codice con il mio? Poiché non ho la lib ArrayFire sulla mano. – kangshiyin

@EricShiyinKang Aggiornato con i risultati. –

Penso che ci sia un problema nel codice di riferimento, che porta al risultato della temporizzazione del pool per l'approccio cublas/thrust. Ecco il modificato [bench.cu] (https://gist.github.com/786a7ea0597be5ab10d3) – kangshiyin

Dovreste essere in grado di fondere la prima operazione for_each con la chiamata cublasSgemv in un unico reduce_by_key chiamata. Se si definisce/ridefinire functors come:

struct Accessor : public thrust::unary_function<int,int> 
{ 
    int lda; 
    __host__ __device__ Accessor(int _lda) : lda(_lda) {}; 
    __host__ __device__ int operator()(const int& idx) 
    { 
     return idx/lda; 
    } 
}; 

struct Exp : public thrust::unary_function<double,double> 
{ 
    __host__ __device__ double operator()(const double& x) 
    { 
     return exp(x); 
    } 
}; 

struct Inv : public thrust::unary_function<double,double> 
{ 
    __host__ __device__ double operator()(const double& x) 
    { 
     return double(1.0)/x; 
    } 
};

È quindi possibile calcolare l'uscita normalizzata come

Accessor columns(m); 
thrust::reduce_by_key(
     thrust::make_transform_iterator(thrust::make_counting_iterator(int(0)), columns), 
     thrust::make_transform_iterator(thrust::make_counting_iterator(int(m*n)), columns), 
     thrust::make_transform_iterator(A.begin(), Exp()), 
     thrust::make_discard_iterator(), 
     sum.begin()); 

thrust::for_each(sum.begin(), sum.end(), Inv()); 

cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m);

[disclaimer: tutto il codice scritto in del browser ed è testato, utilizzare a proprio rischio e pericolo]

Oltre a ridurre il numero di chiamate al kernel, l'uso di iteratori di fantasia elimina la necessità della matrice di unità di grandi dimensioni che dovrebbe ridurre l'impronta di memoria e il numero totale di transazioni di memoria per eseguire le operazioni di somma e di elevazione.

fonte

2013-01-09 03:43:47 talonmies

Gli iteratori sono veramente _fantosi_. Ho confrontato i cubetti e gli approcci di spinta. Sebbene 'thrust :: reduce_by_key' possa richiedere una larghezza di banda di memoria inferiore, è ancora più lento rispetto a' cublasDgemv'. Qualche idea? – kangshiyin

Sospetto che le prestazioni relative dipenderanno molto dalla GPU e dal tipo che usi. Su una GPU diversa che utilizza tipi a 32 bit, è possibile che l'approccio di riduzione sia più vicino nelle prestazioni rispetto all'implementazione di CUBLAS pura.Gli sviluppatori di thrust hanno riconosciuto che la riduzione dello stato dell'arte si è spostata un po 'da quando hanno fatto l'attuale implementazione in spinta, ma in generale il modello di riduzione ad albero sarà sempre meno efficiente di qualcosa di ottimale espresso come un flusso di FMADs, come in questo caso. – talonmies

Vorrei anche suggerire di provare 'thrust :: transform' invece di' thrust_for_each'. In alcuni casi (certamente qualche tempo fa), ho trovato un po 'più veloce di 'for_each'. Ma probabilmente non cambierà molto le prestazioni. – talonmies

Come normalizzare le colonne matriciali in CUDA con le massime prestazioni?

risposta

Problemi correlati