Codirò il mio primo progetto CUDA relativamente grande come Ottimizzazione della pendenza per scopi di apprendimento automatico. Vorrei trarre beneficio dalla saggezza del pubblico riguardo alcune utili funzioni native del CUDA che potrebbero essere scorciatoie da utilizzare nel progetto. Qualche idea/suggerimento?Ottimizzazione della discesa dei gradienti in CUDA
risposta
Gradient Descent (alias ripida discesa) mira a trovare un minimo locale di una funzione multivariata F(x)
adottando misure proporzionali al negativo del gradiente di F(x)
nel punto corrente. La regola di aggiornamento è il seguente:
cui la dimensione passogamma_n
può cambiare ad ogni passo e può essere determinato, per esempio, da linea ricerche.
L'implementazione della regola di aggiornamento sopra descritta in CUDA è piuttosto semplice. Di seguito, fornisco un esempio completo utilizzando la funzione Rosenbrock come costo funzionale da ottimizzare, sfruttando il gradiente analitico e considerando un valore costante per la dimensione del passo attraverso le iterazioni (ovvero, gamma_n = gamma
). I file Utilities.cu
e Utilities.cuh
vengono mantenuti a OrangeOwlSolutions/CUDA_Utilities e qui omessi. L'esempio implementa la CPU e l'approccio GPU.
**kernel.cu**
#include <stdio.h>
#include <float.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "GradientDescentCPU.h"
#include "GradientDescentGPU.cuh"
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main()
{
/********************/
/* INPUT PARAMETERS */
/********************/
// --- Number of unknowns
const int M = 5;
// --- Starting point
float *h_x0 = (float*)malloc(M * sizeof(float));
for (int i=0; i<M; i++) h_x0[i] = 1.2f;
// --- Termination tolerance
const float tol = 1.e-6;
// --- Maximum number of allowed iterations
const int maxiter = 10000;
// --- Step size
const float alpha = 0.001f;
// --- Derivative step
const float h = 0.0001f;
// --- Minimum allowed perturbations
const float dxmin = 1e-5;
/*********************/
/* OUTPUT PARAMETERS */
/*********************/
// --- Optimal point
float* h_xopt = (float*)malloc(M * sizeof(float));
for (int i=0; i<M; i++) h_xopt[i] = 0.f;
// --- Optimal functional
float fopt = 0.f;
// --- Number of performed iterations
int niter = 0;
// --- Gradient norm at optimal point
float gnorm = 0.f;
// --- Distance between last and penultimate solutions found
float dx = 0.f;
/***************************/
/* OPTIMIZATION - CPU CASE */
/***************************/
GradientDescentCPU(h_x0, tol, maxiter, alpha, h, dxmin, M, h_xopt, &fopt, &niter, &gnorm, &dx);
printf("Solution found - CPU case:\n");
printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx);
printf("\n\n");
#ifdef VERBOSE
printf("Found minimum - CPU case:\n");
for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]);
printf("\n\n");
#endif
/***************************/
/* OPTIMIZATION - GPU CASE */
/***************************/
// --- Starting point
float *d_x0; gpuErrchk(cudaMalloc((void**)&d_x0, M * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x0, h_x0, M * sizeof(float), cudaMemcpyHostToDevice));
// --- Optimal point
float *d_xopt; gpuErrchk(cudaMalloc((void**)&d_xopt, M * sizeof(float)));
GradientDescentGPU(d_x0, tol, maxiter, alpha, h, dxmin, M, d_xopt, &fopt, &niter, &gnorm, &dx);
printf("Solution found - GPU case:\n");
printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx);
printf("\n\n");
#ifdef VERBOSE
gpuErrchk(cudaMemcpy(h_xopt, d_xopt, M * sizeof(float), cudaMemcpyDeviceToHost));
printf("Found minimum - GPU case:\n");
for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]);
printf("\n\n");
#endif
return 0;
}
GradientDescentCPU.cu
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "GradientDescentGPU.cuh"
/*******************************/
/* GRADIENT DESCENT - CPU CASE */
/*******************************/
// --- Version using finite differences
//void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) {
//
// for (int i=0; i<M; i++) {
// h_x[i] = h_x[i] + h/2.f;
// h_g[i] = CostFunction(h_x, M);
// h_x[i] = h_x[i] - h;
// h_g[i] = (h_g[i] - CostFunction(h_x, M))/(2.f * h);
// h_x[i] = h_x[i] + h/2.f;
// }
//}
// --- Version using analytical gradient (Rosenbrock function)
void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) {
h_g[0] = -400.f * (h_x[1] - h_x[0] * h_x[0]) * h_x[0] + 2.f * (h_x[0] - 1.f);
for (int i=1; i<M-1; i++) {
h_g[i] = -400.f * h_x[i] * (h_x[i+1] - h_x[i] * h_x[i]) + 2.f * (h_x[i] - 1.f) + 200.f * (h_x[i] - h_x[i-1] * h_x[i-1]);
}
h_g[M-1] = 200.f * (h_x[M-1] - h_x[M-2] * h_x[M-2]);
}
/********/
/* NORM */
/********/
float normCPU(const float * __restrict h_x, const int M) {
float sum = 0.f;
for(int i=0; i<M; i++) sum = sum + h_x[i] * h_x[i];
return sqrt(sum);
}
/****************************************/
/* GRADIENT DESCENT FUNCTION - CPU CASE */
/****************************************/
// x0 - Starting point
// tol - Termination tolerance
// maxiter - Maximum number of allowed iterations
// alpha - Step size
// dxmin - Minimum allowed perturbations
void GradientDescentCPU(const float * __restrict h_x0, const float tol, const int maxiter, const float alpha, const float h, const float dxmin, const int M,
float * __restrict h_xopt, float *fopt, int *niter, float *gnorm, float *dx) {
// --- Initialize gradient norm, optimization vector, iteration counter, perturbation
*gnorm = FLT_MAX;
float *h_x = (float *)malloc(M * sizeof(float));
for (int i=0; i<M; i++) h_x[i] = h_x0[i];
*niter = 0;
*dx = FLT_MAX;
// --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions
float *h_g = (float *)malloc(M * sizeof(float));
float *h_xnew = (float *)malloc(M * sizeof(float));
float *h_xdiff = (float *)malloc(M * sizeof(float));
// --- Gradient Descent iterations
while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) {
// --- Calculate gradient
CostFunctionGradientCPU(h_x, h_g, h, M);
*gnorm = normCPU(h_g, M);
// --- Take step:
for (int i=0; i<M; i++) h_xnew[i] = h_x[i] - alpha * h_g[i];
// --- Update termination metrics
*niter = *niter + 1;
for (int i=0; i<M; i++) h_xdiff[i] = h_xnew[i] - h_x[i];
*dx = normCPU(h_xdiff, M);
for (int i=0; i<M; i++) h_x[i] = h_xnew[i];
}
for (int i=0; i<M; i++) h_xopt[i] = h_x[i];
*fopt = CostFunction(h_xopt, M);
*niter = *niter - 1;
}
GradientDescentCPU.h
#ifndef GRADIENT_DESCENT_CPU
#define GRADIENT_DESCENT_CPU
void GradientDescentCPU(const float * __restrict, const float, const int, const float, const float, const float, const int,
float * __restrict, float *, int *, float *, float *);
#endif
GradientDescentGPU.cu
#include <thrust\device_ptr.h>
#include <thrust\inner_product.h>
#include "Utilities.cuh"
#define BLOCK_SIZE 256
//#define VERBOSE
//#define DEBUG
/***********************************/
/* COST FUNCTION - CPU & GPU CASES */
/***********************************/
__host__ __device__ float CostFunction(const float * __restrict h_x, const int M) {
// --- Rosenbrock function
float sum = 0.f;
for (int i=0; i<M-1; i++) {
float temp1 = (h_x[i+1] - h_x[i] * h_x[i]);
float temp2 = (h_x[i] - 1.f);
sum = sum + 100.f * temp1 * temp1 + temp2 * temp2;
}
return sum;
}
/*******************************/
/* GRADIENT DESCENT - GPU CASE */
/*******************************/
// --- Version using finite differences
//__device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) {
//
// int test1, test2;
// float h_test1_plus, h_test1_minus, h_test2_plus, h_test2_minus, temp1_plus, temp1_minus, temp2_plus, temp2_minus;
//
// // --- Rosenbrock function
// float sum_plus = 0.f, sum_minus = 0.f;
// for (int i=0; i<M-1; i++) {
// h_test1_plus = d_x[i] + (h/2.f) * (tid == i);
// h_test1_minus = d_x[i] - (h/2.f) * (tid == i);
// h_test2_plus = d_x[i + 1] + (h/2.f) * (tid == (i + 1));
// h_test2_minus = d_x[i + 1] - (h/2.f) * (tid == (i + 1));
// temp1_plus = (h_test2_plus - h_test1_plus * h_test1_plus);
// temp2_plus = (h_test1_plus - 1.f);
// temp1_minus = (h_test2_minus - h_test1_minus * h_test1_minus);
// temp2_minus = (h_test1_minus - 1.f);
// sum_plus = sum_plus + 100.f * temp1_plus * temp1_plus + temp2_plus * temp2_plus;
// sum_minus = sum_minus + 100.f * temp1_minus * temp1_minus + temp2_minus * temp2_minus;
// }
// d_g[tid] = (sum_plus - sum_minus)/(2.f * h);
//}
// --- Version using analytical gradient (Rosenbrock function)
__device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) {
if (tid == 0) d_g[0] = -400.f * (d_x[1] - d_x[0] * d_x[0]) * d_x[0] + 2.f * (d_x[0] - 1.f);
else if (tid == M-1) d_g[M-1] = 200.f * (d_x[M-1] - d_x[M-2] * d_x[M-2]);
else {
for (int i=1; i<M-1; i++) {
d_g[i] = -400.f * d_x[i] * (d_x[i+1] - d_x[i] * d_x[i]) + 2.f * (d_x[i] - 1.f) + 200.f * (d_x[i] - d_x[i-1] * d_x[i-1]);
}
}
}
/*******************/
/* STEP - GPU CASE */
/*******************/
__global__ void StepGPU(float * __restrict d_x, float * __restrict d_xnew, float * __restrict d_xdiff, float * __restrict d_g, const float alpha, const float h, const int M) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < M) {
// --- Calculate gradient
CostFunctionGradientGPU(d_x, d_g, h, tid, M);
// --- Take step
d_xnew[tid] = d_x[tid] - alpha * d_g[tid];
// --- Update termination metrics
d_xdiff[tid] = d_xnew[tid] - d_x[tid];
// --- Update current solution
d_x[tid] = d_xnew[tid];
}
}
/***********************************/
/* COST FUNCTION STRUCT - GPU CASE */
/***********************************/
// --- Rosenbrock function struct for thrust reduction
struct CostFunctionStructGPU{
template <typename Tuple>
__host__ __device__ float operator()(Tuple a) {
float temp1 = (thrust::get<1>(a) - thrust::get<0>(a) * thrust::get<0>(a));
float temp2 = (thrust::get<0>(a) - 1.f);
return 100.f * temp1 * temp1 + temp2 * temp2;
}
};
/****************************************/
/* GRADIENT DESCENT FUNCTION - GPU CASE */
/****************************************/
// x0 - Starting point
// tol - Termination tolerance
// maxiter - Maximum number of allowed iterations
// alpha - Step size
// dxmin - Minimum allowed perturbations
void GradientDescentGPU(const float * __restrict__ d_x0, const float tol, const int maxiter, const float alpha, const float h,
const float dxmin, const int M, float * __restrict__ d_xopt, float *fopt, int *niter, float *gnorm, float *dx) {
thrust::device_ptr<float> dev_ptr_xopt = thrust::device_pointer_cast(d_xopt);
// --- Initialize gradient norm, optimization vector, iteration counter, perturbation
*gnorm = FLT_MAX;
float *d_x; gpuErrchk(cudaMalloc((void**)&d_x, M * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x, d_x0, M * sizeof(float), cudaMemcpyDeviceToDevice));
*niter = 0;
*dx = FLT_MAX;
// --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions
float *d_g; gpuErrchk(cudaMalloc((void**)&d_g, M * sizeof(float))); thrust::device_ptr<float> dev_ptr_g = thrust::device_pointer_cast(d_g);
float *d_xnew; gpuErrchk(cudaMalloc((void**)&d_xnew, M * sizeof(float)));
float *d_xdiff; gpuErrchk(cudaMalloc((void**)&d_xdiff, M * sizeof(float))); thrust::device_ptr<float> dev_ptr_xdiff = thrust::device_pointer_cast(d_xdiff);
// --- Gradient Descent iterations
while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) {
// --- Iteration step
StepGPU<<<iDivUp(M, BLOCK_SIZE), BLOCK_SIZE>>>(d_x, d_xnew, d_xdiff, d_g, alpha, h, M);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
*gnorm = sqrt(thrust::inner_product(dev_ptr_g, dev_ptr_g + M, dev_ptr_g, 0.0f));
*dx = sqrt(thrust::inner_product(dev_ptr_xdiff, dev_ptr_xdiff + M, dev_ptr_xdiff, 0.0f));
*niter = *niter + 1;
}
gpuErrchk(cudaMemcpy(d_xopt, d_x, M * sizeof(float), cudaMemcpyDeviceToDevice));
// --- Functional calculation
*fopt = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt, dev_ptr_xopt + 1)), thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt + M - 1, dev_ptr_xopt + M)), CostFunctionStructGPU(), 0.0f, thrust::plus<float>());
*niter = *niter - 1;
}
GradientDescentGPU.cuh
#ifndef GRADIENT_DESCENT_GPU
#define GRADIENT_DESCENT_GPU
void GradientDescentGPU(const float * __restrict__, const float, const int, const float, const float, const float, const int,
float * __restrict__, float *, int *, float *, float *);
__host__ __device__ float CostFunction(const float * __restrict, const int);
#endif
Che tipo di discesa del gradiente hai intenzione di implementare? Puoi trovare alcuni esempi interessanti [** qui **] (http://blog.accelereyes.com/blog/2011/09/20/optimization-methods-for-deep-learning/) con diversi metodi e risultati. C'è anche [** questo altro post **] (http://adnanboz.wordpress.com/2012/02/25/large-scale-machine-learning-using-nvidia-cuda/) su machine learning e GPGPU. Quindi potresti provare a darci qualche informazione in più sul tuo problema? – BenC
Grazie per il collegamento ma non voglio imparare GD Voglio solo imparare alcune funzioni utili in CUDA che potrebbero essere utili per tale progetto – erogol
Il problema è che questo tipo di domanda potrebbe essere troppo ampia. Ci sono [librerie matematiche] (https://developer.nvidia.com/cuda-math-library), librerie di algebra lineare ([MAGMA] (https://developer.nvidia.com/magma), [CUBLAS] (https : //developer.nvidia.com/cublas)), e se vuoi solo una libreria orientata allo sviluppo, [Thrust] (http://thrust.github.io/) è sicuramente una buona scelta. È possibile controllare [** questo elenco **] (https://developer.nvidia.com/technologies/Libraries) sul sito Web di NVIDIA. – BenC