Solving dense linear systems AX = B with CUDA

Yes.

Approach nr. 1

In the framework of cuSOLVER, you can use QR decomposition; see QR decomposition to solve linear systems in CUDA.

Approach nr. 2

Alternatively, you can compute the matrix inverse by the successive invocation of

 cublas<t>getrfBatched() 

which calculates the LU decomposition of a matrix, and

cublas<t>getriBatched() 

which calculates the inverse of the matrix starting from its LU decomposition.

Approach nr. 3

A final possibility is using

cublas<t>getrfBatched() 

followed by a twofold invocation of

cublas<t>trsm() 

which solves upper or lower triangular linear systems.

As pointed out by Robert Crovella, the best choice may depend on the size and type of the involved matrices.

Code for approach nr. 1

Please see QR decomposition to solve linear systems in CUDA.
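
For convenience, here is a minimal sketch of the cuSOLVER QR path (geqrf, then ormqr, then a triangular solve), assuming d_A and d_y hold the system matrix and the coefficients vector on the device (as in the code further below) and that cublas_handle is a valid cuBLAS handle; error checking and cleanup are omitted for brevity, and the linked post has the full worked code.

#include <cusolverDn.h>

// --- cuSOLVER initialization
cusolverDnHandle_t solver_handle;
cusolverDnCreate(&solver_handle);

// --- Workspace query for the QR factorization of the N x N matrix d_A
int work_size = 0;
cusolverDnDgeqrf_bufferSize(solver_handle, N, N, d_A, N, &work_size);

double *d_tau;   cudaMalloc(&d_tau,  N * sizeof(double));
double *d_work;  cudaMalloc(&d_work, work_size * sizeof(double));
int    *devInfo; cudaMalloc(&devInfo, sizeof(int));

// --- QR factorization: d_A is overwritten by R (upper triangle) and by the
//     Householder vectors (below the diagonal)
cusolverDnDgeqrf(solver_handle, N, N, d_A, N, d_tau, d_work, work_size, devInfo);

// --- d_y := Q^T * d_y
cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, N, 1, N, d_A, N, d_tau,
                 d_y, N, d_work, work_size, devInfo);

// --- Solve R * x = Q^T * y; the solution overwrites d_y
const double one = 1.0;
cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
            CUBLAS_DIAG_NON_UNIT, N, 1, &one, d_A, N, d_y, N);

cusolverDnDestroy(solver_handle);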

Code for approaches nr. 2 and nr. 3

Below, I'm reporting a worked example implementing approaches nr. 2 and 3. Hankel matrices are used to feed the approaches with well-conditioned, invertible matrices. Please note that approach nr. 3 requires permuting (rearranging) the system coefficients vector according to the pivot array obtained from the invocation of cublas<t>getrfBatched(). This permutation can be conveniently done on the CPU. Note also that, in the code comments and in the saved file names, approaches nr. 2 and nr. 3 are labeled as approach nr. 1 and approach nr. 2, respectively.

#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h>     /* srand, rand */
#include <time.h>       /* time */

#include "cuda_runtime.h" 
#include "device_launch_parameters.h"

#include "cublas_v2.h"

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define prec_save 10

#define BLOCKSIZE 256

#define BLOCKSIZEX 16
#define BLOCKSIZEY 16

/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

    T *h_in = (T *)malloc(M * sizeof(T));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

    free(h_in);

}

/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
void setHankelMatrix(double * __restrict h_A, const int N) {

    double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double));

    // --- Initialize random seed
    srand(time(NULL));

    // --- Generate random numbers
    for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand();

    // --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent.
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2];

    free(h_atemp);

}

/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref, 
                               double * __restrict h_y, const int N) {

    for (int k = 0; k < N; k++) h_y[k] = 0.f;

    for (int m = 0; m < N; m++)
        for (int n = 0; n < N; n++)
            h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n];

}

/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double *vec, int *pivotArray, int N){
    for (int i = 0; i < N; i++) {
        double temp = vec[i];
        vec[i] = vec[pivotArray[i] - 1];
        vec[pivotArray[i] - 1] = temp;
    }   
}

/********/
/* MAIN */
/********/
int main() {

    const unsigned int N = 1000;

    const unsigned int Nmatrices = 1;

    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));

    TimingGPU timerLU, timerApproach1, timerApproach2;
    double timingLU, timingApproach1, timingApproach2;

    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Matrices to be inverted (only one in this example)
    double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));

    // --- Setting the Hankel matrix
    setHankelMatrix(h_A, N);

    // --- Defining the solution
    double *h_xref = (double *)malloc(N * sizeof(double));
    for (int k = 0; k < N; k++) h_xref[k] = 1.f;

    // --- Coefficient vectors (only one in this example)
    double *h_y = (double *)malloc(N * sizeof(double));

    computeCoefficientsVector(h_A, h_xref, h_y, N);

    // --- Result (only one in this example)
    double *h_x = (double *)malloc(N * sizeof(double));

    // --- Allocate device space for the input matrices 
    double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
    double *d_y; gpuErrchk(cudaMalloc(&d_y, N *                 sizeof(double)));
    double *d_x; gpuErrchk(cudaMalloc(&d_x, N *                 sizeof(double)));

    // --- Move the relevant matrices from host to device
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, N *                 sizeof(double), cudaMemcpyHostToDevice));

    /**********************************/
    /* COMPUTING THE LU DECOMPOSITION */
    /**********************************/
    timerLU.StartCounter();

    // --- Creating the array of pointers needed as input/output to the batched getrf
    double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N;

    double **d_inout_pointers;
    gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
    free(h_inout_pointers);

    int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
    int *d_InfoArray;  gpuErrchk(cudaMalloc(&d_InfoArray,      Nmatrices * sizeof(int)));

    int *h_InfoArray  = (int *)malloc(Nmatrices * sizeof(int));

    cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
    //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    timingLU = timerLU.GetCounter();
    printf("Timing LU decomposition %f [ms]\n", timingLU);

    /*********************************/
    /* CHECKING THE LU DECOMPOSITION */
    /*********************************/
    saveCPUrealtxt(h_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
    saveCPUrealtxt(h_y,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
    saveGPUrealtxt(d_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
    saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);

    /******************************************************************************/
    /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
    /******************************************************************************/
    timerApproach1.StartCounter();

    // --- Allocate device space for the inverted matrices 
    double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));

    // --- Creating the array of pointers needed as output to the batched getri
    double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = d_Ainv + (size_t)i * N * N;

    double **d_out_pointers;
    gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
    free(h_out_pointers);

    cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    double alpha1 = 1.f;
    double beta1 = 0.f;

    cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));

    timingApproach1 = timingLU + timerApproach1.GetCounter();
    printf("Timing approach 1 %f [ms]\n", timingApproach1);

    /**************************/
    /* CHECKING APPROACH NR.1 */
    /**************************/
    saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);

    /*************************************************************/
    /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
    /*************************************************************/
    timerApproach2.StartCounter();

    double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double)));

    gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(double), cudaMemcpyDeviceToHost));
    int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
    gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    rearrange(h_y, h_pivotArray, N);
    gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));

    // --- Now P*A=L*U
    //     Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y

    // --- 1st phase - solve Ly = b 
    const double alpha = 1.f;

    // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result 

    // --- Lower triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    // --- Upper triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    timingApproach2 = timingLU + timerApproach2.GetCounter();
    printf("Timing approach 2 %f [ms]\n", timingApproach2);

    /**************************/
    /* CHECKING APPROACH NR.2 */
    /**************************/
    saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);

    return 0;
}

The Utilities.cu and Utilities.cuh files needed to run this example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.
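
If you prefer not to pull in those files, minimal stand-ins for the helpers used above (gpuErrchk, cublasSafeCall, and a CUDA-event-based TimingGPU) could look roughly as follows; this is only a sketch and may differ from the versions maintained at the linked repositories.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include "cublas_v2.h"

// --- Abort on CUDA runtime errors
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// --- Abort on cuBLAS errors
#define cublasSafeCall(ans) { cublasAssert((ans), __FILE__, __LINE__); }
inline void cublasAssert(cublasStatus_t code, const char *file, int line) {
    if (code != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS error %d at %s %d\n", (int)code, file, line);
        exit(EXIT_FAILURE);
    }
}

// --- Simple GPU timer based on CUDA events, mimicking the TimingGPU interface used above
struct TimingGPU {
    cudaEvent_t start, stop;
    TimingGPU()  { cudaEventCreate(&start); cudaEventCreate(&stop); }
    ~TimingGPU() { cudaEventDestroy(start); cudaEventDestroy(stop); }
    void StartCounter() { cudaEventRecord(start, 0); }
    double GetCounter() {
        float ms = 0.f;
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&ms, start, stop);
        return (double)ms;      // elapsed time in [ms]
    }
};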

Some useful references on the third approach:

NAG Fortran Library Routine Document

Scientific Computing Software Library (SCSL) User’s Guide

https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

EDIT

Timings (in ms) for approaches nr. 2 and 3 (tests performed on a GTX 960 card, compute capability 5.2).

N        LU decomposition       Approach nr. 2       Approach nr. 3
100      1.08                   2.75                 1.28
500      45.4                   161                  45.7
1000     302                    1053                 303

As the timings show, approach nr. 3 is the more convenient, and its cost is essentially that of computing the LU decomposition. Furthermore:

  1. Solving linear systems by LU decomposition is faster than using QR decomposition (see QR decomposition to solve linear systems in CUDA);
  2. LU decomposition is limited to square linear systems, while QR decomposition also applies to non-square linear systems.
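
Note also that the original question asks for AX = B with an NxN right-hand side B. Approach nr. 3 extends directly, since cublas<t>trsm() accepts multiple right-hand sides. A sketch, assuming a device matrix d_B (an N x Nrhs column-major array, introduced here for illustration) whose rows have already been permuted according to the pivot array, and d_A already holding the LU factors from cublasDgetrfBatched():

// --- Sketch: approach nr. 3 with an N x Nrhs right-hand side matrix B (column-major)
const int Nrhs = N;         // e.g., B is N x N as in the question
const double one = 1.0;

// --- 1st phase: L * Z = P * B (unit lower triangular factor of the LU decomposition)
cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                           CUBLAS_OP_N, CUBLAS_DIAG_UNIT,     N, Nrhs, &one, d_A, N, d_B, N));

// --- 2nd phase: U * X = Z (upper triangular factor); the solution X overwrites d_B
cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
                           CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, Nrhs, &one, d_A, N, d_B, N));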

The Matlab code below can be used to check the results:

clear all
close all
clc

warning off

N = 1000;

% --- Setting the problem solution
x = ones(N, 1);

%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
load A.txt
load y.txt

A = reshape(A, N, N);

yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Lmatlab, Umatlab] = lu(A);

load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);

L = eye(N);
for k = 1 : N
    L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
U = zeros(N);
for k = 1 : N
    U(k, k : N) = Adecomposed(k, k : N);
end

load pivotArray.txt
Pj = eye(N);
for j = 1 : N
   tempVector = Pj(j, :);
   Pj(j, :) = Pj(pivotArray(j), :);
   Pj(pivotArray(j), :) = tempVector;
end

fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));

xprime      = inv(Lmatlab) * yMatlab;
xMatlab     = inv(Umatlab) * xprime;

fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));

load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));

load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));
Comments

  • Can I use the new cuSOLVER library (CUDA 7) to solve linear systems of the form

    AX = B 
    

    where A, X, and B are NxN dense matrices?