I am trying to find the SVD of a complex matrix in Cuda using CuSolver library. The singular values are correct and matching the python output, but the U and VT matrices are incorrect.
Utilities.cu
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b) {
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d
", cudaGetErrorString(code), file, line);
if (abort) {
exit(code);
}
}
}
extern "C" void gpuErrchk(cudaError_t ans) {
gpuAssert((ans), __FILE__, __LINE__);
}
TimingGPU.cu
/**************/
/* TIMING GPU */
/**************/
#include "TimingGPU.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
struct PrivateTimingGPU {
cudaEvent_t start;
cudaEvent_t stop;
};
// default constructor
TimingGPU::TimingGPU() {
privateTimingGPU = new PrivateTimingGPU;
}
// default destructor
TimingGPU::~TimingGPU() { }
void TimingGPU::StartCounter()
{
cudaEventCreate(&((*privateTimingGPU).start));
cudaEventCreate(&((*privateTimingGPU).stop));
cudaEventRecord((*privateTimingGPU).start, 0);
}
void TimingGPU::StartCounterFlags()
{
int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&((*privateTimingGPU).start), eventflags);
cudaEventCreateWithFlags(&((*privateTimingGPU).stop), eventflags);
cudaEventRecord((*privateTimingGPU).start, 0);
}
// Gets the counter in ms
float TimingGPU::GetCounter()
{
float time;
cudaEventRecord((*privateTimingGPU).stop, 0);
cudaEventSynchronize((*privateTimingGPU).stop);
cudaEventElapsedTime(&time, (*privateTimingGPU).start, (*privateTimingGPU).stop);
return time;
}
TimingGPU.cuh
#ifndef __TIMING_CUH__
#define __TIMING_CUH__
/**************/
/* TIMING GPU */
/**************/
// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5
// microsecond precision.
struct PrivateTimingGPU;
class TimingGPU
{
private:
PrivateTimingGPU *privateTimingGPU;
public:
TimingGPU();
~TimingGPU();
void StartCounter();
void StartCounterFlags();
float GetCounter();
}; // TimingCPU class
#endif
kernel.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define FULLSVD
#define PRINTRESULTS
/********/
/* MAIN */
/********/
int main() {
const int M = 3;
const int N = 3;
const int lda = M;
const int numMatrices = 1;
//const int numMatrices = 256;
cublasHandle_t cublasHandle = NULL;
TimingGPU timerGPU;
cudaEvent_t start, stop;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cublasCreate(&cublasHandle);
cudaEventCreate(&start);
cudaEventCreate(&stop);
// --- Setting the host matrix
cuComplex *h_A = (cuComplex *)malloc(lda * N * numMatrices * sizeof(double));
for (unsigned int k = 0; k < numMatrices; k++)
for (unsigned int i = 0; i < M; i++) {
for (unsigned int j = 0; j < N; j++) {
h_A[k * M * N + j * M + i] = make_float2((1. / (k + 1)) * (i + j * j) * (i + j), (1. / (k + 1)) * (i + j * j) * (i + j));
//printf("[%d, %d] %f
", i, j, h_A[j*M + i]);
printf("%f %f ", h_A[j*M + i].x, h_A[j * M + i].y);
}
printf("
");
}
// --- Setting the device matrix and moving the host matrix to the device
cuComplex *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(cuComplex), cudaMemcpyHostToDevice));
// --- host side SVD results space
float *h_S = (float *)malloc(N * numMatrices * sizeof(float));
cuComplex *h_SI = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
cuComplex *h_x = (cuComplex *)malloc(N * numMatrices * sizeof(cuComplex));
cuComplex *h_U = NULL;
cuComplex *h_V = NULL;
#ifdef FULLSVD
h_U = (cuComplex *)malloc(M * M * numMatrices * sizeof(cuComplex));
h_V = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
#endif
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
float *d_S; gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(float)));
cuComplex *d_SI; gpuErrchk(cudaMalloc(&d_SI, N * N * numMatrices * sizeof(cuComplex)));
cuComplex *d_x; gpuErrchk(cudaMalloc(&d_x, N * numMatrices * sizeof(cuComplex)));
cuComplex *d_U = NULL;
cuComplex *d_V = NULL;
#ifdef FULLSVD
gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(cuComplex)));
gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(cuComplex)));
#endif
cuComplex *d_work = NULL; /* devie workspace for gesvdj */
int devInfo_h = 0; /* host copy of error devInfo_h */
// --- Parameters configuration of Jacobi-based SVD
const double tol = 1.e-7;
const int maxSweeps = 15;
cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
const int econ = 0; // --- econ = 1 for economy size
// --- Numerical result parameters of gesvdj
double residual = 0;
int executedSweeps = 0;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle = NULL;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- Configuration of gesvdj
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
// --- Set the computation tolerance, since the default tolerance is machine precision
cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
// --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
// --- Query the SVD workspace
cusolveSafeCall(cusolverDnCgesvdjBatched_bufferSize(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M,N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
&work_size,
gesvdj_params,
numMatrices));
gpuErrchk(cudaMalloc(&d_work, sizeof(cuComplex) * work_size));
// --- Compute SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnCgesvdjBatched(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M, N) if econ = 1
N, // --- Leading dimension of V, ldv >= max(1, N)
d_work,
work_size,
devInfo,
gesvdj_params,
numMatrices));
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
printf("Calculation of the singular values only: %f ms
", timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(float) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(cuComplex) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(cuComplex) * N * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
#ifdef PRINTRESULTS
printf("SINGULAR VALUES
");
printf("_______________
");
for (int k = 0; k < numMatrices; k++) {
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; SV nr. %d; Value = %f
", k, p, h_S[k * N + p]);
printf("
");
}
#ifdef FULLSVD
printf("SINGULAR VECTORS U
");
printf("__________________
");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < M; q++)
for (int p = 0; p < M; p++)
printf("Matrix nr. %d; U nr. %d, %d; Value = %e, %e
", k, q, p, h_U[q * M + p].x, h_U[q * M + p].y);
printf("
");
}
printf("SINGULAR VECTORS V
");
printf("__________________
");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < N; q++)
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; V nr. %d; Value = %e, %e
", k, p, h_V[N * q + p].x, h_V[N * q + p].y);
printf("
");
}
#endif
#endif
if (0 == devInfo_h) {
printf("gesvdj converges
");
}
else if (0 > devInfo_h) {
printf("%d-th par