2D CUDA median filter optimization
Solution 1
I'm answering your last question on the use of shared memory.
As already noticed by Eric, your use of shared memory does not really lead to thread collaboration.
I'm comparing your solution, for the 3x3
case, with a variant of your kernel not using shared memory at all as well as with the Accelereyes solution discussed in 2D median filtering in CUDA: how to efficiently copy global memory to shared memory.
Here is the complete code:
#include <iostream>
#include <fstream>
using namespace std;
#define BLOCK_WIDTH 16
#define BLOCK_HEIGHT 16
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
const int tx_l = threadIdx.x; // --- Local thread x index
const int ty_l = threadIdx.y; // --- Local thread y index
const int tx_g = blockIdx.x * blockDim.x + tx_l; // --- Global thread x index
const int ty_g = blockIdx.y * blockDim.y + ty_l; // --- Global thread y index
__shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];
// --- Fill the shared memory border with zeros
if (tx_l == 0) smem[tx_l] [ty_l+1] = 0; // --- left border
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+1] = 0; // --- right border
if (ty_l == 0) { smem[tx_l+1][ty_l] = 0; // --- upper border
if (tx_l == 0) smem[tx_l] [ty_l] = 0; // --- top-left corner
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l] = 0; // --- top-right corner
} else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2] = 0; // --- bottom border
if (tx_l == 0) smem[tx_l] [ty_l+2] = 0; // --- bottom-left corder
else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2] = 0; // --- bottom-right corner
}
// --- Fill shared memory
smem[tx_l+1][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g]; // --- center
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1]; // --- left border
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1]; // --- right border
if ((ty_l == 0)&&(ty_g > 0)) { smem[tx_l+1][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g]; // --- upper border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- top-left corner
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g+1]; // --- top-right corner
} else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) { smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g]; // --- bottom border
if ((tx_l == 0)&&((tx_g > 0))) smem[tx_l] [ty_l+2] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- bottom-left corder
else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1)) smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1]; // --- bottom-right corner
}
__syncthreads();
// --- Pull the 3x3 window in a local array
unsigned short v[9] = { smem[tx_l][ty_l], smem[tx_l+1][ty_l], smem[tx_l+2][ty_l],
smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1], smem[tx_l+2][ty_l+1],
smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2], smem[tx_l+2][ty_l+2] };
// --- Bubble-sort
for (int i = 0; i < 5; i++) {
for (int j = i + 1; j < 9; j++) {
if (v[i] > v[j]) { // swap?
unsigned short tmp = v[i];
v[i] = v[j];
v[j] = tmp;
}
}
}
// --- Pick the middle one
Output_Image[ty_g*Image_Width + tx_g] = v[4];
}
/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
__global__ void Original_Kernel_Function(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
__shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][9];
int iterator;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
// --- Fill shared memory
iterator = 0;
for (int r = x - 1; r <= x + 1; r++) {
for (int c = y - 1; c <= y + 1; c++) {
surround[tid][iterator] = Input_Image[c*Image_Width+r];
iterator++;
}
}
// --- Sort shared memory to find the median using Bubble Short
for (int i=0; i<5; ++i) {
// --- Find the position of the minimum element
int minval=i;
for (int l=i+1; l<9; ++l) if (surround[tid][l] < surround[tid][minval]) minval=l;
// --- Put found minimum element in its place
unsigned short temp = surround[tid][i];
surround[tid][i]=surround[tid][minval];
surround[tid][minval]=temp;
}
// --- Pick the middle one
Output_Image[(y*Image_Width)+x]=surround[tid][4];
__syncthreads();
}
/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
__global__ void Original_Kernel_Function_no_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
unsigned short surround[9];
int iterator;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
// --- Fill array private to the threads
iterator = 0;
for (int r = x - 1; r <= x + 1; r++) {
for (int c = y - 1; c <= y + 1; c++) {
surround[iterator] = Input_Image[c*Image_Width+r];
iterator++;
}
}
// --- Sort private array to find the median using Bubble Short
for (int i=0; i<5; ++i) {
// --- Find the position of the minimum element
int minval=i;
for (int l=i+1; l<9; ++l) if (surround[l] < surround[minval]) minval=l;
// --- Put found minimum element in its place
unsigned short temp = surround[i];
surround[i]=surround[minval];
surround[minval]=temp;
}
// --- Pick the middle one
Output_Image[(y*Image_Width)+x]=surround[4];
}
/********/
/* MAIN */
/********/
int main()
{
const int Image_Width = 1580;
const int Image_Height = 1050;
// --- Open data file
ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );
// --- Get file length
is.seekg(0, ios::end);
int dataLength = is.tellg();
is.seekg(0, ios::beg);
// --- Read data from file and close file
unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
is.read((char*)Input_Image_Host,dataLength);
is.close();
// --- CUDA warm up
unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
gpuErrchk(cudaFree(forFirstCudaMalloc));
// --- Allocate host and device memory spaces
unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
unsigned short *Input_Image; gpuErrchk(cudaMalloc( (void**)&Input_Image, dataLength * sizeof(unsigned short)));
unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short)));
// --- Copy data from host to device
gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering
// --- Grid and block sizes
const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);
const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);
/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Original_Kernel_Function, cudaFuncCachePreferShared);
Original_Kernel_Function<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Original kernel function - elapsed time: %3.3f ms \n", time);
/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Original_Kernel_Function_no_shared, cudaFuncCachePreferL1);
Original_Kernel_Function_no_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Original kernel function - no shared - elapsed time: %3.3f ms \n", time);
/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
cudaEventRecord(start, 0);
cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Optimized kernel function - shared - elapsed time: %3.3f ms \n", time);
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));
// --- Open results file, write results and close the file
ofstream of2; of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw", ios::binary);
of2.write((char*)Output_Image_Host, dataLength);
of2.close();
cout << "\n Press Any Key To Exit..!!";
gpuErrchk(cudaFree(Input_Image));
delete Input_Image_Host;
delete Output_Image_Host;
return 0;
}
Here are the timing results on a Kepler K20c:
1580 x 1050
Original_Kernel_Function = 1.588ms
Original_Kernel_Function_no_shared = 1.278ms
Optimized_Kernel_Function_shared = 1.455ms
2048 x 2048
Original_Kernel_Function = 3.94ms
Original_Kernel_Function_no_shared = 3.118ms
Optimized_Kernel_Function_shared = 3.709ms
4096 x 4096
Original_Kernel_Function = 16.003ms
Original_Kernel_Function_no_shared = 13.735ms
Optimized_Kernel_Function_shared = 14.526ms
8192 x 8192
Original_Kernel_Function = 62.278ms
Original_Kernel_Function_no_shared = 47.484ms
Optimized_Kernel_Function_shared = 57.474ms
Here are the timing results on a GT540M, which is more similar to your card:
1580 x 1050
Original_Kernel_Function = 10.332 ms
Original_Kernel_Function_no_shared = 9.294 ms
Optimized_Kernel_Function_shared = 10.301 ms
2048 x 2048
Original_Kernel_Function = 25.256 ms
Original_Kernel_Function_no_shared = 23.567 ms
Optimized_Kernel_Function_shared = 23.876 ms
4096 x 4096
Original_Kernel_Function = 99.791 ms
Original_Kernel_Function_no_shared = 93.919 ms
Optimized_Kernel_Function_shared = 95.464 ms
8192 x 8192
Original_Kernel_Function = 399.259 ms
Original_Kernel_Function_no_shared = 375.634 ms
Optimized_Kernel_Function_shared = 383.121 ms
As it can be seen, the version not using shared memory seems to be (slightly) convenient in all the cases.
Solution 2
It seems you share nothing between threads using shared memory, i.e. for 3x3 filter, you read each pixel 9 times from the global memory, which is not necessary. This white paper may provide some ideas on how to using shared memory in a convolution kernel. Hope it help.
http://docs.nvidia.com/cuda/samples/3_Imaging/convolutionSeparable/doc/convolutionSeparable.pdf
Solution 3
Quickselect median is the fastest linear time algorithm for best case; however, it is hard to implement in CUDA due to memory overhead. The easiest approach for a highly parallel algorithm is to minimize memory overhead. Instead of a partial sort algorithm like Quickselect, do away with memory overhead entirely by using the Torben median algorithm. The Torben algorithm can be significantly slower than other algorithms, but it does not modify the input data. Therefore, there is no need to allocate shared memory.
Finally, for maximum speed, bind the input to a texture, which has the added bonus of managing border extensions. To minimize cache misses, use a row major nested for loop for row major array.
Jony
Updated on September 21, 2022Comments
-
Jony 8 months
I have implemented a 2D median filter in CUDA and the whole program is shown below.
#include "cuda_runtime.h" #include "cuda_runtime_api.h" #include "device_launch_parameters.h" #include <iostream> #include <fstream> #include <iomanip> #include <windows.h> #include <io.h> #include <stdio.h> #include<conio.h> #include <cstdlib> #include "cstdlib" #include <process.h> #include <stdlib.h> #include <malloc.h> #include <ctime> using namespace std; #define MEDIAN_DIMENSION 3 // For matrix of 3 x 3. We can Use 5 x 5 , 7 x 7 , 9 x 9...... #define MEDIAN_LENGTH 9 // Shoul be MEDIAN_DIMENSION x MEDIAN_DIMENSION = 3 x 3 #define BLOCK_WIDTH 16 // Should be 8 If matrix is of larger then of 5 x 5 elese error occur as " uses too much shared data " at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH] #define BLOCK_HEIGHT 16// Should be 8 If matrix is of larger then of 5 x 5 elese error occur as " uses too much shared data " at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH] __global__ void MedianFilter_gpu( unsigned short *Device_ImageData,int Image_Width,int Image_Height){ __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]; int iterator; const int Half_Of_MEDIAN_LENGTH =(MEDIAN_LENGTH/2)+1; int StartPoint=MEDIAN_DIMENSION/2; int EndPoint=StartPoint+1; const int x = blockDim.x * blockIdx.x + threadIdx.x; const int y = blockDim.y * blockIdx.y + threadIdx.y; const int tid=threadIdx.y*blockDim.y+threadIdx.x; if(x>=Image_Width || y>=Image_Height) return; //Fill surround with pixel value of Image in Matrix Pettern of MEDIAN_DIMENSION x MEDIAN_DIMENSION if (x == 0 || x == Image_Width - StartPoint || y == 0 || y == Image_Height - StartPoint) { } else { iterator = 0; for (int r = x - StartPoint; r < x + (EndPoint); r++) { for (int c = y - StartPoint; c < y + (EndPoint); c++) { surround[tid][iterator] =*(Device_ImageData+(c*Image_Width)+r); iterator++; } } //Sort the Surround Array to Find Median. Use Bubble Short if Matrix oF 3 x 3 Matrix //You can use Insertion commented below to Short Bigger Dimension Matrix //// bubble short // for ( int i=0; i<Half_Of_MEDIAN_LENGTH; ++i) { // Find position of minimum element int min=i; for ( int l=i+1; l<MEDIAN_LENGTH; ++l) if (surround[tid][l] <surround[tid][min] ) min=l; // Put found minimum element in its place unsigned short temp= surround[tid][i]; surround[tid][i]=surround[tid][min]; surround[tid][min]=temp; }//bubble short end //////insertion sort start // /*int t,j,i; for ( i = 1 ; i< MEDIAN_LENGTH ; i++) { j = i; while ( j > 0 && surround[tid][j] < surround[tid][j-1]) { t= surround[tid][j]; surround[tid][j]= surround[tid][j-1]; surround[tid][j-1] = t; j--; } }*/ ////insertion sort end *(Device_ImageData+(y*Image_Width)+x)= surround[tid][Half_Of_MEDIAN_LENGTH-1]; // it will give value of surround[tid][4] as Median Value if use 3 x 3 matrix __syncthreads(); } } int main( int argc, const char** argv ) { int dataLength; int p1; unsigned short* Host_ImageData = NULL; ifstream is; // Read File is.open ("D:\\Image_To_Be_Filtered.raw", ios::binary ); // get length of file: is.seekg (0, ios::end); dataLength = is.tellg(); is.seekg (0, ios::beg); Host_ImageData = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)]; is.read ((char*)Host_ImageData,dataLength); is.close(); int Image_Width = 1580; int Image_Height = 1050; unsigned short *Host_ResultData = (unsigned short *)malloc(dataLength); unsigned short *Device_ImageData = NULL; ///////////////////////////// // As First time cudaMalloc take more time for memory alocation, i dont want to cosider this time in my process. //So Please Ignore Code For Displaying First CudaMelloc Time clock_t begin = clock(); unsigned short *forFirstCudaMalloc = NULL; cudaMalloc( (void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short) ); clock_t end = clock(); double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; cout<<"First CudaMelloc time = "<<elapsed_secs<<" Second\n" ; cudaFree( forFirstCudaMalloc ); //////////////////////////// //Actual Process Starts From Here clock_t beginOverAll = clock(); // cudaMalloc( (void**)&Device_ImageData, dataLength * sizeof(unsigned short) ); cudaMemcpy(Device_ImageData, Host_ImageData, dataLength, cudaMemcpyHostToDevice);// copying Host Data To Device Memory For Filtering int x = static_cast<int>(ceilf(static_cast<float>(1580.0) /BLOCK_WIDTH)); int y = static_cast<int>(ceilf(static_cast<float>(1050.0) /BLOCK_HEIGHT)); const dim3 grid (x, y, 1); const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); begin = clock(); MedianFilter_gpu<<<grid,block>>>( Device_ImageData, Image_Width, Image_Height); cudaDeviceSynchronize(); end = clock(); elapsed_secs = double(end - begin) / CLOCKS_PER_SEC; cout<<"Process time = "<<elapsed_secs<<" Second\n" ; cudaMemcpy(Host_ResultData, Device_ImageData, dataLength, cudaMemcpyDeviceToHost); // copying Back Device Data To Host Memory To write In file After Filter Done clock_t endOverall = clock(); elapsed_secs = double(endOverall - beginOverAll) / CLOCKS_PER_SEC; cout<<"Complete Time = "<<elapsed_secs<<" Second\n" ; ofstream of2; //Write Filtered Image Into File of2.open("D:\\Filtered_Image.raw", ios::binary); of2.write((char*)Host_ResultData,dataLength); of2.close(); cout<<"\nEnd of Writing File. Press Any Key To Exit..!!"; cudaFree(Device_ImageData); delete Host_ImageData; delete Host_ResultData; getch(); return 0; }
Here is the link for the file I use. I used ImajeJ to store the image in "raw" format and the same for reading the "raw" Image. My image pixel is
16
bit,unsigned short
. The width of the image is1580
and the height is1050
.I strongly believe that the filter can be made more efficient and fast by using proper CUDA optimization.
Indeed, I'm running on a GeForce GT 520M card and the timings are the following
1) For
MEDIAN_DIMENSION
of3 x 3 = 0.027 seconds
2) For
MEDIAN_DIMENSION
of5 x 5 = 0.206 seconds
3) For
MEDIAN_DIMENSION
of7 x 7 = 1.11 seconds
4) For
MEDIAN_DIMENSION
of9 x 9 = 4.931 seconds
As you can see, as we increase
MEDIAN_DIMENSION
, the time increases very much and I have applications where I generally use higherMEDIAN_DIMENSION
like7 x 7
and9 x 9
. I think that, by using Cuda, even for9 x 9
the time should be less than1 second
.Since I think that the sorting part is taking most of the time here, can we make the sorting part of the algorithm faster?
Can we use
grid
andblock
more efficiently? Can I use largerBLOCK_WIDTH
andBLOCK_HEIGHT
(like32
and32
) and still not hit the maximum__shared__
memory limit which is4Kb
for my device?Can
__shared__
memory be used more efficiently?Any help will be appreciated.
Thanks in advance.
-
ArchaeaSoftware over 9 yearsSeems like a repeat of stackoverflow.com/questions/18538701/…
-
Jony over 9 years@ArchaeaSoftware Well its not a copy of what u linked. If m question get solved many will able to use cuda more efficiently.It is not only about selection of median Rather after geting index of pixel using cuda resource(i.e x and y), one would again have to get neighbor pixels and found median of them. It may helpful in the way of using shared_ memory more efficiently.
-
ArchaeaSoftware over 9 yearsIf you really want to make a fast median filter, I think your best bet is to use the vector min/max instructions in conjunction with the algorithm described in "A fast median filter using Altivec" by Kolte, Smith and Su.
-
ArchaeaSoftware over 9 yearsI should also mention that if you just want a sample of a CUDA kernel that does a neighborhood operation on each input pixel, take a look at the sobelFilter SDK sample.
-
-
Jony over 9 yearsthanks for reply Please explain me how do i read 9 times for each pixel? And what should I change to correct it?
-
kangshiyin over 9 yearsYou have 1580x1050 threads work on the image. Each of the thread read 9 pixels. So you must have read 9x1580x1050 pixels in all, which is 9 times of the whole image. See the link for how to use pixels read by other threads through shared memory.
-
Jony over 9 yearsusing const int x = blockDim.x * blockIdx.x + threadIdx.x; const int y = blockDim.y * blockIdx.y + threadIdx.y; I am reading each pixel once. Now according to algorithm requirement i again need the surrounding matrix pixel for the same image. And that is why i read 9 pixel in the array to get median and set its value as current pixel for these x and y.
-
kangshiyin over 9 yearsYes you need the surrounding pixels. But they have already been read in by surrounding threads, why the current thread need to read them from global memory again?