Timing CUDA operations
Solution 1
You could do something along the lines of :
// Host-side wall-clock timing of a kernel using POSIX gettimeofday.
#include <sys/time.h>
struct timeval t1, t2;
gettimeofday(&t1, 0);
kernel_call<<<dimGrid, dimBlock, 0>>>();
// Kernel launches are asynchronous: synchronize before reading the clock.
// Fixed: the semicolon belongs outside the macro argument, and
// cudaThreadSynchronize is deprecated in favor of cudaDeviceSynchronize.
HANDLE_ERROR(cudaDeviceSynchronize());
gettimeofday(&t2, 0);
// 1e6*sec + usec gives microseconds; dividing by 1000 yields milliseconds.
double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000.0;
printf("Time to generate: %3.1f ms \n", time);
or:
// CUDA-event-based timing of a kernel launch; result is in milliseconds.
cudaEvent_t start, stop;
float time;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
// Record the start event, launch, then record the stop event on stream 0.
HANDLE_ERROR(cudaEventRecord(start, 0));
kernel_call<<<dimGrid, dimBlock, 0>>>();
HANDLE_ERROR(cudaEventRecord(stop, 0));
// Block the host until the stop event has actually completed on the device.
HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
printf("Time to generate: %3.1f ms \n", time);
Solution 2
A satisfactory answer has been already given to your question.
I have constructed classes for timing C/C++ as well as CUDA operations and want to share them with others, hoping they could be helpful to future users. You will just need to add the 4
files reported below to your project and #include
the two header files as
// --- Timing includes
#include "TimingCPU.h"
#include "TimingGPU.cuh"
The two classes can be used as follows.
Timing CPU section
TimingCPU timer_CPU;
timer_CPU.StartCounter();
CPU operations to be timed
std::cout << "CPU Timing = " << timer_CPU.GetCounter() << " ms" << std::endl;
Timing GPU section
TimingGPU timer_GPU;
timer_GPU.StartCounter();
GPU operations to be timed
std::cout << "GPU Timing = " << timer_GPU.GetCounter() << " ms" << std::endl;
In both cases, the timing is in milliseconds. Also, the two classes can be used under Linux or Windows.
Here are the 4
files:
TimingCPU.cpp
/**************/
/* TIMING CPU */
/**************/
#include "TimingCPU.h"
#ifdef __linux__
#include <sys/time.h>
#include <stdio.h>
// --- Constructor: zero the stored timestamp, then immediately start timing.
TimingCPU::TimingCPU(): cur_time_(0) { StartCounter(); }
// --- Destructor: nothing to release (no dynamic state in the Linux build).
TimingCPU::~TimingCPU() { }
// --- Starts the timing: record the current wall-clock time in microseconds.
void TimingCPU::StartCounter()
{
    struct timeval tv;
    // On gettimeofday failure, leave the previous timestamp untouched.
    if (gettimeofday(&tv, 0) != 0) return;
    cur_time_ = 1000000 * tv.tv_sec + tv.tv_usec;
}
double TimingCPU::GetCounter()
{
struct timeval time;
if(gettimeofday( &time, 0 )) return -1;
long cur_time = 1000000 * time.tv_sec + time.tv_usec;
double sec = (cur_time - cur_time_) / 1000000.0;
if(sec < 0) sec += 86400;
cur_time_ = cur_time;
return 1000.*sec;
}
#elif _WIN32 || _WIN64
#include <windows.h>
#include <iostream>
// --- Opaque state for the Windows timer: the performance-counter frequency
//     (scaled to ticks per millisecond) and the tick count at start.
struct PrivateTimingCPU {
double PCFreq;        // performance-counter ticks per millisecond
__int64 CounterStart; // counter value captured by StartCounter()
};
// --- Default constructor: allocate the private state and zero both fields.
TimingCPU::TimingCPU()
{
    privateTimingCPU = new PrivateTimingCPU;
    privateTimingCPU->PCFreq = 0.0;
    privateTimingCPU->CounterStart = 0;
}
// --- Default destructor: release the private state allocated by the
//     constructor. (The original leaked privateTimingCPU.)
TimingCPU::~TimingCPU() { delete privateTimingCPU; }
// --- Starts the timing: cache the counter frequency as ticks-per-millisecond
//     and capture the current counter value.
void TimingCPU::StartCounter()
{
    LARGE_INTEGER li;
    if (!QueryPerformanceFrequency(&li))
        std::cout << "QueryPerformanceFrequency failed!\n";
    privateTimingCPU->PCFreq = double(li.QuadPart) / 1000.0;
    QueryPerformanceCounter(&li);
    privateTimingCPU->CounterStart = li.QuadPart;
}
// --- Gets the timing counter in ms
double TimingCPU::GetCounter()
{
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
return double(li.QuadPart-(*privateTimingCPU).CounterStart)/(*privateTimingCPU).PCFreq;
}
#endif
TimingCPU.h
// 1 micro-second accuracy
// Returns the time in milliseconds
#ifndef __TIMINGCPU_H__
#define __TIMINGCPU_H__
#ifdef __linux__
// --- CPU stopwatch (Linux build), based on gettimeofday.
class TimingCPU {
private:
long cur_time_;   // last captured timestamp, in microseconds
public:
TimingCPU();           // starts the counter on construction
~TimingCPU();
void StartCounter();   // (re)starts the stopwatch
double GetCounter();   // elapsed milliseconds; also restarts the counter
};
#elif _WIN32 || _WIN64
// --- CPU stopwatch (Windows build), based on QueryPerformanceCounter.
struct PrivateTimingCPU;   // opaque implementation state (see TimingCPU.cpp)
class TimingCPU
{
private:
PrivateTimingCPU *privateTimingCPU;   // allocated in the constructor
public:
TimingCPU();
~TimingCPU();
void StartCounter();   // starts the stopwatch
double GetCounter();   // elapsed milliseconds since StartCounter()
}; // TimingCPU class
#endif
#endif
TimingGPU.cu
/**************/
/* TIMING GPU */
/**************/
#include "TimingGPU.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
// --- Opaque state for the GPU timer: the CUDA events bracketing the timed span.
struct PrivateTimingGPU {
cudaEvent_t start;   // recorded by StartCounter()/StartCounterFlags()
cudaEvent_t stop;    // recorded and waited on by GetCounter()
};
// default constructor: allocate the private event holder (the events
// themselves are created later, in StartCounter()/StartCounterFlags())
TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; }
// default destructor: release the private state allocated by the constructor
// (the original leaked it). NOTE(review): any CUDA events created by
// StartCounter() are deliberately not destroyed here, because they may never
// have been created; consider tracking creation state and calling
// cudaEventDestroy when appropriate.
TimingGPU::~TimingGPU() { delete privateTimingGPU; }
// --- Starts the timing: create a fresh start/stop event pair and record the
//     start event on the default stream.
// NOTE(review): every call creates new events without destroying the previous
// pair, so repeated StartCounter() calls leak CUDA events; API return codes
// are also unchecked.
void TimingGPU::StartCounter()
{
cudaEventCreate(&((*privateTimingGPU).start));
cudaEventCreate(&((*privateTimingGPU).stop));
cudaEventRecord((*privateTimingGPU).start,0);
}
// --- Starts the timing with cudaEventBlockingSync events: the host thread
//     blocks (instead of spin-waiting) inside cudaEventSynchronize, which
//     lowers CPU usage while waiting.
// NOTE(review): like StartCounter(), repeated calls leak the previously
// created events, and API return codes are unchecked.
void TimingGPU::StartCounterFlags()
{
int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags);
cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags);
cudaEventRecord((*privateTimingGPU).start,0);
}
// Gets the counter in ms: record the stop event, wait for it to complete,
// then ask CUDA for the start->stop elapsed time.
float TimingGPU::GetCounter()
{
    float time;
    cudaEventRecord(privateTimingGPU->stop, 0);
    cudaEventSynchronize(privateTimingGPU->stop);
    cudaEventElapsedTime(&time, privateTimingGPU->start, privateTimingGPU->stop);
    return time;
}
TimingGPU.cuh
#ifndef __TIMING_CUH__
#define __TIMING_CUH__
/**************/
/* TIMING GPU */
/**************/
// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5
// microsecond precision.
// --- GPU stopwatch built on CUDA events; results in milliseconds.
struct PrivateTimingGPU;   // opaque implementation state (holds the CUDA events)
class TimingGPU
{
private:
PrivateTimingGPU *privateTimingGPU;   // allocated in the constructor
public:
TimingGPU();
~TimingGPU();
void StartCounter();        // records the start event on the default stream
void StartCounterFlags();   // same, but events use cudaEventBlockingSync
float GetCounter();         // elapsed ms between the start and stop events
}; // TimingGPU class
#endif
Solution 3
There is an out-of-box GpuTimer struct for use:
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__
// Stopwatch built on CUDA events; all times are in milliseconds.
// Usage: timer.Start(); ...work...; timer.Stop(); float ms = timer.Elapsed();
// NOTE(review): API return codes are unchecked, and copying a GpuTimer would
// make two destructors destroy the same events — avoid copies.
struct GpuTimer
{
cudaEvent_t start;
cudaEvent_t stop;
// Create the event pair once for the lifetime of the timer.
GpuTimer()
{
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
// Destroy both events.
~GpuTimer()
{
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
// Record the start event on the default stream.
void Start()
{
cudaEventRecord(start, 0);
}
// Record the stop event on the default stream.
void Stop()
{
cudaEventRecord(stop, 0);
}
// Wait for the stop event, then return the start->stop time in ms.
// Call Stop() first; otherwise the result is not meaningful.
float Elapsed()
{
float elapsed;
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
return elapsed;
}
};
#endif /* __GPU_TIMER_H__ */
Solution 4
If you want to measure GPU time you pretty much have to use events. There's a great discussion thread on the do's and don'ts of timing your application over on the nvidia forums here.
Tudor
6th person to receive the multithreading gold badge! 107th person to receive the java gold badge! 313th person to receive the c# gold badge! Careers profile
Updated on December 15, 2020. Comments
-
Tudor over 3 years
I need to time a CUDA kernel execution. The Best Practices Guide says that we can use either events or standard timing functions like
clock()
in Windows. My problem is that using these two functions gives me a totally different result. In fact, the result given by events seems to be huge compared to the actual speed in practice.What I actually need all this for is to be able to predict the running time of a computation by first running a reduced version of it on a smaller data set. Unfortunately, the results of this benchmark are totally unrealistic, being either too optimistic (
clock()
) or waaaay too pessimistic (events).-
pQB over 12 yearsDid you synchronize in the CPU after launch your kernel and before to time(end) with clock?
-
Tudor over 12 yearsDo you mean if I have cudaThreadSynchronize() calls before and after? Yes I do.
-
pQB over 12 yearsYes, That's what i meant
-
pQB over 12 yearsBy the way. CUDA timing is returned in ms (micro-seconds if you use the visual profiler). Just in case.
-
-
Tudor over 12 yearsThanks, but I need to do these measurements programmatically.
-
talonmies over 12 years@Programmer: The profiler also completely serialises the API, and adds latency because it requires additional host-device transfers to gather profile counter output. It is useful for a lot of things, but accurate execution times is not one of them.
-
Programmer over 12 years@talonmies: What do you mean that profiler completely serialises the API? –
-
jmsu over 12 years@Programmer It is a function or macro he didn't define that handles the errors returned by the cuda function calls. You should do error handling but it could have been omitted here for simplicity.
-
talonmies over 12 years@Programmer: The CUDA API is naturally asynchronous (kernel launches, streams, certain classes of memory transfers). When you run programs in the profiler, they all become serial. If you have code which overlaps memory copying with kernel execution, those will be serial when profiled. On Fermi, multiple, simultaneous kernel execution also is disabled during profiling.
-
fbielejec over 12 years@ Programmer, yes exacly, there are some usefull macros for error handling in the SDK
-
Tom over 12 years@fbielejec the error handling functions in the SDK are there just to make the examples as simple as possible for education. Calling exit() when you encounter an error is not the best way of handling an error!
-
Zk1001 over 12 yearsI think you still can do it programmatically if you use the command line profiler (not the visual profiler). But as talonmies said, it serializes the API calls. So what you get is executing all of the API calls in a blocking manner. And also there is a small extra overhead for reading the counters.
-
Kknd over 10 yearsNote that 1e6 us = 1 s, so on the first example time is in seconds, not ms.
-
chappjc over 9 yearsRegarding HANDLE_ERROR, see stackoverflow.com/q/14038589/2778484 and look at helper_cuda.h in the CUDA samples, which has a macro called
getLastCudaError
. -
tomriddle_1234 about 9 yearsAccording to here, the second method has 1.5ms resolution.
-
Gigi almost 4 yearsWorks great! I had to include also #include "TimingCPU.cpp" and #include "TimingGPU.cu" beside the two includes mentioned above.