Timing CUDA operations


Solution 1

You could do something along the lines of:

#include <sys/time.h>

struct timeval t1, t2;

gettimeofday(&t1, 0);

kernel_call<<<dimGrid, dimBlock, 0>>>();

HANDLE_ERROR(cudaThreadSynchronize());

gettimeofday(&t2, 0);

double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000.0;

printf("Time to generate:  %3.1f ms \n", time);

or:

float time;
cudaEvent_t start, stop;

HANDLE_ERROR( cudaEventCreate(&start) );
HANDLE_ERROR( cudaEventCreate(&stop) );
HANDLE_ERROR( cudaEventRecord(start, 0) );

kernel_call<<<dimGrid, dimBlock, 0>>>();

HANDLE_ERROR( cudaEventRecord(stop, 0) );
HANDLE_ERROR( cudaEventSynchronize(stop) );
HANDLE_ERROR( cudaEventElapsedTime(&time, start, stop) );

printf("Time to generate:  %3.1f ms \n", time);

Solution 2

A satisfactory answer to your question has already been given.

I have constructed classes for timing C/C++ as well as CUDA operations and want to share them, hoping they could be helpful to other users. You just need to add the 4 files reported below to your project and #include the two header files as

// --- Timing includes
#include "TimingCPU.h"
#include "TimingGPU.cuh"

The two classes can be used as follows.

Timing CPU section

TimingCPU timer_CPU;

timer_CPU.StartCounter();
// CPU operations to be timed
std::cout << "CPU Timing = " << timer_CPU.GetCounter() << " ms" << std::endl;

Timing GPU section

TimingGPU timer_GPU;
timer_GPU.StartCounter();
// GPU operations to be timed
std::cout << "GPU Timing = " << timer_GPU.GetCounter() << " ms" << std::endl;

In both cases, the timing is in milliseconds. Also, the two classes can be used under Linux or Windows.
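
For reference, a minimal sketch of how the two classes could be combined in a single program (the kernel here is just a placeholder; TimingCPU.cpp and TimingGPU.cu must be compiled into the project as well):

#include <iostream>

#include "TimingCPU.h"
#include "TimingGPU.cuh"

__global__ void dummy_kernel() { }   // placeholder for the operations to be timed

int main()
{
    TimingCPU timer_CPU;
    TimingGPU timer_GPU;

    timer_CPU.StartCounter();
    // ... CPU operations to be timed ...
    std::cout << "CPU Timing = " << timer_CPU.GetCounter() << " ms" << std::endl;

    timer_GPU.StartCounter();
    dummy_kernel<<<1, 1>>>();        // GPU operations to be timed
    std::cout << "GPU Timing = " << timer_GPU.GetCounter() << " ms" << std::endl;

    return 0;
}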

Here are the 4 files:

TimingCPU.cpp

/**************/
/* TIMING CPU */
/**************/

#include "TimingCPU.h"

#ifdef __linux__

    #include <sys/time.h>
    #include <stdio.h>

    TimingCPU::TimingCPU(): cur_time_(0) { StartCounter(); }

    TimingCPU::~TimingCPU() { }

    void TimingCPU::StartCounter()
    {
        struct timeval time;
        if(gettimeofday( &time, 0 )) return;
        cur_time_ = 1000000 * time.tv_sec + time.tv_usec;
    }

    double TimingCPU::GetCounter()
    {
        struct timeval time;
        if(gettimeofday( &time, 0 )) return -1;

        long cur_time = 1000000 * time.tv_sec + time.tv_usec;
        double sec = (cur_time - cur_time_) / 1000000.0;
        if(sec < 0) sec += 86400;
        cur_time_ = cur_time;

        return 1000.*sec;
    }

#elif defined(_WIN32) || defined(_WIN64)
    #include <windows.h>
    #include <iostream>

    struct PrivateTimingCPU {
        double  PCFreq;
        __int64 CounterStart;
    };

    // --- Default constructor
    TimingCPU::TimingCPU() { privateTimingCPU = new PrivateTimingCPU; (*privateTimingCPU).PCFreq = 0.0; (*privateTimingCPU).CounterStart = 0; }

    // --- Default destructor
    TimingCPU::~TimingCPU() { delete privateTimingCPU; }

    // --- Starts the timing
    void TimingCPU::StartCounter()
    {
        LARGE_INTEGER li;
        if(!QueryPerformanceFrequency(&li)) std::cout << "QueryPerformanceFrequency failed!\n";

        (*privateTimingCPU).PCFreq = double(li.QuadPart)/1000.0;

        QueryPerformanceCounter(&li);
        (*privateTimingCPU).CounterStart = li.QuadPart;
    }

    // --- Gets the timing counter in ms
    double TimingCPU::GetCounter()
    {
        LARGE_INTEGER li;
        QueryPerformanceCounter(&li);
        return double(li.QuadPart-(*privateTimingCPU).CounterStart)/(*privateTimingCPU).PCFreq;
    }
#endif

TimingCPU.h

// 1 microsecond accuracy
// Returns the time in milliseconds

#ifndef __TIMINGCPU_H__
#define __TIMINGCPU_H__

#ifdef __linux__

    class TimingCPU {

        private:
            long cur_time_;

        public:

            TimingCPU();

            ~TimingCPU();

            void StartCounter();

            double GetCounter();
    };

#elif defined(_WIN32) || defined(_WIN64)

    struct PrivateTimingCPU;

    class TimingCPU
    {
        private:
            PrivateTimingCPU *privateTimingCPU;

        public:

            TimingCPU();

            ~TimingCPU();

            void StartCounter();

            double GetCounter();

    }; // TimingCPU class

#endif

#endif

TimingGPU.cu

/**************/
/* TIMING GPU */
/**************/

#include "TimingGPU.cuh"

#include <cuda.h>
#include <cuda_runtime.h>

struct PrivateTimingGPU {
    cudaEvent_t     start;
    cudaEvent_t     stop;
};

// default constructor
TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; }

// default destructor
TimingGPU::~TimingGPU() { delete privateTimingGPU; }

void TimingGPU::StartCounter()
{
    cudaEventCreate(&((*privateTimingGPU).start));
    cudaEventCreate(&((*privateTimingGPU).stop));
    cudaEventRecord((*privateTimingGPU).start,0);
}

void TimingGPU::StartCounterFlags()
{
    int eventflags = cudaEventBlockingSync;

    cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags);
    cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags);
    cudaEventRecord((*privateTimingGPU).start,0);
}

// Gets the counter in ms
float TimingGPU::GetCounter()
{
    float   time;
    cudaEventRecord((*privateTimingGPU).stop, 0);
    cudaEventSynchronize((*privateTimingGPU).stop);
    cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop);
    return time;
}

TimingGPU.cuh

#ifndef __TIMING_CUH__
#define __TIMING_CUH__

/**************/
/* TIMING GPU */
/**************/

// Events are part of the CUDA API and provide a system-independent way to measure execution
// times on CUDA devices with approximately 0.5 microsecond precision.

struct PrivateTimingGPU;

class TimingGPU
{
    private:
        PrivateTimingGPU *privateTimingGPU;

    public:

        TimingGPU();

        ~TimingGPU();

        void StartCounter();
        void StartCounterFlags();

        float GetCounter();

}; // TimingGPU class

#endif

Solution 3

There is an out-of-the-box GpuTimer struct you can use:

#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

struct GpuTimer
{
      cudaEvent_t start;
      cudaEvent_t stop;

      GpuTimer()
      {
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
      }

      ~GpuTimer()
      {
            cudaEventDestroy(start);
            cudaEventDestroy(stop);
      }

      void Start()
      {
            cudaEventRecord(start, 0);
      }

      void Stop()
      {
            cudaEventRecord(stop, 0);
      }

      float Elapsed()
      {
            float elapsed;
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&elapsed, start, stop);
            return elapsed;
      }
};

#endif  /* __GPU_TIMER_H__ */
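
A short usage sketch (kernel_call, dimGrid and dimBlock are placeholders for your own kernel and launch configuration):

GpuTimer timer;

timer.Start();
kernel_call<<<dimGrid, dimBlock>>>();
timer.Stop();

printf("Elapsed: %3.1f ms\n", timer.Elapsed());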

Solution 4

If you want to measure GPU time you pretty much have to use events. There's a great discussion thread on the do's and don'ts of timing your application over on the NVIDIA forums here.


Comments

  • Tudor
    Tudor over 3 years

    I need to time a CUDA kernel execution. The Best Practices Guide says that we can use either events or standard timing functions like clock() in Windows. My problem is that using these two functions gives me a totally different result. In fact, the result given by events seems to be huge compared to the actual speed in practice.

    What I actually need all this for is to be able to predict the running time of a computation by first running a reduced version of it on a smaller data set. Unfortunately, the results of this benchmark are totally unrealistic, being either too optimistic (clock()) or waaaay too pessimistic (events).

    • pQB
      pQB over 12 years
      Did you synchronize on the CPU after launching your kernel and before taking the end time with clock()?
    • Tudor
      Tudor over 12 years
      Do you mean if I have cudaThreadSynchronize() calls before and after? Yes I do.
    • pQB
      pQB over 12 years
      Yes, that's what I meant.
    • pQB
      pQB over 12 years
      By the way, CUDA timing is returned in ms (microseconds if you use the Visual Profiler). Just in case.
  • Tudor
    Tudor over 12 years
    Thanks, but I need to do these measurements programmatically.
  • talonmies
    talonmies over 12 years
    @Programmer: The profiler also completely serialises the API, and adds latency because it requires additional host-device transfers to gather profile counter output. It is useful for a lot of things, but accurate execution times is not one of them.
  • Programmer
    Programmer over 12 years
    @talonmies: What do you mean by the profiler completely serialising the API?
  • jmsu
    jmsu over 12 years
    @Programmer It is a function or macro he didn't define that handles the errors returned by the cuda function calls. You should do error handling but it could have been omitted here for simplicity.
  • talonmies
    talonmies over 12 years
    @Programmer: The CUDA API is naturally asynchronous (kernel launches, streams, certain classes of memory transfers). When you run programs in the profiler, they all become serial. If you have code which overlaps memory copying with kernel execution, those will be serial when profiled. On Fermi, multiple, simultaneous kernel execution also is disabled during profiling.
  • fbielejec
    fbielejec over 12 years
    @Programmer, yes exactly, there are some useful macros for error handling in the SDK
  • Tom
    Tom over 12 years
    @fbielejec the error handling functions in the SDK are there just to make the examples as simple as possible for education. Calling exit() when you encounter an error is not the best way of handling an error!
  • Zk1001
    Zk1001 over 12 years
    I think you still can do it programmatically if you use the command line profiler (not the visual profiler). But as talonmies said, it serializes the API calls. So what you get is executing all of the API calls in a blocking manner. And also there is a small extra overhead for reading the counters.
  • Kknd
    Kknd over 10 years
    Note that 1e6 us = 1 s, so on the first example time is in seconds, not ms.
  • chappjc
    chappjc over 9 years
    Regarding HANDLE_ERROR, see stackoverflow.com/q/14038589/2778484 and look at helper_cuda.h in the CUDA samples, which has a macro called getLastCudaError.
  • tomriddle_1234
    tomriddle_1234 about 9 years
    According to here, the second method has 1.5ms resolution.
  • Gigi
    Gigi almost 4 years
    Works great! I also had to add #include "TimingCPU.cpp" and #include "TimingGPU.cu" besides the two includes mentioned above.