共计 4529 个字符,预计需要花费 12 分钟才能阅读完成。
Event 是 CUDA 中的事件,用于分析、检测 CUDA 程序中的错误。一般我们会定义一个宏:#pragma once
#include <stdio.h>
#include <stdlib.h>

// CHECK(call): evaluates a CUDA runtime call exactly once. If the result is
// not cudaSuccess it prints the file, line, numeric error code and error text,
// then terminates the process with exit(1).
// The do { ... } while (0) wrapper makes the macro behave as one statement.
// Do not wrap cudaEventQuery() in it: cudaErrorNotReady is an expected,
// non-fatal return value there and would abort the program.
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = (call);            \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("File: %s\n", __FILE__);               \
        printf("Line: %d\n", __LINE__);               \
        printf("Error code: %d\n", (int)error_code);  \
        printf("Error text: %s\n",                    \
               cudaGetErrorString(error_code));       \
        exit(1);                                      \
    }                                                 \
} while (0)
并在适当的地位应用这个宏来打印 CUDA 的谬误日志。#pragma once, 不要放在源代码文件里,这个个别只放在头文件里的。(避免头文件被引入屡次)Event 的调用有以下内容:
具体的步骤如下:(1)声明 Event(这里以计算核函数运行时间前后的 start Event 和 stop Event 为例)cudaEvent_t start, stop;(2)创建 Event CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));(3)添加 Event(在合适的地方)cudaEventRecord(start);
cudaEventRecord(stop);(4)等待 Event 完成(a)非阻塞方式——可以用于一些不需要等待的处理 cudaEventQuery(start);(b)阻塞方式——可以用于执行核函数后等待核函数执行完毕后的处理 cudaEventSynchronize(stop);(5)计算两个 Event 间隔时间 CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));(6)销毁 Event CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop)); 以上次介绍的矩阵乘为例,完整的代码如下:#pragma once
#include <stdio.h>
#include <stdlib.h>

// CHECK(call): evaluates a CUDA runtime call exactly once. If the result is
// not cudaSuccess it prints the file, line, numeric error code and error text,
// then terminates the process with exit(1).
// The do { ... } while (0) wrapper makes the macro behave as one statement.
// Do not wrap cudaEventQuery() in it: cudaErrorNotReady is an expected,
// non-fatal return value there and would abort the program.
#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = (call);            \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("File: %s\n", __FILE__);               \
        printf("Line: %d\n", __LINE__);               \
        printf("Error code: %d\n", (int)error_code);  \
        printf("Error text: %s\n",                    \
               cudaGetErrorString(error_code));       \
        exit(1);                                      \
    }                                                 \
} while (0)
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "error.cuh"

#define BLOCK_SIZE 32
// Naive integer matrix multiplication kernel: c = a * b.
//
// Indexing used below: a[row * n + i], b[i * k + col], c[row * k + col],
// i.e. a is read as m x n, b as n x k and c is written as m x k.
//
// Launch layout: 2D grid of 2D blocks. The x dimension maps to columns of c
// and the y dimension to rows of c; each thread computes one element of c.
// Threads that fall outside the m x k output do nothing, so the grid may be
// larger than the matrix.
//
// Parameters:
//   a, b    - input matrices (only read)
//   c       - output matrix (one element written per in-range thread)
//   m, n, k - matrix dimensions as described above
__global__ void gpu_matrix_mult(const int *a, const int *b, int *c, int m, int n, int k)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against the partial blocks at the right/bottom edge of the grid.
    if (col < k && row < m)
    {
        int sum = 0;
        for (int i = 0; i < n; i++)
        {
            sum += a[row * n + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
}
// CPU reference implementation of integer matrix multiplication:
// h_result = h_a * h_b.
//
// Indexing used below: h_a[i * n + h], h_b[h * k + j], h_result[i * k + j],
// i.e. h_a is read as m x n, h_b as n x k and h_result is written as m x k.
//
// Parameters:
//   h_a, h_b - input matrices (only read)
//   h_result - output matrix; every one of its m * k elements is overwritten
//   m, n, k  - matrix dimensions as described above
void cpu_matrix_mult(const int *h_a, const int *h_b, int *h_result, int m, int n, int k)
{
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            int tmp = 0;
            for (int h = 0; h < n; ++h)
            {
                tmp += h_a[i * n + h] * h_b[h * k + j];
            }
            h_result[i * k + j] = tmp;
        }
    }
}
// Demo driver: fills two 100 x 100 integer matrices with random values,
// multiplies them on the GPU, times the kernel and the device-to-host copy
// with CUDA events, and verifies the result against cpu_matrix_mult().
// Returns 0; any failing CUDA call terminates the process through CHECK.
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    const int m = 100;
    const int n = 100;
    const int k = 100;

    // Declare and create the events. An event has to be created before it
    // is recorded, queried or used for timing.
    cudaEvent_t start, stop, stop2;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventCreate(&stop2));

    // Host buffers: inputs, GPU result (h_c) and CPU reference (h_cc).
    int *h_a, *h_b, *h_c, *h_cc;
    CHECK(cudaMallocHost((void **) &h_a, sizeof(int) * m * n));
    CHECK(cudaMallocHost((void **) &h_b, sizeof(int) * n * k));
    CHECK(cudaMallocHost((void **) &h_c, sizeof(int) * m * k));
    CHECK(cudaMallocHost((void **) &h_cc, sizeof(int) * m * k));

    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            h_a[i * n + j] = rand() % 1024;
        }
    }
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            h_b[i * k + j] = rand() % 1024;
        }
    }

    int *d_a, *d_b, *d_c;
    CHECK(cudaMalloc((void **) &d_a, sizeof(int) * m * n));
    CHECK(cudaMalloc((void **) &d_b, sizeof(int) * n * k));
    CHECK(cudaMalloc((void **) &d_c, sizeof(int) * m * k));

    // Copy matrices A and B from host to device memory.
    CHECK(cudaMemcpy(d_a, h_a, sizeof(int) * m * n, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, h_b, sizeof(int) * n * k, cudaMemcpyHostToDevice));

    // Ceil-divide so the grid covers the whole m x k output.
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    // Record the start event.
    CHECK(cudaEventRecord(start));
    // Non-blocking poll. Deliberately not wrapped in CHECK: a return value
    // of cudaErrorNotReady only means the event has not completed yet.
    (void)cudaEventQuery(start);

    // The original listing launched gpu_matrix_mult_shared, which is not
    // defined in this file; launch the kernel that is defined here.
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    // A kernel launch returns nothing; launch errors are picked up here.
    CHECK(cudaGetLastError());

    // Record the stop event and block until it (and therefore the kernel
    // queued before it) has completed.
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    // Elapsed time start -> stop.
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("start-》stop:Time = %g ms.\n", elapsed_time);

    CHECK(cudaMemcpy(h_c, d_c, sizeof(int) * m * k, cudaMemcpyDeviceToHost));

    // Record the stop2 event after the copy.
    CHECK(cudaEventRecord(stop2));
    // Non-blocking poll, unchecked for the same reason as above.
    (void)cudaEventQuery(stop2);
    // Both events must have completed before their interval can be read,
    // so wait for stop2 before asking for the elapsed time.
    CHECK(cudaEventSynchronize(stop2));

    // Elapsed time stop -> stop2.
    float elapsed_time2;
    CHECK(cudaEventElapsedTime(&elapsed_time2, stop, stop2));
    printf("stop-》stop2:Time = %g ms.\n", elapsed_time2);

    // Destroy the events.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaEventDestroy(stop2));

    // CPU reference computation.
    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);

    // The data is integral, so the two results must match exactly.
    int ok = 1;
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            if (h_cc[i * k + j] != h_c[i * k + j])
            {
                ok = 0;
            }
        }
    }

    if (ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }

    // Free device and pinned host memory (h_cc was leaked in the original).
    CHECK(cudaFree(d_a));
    CHECK(cudaFree(d_b));
    CHECK(cudaFree(d_c));
    CHECK(cudaFreeHost(h_a));
    CHECK(cudaFreeHost(h_b));
    CHECK(cudaFreeHost(h_c));
    CHECK(cudaFreeHost(h_cc));
    return 0;
}
在 Quardo P1000 的 GPU 上执行:
这里以矩阵乘为例,打印了调用矩阵乘核函数的时间,以及后面 cudaMemcpy 的时间。我们强行将 CHECK(cudaMemcpy(d_b, h_b, sizeof(int)*n*k, cudaMemcpyHostToDevice)); 改为 CHECK(cudaMemcpy(d_b, h_b, sizeof(int)*n*k*2, cudaMemcpyHostToDevice)); 故意让其越界。再重新编译,运行,看看效果:
系统会告诉你这行有错:
这样就可以跟踪出 CUDA 调用中的错误。这里需要总结一下张小白在调试 CHECK 过程中发现的几个问题:(1)如果没有 CHECK(cudaEventCreate()) 就直接调用 cudaEventRecord() 或者执行后面的 Event 函数,会导致打印不了信息。张小白当时对于 stop2 这个 event 就犯了这个错,导致 stop->stop2 的时间怎么都打不出来。(2)对于 cudaEventQuery() 是不能加 CHECK 的,如果加了反而会报错:在上面的环境中,如果您这样写:CHECK(cudaEventQuery(stop2)); 编译执行就会出现以下错误:
cudaEventQuery 的 cudaErrorNotReady 代表了事件还没发生(还没有被记录),不代表错误。