// Demo entry 6659956
//
// b
//
//
//
// Submitted by anonymous on Nov 13, 2017 at 12:47
// Language: CUDA. Code size: 5.7 kB.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

//CUDA RunTime API
#include <cuda_runtime.h>

// Number of input elements: 1048576 = 2^20 (1M ints).
#define DATA_SIZE 1048576

// Threads per block used for the kernel launch.
// main() sets it from argv[2] when positive, otherwise defaults it to 256.
int g_thread_num;

// Number of blocks used for the kernel launch.
// main() sets it from argv[1] when positive, otherwise defaults it to 32.
int g_block_num;

// Clock rate of the selected device, copied from cudaDeviceProp::clockRate
// by InitCUDA(); main() divides elapsed device clock ticks by it.
int gpu_clock;

// Host-side input buffer; filled by GenerateNumbers() and copied to the device in main().
int data[DATA_SIZE];

// Fills the first `size` entries of `number` with pseudo-random digits in
// the range 0-9, drawing one rand() value per entry in ascending order.
// A non-positive `size` leaves the buffer untouched.
void GenerateNumbers(int *number, int size)
{
    int *cursor = number;
    for (int remaining = size; remaining > 0; --remaining) {
        const int digit = rand() % 10;
        *cursor = digit;
        ++cursor;
    }
}

// Prints a selection of device properties to stdout, one per line.
//
// prop: properties of one device, as filled in by cudaGetDeviceProperties().
//
// The size_t members (totalGlobalMem, sharedMemPerBlock, memPitch,
// totalConstMem, textureAlignment) are printed with %zu: %lu is only correct
// where unsigned long has the same width as size_t, which is not the case on
// 64-bit Windows.
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate);
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// Selects the first usable CUDA device and makes it current.
//
// Enumerates the devices, prints the properties of each one that can be
// queried (up to and including the one selected), and picks the first whose
// compute capability major version is >= 1. On success the device's clockRate
// is stored in the global gpu_clock.
//
// Returns true when a device was selected, false otherwise (a diagnostic is
// written to stderr in that case).
bool InitCUDA()
{
    // Initialised so it is never read uninitialised if the query fails.
    int count = 0;

    // Get the number of CUDA-capable devices.
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s.\n", cudaGetErrorString(err));
        return false;
    }

    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    int i;

    for (i = 0; i < count; i++) {

        cudaDeviceProp prop;

        // Query once, and only use `prop` when the query succeeded.
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
            // Clear the recorded error so a later cudaGetLastError() does not
            // report this skipped device.
            cudaGetLastError();
            continue;
        }

        // Print the device information.
        printDeviceProp(prop);

        if (prop.major >= 1) {
            // Remember the device clock rate for the timing report.
            gpu_clock = prop.clockRate;
            break;
        }
    }

    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s.\n", i, cudaGetErrorString(err));
        return false;
    }

    return true;
}


// __global__ function (runs on the GPU): per-thread partial sums of cubes.
//
// Despite its name, the kernel accumulates num[i]^3, not num[i]^2.
//
// Expected launch: a 1D grid of `block_num` blocks with `thread_num` threads
// each; no shared memory is used.
//
// num:        device array of DATA_SIZE ints (input).
// result:     device array of block_num * thread_num ints; every thread
//             writes its own partial sum to result[bid * thread_num + tid].
// time:       device array of 2 * block_num entries; time[bid] receives the
//             clock() value read by thread 0 of block `bid` before the loop,
//             time[bid + block_num] the value read after the partial sum is stored.
// block_num:  number of blocks in the launch.
// thread_num: number of threads per block in the launch.
//
// Elements are distributed with a stride of block_num * thread_num, so every
// element of num is visited exactly once even when DATA_SIZE is not a
// multiple of the number of threads (the previous DATA_SIZE / threads chunking
// silently dropped the remainder), and neighbouring threads read neighbouring
// elements. The sum of all result[] entries equals the sum of cubes over num.
__global__ static void sumOfSquares(int *num, int* result, clock_t* time, int block_num, int thread_num)
{
    // Index of this thread within its block (0-based).
    const int tid = threadIdx.x;

    // Index of the block this thread belongs to (0-based).
    const int bid = blockIdx.x;

    // Flat thread index and total thread count; computed in size_t so the
    // products cannot overflow int for large launch configurations.
    const size_t global_id = (size_t)bid * (size_t)thread_num + (size_t)tid;
    const size_t stride = (size_t)block_num * (size_t)thread_num;

    int sum = 0;

    // Only thread 0 of each block records the block's start time.
    if (tid == 0) time[bid] = clock();

    // Threads beyond DATA_SIZE simply skip the loop and store 0.
    for (size_t i = global_id; i < (size_t)DATA_SIZE; i += stride) {
        const int v = num[i];
        sum += v * v * v;
    }

    // One result slot per thread.
    result[global_id] = sum;

    // Only thread 0 of each block records the block's end time.
    if (tid == 0) time[bid + block_num] = clock();
}





// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s.\n", what, cudaGetErrorString(status));
    return false;
}

// Program entry point.
//
// Usage: <program> [block_count thread_count]
//
// Generates DATA_SIZE random digits, launches sumOfSquares on the GPU with
// the requested (or default 32 x 256) configuration, adds up the per-thread
// partial sums, prints the GPU result with its timing, then prints the same
// sum computed on the CPU for comparison.
//
// Returns 0 on success and EXIT_FAILURE when CUDA initialisation, an
// allocation, a copy, or the kernel launch fails.
int main(int argc, char** argv)
{
    // Optional launch configuration from the command line; values that are
    // not positive are ignored so the defaults below apply.
    if (argc >= 3)
    {
        int arg_blocks = atoi(argv[1]);
        int arg_threads = atoi(argv[2]);
        if (arg_blocks > 0)
        {
            g_block_num = arg_blocks;
        }
        if (arg_threads > 0)
        {
            g_thread_num = arg_threads;
        }
    }
    if (g_thread_num <= 0)
    {
        g_thread_num = 256;
    }
    if (g_block_num <= 0)
    {
        g_block_num = 32;
    }

    // CUDA initialisation (InitCUDA reports its own errors).
    if (!InitCUDA()) {
        return EXIT_FAILURE;
    }

    // Generate the random input.
    GenerateNumbers(data, DATA_SIZE);

    // All sizes are computed in size_t so block * thread products cannot
    // overflow int.
    const size_t total_threads = (size_t)g_block_num * (size_t)g_thread_num;
    const size_t timer_count = (size_t)g_block_num * 2;
    const size_t data_bytes = sizeof(int) * DATA_SIZE;
    const size_t result_bytes = sizeof(int) * total_threads;
    const size_t timer_bytes = sizeof(clock_t) * timer_count;

    // Host-side buffers that receive the per-thread sums and per-block timestamps.
    int* sum = (int*)calloc(total_threads, sizeof(int));
    clock_t* time_use = (clock_t*)calloc(timer_count, sizeof(clock_t));

    // Device buffers: input data, per-thread results, per-block timestamps.
    int* gpudata = NULL;
    int* result = NULL;
    clock_t* gpu_time = NULL;

    bool ok = (sum != NULL && time_use != NULL);
    if (!ok) {
        fprintf(stderr, "Host memory allocation failed.\n");
    }

    // Allocate device memory and copy the input to the device.
    // The && chain stops at the first failing call.
    ok = ok
        && cudaCallOk(cudaMalloc((void**)&gpudata, data_bytes), "cudaMalloc(gpudata)")
        && cudaCallOk(cudaMalloc((void**)&result, result_bytes), "cudaMalloc(result)")
        && cudaCallOk(cudaMalloc((void**)&gpu_time, timer_bytes), "cudaMalloc(time)")
        && cudaCallOk(cudaMemcpy(gpudata, data, data_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    if (ok) {
        // Launch syntax: kernel<<<block count, thread count, shared memory size>>>(args...);
        sumOfSquares<<<g_block_num, g_thread_num, 0>>>(gpudata, result, gpu_time, g_block_num, g_thread_num);

        // A launch does not return a status; an invalid configuration
        // (e.g. too many threads per block) only shows up here.
        ok = cudaCallOk(cudaGetLastError(), "sumOfSquares launch");
    }

    // Copy the results back to host memory. These blocking copies also
    // surface errors raised while the kernel was running.
    ok = ok
        && cudaCallOk(cudaMemcpy(sum, result, result_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result)")
        && cudaCallOk(cudaMemcpy(time_use, gpu_time, timer_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(time)");

    // Release device memory on every path; pointers that were never
    // allocated are still NULL, which cudaFree accepts.
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(gpu_time);

    if (!ok) {
        free(sum);
        free(time_use);
        return EXIT_FAILURE;
    }

    // Add up the per-thread partial sums.
    int final_sum = 0;

    for (size_t i = 0; i < total_threads; i++) {
        final_sum += sum[i];
    }
    free(sum);

    // Timing: earliest block start subtracted from latest block end gives
    // the total number of elapsed device clock ticks.
    clock_t min_start = time_use[0];
    clock_t max_end = time_use[g_block_num];

    for (int i = 1; i < g_block_num; i++) {
        if (min_start > time_use[i])
            min_start = time_use[i];
        if (max_end < time_use[i + g_block_num])
            max_end = time_use[i + g_block_num];
    }
    free(time_use);

    // Convert ticks to time. cudaDeviceProp::clockRate is documented in kHz,
    // so ticks / gpu_clock is milliseconds and the extra / 1000 gives seconds.
    float t = (float)(max_end - min_start)/(float)gpu_clock/1000;
    // Cast so the argument type matches %ld regardless of how clock_t is defined.
    printf("GPUsum: %d  gpuclock: %ld, gputime: %.5f\n", final_sum, (long)(max_end - min_start), t);

    // Reference result computed on the CPU.
    final_sum = 0;

    for (int i = 0; i < DATA_SIZE; i++) {
        final_sum += data[i] * data[i] * data[i];
    }

    printf("CPUsum: %d \n", final_sum);

    return 0;
}

// This snippet took 0.02 seconds to highlight.
//
// Back to the Entry List or Home.
//
// Delete this entry (admin only).