失眠网 > 显卡性能测试工具BenchMarkTool

显卡性能测试工具BenchMarkTool

时间：2019-07-01 17:25:07

设计自己的CUDAZ

CUDAZ是一款用于查询显卡信息，测试显卡性能的工具，具体参见：

http://cuda-z.sourceforge.net/#block-windows

获取的信息包括：TimeStamp（时间戳）、Compute 能力（Gflops），以及以 GiB/s 为单位的拷贝带宽：fHost2Device 拷贝性能、fDevice2Host 拷贝性能、fHost2DevicePin、fDevice2HostPin、fDevice2Device。

基本框架搭建：

LabBase.h

#pragma once#ifndef LAB_BASE#define LAB_BASE #include <stdio.h> #include <fstream>#include <windows.h>#include <vector>#include <list>#include <iostream>#include <iomanip>#include <sstream>#include <math.h>#include <string>#include <time.h>#include <tchar.h>#include <cstring>#include <psapi.h>#pragma comment(lib,"psapi.lib")#include <direct.h>#include <io.h>typedef _ULonglong uint64_t;class LabBase{public://Singleton lazy allocstatic LabBase getInstance();static uint64_t getTime_UTC();static std::string getTime_Str();public:template<typename T>static void log2File(const T &Info,const std::string &logfile){std::ostringstream streamInfo;streamInfo.clear();streamInfo.str("");streamInfo<<Info;std::string sInfo = streamInfo.str();LabBase obj = LabBase::getInstance();obj.log2File(sInfo,logfile);}template<typename T>static void log2File(const T &Info,const char *const file, int const line,const std::string &logfile) {std::string sfile(file);std::ostringstream streamInfo;streamInfo.clear();streamInfo.str("");streamInfo<<Info;std::string sInfo = streamInfo.str();LabBase obj = LabBase::getInstance();obj.log2File(sInfo,sfile,line,logfile);}/// \brief Save file.template<typename T>static void SaveFile(const T* data, size_t len, const std::string &file) {std::ofstream ofs(file, std::ios::binary|std::ios::ate);if(ofs.fail()) {std::cout<<"failed to open file: " + file<<std::endl;return;}ofs.write((char*)data, len*sizeof(T));ofs.close();}private:void log2File(const std::string &sInfo,const std::string &file="1.log");void log2File(const char* charArray,const std::string &file="1.log");void log2File(const std::string &sInfo,const std::string &file, int const line,const std::string &logfile="1.log");};class LogAlgoPerf{public:explicit LogAlgoPerf(void);virtual ~LogAlgoPerf(void);public:// set Log file name & will be saved to "D:\\LogAlgoPerf\\"void setLogFile(const std::string &strFileName);// set start logging timer void setLogStart();// get logging timevoid getLogEnd();// get 
Process Memory info WorkSet Paged Pinnedtemplate<typename T>void logInfo(T info){//decltypem_streamInfo<<info<<"";}void logMemInfo(); // Log other info double getTotalTime();private:void log2File();void mkdir();void reset();private:LARGE_INTEGER m_nBegTime;LARGE_INTEGER m_nEndTime;LARGE_INTEGER m_nFreq;std::stringm_strFile;std::ostringstreamm_streamInfo;std::vector<LARGE_INTEGER>m_vBegTime;std::vector<LARGE_INTEGER>m_vEndTime;std::vector<std::string>m_vInfo;};//#define LOGPERF(_ins, _m, ...)#define LOGPERF(_ins, _m, ...) {_ins.##_m(##__VA_ARGS__);}#endif LAB_BASE

CudaBase.h

#pragma once#ifndef __CUDA_BASE__#define __CUDA_BASE__#include"cuda_runtime.h"#include"cublas.h"#include "device_launch_parameters.h"#include "LabBase.h"template< typename T >inline bool checkOpt(T cudaFunc, char const *const cudaFuncName, const char *const file, int const line){bool bSuc = true;auto result = cudaFunc;if (result != cudaSuccess){bSuc = false;std::ostringstream oss;oss<<"Throw cudaError: "<<result<<"File: "<<file<<", cudaFunction: "<<cudaFuncName<<", Line: "<<line;size_t nBytesFree = 0, nBytesTotal = 0;if (cudaMemGetInfo(&nBytesFree, &nBytesTotal) == cudaSuccess){int DeviceId =-1;cudaGetDevice(&DeviceId);cudaGetLastError();// reset cuda error oss<<" GPU "<<DeviceId<<" FreeMemory: "<<nBytesFree<<" bytes, TotalMemory: "<<nBytesTotal<<" bytes";}std::string sInfo = oss.str();LabBase::log2File(sInfo,__FILE__, __LINE__,"checkCudaErrors.log");printf("%s",oss.str());throw std::logic_error(oss.str());}return bSuc;}#define checkCudaErrors(cudaFunc) checkOpt ( (cudaFunc), #cudaFunc, __FILE__, __LINE__ )#endif__CUDA_BASE__

BenchMarks.h

// Include guard: this header defines types and is pulled in by more than one file.
#pragma once

#include "../../common/CudaBase.h"

// Test memory Opts
/// Copy direction / host memory kind selected for one bandwidth measurement.
enum GPUCOPYMODEL
{
    COPY_MODE_Hpage2D = 0, /*!< Host pageable memory to device data copy mode. */
    COPY_MODE_D2Hpage = 1, /*!< Device to host pageable memory data copy mode. */
    COPY_MODE_Hpin2D  = 2, /*!< Host pinned memory to device data copy mode. */
    COPY_MODE_D2Hpin  = 3, /*!< Device to host pinned memory data copy mode. */
    COPY_MODE_D2D     = 4  /*!< Device to device data copy mode. */
};

/// Buffers used by the bandwidth test; all of them are szMem bytes large.
struct memBuff
{
    size_t szMem;        /*!< Size in bytes of every buffer below. */
    void*  memHostPage;  /*!< Pageable host memory. */
    void*  memHostPin;   /*!< Pinned host memory. */
    void*  memDevice1;   /*!< Device memory buffer 1. */
    void*  memDevice2;   /*!< Device memory buffer 2. */

    memBuff()
        : szMem(0)
        , memHostPage(NULL)
        , memHostPin(NULL)
        , memDevice1(NULL)
        , memDevice2(NULL)
    {}
};

/// Measured copy bandwidth for each copy mode (values produced by
/// BandWidth::getBandWidth, in GiB/s).
struct memInfo
{
    float fH2DPage; /*!< Host pageable -> device. */
    float fD2HPage; /*!< Device -> host pageable. */
    float fH2DPin;  /*!< Host pinned -> device. */
    float fD2HPin;  /*!< Device -> host pinned. */
    float fD2D;     /*!< Device -> device. */

    memInfo()
        : fH2DPage(0.0f)
        , fD2HPage(0.0f)
        , fH2DPin(0.0f)
        , fD2HPin(0.0f)
        , fD2D(0.0f)
    {}
};

/// Owns the test buffers and measures memory-copy bandwidth.
/// NOTE(review): the destructor releases the buffers held in m_sData and the
/// class relies on the implicit copy operations, so a copied object would
/// release the same buffers a second time - do not copy instances.
class BandWidth
{
public:
    BandWidth(void);
    ~BandWidth(void);

    /// Allocate the test buffers; only sData.szMem is taken as input
    /// (0 selects a built-in default size).
    void memBandInit(memBuff sData);

    /// Run every copy mode once and return the measured bandwidths.
    memInfo getBandWidth();

private:
    /// Time iterNum copies in the given mode and return the bandwidth.
    float getMemSpeed(GPUCOPYMODEL model, int iterNum = 10);

    /// Release all buffers referenced by m_sData.
    void memAllocFree();

private:
    memBuff m_sData;
};

BenchMarks.cuh

// Include guard so the declarations can be included from several files.
#pragma once

#include "BenchMarks.h"

/// Average time in milliseconds measured around launches of an empty kernel.
extern "C" float TestFunc_Launchlatency();

/// Estimated single-precision compute throughput in GFLOPs.
extern "C" float TestFunc_CalcPerformance();

/// Copy bandwidth for every copy mode of GPUCOPYMODEL.
extern "C" memInfo TestFunc_BandWidth();

BenchMark.cu

#include <vector>
#include "BenchMarks.cuh"

// empty Kernel (used to time bare kernel launches)
__global__ void empty() {}

// 16 dependent multiply-add statements, alternating between a and b.
#define CALC_FMAD_16(a, b) \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b; \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b; \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b; \
    a = a * a + a; b = b * b + b; a = a * a + a; b = b * b + b;

// 16 x CALC_FMAD_16 = 256 multiply-add statements.
#define CALC_FMAD_256(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) \
    CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b) CALC_FMAD_16(a, b)

/// Arithmetic-throughput kernel: every thread runs `iters` rounds of
/// 8 x 256 multiply-add statements on two floats and stores their sum.
///
/// Launch layout: 1D grid of 1D blocks. There is no bounds check, so `buf`
/// must hold at least gridDim.x * blockDim.x floats.
__global__ void CalcKernelFloat(
    void *buf,  /*!<[in] Data buffer. */
    int iters)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    float *arr = (float*)buf;
    float val1 = index;
    float val2 = arr[index];

    for (int i = 0; i < iters; i++)
    {
        CALC_FMAD_256(val1, val2);
        CALC_FMAD_256(val1, val2);
        CALC_FMAD_256(val1, val2);
        CALC_FMAD_256(val1, val2);
        CALC_FMAD_256(val1, val2);
        CALC_FMAD_256(val1, val2);
        CALC_FMAD_256(val1, val2);
        CALC_FMAD_256(val1, val2);
    }

    // Store the result so the arithmetic above has an observable effect.
    arr[index] = val1 + val2;
}

/// Measure the average time of launching (and completing) an empty kernel.
/// \return Average time per launch in milliseconds over 100 launches.
/// \throws std::logic_error (via checkCudaErrors) when a CUDA call fails.
float TestFunc_Launchlatency()
{
    const int iters = 100;

    // NOTE(review): cudaFree(0) is presumably here to force CUDA context
    // creation before timing starts - result deliberately not checked.
    cudaFree(0);

    // Warmup phase
    empty<<<1, 1>>>();
    checkCudaErrors(cudaGetLastError());

    cudaEvent_t start;
    cudaEvent_t stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaDeviceSynchronize());

    // Benchmark phase
    float timeMs = 0.0f;
    for (int i = 0; i < iters; ++i)
    {
        float loopMs = 0.0f;
        // Record `start` for every launch. Recording it only once before the
        // loop made each sample the cumulative time since the first launch,
        // which inflated the reported average.
        checkCudaErrors(cudaEventRecord(start, 0));
        empty<<<1, 1024>>>();
        checkCudaErrors(cudaGetLastError());
        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaEventSynchronize(stop));
        checkCudaErrors(cudaEventElapsedTime(&loopMs, start, stop));
        timeMs += loopMs;
    }

    // The events were previously never released.
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    return timeMs / iters;
}

/// Measure copy bandwidth for all copy modes using 100 MiB buffers.
/// \return Bandwidth per copy mode as reported by BandWidth::getBandWidth.
memInfo TestFunc_BandWidth()
{
    memBuff sData;
    BandWidth obj;
    sData.szMem = 1024 * 1024 * 100;
    obj.memBandInit(sData);
    memInfo info = obj.getBandWidth();
    return info;
}

/// Estimate single-precision compute throughput.
///
/// A single block of CalcKernelFloat is timed and the result is multiplied by
/// the number of multiprocessors, i.e. the figure is an extrapolation that
/// assumes every multiprocessor performs like the one block measured.
///
/// \return Estimated GFLOPs, or 0 when no elapsed time could be measured.
/// \throws std::logic_error (via checkCudaErrors) when a CUDA call fails.
float TestFunc_CalcPerformance()
{
    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDeviceProperties(&prop, 0));

    const int nRepeatNum      = 10;
    const int iterNumInKernel = 32;
    const int blocksNum       = 1;
    int threadsNum            = prop.maxThreadsPerBlock;

    // Fallback when the device reports no block-size limit.
    if (threadsNum == 0)
    {
        int warpSize = prop.warpSize;
        if (warpSize == 0)
            warpSize = 32;
        threadsNum = warpSize * 2;
        if (threadsNum > 512)
            threadsNum = 512;
    }

    float* pfDiviceBuff = NULL;
    checkCudaErrors(cudaMalloc((void**)&pfDiviceBuff, threadsNum * sizeof(float)));
    checkCudaErrors(cudaMemset(pfDiviceBuff, 0, threadsNum * sizeof(float)));

    // std::vector owns the host array; the former std::shared_ptr<float>
    // released a new[] allocation with plain delete.
    std::vector<float> hostInit(threadsNum);
    for (int i = 0; i < threadsNum; ++i)
    {
        hostInit[i] = i * 0.03141592653f;
    }
    checkCudaErrors(cudaMemcpy(pfDiviceBuff, &hostInit[0], threadsNum * sizeof(float), cudaMemcpyHostToDevice));

    cudaEvent_t start;
    cudaEvent_t stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    float timeMs = 0.0f;
    for (int i = 0; i < nRepeatNum; ++i)
    {
        float loopMs = 0.0f;
        checkCudaErrors(cudaEventRecord(start, 0));
        CalcKernelFloat<<<blocksNum, threadsNum>>>(pfDiviceBuff, iterNumInKernel);
        checkCudaErrors(cudaGetLastError());
        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaEventSynchronize(stop));
        checkCudaErrors(cudaEventElapsedTime(&loopMs, start, stop));
        timeMs += loopMs;
    }

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFree(pfDiviceBuff));

    const int nOpsNumofFMA  = 2;   // Number of operations per one loop
    const int nFMAInstNum   = 256; // Size of instruction block
    const int nIterofFMAOps = 8;   // Number of instruction blocks in loop

    // Avoid dividing by zero if the timer reported no elapsed time.
    if (timeMs <= 0.0f)
        return 0.0f;

    // Operations of one kernel launch, scaled to all multiprocessors.
    const double opsPerLaunch =
        (double)prop.multiProcessorCount * (double)threadsNum   // max cores * warp
        * (double)iterNumInKernel * (double)nIterofFMAOps
        * (double)nFMAInstNum * (double)nOpsNumofFMA;           // ops of cores
    // timeMs is in milliseconds, hence the factor 1000 for ops per second.
    const double opsPerSecond = opsPerLaunch * (double)nRepeatNum * 1000.0 / (double)timeMs;
    const float GFLOPs = (float)(opsPerSecond / 1000.0 / 1000.0 / 1000.0); // G

    //printf("GFLOPs %10f \r\n",GFLOPs);
    return GFLOPs;
}

BenchMark.cpp

#include "BenchMarks.h"BandWidth::BandWidth(void){}BandWidth::~BandWidth(void){memAllocFree();}void BandWidth::memBandInit(memBuff sData){m_sData = sData;if (m_sData.szMem==0){m_sData.szMem = 1024*1024*1;}m_sData.memHostPage = (void*)malloc(m_sData.szMem);auto state = cudaMallocHost((void**)&m_sData.memHostPin,m_sData.szMem);cudaMalloc((void**)&m_sData.memDevice1,m_sData.szMem);cudaMalloc((void**)&m_sData.memDevice2,m_sData.szMem);}float BandWidth::getMemSpeed(GPUCOPYMODEL model,int iterNum){float timeMs= 0.0;cudaEvent_t start;cudaEvent_t stop;checkCudaErrors(cudaEventCreate(&start));checkCudaErrors(cudaEventCreate(&stop));for(int i = 0;i<iterNum;++i){float loopMs = 0.0;cudaEventRecord(start, 0);switch(model){case COPY_MODE_Hpage2D:checkCudaErrors(cudaMemcpy(m_sData.memDevice1, m_sData.memHostPage, m_sData.szMem, cudaMemcpyHostToDevice));break;case COPY_MODE_D2Hpage:checkCudaErrors(cudaMemcpy(m_sData.memHostPage, m_sData.memDevice1, m_sData.szMem, cudaMemcpyDeviceToHost));break;case COPY_MODE_Hpin2D:checkCudaErrors(cudaMemcpy(m_sData.memDevice2, m_sData.memHostPin, m_sData.szMem, cudaMemcpyHostToDevice));break;case COPY_MODE_D2Hpin:checkCudaErrors(cudaMemcpy(m_sData.memHostPin, m_sData.memDevice2, m_sData.szMem, cudaMemcpyDeviceToHost));break;case COPY_MODE_D2D:checkCudaErrors(cudaMemcpy(m_sData.memDevice1, m_sData.memDevice2, m_sData.szMem, cudaMemcpyDeviceToHost));break;default:break;}cudaEventRecord(stop, 0);checkCudaErrors(cudaEventSynchronize(stop));cudaEventElapsedTime(&loopMs, start, stop);timeMs += loopMs;}checkCudaErrors(cudaEventDestroy(start));checkCudaErrors(cudaEventDestroy(stop));float bandwidthGiBs = (1000 *m_sData.szMem*iterNum) / (timeMs *(float)(1 << 30));return bandwidthGiBs;}void BandWidth::memAllocFree(){if (m_sData.memDevice1!=NULL){cudaFree(m_sData.memDevice1);}if (m_sData.memDevice2!=NULL){cudaFree(m_sData.memDevice2);}if (m_sData.memHostPage!=NULL){free(m_sData.memHostPage);}if 
(m_sData.memHostPin!=NULL){cudaFreeHost(m_sData.memHostPin);}}memInfo BandWidth::getBandWidth(){memInfo info;info.fH2DPage= getMemSpeed(COPY_MODE_Hpage2D);info.fD2HPage= getMemSpeed(COPY_MODE_D2Hpage);info.fH2DPin= getMemSpeed(COPY_MODE_Hpin2D);info.fD2HPin= getMemSpeed(COPY_MODE_D2Hpin);info.fD2D= getMemSpeed(COPY_MODE_D2D);return info;}

Main.cpp

void TestFunc_PerfMon(){TestFunc_Launchlatency();printf("TimeStampCompute(Gflops) fH2D fD2H fH2DPin fD2HPin fD2D(GiB/s) \r\n");for (int i=0;i<10000;++i){std::string strtime = LabBase::getTime_Str();memInfo info = TestFunc_BandWidth();float Gflops = TestFunc_CalcPerformance();printf("%s %10f %10f %10f %10f %10f %10f \r\n",strtime.c_str(),Gflops,info.fH2DPage,info.fD2HPage,info.fH2DPin,info.fD2HPin,info.fD2D);Sleep(1000);}}

如果觉得《显卡性能测试工具BenchMarkTool》对你有帮助，请点赞、收藏，并留下你的观点哦！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。