OpenCL编程指南-2.1HelloWorld - 服务器托管|北京服务器租用|机房托管租用|IDC托管租用|机房机柜带宽租用-价格及费用咨询

在windows下编写HelloWorld

按照前面的文章搭建好OpenCL的环境：https://blog.csdn.net/qq_36314864/article/details/130513584

main函数完成以下操作：
1）在第一个可用平台上创建OpenCL上下文
2）在第一个可用设备上创建命令队列
3）加载一个内核文件(HelloWorld.cl)并将它构建到程序对象中
4）为内核函数hello_kernel()创建一个内核对象
5）为内核参数创建内存对象(a, b, result)
6）将待执行的内核排队
7）将内核结果读回结果缓冲区

helloworld.cpp

//
// Book:      OpenCL(R) Programming Guide
// Authors:   Aaftab Munshi, Benedict Gaster, Timothy Mattson, James Fung, Dan Ginsburg
// ISBN-10:   0-321-74964-2
// ISBN-13:   978-0-321-74964-2
// Publisher: Addison-Wesley Professional
// URLs:      http://safari.informit.com/9780132488006/
//            http://www.openclprogrammingguide.com
//

// HelloWorld.cpp
//
//    This is a simple example that demonstrates basic OpenCL setup and
//    use.

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#pragma warning( disable : 4996 )
///
//  Constants
//
//  Number of float elements in each of the host arrays (a, b, result) and
//  in each device buffer; also used as the global work size of the kernel.
//
static const int ARRAY_SIZE = 1000;

///
//  Create an OpenCL context on the first available platform using
//  either a GPU or CPU depending on what is available.
//
//  Returns the new context, or NULL when no platform is found or when
//  neither a GPU nor a CPU context can be created (a message is written
//  to std::cerr in both cases).
//
cl_context CreateContext()
{
	cl_int errNum;
	cl_uint numPlatforms = 0;
	cl_platform_id firstPlatformId;
	cl_context context = NULL;

	// First, select an OpenCL platform to run on.  For this example, we
	// simply choose the first available platform.  Normally, you would
	// query for all available platforms and select the most appropriate one.
	errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
	// numPlatforms is unsigned, so "no platform" is exactly zero.
	if (errNum != CL_SUCCESS || numPlatforms == 0)
	{
		std::cerr << "Failed to find any OpenCL platforms." << std::endl;
		return NULL;
	}

	// Next, create an OpenCL context on the platform.  Attempt to
	// create a GPU-based context, and if that fails, try to create
	// a CPU-based context.
	cl_context_properties contextProperties[] =
	{
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)firstPlatformId,
		0	// zero terminates the property list
	};
	context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
		NULL, NULL, &errNum);
	if (errNum != CL_SUCCESS)
	{
		std::cout << "Could not create GPU context, trying CPU..." << std::endl;
		context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_CPU,
			NULL, NULL, &errNum);
		if (errNum != CL_SUCCESS)
		{
			std::cerr << "Failed to create an OpenCL GPU or CPU context." << std::endl;
			return NULL;
		}
	}

	return context;
}

///
//  Create a command queue on the first device available on the
//  context
//
//  context: context whose device list is queried
//  device:  out-parameter; receives the ID of the device the queue was
//           created on (only written on success)
//
//  Returns the new command queue, or NULL on failure (a message is
//  written to std::cerr).
//
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id* device)
{
	// First get the size (in bytes) of the devices buffer
	size_t deviceBufferSize = 0;
	cl_int errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
	if (errNum != CL_SUCCESS)
	{
		std::cerr << "Failed call to clGetContextInfo(...,CL_CONTEXT_DEVICES,...)" << std::endl;
		return NULL;
	}

	// Check the element count rather than the byte size so that devices[0]
	// below is guaranteed to exist.
	const size_t deviceCount = deviceBufferSize / sizeof(cl_device_id);
	if (deviceCount == 0)
	{
		std::cerr << "No devices available." << std::endl;
		return NULL;
	}

	// The vector owns the devices buffer, so every return path frees it
	std::vector<cl_device_id> devices(deviceCount);
	errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices.data(), NULL);
	if (errNum != CL_SUCCESS)
	{
		std::cerr << "Failed to get device IDs" << std::endl;
		return NULL;
	}

	// In this example, we just choose the first available device.  In a
	// real program, you would likely use all available devices or choose
	// the highest performance device based on OpenCL device queries
	cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, &errNum);
	if (errNum != CL_SUCCESS || commandQueue == NULL)
	{
		std::cerr << "Failed to create commandQueue for device 0" << std::endl;
		return NULL;
	}

	*device = devices[0];
	return commandQueue;
}

///
//  Create an OpenCL program from the kernel source file
//
//  context:  context the program object is created in
//  device:   device whose build log is reported when the build fails
//  fileName: path of the kernel source file to load
//
//  Returns the built program, or NULL when the file cannot be opened,
//  the program object cannot be created, or the build fails.  On a build
//  failure the build log is written to std::cerr and the program object
//  is released before returning.
//
cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
{
	std::ifstream kernelFile(fileName, std::ios::in);
	if (!kernelFile.is_open())
	{
		std::cerr << "Failed to open file for reading: " << fileName << std::endl;
		return NULL;
	}

	// Read the whole file into a single string
	std::ostringstream oss;
	oss << kernelFile.rdbuf();

	std::string srcStdStr = oss.str();
	const char* srcStr = srcStdStr.c_str();
	cl_program program = clCreateProgramWithSource(context, 1, &srcStr, NULL, NULL);
	if (program == NULL)
	{
		std::cerr << "Failed to create CL program from source." << std::endl;
		return NULL;
	}

	cl_int errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (errNum != CL_SUCCESS)
	{
		// Determine the reason for the error.  Ask for the log size first
		// so the log is never truncated, and so nothing uninitialized is
		// printed if the query itself fails (the string then stays empty).
		size_t logSize = 0;
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
			0, NULL, &logSize);

		std::string buildLog(logSize, '\0');
		if (logSize > 0)
		{
			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
				logSize, &buildLog[0], NULL);
		}

		std::cerr << "Error in kernel: " << std::endl;
		// c_str() stops at the first NUL, so the terminator is not printed
		std::cerr << buildLog.c_str();
		clReleaseProgram(program);
		return NULL;
	}

	return program;
}

///
//  Create memory objects used as the arguments to the kernel
//  The kernel takes three arguments: a (input), b (input),
//  and result (output)
//
//  context:    context the buffers are created in
//  memObjects: receives the buffers: [0] = copy of a, [1] = copy of b,
//              [2] = result buffer (created without host data)
//  a, b:       host arrays of ARRAY_SIZE floats copied into the buffers
//
//  Returns true when all three buffers were created, false otherwise.
//  On failure any buffers that were created remain in memObjects.
//
bool CreateMemObjects(cl_context context, cl_mem memObjects[3],
	float* a, float* b)
{
	memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(float) * ARRAY_SIZE, a, NULL);
	memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
		sizeof(float) * ARRAY_SIZE, b, NULL);
	memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
		sizeof(float) * ARRAY_SIZE, NULL, NULL);

	if (memObjects[0] == NULL || memObjects[1] == NULL || memObjects[2] == NULL)
	{
		std::cerr << "Error creating memory objects." << std::endl;
		return false;
	}

	return true;
}

///
//  Cleanup any created OpenCL resources
//
//  Each handle is released only when it is non-zero, so this can be
//  called at any stage of setup with the not-yet-created handles left
//  at 0.
//
void Cleanup(cl_context context, cl_command_queue commandQueue,
	cl_program program, cl_kernel kernel, cl_mem memObjects[3])
{
	for (int i = 0; i < 3; i++)
	{
		if (memObjects[i] != 0)
			clReleaseMemObject(memObjects[i]);
	}
	if (commandQueue != 0)
		clReleaseCommandQueue(commandQueue);

	if (kernel != 0)
		clReleaseKernel(kernel);

	if (program != 0)
		clReleaseProgram(program);

	if (context != 0)
		clReleaseContext(context);

}

///
//	main() for HelloWorld example
//
int main(int argc, char** argv)
{
	cl_context context = 0;
	cl_command_queue commandQueue = 0;
	cl_program program = 0;
	cl_device_id device = 0;
	cl_kernel kernel = 0;
	cl_mem memObjects[3] = { 0, 0, 0 };
	cl_int errNum;

	// Create an OpenCL context on first available platform
	context = CreateContext();
	if (context == NULL)
	{
		std::cerr  "Failed to create OpenCL context."  std::endl;
		return 1;
	}

	// Create a command-queue on the first device available
	// on the created context
	commandQueue = CreateCommandQueue(context, &device);
	if (commandQueue == NULL)
	{
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Create OpenCL program from HelloWorld.cl kernel source
	program = CreateProgram(context, device, "HelloWorld.cl");
	if (program == NULL)
	{
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Create OpenCL kernel
	kernel = clCreateKernel(program, "hello_kernel", NULL);
	if (kernel == NULL)
	{
		std::cerr  "Failed to create kernel"  std::endl;
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Create memory objects that will be used as arguments to
	// kernel.  First create host memory arrays that will be
	// used to store the arguments to the kernel
	float result[ARRAY_SIZE];
	float a[ARRAY_SIZE];
	float b[ARRAY_SIZE];
	for (int i = 0; i  ARRAY_SIZE; i++)
	{
		a[i] = (float)i;
		b[i] = (float)(i * 2);
	}

	if (!CreateMemObjects(context, memObjects, a, b))
	{
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Set the kernel arguments (result, a, b)
	errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
	errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
	if (errNum != CL_SUCCESS)
	{
		std::cerr  "Error setting kernel arguments."  std::endl;
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	size_t globalWorkSize[1] = { ARRAY_SIZE };
	size_t localWorkSize[1] = { 1 };

	// Queue the kernel up for execution across the array
	errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
		globalWorkSize, localWorkSize,
		0, NULL, NULL);
	if (errNum != CL_SUCCESS)
	{
		std::cerr  "Error queuing kernel for execution."  std::endl;
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Read the output buffer back to the Host
	errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
		0, ARRAY_SIZE * sizeof(float), result,
		0, NULL, NULL);
	if (errNum != CL_SUCCESS)
	{
		std::cerr  "Error reading result buffer."  std::endl;
		Cleanup(context, commandQueue, program, kernel, memObjects);
		return 1;
	}

	// Output the result buffer
	for (int i = 0; i  ARRAY_SIZE; i++)
	{
		std::cout  result[i]  " ";
	}
	std::cout  std::endl;
	std::cout  "Executed program succesfully."  std::endl;
	Cleanup(context, commandQueue, program, kernel, memObjects);

	return 0;
}

HelloWorld.cl


// Element-wise addition of two float arrays.
//   a, b   - input arrays in global memory
//   result - output array in global memory; result[i] = a[i] + b[i]
// Each work-item handles the single element selected by its global ID in
// dimension 0.  There is no bounds check in the kernel itself.
__kernel void hello_kernel(__global const float *a,
                           __global const float *b,
                           __global float *result)
{
    const int idx = get_global_id(0);
    const float sum = a[idx] + b[idx];

    result[idx] = sum;
}

将两个文件放在同一目录下，打开vs2019进行编译

运行结果如下

选择OpenCL平台并创建一个上下文

上面给出了HelloWorld示例中CreateContext()函数的代码。首先调用clGetPlatformIDs()来获取第一个可用的平台。得到第一个可用平台的cl_platform_id之后，再调用clCreateContextFromType()创建一个上下文。这个clCreateContextFromType()调用会尝试为一个GPU设备创建一个上下文。如果尝试失败，程序会做下一个尝试，这一次将为一个CPU设备创建上下文。

选择设备并创建命令队列

选择平台并创建一个上下文之后，HelloWorld应用的下一步是选择一个设备，并创建一个命令队列。设备是计算机中的底层硬件，如GPU或CPU。要与设备通信，应用程序必须为它创建一个命令队列。将在设备上完成的操作要在命令队列中排队。上述代码包含一个CreateCommandQueue()函数，它会选择设备并为HelloWorld应用创建命令队列。

第一个clGetContextInfo()调用会查询上下文的信息，得到存储上下文中所有可用设备ID所需要的缓冲区大小。这个大小将用来分配一个缓冲区，用于存储设备ID，另一个clGetContextInfo()调用则获取上下文中所有可用的设备。一般情况下，程序会迭代查询这些设备的信息，选择其中最好的一个 (或多个) 设备。在HelloWorld示例中，会选择第一个设备。第3章中，我们会介绍如何查询设备信息，从而能选择对应用最适用的设备。选择了所用的设备之后，应用调用clCreateCommandQueue()在所选择的设备上创建一个命令队列。这个命令队列用于将程序中要执行的内核排队，并读回其结果。

创建和构建程序对象

要执行OpenCL计算内核，需要在内存中分配内核函数的参数，以便在OpenCL设备上访问。HelloWorld示例的内核已经在上述代码中给出。这个例子中的内核是一个简单的函数，它要计算两个数组(a和b)中各个元素值之和，并把结果存储在另一个数组(result)中。代码中为已经编译到程序对象中的"hello_kernel"创建一个内核对象。另外，将分配数组(a、b和result)并填入数据。在宿主机内存中创建这些数组之后，调用CreateMemObjects()，它会把这些数组复制到内存对象，然后传入内核。

代码中给出了CreateMemObjects()函数的代码。这个函数为各个数组分别调用clCreateBuffer()来创建一个内存对象。内存对象分配在设备内存中，可以由内核函数直接访问。对于输入数组(a和b)，缓冲区使用CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR内存类型来创建，这说明这些数组对内核是只读的，并且会从宿主机内存复制到设备内存。数组本身作为参数传递到clCreateBuffer()，这会将数组的内容复制到设备上为内存对象分配的存储空间中。result数组用内存类型CL_MEM_READ_WRITE创建，这说明这个数组对内核是可读、可写的。

执行内核

既然已经创建了内核和内存对象，接下来HelloWorld程序可以将要执行的内核排队。内核函数的所有参数需要使用clSetKernelArg()设置。这个函数的第一个参数是内核对象，第二个参数是参数的索引。hello_kernel()有3个参数(a、b和result)，分别对应索引0、1和2。CreateMemObjects()创建的内存对象将作为参数值传入内核对象。

建立内核参数之后，HelloWorld示例利用命令队列将要在设备上执行的内核排队。这是通过调用clEnqueueNDRangeKernel()完成的。globalWorkSize和localWorkSize确定内核如何在设备上的多个处理单元间分布。HelloWorld示例采用了一种非常简单的方法，让globalWorkSize等于数组大小，localWorkSize等于1。确定如何在一个数据集上高效地分布内核，这是使用OpenCL难度最大的问题之一。

将内核排队并不表示这个内核会立即执行。内核执行会放在命令队列中，以后再由设备消费。换句话说，clEnqueueNDRangeKernel()调用之后，可能并不会立即在设备上执行内核，而是让内核等待之前的事件完成之后再执行。这个内容将在第9章中详细讨论。要从内核读回结果，HelloWorld示例调用clEnqueueReadBuffer()读回result数组(memObjects[2])。

clEnqueueReadBuffer()的第3个参数是一个布尔类型的blocking_read，这个参数确定这个调用是否等待结果准备就绪才返回。在这个例子中，blocking_read设置为CL_TRUE，这说明在读取操作完成之前这个调用不会返回。另外，置于命令队列中的操作保证会按顺序执行（除非命令队列使用CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE类型创建，不过HelloWorld示例中并未使用这个类型创建命令队列）。

综上所述，在内核的执行完成之前，不会有读操作，另外，由设备读回结果之前读取操作也不会返回。因此，一旦程序由clEnqueueReadBuffer()返回，就说明已经由设备为宿主机读回result数组，可以读或写了。最后，将把结果数组中的值输出到标准输出。

检查OpenCL中的错误

在HelloWorld例子中，示例代码展示了如何检查OpenCL函数返回的错误码。现在我们只是对OpenCL报告错误的机制做个简单介绍。在报告错误方面，OpenCL中的函数可以分为两类：一类返回OpenCL对象；另一类不返回。例如，我们看到clCreateContextFromType()返回了一个cl_context对象。然而，函数clSetKernelArg()并不返回新对象。clSetKernelArg()只向调用者返回一个错误码，而clCreateContextFromType()的最后一个参数是一个指针，指向这个函数生成的错误码。
从这两个函数可以了解OpenCL中报告错误的简单规则：
1）返回cl_xxx对象的OpenCL函数最后一个参数是一个指针，指向所返回的错误码。
2）不返回对象的OpenCL函数会返回一个错误码。

OpenCL有大量可能出现的错误。每个API调用可能返回这些错误的一个子集。OpenCL中可能的错误如下

CL_SUCCESS                          命令成功执行，没有出现错误
CL_DEVICE_NOT_FOUND                 未发现与条件匹配的OpenCL设备
CL_DEVICE_NOT_AVAILABLE             OpenCL设备目前不可用
CL_COMPILER_NOT_AVAILABLE           程序由源代码创建，不过没有可用的OpenCL C编译器
CL_MEM_OBJECT_ALLOCATION_FAILURE    无法为内存对象或图像对象分配内存
CL_OUT_OF_RESOURCES                 没有足够的资源执行命令
CL_OUT_OF_HOST_MEMORY               宿主机上没有足够的内存执行命令
CL_PROFILING_INFO_NOT_AVAILABLE     无法得到事件的性能评测信息或者命令队列不支持性能评测
CL_MEM_COPY_OVERLAP                 两个缓冲区在同一个内存区域重叠
CL_IMAGE_FORMAT_MISMATCH            图像未采用相同的图像格式
CL_IMAGE_FORMAT_NOT_SUPPORTED       不支持指定的图像格式
CL_BUILD_PROGRAM_FAILURE            无法为程序构建可执行代码
CL_MAP_FAILURE                      内存区域无法映射到宿主机内存
CL_INVALID_VALUE                    命令的一个或多个参数指定了非法值
CL_INVALID_DEVICE_TYPE              传入的设备类型不是合法值
CL_INVALID_PLATFORM                 传入的平台不是合法值
CL_INVALID_DEVICE                   传入的设备不是合法值
CL_INVALID_CONTEXT                  传入的上下文不是合法值
CL_INVALID_QUEUE_PROPERTIES         设备不支持命令队列属性
CL_INVALID_COMMAND_QUEUE            传入的命令队列不是合法值
CL_INVALID_HOST_PTR                 宿主机指针不合法
CL_INVALID_MEM_OBJECT               传入的内存对象不是合法值
CL_INVALID_IMAGE_FORMAT_DESCRIPTOR  传入的图像格式描述符不是合法值
CL_INVALID_IMAGE_SIZE               设备不支持这个图像大小
CL_INVALID_SAMPLER                  传入的采样工具不是合法值
CL_INVALID_BINARY                   传入了非法的二进制程序
CL_INVALID_BUILD_OPTIONS            一个或多个构建选项不合法
CL_INVALID_PROGRAM                  传入的程序不是合法值
CL_INVALID_PROGRAM_EXECUTABLE       程序未能成功地构建命令队列关联设备上的一个可执行程序
CL_INVALID_KERNEL_NAME              程序中不存在指定的内核
CL_INVALID_KERNEL_DEFINITION        程序源代码中定义的内核不合法
CL_INVALID_KERNEL                   传入的内核不是合法值
CL_INVALID_ARG_INDEX                参数索引指示的参数对于内核不合法
CL_INVALID_ARG_VALUE                对于一个非局部参数，内核参数值为NULL；或者对于一个局部参数，内核参数值为非NULL
CL_INVALID_ARG_SIZE                 参数大小与内核参数不一致
CL_INVALID_KERNEL_ARGS              一个或多个内核参数未赋值
CL_INVALID_WORK_DIMENSION           工作维度值不是介于1~3的一个值
CL_INVALID_WORK_GROUP_SIZE          局部或全局工作组大小不合法
CL_INVALID_WORK_ITEM_SIZE           一个或多个工作项大小超出了设备支持的最大大小
CL_INVALID_GLOBAL_OFFSET            全局偏移量超出了所支持的界限
CL_INVALID_EVENT_WAIT_LIST          提供的等待列表大小不合法或者其中包含非事件
CL_INVALID_EVENT                    传入的事件不是一个合法值
CL_INVALID_OPERATION                执行命令导致出现一个不合法的操作
CL_INVALID_GL_OBJECT                OpenGL引用的对象存在问题
CL_INVALID_BUFFER_SIZE              指定的缓冲区大小越界
CL_INVALID_MIP_LEVEL                为OpenGL纹理指定的mipmap级别对于OpenGL对象不合法
CL_INVALID_GLOBAL_WORK_SIZE         传入的全局工作大小不合法，可能为0或者超出了设备支持的大小

服务器托管，北京服务器托管，服务器租用 http://www.fwqtg.net

相关推荐: OData WebAPI实践-OData与EDM

本文属于 OData 系列引言在 OData 中，EDM（Entity Data Model）代表“实体数据模型”，它是一种用于表示 Web API 中的结构化数据的格式。EDM 定义了可以由 OData 服务公开的数据类型、实体和关系。 EDM 也提供…