blob: fba6ab5d8186948cccfb0366dcef8729637ce2fd [file] [log] [blame]
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_OPENCL_H
#define VP8_OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#include "../../../vpx_config.h"
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#if HAVE_DLOPEN
#include "dynamic_cl.h"
#endif
/*
 * Compile-time feature switches (each is 0 or 1).
 * No code visible in this header reads them; the per-switch notes are
 * inferred from the names only -- confirm against the sources that test them.
 */
#define ENABLE_CL_IDCT_DEQUANT 0 /* presumably gates the OpenCL IDCT/dequant path */
#define ENABLE_CL_SUBPIXEL 1 /* presumably gates the OpenCL sub-pixel prediction path */
#define TWO_PASS_SIXTAP 0 /* presumably selects a two-pass six-tap filter variant */
#define MEM_COPY_KERNEL 1 /* presumably selects a kernel-based memory copy */
#define ONE_CQ_PER_MB 1 //Value of 0 is racy... still experimental.
#define ENABLE_CL_LOOPFILTER 0 /* presumably gates the OpenCL loop-filter path */
/*
 * Entry points of the common OpenCL support code. They are defined outside
 * this header, so only what the declarations themselves show is stated here.
 */

/* Takes a file name and returns a char pointer.
 * NOTE(review): presumably the file's contents in a heap buffer -- confirm
 * who owns and frees the result. */
extern char *cl_read_file(const char *file_name);

/* Takes no arguments. Declared with (void) so that the compiler rejects
 * calls that pass arguments; an empty "()" is not a prototype before C23. */
extern int cl_common_init(void);

/* Called by VP8_CL_CHECK_SUCCESS with VP8_CL_TRIED_BUT_FAILED as new_status. */
extern void cl_destroy(cl_command_queue cq, int new_status);

/* Takes a program handle to fill in, a file name and an option string.
 * NOTE(review): meaning of the int result is not visible here -- confirm. */
extern int cl_load_program(cl_program *prog_ref, const char *file_name, const char *opts);
/* Upper bounds; their users are outside this header.
 * NOTE(review): presumably array sizes for platform/device enumeration. */
#define MAX_NUM_PLATFORMS 4
#define MAX_NUM_DEVICES 10

/*
 * Status values. VP8_CL_TRIED_BUT_FAILED is what VP8_CL_CHECK_SUCCESS hands
 * to cl_destroy() as new_status. The negative constant is parenthesized so it
 * stays a single operand wherever the macro is pasted into an expression.
 */
#define VP8_CL_TRIED_BUT_FAILED 1
#define VP8_CL_NOT_INITIALIZED (-1)

/* Compared against CL_SUCCESS by VP8_CL_FINISH and VP8_CL_BARRIER. */
extern int cl_initialized;

/* Defined elsewhere. NOTE(review): presumably the directory searched for the
 * OpenCL kernel sources -- confirm. */
extern const char *vpx_codec_lib_dir(void);
/*
 * VP8_CL_FINISH(cq)
 *
 * Calls clFinish(cq) when cl_initialized == CL_SUCCESS; otherwise does
 * nothing. The result of clFinish() is discarded.
 *
 * The expansion is one brace-enclosed statement so that the internal `if`
 * cannot capture an `else` that follows the invocation.
 */
#define VP8_CL_FINISH(cq) \
    { \
        if (cl_initialized == CL_SUCCESS) { \
            /* Wait for kernels to finish. */ \
            clFinish(cq); \
        } \
    }
/*
 * VP8_CL_BARRIER(cq)
 *
 * Calls clEnqueueBarrier(cq) when cl_initialized == CL_SUCCESS; otherwise
 * does nothing. The result of clEnqueueBarrier() is discarded.
 *
 * The expansion is one brace-enclosed statement so that the internal `if`
 * cannot capture an `else` that follows the invocation.
 */
#define VP8_CL_BARRIER(cq) \
    { \
        if (cl_initialized == CL_SUCCESS) { \
            /* Insert a barrier into the command queue. */ \
            clEnqueueBarrier(cq); \
        } \
    }
/*
 * VP8_CL_CHECK_SUCCESS(cq, cond, msg, alt, retCode)
 *
 * Failure guard. When `cond` is true it
 *   1. writes `msg` to stderr,
 *   2. calls cl_destroy(cq, VP8_CL_TRIED_BUT_FAILED),
 *   3. executes the statement `alt` (the argument may be left empty),
 *   4. returns `retCode` from the function in which the macro is expanded.
 * When `cond` is false nothing happens.
 *
 * `msg` is printed through "%s" rather than being used as the format string,
 * so a '%' inside the message (for example in a stringified kernel name) is
 * printed literally instead of being parsed as a conversion.
 *
 * The expansion is one brace-enclosed statement so that the internal `if`
 * cannot capture an `else` that follows the invocation.
 *
 * NOTE(review): the expansion uses fprintf/stderr; this header does not
 * visibly include <stdio.h>, so the including file has to provide it unless
 * one of the headers included here already does.
 */
#define VP8_CL_CHECK_SUCCESS(cq,cond,msg,alt,retCode) \
    { \
        if (cond) { \
            fprintf(stderr, "%s", msg); \
            cl_destroy(cq, VP8_CL_TRIED_BUT_FAILED); \
            alt; \
            return retCode; \
        } \
    }
/*
 * VP8_CL_CALC_LOCAL_SIZE(kernel, kernel_size)
 *
 * Queries CL_KERNEL_WORK_GROUP_SIZE for cl_data.<kernel> on
 * cl_data.device_id and stores the result in cl_data.<kernel_size>.
 * Both arguments are member names of cl_data, not expressions.
 *
 * Requires an assignable `err` in the expanding scope. On failure
 * VP8_CL_CHECK_SUCCESS prints the message, calls cl_destroy(NULL, ...) and
 * returns VP8_CL_TRIED_BUT_FAILED from the enclosing function.
 *
 * The expansion is one brace-enclosed statement, so an unbraced `if` in
 * front of the invocation guards the query and the check together. The last
 * line deliberately carries no line-continuation backslash: one there would
 * splice whatever line follows into this macro's replacement list.
 */
#define VP8_CL_CALC_LOCAL_SIZE(kernel, kernel_size) \
    { \
        err = clGetKernelWorkGroupInfo(cl_data.kernel, \
                                       cl_data.device_id, \
                                       CL_KERNEL_WORK_GROUP_SIZE, \
                                       sizeof(size_t), \
                                       &cl_data.kernel_size, \
                                       NULL); \
        VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS, \
            "Error: Failed to calculate local size of kernel!\n", \
            , \
            VP8_CL_TRIED_BUT_FAILED \
        ); \
    }
/*
 * VP8_CL_CREATE_KERNEL(data, program, name, str_name)
 *
 * Assigns data.<name> = clCreateKernel(data.<program>, str_name, &err).
 * `program` and `name` are member names of `data`; `str_name` is the kernel
 * name passed to OpenCL.
 *
 * Requires an assignable `err` in the expanding scope. If err != CL_SUCCESS
 * or the resulting handle is null, VP8_CL_CHECK_SUCCESS prints the message,
 * calls cl_destroy(NULL, ...) and returns VP8_CL_TRIED_BUT_FAILED from the
 * enclosing function. `str_name` is stringified into the message, so when it
 * is a string literal its quotes appear in the output.
 *
 * The expansion is one brace-enclosed statement, so an unbraced `if` in
 * front of the invocation guards the creation and the check together.
 */
#define VP8_CL_CREATE_KERNEL(data,program,name,str_name) \
    { \
        data.name = clCreateKernel(data.program, str_name, &err); \
        VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS || !data.name, \
            "Error: Failed to create compute kernel "#str_name"!\n", \
            , \
            VP8_CL_TRIED_BUT_FAILED \
        ); \
    }
/*
 * VP8_CL_READ_BUF(cq, bufRef, bufSize, dstPtr)
 *
 * Enqueues a read of bufSize bytes from offset 0 of bufRef into dstPtr on
 * queue cq. The blocking flag passed to clEnqueueReadBuffer is CL_FALSE and
 * no event is requested.
 * NOTE(review): with a non-blocking read the caller presumably has to
 * synchronise (e.g. VP8_CL_FINISH) before using dstPtr -- confirm.
 *
 * Requires an assignable `err` in the expanding scope. On failure
 * VP8_CL_CHECK_SUCCESS prints the message, calls cl_destroy(cq, ...) and
 * returns `err` from the enclosing function.
 *
 * The expansion is one brace-enclosed statement, so an unbraced `if` in
 * front of the invocation guards the read and the check together. The last
 * line deliberately carries no line-continuation backslash: one there would
 * splice whatever line follows into this macro's replacement list.
 */
#define VP8_CL_READ_BUF(cq, bufRef, bufSize, dstPtr) \
    { \
        err = clEnqueueReadBuffer(cq, bufRef, CL_FALSE, 0, bufSize, dstPtr, 0, NULL, NULL); \
        VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, \
            "Error: Failed to read from GPU!\n", , err \
        ); \
    }
/*
 * VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode)
 *
 * Enqueues a write of bufSize bytes from dataPtr to offset 0 of bufRef on
 * queue cq. The blocking flag passed to clEnqueueWriteBuffer is CL_FALSE and
 * no event is requested.
 * NOTE(review): with a non-blocking write dataPtr presumably has to stay
 * valid until the command has completed -- confirm at the call sites.
 *
 * Requires an assignable `err` in the expanding scope. On failure
 * VP8_CL_CHECK_SUCCESS prints the message, calls cl_destroy(cq, ...), runs
 * the statement `altPath` and returns `retCode` from the enclosing function.
 *
 * The closing brace deliberately carries no line-continuation backslash: one
 * there would splice whatever line follows into this macro's replacement
 * list.
 */
#define VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode) \
    { \
        err = clEnqueueWriteBuffer(cq, bufRef, CL_FALSE, 0, \
                                   bufSize, dataPtr, 0, NULL, NULL); \
        \
        VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, \
            "Error: Failed to write to buffer!\n", \
            altPath, retCode \
        ); \
    }
/*
 * VP8_CL_CREATE_BUF(cq, bufRef, bufType, bufSize, dataPtr, altPath, retCode)
 *
 * Assigns bufRef a new buffer of bufSize bytes created in cl_data.context.
 * When both dataPtr and the new buffer are non-NULL, the contents of dataPtr
 * are uploaded with VP8_CL_SET_BUF. If no buffer was obtained,
 * VP8_CL_CHECK_SUCCESS prints the message, calls cl_destroy(cq, ...), runs
 * the statement `altPath` and returns `retCode` from the enclosing function.
 *
 * `bufType` is accepted but not used: the buffer is always created with
 * CL_MEM_READ_WRITE. The error code of clCreateBuffer is not requested;
 * only the returned handle is tested.
 *
 * VP8_CL_SET_BUF needs an assignable `err` in the expanding scope.
 *
 * dataPtr and bufRef are parenthesized in the tests so that an argument
 * containing a low-precedence operator (e.g. ?:) is compared as a whole.
 * The expansion is one brace-enclosed statement, so an unbraced `if` in
 * front of the invocation guards every step. The last line deliberately
 * carries no line-continuation backslash: one there would splice whatever
 * line follows into this macro's replacement list.
 */
#define VP8_CL_CREATE_BUF(cq, bufRef, bufType, bufSize, dataPtr, altPath, retCode) \
    { \
        bufRef = clCreateBuffer(cl_data.context, CL_MEM_READ_WRITE, bufSize, NULL, NULL); \
        if ((dataPtr) != NULL && (bufRef) != NULL) { \
            VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode) \
        } \
        VP8_CL_CHECK_SUCCESS(cq, !(bufRef), \
            "Error: Failed to allocate buffer. Using CPU path!\n", \
            altPath, retCode \
        ); \
    }
/*
 * VP8_CL_RELEASE_KERNEL(kernel)
 *
 * Calls clReleaseKernel(kernel) when `kernel` is non-null, then sets
 * `kernel` to NULL in either case. `kernel` must be an assignable lvalue and
 * is evaluated more than once, so it must not have side effects.
 *
 * The expansion is one brace-enclosed statement: without the braces an
 * unbraced `if` in front of the invocation would guard only the release
 * while the NULL assignment ran unconditionally.
 */
#define VP8_CL_RELEASE_KERNEL(kernel) \
    { \
        if (kernel) \
            clReleaseKernel(kernel); \
        kernel = NULL; \
    }
/*
 * VP8_COMMON_CL: the OpenCL device, context, program and kernel handles used
 * by the VP8 OpenCL code, plus two initialization status fields.
 * One instance, cl_data, is declared after the struct and defined elsewhere;
 * the VP8_CL_CALC_LOCAL_SIZE and VP8_CL_CREATE_BUF macros in this header
 * refer to it by that name.
 *
 * Which program each kernel is built from is not visible in this header; the
 * grouping notes below follow declaration order and naming only and should
 * be confirmed against the initialization code.
 */
typedef struct VP8_COMMON_CL {
cl_device_id device_id; // compute device id
cl_context context; // compute context
//cl_command_queue commands; // compute command queue
cl_program filter_program; // compute program for subpixel/bilinear filters
/* Six-tap prediction kernels. Each *_kernel_size field has the type that
 * VP8_CL_CALC_LOCAL_SIZE writes (a CL_KERNEL_WORK_GROUP_SIZE query result);
 * presumably filled in by that macro -- confirm at the call sites. */
cl_kernel vp8_sixtap_predict_kernel;
size_t vp8_sixtap_predict_kernel_size;
cl_kernel vp8_sixtap_predict8x4_kernel;
size_t vp8_sixtap_predict8x4_kernel_size;
cl_kernel vp8_sixtap_predict8x8_kernel;
size_t vp8_sixtap_predict8x8_kernel_size;
cl_kernel vp8_sixtap_predict16x16_kernel;
size_t vp8_sixtap_predict16x16_kernel_size;
/* Bilinear prediction kernels (no size fields are kept for these). */
cl_kernel vp8_bilinear_predict4x4_kernel;
cl_kernel vp8_bilinear_predict8x4_kernel;
cl_kernel vp8_bilinear_predict8x8_kernel;
cl_kernel vp8_bilinear_predict16x16_kernel;
/* First/second pass filter kernels, each with a size field. */
cl_kernel vp8_filter_block2d_first_pass_kernel;
size_t vp8_filter_block2d_first_pass_kernel_size;
cl_kernel vp8_filter_block2d_second_pass_kernel;
size_t vp8_filter_block2d_second_pass_kernel_size;
cl_kernel vp8_filter_block2d_bil_first_pass_kernel;
size_t vp8_filter_block2d_bil_first_pass_kernel_size;
cl_kernel vp8_filter_block2d_bil_second_pass_kernel;
size_t vp8_filter_block2d_bil_second_pass_kernel_size;
/* Memory helper kernels. */
cl_kernel vp8_memcpy_kernel;
size_t vp8_memcpy_kernel_size;
cl_kernel vp8_memset_short_kernel;
/* Program handle followed by inverse-Walsh / IDCT kernels. */
cl_program idct_program;
cl_kernel vp8_short_inv_walsh4x4_1_kernel;
cl_kernel vp8_short_inv_walsh4x4_1st_pass_kernel;
cl_kernel vp8_short_inv_walsh4x4_2nd_pass_kernel;
cl_kernel vp8_dc_only_idct_add_kernel;
//Note that the following 2 kernels are encoder-only. Not used in decoder.
cl_kernel vp8_short_idct4x4llm_1_kernel;
cl_kernel vp8_short_idct4x4llm_kernel;
/* Program handle followed by loop-filter kernels. */
cl_program loop_filter_program;
cl_kernel vp8_loop_filter_horizontal_edge_kernel;
cl_kernel vp8_loop_filter_vertical_edge_kernel;
cl_kernel vp8_mbloop_filter_horizontal_edge_kernel;
cl_kernel vp8_mbloop_filter_vertical_edge_kernel;
cl_kernel vp8_loop_filter_simple_horizontal_edge_kernel;
cl_kernel vp8_loop_filter_simple_vertical_edge_kernel;
/* Program handle followed by dequantization kernels. */
cl_program dequant_program;
cl_kernel vp8_dequant_dc_idct_add_kernel;
cl_kernel vp8_dequant_idct_add_kernel;
cl_kernel vp8_dequantize_b_kernel;
/* Status fields; their values and meaning are set outside this header.
 * NOTE(review): presumably hold CL_SUCCESS or one of the VP8_CL_* status
 * constants for the decoder and encoder paths -- confirm. */
cl_int cl_decode_initialized;
cl_int cl_encode_initialized;
} VP8_COMMON_CL;
/* The single shared instance; defined in a source file, not here. */
extern VP8_COMMON_CL cl_data;
#ifdef __cplusplus
}
#endif
#endif /* VP8_OPENCL_H */