1.加速方法
(1)将少量常量数据,如模式串长度、文本长度等,保存在线程的private memory中。
(2)将模式串保存在GPU的local memory中,加速线程对模式串的访问。
(3)将待查找的文本保存在global memory中,使用尽可能多的线程并行访问global memory,减小线程平均访存时间。
(4)每个work-group中的线程处理文本中的一段,多个work-group并行处理大文本。
2.同步
(1)work-group内,使用barrier()配合CLK_LOCAL_MEM_FENCE、CLK_GLOBAL_MEM_FENCE标志进行同步。
(2)全局范围内,使用对__global int的原子操作(如atomic_inc),来保证每个线程将结果写到全局内存的正确位置。设备支持的操作可以通过查询设备的扩展(clGetDeviceInfo的CL_DEVICE_EXTENSIONS)获得;由查询结果(原文此处为截图)可知,核函数支持原子操作和printf操作。

3.代码实例,大文本精确模式串搜索
3.1 核函数(string_search_kernel.cl):
/*
 * Compare the first `length` bytes of `text` with `pattern`.
 *
 * text    - candidate position in the text (global memory)
 * pattern - search pattern (local memory)
 * length  - number of bytes to compare
 *
 * Returns 1 when all `length` bytes are equal (also when length is 0),
 * and 0 as soon as the first differing byte is found.
 */
int compare(__global const uchar* text, __local const uchar* pattern, uint length){
    for(uint l=0; l<length; ++l){
        if (text[l] != pattern[l])
            return 0;
    }
    return 1;
}
/*
 * StringSearch - exact pattern search over a text buffer.
 *
 * Each work-group is responsible for the start positions
 *   [groupIdx * maxSearchLength, groupIdx * maxSearchLength + maxSearchLength)
 * clipped to the last position at which the whole pattern still fits in
 * the text. Inside a group the work-items first copy the pattern into
 * local memory, then walk the group's range with a stride of the local
 * size, testing one start position per iteration.
 *
 * Every match is stored at resultBuffer[atomic_inc(resultCount)], so the
 * order of the entries in resultBuffer depends on execution order.
 *
 * NOTE(review): resultCount is presumably zeroed by the host before launch,
 * and resultBuffer must be large enough for every match - confirm against
 * the host code; the kernel cannot check the buffer capacity.
 */
__kernel void
StringSearch (
    __global uchar* text,              // Input text
    const uint textLength,             // Length of the text
    __global const uchar* pattern,     // Pattern string
    const uint patternLength,          // Pattern length
    const uint maxSearchLength,        // Maximum search positions for each work-group
    __global int* resultCount,         // Result count (global, updated atomically)
    __global int* resultBuffer,        // Match positions
    __local uchar* localPattern)       // Local buffer for the search pattern
{
    const uint localIdx  = (uint)get_local_id(0);
    const uint localSize = (uint)get_local_size(0);
    const uint groupIdx  = (uint)get_group_id(0);

    // A pattern longer than the text cannot match. Without this guard the
    // unsigned subtraction below would wrap around and the search would read
    // past the end of the text. The condition depends only on kernel
    // arguments, so all work-items leave together (before any barrier).
    if(patternLength > textLength)
        return;

    // One past the last valid start position.
    const uint lastSearchIdx = textLength - patternLength + 1;

    // Range of start positions owned by this work-group.
    const uint beginSearchIdx = groupIdx * maxSearchLength;
    uint endSearchIdx = beginSearchIdx + maxSearchLength;

    // Uniform per work-group: the whole group exits together.
    if(beginSearchIdx > lastSearchIdx)
        return;
    if(endSearchIdx > lastSearchIdx)
        endSearchIdx = lastSearchIdx;

    // Cooperative copy of the pattern into local memory.
    for(uint idx = localIdx; idx < patternLength; idx += localSize)
        localPattern[idx] = pattern[idx];

    // Make the complete local pattern visible to every work-item of the group.
    barrier(CLK_LOCAL_MEM_FENCE);

    // No barrier inside this loop: work-items may run a different number of
    // iterations, and a barrier must be reached by all work-items of the
    // group. Local memory is only read from here on, so none is needed.
    for(uint stringPos = beginSearchIdx + localIdx; stringPos < endSearchIdx; stringPos += localSize){
        if (compare(text + stringPos, localPattern, patternLength) == 1){
            // atomic_inc returns the previous counter value, which gives
            // each match its own slot in resultBuffer.
            int count = atomic_inc(resultCount);
            resultBuffer[count] = stringPos;
            //printf("%d ",stringPos);
        }
    }
}