surf.ocl.cpp

rainhh sun, 2014-04-08 04:32 am

Download (26.8 kB)

 
/*M/////////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Peng Xiao, [email protected]
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
45
#include "precomp.hpp"
46
47
#ifdef HAVE_OPENCV_OCL
48
#include <cstdio>
49
#include <sstream>
50
#include "opencl_kernels.hpp"
51
52
using namespace cv;
53
using namespace cv::ocl;
54
55
namespace cv
56
{
57
    namespace ocl
58
    {
59
        // The number of degrees between orientation samples in calcOrientation
60
        const static int ORI_SEARCH_INC = 5;
61
        // The local size of the calcOrientation kernel
62
        const static int ORI_LOCAL_SIZE = (360 / ORI_SEARCH_INC);
63
64
        static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
65
            size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
66
        {
67
            std::stringstream optsStr;
68
            optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " ";
69
            optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " ";
70
            cl_kernel kernel;
71
            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str());
72
            size_t wave_size = queryWaveFrontSize(kernel);
73
            CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
74
            optsStr << "-D WAVE_SIZE=" << wave_size;
75
            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str());
76
        }
77
78
    }
79
}
80
/* Returns the Haar wavelet (box filter) size, in pixels, used for the given
   layer of the given octave: (9 + 6 * layer) scaled by 2^octave. */
static inline int calcSize(int octave, int layer)
{
    /* Wavelet size at first layer of first octave. */
    const int HAAR_SIZE0 = 9;

    /* Wavelet size increment between layers. This should be an even number,
    such that the wavelet sizes in an octave are either all even or all odd.
    This ensures that when looking for the neighbors of a sample, the layers
    above and below are aligned correctly. */
    const int HAAR_SIZE_INC = 6;

    return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
95
96
97
class SURF_OCL_Invoker
98
{
99
public:
100
    // facilities
101
    void bindImgTex(const oclMat &img, cl_mem &texture);
102
103
    //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
104
    //void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
105
106
    // kernel callers declarations
107
    void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);
108
109
    void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
110
                                  int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
111
112
    void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
113
                                    oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures);
114
115
    void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
116
117
    void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures);
118
119
    void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
120
    // end of kernel callers declarations
121
122
    SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
123
        surf_(surf),
124
        img_cols(img.cols), img_rows(img.rows),
125
        use_mask(!mask.empty()), counters(oclMat()),
126
        imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
127
    {
128
        CV_Assert(!img.empty() && img.type() == CV_8UC1);
129
        CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
130
        CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
131
132
        const int min_size = calcSize(surf_.nOctaves - 1, 0);
133
        CV_Assert(img_rows - min_size >= 0);
134
        CV_Assert(img_cols - min_size >= 0);
135
136
        const int layer_rows = img_rows >> (surf_.nOctaves - 1);
137
        const int layer_cols = img_cols >> (surf_.nOctaves - 1);
138
        const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
139
        CV_Assert(layer_rows - 2 * min_margin > 0);
140
        CV_Assert(layer_cols - 2 * min_margin > 0);
141
142
        maxFeatures   = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
143
        maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
144
145
        CV_Assert(maxFeatures > 0);
146
147
        counters.create(1, surf_.nOctaves + 1, CV_32SC1);
148
        counters.setTo(Scalar::all(0));
149
150
        integral(img, surf_.sum);
151
152
        bindImgTex(img, imgTex);
153
        bindImgTex(surf_.sum, sumTex);
154
        finish();
155
156
        maskSumTex = 0;
157
158
        if (use_mask)
159
        {
160
            CV_Error(CV_StsBadFunc, "Masked SURF detector is not implemented yet");
161
            //!FIXME
162
            // temp fix for missing min overload
163
            //oclMat temp(mask.size(), mask.type());
164
            //temp.setTo(Scalar::all(1.0));
165
            ////cv::ocl::min(mask, temp, surf_.mask1);           ///////// disable this
166
            //integral(surf_.mask1, surf_.maskSum);
167
            //bindImgTex(surf_.maskSum, maskSumTex);
168
        }
169
    }
170
171
    void detectKeypoints(oclMat &keypoints)
172
    {
173
        // create image pyramid buffers
174
        // different layers have same sized buffers, but they are sampled from Gaussian kernel.
175
        ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
176
        ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
177
178
        ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
179
        ensureSizeIsEnough(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
180
        keypoints.setTo(Scalar::all(0));
181
182
        for (int octave = 0; octave < surf_.nOctaves; ++octave)
183
        {
184
            const int layer_rows = img_rows >> octave;
185
            const int layer_cols = img_cols >> octave;
186
187
            //loadOctaveConstants(octave, layer_rows, layer_cols);
188
189
            icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows);
190
191
            icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
192
                                     octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);
193
194
            int maxCounter = ((Mat)counters).at<int>(1 + octave);
195
            maxCounter = std::min(maxCounter, static_cast<int>(maxCandidates));
196
197
            if (maxCounter > 0)
198
            {
199
                icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter,
200
                                           keypoints, counters, octave, layer_rows, maxFeatures);
201
            }
202
        }
203
        int featureCounter = Mat(counters).at<int>(0);
204
        featureCounter = std::min(featureCounter, static_cast<int>(maxFeatures));
205
206
        keypoints.cols = featureCounter;
207
208
        if (surf_.upright)
209
        {
210
            //keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
211
            setUpright(keypoints);
212
        }
213
        else
214
        {
215
            findOrientation(keypoints);
216
        }
217
    }
218
219
    void setUpright(oclMat &keypoints)
220
    {
221
        const int nFeatures = keypoints.cols;
222
        if(nFeatures > 0)
223
        {
224
            icvSetUpright_gpu(keypoints, keypoints.cols);
225
        }
226
    }
227
228
    void findOrientation(oclMat &keypoints)
229
    {
230
        const int nFeatures = keypoints.cols;
231
        if (nFeatures > 0)
232
        {
233
            icvCalcOrientation_gpu(keypoints, nFeatures);
234
        }
235
    }
236
237
    void computeDescriptors(const oclMat &keypoints, oclMat &descriptors, int descriptorSize)
238
    {
239
        const int nFeatures = keypoints.cols;
240
        if (nFeatures > 0)
241
        {
242
            ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
243
            compute_descriptors_gpu(descriptors, keypoints, nFeatures);
244
        }
245
    }
246
247
    ~SURF_OCL_Invoker()
248
    {
249
        if(imgTex)
250
            openCLFree(imgTex);
251
        if(sumTex)
252
            openCLFree(sumTex);
253
        if(maskSumTex)
254
            openCLFree(maskSumTex);
255
    }
256
257
private:
258
    SURF_OCL &surf_;
259
260
    int img_cols, img_rows;
261
262
    bool use_mask;
263
264
    int maxCandidates;
265
    int maxFeatures;
266
267
    oclMat counters;
268
269
    // texture buffers
270
    cl_mem imgTex;
271
    cl_mem sumTex;
272
    cl_mem maskSumTex;
273
274
    const oclMat _img; // make a copy for non-image2d_t supported platform
275
276
    SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
277
    {
278
        (*this) = right;
279
        return *this;
280
    } // remove warning C4512
281
};
282
283
cv::ocl::SURF_OCL::SURF_OCL()
284
{
285
    hessianThreshold = 100.0f;
286
    extended = true;
287
    nOctaves = 4;
288
    nOctaveLayers = 2;
289
    keypointsRatio = 0.01f;
290
    upright = false;
291
}
292
293
cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
294
{
295
    hessianThreshold = saturate_cast<float>(_threshold);
296
    extended = _extended;
297
    nOctaves = _nOctaves;
298
    nOctaveLayers = _nOctaveLayers;
299
    keypointsRatio = _keypointsRatio;
300
    upright = _upright;
301
}
302
303
int cv::ocl::SURF_OCL::descriptorSize() const
304
{
305
    return extended ? 128 : 64;
306
}
307
308
void cv::ocl::SURF_OCL::uploadKeypoints(const vector<KeyPoint> &keypoints, oclMat &keypointsGPU)
309
{
310
    if (keypoints.empty())
311
        keypointsGPU.release();
312
    else
313
    {
314
        Mat keypointsCPU(SURF_OCL::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1);
315
316
        float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
317
        float *kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
318
        int *kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
319
        int *kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
320
        float *kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
321
        float *kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
322
        float *kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
323
324
        for (size_t i = 0, size = keypoints.size(); i < size; ++i)
325
        {
326
            const KeyPoint &kp = keypoints[i];
327
            kp_x[i] = kp.pt.x;
328
            kp_y[i] = kp.pt.y;
329
            kp_octave[i] = kp.octave;
330
            kp_size[i] = kp.size;
331
            kp_dir[i] = kp.angle;
332
            kp_hessian[i] = kp.response;
333
            kp_laplacian[i] = 1;
334
        }
335
336
        keypointsGPU.upload(keypointsCPU);
337
    }
338
}
339
340
void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, vector<KeyPoint> &keypoints)
341
{
342
    const int nFeatures = keypointsGPU.cols;
343
344
    if (nFeatures == 0)
345
        keypoints.clear();
346
    else
347
    {
348
        CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
349
350
        Mat keypointsCPU(keypointsGPU);
351
352
        keypoints.resize(nFeatures);
353
354
        float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
355
        float *kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
356
        int *kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
357
        int *kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
358
        float *kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
359
        float *kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
360
        float *kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
361
362
        for (int i = 0; i < nFeatures; ++i)
363
        {
364
            KeyPoint &kp = keypoints[i];
365
            kp.pt.x = kp_x[i];
366
            kp.pt.y = kp_y[i];
367
            kp.class_id = kp_laplacian[i];
368
            kp.octave = kp_octave[i];
369
            kp.size = kp_size[i];
370
            kp.angle = kp_dir[i];
371
            kp.response = kp_hessian[i];
372
        }
373
    }
374
}
375
376
void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat &descriptorsGPU, vector<float> &descriptors)
377
{
378
    if (descriptorsGPU.empty())
379
        descriptors.clear();
380
    else
381
    {
382
        CV_Assert(descriptorsGPU.type() == CV_32F);
383
384
        descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
385
        Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
386
        descriptorsGPU.download(descriptorsCPU);
387
    }
388
}
389
390
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints)
391
{
392
    if (!img.empty())
393
    {
394
        SURF_OCL_Invoker surf(*this, img, mask);
395
396
        surf.detectKeypoints(keypoints);
397
    }
398
}
399
400
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
401
                                   bool useProvidedKeypoints)
402
{
403
    if (!img.empty())
404
    {
405
        SURF_OCL_Invoker surf(*this, img, mask);
406
407
        if (!useProvidedKeypoints)
408
            surf.detectKeypoints(keypoints);
409
        else if (!upright)
410
        {
411
            surf.findOrientation(keypoints);
412
        }
413
414
        surf.computeDescriptors(keypoints, descriptors, descriptorSize());
415
    }
416
}
417
418
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, vector<KeyPoint> &keypoints)
419
{
420
    oclMat keypointsGPU;
421
422
    (*this)(img, mask, keypointsGPU);
423
424
    downloadKeypoints(keypointsGPU, keypoints);
425
}
426
427
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, vector<KeyPoint> &keypoints,
428
                                   oclMat &descriptors, bool useProvidedKeypoints)
429
{
430
    oclMat keypointsGPU;
431
432
    if (useProvidedKeypoints)
433
        uploadKeypoints(keypoints, keypointsGPU);
434
435
    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
436
437
    downloadKeypoints(keypointsGPU, keypoints);
438
}
439
440
void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, vector<KeyPoint> &keypoints,
441
                                   vector<float> &descriptors, bool useProvidedKeypoints)
442
{
443
    oclMat descriptorsGPU;
444
445
    (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
446
447
    downloadDescriptors(descriptorsGPU, descriptors);
448
}
449
450
void cv::ocl::SURF_OCL::releaseMemory()
451
{
452
    sum.release();
453
    mask1.release();
454
    maskSum.release();
455
    intBuffer.release();
456
    det.release();
457
    trace.release();
458
    maxPosBuffer.release();
459
}
460
461
462
// bind source buffer to image oject.
463
void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
464
{
465
    if(texture)
466
    {
467
        openCLFree(texture);
468
    }
469
    texture = bindTexture(img);
470
}
471
472
////////////////////////////
473
// kernel caller definitions
474
void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int c_layer_rows)
475
{
476
    const int min_size = calcSize(octave, 0);
477
    const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
478
    const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
479
480
    Context *clCxt = det.clCxt;
481
    string kernelName = "icvCalcLayerDetAndTrace";
482
    std::vector< std::pair<size_t, const void *> > args;
483
484
    if(sumTex)
485
    {
486
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
487
    }
488
    else
489
    {
490
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
491
    }
492
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
493
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
494
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
495
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
496
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
497
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
498
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
499
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
500
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&c_layer_rows));
501
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
502
503
    size_t localThreads[3]  = {16, 16, 1};
504
    size_t globalThreads[3] =
505
    {
506
        divUp(max_samples_j, localThreads[0]) * localThreads[0],
507
        divUp(max_samples_i, localThreads[1]) * localThreads[1] *(nOctaveLayers + 2),
508
        1
509
    };
510
    openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
511
}
512
513
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
514
        int octave, bool useMask, int nLayers, int layer_rows, int layer_cols)
515
{
516
    const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
517
518
    Context *clCxt = det.clCxt;
519
    string kernelName = useMask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
520
    std::vector< std::pair<size_t, const void *> > args;
521
522
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
523
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
524
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
525
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
526
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&counterOffset));
527
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
528
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
529
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
530
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
531
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nLayers));
532
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
533
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
534
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_cols));
535
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&maxCandidates));
536
    args.push_back( std::make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
537
538
    if(useMask)
539
    {
540
        if(maskSumTex)
541
        {
542
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex));
543
        }
544
        else
545
        {
546
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
547
        }
548
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
549
    }
550
    size_t localThreads[3]  = {16, 16, 1};
551
    size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
552
                               divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
553
                               1
554
                              };
555
556
    openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
557
}
558
559
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
560
        oclMat &keypoints, oclMat &counters_, int octave, int layer_rows, int max_features)
561
{
562
    Context *clCxt = det.clCxt;
563
    string kernelName = "icvInterpolateKeypoint";
564
    std::vector< std::pair<size_t, const void *> > args;
565
566
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
567
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
568
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
569
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counters_.data));
570
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
571
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
572
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
573
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
574
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
575
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
576
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&max_features));
577
578
    size_t localThreads[3]  = {3, 3, 3};
579
    size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
580
581
    openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
582
}
583
584
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
585
{
586
    Context *clCxt = counters.clCxt;
587
    string kernelName = "icvCalcOrientation";
588
589
    std::vector< std::pair<size_t, const void *> > args;
590
591
    if(sumTex)
592
    {
593
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
594
    }
595
    else
596
    {
597
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
598
    }
599
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
600
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
601
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
602
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
603
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
604
605
    size_t localThreads[3]  = {ORI_LOCAL_SIZE, 1, 1};
606
    size_t globalThreads[3] = {nFeatures * localThreads[0], 1, 1};
607
608
    openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
609
}
610
611
void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
612
{
613
    Context *clCxt = counters.clCxt;
614
    string kernelName = "icvSetUpright";
615
616
    std::vector< std::pair<size_t, const void *> > args;
617
618
    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
619
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
620
    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nFeatures));
621
622
    size_t localThreads[3]  = {256, 1, 1};
623
    size_t globalThreads[3] = {saturate_cast<size_t>(nFeatures), 1, 1};
624
625
    openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
626
}
627
628
629
void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
630
{
631
    // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
632
    Context *clCxt = descriptors.clCxt;
633
    string kernelName;
634
    std::vector< std::pair<size_t, const void *> > args;
635
    size_t localThreads[3]  = {1, 1, 1};
636
    size_t globalThreads[3] = {1, 1, 1};
637
638
    if(descriptors.cols == 64)
639
    {
640
        kernelName = "compute_descriptors64";
641
642
        localThreads[0] = 6;
643
        localThreads[1] = 6;
644
645
        globalThreads[0] = nFeatures * localThreads[0];
646
        globalThreads[1] = 16 * localThreads[1];
647
648
        args.clear();
649
        if(imgTex)
650
        {
651
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
652
        }
653
        else
654
        {
655
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
656
        }
657
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
658
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
659
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
660
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
661
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
662
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
663
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
664
665
        openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
666
667
        kernelName = "normalize_descriptors64";
668
669
        localThreads[0] = 64;
670
        localThreads[1] = 1;
671
672
        globalThreads[0] = nFeatures * localThreads[0];
673
        globalThreads[1] = localThreads[1];
674
675
        args.clear();
676
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
677
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
678
679
        openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
680
    }
681
    else
682
    {
683
        kernelName = "compute_descriptors128";
684
685
        localThreads[0] = 6;
686
        localThreads[1] = 6;
687
688
        globalThreads[0] = nFeatures * localThreads[0];
689
        globalThreads[1] = 16 * localThreads[1];
690
691
        args.clear();
692
        if(imgTex)
693
        {
694
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
695
        }
696
        else
697
        {
698
            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
699
        }
700
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
701
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
702
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
703
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
704
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
705
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
706
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
707
708
        openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
709
710
        kernelName = "normalize_descriptors128";
711
712
        localThreads[0] = 128;
713
        localThreads[1] = 1;
714
715
        globalThreads[0] = nFeatures * localThreads[0];
716
        globalThreads[1] = localThreads[1];
717
718
        args.clear();
719
        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
720
        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
721
722
        openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
723
    }
724
}
725
726
#endif //HAVE_OPENCV_OCL