opencv231_neon_thresh_8u.patch

NEON optimisation of cv::threshold() for iOS - Yasuhiro Yoshimura, 2011-10-31 02:03 pm

Download (6.3 kB)
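Note on the approach (not part of the patch): the hunk below inserts a NEON branch into the CV_8U threshold kernel in modules/imgproc/src/thresh.cpp, just ahead of the existing SSE2 branch, and vectorises all five threshold types (THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV) in 32- and 8-pixel steps; j_scalar = roi.width & -8 appears to leave the remaining columns to the function's existing scalar tail code. The comparisons use the usual trick for an unsigned byte compare on a signed SIMD compare instruction: XOR both operands with 0x80, which maps the unsigned range [0,255] onto the signed range [-128,127] without changing the ordering, then compare with vcgtq_s8. A minimal scalar sketch of that equivalence, for illustration only:

    #include <cassert>
    #include <cstdint>

    // For 8-bit unsigned a and t:  a > t  <=>  (int8_t)(a ^ 0x80) > (int8_t)(t ^ 0x80).
    // XOR with 0x80 flips the sign bit, shifting every value down by 128, so the
    // ordering is preserved.  The NEON path applies the same idea 16 pixels at a
    // time with veorq_u8 followed by vcgtq_s8.
    int main()
    {
        for (int a = 0; a < 256; ++a)
            for (int t = 0; t < 256; ++t)
            {
                bool unsigned_gt = a > t;
                bool signed_gt   = (int8_t)(a ^ 0x80) > (int8_t)(t ^ 0x80);
                assert(unsigned_gt == signed_gt);
            }
        return 0;
    }

The THRESH_TRUNC case uses the same identity as the SSE2 path: min(v, thresh) == v - saturating_sub(v, thresh), implemented here with vqsubq_u8.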

 
modules/imgproc/src/thresh.cpp 2011-10-31 21:26:06.000000000 +0900
@@ -116,6 +116,145 @@
         CV_Error( CV_StsBadArg, "Unknown threshold type" );
     }
 
+/* Optimisation for ARM NEON */
+#if defined __GNUC__ && CV_NEON
+    if( CPU_HAS_NEON_FEATURE )
+    {
+        uint8x16_t _x80 = vmovq_n_u8('\x80');
+        uint8x16_t mask = {0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255};
+        uint8x16_t thresh_u = vmovq_n_u8(thresh);
+        uint8x16_t thresh_s = vmovq_n_u8(thresh ^ 0x80);
+        uint8x16_t maxval_ = vmovq_n_u8(maxval);
+        j_scalar = roi.width & -8;
+
+        for( i = 0; i < roi.height; i++ )
+        {
+            const uchar* src = (const uchar*)(_src.data + _src.step*i);
+            uchar* dst = (uchar*)(_dst.data + _dst.step*i);
+
+            switch( type )
+            {
+            case THRESH_BINARY:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vcgtq_s8( veorq_u8(v0, _x80), thresh_s );
+                    v1 = vcgtq_s8( veorq_u8(v1, _x80), thresh_s );
+                    v0 = vandq_u8( v0, maxval_ );
+                    v1 = vandq_u8( v1, maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vcgtq_s8( veorq_u8(v0, _x80), thresh_s );
+                    v0 = vandq_u8( v0, maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            case THRESH_BINARY_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vcgtq_s8( veorq_u8(v0, _x80), thresh_s );
+                    v1 = vcgtq_s8( veorq_u8(v1, _x80), thresh_s );
+                    v0 = vandq_u8( vmvnq_u8( v0 ), maxval_ );
+                    v1 = vandq_u8( vmvnq_u8( v1 ), maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vcgtq_s8( veorq_u8(v0, _x80), thresh_s );
+                    v0 = vandq_u8( vmvnq_u8( v0 ), maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            case THRESH_TRUNC:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vqsubq_u8( v0, vqsubq_u8( v0, thresh_u ) );
+                    v1 = vqsubq_u8( v1, vqsubq_u8( v1, thresh_u ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vqsubq_u8( v0, vqsubq_u8( v0, thresh_u ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            case THRESH_TOZERO:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vandq_u8( v0, vcgtq_s8( veorq_u8(v0, _x80), thresh_s ) );
+                    v1 = vandq_u8( v1, vcgtq_s8( veorq_u8(v1, _x80), thresh_s ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vandq_u8( v0, vcgtq_s8( veorq_u8(v0, _x80), thresh_s ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            case THRESH_TOZERO_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vandq_u8( vmvnq_u8( vcgtq_s8( veorq_u8(v0, _x80), thresh_s ) ), v0 );
+                    v1 = vandq_u8( vmvnq_u8( vcgtq_s8( veorq_u8(v1, _x80), thresh_s ) ), v1 );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vandq_u8( vmvnq_u8( vcgtq_s8( veorq_u8(v0, _x80), thresh_s ) ), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            }
+        }
+    }
+#endif
+
 #if CV_SSE2
     if( checkHardwareSupport(CV_CPU_SSE2) )
     {