opencv231_neon_thresh_8u.patch

NEON optimisation of cv::threshold() for iOS - Yasuhiro Yoshimura, 2011-10-31 02:03 pm

Download (6.3 kB)
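Note on the approach (not part of the patch): the hunk below inserts a NEON branch into the CV_8U threshold kernel in modules/imgproc/src/thresh.cpp, just ahead of the existing SSE2 branch, and vectorises all five threshold types (THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV) in 32- and 8-pixel steps; j_scalar = roi.width & -8 appears to leave the remaining columns to the function's existing scalar tail code. The comparisons use the usual trick for an unsigned byte compare on a signed SIMD compare instruction: XOR both operands with 0x80, which maps the unsigned range [0,255] onto the signed range [-128,127] without changing the ordering, then compare with vcgtq_s8. A minimal scalar sketch of that equivalence, for illustration only:

    #include <cassert>
    #include <cstdint>

    // For 8-bit unsigned a and t:  a > t  <=>  (int8_t)(a ^ 0x80) > (int8_t)(t ^ 0x80).
    // XOR with 0x80 flips the sign bit, shifting every value down by 128, so the
    // ordering is preserved.  The NEON path applies the same idea 16 pixels at a
    // time with veorq_u8 followed by vcgtq_s8.
    int main()
    {
        for (int a = 0; a < 256; ++a)
            for (int t = 0; t < 256; ++t)
            {
                bool unsigned_gt = a > t;
                bool signed_gt   = (int8_t)(a ^ 0x80) > (int8_t)(t ^ 0x80);
                assert(unsigned_gt == signed_gt);
            }
        return 0;
    }

The THRESH_TRUNC case uses the same identity as the SSE2 path: min(v, thresh) == v - saturating_sub(v, thresh), implemented here with vqsubq_u8.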

 
modules/imgproc/src/thresh.cpp 2011-10-31 21:26:06.000000000 +0900
@@ -116,6 +116,145 @@
         CV_Error( CV_StsBadArg, "Unknown threshold type" );
     }
 
+/* Optimisation for ARM NEON */
+#if defined __GNUC__ && CV_NEON
+    if( CPU_HAS_NEON_FEATURE )
+    {
+        uint8x16_t _x80 = vmovq_n_u8('\x80');
+        uint8x16_t mask = {0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255};
+        uint8x16_t thresh_u = vmovq_n_u8(thresh);
+        uint8x16_t thresh_s = vmovq_n_u8(thresh ^ 0x80);
+        uint8x16_t maxval_ = vmovq_n_u8(maxval);
+        j_scalar = roi.width & -8;
+
+        for( i = 0; i < roi.height; i++ )
+        {
+            const uchar* src = (const uchar*)(_src.data + _src.step*i);
+            uchar* dst = (uchar*)(_dst.data + _dst.step*i);
+
+            switch( type )
+            {
+            case THRESH_BINARY:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vcgtq_s8( veorq_u8(v0, _x80), thresh_s );
+                    v1 = vcgtq_s8( veorq_u8(v1, _x80), thresh_s );
+                    v0 = vandq_u8( v0, maxval_ );
+                    v1 = vandq_u8( v1, maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vcgtq_s8( veorq_u8(v0, _x80), thresh_s );
+                    v0 = vandq_u8( v0, maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            case THRESH_BINARY_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vcgtq_s8( veorq_u8(v0, _x80), thresh_s );
+                    v1 = vcgtq_s8( veorq_u8(v1, _x80), thresh_s );
+                    v0 = vandq_u8( vmvnq_u8( v0 ), maxval_ );
+                    v1 = vandq_u8( vmvnq_u8( v1 ), maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vcgtq_s8( veorq_u8(v0, _x80), thresh_s );
+                    v0 = vandq_u8( vmvnq_u8( v0 ), maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            case THRESH_TRUNC:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vqsubq_u8( v0, vqsubq_u8( v0, thresh_u ) );
+                    v1 = vqsubq_u8( v1, vqsubq_u8( v1, thresh_u ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vqsubq_u8( v0, vqsubq_u8( v0, thresh_u ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            case THRESH_TOZERO:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vandq_u8( v0, vcgtq_s8( veorq_u8(v0, _x80), thresh_s ) );
+                    v1 = vandq_u8( v1, vcgtq_s8( veorq_u8(v1, _x80), thresh_s ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vandq_u8( v0, vcgtq_s8( veorq_u8(v0, _x80), thresh_s ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            case THRESH_TOZERO_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (uint8_t const*)(src + j) );
+                    v1 = vld1q_u8( (uint8_t const*)(src + j + 16) );
+                    v0 = vandq_u8( vmvnq_u8( vcgtq_s8( veorq_u8(v0, _x80), thresh_s ) ), v0 );
+                    v1 = vandq_u8( vmvnq_u8( vcgtq_s8( veorq_u8(v1, _x80), thresh_s ) ), v1 );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x16_t v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v0 = vandq_u8( v0, mask );
+                    v0 = vandq_u8( vmvnq_u8( vcgtq_s8( veorq_u8(v0, _x80), thresh_s ) ), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    v0 = vandq_u8( v0, mask );
+                }
+                break;
+
+            }
+        }
+    }
+#endif
+
 #if CV_SSE2
     if( checkHardwareSupport(CV_CPU_SSE2) )
     {