--- modules/imgproc/src/thresh.cpp	2011-10-18 18:42:40.000000000 +0900
+++ modules/imgproc/src/thresh.cpp	2011-10-31 21:26:06.000000000 +0900
@@ -116,6 +116,141 @@
         CV_Error( CV_StsBadArg, "Unknown threshold type" );
     }
 
+/* Optimisation for ARM NEON */
+#if defined __GNUC__ && CV_NEON
+    if( CPU_HAS_NEON_FEATURE )
+    {
+        /* XOR with 0x80 flips the sign bit, turning an unsigned compare
+           against thresh into a signed compare against thresh ^ 0x80 */
+        uint8x16_t _x80 = vmovq_n_u8( 0x80 );
+        uint8x16_t thresh_u = vmovq_n_u8( thresh );
+        int8x16_t thresh_s = vmovq_n_s8( (int8_t)(thresh ^ 0x80) );
+        uint8x16_t maxval_ = vmovq_n_u8( maxval );
+        /* 64-bit halves for the 8-pixel tail loops */
+        uint8x8_t _x80d = vget_low_u8( _x80 );
+        uint8x8_t thresh_ud = vget_low_u8( thresh_u );
+        int8x8_t thresh_sd = vget_low_s8( thresh_s );
+        uint8x8_t maxvald = vget_low_u8( maxval_ );
+        j_scalar = roi.width & -8;
+
+        for( i = 0; i < roi.height; i++ )
+        {
+            const uchar* src = (const uchar*)(_src.data + _src.step*i);
+            uchar* dst = (uchar*)(_dst.data + _dst.step*i);
+
+            switch( type )
+            {
+            case THRESH_BINARY:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    v0 = vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v0, _x80) ), thresh_s );
+                    v1 = vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v1, _x80) ), thresh_s );
+                    v0 = vandq_u8( v0, maxval_ );
+                    v1 = vandq_u8( v1, maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vcgt_s8( vreinterpret_s8_u8( veor_u8(v0, _x80d) ), thresh_sd );
+                    v0 = vand_u8( v0, maxvald );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+
+            case THRESH_BINARY_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    v0 = vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v0, _x80) ), thresh_s );
+                    v1 = vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v1, _x80) ), thresh_s );
+                    v0 = vandq_u8( vmvnq_u8( v0 ), maxval_ );
+                    v1 = vandq_u8( vmvnq_u8( v1 ), maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vcgt_s8( vreinterpret_s8_u8( veor_u8(v0, _x80d) ), thresh_sd );
+                    v0 = vand_u8( vmvn_u8( v0 ), maxvald );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+
+            case THRESH_TRUNC:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    /* v - max(v - thresh, 0) == min(v, thresh) */
+                    v0 = vqsubq_u8( v0, vqsubq_u8( v0, thresh_u ) );
+                    v1 = vqsubq_u8( v1, vqsubq_u8( v1, thresh_u ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vqsub_u8( v0, vqsub_u8( v0, thresh_ud ) );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+
+            case THRESH_TOZERO:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    v0 = vandq_u8( v0, vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v0, _x80) ), thresh_s ) );
+                    v1 = vandq_u8( v1, vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v1, _x80) ), thresh_s ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vand_u8( v0, vcgt_s8( vreinterpret_s8_u8( veor_u8(v0, _x80d) ), thresh_sd ) );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+
+            case THRESH_TOZERO_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    v0 = vandq_u8( vmvnq_u8( vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v0, _x80) ), thresh_s ) ), v0 );
+                    v1 = vandq_u8( vmvnq_u8( vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v1, _x80) ), thresh_s ) ), v1 );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vand_u8( vmvn_u8( vcgt_s8( vreinterpret_s8_u8( veor_u8(v0, _x80d) ), thresh_sd ) ), v0 );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+            }
+        }
+    }
+#endif
+
 #if CV_SSE2
     if( checkHardwareSupport(CV_CPU_SSE2) )
     {
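
Note on the sign-flip trick, with a standalone sketch that is not part of the patch: NEON does provide vcgtq_u8 for direct unsigned comparison, but this patch mirrors the SSE2 path below, where no unsigned byte compare exists and the comparison is done by XORing both operands with 0x80 and comparing as signed bytes. The identity can be checked exhaustively on the host in plain C++, no NEON required:

#include <cassert>
#include <cstdint>

int main()
{
    // For every byte pair, x > t as unsigned must equal
    // (x ^ 0x80) > (t ^ 0x80) as signed: XOR with 0x80 maps
    // 0..255 monotonically onto -128..127.
    for( int x = 0; x < 256; x++ )
        for( int t = 0; t < 256; t++ )
        {
            bool u = (uint8_t)x > (uint8_t)t;
            bool s = (int8_t)(x ^ 0x80) > (int8_t)(t ^ 0x80);
            assert( u == s );
        }
    return 0;
}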