--- modules/imgproc/src/thresh.cpp	2011-10-18 18:42:40.000000000 +0900
+++ modules/imgproc/src/thresh.cpp	2011-10-31 21:26:06.000000000 +0900
@@ -116,6 +116,141 @@
         CV_Error( CV_StsBadArg, "Unknown threshold type" );
     }
 
+/* Optimisation for ARM NEON */
+#if defined __GNUC__ && CV_NEON
+    if( CPU_HAS_NEON_FEATURE )
+    {
+        /* XOR with 0x80 flips the sign bit, turning an unsigned compare
+           against thresh into a signed compare against thresh ^ 0x80 */
+        uint8x16_t _x80 = vmovq_n_u8( 0x80 );
+        uint8x16_t thresh_u = vmovq_n_u8( thresh );
+        int8x16_t thresh_s = vmovq_n_s8( (int8_t)(thresh ^ 0x80) );
+        uint8x16_t maxval_ = vmovq_n_u8( maxval );
+        /* 64-bit halves for the 8-pixel tail loops */
+        uint8x8_t _x80d = vget_low_u8( _x80 );
+        uint8x8_t thresh_ud = vget_low_u8( thresh_u );
+        int8x8_t thresh_sd = vget_low_s8( thresh_s );
+        uint8x8_t maxvald = vget_low_u8( maxval_ );
+        j_scalar = roi.width & -8;
+
+        for( i = 0; i < roi.height; i++ )
+        {
+            const uchar* src = (const uchar*)(_src.data + _src.step*i);
+            uchar* dst = (uchar*)(_dst.data + _dst.step*i);
+
+            switch( type )
+            {
+            case THRESH_BINARY:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    v0 = vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v0, _x80) ), thresh_s );
+                    v1 = vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v1, _x80) ), thresh_s );
+                    v0 = vandq_u8( v0, maxval_ );
+                    v1 = vandq_u8( v1, maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vcgt_s8( vreinterpret_s8_u8( veor_u8(v0, _x80d) ), thresh_sd );
+                    v0 = vand_u8( v0, maxvald );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+
+            case THRESH_BINARY_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    v0 = vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v0, _x80) ), thresh_s );
+                    v1 = vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v1, _x80) ), thresh_s );
+                    v0 = vandq_u8( vmvnq_u8( v0 ), maxval_ );
+                    v1 = vandq_u8( vmvnq_u8( v1 ), maxval_ );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vcgt_s8( vreinterpret_s8_u8( veor_u8(v0, _x80d) ), thresh_sd );
+                    v0 = vand_u8( vmvn_u8( v0 ), maxvald );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+
+            case THRESH_TRUNC:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    /* v - max(v - thresh, 0) == min(v, thresh) */
+                    v0 = vqsubq_u8( v0, vqsubq_u8( v0, thresh_u ) );
+                    v1 = vqsubq_u8( v1, vqsubq_u8( v1, thresh_u ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vqsub_u8( v0, vqsub_u8( v0, thresh_ud ) );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+
+            case THRESH_TOZERO:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    v0 = vandq_u8( v0, vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v0, _x80) ), thresh_s ) );
+                    v1 = vandq_u8( v1, vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v1, _x80) ), thresh_s ) );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vand_u8( v0, vcgt_s8( vreinterpret_s8_u8( veor_u8(v0, _x80d) ), thresh_sd ) );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+
+            case THRESH_TOZERO_INV:
+                for( j = 0; j <= roi.width - 32; j += 32 )
+                {
+                    uint8x16_t v0, v1;
+                    v0 = vld1q_u8( (const uint8_t*)(src + j) );
+                    v1 = vld1q_u8( (const uint8_t*)(src + j + 16) );
+                    v0 = vandq_u8( vmvnq_u8( vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v0, _x80) ), thresh_s ) ), v0 );
+                    v1 = vandq_u8( vmvnq_u8( vcgtq_s8( vreinterpretq_s8_u8( veorq_u8(v1, _x80) ), thresh_s ) ), v1 );
+                    vst1q_u8( (uint8_t*)(dst + j), v0 );
+                    vst1q_u8( (uint8_t*)(dst + j + 16), v1 );
+                }
+
+                for( ; j <= roi.width - 8; j += 8 )
+                {
+                    uint8x8_t v0 = vld1_u8( (const uint8_t*)(src + j) );
+                    v0 = vand_u8( vmvn_u8( vcgt_s8( vreinterpret_s8_u8( veor_u8(v0, _x80d) ), thresh_sd ) ), v0 );
+                    vst1_u8( (uint8_t*)(dst + j), v0 );
+                }
+                break;
+            }
+        }
+    }
+#endif
+
 #if CV_SSE2
     if( checkHardwareSupport(CV_CPU_SSE2) )
     {
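
Note on the sign-flip trick, with a standalone sketch that is not part of the patch: NEON does provide vcgtq_u8 for direct unsigned comparison, but this patch mirrors the SSE2 path below, where no unsigned byte compare exists and the comparison is done by XORing both operands with 0x80 and comparing as signed bytes. The identity can be checked exhaustively on the host in plain C++, no NEON required:

#include <cassert>
#include <cstdint>

int main()
{
    // For every byte pair, x > t as unsigned must equal
    // (x ^ 0x80) > (t ^ 0x80) as signed: XOR with 0x80 maps
    // 0..255 monotonically onto -128..127.
    for( int x = 0; x < 256; x++ )
        for( int t = 0; t < 256; t++ )
        {
            bool u = (uint8_t)x > (uint8_t)t;
            bool s = (int8_t)(x ^ 0x80) > (int8_t)(t ^ 0x80);
            assert( u == s );
        }
    return 0;
}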