diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp index e6563ba..6626b90 100644 --- a/modules/objdetect/src/haar.cpp +++ b/modules/objdetect/src/haar.cpp @@ -44,17 +44,8 @@ #include "precomp.hpp" #include -/*#if CV_SSE2 -# if CV_SSE4 || defined __SSE4__ -# include -# else -# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) -# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) -# endif -#if defined CV_ICC -# define CV_HAAR_USE_SSE 1 -#endif -#endif*/ +#define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m)) +#define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m)) /* these settings affect the quality of detection: change with care */ #define CV_ADJUST_FEATURES 1 @@ -721,75 +712,91 @@ cvRunHaarClassifierCascade( const CvHaarClassifierCascade* _cascade, } else if( cascade->is_stump_based ) { - for( i = start_stage; i < cascade->count; i++ ) + if ( cv::checkHardwareSupport(CV_CPU_SSE2) ) { -#ifndef CV_HAAR_USE_SSE - double stage_sum = 0; -#else - __m128d stage_sum = _mm_setzero_pd(); -#endif - - if( cascade->stage_classifier[i].two_rects ) + for( i = start_stage; i < cascade->count; i++ ) { - for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + __m128d stage_sum = _mm_setzero_pd(); + + if( cascade->stage_classifier[i].two_rects ) { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; -#ifndef CV_HAAR_USE_SSE - double t = node->threshold*variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - stage_sum += classifier->alpha[sum >= t]; -#else - // ayasin - NHM perf optim. Avoid use of costly flaky jcc - __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); - __m128d a = _mm_set_sd(classifier->alpha[0]); - __m128d b = _mm_set_sd(classifier->alpha[1]); - __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight + - calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); - t = _mm_cmpgt_sd(t, sum); - stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); -#endif + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + // ayasin - NHM perf optim. Avoid use of costly flaky jcc + __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); + __m128d a = _mm_set_sd(classifier->alpha[0]); + __m128d b = _mm_set_sd(classifier->alpha[1]); + __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight + + calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight); + t = _mm_cmpgt_sd(t, sum); + stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); + } } + else + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + // ayasin - NHM perf optim. Avoid use of costly flaky jcc + __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); + __m128d a = _mm_set_sd(classifier->alpha[0]); + __m128d b = _mm_set_sd(classifier->alpha[1]); + double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + __m128d sum = _mm_set_sd(_sum); + + t = _mm_cmpgt_sd(t, sum); + stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); + } + } + + __m128d i_threshold = _mm_set_sd(cascade->stage_classifier[i].threshold); + if( _mm_comilt_sd(stage_sum, i_threshold) ) + return -i; } - else + } + else + { + for( i = start_stage; i < cascade->count; i++ ) { - for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + double stage_sum = 0; + + if( cascade->stage_classifier[i].two_rects ) { - CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; - CvHidHaarTreeNode* node = classifier->node; -#ifndef CV_HAAR_USE_SSE - double t = node->threshold*variance_norm_factor; - double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - - stage_sum += classifier->alpha[sum >= t]; -#else - // ayasin - NHM perf optim. Avoid use of costly flaky jcc - __m128d t = _mm_set_sd(node->threshold*variance_norm_factor); - __m128d a = _mm_set_sd(classifier->alpha[0]); - __m128d b = _mm_set_sd(classifier->alpha[1]); - double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; - _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; - if( node->feature.rect[2].p0 ) - _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; - __m128d sum = _mm_set_sd(_sum); - - t = _mm_cmpgt_sd(t, sum); - stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); -#endif + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + stage_sum += classifier->alpha[sum >= t]; + } } + else + { + for( j = 0; j < cascade->stage_classifier[i].count; j++ ) + { + CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; + CvHidHaarTreeNode* node = classifier->node; + double t = node->threshold*variance_norm_factor; + double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; + sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight; + if( node->feature.rect[2].p0 ) + sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight; + + stage_sum += classifier->alpha[sum >= t]; + } + } + + if( stage_sum < cascade->stage_classifier[i].threshold ) + return -i; } - -#ifndef CV_HAAR_USE_SSE - if( stage_sum < cascade->stage_classifier[i].threshold ) -#else - __m128d i_threshold = _mm_set_sd(cascade->stage_classifier[i].threshold); - if( _mm_comilt_sd(stage_sum, i_threshold) ) -#endif - return -i; } } else