cvhaar-sse.patch

Peter Collingbourne, 2010-07-26 09:56 pm

Download (9 kB)

 
b/modules/objdetect/src/haar.cpp
44 44
#include "precomp.hpp"
45 45
#include <stdio.h>
46 46

  
47
/*#if CV_SSE2
48
#   if CV_SSE4 || defined __SSE4__
49
#       include <smmintrin.h>
50
#   else
51
#       define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
52
#       define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
53
#   endif
54
#if defined CV_ICC
55
#   define CV_HAAR_USE_SSE 1
56
#endif
57
#endif*/
47
/* Bitwise select for packed doubles built from XOR/AND: a ^ ((b ^ a) & m)
 * yields the bits of b wherever m is set and the bits of a wherever m is clear.
 * Callers in this file pass the result of _mm_cmpgt_sd as m.
 * NOTE(review): this reuses the name of the SSE4.1 intrinsic, which per Intel's
 * documentation looks only at each element's sign bit - confirm m is always a
 * full all-ones/all-zeros mask and that <smmintrin.h> is not also included. */
#define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
48
/* Bitwise select for packed floats built from XOR/AND: a ^ ((b ^ a) & m)
 * yields the bits of b wherever m is set and the bits of a wherever m is clear.
 * NOTE(review): this reuses the name of the SSE4.1 intrinsic - confirm that
 * <smmintrin.h> is not also included in this translation unit. */
#define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
58 49

  
59 50
/* these settings affect the quality of detection: change with care */
60 51
#define CV_ADJUST_FEATURES 1
......
721 712
    }
722 713
    else if( cascade->is_stump_based )
723 714
    {
724
        for( i = start_stage; i < cascade->count; i++ )
715
        if ( cv::checkHardwareSupport(CV_CPU_SSE2) )
725 716
        {
726
#ifndef CV_HAAR_USE_SSE
727
            double stage_sum = 0;
728
#else
729
            __m128d stage_sum = _mm_setzero_pd();
730
#endif
731

  
732
            if( cascade->stage_classifier[i].two_rects )
717
            for( i = start_stage; i < cascade->count; i++ )
733 718
            {
734
                for( j = 0; j < cascade->stage_classifier[i].count; j++ )
719
                __m128d stage_sum = _mm_setzero_pd();
720
     
721
                if( cascade->stage_classifier[i].two_rects )
735 722
                {
736
                    CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
737
                    CvHidHaarTreeNode* node = classifier->node;
738
#ifndef CV_HAAR_USE_SSE
739
                    double t = node->threshold*variance_norm_factor;
740
                    double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
741
                    sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
742
                    stage_sum += classifier->alpha[sum >= t];
743
#else
744
                    // ayasin - NHM (Nehalem) perf optimization: avoid a costly, poorly predicted conditional jump (jcc) by selecting alpha with a branchless compare + blend
745
                    __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
746
                    __m128d a = _mm_set_sd(classifier->alpha[0]);
747
                    __m128d b = _mm_set_sd(classifier->alpha[1]);
748
                    __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight +
749
                                             calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
750
                    t = _mm_cmpgt_sd(t, sum);
751
                    stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
752
#endif
723
                    for( j = 0; j < cascade->stage_classifier[i].count; j++ )
724
                    {
725
                        CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
726
                        CvHidHaarTreeNode* node = classifier->node;
727
                        // ayasin - NHM (Nehalem) perf optimization: avoid a costly, poorly predicted conditional jump (jcc) by selecting alpha with a branchless compare + blend
728
                        __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
729
                        __m128d a = _mm_set_sd(classifier->alpha[0]);
730
                        __m128d b = _mm_set_sd(classifier->alpha[1]);
731
                        __m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight +
732
                                                 calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
733
                        t = _mm_cmpgt_sd(t, sum);
734
                        stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
735
                    }
753 736
                }
737
                else
738
                {
739
                    for( j = 0; j < cascade->stage_classifier[i].count; j++ )
740
                    {
741
                        CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
742
                        CvHidHaarTreeNode* node = classifier->node;
743
                        // ayasin - NHM (Nehalem) perf optimization: avoid a costly, poorly predicted conditional jump (jcc) by selecting alpha with a branchless compare + blend
744
                        __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
745
                        __m128d a = _mm_set_sd(classifier->alpha[0]);
746
                        __m128d b = _mm_set_sd(classifier->alpha[1]);
747
                        double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
748
                        _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
749
                        if( node->feature.rect[2].p0 )
750
                            _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
751
                        __m128d sum = _mm_set_sd(_sum);
752
                        
753
                        t = _mm_cmpgt_sd(t, sum);
754
                        stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
755
                    }
756
                }
757
     
758
                __m128d i_threshold = _mm_set_sd(cascade->stage_classifier[i].threshold);
759
                if( _mm_comilt_sd(stage_sum, i_threshold) )
760
                    return -i;
754 761
            }
755
            else
762
        }
763
        else
764
        {
765
            for( i = start_stage; i < cascade->count; i++ )
756 766
            {
757
                for( j = 0; j < cascade->stage_classifier[i].count; j++ )
767
                double stage_sum = 0;
768
     
769
                if( cascade->stage_classifier[i].two_rects )
758 770
                {
759
                    CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
760
                    CvHidHaarTreeNode* node = classifier->node;
761
#ifndef CV_HAAR_USE_SSE
762
                    double t = node->threshold*variance_norm_factor;
763
                    double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
764
                    sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
765
                    if( node->feature.rect[2].p0 )
766
                        sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
767
                    
768
                    stage_sum += classifier->alpha[sum >= t];
769
#else
770
                    // ayasin - NHM (Nehalem) perf optimization: avoid a costly, poorly predicted conditional jump (jcc) by selecting alpha with a branchless compare + blend
771
                    __m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
772
                    __m128d a = _mm_set_sd(classifier->alpha[0]);
773
                    __m128d b = _mm_set_sd(classifier->alpha[1]);
774
                    double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
775
                    _sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
776
                    if( node->feature.rect[2].p0 )
777
                        _sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
778
                    __m128d sum = _mm_set_sd(_sum);
779
                    
780
                    t = _mm_cmpgt_sd(t, sum);
781
                    stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
782
#endif
771
                    for( j = 0; j < cascade->stage_classifier[i].count; j++ )
772
                    {
773
                        CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
774
                        CvHidHaarTreeNode* node = classifier->node;
775
                        double t = node->threshold*variance_norm_factor;
776
                        double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
777
                        sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
778
                        stage_sum += classifier->alpha[sum >= t];
779
                    }
783 780
                }
781
                else
782
                {
783
                    for( j = 0; j < cascade->stage_classifier[i].count; j++ )
784
                    {
785
                        CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
786
                        CvHidHaarTreeNode* node = classifier->node;
787
                        double t = node->threshold*variance_norm_factor;
788
                        double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
789
                        sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
790
                        if( node->feature.rect[2].p0 )
791
                            sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
792
                        
793
                        stage_sum += classifier->alpha[sum >= t];
794
                    }
795
                }
796
     
797
                if( stage_sum < cascade->stage_classifier[i].threshold )
798
                    return -i;
784 799
            }
785

  
786
#ifndef CV_HAAR_USE_SSE
787
            if( stage_sum < cascade->stage_classifier[i].threshold )
788
#else
789
            __m128d i_threshold = _mm_set_sd(cascade->stage_classifier[i].threshold);
790
            if( _mm_comilt_sd(stage_sum, i_threshold) )
791
#endif
792
                return -i;
793 800
        }
794 801
    }
795 802
    else