44 |
44 |
#include "precomp.hpp"
|
45 |
45 |
#include <stdio.h>
|
46 |
46 |
|
47 |
|
/*#if CV_SSE2
|
48 |
|
# if CV_SSE4 || defined __SSE4__
|
49 |
|
# include <smmintrin.h>
|
50 |
|
# else
|
51 |
|
# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
|
52 |
|
# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
|
53 |
|
# endif
|
54 |
|
#if defined CV_ICC
|
55 |
|
# define CV_HAAR_USE_SSE 1
|
56 |
|
#endif
|
57 |
|
#endif*/
|
|
47 |
/* Bitwise select built from XOR/AND on packed doubles: the result takes the
   bits of `rhs` wherever `mask` has a bit set and the bits of `lhs` elsewhere,
   i.e. lhs ^ ((rhs ^ lhs) & mask).
   `lhs` is expanded twice, so pass only side-effect-free expressions.
   NOTE(review): this shares its name with the SSE4.1 intrinsic declared in
   <smmintrin.h>; confirm that header is never included after this point. */
#define _mm_blendv_pd(lhs, rhs, mask) _mm_xor_pd(lhs, _mm_and_pd(_mm_xor_pd(rhs, lhs), mask))
|
|
48 |
/* Bitwise select built from XOR/AND on packed floats: the result takes the
   bits of `rhs` wherever `mask` has a bit set and the bits of `lhs` elsewhere,
   i.e. lhs ^ ((rhs ^ lhs) & mask).
   `lhs` is expanded twice, so pass only side-effect-free expressions.
   NOTE(review): this shares its name with the SSE4.1 intrinsic declared in
   <smmintrin.h>; confirm that header is never included after this point. */
#define _mm_blendv_ps(lhs, rhs, mask) _mm_xor_ps(lhs, _mm_and_ps(_mm_xor_ps(rhs, lhs), mask))
|
58 |
49 |
|
59 |
50 |
/* these settings affect the quality of detection: change with care */
|
60 |
51 |
#define CV_ADJUST_FEATURES 1
|
... | ... | |
721 |
712 |
}
|
722 |
713 |
else if( cascade->is_stump_based )
|
723 |
714 |
{
|
724 |
|
for( i = start_stage; i < cascade->count; i++ )
|
|
715 |
if ( cv::checkHardwareSupport(CV_CPU_SSE2) )
|
725 |
716 |
{
|
726 |
|
#ifndef CV_HAAR_USE_SSE
|
727 |
|
double stage_sum = 0;
|
728 |
|
#else
|
729 |
|
__m128d stage_sum = _mm_setzero_pd();
|
730 |
|
#endif
|
731 |
|
|
732 |
|
if( cascade->stage_classifier[i].two_rects )
|
|
717 |
for( i = start_stage; i < cascade->count; i++ )
|
733 |
718 |
{
|
734 |
|
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
|
|
719 |
__m128d stage_sum = _mm_setzero_pd();
|
|
720 |
|
|
721 |
if( cascade->stage_classifier[i].two_rects )
|
735 |
722 |
{
|
736 |
|
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
|
737 |
|
CvHidHaarTreeNode* node = classifier->node;
|
738 |
|
#ifndef CV_HAAR_USE_SSE
|
739 |
|
double t = node->threshold*variance_norm_factor;
|
740 |
|
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
|
741 |
|
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
|
742 |
|
stage_sum += classifier->alpha[sum >= t];
|
743 |
|
#else
|
744 |
|
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
|
745 |
|
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
|
746 |
|
__m128d a = _mm_set_sd(classifier->alpha[0]);
|
747 |
|
__m128d b = _mm_set_sd(classifier->alpha[1]);
|
748 |
|
__m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight +
|
749 |
|
calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
|
750 |
|
t = _mm_cmpgt_sd(t, sum);
|
751 |
|
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
|
752 |
|
#endif
|
|
723 |
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
|
|
724 |
{
|
|
725 |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
|
|
726 |
CvHidHaarTreeNode* node = classifier->node;
|
|
727 |
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
|
|
728 |
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
|
|
729 |
__m128d a = _mm_set_sd(classifier->alpha[0]);
|
|
730 |
__m128d b = _mm_set_sd(classifier->alpha[1]);
|
|
731 |
__m128d sum = _mm_set_sd(calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight +
|
|
732 |
calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight);
|
|
733 |
t = _mm_cmpgt_sd(t, sum);
|
|
734 |
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
|
|
735 |
}
|
753 |
736 |
}
|
|
737 |
else
|
|
738 |
{
|
|
739 |
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
|
|
740 |
{
|
|
741 |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
|
|
742 |
CvHidHaarTreeNode* node = classifier->node;
|
|
743 |
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
|
|
744 |
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
|
|
745 |
__m128d a = _mm_set_sd(classifier->alpha[0]);
|
|
746 |
__m128d b = _mm_set_sd(classifier->alpha[1]);
|
|
747 |
double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
|
|
748 |
_sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
|
|
749 |
if( node->feature.rect[2].p0 )
|
|
750 |
_sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
|
|
751 |
__m128d sum = _mm_set_sd(_sum);
|
|
752 |
|
|
753 |
t = _mm_cmpgt_sd(t, sum);
|
|
754 |
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
|
|
755 |
}
|
|
756 |
}
|
|
757 |
|
|
758 |
__m128d i_threshold = _mm_set_sd(cascade->stage_classifier[i].threshold);
|
|
759 |
if( _mm_comilt_sd(stage_sum, i_threshold) )
|
|
760 |
return -i;
|
754 |
761 |
}
|
755 |
|
else
|
|
762 |
}
|
|
763 |
else
|
|
764 |
{
|
|
765 |
for( i = start_stage; i < cascade->count; i++ )
|
756 |
766 |
{
|
757 |
|
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
|
|
767 |
double stage_sum = 0;
|
|
768 |
|
|
769 |
if( cascade->stage_classifier[i].two_rects )
|
758 |
770 |
{
|
759 |
|
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
|
760 |
|
CvHidHaarTreeNode* node = classifier->node;
|
761 |
|
#ifndef CV_HAAR_USE_SSE
|
762 |
|
double t = node->threshold*variance_norm_factor;
|
763 |
|
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
|
764 |
|
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
|
765 |
|
if( node->feature.rect[2].p0 )
|
766 |
|
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
|
767 |
|
|
768 |
|
stage_sum += classifier->alpha[sum >= t];
|
769 |
|
#else
|
770 |
|
// ayasin - NHM perf optim. Avoid use of costly flaky jcc
|
771 |
|
__m128d t = _mm_set_sd(node->threshold*variance_norm_factor);
|
772 |
|
__m128d a = _mm_set_sd(classifier->alpha[0]);
|
773 |
|
__m128d b = _mm_set_sd(classifier->alpha[1]);
|
774 |
|
double _sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
|
775 |
|
_sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
|
776 |
|
if( node->feature.rect[2].p0 )
|
777 |
|
_sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
|
778 |
|
__m128d sum = _mm_set_sd(_sum);
|
779 |
|
|
780 |
|
t = _mm_cmpgt_sd(t, sum);
|
781 |
|
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
|
782 |
|
#endif
|
|
771 |
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
|
|
772 |
{
|
|
773 |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
|
|
774 |
CvHidHaarTreeNode* node = classifier->node;
|
|
775 |
double t = node->threshold*variance_norm_factor;
|
|
776 |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
|
|
777 |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
|
|
778 |
stage_sum += classifier->alpha[sum >= t];
|
|
779 |
}
|
783 |
780 |
}
|
|
781 |
else
|
|
782 |
{
|
|
783 |
for( j = 0; j < cascade->stage_classifier[i].count; j++ )
|
|
784 |
{
|
|
785 |
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
|
|
786 |
CvHidHaarTreeNode* node = classifier->node;
|
|
787 |
double t = node->threshold*variance_norm_factor;
|
|
788 |
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
|
|
789 |
sum += calc_sum(node->feature.rect[1],p_offset) * node->feature.rect[1].weight;
|
|
790 |
if( node->feature.rect[2].p0 )
|
|
791 |
sum += calc_sum(node->feature.rect[2],p_offset) * node->feature.rect[2].weight;
|
|
792 |
|
|
793 |
stage_sum += classifier->alpha[sum >= t];
|
|
794 |
}
|
|
795 |
}
|
|
796 |
|
|
797 |
if( stage_sum < cascade->stage_classifier[i].threshold )
|
|
798 |
return -i;
|
784 |
799 |
}
|
785 |
|
|
786 |
|
#ifndef CV_HAAR_USE_SSE
|
787 |
|
if( stage_sum < cascade->stage_classifier[i].threshold )
|
788 |
|
#else
|
789 |
|
__m128d i_threshold = _mm_set_sd(cascade->stage_classifier[i].threshold);
|
790 |
|
if( _mm_comilt_sd(stage_sum, i_threshold) )
|
791 |
|
#endif
|
792 |
|
return -i;
|
793 |
800 |
}
|
794 |
801 |
}
|
795 |
802 |
else
|