Review Board 1.7.16


dsp.c: optimize the goertzel sample loops in dtmf_detect, mf_detect and tone_detect

Review Request #2093 - Created Sept. 1, 2012 and submitted

Alec Davis
1.8 to trunk
Reviewers
asterisk-dev
Asterisk
dtmf_detect() had to recalculate the address of the sample from amp[] for each of its 8 calls to goertzel_sample().
It now uses an optimization similar to the one already done in tone_detect().

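Counting just the lines of assembly executed for these calls, each of the 160 samples originally cost 80 lines (8 calls x 10 lines); it now costs 53 (5 lines to load the sample once, plus 8 calls x 6 lines).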

The generated code originally looked like this;
this code was repeated 8 times in dtmf_detect, which runs 8000 times a second.
677:dsp.c         ****                         goertzel_sample(s->td.dtmf.row_out, amp[j]);
 14299                  .loc 1 677 0
 14300 0d21 8B45E0      movl -32(%ebp),%eax
 14301 0d24 01C0        addl %eax,%eax
 14302 0d26 034510      addl 16(%ebp),%eax
 14303 0d29 0FB700      movzwl (%eax),%eax
 14304 0d2c 98          cwtl
 14305 0d2d 8B550C      movl 12(%ebp),%edx
 14306 0d30 81C29402    addl $660,%edx
 14306      0000

 14307 0d36 89442404    movl %eax,4(%esp)
 14308 0d3a 891424      movl %edx,(%esp)
 14309 0d3d E8EEF2FF    call goertzel_sample

=============================================================
Now this code is added once
 675:dsp.c         ****                         samp = amp[j];
 14281                  .loc 1 675 0
 14282 0cf1 8B45DC      movl -36(%ebp),%eax
 14283 0cf4 01C0        addl %eax,%eax
 14284 0cf6 034510      addl 16(%ebp),%eax
 14285 0cf9 0FB700      movzwl (%eax),%eax
 14286 0cfc 668945E6    movw %ax,-26(%ebp)
Now this code is repeated 8 times. 
 679:dsp.c         ****                         goertzel_sample(s->td.dtmf.row_out, samp);
 14298                  .loc 1 679 0
 14299 0d25 0FBF45E6    movswl -26(%ebp),%eax
 14300 0d29 8B550C      movl 12(%ebp),%edx
 14301 0d2c 81C29402    addl $660,%edx
 14301      0000
 14302 0d32 89442404    movl %eax,4(%esp)
 14303 0d36 891424      movl %edx,(%esp)
 14304 0d39 E8F2F2FF    call goertzel_sample
Yes, for DTMF at least; it is still able to recognise dialled digits.

Verified with spandsp 0.0.6 if SPANDSP_USE_FIXED_POINT is chosen

Diff revision 1 (Latest)

  1. trunk/main/dsp.c: Loading...
trunk/main/dsp.c
Revision 371689 New Change
[20] 528 lines
[+20] [+] static int tone_detect(struct ast_dsp *dsp, tone_detect_state_t *s, int16_t *amp, int samples)
529
	int i;
529
	int i;
530
	int hit = 0;
530
	int hit = 0;
531
	int limit;
531
	int limit;
532
	int res = 0;
532
	int res = 0;
533
	int16_t *ptr;
533
	int16_t *ptr;

    
   
534
	short samp;
534
	int start, end;
535
	int start, end;
535
	fragment_t mute = {0, 0};
536
	fragment_t mute = {0, 0};
536

    
   
537

   
537
	if (s->squelch && s->mute_samples > 0) {
538
	if (s->squelch && s->mute_samples > 0) {
538
		mute.end = (s->mute_samples < samples) ? s->mute_samples : samples;
539
		mute.end = (s->mute_samples < samples) ? s->mute_samples : samples;
[+20] [20] 7 lines
[+20] static int tone_detect(struct ast_dsp *dsp, tone_detect_state_t *s, int16_t *amp, int samples)
546
			limit = s->samples_pending;
547
			limit = s->samples_pending;
547
		}
548
		}
548
		end = start + limit;
549
		end = start + limit;
549

    
   
550

   
550
		for (i = limit, ptr = amp ; i > 0; i--, ptr++) {
551
		for (i = limit, ptr = amp ; i > 0; i--, ptr++) {

    
   
552
			samp = *ptr;
551
			/* signed 32 bit int should be enough to suqare any possible signed 16 bit value */
553
			/* signed 32 bit int should be enough to suqare any possible signed 16 bit value */
552
			s->energy += (int32_t) *ptr * (int32_t) *ptr;
554
			s->energy += (int32_t) samp * (int32_t) samp;
553

    
   
555

   
554
			goertzel_sample(&s->tone, *ptr);
556
			goertzel_sample(&s->tone, samp);
555
		}
557
		}
556

    
   
558

   
557
		s->samples_pending -= limit;
559
		s->samples_pending -= limit;
558

    
   
560

   
559
		if (s->samples_pending) {
561
		if (s->samples_pending) {
[+20] [20] 82 lines
[+20] [+] static void store_digit(digit_detect_state_t *s, char digit)
642

    
   
644

   
643
static int dtmf_detect(struct ast_dsp *dsp, digit_detect_state_t *s, int16_t amp[], int samples, int squelch, int relax)
645
static int dtmf_detect(struct ast_dsp *dsp, digit_detect_state_t *s, int16_t amp[], int samples, int squelch, int relax)
644
{
646
{
645
	float row_energy[4];
647
	float row_energy[4];
646
	float col_energy[4];
648
	float col_energy[4];
647
	float famp;

   
648
	int i;
649
	int i;
649
	int j;
650
	int j;
650
	int sample;
651
	int sample;

    
   
652
	short samp;
651
	int best_row;
653
	int best_row;
652
	int best_col;
654
	int best_col;
653
	int hit;
655
	int hit;
654
	int limit;
656
	int limit;
655
	fragment_t mute = {0, 0};
657
	fragment_t mute = {0, 0};
[+20] [20] 12 lines
[+20] static int dtmf_detect(struct ast_dsp *dsp, digit_detect_state_t *s, int16_t amp[], int samples, int squelch, int relax)
668
			limit = samples;
670
			limit = samples;
669
		}
671
		}
670
		/* The following unrolled loop takes only 35% (rough estimate) of the
672
		/* The following unrolled loop takes only 35% (rough estimate) of the
671
		   time of a rolled loop on the machine on which it was developed */
673
		   time of a rolled loop on the machine on which it was developed */
672
		for (j = sample; j < limit; j++) {
674
		for (j = sample; j < limit; j++) {
673
			famp = amp[j];
675
			samp = amp[j];
674
			s->td.dtmf.energy += famp*famp;
676
			s->td.dtmf.energy += (int32_t) samp * (int32_t) samp;
675
			/* With GCC 2.95, the following unrolled code seems to take about 35%
677
			/* With GCC 2.95, the following unrolled code seems to take about 35%
676
			   (rough estimate) as long as a neat little 0-3 loop */
678
			   (rough estimate) as long as a neat little 0-3 loop */
677
			goertzel_sample(s->td.dtmf.row_out, amp[j]);
679
			goertzel_sample(s->td.dtmf.row_out, samp);
678
			goertzel_sample(s->td.dtmf.col_out, amp[j]);
680
			goertzel_sample(s->td.dtmf.col_out, samp);
679
			goertzel_sample(s->td.dtmf.row_out + 1, amp[j]);
681
			goertzel_sample(s->td.dtmf.row_out + 1, samp);
680
			goertzel_sample(s->td.dtmf.col_out + 1, amp[j]);
682
			goertzel_sample(s->td.dtmf.col_out + 1, samp);
681
			goertzel_sample(s->td.dtmf.row_out + 2, amp[j]);
683
			goertzel_sample(s->td.dtmf.row_out + 2, samp);
682
			goertzel_sample(s->td.dtmf.col_out + 2, amp[j]);
684
			goertzel_sample(s->td.dtmf.col_out + 2, samp);
683
			goertzel_sample(s->td.dtmf.row_out + 3, amp[j]);
685
			goertzel_sample(s->td.dtmf.row_out + 3, samp);
684
			goertzel_sample(s->td.dtmf.col_out + 3, amp[j]);
686
			goertzel_sample(s->td.dtmf.col_out + 3, samp);
685
		}
687
		}
686
		s->td.dtmf.current_sample += (limit - sample);
688
		s->td.dtmf.current_sample += (limit - sample);
687
		if (s->td.dtmf.current_sample < DTMF_GSIZE) {
689
		if (s->td.dtmf.current_sample < DTMF_GSIZE) {
688
			continue;
690
			continue;
689
		}
691
		}
[+20] [20] 107 lines
[+20] [+] static int mf_detect(struct ast_dsp *dsp, digit_detect_state_t *s, int16_t amp[],
797
	int best;
799
	int best;
798
	int second_best;
800
	int second_best;
799
	int i;
801
	int i;
800
	int j;
802
	int j;
801
	int sample;
803
	int sample;

    
   
804
	short samp;
802
	int hit;
805
	int hit;
803
	int limit;
806
	int limit;
804
	fragment_t mute = {0, 0};
807
	fragment_t mute = {0, 0};
805

    
   
808

   
806
	if (squelch && s->td.mf.mute_samples > 0) {
809
	if (squelch && s->td.mf.mute_samples > 0) {
[+20] [20] 13 lines
[+20] static int mf_detect(struct ast_dsp *dsp, digit_detect_state_t *s, int16_t amp[],
820
		/* The following unrolled loop takes only 35% (rough estimate) of the
823
		/* The following unrolled loop takes only 35% (rough estimate) of the
821
		   time of a rolled loop on the machine on which it was developed */
824
		   time of a rolled loop on the machine on which it was developed */
822
		for (j = sample;  j < limit;  j++) {
825
		for (j = sample;  j < limit;  j++) {
823
			/* With GCC 2.95, the following unrolled code seems to take about 35%
826
			/* With GCC 2.95, the following unrolled code seems to take about 35%
824
			   (rough estimate) as long as a neat little 0-3 loop */
827
			   (rough estimate) as long as a neat little 0-3 loop */
825
			goertzel_sample(s->td.mf.tone_out, amp[j]);
828
			samp = amp[j];
826
			goertzel_sample(s->td.mf.tone_out + 1, amp[j]);
829
			goertzel_sample(s->td.mf.tone_out, samp);
827
			goertzel_sample(s->td.mf.tone_out + 2, amp[j]);
830
			goertzel_sample(s->td.mf.tone_out + 1, samp);
828
			goertzel_sample(s->td.mf.tone_out + 3, amp[j]);
831
			goertzel_sample(s->td.mf.tone_out + 2, samp);
829
			goertzel_sample(s->td.mf.tone_out + 4, amp[j]);
832
			goertzel_sample(s->td.mf.tone_out + 3, samp);
830
			goertzel_sample(s->td.mf.tone_out + 5, amp[j]);
833
			goertzel_sample(s->td.mf.tone_out + 4, samp);

    
   
834
			goertzel_sample(s->td.mf.tone_out + 5, samp);
831
		}
835
		}
832
		s->td.mf.current_sample += (limit - sample);
836
		s->td.mf.current_sample += (limit - sample);
833
		if (s->td.mf.current_sample < MF_GSIZE) {
837
		if (s->td.mf.current_sample < MF_GSIZE) {
834
			continue;
838
			continue;
835
		}
839
		}
[+20] [20] 979 lines
  1. trunk/main/dsp.c: Loading...

https://reviewboard.asterisk.org/ runs on a server provided by Digium, Inc. and uses bandwidth donated to the open source Asterisk community by API Digital Communications in Huntsville, AL USA.
Please report problems with this site to asteriskteam@digium.com.