Revert "added optimization for 64-bit shift and clip to 16-bits, nets 5-6% speed increase"

This reverts commit ca216de68b.
Jeff Epler 2020-06-17 12:07:05 -05:00
parent 270f830f1a
commit db72dc4a7d
2 changed files with 12 additions and 54 deletions


@@ -341,35 +341,6 @@ static __inline Word64 MADD64(Word64 sum64, int x, int y)
 	return u.w64;
 }
 
-/* Ken's trick: clips to [-32768, 32767] */
-//sign = x >> 31;
-//if (sign != (x >> 15))
-//	x = sign ^ ((1 << 15) - 1);
-__attribute__((__always_inline__)) static __inline short SAR64_Clip(Word64 x, int n)
-{
-	unsigned int xLo = (unsigned int) x;
-	int xHi = (int) (x >> 32);
-	int nComp = 32-n;
-	int tmp;
-	// Shortcut: n is always < 32.
-	__asm__ __volatile__( "lsl %2, %0, %3\n\t"	// tmp <- xHi<<(32-n)
-	                      "lsr %1, %1, %4\n\t"	// xLo <- xLo>>n
-	                      "orr %1, %2\n\t"		// xLo <= xLo || tmp
-	// Uncomment this part if you really need it to saturate the output to 16-bits.
-	// This didn't appear necessary because the temp data is shifted right by 26 bits
-	// and doesn't grow large enough to overflow a 16-bit signed value.
-	//                    "asr %2, %1, #31\n\t"	// get sign in tmp
-	//                    "asr %0, %1, #15\n\t"	// use xHi as tmp2
-	//                    "mov %3, #-1\n\t"		// prep constant 0x7fff
-	//                    "lsr %3, #17\n\t"
-	//                    "cmp %2, %0\n\t"		// if (sign != (x >> 15))
-	//                    "it ne\n\t"
-	//                    "eorne %1, %3\n\t"
-	                      : "+&r" (xHi), "+r" (xLo), "=&r" (tmp)
-	                      : "r" (nComp), "r" (n) );
-	return( (short)xLo );
-}
-
 __attribute__((__always_inline__)) static __inline Word64 SAR64(Word64 x, int n)
 {
 	unsigned int xLo = (unsigned int) x;
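
For context, the reverted SAR64_Clip is an ARM-assembly form of a 64-bit arithmetic right shift truncated to 16 bits; the saturation from "Ken's trick" exists only as commented-out instructions. A portable C sketch of what it computes might look like the following. The helper name SAR64_Clip_ref and the ENABLE_SATURATE guard are illustrative only; Word64, the n < 32 assumption, and the clip range come from the code above.

/* Portable sketch (not decoder code) of what the reverted SAR64_Clip computes:
 * a 64-bit arithmetic shift right by n (assumed n < 32, as noted above),
 * truncated to 16 bits, with the optional Ken's-trick clamp behind a
 * hypothetical ENABLE_SATURATE switch. */
static inline short SAR64_Clip_ref(Word64 x, int n)
{
	int y = (int)(x >> n);      /* same result as (xHi << (32-n)) | (xLo >> n) for 0 < n < 32 */
#ifdef ENABLE_SATURATE
	int sign = y >> 31;         /* Ken's trick: clip to [-32768, 32767] */
	if (sign != (y >> 15))
		y = sign ^ ((1 << 15) - 1);
#endif
	return (short)y;
}
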


@@ -142,8 +142,7 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
 	MC0M(6)
 	MC0M(7)
 
-//	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
+	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
 
 	/* special case, output sample 16 */
 	coef = coefBase + 256;
@@ -159,8 +158,7 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
 	MC1M(6)
 	MC1M(7)
 
-//	*(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 16) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
+	*(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
 
 	/* main convolution loop: sum1L = samples 1, 2, 3, ... 15   sum2L = samples 31, 30, ... 17 */
 	coef = coefBase + 16;
@@ -181,10 +179,8 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
 		MC2M(7)
 		vb1 += 64;
 
-//		*(pcm)       = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-//		*(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm)       = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
-		*(pcm + 2*i) = SAR64_Clip(sum2L, (32+DEF_NFRACBITS-CSHIFT));
+		*(pcm)       = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
 		pcm++;
 	}
 }
@@ -259,10 +255,8 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
 	MC0S(6)
 	MC0S(7)
 
-//	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-//	*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
-	*(pcm + 1) = SAR64_Clip(sum1R, (32+DEF_NFRACBITS-CSHIFT));
+	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+	*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
 
 	/* special case, output sample 16 */
 	coef = coefBase + 256;
@@ -278,10 +272,8 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
 	MC1S(6)
 	MC1S(7)
 
-//	*(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-//	*(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 2*16 + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
-	*(pcm + 2*16 + 1) = SAR64_Clip(sum1R, (32+DEF_NFRACBITS-CSHIFT));
+	*(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+	*(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
 
 	/* special case, output sample 16 */
 	coef = coefBase + 256;
@@ -303,15 +295,10 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
 		MC2S(7)
 		vb1 += 64;
 
-//		*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-//		*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
-		*(pcm + 1) = SAR64_Clip(sum1R, (32+DEF_NFRACBITS-CSHIFT));
-
-//		*(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
-//		*(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm + 2*2*i + 0) = SAR64_Clip(sum2L, (32+DEF_NFRACBITS-CSHIFT));
-		*(pcm + 2*2*i + 1) = SAR64_Clip(sum2R, (32+DEF_NFRACBITS-CSHIFT));
+		*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
 		pcm += 2;
 	}
 }
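
The two output paths being swapped apply the same total right shift and differ only in where it happens and whether the result is saturated: ClipToShort((int)SAR64(sum, 32-CSHIFT), DEF_NFRACBITS) shifts the 64-bit accumulator right by 32-CSHIFT and then by DEF_NFRACBITS more before clamping to [-32768, 32767], while the reverted SAR64_Clip(sum, 32+DEF_NFRACBITS-CSHIFT) applies the whole 32+DEF_NFRACBITS-CSHIFT shift in one step with no clamp (per the comment in the reverted code, the accumulator was not expected to exceed a 16-bit signed value). A small self-contained check of that shift identity follows; the CSHIFT and DEF_NFRACBITS values and the helper names are placeholders, and ClipToShort (not shown in this diff) is assumed to shift by its fracBits argument and then apply the Ken's-trick clamp.

#include <stdio.h>

typedef long long Word64;        /* stands in for the decoder's Word64 */

#define CSHIFT        12         /* placeholder value for illustration only */
#define DEF_NFRACBITS 14         /* placeholder value for illustration only */

/* Restored path: SAR64 then ClipToShort (shift in two steps, then clamp). */
static short shift_then_clip(Word64 sum)
{
	int x = (int)(sum >> (32 - CSHIFT));   /* SAR64(sum, 32-CSHIFT) */
	x >>= DEF_NFRACBITS;                   /* ClipToShort's shift ... */
	int sign = x >> 31;                    /* ... and its clamp to [-32768, 32767] */
	if (sign != (x >> 15))
		x = sign ^ ((1 << 15) - 1);
	return (short)x;
}

/* Reverted path: one combined shift, no clamp. */
static short shift_once(Word64 sum)
{
	return (short)(sum >> (32 + DEF_NFRACBITS - CSHIFT));
}

int main(void)
{
	Word64 sum = -1234567890123LL;         /* arbitrary in-range accumulator value */
	printf("%d %d\n", shift_then_clip(sum), shift_once(sum));   /* same sample twice for in-range input */
	return 0;
}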