Revert "added optimization for 64-bit shift and clip to 16-bits, nets 5-6% speed increase"
This reverts commit ca216de68b.
This commit is contained in:
parent
270f830f1a
commit
db72dc4a7d
2 changed files with 12 additions and 54 deletions
|
|
@ -341,35 +341,6 @@ static __inline Word64 MADD64(Word64 sum64, int x, int y)
|
|||
return u.w64;
|
||||
}
|
||||
|
||||
/* Ken's trick: clips to [-32768, 32767] */
/* (Reference C form of the saturation step; it matches the disabled
 * asm sequence inside SAR64_Clip below and is NOT executed.) */
|
||||
//sign = x >> 31;
|
||||
//if (sign != (x >> 15))
|
||||
// x = sign ^ ((1 << 15) - 1);
|
||||
/*
 * SAR64_Clip: shift the 64-bit value x right by n bits and return the
 * low 16 bits of the result as a short.
 *
 * x - 64-bit input; it is split into its low and high 32-bit words.
 * n - right-shift count; the implementation relies on n < 32
 *     (see the "Shortcut" comment below).
 *
 * Despite the name, the saturating ("clip") instructions are commented
 * out, so the final (short) cast simply truncates the shifted value.
 *
 * The shift is done in inline assembly using lsl/lsr/orr mnemonics
 * (presumably ARM/Thumb-2 -- TODO confirm the target).
 */
__attribute__((__always_inline__)) static __inline short SAR64_Clip(Word64 x, int n)
|
||||
{
|
||||
unsigned int xLo = (unsigned int) x; // low 32 bits of x
|
||||
int xHi = (int) (x >> 32); // high 32 bits of x
|
||||
int nComp = 32-n; // left-shift count that moves xHi's low n bits to the top of a word
|
||||
int tmp; // scratch register, written by the asm
|
||||
// Shortcut: n is always < 32.
// Asm operand map: %0 = xHi, %1 = xLo, %2 = tmp, %3 = nComp, %4 = n.
|
||||
__asm__ __volatile__( "lsl %2, %0, %3\n\t" // tmp <- xHi<<(32-n)
|
||||
"lsr %1, %1, %4\n\t" // xLo <- xLo>>n
|
||||
"orr %1, %2\n\t" // xLo <- xLo | tmp (bitwise OR merges the two halves)
|
||||
// Uncomment this part if you really need it to saturate the output to 16-bits
|
||||
// This didn't appear necessary because the temp data is shifted right by 26 bits
|
||||
// and doesn't grow large enough to overflow a 16-bit signed value
// NOTE(review): the disabled sequence below writes %3, which is declared
// as an input-only operand, and uses cmp without a "cc" clobber; both
// would presumably need fixing before re-enabling -- verify against the
// GCC extended-asm rules.
|
||||
// "asr %2, %1, #31\n\t" // get sign in tmp
|
||||
// "asr %0, %1, #15\n\t" // use xHi as tmp2
|
||||
// "mov %3, #-1\n\t" // prep constant 0x7fff
|
||||
// "lsr %3, #17\n\t"
|
||||
// "cmp %2, %0\n\t" // if (sign != (x >> 15))
|
||||
// "it ne\n\t"
|
||||
// "eorne %1, %3\n\t"
|
||||
// Outputs: xHi and xLo are read-write, tmp is write-only; '&' marks early-clobber.
: "+&r" (xHi), "+r" (xLo), "=&r" (tmp)
|
||||
// Inputs: the two shift counts.
: "r" (nComp), "r" (n) );
|
||||
return( (short)xLo ); // truncating cast: keeps only the low 16 bits
|
||||
}
|
||||
|
||||
__attribute__((__always_inline__)) static __inline Word64 SAR64(Word64 x, int n)
|
||||
{
|
||||
unsigned int xLo = (unsigned int) x;
|
||||
|
|
|
|||
|
|
@ -142,8 +142,7 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
|
|||
MC0M(6)
|
||||
MC0M(7)
|
||||
|
||||
// *(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
|
||||
/* special case, output sample 16 */
|
||||
coef = coefBase + 256;
|
||||
|
|
@ -159,8 +158,7 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
|
|||
MC1M(6)
|
||||
MC1M(7)
|
||||
|
||||
// *(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 16) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
|
||||
/* main convolution loop: sum1L = samples 1, 2, 3, ... 15 sum2L = samples 31, 30, ... 17 */
|
||||
coef = coefBase + 16;
|
||||
|
|
@ -181,10 +179,8 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
|
|||
MC2M(7)
|
||||
|
||||
vb1 += 64;
|
||||
// *(pcm) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
// *(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm + 2*i) = SAR64_Clip(sum2L, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
pcm++;
|
||||
}
|
||||
}
|
||||
|
|
@ -259,10 +255,8 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
|
|||
MC0S(6)
|
||||
MC0S(7)
|
||||
|
||||
// *(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
// *(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm + 1) = SAR64_Clip(sum1R, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
|
||||
/* special case, output sample 16 */
|
||||
coef = coefBase + 256;
|
||||
|
|
@ -278,10 +272,8 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
|
|||
MC1S(6)
|
||||
MC1S(7)
|
||||
|
||||
// *(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
// *(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 2*16 + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm + 2*16 + 1) = SAR64_Clip(sum1R, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
|
||||
/* main convolution loop: sum1L = samples 1, 2, 3, ... 15 sum2L = samples 31, 30, ... 17 */
|
||||
coef = coefBase + 16;
|
||||
|
|
@ -303,15 +295,10 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
|
|||
MC2S(7)
|
||||
|
||||
vb1 += 64;
|
||||
// *(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
// *(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm + 1) = SAR64_Clip(sum1R, (32+DEF_NFRACBITS-CSHIFT));
|
||||
// *(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
// *(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 2*2*i + 0) = SAR64_Clip(sum2L, (32+DEF_NFRACBITS-CSHIFT));
|
||||
*(pcm + 2*2*i + 1) = SAR64_Clip(sum2R, (32+DEF_NFRACBITS-CSHIFT));
|
||||
|
||||
*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
*(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
|
||||
pcm += 2;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue