Revert "added optimization for 64-bit shift and clip to 16-bits, nets 5-6% speed increase"

This reverts commit ca216de68b.
Jeff Epler 2020-06-17 12:07:05 -05:00
parent 270f830f1a
commit db72dc4a7d
2 changed files with 12 additions and 54 deletions


@@ -341,35 +341,6 @@ static __inline Word64 MADD64(Word64 sum64, int x, int y)
 	return u.w64;
 }
 
-/* Ken's trick: clips to [-32768, 32767] */
-//sign = x >> 31;
-//if (sign != (x >> 15))
-//	x = sign ^ ((1 << 15) - 1);
-__attribute__((__always_inline__)) static __inline short SAR64_Clip(Word64 x, int n)
-{
-	unsigned int xLo = (unsigned int) x;
-	int xHi = (int) (x >> 32);
-	int nComp = 32-n;
-	int tmp;
-	// Shortcut: n is always < 32.
-	__asm__ __volatile__( "lsl %2, %0, %3\n\t"	// tmp <- xHi<<(32-n)
-	                      "lsr %1, %1, %4\n\t"	// xLo <- xLo>>n
-	                      "orr %1, %2\n\t"		// xLo <= xLo || tmp
-	// Uncomment this part if you really need it to saturate the output to 16-bits.
-	// This didn't appear necessary because the temp data is shifted right by 26 bits
-	// and doesn't grow large enough to overflow a 16-bit signed value.
-	//                    "asr %2, %1, #31\n\t"	// get sign in tmp
-	//                    "asr %0, %1, #15\n\t"	// use xHi as tmp2
-	//                    "mov %3, #-1\n\t"		// prep constant 0x7fff
-	//                    "lsr %3, #17\n\t"
-	//                    "cmp %2, %0\n\t"		// if (sign != (x >> 15))
-	//                    "it ne\n\t"
-	//                    "eorne %1, %3\n\t"
-	                      : "+&r" (xHi), "+r" (xLo), "=&r" (tmp)
-	                      : "r" (nComp), "r" (n) );
-	return( (short)xLo );
-}
-
 __attribute__((__always_inline__)) static __inline Word64 SAR64(Word64 x, int n)
 {
 	unsigned int xLo = (unsigned int) x;
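
For context, the reverted SAR64_Clip is an ARM-assembly form of a 64-bit arithmetic right shift truncated to 16 bits; the saturation from "Ken's trick" exists only as commented-out instructions. A portable C sketch of what it computes might look like the following. The helper name SAR64_Clip_ref and the ENABLE_SATURATE guard are illustrative only; Word64, the n < 32 assumption, and the clip range come from the code above.

/* Portable sketch (not decoder code) of what the reverted SAR64_Clip computes:
 * a 64-bit arithmetic shift right by n (assumed n < 32, as noted above),
 * truncated to 16 bits, with the optional Ken's-trick clamp behind a
 * hypothetical ENABLE_SATURATE switch. */
static inline short SAR64_Clip_ref(Word64 x, int n)
{
	int y = (int)(x >> n);      /* same result as (xHi << (32-n)) | (xLo >> n) for 0 < n < 32 */
#ifdef ENABLE_SATURATE
	int sign = y >> 31;         /* Ken's trick: clip to [-32768, 32767] */
	if (sign != (y >> 15))
		y = sign ^ ((1 << 15) - 1);
#endif
	return (short)y;
}
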


@@ -142,8 +142,7 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
 	MC0M(6)
 	MC0M(7)
 
-//	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
+	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
 
 	/* special case, output sample 16 */
 	coef = coefBase + 256;
@@ -159,8 +158,7 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
 	MC1M(6)
 	MC1M(7)
 
-//	*(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 16) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
+	*(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
 
 	/* main convolution loop: sum1L = samples 1, 2, 3, ... 15   sum2L = samples 31, 30, ... 17 */
 	coef = coefBase + 16;
@@ -181,10 +179,8 @@ void PolyphaseMono(short *pcm, int *vbuf, const int *coefBase)
 		MC2M(7)
 		vb1 += 64;
 
-//		*(pcm)       = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-//		*(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm)       = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
-		*(pcm + 2*i) = SAR64_Clip(sum2L, (32+DEF_NFRACBITS-CSHIFT));
+		*(pcm)       = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 2*i) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
 		pcm++;
 	}
 }
@@ -259,10 +255,8 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
 	MC0S(6)
 	MC0S(7)
 
-//	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-//	*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
-	*(pcm + 1) = SAR64_Clip(sum1R, (32+DEF_NFRACBITS-CSHIFT));
+	*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+	*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
 
 	/* special case, output sample 16 */
 	coef = coefBase + 256;
@@ -278,10 +272,8 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
 	MC1S(6)
 	MC1S(7)
 
-//	*(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-//	*(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
-	*(pcm + 2*16 + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
-	*(pcm + 2*16 + 1) = SAR64_Clip(sum1R, (32+DEF_NFRACBITS-CSHIFT));
+	*(pcm + 2*16 + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+	*(pcm + 2*16 + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
 
 	/* special case, output sample 16 */
 	coef = coefBase + 256;
@@ -303,15 +295,10 @@ void PolyphaseStereo(short *pcm, int *vbuf, const int *coefBase)
 		MC2S(7)
 		vb1 += 64;
 
-//		*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
-//		*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm + 0) = SAR64_Clip(sum1L, (32+DEF_NFRACBITS-CSHIFT));
-		*(pcm + 1) = SAR64_Clip(sum1R, (32+DEF_NFRACBITS-CSHIFT));
-
-//		*(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
-//		*(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
-		*(pcm + 2*2*i + 0) = SAR64_Clip(sum2L, (32+DEF_NFRACBITS-CSHIFT));
-		*(pcm + 2*2*i + 1) = SAR64_Clip(sum2R, (32+DEF_NFRACBITS-CSHIFT));
+		*(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 2*2*i + 0) = ClipToShort((int)SAR64(sum2L, (32-CSHIFT)), DEF_NFRACBITS);
+		*(pcm + 2*2*i + 1) = ClipToShort((int)SAR64(sum2R, (32-CSHIFT)), DEF_NFRACBITS);
 		pcm += 2;
 	}
 }
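
The two output paths being swapped apply the same total right shift and differ only in where it happens and whether the result is saturated: ClipToShort((int)SAR64(sum, 32-CSHIFT), DEF_NFRACBITS) shifts the 64-bit accumulator right by 32-CSHIFT and then by DEF_NFRACBITS more before clamping to [-32768, 32767], while the reverted SAR64_Clip(sum, 32+DEF_NFRACBITS-CSHIFT) applies the whole 32+DEF_NFRACBITS-CSHIFT shift in one step with no clamp (per the comment in the reverted code, the accumulator was not expected to exceed a 16-bit signed value). A small self-contained check of that shift identity follows; the CSHIFT and DEF_NFRACBITS values and the helper names are placeholders, and ClipToShort (not shown in this diff) is assumed to shift by its fracBits argument and then apply the Ken's-trick clamp.

#include <stdio.h>

typedef long long Word64;        /* stands in for the decoder's Word64 */

#define CSHIFT        12         /* placeholder value for illustration only */
#define DEF_NFRACBITS 14         /* placeholder value for illustration only */

/* Restored path: SAR64 then ClipToShort (shift in two steps, then clamp). */
static short shift_then_clip(Word64 sum)
{
	int x = (int)(sum >> (32 - CSHIFT));   /* SAR64(sum, 32-CSHIFT) */
	x >>= DEF_NFRACBITS;                   /* ClipToShort's shift ... */
	int sign = x >> 31;                    /* ... and its clamp to [-32768, 32767] */
	if (sign != (x >> 15))
		x = sign ^ ((1 << 15) - 1);
	return (short)x;
}

/* Reverted path: one combined shift, no clamp. */
static short shift_once(Word64 sum)
{
	return (short)(sum >> (32 + DEF_NFRACBITS - CSHIFT));
}

int main(void)
{
	Word64 sum = -1234567890123LL;         /* arbitrary in-range accumulator value */
	printf("%d %d\n", shift_then_clip(sum), shift_once(sum));   /* same sample twice for in-range input */
	return 0;
}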