shithub: mp3dec

--- a/minimp3.h

+++ b/minimp3.h

@@ -26,12 +26,16 @@

 #endif

 void mp3dec_init(mp3dec_t *dec);

+#ifndef MINIMP3_FLOAT_OUTPUT /* default build: the decoder writes 16-bit integer PCM */

 int mp3dec_decode_frame(mp3dec_t *dec, const unsigned char *mp3, int mp3_bytes, short *pcm, mp3dec_frame_info_t *info);

+#else /* MINIMP3_FLOAT_OUTPUT defined: same entry point, but pcm receives float samples */

+int mp3dec_decode_frame(mp3dec_t *dec, const unsigned char *mp3, int mp3_bytes, float *pcm, mp3dec_frame_info_t *info);

+void mp3dec_f32_to_s16(const float *in, short *out, int num_samples); /* float -> 16-bit conversion helper; only declared in float-output builds */

+#endif /* MINIMP3_FLOAT_OUTPUT */

 #ifdef __cplusplus

 #endif

-#endif /*MINIMP3_H*/

 #ifdef MINIMP3_IMPLEMENTATION

@@ -1365,19 +1369,28 @@

 #endif

+#ifndef MINIMP3_FLOAT_OUTPUT /* integer output: samples are rounded and saturated to short */

+typedef short mp3d_sample_t; /* element type of the pcm buffers used by the synthesis functions */

 static short mp3d_scale_pcm(float sample)

-    if (sample >  32767.0) return (short) 32767;

-    if (sample < -32768.0) return (short)-32768;

-    int s = (int)(sample + .5f);

+    if (sample >=  32766.5) return (short) 32767; /* saturate high before converting, so the cast below stays in range */

+    if (sample <= -32767.5) return (short)-32768; /* saturate low before converting */

+    short s = (short)(sample + .5f); /* conversion truncates toward zero; NOTE(review): for sample in (-1.5, -0.5] this gives 0 and the sign test below leaves it at 0 -- confirm intended */

     s -= (s < 0);   /* away from zero, to be compliant */

-    if (s >  32767) return (short) 32767;

-    if (s < -32768) return (short)-32768;

-    return (short)s;

+    return s; /* no second clamp: the range checks above already bound the result */

+#else /* MINIMP3_FLOAT_OUTPUT: samples stay float and are only rescaled */

+typedef float mp3d_sample_t; /* element type of the pcm buffers used by the synthesis functions */

-static void mp3d_synth_pair(short *pcm, int nch, const float *z)

+static float mp3d_scale_pcm(float sample)

+    return sample / 32768.0f; /* divide by 32768; no rounding or clamping in this variant */

+}

+#endif /* MINIMP3_FLOAT_OUTPUT */

+static void mp3d_synth_pair(mp3d_sample_t *pcm, int nch, const float *z)

+{

     float a;

     a  = (z[14*64] - z[    0]) * 29;

     a += (z[ 1*64] + z[13*64]) * 213;

@@ -1401,11 +1414,11 @@

     pcm[16*nch] = mp3d_scale_pcm(a);

-static void mp3d_synth(float *xl, short *dstl, int nch, float *lins)

+static void mp3d_synth(float *xl, mp3d_sample_t *dstl, int nch, float *lins)

     int i;

     float *xr = xl + 576*(nch - 1);

-    short *dstr = dstl + (nch - 1);

+    mp3d_sample_t *dstr = dstl + (nch - 1);

     static const float g_win[] = {

         -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992,

@@ -1462,6 +1475,7 @@

         V0(0) V2(1) V1(2) V2(3) V1(4) V2(5) V1(6) V2(7)

+#ifndef MINIMP3_FLOAT_OUTPUT

 #if HAVE_SSE

             static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f };

             static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f };

@@ -1490,6 +1504,32 @@

             vst1_lane_s16(dstl + (47 - i)*nch, pcma, 2);

             vst1_lane_s16(dstl + (49 + i)*nch, pcmb, 2);

 #endif

+#else

+            static const f4 g_scale = { 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f };

+            a = VMUL(a, g_scale);

+            b = VMUL(b, g_scale);

+#if HAVE_SSE

+            _mm_store_ss(dstr + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));

+            _mm_store_ss(dstr + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));

+            _mm_store_ss(dstl + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)));

+            _mm_store_ss(dstl + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 0, 0, 0)));

+            _mm_store_ss(dstr + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)));

+            _mm_store_ss(dstr + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 3, 3)));

+            _mm_store_ss(dstl + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));

+            _mm_store_ss(dstl + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 2, 2)));

+#else

+            vst1q_lane_f32(dstr + (15 - i)*nch, a, 1);

+            vst1q_lane_f32(dstr + (17 + i)*nch, b, 1);

+            vst1q_lane_f32(dstl + (15 - i)*nch, a, 0);

+            vst1q_lane_f32(dstl + (17 + i)*nch, b, 0);

+            vst1q_lane_f32(dstr + (47 - i)*nch, a, 3);

+            vst1q_lane_f32(dstr + (49 + i)*nch, b, 3);

+            vst1q_lane_f32(dstl + (47 - i)*nch, a, 2);

+            vst1q_lane_f32(dstl + (49 + i)*nch, b, 2);

+#endif

+#endif /* MINIMP3_FLOAT_OUTPUT */

     } else

 #endif

@@ -1527,7 +1567,7 @@

 #endif

-static void mp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int nch, short *pcm, float *lins)

+static void mp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int nch, mp3d_sample_t *pcm, float *lins)

     int i;

     for (i = 0; i < nch; i++)

@@ -1611,7 +1651,7 @@

     dec->header[0] = 0;

-int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, short *pcm, mp3dec_frame_info_t *info)

+int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info)

     int i = 0, igr, frame_size = 0, success = 1;

     const uint8_t *hdr;

@@ -1704,4 +1744,68 @@

     return success*hdr_frame_samples(dec->header);

+#ifdef MINIMP3_FLOAT_OUTPUT

+/*
+ * Convert float samples to saturated 16-bit PCM.
+ *
+ * in          - input samples; each one is multiplied by 32768 before conversion
+ * out         - receives num_samples 16-bit values, clamped to [-32768, 32767]
+ * num_samples - number of samples to convert; nothing is written when <= 0
+ *
+ * With HAVE_SIMD, whole groups of 8 samples are converted by the vector loop;
+ * the remaining samples (or all of them without SIMD) use the scalar loop.
+ */
+void mp3dec_f32_to_s16(const float *in, short *out, int num_samples)
+{
+    int i = 0;
+    if (num_samples <= 0)
+    {
+        return;
+    }
+#if HAVE_SIMD
+    int aligned_count = num_samples & ~7;   /* largest multiple of 8 not above num_samples */
+    for (; i < aligned_count; i += 8)
+    {
+        static const f4 g_scale = { 32768.0f, 32768.0f, 32768.0f, 32768.0f };
+        f4 a = VMUL(VLD(&in[i  ]), g_scale);
+        f4 b = VMUL(VLD(&in[i+4]), g_scale);
+#if HAVE_SSE
+        static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
+        static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
+        /* clamp in float, convert to 32-bit ints, then pack both halves into eight 16-bit lanes */
+        __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)),
+                                       _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min)));
+        out[i  ] = _mm_extract_epi16(pcm8, 0);
+        out[i+1] = _mm_extract_epi16(pcm8, 1);
+        out[i+2] = _mm_extract_epi16(pcm8, 2);
+        out[i+3] = _mm_extract_epi16(pcm8, 3);
+        out[i+4] = _mm_extract_epi16(pcm8, 4);
+        out[i+5] = _mm_extract_epi16(pcm8, 5);
+        out[i+6] = _mm_extract_epi16(pcm8, 6);
+        out[i+7] = _mm_extract_epi16(pcm8, 7);
+#else
+        int16x4_t pcma, pcmb;
+        a = VADD(a, VSET(0.5f));
+        b = VADD(b, VSET(0.5f));
+        /* the all-ones compare mask acts as -1, so negative lanes are decremented after conversion */
+        pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0)))));
+        pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0)))));
+        vst1_lane_s16(out+i  , pcma, 0);
+        vst1_lane_s16(out+i+1, pcma, 1);
+        vst1_lane_s16(out+i+2, pcma, 2);
+        vst1_lane_s16(out+i+3, pcma, 3);
+        vst1_lane_s16(out+i+4, pcmb, 0);
+        vst1_lane_s16(out+i+5, pcmb, 1);
+        vst1_lane_s16(out+i+6, pcmb, 2);
+        vst1_lane_s16(out+i+7, pcmb, 3);
+#endif
+    }
+#endif
+    for (; i < num_samples; i++)
+    {
+        float sample = in[i] * 32768.0f;
+        if (sample >=  32766.5)
+        {
+            out[i] = (short) 32767;
+        } else if (sample <= -32767.5)
+        {
+            out[i] = (short)-32768;
+        } else
+        {
+            /*
+             * Round half away from zero. The sign has to come from the float
+             * value: testing the sign only after truncating sample + .5f would
+             * turn every value in (-1.5, -0.5] into 0 instead of -1.
+             * The range checks above keep the conversion inside short's range.
+             */
+            out[i] = (short)(sample < 0 ? sample - .5f : sample + .5f);
+        }
+    }
+}

+#endif

 #endif /*MINIMP3_IMPLEMENTATION*/

+#endif /*MINIMP3_H*/