mp3d_synth源码不大,没多少行,但都是大量的浮点运算。
尤其是循环体中的那一行宏调用(S0(0) S2(1) S1(2) … S2(7)),展开后会在每次循环中产生大量浮点计算。
另外,此函数对单声道和双声道没有区分,对于单声道来说,多算了一倍的计算量。
这一点可以简单地将 for (j = 0; j < 4; j++) 改为 for (j = 0; j < 4; j+=2) 来减少一半计算:下标 0、2 的数据来自 xl,下标 1、3 的数据来自 xr,单声道时 xr 与 xl 指向同一处,因此下标 1、3 是重复计算。
原始程序
- static void mp3d_synth(float *xl, mp3d_sample_t *dstl, int nch, float *lins)
- { /*
-  * Builds output samples from the float inputs at xl (and xr): each result
-  * is a g_win-weighted sum of lins entries, passed through mp3d_scale_pcm()
-  * and stored through dstl / dstr.
-  *
-  * xl:   input floats, read at offsets 18*n and 18*n + 1 for n = 0..31.
-  * dstl: output samples, written with a stride of nch.
-  * nch:  presumably the channel count (1 or 2 - confirm against callers).
-  *       With nch == 1, xr and dstr alias xl and dstl; with nch == 2, xr
-  *       starts 576 floats after xl and dstr one sample after dstl.
-  * lins: float work buffer that is both read and updated here.
-  */
- int i;
- float *xr = xl + 576*(nch - 1);
- mp3d_sample_t *dstr = dstl + (nch - 1);
- /* 15 rows of 16 weights; every pass of the loop below consumes one row, in order. */
- static const float g_win[] = {
- -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992,
- -1,24,-35,202,222,347,-581,2080,1952,4425,-5879,7640,5288,33791,-41176,74856,
- -1,21,-38,196,225,294,-645,2087,1893,4063,-6237,8092,4561,31947,-43006,74630,
- -1,19,-41,190,227,244,-711,2085,1822,3705,-6589,8492,3776,30112,-44821,74313,
- -1,17,-45,183,228,197,-779,2075,1739,3351,-6935,8840,2935,28289,-46617,73908,
- -1,16,-49,176,228,153,-848,2057,1644,3004,-7271,9139,2037,26482,-48390,73415,
- -2,14,-53,169,227,111,-919,2032,1535,2663,-7597,9389,1082,24694,-50137,72835,
- -2,13,-58,161,224,72,-991,2001,1414,2330,-7910,9592,70,22929,-51853,72169,
- -2,11,-63,154,221,36,-1064,1962,1280,2006,-8209,9750,-998,21189,-53534,71420,
- -2,10,-68,147,215,2,-1137,1919,1131,1692,-8491,9863,-2122,19478,-55178,70590,
- -3,9,-73,139,208,-29,-1210,1870,970,1388,-8755,9935,-3300,17799,-56778,69679,
- -3,8,-79,132,200,-57,-1283,1817,794,1095,-8998,9966,-4533,16155,-58333,68692,
- -4,7,-85,125,189,-83,-1356,1759,605,814,-9219,9959,-5818,14548,-59838,67629,
- -4,7,-91,117,177,-106,-1428,1698,402,545,-9416,9916,-7154,12980,-61289,66494,
- -5,6,-97,111,163,-127,-1498,1634,185,288,-9585,9838,-8540,11455,-62684,65290
- };
- float *zlin = lins + 15*64; /* base for all indexing below; negative offsets reach back into lins */
- const float *w = g_win;
- /* Seed slots 15 and 31 of zlin. A slot is 4 floats laid out as xl, xr, xl, xr. */
- zlin[4*15] = xl[18*16];
- zlin[4*15 + 1] = xr[18*16];
- zlin[4*15 + 2] = xl[0];
- zlin[4*15 + 3] = xr[0];
-
- zlin[4*31] = xl[1 + 18*16];
- zlin[4*31 + 1] = xr[1 + 18*16];
- zlin[4*31 + 2] = xl[1];
- zlin[4*31 + 3] = xr[1];
- /* mp3d_synth_pair is defined elsewhere. Presumably it fills the dst entries
-  * the loop below never writes (indices 0, 16, 32, 48, times nch) - TODO confirm.
-  */
- mp3d_synth_pair(dstr, nch, lins + 4*15 + 1);
- mp3d_synth_pair(dstr + 32*nch, nch, lins + 4*15 + 64 + 1);
- mp3d_synth_pair(dstl, nch, lins + 4*15);
- mp3d_synth_pair(dstl + 32*nch, nch, lins + 4*15 + 64);
-
- for (i = 14; i >= 0; i--)
- { /*
-  * LOAD(k): fetch the next two weights (w0, w1) and point vz / vy at the
-  *          4-float slots zlin[4*i - k*64] and zlin[4*i - (15 - k)*64].
-  * S0(k):   start the sums:  b = vz*w1 + vy*w0,  a = vz*w0 - vy*w1.
-  * S1(k):   add those same two terms to b and a.
-  * S2(k):   add the same term to b, but the negated term (vy*w1 - vz*w0) to a.
-  * Every S-macro expands to its own brace scope, so w0, w1, vz, vy and j
-  * are fresh locals on each use.
-  */
- #define LOAD(k) float w0 = *w++; float w1 = *w++; float *vz = &zlin[4*i - k*64]; float *vy = &zlin[4*i - (15 - k)*64];
- #define S0(k) { uint16_t j; LOAD(k); for (j = 0; j < 4; j++) b[j] = vz[j]*w1 + vy[j]*w0, a[j] = vz[j]*w0 - vy[j]*w1; }
- #define S1(k) { uint16_t j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vz[j]*w0 - vy[j]*w1; }
- #define S2(k) { uint16_t j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vy[j]*w1 - vz[j]*w0; }
- /* Per-lane sums: lanes 0 and 2 carry xl data, lanes 1 and 3 carry xr data. */
- float a[4], b[4];
- /* Refresh slot i (all four lanes), lanes 0-1 of slot i + 16 and lanes 2-3 of slot i - 16. */
- zlin[4*i] = xl[18*(31 - i)];
- zlin[4*i + 1] = xr[18*(31 - i)];
- zlin[4*i + 2] = xl[1 + 18*(31 - i)];
- zlin[4*i + 3] = xr[1 + 18*(31 - i)];
- zlin[4*(i + 16)] = xl[1 + 18*(1 + i)];
- zlin[4*(i + 16) + 1] = xr[1 + 18*(1 + i)];
- zlin[4*(i - 16) + 2] = xl[18*(1 + i)];
- zlin[4*(i - 16) + 3] = xr[18*(1 + i)];
- /* Eight weight pairs per pass. S0 must come first: it assigns a[] / b[], the rest accumulate. */
- S0(0) S2(1) S1(2) S2(3) S1(4) S2(5) S1(6) S2(7)
- dstr[(15 - i)*nch] = mp3d_scale_pcm(a[1]); /* lanes 1, 3 go to dstr; lanes 0, 2 go to dstl */
- dstr[(17 + i)*nch] = mp3d_scale_pcm(b[1]);
- dstl[(15 - i)*nch] = mp3d_scale_pcm(a[0]); /* with nch == 1 this lands on the element the dstr store above just wrote */
- dstl[(17 + i)*nch] = mp3d_scale_pcm(b[0]);
- dstr[(47 - i)*nch] = mp3d_scale_pcm(a[3]);
- dstr[(49 + i)*nch] = mp3d_scale_pcm(b[3]);
- dstl[(47 - i)*nch] = mp3d_scale_pcm(a[2]);
- dstl[(49 + i)*nch] = mp3d_scale_pcm(b[2]);
- }
- }
复制代码
经循环展开,并增加中间变量保存临时计算结果,避免了重复计算。注意:改进后的版本只处理单声道,与 xr、dstr 相关的语句全部被注释掉,参数 nch 在函数体内不再使用。
改进后的源码如下
- static void mp3d_synth(float *xl, int16_t *dstl, int nch, float *lins)
- { /*
-  * Single-channel variant: only lanes 0 and 2 of each 4-float zlin slot are
-  * written and summed, and results are stored to dstl with a stride of 1
-  * after passing through mp3d_scale_pcm().
-  *
-  * xl:   input floats, read at offsets 18*n and 18*n + 1 for n = 0..31.
-  * dstl: int16_t output samples.
-  * nch:  not read anywhere in this body; mp3d_synth_pair is always called
-  *       with a literal 1.
-  *       NOTE(review): a caller passing nch == 2 would get no xr / dstr
-  *       output and a non-interleaved dstl - confirm every caller is mono.
-  * lins: float work buffer that is both read and updated here.
-  *
-  * The commented-out statements are the disabled xr / dstr counterparts.
-  */
- int i;
- // float *xr = xl;
- // int16_t *dstr = dstl;
- /* 15 rows of 16 weights; every pass of the loop below consumes one row, in order. */
- static const float g_win[] = {
- -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992,
- -1,24,-35,202,222,347,-581,2080,1952,4425,-5879,7640,5288,33791,-41176,74856,
- -1,21,-38,196,225,294,-645,2087,1893,4063,-6237,8092,4561,31947,-43006,74630,
- -1,19,-41,190,227,244,-711,2085,1822,3705,-6589,8492,3776,30112,-44821,74313,
- -1,17,-45,183,228,197,-779,2075,1739,3351,-6935,8840,2935,28289,-46617,73908,
- -1,16,-49,176,228,153,-848,2057,1644,3004,-7271,9139,2037,26482,-48390,73415,
- -2,14,-53,169,227,111,-919,2032,1535,2663,-7597,9389,1082,24694,-50137,72835,
- -2,13,-58,161,224,72,-991,2001,1414,2330,-7910,9592,70,22929,-51853,72169,
- -2,11,-63,154,221,36,-1064,1962,1280,2006,-8209,9750,-998,21189,-53534,71420,
- -2,10,-68,147,215,2,-1137,1919,1131,1692,-8491,9863,-2122,19478,-55178,70590,
- -3,9,-73,139,208,-29,-1210,1870,970,1388,-8755,9935,-3300,17799,-56778,69679,
- -3,8,-79,132,200,-57,-1283,1817,794,1095,-8998,9966,-4533,16155,-58333,68692,
- -4,7,-85,125,189,-83,-1356,1759,605,814,-9219,9959,-5818,14548,-59838,67629,
- -4,7,-91,117,177,-106,-1428,1698,402,545,-9416,9916,-7154,12980,-61289,66494,
- -5,6,-97,111,163,-127,-1498,1634,185,288,-9585,9838,-8540,11455,-62684,65290
- };
-
- float *zlin = lins + 15*64; /* base for all indexing below; negative offsets reach back into lins */
- const float *w = g_win;
- /* Seed lanes 0 and 2 of slots 15 and 31 (a slot is 4 floats); lanes 1 and 3 are left untouched. */
- zlin[4*15] = xl[18*16];
- // zlin[4*15 + 1] = xr[18*16];
- zlin[4*15 + 2] = xl[0];
- // zlin[4*15 + 3] = xr[0];
-
- zlin[4*31] = xl[1 + 18*16];
- // zlin[4*31 + 1] = xr[1 + 18*16];
- zlin[4*31 + 2] = xl[1];
- // zlin[4*31 + 3] = xr[1];
- /* mp3d_synth_pair is defined elsewhere. Presumably it fills the dstl entries
-  * the loop below never writes (indices 0, 16, 32, 48) - TODO confirm.
-  */
- // mp3d_synth_pair(dstr, 1, lins + 4*15 + 1);
- // mp3d_synth_pair(dstr + 32, 1, lins + 4*15 + 64 + 1);
- mp3d_synth_pair(dstl, 1, lins + 4*15);
- mp3d_synth_pair(dstl + 32, 1, lins + 4*15 + 64);
-
- for (i = 14; i >= 0; i--)
- { /*
-  * LOAD(k): fetch the next two weights (w0, w1), point vz / vy at the slots
-  *          zlin[4*i - k*64] and zlin[4*i - (15 - k)*64], and preload lane 0
-  *          of each into vzz / vyy.
-  * S0(k):   assign b0 / a0 from lane 0, reload vzz / vyy from lane 2, then
-  *          assign b2 / a2:  b = vzz*w1 + vyy*w0,  a = vzz*w0 - vyy*w1.
-  * S1(k):   same two terms, accumulated into b0 / a0 and b2 / a2.
-  * S2(k):   same term added to b, but the negated term (vyy*w1 - vzz*w0)
-  *          added to a.
-  * Every S-macro expands to its own brace scope, so the LOAD locals are
-  * fresh on each use.
-  */
- #define LOAD(k) float w0 = *w++; float w1 = *w++; float *vz = &zlin[4*i - k*64]; float *vy = &zlin[4*i - (15 - k)*64]; float vzz = *vz; float vyy = *vy;
- #define S0(k) {LOAD(k); b0 = vzz*w1 + vyy*w0; a0 = vzz*w0 - vyy*w1; vzz = vz[2]; vyy = vy[2]; b2 = vzz*w1 + vyy*w0; a2 = vzz*w0 - vyy*w1;}
- #define S1(k) {LOAD(k); b0 += vzz*w1 + vyy*w0, a0 += vzz*w0 - vyy*w1; vzz = vz[2]; vyy = vy[2]; b2 += vzz*w1 + vyy*w0, a2 += vzz*w0 - vyy*w1;}
- #define S2(k) {LOAD(k); b0 += vzz*w1 + vyy*w0, a0 += vyy*w1 - vzz*w0; vzz = vz[2]; vyy = vy[2]; b2 += vzz*w1 + vyy*w0, a2 += vyy*w1 - vzz*w0;}
- float a0, a2, b0, b2; /* scalar sums for lanes 0 and 2; S0 assigns all four before anything accumulates */
- /* Refresh lanes 0 and 2 of slot i, lane 0 of slot i + 16 and lane 2 of slot i - 16. */
- zlin[4*i] = xl[18*(31 - i)];
- // zlin[4*i + 1] = xr[18*(31 - i)];
- zlin[4*i + 2] = xl[1 + 18*(31 - i)];
- // zlin[4*i + 3] = xr[1 + 18*(31 - i)];
- zlin[4*(i + 16)] = xl[1 + 18*(1 + i)];
- // zlin[4*(i + 16) + 1] = xr[1 + 18*(1 + i)];
- zlin[4*(i - 16) + 2] = xl[18*(1 + i)];
- // zlin[4*(i - 16) + 3] = xr[18*(1 + i)];
- /* Eight weight pairs per pass. S0 must come first: it assigns, the rest accumulate. */
- S0(0) S2(1) S1(2) S2(3) S1(4) S2(5) S1(6) S2(7)
- /* Stride-1 stores: lane-0 sums go to indices 15 - i and 17 + i, lane-2 sums to 47 - i and 49 + i. */
- dstl[15 - i] = mp3d_scale_pcm(a0);
- dstl[17 + i] = mp3d_scale_pcm(b0);
- dstl[47 - i] = mp3d_scale_pcm(a2);
- dstl[49 + i] = mp3d_scale_pcm(b2);
- }
- }
复制代码
|