1 /*
2 * MMX optimized MP3 decoding functions
3 * Copyright (c) 2010 Vitor Sessak
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
#include <stdint.h>
30 static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
31 void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
32
38
40 float *tmpbuf);
42 float *tmpbuf);
43
45
46 #if HAVE_SSE2_INLINE
47
/* Multiply-accumulate / multiply-subtract of one product into rt. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op (MACS or MLSS) to eight products w[k * 64] * p[k * 64],
 * k = 0..7, accumulating into sum.
 *
 * Wrapped in do { } while (0) so that "SUM8(...);" is a single statement
 * and can be used as the unbraced body of an if/else.
 */
#define SUM8(op, sum, w, p)               \
do {                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
} while (0)
62
/**
 * Compute two negated 8-tap windowed sums per output element using SSE.
 *
 * For every i in [0, len):
 *   sum1[i] = -(win1[i + 64 * k] * buf[i + 64 * k]), summed over k = 0..7
 *   sum2[i] = -(win2[i + 16 * k] * buf[i + 64 * k]), summed over k = 0..7
 *
 * Every pointer is accessed with movaps or as an SSE memory operand, so all
 * five must be 16-byte aligned. Four floats are produced per iteration, so
 * len is expected to be a multiple of 4. Nothing is done when len <= 0.
 *
 * @param buf   input samples
 * @param win1  window combined with buf into sum1 (taps 64 floats apart)
 * @param win2  window combined with buf into sum2 (taps 16 floats apart)
 * @param sum1  output, len floats
 * @param sum2  output, len floats
 * @param len   number of output floats
 */
static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    /* The loop body always runs at least once, so bail out on empty input. */
    if (len <= 0)
        return;

    /*
     * Byte offset that runs from -4*len up to 0. All pointers below are
     * biased by +len so that the loop can terminate on the sign flag of the
     * "add" alone.
     * NOTE(review): FFmpeg's own register-width integer typedef could be
     * used here instead of intptr_t if the file already has it in scope.
     */
    intptr_t offset = -4 * (intptr_t)len;

    const float *win1_end = win1 + len;
    const float *win2_end = win2 + len;
    const float *buf_end  = buf  + len;
    float *sum1_end       = sum1 + len;
    float *sum2_end       = sum2 + len;

/* One tap: a is the byte displacement into win1/buf, b the one into win2. */
#define MULT(a, b)                                 \
    "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
    "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
    "mulps         %%xmm2, %%xmm1           \n\t"  \
    "subps         %%xmm1, %%xmm0           \n\t"  \
    "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
    "subps         %%xmm2, %%xmm4           \n\t"

    __asm__ volatile(
        "1:                                   \n\t"
        "xorps       %%xmm0, %%xmm0           \n\t"
        "xorps       %%xmm4, %%xmm4           \n\t"

        MULT(   0,   0)
        MULT( 256,  64)
        MULT( 512, 128)
        MULT( 768, 192)
        MULT(1024, 256)
        MULT(1280, 320)
        MULT(1536, 384)
        MULT(1792, 448)

        "movaps      %%xmm0, (%4,%0)          \n\t"
        "movaps      %%xmm4, (%5,%0)          \n\t"
        "add            $16, %0               \n\t"
        "jl              1b                   \n\t"
        : "+&r"(offset)
        : "r"(win1_end), "r"(win2_end), "r"(buf_end),
          "r"(sum1_end), "r"(sum2_end)
        /* sum1/sum2 are written through input-only pointers. */
        : "memory"
    );

#undef MULT
}
106
107 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
108 int incr)
109 {
114
115 float sum;
116
117 /* copy to avoid wrap */
118 __asm__ volatile(
119 "movaps 0(%0), %%xmm0 \n\t" \
120 "movaps 16(%0), %%xmm1 \n\t" \
121 "movaps 32(%0), %%xmm2 \n\t" \
122 "movaps 48(%0), %%xmm3 \n\t" \
123 "movaps %%xmm0, 0(%1) \n\t" \
124 "movaps %%xmm1, 16(%1) \n\t" \
125 "movaps %%xmm2, 32(%1) \n\t" \
126 "movaps %%xmm3, 48(%1) \n\t" \
127 "movaps 64(%0), %%xmm0 \n\t" \
128 "movaps 80(%0), %%xmm1 \n\t" \
129 "movaps 96(%0), %%xmm2 \n\t" \
130 "movaps 112(%0), %%xmm3 \n\t" \
131 "movaps %%xmm0, 64(%1) \n\t" \
132 "movaps %%xmm1, 80(%1) \n\t" \
133 "movaps %%xmm2, 96(%1) \n\t" \
134 "movaps %%xmm3, 112(%1) \n\t"
135 ::
"r"(
in),
"r"(in+512)
136 :"memory"
137 );
138
140 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
141
142 SUM8(
MACS, suma[0], win + 32, in + 48);
143
144 sumc[ 0] = 0;
145 sumb[16] = 0;
146 sumd[16] = 0;
147
148 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
149 "movups " #sumd "(%4), %%xmm0 \n\t" \
150 "shufps 0ドルx1b, %%xmm0, %%xmm0 \n\t" \
151 "subps " #suma "(%1), %%xmm0 \n\t" \
152 "movaps %%xmm0," #out1 "(%0) \n\t" \
153 \
154 "movups " #sumc "(%3), %%xmm0 \n\t" \
155 "shufps 0ドルx1b, %%xmm0, %%xmm0 \n\t" \
156 "addps " #sumb "(%2), %%xmm0 \n\t" \
157 "movaps %%xmm0," #out2 "(%0) \n\t"
158
159 if (incr == 1) {
160 __asm__ volatile(
161 SUMS( 0, 48, 4, 52, 0, 112)
162 SUMS(16, 32, 20, 36, 16, 96)
163 SUMS(32, 16, 36, 20, 32, 80)
164 SUMS(48, 0, 52, 4, 48, 64)
165
167 :"
r"(&suma[0]), "
r"(&sumb[0]), "
r"(&sumc[0]), "
r"(&sumd[0])
168 :"memory"
169 );
170 out += 16*incr;
172 int j;
173 float *out2 = out + 32 * incr;
174 out[0 ] = -suma[ 0];
175 out += incr;
176 out2 -= incr;
177 for(j=1;j<16;j++) {
178 *out = -suma[ j] + sumd[16-j];
179 *out2 = sumb[16-j] + sumc[ j];
180 out += incr;
181 out2 -= incr;
182 }
183 }
184
185 sum = 0;
186 SUM8(
MLSS, sum, win + 16 + 32, in + 32);
187 *out = sum;
188 }
189
190 #endif /* HAVE_SSE2_INLINE */
191
192 #if HAVE_YASM
/*
 * Define imdct36_blocks_<CPU1>(): runs count IMDCT36 blocks.
 *
 * Blocks are consumed in groups of four by ff_four_imdct36_float_<CPU2>();
 * whatever is left (count % 4 blocks) goes through the single-block
 * ff_imdct36_float_<CPU1>().
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                                    int count, int switch_point,            \
                                    int block_type)                         \
{                                                                           \
    const int quad_end = count & ~3;                                        \
    int blk = 0;                                                            \
                                                                            \
    while (blk < quad_end) {                                                \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        /* with a switch point, the first group takes table 1 */            \
        float *win = mdct_win_sse[switch_point && blk < 4][block_type];     \
                                                                            \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
                                                                            \
        in  += 4 * 18;                                                      \
        buf += 4 * 18;                                                      \
        out += 4;                                                           \
        blk += 4;                                                           \
    }                                                                       \
                                                                            \
    while (blk < count) {                                                   \
        /* window type 0 for the first two blocks at a switch point */      \
        int win_idx = block_type;                                           \
        float *win;                                                         \
                                                                            \
        if (switch_point && blk < 2)                                        \
            win_idx = 0;                                                    \
        /* odd-numbered blocks use the table entries 4 further on */        \
        if (blk & 1)                                                        \
            win_idx += 4;                                                   \
        win = ff_mdct_win_float[win_idx];                                   \
                                                                            \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in += 18;                                                           \
        buf++;                                                              \
        out++;                                                              \
        blk++;                                                              \
    }                                                                       \
}

#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,   sse)
DECL_IMDCT_BLOCKS(sse2,  sse)
DECL_IMDCT_BLOCKS(sse3,  sse)
DECL_IMDCT_BLOCKS(ssse3, sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx, avx)
#endif
234 #endif /* HAVE_YASM */
235
237 {
239
240 int i, j;
241 for (j = 0; j < 4; j++) {
242 for (i = 0; i < 40; i ++) {
251 }
252 }
253
254 #if HAVE_SSE2_INLINE
257 }
258 #endif /* HAVE_SSE2_INLINE */
259
260 #if HAVE_YASM
271 }
272 #endif /* HAVE_YASM */
273 }