/*
 * MPEG video MMX templates
 *
 * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Register-width dependent building blocks for the inline assembly in this
 * template. The file may be included more than once with different
 * COMPILE_TEMPLATE_* settings, hence the #undef list before redefinition.
 *
 * All macros expand to string literals meant to be pasted into an extended
 * asm template (AT&T operand order: source first, destination last).
 *
 *   MMREG_WIDTH   size of one vector register in bytes, as a string
 *   MM            register name prefix; '%' is doubled because a literal '%'
 *                 must be escaped inside an extended asm template
 *   MOVQ          mnemonic that moves one full vector register
 *   SPREADW(a)    replicate the lowest 16-bit word of a into every word of a
 *   PMAXW(a,b)    b = per-word maximum of a and b (pmaxsw is a signed max;
 *                 the plain-MMX fallback is an unsigned saturating-subtract
 *                 plus add, which agrees with it for non-negative words)
 *   PMAX(a,b)     horizontal maximum over all words of a, result left in the
 *                 lowest word of a; b is clobbered as scratch
 */
#undef MMREG_WIDTH
#undef MM
#undef MOVQ
#undef SPREADW
#undef PMAXW
#undef PMAX
#undef SAVE_SIGN
#undef RESTORE_SIGN

#if COMPILE_TEMPLATE_SSE2
#define MMREG_WIDTH "16"
#define MM "%%xmm"
#define MOVQ "movdqa"
#define SPREADW(a) \
            "pshuflw $0, "a", "a" \n\t" /* low four words = word 0 */\
            "punpcklwd "a", "a" \n\t"   /* duplicate them into all eight */
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
            "movhlps "a", "b" \n\t"        /* fold high half onto low half */\
            PMAXW(b, a)\
            "pshuflw $0x0E, "a", "b" \n\t" /* words 2,3 -> words 0,1 */\
            PMAXW(b, a)\
            "pshuflw $0x01, "a", "b" \n\t" /* word 1 -> word 0 */\
            PMAXW(b, a)
#else
#define MMREG_WIDTH "8"
#define MM "%%mm"
#define MOVQ "movq"
#if COMPILE_TEMPLATE_MMXEXT
#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
            "pshufw $0x0E, "a", "b" \n\t" /* words 2,3 -> words 0,1 */\
            PMAXW(b, a)\
            "pshufw $0x01, "a", "b" \n\t" /* word 1 -> word 0 */\
            PMAXW(b, a)
#else
#define SPREADW(a) \
            "punpcklwd "a", "a" \n\t"\
            "punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) \
            "psubusw "a", "b" \n\t" /* b = saturate(b - a) */\
            "paddw "a", "b" \n\t"   /* b = max(a, b) for unsigned words */
#define PMAX(a,b) \
            "movq "a", "b" \n\t"\
            "psrlq $32, "a" \n\t" /* words 2,3 -> words 0,1 */\
            PMAXW(b, a)\
            "movq "a", "b" \n\t"\
            "psrlq $16, "a" \n\t" /* word 1 -> word 0 */\
            PMAXW(b, a)

#endif
#endif
76
/*
 * Sign split/merge helpers for 16-bit words, expanded into an extended asm
 * template (AT&T operand order: source first, destination last).
 *
 *   SAVE_SIGN(a,b)     on entry b holds signed words; on exit b holds their
 *                      absolute values and a holds the state RESTORE_SIGN
 *                      needs (SSSE3: a copy of the signed input; otherwise:
 *                      an all-ones mask for each negative word)
 *   RESTORE_SIGN(a,b)  re-apply the sign recorded in a to the words in b
 */
#if COMPILE_TEMPLATE_SSSE3
#define SAVE_SIGN(a,b) \
            "movdqa "b", "a" \n\t" /* keep the signed input */\
            "pabsw "b", "b" \n\t"  /* ABS(block[i]) */
#define RESTORE_SIGN(a,b) \
            "psignw "a", "b" \n\t" /* negate b where a < 0, zero it where a == 0 */
#else
#define SAVE_SIGN(a,b) \
            "pxor "a", "a" \n\t"\
            "pcmpgtw "b", "a" \n\t" /* block[i] < 0 ? 0xFFFF : 0x0000 */\
            "pxor "a", "b" \n\t"\
            "psubw "a", "b" \n\t" /* ABS(block[i]): (x ^ mask) - mask */
#define RESTORE_SIGN(a,b) \
            "pxor "a", "b" \n\t"\
            "psubw "a", "b" \n\t" /* out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) */
#endif
93
96 int qscale, int *overflow)
97 {
99 int level=0, q;
//=0 is because gcc says uninitialized ...
100 const uint16_t *qmat, *bias;
102
103 av_assert2((7&(
int)(&temp_block[0])) == 0);
//did gcc align it correctly?
104
105 //s->fdct (block);
106 RENAMEl(ff_fdct) (
block);
//cannot be anything else ...
107
110
115 bias =
s->q_intra_matrix16[qscale][1];
116 qmat =
s->q_intra_matrix16[qscale][0];
117 }else{
119 bias =
s->q_chroma_intra_matrix16[qscale][1];
120 qmat =
s->q_chroma_intra_matrix16[qscale][0];
121 }
122 /* note: block[0] is assumed to be positive */
124 __asm__ volatile (
125 "mul %%ecx \n\t"
126 :
"=d" (
level),
"=a"(dummy)
128 );
129 } else
130 /* For AIC we skip quant/dequant of INTRADC */
131 level = (
block[0] + 4)>>3;
132
133 block[0]=0;
//avoid fake overflow
134 // temp_block[0] = (block[0] + (q >> 1)) / q;
135 last_non_zero_p1 = 1;
136 } else {
137 last_non_zero_p1 = 0;
138 bias =
s->q_inter_matrix16[qscale][1];
139 qmat =
s->q_inter_matrix16[qscale][0];
140 }
141
143
144 __asm__ volatile(
145 "movd %%"REG_a
", "MM "3 \n\t" // last_non_zero_p1
147 "pxor "MM "7, "MM "7 \n\t" // 0
148 "pxor "MM "4, "MM "4 \n\t" // 0
149 MOVQ " (%2), "MM "5 \n\t" // qmat[0]
150 "pxor "MM "6, "MM "6 \n\t"
151 "psubw (%3), "MM "6 \n\t" // -bias[0]
152 "mov $-128, %%"REG_a" \n\t"
153 ".p2align 4 \n\t"
154 "1: \n\t"
155 MOVQ " (%1, %%"REG_a
"), "MM "0 \n\t" // block[i]
157 "psubusw "MM "6, "MM "0 \n\t" // ABS(block[i]) + bias[0]
158 "pmulhw "MM "5, "MM "0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
159 "por "MM "0, "MM "4 \n\t"
160 RESTORE_SIGN(
MM "1",
MM "0")
// out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
161 MOVQ " "MM "0, (%5, %%"REG_a
") \n\t"
162 "pcmpeqw "MM "7, "MM "0 \n\t" // out==0 ? 0xFF : 0x00
163 MOVQ " (%4, %%"REG_a
"), "MM "1 \n\t"
164 MOVQ " "MM "7, (%1, %%"REG_a
") \n\t" // 0
165 "pandn "MM "1, "MM "0 \n\t"
168 " js 1b \n\t"
170 "movd "MM "3, %%"REG_a
" \n\t"
171 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
172 : "+a" (last_non_zero_p1)
173 :
"r" (
block+64),
"r" (qmat),
"r" (bias),
176 "%xmm4", "%xmm5", "%xmm6", "%xmm7")
177 );
178 }else{ // FMT_H263
179 __asm__ volatile(
180 "movd %%"REG_a
", "MM "3 \n\t" // last_non_zero_p1
182 "pxor "MM "7, "MM "7 \n\t" // 0
183 "pxor "MM "4, "MM "4 \n\t" // 0
184 "mov $-128, %%"REG_a" \n\t"
185 ".p2align 4 \n\t"
186 "1: \n\t"
187 MOVQ " (%1, %%"REG_a
"), "MM "0 \n\t" // block[i]
189 MOVQ " (%3, %%"REG_a
"), "MM "6 \n\t" // bias[0]
190 "paddusw "MM "6, "MM "0 \n\t" // ABS(block[i]) + bias[0]
191 MOVQ " (%2, %%"REG_a
"), "MM "5 \n\t" // qmat[i]
192 "pmulhw "MM "5, "MM "0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
193 "por "MM "0, "MM "4 \n\t"
194 RESTORE_SIGN(
MM "1",
MM "0")
// out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
195 MOVQ " "MM "0, (%5, %%"REG_a
") \n\t"
196 "pcmpeqw "MM "7, "MM "0 \n\t" // out==0 ? 0xFF : 0x00
197 MOVQ " (%4, %%"REG_a
"), "MM "1 \n\t"
198 MOVQ " "MM "7, (%1, %%"REG_a
") \n\t" // 0
199 "pandn "MM "1, "MM "0 \n\t"
202 " js 1b \n\t"
204 "movd "MM "3, %%"REG_a
" \n\t"
205 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
206 : "+a" (last_non_zero_p1)
207 :
"r" (
block+64),
"r" (qmat+64),
"r" (bias+64),
210 "%xmm4", "%xmm5", "%xmm6", "%xmm7")
211 );
212 }
213 __asm__ volatile(
214 "movd %1, "MM "1 \n\t" // max_qcoeff
216 "psubusw "MM "1, "MM "4 \n\t"
217 "packuswb "MM "4, "MM "4 \n\t"
218 #if COMPILE_TEMPLATE_SSE2
219 "packuswb "MM "4, "MM "4 \n\t"
220 #endif
221 "movd "MM "4, %0 \n\t" // *overflow
222 : "=g" (*overflow)
223 :
"g" (
s->max_qcoeff)
224 );
225
227 else block[0]= temp_block[0];
228
230 if(last_non_zero_p1 <= 1)
goto end;
231 block[0x08] = temp_block[0x01];
block[0x10] = temp_block[0x08];
232 block[0x20] = temp_block[0x10];
233 if(last_non_zero_p1 <= 4)
goto end;
234 block[0x18] = temp_block[0x09];
block[0x04] = temp_block[0x02];
235 block[0x09] = temp_block[0x03];
236 if(last_non_zero_p1 <= 7)
goto end;
237 block[0x14] = temp_block[0x0A];
block[0x28] = temp_block[0x11];
238 block[0x12] = temp_block[0x18];
block[0x02] = temp_block[0x20];
239 if(last_non_zero_p1 <= 11)
goto end;
240 block[0x1A] = temp_block[0x19];
block[0x24] = temp_block[0x12];
241 block[0x19] = temp_block[0x0B];
block[0x01] = temp_block[0x04];
242 block[0x0C] = temp_block[0x05];
243 if(last_non_zero_p1 <= 16)
goto end;
244 block[0x11] = temp_block[0x0C];
block[0x29] = temp_block[0x13];
245 block[0x16] = temp_block[0x1A];
block[0x0A] = temp_block[0x21];
246 block[0x30] = temp_block[0x28];
block[0x22] = temp_block[0x30];
247 block[0x38] = temp_block[0x29];
block[0x06] = temp_block[0x22];
248 if(last_non_zero_p1 <= 24)
goto end;
249 block[0x1B] = temp_block[0x1B];
block[0x21] = temp_block[0x14];
250 block[0x1C] = temp_block[0x0D];
block[0x05] = temp_block[0x06];
251 block[0x0D] = temp_block[0x07];
block[0x15] = temp_block[0x0E];
252 block[0x2C] = temp_block[0x15];
block[0x13] = temp_block[0x1C];
253 if(last_non_zero_p1 <= 32)
goto end;
254 block[0x0B] = temp_block[0x23];
block[0x34] = temp_block[0x2A];
255 block[0x2A] = temp_block[0x31];
block[0x32] = temp_block[0x38];
256 block[0x3A] = temp_block[0x39];
block[0x26] = temp_block[0x32];
257 block[0x39] = temp_block[0x2B];
block[0x03] = temp_block[0x24];
258 if(last_non_zero_p1 <= 40)
goto end;
259 block[0x1E] = temp_block[0x1D];
block[0x25] = temp_block[0x16];
260 block[0x1D] = temp_block[0x0F];
block[0x2D] = temp_block[0x17];
261 block[0x17] = temp_block[0x1E];
block[0x0E] = temp_block[0x25];
262 block[0x31] = temp_block[0x2C];
block[0x2B] = temp_block[0x33];
263 if(last_non_zero_p1 <= 48)
goto end;
264 block[0x36] = temp_block[0x3A];
block[0x3B] = temp_block[0x3B];
265 block[0x23] = temp_block[0x34];
block[0x3C] = temp_block[0x2D];
266 block[0x07] = temp_block[0x26];
block[0x1F] = temp_block[0x1F];
267 block[0x0F] = temp_block[0x27];
block[0x35] = temp_block[0x2E];
268 if(last_non_zero_p1 <= 56)
goto end;
269 block[0x2E] = temp_block[0x35];
block[0x33] = temp_block[0x3C];
270 block[0x3E] = temp_block[0x3D];
block[0x27] = temp_block[0x36];
271 block[0x3D] = temp_block[0x2F];
block[0x2F] = temp_block[0x37];
272 block[0x37] = temp_block[0x3E];
block[0x3F] = temp_block[0x3F];
274 if(last_non_zero_p1 <= 1)
goto end;
275 block[0x04] = temp_block[0x01];
276 block[0x08] = temp_block[0x08];
block[0x10] = temp_block[0x10];
277 if(last_non_zero_p1 <= 4)
goto end;
278 block[0x0C] = temp_block[0x09];
block[0x01] = temp_block[0x02];
279 block[0x05] = temp_block[0x03];
280 if(last_non_zero_p1 <= 7)
goto end;
281 block[0x09] = temp_block[0x0A];
block[0x14] = temp_block[0x11];
282 block[0x18] = temp_block[0x18];
block[0x20] = temp_block[0x20];
283 if(last_non_zero_p1 <= 11)
goto end;
284 block[0x1C] = temp_block[0x19];
285 block[0x11] = temp_block[0x12];
block[0x0D] = temp_block[0x0B];
286 block[0x02] = temp_block[0x04];
block[0x06] = temp_block[0x05];
287 if(last_non_zero_p1 <= 16)
goto end;
288 block[0x0A] = temp_block[0x0C];
block[0x15] = temp_block[0x13];
289 block[0x19] = temp_block[0x1A];
block[0x24] = temp_block[0x21];
290 block[0x28] = temp_block[0x28];
block[0x30] = temp_block[0x30];
291 block[0x2C] = temp_block[0x29];
block[0x21] = temp_block[0x22];
292 if(last_non_zero_p1 <= 24)
goto end;
293 block[0x1D] = temp_block[0x1B];
block[0x12] = temp_block[0x14];
294 block[0x0E] = temp_block[0x0D];
block[0x03] = temp_block[0x06];
295 block[0x07] = temp_block[0x07];
block[0x0B] = temp_block[0x0E];
296 block[0x16] = temp_block[0x15];
block[0x1A] = temp_block[0x1C];
297 if(last_non_zero_p1 <= 32)
goto end;
298 block[0x25] = temp_block[0x23];
block[0x29] = temp_block[0x2A];
299 block[0x34] = temp_block[0x31];
block[0x38] = temp_block[0x38];
300 block[0x3C] = temp_block[0x39];
block[0x31] = temp_block[0x32];
301 block[0x2D] = temp_block[0x2B];
block[0x22] = temp_block[0x24];
302 if(last_non_zero_p1 <= 40)
goto end;
303 block[0x1E] = temp_block[0x1D];
block[0x13] = temp_block[0x16];
304 block[0x0F] = temp_block[0x0F];
block[0x17] = temp_block[0x17];
305 block[0x1B] = temp_block[0x1E];
block[0x26] = temp_block[0x25];
306 block[0x2A] = temp_block[0x2C];
block[0x35] = temp_block[0x33];
307 if(last_non_zero_p1 <= 48)
goto end;
308 block[0x39] = temp_block[0x3A];
block[0x3D] = temp_block[0x3B];
309 block[0x32] = temp_block[0x34];
block[0x2E] = temp_block[0x2D];
310 block[0x23] = temp_block[0x26];
block[0x1F] = temp_block[0x1F];
311 block[0x27] = temp_block[0x27];
block[0x2B] = temp_block[0x2E];
312 if(last_non_zero_p1 <= 56)
goto end;
313 block[0x36] = temp_block[0x35];
block[0x3A] = temp_block[0x3C];
314 block[0x3E] = temp_block[0x3D];
block[0x33] = temp_block[0x36];
315 block[0x2F] = temp_block[0x2F];
block[0x37] = temp_block[0x37];
316 block[0x3B] = temp_block[0x3E];
block[0x3F] = temp_block[0x3F];
317 }else{
318 if(last_non_zero_p1 <= 1)
goto end;
319 block[0x01] = temp_block[0x01];
320 block[0x08] = temp_block[0x08];
block[0x10] = temp_block[0x10];
321 if(last_non_zero_p1 <= 4)
goto end;
322 block[0x09] = temp_block[0x09];
block[0x02] = temp_block[0x02];
323 block[0x03] = temp_block[0x03];
324 if(last_non_zero_p1 <= 7)
goto end;
325 block[0x0A] = temp_block[0x0A];
block[0x11] = temp_block[0x11];
326 block[0x18] = temp_block[0x18];
block[0x20] = temp_block[0x20];
327 if(last_non_zero_p1 <= 11)
goto end;
328 block[0x19] = temp_block[0x19];
329 block[0x12] = temp_block[0x12];
block[0x0B] = temp_block[0x0B];
330 block[0x04] = temp_block[0x04];
block[0x05] = temp_block[0x05];
331 if(last_non_zero_p1 <= 16)
goto end;
332 block[0x0C] = temp_block[0x0C];
block[0x13] = temp_block[0x13];
333 block[0x1A] = temp_block[0x1A];
block[0x21] = temp_block[0x21];
334 block[0x28] = temp_block[0x28];
block[0x30] = temp_block[0x30];
335 block[0x29] = temp_block[0x29];
block[0x22] = temp_block[0x22];
336 if(last_non_zero_p1 <= 24)
goto end;
337 block[0x1B] = temp_block[0x1B];
block[0x14] = temp_block[0x14];
338 block[0x0D] = temp_block[0x0D];
block[0x06] = temp_block[0x06];
339 block[0x07] = temp_block[0x07];
block[0x0E] = temp_block[0x0E];
340 block[0x15] = temp_block[0x15];
block[0x1C] = temp_block[0x1C];
341 if(last_non_zero_p1 <= 32)
goto end;
342 block[0x23] = temp_block[0x23];
block[0x2A] = temp_block[0x2A];
343 block[0x31] = temp_block[0x31];
block[0x38] = temp_block[0x38];
344 block[0x39] = temp_block[0x39];
block[0x32] = temp_block[0x32];
345 block[0x2B] = temp_block[0x2B];
block[0x24] = temp_block[0x24];
346 if(last_non_zero_p1 <= 40)
goto end;
347 block[0x1D] = temp_block[0x1D];
block[0x16] = temp_block[0x16];
348 block[0x0F] = temp_block[0x0F];
block[0x17] = temp_block[0x17];
349 block[0x1E] = temp_block[0x1E];
block[0x25] = temp_block[0x25];
350 block[0x2C] = temp_block[0x2C];
block[0x33] = temp_block[0x33];
351 if(last_non_zero_p1 <= 48)
goto end;
352 block[0x3A] = temp_block[0x3A];
block[0x3B] = temp_block[0x3B];
353 block[0x34] = temp_block[0x34];
block[0x2D] = temp_block[0x2D];
354 block[0x26] = temp_block[0x26];
block[0x1F] = temp_block[0x1F];
355 block[0x27] = temp_block[0x27];
block[0x2E] = temp_block[0x2E];
356 if(last_non_zero_p1 <= 56)
goto end;
357 block[0x35] = temp_block[0x35];
block[0x3C] = temp_block[0x3C];
358 block[0x3D] = temp_block[0x3D];
block[0x36] = temp_block[0x36];
359 block[0x2F] = temp_block[0x2F];
block[0x37] = temp_block[0x37];
360 block[0x3E] = temp_block[0x3E];
block[0x3F] = temp_block[0x3F];
361 }
363 return last_non_zero_p1 - 1;
364 }