libavcodec/x86/fdct_mmx.c

Go to the documentation of this file.
00001 /*
00002  * MMX optimized forward DCT
00003  * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
00004  * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
00005  * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
00006  *
00007  * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
00008  *
00009  * Intel Application Note AP-922 - fast, precise implementation of DCT
00010  * http://developer.intel.com/vtune/cbts/appnotes.htm
00011  *
00012  * Also of inspiration:
00013  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
00014  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
00015  *
00016  * This file is part of FFmpeg.
00017  *
00018  * FFmpeg is free software; you can redistribute it and/or
00019  * modify it under the terms of the GNU Lesser General Public
00020  * License as published by the Free Software Foundation; either
00021  * version 2.1 of the License, or (at your option) any later version.
00022  *
00023  * FFmpeg is distributed in the hope that it will be useful,
00024  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00025  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
00026  * Lesser General Public License for more details.
00027  *
00028  * You should have received a copy of the GNU Lesser General Public
00029  * License along with FFmpeg; if not, write to the Free Software
00030  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00031  */
00032 
00033 #include "libavutil/common.h"
00034 #include "libavcodec/dsputil.h"
00035 
/* Shorthand for the GCC aligned attribute: requests `align`-byte alignment. */
00036 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
00037 
00039 //
00040 // constants for the forward DCT
00041 // -----------------------------
00042 //
00043 // Be sure to check that your compiler is aligning all constants to QWORD
00044 // (8-byte) memory boundaries! Otherwise the unaligned memory access will
00045 // severely stall MMX execution.
00046 //
00048 
/* Number of extra low-order bits carried through the column pass. */
00049 #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
/* Left shift (psllw) applied to the column-pass sums and differences. */
00050 #define SHIFT_FRW_COL BITS_FRW_ACC
/* Right shift (psrad) applied to the 32-bit row-pass sums; evaluates to 17 when BITS_FRW_ACC is 3. */
00051 #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
/* Half of (1 << SHIFT_FRW_ROW); the row passes add it before shifting right. */
00052 #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
00053 //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
00054 
/* Expands to its argument repeated eight times, comma separated (fills one 8-word vector). */
00055 #define X8(x) x,x,x,x,x,x,x,x
00056 
00057 //concatenated table, for forward DCT transformation
// Three groups of eight identical words; FDCT_COL reads them at byte offsets 0, 16 and 32.
00058 static const int16_t fdct_tg_all_16[24] ATTR_ALIGN(16) = {
00059 X8(13036), // tan(1*pi/16) * (1<<16), rounded
00060 X8(27146), // tan(2*pi/16) * (1<<16), rounded
00061 X8(-21746) // tan(3*pi/16) * (1<<16) - (1<<16), rounded; FDCT_COL adds the multiplicand back after pmulhw
00062 };
00063 
// Eight copies of the cos(pi/4) multiplier; FDCT_COL applies it with pmulhw.
00064 static const int16_t ocos_4_16[8] ATTR_ALIGN(16) = {
00065 X8(23170) // cos(pi/4) * (1<<15), rounded
00066 };
00067 
// Eight words equal to 1; FDCT_COL ORs them (por) into several outputs, forcing the low bit of each word.
00068 static const int16_t fdct_one_corr[8] ATTR_ALIGN(16) = { X8(1) };
00069 
// Rounding term for the MMX/MMX2 row passes: two 32-bit lanes, loaded with movq and added before psrad.
00070 static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
00071 
// The same rounding term in four 32-bit lanes; fdct_row_sse2 loads it with movdqa into xmm6.
// The array is wrapped in a struct; the reason for the wrapper is not visible in this file.
00072 static struct
00073 {
00074 const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
00075 } fdct_r_row_sse2 ATTR_ALIGN(16)=
00076 {{
00077 RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
00078 }};
00079 //static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
00080 
// Row-pass coefficients for fdct_row_mmx / fdct_row_mmx2.
// Eight sub-tables of 32 words (64 bytes) each, one per row of the block:
// ff_fdct_mmx and ff_fdct_mmx2 advance their table pointer by 32 words per row.
// The sub-tables for rows 4, 5, 6 and 7 repeat those for rows 0, 3, 2 and 1.
00081 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table
// row 0
00082 16384, 16384, 22725, 19266,
00083 16384, 16384, 12873, 4520,
00084 21407, 8867, 19266, -4520,
00085 -8867, -21407, -22725, -12873,
00086 16384, -16384, 12873, -22725,
00087 -16384, 16384, 4520, 19266,
00088 8867, -21407, 4520, -12873,
00089 21407, -8867, 19266, -22725,
00090 
// row 1
00091 22725, 22725, 31521, 26722,
00092 22725, 22725, 17855, 6270,
00093 29692, 12299, 26722, -6270,
00094 -12299, -29692, -31521, -17855,
00095 22725, -22725, 17855, -31521,
00096 -22725, 22725, 6270, 26722,
00097 12299, -29692, 6270, -17855,
00098 29692, -12299, 26722, -31521,
00099 
// row 2
00100 21407, 21407, 29692, 25172,
00101 21407, 21407, 16819, 5906,
00102 27969, 11585, 25172, -5906,
00103 -11585, -27969, -29692, -16819,
00104 21407, -21407, 16819, -29692,
00105 -21407, 21407, 5906, 25172,
00106 11585, -27969, 5906, -16819,
00107 27969, -11585, 25172, -29692,
00108 
// row 3
00109 19266, 19266, 26722, 22654,
00110 19266, 19266, 15137, 5315,
00111 25172, 10426, 22654, -5315,
00112 -10426, -25172, -26722, -15137,
00113 19266, -19266, 15137, -26722,
00114 -19266, 19266, 5315, 22654,
00115 10426, -25172, 5315, -15137,
00116 25172, -10426, 22654, -26722,
00117 
// row 4 (same values as row 0)
00118 16384, 16384, 22725, 19266,
00119 16384, 16384, 12873, 4520,
00120 21407, 8867, 19266, -4520,
00121 -8867, -21407, -22725, -12873,
00122 16384, -16384, 12873, -22725,
00123 -16384, 16384, 4520, 19266,
00124 8867, -21407, 4520, -12873,
00125 21407, -8867, 19266, -22725,
00126 
// row 5 (same values as row 3)
00127 19266, 19266, 26722, 22654,
00128 19266, 19266, 15137, 5315,
00129 25172, 10426, 22654, -5315,
00130 -10426, -25172, -26722, -15137,
00131 19266, -19266, 15137, -26722,
00132 -19266, 19266, 5315, 22654,
00133 10426, -25172, 5315, -15137,
00134 25172, -10426, 22654, -26722,
00135 
// row 6 (same values as row 2)
00136 21407, 21407, 29692, 25172,
00137 21407, 21407, 16819, 5906,
00138 27969, 11585, 25172, -5906,
00139 -11585, -27969, -29692, -16819,
00140 21407, -21407, 16819, -29692,
00141 -21407, 21407, 5906, 25172,
00142 11585, -27969, 5906, -16819,
00143 27969, -11585, 25172, -29692,
00144 
// row 7 (same values as row 1)
00145 22725, 22725, 31521, 26722,
00146 22725, 22725, 17855, 6270,
00147 29692, 12299, 26722, -6270,
00148 -12299, -29692, -31521, -17855,
00149 22725, -22725, 17855, -31521,
00150 -22725, 22725, 6270, 26722,
00151 12299, -29692, 6270, -17855,
00152 29692, -12299, 26722, -31521,
00153 };
00154 
// Row-pass coefficients for fdct_row_sse2, arranged as 16-byte vectors for pmaddwd.
// TABLE_SSE2 emits 32 words built from whatever C1..C7 are defined at the point of
// expansion; it is expanded eight times below with C1..C7 redefined before each one,
// giving 8 * 32 = 256 words. The constant sets of expansions 5..8 repeat those of
// expansions 1, 4, 3 and 2.
// fdct_row_sse2 in this file only addresses byte offsets 0..255, i.e. the first
// four expansions.
// The array is wrapped in a struct; the reason for the wrapper is not visible in this file.
00155 static struct
00156 {
00157 const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
00158 } tab_frw_01234567_sse2 ATTR_ALIGN(16) =
00159 {{
00160 //static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table
00161 #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
00162  C4, C4, C5, C7, C2, C6, C3, -C7, \
00163  -C4, C4, C7, C3, C6, -C2, C7, -C5, \
00164  C4, -C4, C5, -C1, C2, -C6, C3, -C1,
00165 // c1..c7 * cos(pi/4) * 2^15
00166 #define C1 22725
00167 #define C2 21407
00168 #define C3 19266
00169 #define C4 16384
00170 #define C5 12873
00171 #define C6 8867
00172 #define C7 4520
00173 TABLE_SSE2
00174 
00175 #undef C1
00176 #undef C2
00177 #undef C3
00178 #undef C4
00179 #undef C5
00180 #undef C6
00181 #undef C7
00182 #define C1 31521
00183 #define C2 29692
00184 #define C3 26722
00185 #define C4 22725
00186 #define C5 17855
00187 #define C6 12299
00188 #define C7 6270
00189 TABLE_SSE2
00190 
00191 #undef C1
00192 #undef C2
00193 #undef C3
00194 #undef C4
00195 #undef C5
00196 #undef C6
00197 #undef C7
00198 #define C1 29692
00199 #define C2 27969
00200 #define C3 25172
00201 #define C4 21407
00202 #define C5 16819
00203 #define C6 11585
00204 #define C7 5906
00205 TABLE_SSE2
00206 
00207 #undef C1
00208 #undef C2
00209 #undef C3
00210 #undef C4
00211 #undef C5
00212 #undef C6
00213 #undef C7
00214 #define C1 26722
00215 #define C2 25172
00216 #define C3 22654
00217 #define C4 19266
00218 #define C5 15137
00219 #define C6 10426
00220 #define C7 5315
00221 TABLE_SSE2
00222 
00223 #undef C1
00224 #undef C2
00225 #undef C3
00226 #undef C4
00227 #undef C5
00228 #undef C6
00229 #undef C7
00230 #define C1 22725
00231 #define C2 21407
00232 #define C3 19266
00233 #define C4 16384
00234 #define C5 12873
00235 #define C6 8867
00236 #define C7 4520
00237 TABLE_SSE2
00238 
00239 #undef C1
00240 #undef C2
00241 #undef C3
00242 #undef C4
00243 #undef C5
00244 #undef C6
00245 #undef C7
00246 #define C1 26722
00247 #define C2 25172
00248 #define C3 22654
00249 #define C4 19266
00250 #define C5 15137
00251 #define C6 10426
00252 #define C7 5315
00253 TABLE_SSE2
00254 
00255 #undef C1
00256 #undef C2
00257 #undef C3
00258 #undef C4
00259 #undef C5
00260 #undef C6
00261 #undef C7
00262 #define C1 29692
00263 #define C2 27969
00264 #define C3 25172
00265 #define C4 21407
00266 #define C5 16819
00267 #define C6 11585
00268 #define C7 5906
00269 TABLE_SSE2
00270 
00271 #undef C1
00272 #undef C2
00273 #undef C3
00274 #undef C4
00275 #undef C5
00276 #undef C6
00277 #undef C7
00278 #define C1 31521
00279 #define C2 29692
00280 #define C3 26722
00281 #define C4 22725
00282 #define C5 17855
00283 #define C6 12299
00284 #define C7 6270
00285 TABLE_SSE2
00286 }};
00287 
/* Short alias for AV_TOSTRING; used below to splice the SHIFT_FRW_* constants into asm string literals as immediates. */
00288 #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
00289 
/*
 * FDCT_COL(cpu, mm, mov) defines fdct_col_<cpu>(in, out, offset): the column
 * pass of the forward DCT written as a single inline-asm statement.
 *
 *   cpu - suffix of the generated function name
 *   mm  - register-name prefix ("mm" or "xmm")
 *   mov - full-register move instruction (movq or movdqa)
 *
 * Asm operands:
 *   %0 = in + offset       %1 = fdct_tg_all_16     %2 = fdct_one_corr
 *   %3 = out + offset      %4 = ocos_4_16
 *
 * The eight input rows are read at byte offsets 0, 16, ..., 112 from %0 and
 * the eight results are stored at the same offsets from %3, so each row is
 * 16 bytes (eight int16_t) apart. One call processes as many columns as fit
 * in one register: four with movq/mm, eight with movdqa/xmm.
 *
 * Inputs are pre-scaled with psllw by SHIFT_FRW_COL (or SHIFT_FRW_COL + 1
 * for the terms later multiplied by ocos_4_16). The por with fdct_one_corr
 * forces the low bit of the results stored at offsets 16, 32 and 96, and of
 * an intermediate value (register 2).
 *
 * NOTE(review): the asm statement declares no outputs and no clobbers even
 * though it stores through %3 and overwrites registers 0-7; confirm this is
 * acceptable for the compilers in use.
 */
00290 #define FDCT_COL(cpu, mm, mov)\
00291 static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
00292 {\
00293  __asm__ volatile (\
00294  #mov" 16(%0), %%"#mm"0 \n\t" \
00295  #mov" 96(%0), %%"#mm"1 \n\t" \
00296  #mov" %%"#mm"0, %%"#mm"2 \n\t" \
00297  #mov" 32(%0), %%"#mm"3 \n\t" \
00298  "paddsw %%"#mm"1, %%"#mm"0 \n\t" \
00299  #mov" 80(%0), %%"#mm"4 \n\t" \
00300  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
00301  #mov" (%0), %%"#mm"5 \n\t" \
00302  "paddsw %%"#mm"3, %%"#mm"4 \n\t" \
00303  "paddsw 112(%0), %%"#mm"5 \n\t" \
00304  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
00305  #mov" %%"#mm"0, %%"#mm"6 \n\t" \
00306  "psubsw %%"#mm"1, %%"#mm"2 \n\t" \
00307  #mov" 16(%1), %%"#mm"1 \n\t" \
00308  "psubsw %%"#mm"4, %%"#mm"0 \n\t" \
00309  #mov" 48(%0), %%"#mm"7 \n\t" \
00310  "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
00311  "paddsw 64(%0), %%"#mm"7 \n\t" \
00312  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
00313  "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
00314  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
00315  #mov" %%"#mm"5, %%"#mm"4 \n\t" \
00316  "psubsw %%"#mm"7, %%"#mm"5 \n\t" \
00317  "paddsw %%"#mm"5, %%"#mm"1 \n\t" \
00318  "paddsw %%"#mm"7, %%"#mm"4 \n\t" \
00319  "por (%2), %%"#mm"1 \n\t" \
00320  "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
00321  "pmulhw 16(%1), %%"#mm"5 \n\t" \
00322  #mov" %%"#mm"4, %%"#mm"7 \n\t" \
00323  "psubsw 80(%0), %%"#mm"3 \n\t" \
00324  "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
00325  #mov" %%"#mm"1, 32(%3) \n\t" \
00326  "paddsw %%"#mm"6, %%"#mm"7 \n\t" \
00327  #mov" 48(%0), %%"#mm"1 \n\t" \
00328  "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
00329  "psubsw 64(%0), %%"#mm"1 \n\t" \
00330  #mov" %%"#mm"2, %%"#mm"6 \n\t" \
00331  #mov" %%"#mm"4, 64(%3) \n\t" \
00332  "paddsw %%"#mm"3, %%"#mm"2 \n\t" \
00333  "pmulhw (%4), %%"#mm"2 \n\t" \
00334  "psubsw %%"#mm"3, %%"#mm"6 \n\t" \
00335  "pmulhw (%4), %%"#mm"6 \n\t" \
00336  "psubsw %%"#mm"0, %%"#mm"5 \n\t" \
00337  "por (%2), %%"#mm"5 \n\t" \
00338  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
00339  "por (%2), %%"#mm"2 \n\t" \
00340  #mov" %%"#mm"1, %%"#mm"4 \n\t" \
00341  #mov" (%0), %%"#mm"3 \n\t" \
00342  "paddsw %%"#mm"6, %%"#mm"1 \n\t" \
00343  "psubsw 112(%0), %%"#mm"3 \n\t" \
00344  "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
00345  #mov" (%1), %%"#mm"0 \n\t" \
00346  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
00347  #mov" 32(%1), %%"#mm"6 \n\t" \
00348  "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
00349  #mov" %%"#mm"7, (%3) \n\t" \
00350  "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
00351  #mov" %%"#mm"5, 96(%3) \n\t" \
00352  #mov" %%"#mm"3, %%"#mm"7 \n\t" \
00353  #mov" 32(%1), %%"#mm"5 \n\t" \
00354  "psubsw %%"#mm"2, %%"#mm"7 \n\t" \
00355  "paddsw %%"#mm"2, %%"#mm"3 \n\t" \
00356  "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
00357  "paddsw %%"#mm"3, %%"#mm"0 \n\t" \
00358  "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
00359  "pmulhw (%1), %%"#mm"3 \n\t" \
00360  "por (%2), %%"#mm"0 \n\t" \
00361  "paddsw %%"#mm"7, %%"#mm"5 \n\t" \
00362  "psubsw %%"#mm"6, %%"#mm"7 \n\t" \
00363  #mov" %%"#mm"0, 16(%3) \n\t" \
00364  "paddsw %%"#mm"4, %%"#mm"5 \n\t" \
00365  #mov" %%"#mm"7, 48(%3) \n\t" \
00366  "psubsw %%"#mm"1, %%"#mm"3 \n\t" \
00367  #mov" %%"#mm"5, 80(%3) \n\t" \
00368  #mov" %%"#mm"3, 112(%3) \n\t" \
00369  : \
00370  : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
00371  "r" (out + offset), "r" (ocos_4_16)); \
00372 }
00373 
/* fdct_col_mmx: four columns per call, 64-bit mm registers moved with movq. */
00374 FDCT_COL(mmx, mm, movq)
/* fdct_col_sse2: eight columns per call, 128-bit xmm registers moved with movdqa. */
00375 FDCT_COL(sse2, xmm, movdqa)
00376 
/**
 * Row pass of the forward DCT for all eight rows, SSE2 version.
 *
 * @param in   eight rows of eight int16_t, 16 bytes apart (the column-pass result)
 * @param out  receives eight rows of eight int16_t; each row is stored with
 *             movdqa, so it has to be 16-byte aligned
 *
 * Asm operands: %0 = in, %1 = SSE2 coefficient table, %2 = rounding vector
 * (fdct_r_row_sse2), %3 = SHIFT_FRW_ROW as an immediate, %4 = out.
 *
 * Rows are processed in the pairs (0,4), (1,7), (2,6), (3,5); both rows of a
 * pair use the same 64-byte coefficient sub-table (byte offsets 0, 64, 128, 192).
 *
 * The shuffle immediates of pshuflw/pshufd are written as $27 and $78; the
 * scraped listing had the '$' prefix mangled, which made the asm invalid.
 *
 * NOTE(review): the asm statement declares no clobbers although it stores
 * through %4 and overwrites xmm0-xmm7; confirm this is acceptable for the
 * compilers in use.
 */
00377 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
00378 {
00379 __asm__ volatile(
/* H1: load the row at byte offset i (low half -> xmm2, high half -> xmm0)
 * and all four coefficient vectors of the sub-table at byte offset t. */
00380 #define FDCT_ROW_SSE2_H1(i,t) \
00381  "movq " #i "(%0), %%xmm2 \n\t" \
00382  "movq " #i "+8(%0), %%xmm0 \n\t" \
00383  "movdqa " #t "+32(%1), %%xmm3 \n\t" \
00384  "movdqa " #t "+48(%1), %%xmm7 \n\t" \
00385  "movdqa " #t "(%1), %%xmm4 \n\t" \
00386  "movdqa " #t "+16(%1), %%xmm5 \n\t"
00387 
/* H2: load the second row of a pair; only xmm3/xmm7 are reloaded because
 * FDCT_ROW_SSE2 overwrites them, while xmm4/xmm5 are only read by it. */
00388 #define FDCT_ROW_SSE2_H2(i,t) \
00389  "movq " #i "(%0), %%xmm2 \n\t" \
00390  "movq " #i "+8(%0), %%xmm0 \n\t" \
00391  "movdqa " #t "+32(%1), %%xmm3 \n\t" \
00392  "movdqa " #t "+48(%1), %%xmm7 \n\t"
00393 
/* Transform the row currently held in xmm2/xmm0 and store it at byte offset i
 * of out: pshuflw $27 reverses the four high-half words, paddsw/psubsw form
 * the sums and differences, pshufd $78 swaps the two 64-bit halves, four
 * pmaddwd multiply by the coefficients, then rounding (xmm6), psrad and
 * packssdw produce eight int16_t results. */
00394 #define FDCT_ROW_SSE2(i) \
00395  "movq %%xmm2, %%xmm1 \n\t" \
00396  "pshuflw $27, %%xmm0, %%xmm0 \n\t" \
00397  "paddsw %%xmm0, %%xmm1 \n\t" \
00398  "psubsw %%xmm0, %%xmm2 \n\t" \
00399  "punpckldq %%xmm2, %%xmm1 \n\t" \
00400  "pshufd $78, %%xmm1, %%xmm2 \n\t" \
00401  "pmaddwd %%xmm2, %%xmm3 \n\t" \
00402  "pmaddwd %%xmm1, %%xmm7 \n\t" \
00403  "pmaddwd %%xmm5, %%xmm2 \n\t" \
00404  "pmaddwd %%xmm4, %%xmm1 \n\t" \
00405  "paddd %%xmm7, %%xmm3 \n\t" \
00406  "paddd %%xmm2, %%xmm1 \n\t" \
00407  "paddd %%xmm6, %%xmm3 \n\t" \
00408  "paddd %%xmm6, %%xmm1 \n\t" \
00409  "psrad %3, %%xmm3 \n\t" \
00410  "psrad %3, %%xmm1 \n\t" \
00411  "packssdw %%xmm3, %%xmm1 \n\t" \
00412  "movdqa %%xmm1, " #i "(%4) \n\t"
00413 
/* xmm6 holds the rounding constant for the whole function. */
00414 "movdqa (%2), %%xmm6 \n\t"
00415 FDCT_ROW_SSE2_H1(0,0)
00416 FDCT_ROW_SSE2(0)
00417 FDCT_ROW_SSE2_H2(64,0)
00418 FDCT_ROW_SSE2(64)
00419 
00420 FDCT_ROW_SSE2_H1(16,64)
00421 FDCT_ROW_SSE2(16)
00422 FDCT_ROW_SSE2_H2(112,64)
00423 FDCT_ROW_SSE2(112)
00424 
00425 FDCT_ROW_SSE2_H1(32,128)
00426 FDCT_ROW_SSE2(32)
00427 FDCT_ROW_SSE2_H2(96,128)
00428 FDCT_ROW_SSE2(96)
00429 
00430 FDCT_ROW_SSE2_H1(48,192)
00431 FDCT_ROW_SSE2(48)
00432 FDCT_ROW_SSE2_H2(80,192)
00433 FDCT_ROW_SSE2(80)
00434 :
00435 : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
00436 );
00437 }
00438 
/**
 * Row pass of the forward DCT for a single row, MMX2 (pshufw) version.
 *
 * @param in     the eight int16_t of one row
 * @param out    receives the eight transformed int16_t
 * @param table  32-word coefficient sub-table for this row (64 bytes are read)
 *
 * Asm operands: %0 = in, %1 = table, %2 = fdct_r_row (rounding), %3 = out.
 *
 * The pshufw immediate is written as $0x1B; the scraped listing had the '$'
 * prefix mangled, which made the asm invalid.
 *
 * NOTE(review): the asm statement declares no clobbers although it stores
 * through %3 and overwrites mm0-mm7; confirm this is acceptable for the
 * compilers in use.
 */
00439 static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
00440 {
00441 __asm__ volatile (
/* mm5 = words 4..7 in reversed order; mm0 = low half + mm5, mm1 = low half - mm5 */
00442 "pshufw $0x1B, 8(%0), %%mm5 \n\t"
00443 "movq (%0), %%mm0 \n\t"
00444 "movq %%mm0, %%mm1 \n\t"
00445 "paddsw %%mm5, %%mm0 \n\t"
00446 "psubsw %%mm5, %%mm1 \n\t"
/* regroup the sums and differences into mm0 and mm2 as pmaddwd operands */
00447 "movq %%mm0, %%mm2 \n\t"
00448 "punpckldq %%mm1, %%mm0 \n\t"
00449 "punpckhdq %%mm1, %%mm2 \n\t"
/* eight 8-byte coefficient groups: even groups multiply mm0, odd groups mm2 */
00450 "movq (%1), %%mm1 \n\t"
00451 "movq 8(%1), %%mm3 \n\t"
00452 "movq 16(%1), %%mm4 \n\t"
00453 "movq 24(%1), %%mm5 \n\t"
00454 "movq 32(%1), %%mm6 \n\t"
00455 "movq 40(%1), %%mm7 \n\t"
00456 "pmaddwd %%mm0, %%mm1 \n\t"
00457 "pmaddwd %%mm2, %%mm3 \n\t"
00458 "pmaddwd %%mm0, %%mm4 \n\t"
00459 "pmaddwd %%mm2, %%mm5 \n\t"
00460 "pmaddwd %%mm0, %%mm6 \n\t"
00461 "pmaddwd %%mm2, %%mm7 \n\t"
00462 "pmaddwd 48(%1), %%mm0 \n\t"
00463 "pmaddwd 56(%1), %%mm2 \n\t"
/* add each even/odd pair of products */
00464 "paddd %%mm1, %%mm3 \n\t"
00465 "paddd %%mm4, %%mm5 \n\t"
00466 "paddd %%mm6, %%mm7 \n\t"
00467 "paddd %%mm0, %%mm2 \n\t"
/* add the rounding term, shift down by SHIFT_FRW_ROW, pack to int16 with saturation */
00468 "movq (%2), %%mm0 \n\t"
00469 "paddd %%mm0, %%mm3 \n\t"
00470 "paddd %%mm0, %%mm5 \n\t"
00471 "paddd %%mm0, %%mm7 \n\t"
00472 "paddd %%mm0, %%mm2 \n\t"
00473 "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
00474 "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
00475 "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
00476 "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
00477 "packssdw %%mm5, %%mm3 \n\t"
00478 "packssdw %%mm2, %%mm7 \n\t"
00479 "movq %%mm3, (%3) \n\t"
00480 "movq %%mm7, 8(%3) \n\t"
00481 :
00482 : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
00483 }
00484 
/**
 * Row pass of the forward DCT for a single row, plain MMX version.
 *
 * Same computation as fdct_row_mmx2, except that the reversal of words 4..7
 * is built from movd/punpcklwd/psrlq instead of pshufw.
 *
 * @param in     the eight int16_t of one row
 * @param out    receives the eight transformed int16_t
 * @param table  32-word coefficient sub-table for this row (64 bytes are read)
 *
 * Asm operands: %0 = in, %1 = table, %2 = fdct_r_row (rounding), %3 = out.
 *
 * The psrlq shift count is written as $0x20; the scraped listing had the '$'
 * prefix mangled, which made the asm invalid.
 *
 * NOTE(review): the asm statement declares no clobbers although it stores
 * through %3 and overwrites mm0-mm7; confirm this is acceptable for the
 * compilers in use.
 */
00485 static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
00486 {
00487 //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
00488 __asm__ volatile(
/* build mm1 = words 7,6,5,4 (reversed upper half) with interleaves and a 32-bit shift */
00489 "movd 12(%0), %%mm1 \n\t"
00490 "punpcklwd 8(%0), %%mm1 \n\t"
00491 "movq %%mm1, %%mm2 \n\t"
00492 "psrlq $0x20, %%mm1 \n\t"
00493 "movq 0(%0), %%mm0 \n\t"
00494 "punpcklwd %%mm2, %%mm1 \n\t"
/* mm0 = low half + mm1, mm5 = low half - mm1 */
00495 "movq %%mm0, %%mm5 \n\t"
00496 "paddsw %%mm1, %%mm0 \n\t"
00497 "psubsw %%mm1, %%mm5 \n\t"
/* regroup the sums and differences into mm0 and mm2 as pmaddwd operands */
00498 "movq %%mm0, %%mm2 \n\t"
00499 "punpckldq %%mm5, %%mm0 \n\t"
00500 "punpckhdq %%mm5, %%mm2 \n\t"
/* eight 8-byte coefficient groups: even groups multiply mm0, odd groups mm2 */
00501 "movq 0(%1), %%mm1 \n\t"
00502 "movq 8(%1), %%mm3 \n\t"
00503 "movq 16(%1), %%mm4 \n\t"
00504 "movq 24(%1), %%mm5 \n\t"
00505 "movq 32(%1), %%mm6 \n\t"
00506 "movq 40(%1), %%mm7 \n\t"
00507 "pmaddwd %%mm0, %%mm1 \n\t"
00508 "pmaddwd %%mm2, %%mm3 \n\t"
00509 "pmaddwd %%mm0, %%mm4 \n\t"
00510 "pmaddwd %%mm2, %%mm5 \n\t"
00511 "pmaddwd %%mm0, %%mm6 \n\t"
00512 "pmaddwd %%mm2, %%mm7 \n\t"
00513 "pmaddwd 48(%1), %%mm0 \n\t"
00514 "pmaddwd 56(%1), %%mm2 \n\t"
/* add each even/odd pair of products */
00515 "paddd %%mm1, %%mm3 \n\t"
00516 "paddd %%mm4, %%mm5 \n\t"
00517 "paddd %%mm6, %%mm7 \n\t"
00518 "paddd %%mm0, %%mm2 \n\t"
/* add the rounding term, shift down by SHIFT_FRW_ROW, pack to int16 with saturation */
00519 "movq (%2), %%mm0 \n\t"
00520 "paddd %%mm0, %%mm3 \n\t"
00521 "paddd %%mm0, %%mm5 \n\t"
00522 "paddd %%mm0, %%mm7 \n\t"
00523 "paddd %%mm0, %%mm2 \n\t"
00524 "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
00525 "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
00526 "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
00527 "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
00528 "packssdw %%mm5, %%mm3 \n\t"
00529 "packssdw %%mm2, %%mm7 \n\t"
00530 "movq %%mm3, 0(%3) \n\t"
00531 "movq %%mm7, 8(%3) \n\t"
00532 :
00533 : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
00534 }
00535 
/**
 * In-place forward DCT of an 8x8 block, plain MMX version.
 *
 * @param block  64 int16_t (eight rows of eight); overwritten with the result
 *
 * The column pass writes into a 128-byte stack buffer, then the row pass
 * reads that buffer and writes each row back into block.
 *
 * NOTE(review): no emms instruction appears in this function or its helpers;
 * presumably the caller resets the MMX state - confirm.
 */
00536 void ff_fdct_mmx(int16_t *block)
00537 {
/* 16 * 8 bytes = room for 64 int16_t of intermediate column-pass output */
00538 int64_t align_tmp[16] ATTR_ALIGN(8);
00539 int16_t * block1= (int16_t*)align_tmp;
00540 const int16_t *table= tab_frw_01234567;
00541 int i;
00542 
/* fdct_col_mmx covers four columns per call: columns 0-3, then columns 4-7 */
00543 fdct_col_mmx(block, block1, 0);
00544 fdct_col_mmx(block, block1, 4);
00545 
/* one row per iteration; every row has its own 32-word coefficient sub-table */
00546 for(i=8;i>0;i--) {
00547 fdct_row_mmx(block1, block, table);
00548 block1 += 8;
00549 table += 32;
00550 block += 8;
00551 }
00552 }
00553 
/**
 * In-place forward DCT of an 8x8 block, MMX2 version.
 *
 * Identical to ff_fdct_mmx except that the row pass uses fdct_row_mmx2
 * (pshufw based); the column pass is still the MMX one.
 *
 * @param block  64 int16_t (eight rows of eight); overwritten with the result
 *
 * NOTE(review): no emms instruction appears in this function or its helpers;
 * presumably the caller resets the MMX state - confirm.
 */
00554 void ff_fdct_mmx2(int16_t *block)
00555 {
/* 16 * 8 bytes = room for 64 int16_t of intermediate column-pass output */
00556 int64_t align_tmp[16] ATTR_ALIGN(8);
00557 int16_t *block1= (int16_t*)align_tmp;
00558 const int16_t *table= tab_frw_01234567;
00559 int i;
00560 
/* fdct_col_mmx covers four columns per call: columns 0-3, then columns 4-7 */
00561 fdct_col_mmx(block, block1, 0);
00562 fdct_col_mmx(block, block1, 4);
00563 
/* one row per iteration; every row has its own 32-word coefficient sub-table */
00564 for(i=8;i>0;i--) {
00565 fdct_row_mmx2(block1, block, table);
00566 block1 += 8;
00567 table += 32;
00568 block += 8;
00569 }
00570 }
00571 
/**
 * In-place forward DCT of an 8x8 block, SSE2 version.
 *
 * @param block  64 int16_t (eight rows of eight); overwritten with the result.
 *               The helpers access it with movdqa, so it has to be 16-byte
 *               aligned.
 *
 * One fdct_col_sse2 call handles all eight columns and one fdct_row_sse2 call
 * handles all eight rows, so no loop is needed here.
 */
00572 void ff_fdct_sse2(int16_t *block)
00573 {
/* 16-byte aligned scratch buffer for the column-pass output (64 int16_t) */
00574 int64_t align_tmp[16] ATTR_ALIGN(16);
00575 int16_t * const block1= (int16_t*)align_tmp;
00576 
00577 fdct_col_sse2(block, block1, 0);
00578 fdct_row_sse2(block1, block);
00579 }
00580 

Generated on Fri Oct 26 02:35:40 2012 for FFmpeg by doxygen 1.5.8

AltStyle によって変換されたページ (->オリジナル) /