00001 /* 00002 * Copyright (C) 2004 the ffmpeg project 00003 * 00004 * This file is part of FFmpeg. 00005 * 00006 * FFmpeg is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2.1 of the License, or (at your option) any later version. 00010 * 00011 * FFmpeg is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Lesser General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public 00017 * License along with FFmpeg; if not, write to the Free Software 00018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00019 */ 00020 00026 #include "libavutil/x86_cpu.h" 00027 #include "libavcodec/dsputil.h" 00028 #include "dsputil_mmx.h" 00029 00030 extern const uint16_t ff_vp3_idct_data[]; 00031 00032 // this is off by one or two for some cases when filter_limit is greater than 63 00033 // in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 00034 // out: p1 in mm4, p2 in mm3 00035 #define VP3_LOOP_FILTER(flim) \ 00036 "movq %%mm6, %%mm7 \n\t" \ 00037 "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \ 00038 "psrlw 3,ドル %%mm7 \n\t" \ 00039 "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \ 00040 "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \ 00041 "pxor %%mm4, %%mm2 \n\t" \ 00042 "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \ 00043 "movq %%mm2, %%mm5 \n\t" \ 00044 "paddb %%mm2, %%mm2 \n\t" \ 00045 "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \ 00046 "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \ 00047 "pcmpeqb %%mm0, %%mm0 \n\t" \ 00048 "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \ 00049 "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \ 00050 "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \ 00051 "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \ 00052 "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \ 00053 "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \ 00054 "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \ 00055 "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \ 00056 "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \ 00057 "psubusb %%mm7, %%mm6 \n\t" \ 00058 "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \ 00059 \ 00060 "movq "#flim", %%mm5 \n\t" \ 00061 "pminub %%mm5, %%mm6 \n\t" \ 00062 "pminub %%mm5, %%mm7 \n\t" \ 00063 "movq %%mm6, %%mm0 \n\t" \ 00064 "movq %%mm7, %%mm1 \n\t" \ 00065 "paddb %%mm6, %%mm6 \n\t" \ 00066 "paddb %%mm7, %%mm7 \n\t" \ 00067 "pminub %%mm5, %%mm6 \n\t" \ 00068 "pminub %%mm5, %%mm7 \n\t" \ 00069 "psubb %%mm0, %%mm6 \n\t" \ 00070 "psubb %%mm1, %%mm7 \n\t" \ 00071 "paddusb %%mm7, %%mm4 \n\t" \ 00072 "psubusb %%mm6, %%mm4 \n\t" \ 00073 "psubusb %%mm7, %%mm3 \n\t" \ 00074 "paddusb %%mm6, %%mm3 \n\t" 00075 00076 #define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \ 00077 "movd "#mm", %0 \n\t" \ 00078 "movw %w0, -1"#dst0" \n\t" \ 00079 "psrlq 32,ドル "#mm" \n\t" \ 00080 "shr 16,ドル %0 \n\t" \ 00081 "movw %w0, -1"#dst1" \n\t" \ 00082 "movd "#mm", %0 \n\t" \ 00083 "movw %w0, -1"#dst2" \n\t" \ 00084 "shr 16,ドル %0 \n\t" \ 00085 "movw %w0, -1"#dst3" \n\t" 00086 00087 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) 00088 { 00089 __asm__ volatile( 00090 "movq %0, %%mm6 \n\t" 00091 "movq %1, %%mm4 \n\t" 00092 "movq %2, %%mm2 \n\t" 00093 "movq %3, %%mm1 \n\t" 00094 00095 VP3_LOOP_FILTER(%4) 00096 00097 "movq %%mm4, %1 \n\t" 00098 "movq %%mm3, %2 \n\t" 00099 00100 : "+m" (*(uint64_t*)(src - 2*stride)), 00101 "+m" (*(uint64_t*)(src - 1*stride)), 00102 "+m" (*(uint64_t*)(src + 0*stride)), 00103 "+m" (*(uint64_t*)(src + 1*stride)) 00104 : "m"(*(uint64_t*)(bounding_values+129)) 00105 ); 00106 } 00107 00108 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) 00109 { 00110 x86_reg tmp; 00111 00112 __asm__ volatile( 00113 "movd -2(%1), %%mm6 \n\t" 00114 "movd -2(%1,%3), %%mm0 \n\t" 00115 "movd -2(%1,%3,2), %%mm1 \n\t" 00116 "movd -2(%1,%4), %%mm4 \n\t" 00117 00118 TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2) 00119 VP3_LOOP_FILTER(%5) 00120 SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) 00121 00122 STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4) 00123 STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5) 00124 00125 : "=&r"(tmp) 00126 : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride), 00127 "m"(*(uint64_t*)(bounding_values+129)) 00128 : "memory" 00129 ); 00130 } 00131 00132 /* from original comments: The Macro does IDct on 4 1-D Dcts */ 00133 #define BeginIDCT() \ 00134 "movq "I(3)", %%mm2 \n\t" \ 00135 "movq "C(3)", %%mm6 \n\t" \ 00136 "movq %%mm2, %%mm4 \n\t" \ 00137 "movq "J(5)", %%mm7 \n\t" \ 00138 "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \ 00139 "movq "C(5)", %%mm1 \n\t" \ 00140 "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \ 00141 "movq %%mm1, %%mm5 \n\t" \ 00142 "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \ 00143 "movq "I(1)", %%mm3 \n\t" \ 00144 "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \ 00145 "movq "C(1)", %%mm0 \n\t" \ 00146 "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \ 00147 "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \ 00148 "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \ 00149 "movq "J(7)", %%mm1 \n\t" \ 00150 "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \ 00151 "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \ 00152 "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \ 00153 "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \ 00154 "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \ 00155 "movq "C(7)", %%mm7 \n\t" \ 00156 "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \ 00157 "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \ 00158 "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \ 00159 "movq "I(2)", %%mm2 \n\t" \ 00160 "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \ 00161 "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \ 00162 "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \ 00163 "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \ 00164 "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \ 00165 "movq "J(6)", %%mm5 \n\t" \ 00166 "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \ 00167 "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \ 00168 "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \ 00169 "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \ 00170 "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \ 00171 "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \ 00172 "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \ 00173 "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \ 00174 "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \ 00175 "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \ 00176 "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \ 00177 "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \ 00178 "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \ 00179 "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \ 00180 "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \ 00181 "movq "C(4)", %%mm4 \n\t" \ 00182 "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \ 00183 "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ 00184 "paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ 00185 "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \ 00186 "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \ 00187 "movq "I(0)", %%mm6 \n\t" \ 00188 "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \ 00189 "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \ 00190 "movq "J(4)", %%mm3 \n\t" \ 00191 "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \ 00192 "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \ 00193 "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \ 00194 "movq %%mm6, %%mm0 \n\t" \ 00195 "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \ 00196 "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \ 00197 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \ 00198 "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \ 00199 "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \ 00200 "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \ 00201 "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \ 00202 "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \ 00203 "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \ 00204 "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \ 00205 "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \ 00206 "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \ 00207 "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */ 00208 00209 /* RowIDCT gets ready to transpose */ 00210 #define RowIDCT() \ 00211 BeginIDCT() \ 00212 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ 00213 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ 00214 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ 00215 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ 00216 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ 00217 "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \ 00218 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ 00219 "paddsw %%mm3, %%mm3 \n\t" \ 00220 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ 00221 "paddsw %%mm5, %%mm5 \n\t" \ 00222 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ 00223 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ 00224 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ 00225 "paddsw %%mm0, %%mm0 \n\t" \ 00226 "movq %%mm1, "I(1)"\n\t" /* save R1 */ \ 00227 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ 00228 00229 /* Column IDCT normalizes and stores final results */ 00230 #define ColumnIDCT() \ 00231 BeginIDCT() \ 00232 "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \ 00233 "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ 00234 "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ 00235 "psraw 4,ドル %%mm2 \n\t" /* r2 = NR2 */ \ 00236 "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ 00237 "psraw 4,ドル %%mm1 \n\t" /* r1 = NR1 */ \ 00238 "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ 00239 "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ 00240 "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \ 00241 "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \ 00242 "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \ 00243 "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ 00244 "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \ 00245 "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \ 00246 "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ 00247 "psraw 4,ドル %%mm4 \n\t" /* r4 = NR4 */ \ 00248 "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ 00249 "psraw 4,ドル %%mm3 \n\t" /* r3 = NR3 */ \ 00250 "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \ 00251 "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \ 00252 "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ 00253 "psraw 4,ドル %%mm6 \n\t" /* r6 = NR6 */ \ 00254 "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \ 00255 "psraw 4,ドル %%mm5 \n\t" /* r5 = NR5 */ \ 00256 "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \ 00257 "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ 00258 "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \ 00259 "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \ 00260 "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \ 00261 "psraw 4,ドル %%mm7 \n\t" /* r7 = NR7 */ \ 00262 "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \ 00263 "psraw 4,ドル %%mm0 \n\t" /* r0 = NR0 */ \ 00264 "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \ 00265 "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \ 00266 "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */ 00267 00268 /* Following macro does two 4x4 transposes in place. 00269 00270 At entry (we assume): 00271 00272 r0 = a3 a2 a1 a0 00273 I(1) = b3 b2 b1 b0 00274 r2 = c3 c2 c1 c0 00275 r3 = d3 d2 d1 d0 00276 00277 r4 = e3 e2 e1 e0 00278 r5 = f3 f2 f1 f0 00279 r6 = g3 g2 g1 g0 00280 r7 = h3 h2 h1 h0 00281 00282 At exit, we have: 00283 00284 I(0) = d0 c0 b0 a0 00285 I(1) = d1 c1 b1 a1 00286 I(2) = d2 c2 b2 a2 00287 I(3) = d3 c3 b3 a3 00288 00289 J(4) = h0 g0 f0 e0 00290 J(5) = h1 g1 f1 e1 00291 J(6) = h2 g2 f2 e2 00292 J(7) = h3 g3 f3 e3 00293 00294 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. 00295 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. 00296 00297 Since r1 is free at entry, we calculate the Js first. */ 00298 #define Transpose() \ 00299 "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \ 00300 "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \ 00301 "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \ 00302 "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \ 00303 "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \ 00304 "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \ 00305 "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \ 00306 "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \ 00307 "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \ 00308 "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \ 00309 "movq %%mm4, "J(4)"\n\t" \ 00310 "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \ 00311 "movq %%mm5, "J(5)"\n\t" \ 00312 "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \ 00313 "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \ 00314 "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \ 00315 "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \ 00316 "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \ 00317 "movq %%mm6, "J(7)"\n\t" \ 00318 "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \ 00319 "movq %%mm1, "J(6)"\n\t" \ 00320 "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \ 00321 "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \ 00322 "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \ 00323 "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \ 00324 "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \ 00325 "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \ 00326 "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \ 00327 "movq %%mm0, "I(0)"\n\t" \ 00328 "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \ 00329 "movq %%mm1, "I(1)"\n\t" \ 00330 "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \ 00331 "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \ 00332 "movq %%mm4, "I(3)"\n\t" \ 00333 "movq %%mm2, "I(2)"\n\t" 00334 00335 void ff_vp3_idct_mmx(int16_t *output_data) 00336 { 00337 /* eax = quantized input 00338 * ebx = dequantizer matrix 00339 * ecx = IDCT constants 00340 * M(I) = ecx + MaskOffset(0) + I * 8 00341 * C(I) = ecx + CosineOffset(32) + (I-1) * 8 00342 * edx = output 00343 * r0..r7 = mm0..mm7 00344 */ 00345 00346 #define C(x) AV_STRINGIFY(16*(x-1))"(%1)" 00347 #define OC_8 "%2" 00348 00349 /* at this point, function has completed dequantization + dezigzag + 00350 * partial transposition; now do the idct itself */ 00351 #define I(x) AV_STRINGIFY(16* x )"(%0)" 00352 #define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)" 00353 00354 __asm__ volatile ( 00355 RowIDCT() 00356 Transpose() 00357 00358 #undef I 00359 #undef J 00360 #define I(x) AV_STRINGIFY(16* x + 64)"(%0)" 00361 #define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)" 00362 00363 RowIDCT() 00364 Transpose() 00365 00366 #undef I 00367 #undef J 00368 #define I(x) AV_STRINGIFY(16*x)"(%0)" 00369 #define J(x) AV_STRINGIFY(16*x)"(%0)" 00370 00371 ColumnIDCT() 00372 00373 #undef I 00374 #undef J 00375 #define I(x) AV_STRINGIFY(16*x + 8)"(%0)" 00376 #define J(x) AV_STRINGIFY(16*x + 8)"(%0)" 00377 00378 ColumnIDCT() 00379 :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) 00380 ); 00381 #undef I 00382 #undef J 00383 00384 } 00385 00386 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) 00387 { 00388 ff_vp3_idct_mmx(block); 00389 put_signed_pixels_clamped_mmx(block, dest, line_size); 00390 } 00391 00392 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) 00393 { 00394 ff_vp3_idct_mmx(block); 00395 add_pixels_clamped_mmx(block, dest, line_size); 00396 }