libavcodec/x86/mpegvideo_mmx.c

Go to the documentation of this file.
00001 /*
00002  * The simplest mpeg encoder (well, it was the simplest!)
00003  * Copyright (c) 2000,2001 Fabrice Bellard
00004  *
00005  * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
00006  * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
00007  *
00008  * This file is part of FFmpeg.
00009  *
00010  * FFmpeg is free software; you can redistribute it and/or
00011  * modify it under the terms of the GNU Lesser General Public
00012  * License as published by the Free Software Foundation; either
00013  * version 2.1 of the License, or (at your option) any later version.
00014  *
00015  * FFmpeg is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
00018  * Lesser General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU Lesser General Public
00021  * License along with FFmpeg; if not, write to the Free Software
00022  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00023  */
00024 
#include <assert.h>

#include "libavutil/x86_cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"
00030 
/* Defined in another translation unit. It is not referenced by the code
 * visible in this file; presumably it is consumed by the included
 * mpegvideo_mmx_template.c -- TODO confirm. */
extern uint16_t inv_zigzag_direct16[64];
00032 
00033 
00034 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
00035 DCTELEM *block, int n, int qscale)
00036 {
00037 x86_reg level, qmul, qadd, nCoeffs;
00038 
00039 qmul = qscale << 1;
00040 
00041 assert(s->block_last_index[n]>=0 || s->h263_aic);
00042 
00043 if (!s->h263_aic) {
00044 if (n < 4)
00045 level = block[0] * s->y_dc_scale;
00046 else
00047 level = block[0] * s->c_dc_scale;
00048 qadd = (qscale - 1) | 1;
00049 }else{
00050 qadd = 0;
00051 level= block[0];
00052 }
00053 if(s->ac_pred)
00054 nCoeffs=63;
00055 else
00056 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00057 //printf("%d %d ", qmul, qadd);
00058 __asm__ volatile(
00059 "movd %1, %%mm6 \n\t" //qmul
00060 "packssdw %%mm6, %%mm6 \n\t"
00061 "packssdw %%mm6, %%mm6 \n\t"
00062 "movd %2, %%mm5 \n\t" //qadd
00063 "pxor %%mm7, %%mm7 \n\t"
00064 "packssdw %%mm5, %%mm5 \n\t"
00065 "packssdw %%mm5, %%mm5 \n\t"
00066 "psubw %%mm5, %%mm7 \n\t"
00067 "pxor %%mm4, %%mm4 \n\t"
00068 ASMALIGN(4)
00069 "1: \n\t"
00070 "movq (%0, %3), %%mm0 \n\t"
00071 "movq 8(%0, %3), %%mm1 \n\t"
00072 
00073 "pmullw %%mm6, %%mm0 \n\t"
00074 "pmullw %%mm6, %%mm1 \n\t"
00075 
00076 "movq (%0, %3), %%mm2 \n\t"
00077 "movq 8(%0, %3), %%mm3 \n\t"
00078 
00079 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
00080 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
00081 
00082 "pxor %%mm2, %%mm0 \n\t"
00083 "pxor %%mm3, %%mm1 \n\t"
00084 
00085 "paddw %%mm7, %%mm0 \n\t"
00086 "paddw %%mm7, %%mm1 \n\t"
00087 
00088 "pxor %%mm0, %%mm2 \n\t"
00089 "pxor %%mm1, %%mm3 \n\t"
00090 
00091 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
00092 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
00093 
00094 "pandn %%mm2, %%mm0 \n\t"
00095 "pandn %%mm3, %%mm1 \n\t"
00096 
00097 "movq %%mm0, (%0, %3) \n\t"
00098 "movq %%mm1, 8(%0, %3) \n\t"
00099 
00100 "add 16,ドル %3 \n\t"
00101 "jng 1b \n\t"
00102 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00103 : "memory"
00104 );
00105 block[0]= level;
00106 }
00107 
00108 
00109 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
00110 DCTELEM *block, int n, int qscale)
00111 {
00112 x86_reg qmul, qadd, nCoeffs;
00113 
00114 qmul = qscale << 1;
00115 qadd = (qscale - 1) | 1;
00116 
00117 assert(s->block_last_index[n]>=0 || s->h263_aic);
00118 
00119 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00120 //printf("%d %d ", qmul, qadd);
00121 __asm__ volatile(
00122 "movd %1, %%mm6 \n\t" //qmul
00123 "packssdw %%mm6, %%mm6 \n\t"
00124 "packssdw %%mm6, %%mm6 \n\t"
00125 "movd %2, %%mm5 \n\t" //qadd
00126 "pxor %%mm7, %%mm7 \n\t"
00127 "packssdw %%mm5, %%mm5 \n\t"
00128 "packssdw %%mm5, %%mm5 \n\t"
00129 "psubw %%mm5, %%mm7 \n\t"
00130 "pxor %%mm4, %%mm4 \n\t"
00131 ASMALIGN(4)
00132 "1: \n\t"
00133 "movq (%0, %3), %%mm0 \n\t"
00134 "movq 8(%0, %3), %%mm1 \n\t"
00135 
00136 "pmullw %%mm6, %%mm0 \n\t"
00137 "pmullw %%mm6, %%mm1 \n\t"
00138 
00139 "movq (%0, %3), %%mm2 \n\t"
00140 "movq 8(%0, %3), %%mm3 \n\t"
00141 
00142 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
00143 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
00144 
00145 "pxor %%mm2, %%mm0 \n\t"
00146 "pxor %%mm3, %%mm1 \n\t"
00147 
00148 "paddw %%mm7, %%mm0 \n\t"
00149 "paddw %%mm7, %%mm1 \n\t"
00150 
00151 "pxor %%mm0, %%mm2 \n\t"
00152 "pxor %%mm1, %%mm3 \n\t"
00153 
00154 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
00155 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
00156 
00157 "pandn %%mm2, %%mm0 \n\t"
00158 "pandn %%mm3, %%mm1 \n\t"
00159 
00160 "movq %%mm0, (%0, %3) \n\t"
00161 "movq %%mm1, 8(%0, %3) \n\t"
00162 
00163 "add 16,ドル %3 \n\t"
00164 "jng 1b \n\t"
00165 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00166 : "memory"
00167 );
00168 }
00169 
00170 
/*
  NK:
  Note: looking at PARANOID:
  "enable all paranoid tests for rounding, overflows, etc..."

#ifdef PARANOID
                if (level < -2048 || level > 2047)
                    fprintf(stderr, "unquant error %d %d\n", i, level);
#endif
  We can assume that the result of the two multiplications cannot be greater
  than 0xFFFF, i.e. that it fits in 16 bits, so we use only the PMULLW
  instruction here and can avoid a complex multiplication.
=====================================================
 Full formula for the multiplication of 2 integer numbers
 which are represented as high:low words:
 input: value1 = high1:low1
        value2 = high2:low2
 output: value3 = value1*value2
 value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
 This means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4,
 but this algorithm will compute only 0x66cb0ce4;
 this is limited by the 16-bit size of the operands.
 ---------------------------------
 tlow1 = high1*low2
 tlow2 = high2*low1
 tlow1 = tlow1 + tlow2
 high3:low3 = low1*low2
 high3 += tlow1
*/
00200 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
00201 DCTELEM *block, int n, int qscale)
00202 {
00203 x86_reg nCoeffs;
00204 const uint16_t *quant_matrix;
00205 int block0;
00206 
00207 assert(s->block_last_index[n]>=0);
00208 
00209 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00210 
00211 if (n < 4)
00212 block0 = block[0] * s->y_dc_scale;
00213 else
00214 block0 = block[0] * s->c_dc_scale;
00215 /* XXX: only mpeg1 */
00216 quant_matrix = s->intra_matrix;
00217 __asm__ volatile(
00218 "pcmpeqw %%mm7, %%mm7 \n\t"
00219 "psrlw 15,ドル %%mm7 \n\t"
00220 "movd %2, %%mm6 \n\t"
00221 "packssdw %%mm6, %%mm6 \n\t"
00222 "packssdw %%mm6, %%mm6 \n\t"
00223 "mov %3, %%"REG_a" \n\t"
00224 ASMALIGN(4)
00225 "1: \n\t"
00226 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00227 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00228 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00229 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00230 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
00231 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
00232 "pxor %%mm2, %%mm2 \n\t"
00233 "pxor %%mm3, %%mm3 \n\t"
00234 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
00235 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
00236 "pxor %%mm2, %%mm0 \n\t"
00237 "pxor %%mm3, %%mm1 \n\t"
00238 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
00239 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
00240 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
00241 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
00242 "pxor %%mm4, %%mm4 \n\t"
00243 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
00244 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00245 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00246 "psraw 3,ドル %%mm0 \n\t"
00247 "psraw 3,ドル %%mm1 \n\t"
00248 "psubw %%mm7, %%mm0 \n\t"
00249 "psubw %%mm7, %%mm1 \n\t"
00250 "por %%mm7, %%mm0 \n\t"
00251 "por %%mm7, %%mm1 \n\t"
00252 "pxor %%mm2, %%mm0 \n\t"
00253 "pxor %%mm3, %%mm1 \n\t"
00254 "psubw %%mm2, %%mm0 \n\t"
00255 "psubw %%mm3, %%mm1 \n\t"
00256 "pandn %%mm0, %%mm4 \n\t"
00257 "pandn %%mm1, %%mm5 \n\t"
00258 "movq %%mm4, (%0, %%"REG_a") \n\t"
00259 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00260 
00261 "add 16,ドル %%"REG_a" \n\t"
00262 "js 1b \n\t"
00263 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00264 : "%"REG_a, "memory"
00265 );
00266 block[0]= block0;
00267 }
00268 
00269 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
00270 DCTELEM *block, int n, int qscale)
00271 {
00272 x86_reg nCoeffs;
00273 const uint16_t *quant_matrix;
00274 
00275 assert(s->block_last_index[n]>=0);
00276 
00277 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00278 
00279 quant_matrix = s->inter_matrix;
00280 __asm__ volatile(
00281 "pcmpeqw %%mm7, %%mm7 \n\t"
00282 "psrlw 15,ドル %%mm7 \n\t"
00283 "movd %2, %%mm6 \n\t"
00284 "packssdw %%mm6, %%mm6 \n\t"
00285 "packssdw %%mm6, %%mm6 \n\t"
00286 "mov %3, %%"REG_a" \n\t"
00287 ASMALIGN(4)
00288 "1: \n\t"
00289 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00290 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00291 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00292 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00293 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
00294 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
00295 "pxor %%mm2, %%mm2 \n\t"
00296 "pxor %%mm3, %%mm3 \n\t"
00297 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
00298 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
00299 "pxor %%mm2, %%mm0 \n\t"
00300 "pxor %%mm3, %%mm1 \n\t"
00301 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
00302 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
00303 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
00304 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
00305 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
00306 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
00307 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
00308 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
00309 "pxor %%mm4, %%mm4 \n\t"
00310 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
00311 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00312 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00313 "psraw 4,ドル %%mm0 \n\t"
00314 "psraw 4,ドル %%mm1 \n\t"
00315 "psubw %%mm7, %%mm0 \n\t"
00316 "psubw %%mm7, %%mm1 \n\t"
00317 "por %%mm7, %%mm0 \n\t"
00318 "por %%mm7, %%mm1 \n\t"
00319 "pxor %%mm2, %%mm0 \n\t"
00320 "pxor %%mm3, %%mm1 \n\t"
00321 "psubw %%mm2, %%mm0 \n\t"
00322 "psubw %%mm3, %%mm1 \n\t"
00323 "pandn %%mm0, %%mm4 \n\t"
00324 "pandn %%mm1, %%mm5 \n\t"
00325 "movq %%mm4, (%0, %%"REG_a") \n\t"
00326 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00327 
00328 "add 16,ドル %%"REG_a" \n\t"
00329 "js 1b \n\t"
00330 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00331 : "%"REG_a, "memory"
00332 );
00333 }
00334 
00335 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
00336 DCTELEM *block, int n, int qscale)
00337 {
00338 x86_reg nCoeffs;
00339 const uint16_t *quant_matrix;
00340 int block0;
00341 
00342 assert(s->block_last_index[n]>=0);
00343 
00344 if(s->alternate_scan) nCoeffs= 63; //FIXME
00345 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00346 
00347 if (n < 4)
00348 block0 = block[0] * s->y_dc_scale;
00349 else
00350 block0 = block[0] * s->c_dc_scale;
00351 quant_matrix = s->intra_matrix;
00352 __asm__ volatile(
00353 "pcmpeqw %%mm7, %%mm7 \n\t"
00354 "psrlw 15,ドル %%mm7 \n\t"
00355 "movd %2, %%mm6 \n\t"
00356 "packssdw %%mm6, %%mm6 \n\t"
00357 "packssdw %%mm6, %%mm6 \n\t"
00358 "mov %3, %%"REG_a" \n\t"
00359 ASMALIGN(4)
00360 "1: \n\t"
00361 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00362 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00363 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00364 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00365 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
00366 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
00367 "pxor %%mm2, %%mm2 \n\t"
00368 "pxor %%mm3, %%mm3 \n\t"
00369 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
00370 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
00371 "pxor %%mm2, %%mm0 \n\t"
00372 "pxor %%mm3, %%mm1 \n\t"
00373 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
00374 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
00375 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
00376 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
00377 "pxor %%mm4, %%mm4 \n\t"
00378 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
00379 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00380 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00381 "psraw 3,ドル %%mm0 \n\t"
00382 "psraw 3,ドル %%mm1 \n\t"
00383 "pxor %%mm2, %%mm0 \n\t"
00384 "pxor %%mm3, %%mm1 \n\t"
00385 "psubw %%mm2, %%mm0 \n\t"
00386 "psubw %%mm3, %%mm1 \n\t"
00387 "pandn %%mm0, %%mm4 \n\t"
00388 "pandn %%mm1, %%mm5 \n\t"
00389 "movq %%mm4, (%0, %%"REG_a") \n\t"
00390 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00391 
00392 "add 16,ドル %%"REG_a" \n\t"
00393 "jng 1b \n\t"
00394 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00395 : "%"REG_a, "memory"
00396 );
00397 block[0]= block0;
00398 //Note, we do not do mismatch control for intra as errors cannot accumulate
00399 }
00400 
00401 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
00402 DCTELEM *block, int n, int qscale)
00403 {
00404 x86_reg nCoeffs;
00405 const uint16_t *quant_matrix;
00406 
00407 assert(s->block_last_index[n]>=0);
00408 
00409 if(s->alternate_scan) nCoeffs= 63; //FIXME
00410 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00411 
00412 quant_matrix = s->inter_matrix;
00413 __asm__ volatile(
00414 "pcmpeqw %%mm7, %%mm7 \n\t"
00415 "psrlq 48,ドル %%mm7 \n\t"
00416 "movd %2, %%mm6 \n\t"
00417 "packssdw %%mm6, %%mm6 \n\t"
00418 "packssdw %%mm6, %%mm6 \n\t"
00419 "mov %3, %%"REG_a" \n\t"
00420 ASMALIGN(4)
00421 "1: \n\t"
00422 "movq (%0, %%"REG_a"), %%mm0 \n\t"
00423 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
00424 "movq (%1, %%"REG_a"), %%mm4 \n\t"
00425 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
00426 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
00427 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
00428 "pxor %%mm2, %%mm2 \n\t"
00429 "pxor %%mm3, %%mm3 \n\t"
00430 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
00431 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
00432 "pxor %%mm2, %%mm0 \n\t"
00433 "pxor %%mm3, %%mm1 \n\t"
00434 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
00435 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
00436 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
00437 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
00438 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
00439 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
00440 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
00441 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
00442 "pxor %%mm4, %%mm4 \n\t"
00443 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
00444 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00445 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00446 "psrlw 4,ドル %%mm0 \n\t"
00447 "psrlw 4,ドル %%mm1 \n\t"
00448 "pxor %%mm2, %%mm0 \n\t"
00449 "pxor %%mm3, %%mm1 \n\t"
00450 "psubw %%mm2, %%mm0 \n\t"
00451 "psubw %%mm3, %%mm1 \n\t"
00452 "pandn %%mm0, %%mm4 \n\t"
00453 "pandn %%mm1, %%mm5 \n\t"
00454 "pxor %%mm4, %%mm7 \n\t"
00455 "pxor %%mm5, %%mm7 \n\t"
00456 "movq %%mm4, (%0, %%"REG_a") \n\t"
00457 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
00458 
00459 "add 16,ドル %%"REG_a" \n\t"
00460 "jng 1b \n\t"
00461 "movd 124(%0, %3), %%mm0 \n\t"
00462 "movq %%mm7, %%mm6 \n\t"
00463 "psrlq 32,ドル %%mm7 \n\t"
00464 "pxor %%mm6, %%mm7 \n\t"
00465 "movq %%mm7, %%mm6 \n\t"
00466 "psrlq 16,ドル %%mm7 \n\t"
00467 "pxor %%mm6, %%mm7 \n\t"
00468 "pslld 31,ドル %%mm7 \n\t"
00469 "psrlq 15,ドル %%mm7 \n\t"
00470 "pxor %%mm7, %%mm0 \n\t"
00471 "movd %%mm0, 124(%0, %3) \n\t"
00472 
00473 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
00474 : "%"REG_a, "memory"
00475 );
00476 }
00477 
00478 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
00479 const int intra= s->mb_intra;
00480 int *sum= s->dct_error_sum[intra];
00481 uint16_t *offset= s->dct_offset[intra];
00482 
00483 s->dct_count[intra]++;
00484 
00485 __asm__ volatile(
00486 "pxor %%mm7, %%mm7 \n\t"
00487 "1: \n\t"
00488 "pxor %%mm0, %%mm0 \n\t"
00489 "pxor %%mm1, %%mm1 \n\t"
00490 "movq (%0), %%mm2 \n\t"
00491 "movq 8(%0), %%mm3 \n\t"
00492 "pcmpgtw %%mm2, %%mm0 \n\t"
00493 "pcmpgtw %%mm3, %%mm1 \n\t"
00494 "pxor %%mm0, %%mm2 \n\t"
00495 "pxor %%mm1, %%mm3 \n\t"
00496 "psubw %%mm0, %%mm2 \n\t"
00497 "psubw %%mm1, %%mm3 \n\t"
00498 "movq %%mm2, %%mm4 \n\t"
00499 "movq %%mm3, %%mm5 \n\t"
00500 "psubusw (%2), %%mm2 \n\t"
00501 "psubusw 8(%2), %%mm3 \n\t"
00502 "pxor %%mm0, %%mm2 \n\t"
00503 "pxor %%mm1, %%mm3 \n\t"
00504 "psubw %%mm0, %%mm2 \n\t"
00505 "psubw %%mm1, %%mm3 \n\t"
00506 "movq %%mm2, (%0) \n\t"
00507 "movq %%mm3, 8(%0) \n\t"
00508 "movq %%mm4, %%mm2 \n\t"
00509 "movq %%mm5, %%mm3 \n\t"
00510 "punpcklwd %%mm7, %%mm4 \n\t"
00511 "punpckhwd %%mm7, %%mm2 \n\t"
00512 "punpcklwd %%mm7, %%mm5 \n\t"
00513 "punpckhwd %%mm7, %%mm3 \n\t"
00514 "paddd (%1), %%mm4 \n\t"
00515 "paddd 8(%1), %%mm2 \n\t"
00516 "paddd 16(%1), %%mm5 \n\t"
00517 "paddd 24(%1), %%mm3 \n\t"
00518 "movq %%mm4, (%1) \n\t"
00519 "movq %%mm2, 8(%1) \n\t"
00520 "movq %%mm5, 16(%1) \n\t"
00521 "movq %%mm3, 24(%1) \n\t"
00522 "add 16,ドル %0 \n\t"
00523 "add 32,ドル %1 \n\t"
00524 "add 16,ドル %2 \n\t"
00525 "cmp %3, %0 \n\t"
00526 " jb 1b \n\t"
00527 : "+r" (block), "+r" (sum), "+r" (offset)
00528 : "r"(block+64)
00529 );
00530 }
00531 
00532 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
00533 const int intra= s->mb_intra;
00534 int *sum= s->dct_error_sum[intra];
00535 uint16_t *offset= s->dct_offset[intra];
00536 
00537 s->dct_count[intra]++;
00538 
00539 __asm__ volatile(
00540 "pxor %%xmm7, %%xmm7 \n\t"
00541 "1: \n\t"
00542 "pxor %%xmm0, %%xmm0 \n\t"
00543 "pxor %%xmm1, %%xmm1 \n\t"
00544 "movdqa (%0), %%xmm2 \n\t"
00545 "movdqa 16(%0), %%xmm3 \n\t"
00546 "pcmpgtw %%xmm2, %%xmm0 \n\t"
00547 "pcmpgtw %%xmm3, %%xmm1 \n\t"
00548 "pxor %%xmm0, %%xmm2 \n\t"
00549 "pxor %%xmm1, %%xmm3 \n\t"
00550 "psubw %%xmm0, %%xmm2 \n\t"
00551 "psubw %%xmm1, %%xmm3 \n\t"
00552 "movdqa %%xmm2, %%xmm4 \n\t"
00553 "movdqa %%xmm3, %%xmm5 \n\t"
00554 "psubusw (%2), %%xmm2 \n\t"
00555 "psubusw 16(%2), %%xmm3 \n\t"
00556 "pxor %%xmm0, %%xmm2 \n\t"
00557 "pxor %%xmm1, %%xmm3 \n\t"
00558 "psubw %%xmm0, %%xmm2 \n\t"
00559 "psubw %%xmm1, %%xmm3 \n\t"
00560 "movdqa %%xmm2, (%0) \n\t"
00561 "movdqa %%xmm3, 16(%0) \n\t"
00562 "movdqa %%xmm4, %%xmm6 \n\t"
00563 "movdqa %%xmm5, %%xmm0 \n\t"
00564 "punpcklwd %%xmm7, %%xmm4 \n\t"
00565 "punpckhwd %%xmm7, %%xmm6 \n\t"
00566 "punpcklwd %%xmm7, %%xmm5 \n\t"
00567 "punpckhwd %%xmm7, %%xmm0 \n\t"
00568 "paddd (%1), %%xmm4 \n\t"
00569 "paddd 16(%1), %%xmm6 \n\t"
00570 "paddd 32(%1), %%xmm5 \n\t"
00571 "paddd 48(%1), %%xmm0 \n\t"
00572 "movdqa %%xmm4, (%1) \n\t"
00573 "movdqa %%xmm6, 16(%1) \n\t"
00574 "movdqa %%xmm5, 32(%1) \n\t"
00575 "movdqa %%xmm0, 48(%1) \n\t"
00576 "add 32,ドル %0 \n\t"
00577 "add 64,ドル %1 \n\t"
00578 "add 32,ドル %2 \n\t"
00579 "cmp %3, %0 \n\t"
00580 " jb 1b \n\t"
00581 : "+r" (block), "+r" (sum), "+r" (offset)
00582 : "r"(block+64)
00583 );
00584 }
00585 
00586 #if HAVE_SSSE3
00587 #define HAVE_SSSE3_BAK
00588 #endif
00589 #undef HAVE_SSSE3
00590 #define HAVE_SSSE3 0
00591 
00592 #undef HAVE_SSE2
00593 #undef HAVE_MMX2
00594 #define HAVE_SSE2 0
00595 #define HAVE_MMX2 0
00596 #define RENAME(a) a ## _MMX
00597 #define RENAMEl(a) a ## _mmx
00598 #include "mpegvideo_mmx_template.c"
00599 
00600 #undef HAVE_MMX2
00601 #define HAVE_MMX2 1
00602 #undef RENAME
00603 #undef RENAMEl
00604 #define RENAME(a) a ## _MMX2
00605 #define RENAMEl(a) a ## _mmx2
00606 #include "mpegvideo_mmx_template.c"
00607 
00608 #undef HAVE_SSE2
00609 #define HAVE_SSE2 1
00610 #undef RENAME
00611 #undef RENAMEl
00612 #define RENAME(a) a ## _SSE2
00613 #define RENAMEl(a) a ## _sse2
00614 #include "mpegvideo_mmx_template.c"
00615 
00616 #ifdef HAVE_SSSE3_BAK
00617 #undef HAVE_SSSE3
00618 #define HAVE_SSSE3 1
00619 #undef RENAME
00620 #undef RENAMEl
00621 #define RENAME(a) a ## _SSSE3
00622 #define RENAMEl(a) a ## _sse2
00623 #include "mpegvideo_mmx_template.c"
00624 #endif
00625 
00626 void MPV_common_init_mmx(MpegEncContext *s)
00627 {
00628 if (mm_flags & FF_MM_MMX) {
00629 const int dct_algo = s->avctx->dct_algo;
00630 
00631 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
00632 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
00633 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
00634 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
00635 if(!(s->flags & CODEC_FLAG_BITEXACT))
00636 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
00637 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
00638 
00639 if (mm_flags & FF_MM_SSE2) {
00640 s->denoise_dct= denoise_dct_sse2;
00641 } else {
00642 s->denoise_dct= denoise_dct_mmx;
00643 }
00644 
00645 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
00646 #if HAVE_SSSE3
00647 if(mm_flags & FF_MM_SSSE3){
00648 s->dct_quantize= dct_quantize_SSSE3;
00649 } else
00650 #endif
00651 if(mm_flags & FF_MM_SSE2){
00652 s->dct_quantize= dct_quantize_SSE2;
00653 } else if(mm_flags & FF_MM_MMXEXT){
00654 s->dct_quantize= dct_quantize_MMX2;
00655 } else {
00656 s->dct_quantize= dct_quantize_MMX;
00657 }
00658 }
00659 }
00660 }
Generated on Fri Oct 26 02:35:40 2012 for FFmpeg by doxygen 1.5.8