libavcodec/x86/dsputil_mmx_avg_template.c

Go to the documentation of this file.
/*
 * DSP utils : average functions are compiled twice for 3dnow/mmx2
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

00027 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
00028  clobber bug - now it will work with 2.95.2 and also with -fPIC
00029  */
00030 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00031 {
00032 __asm__ volatile(
00033 "lea (%3, %3), %%"REG_a" \n\t"
00034 "1: \n\t"
00035 "movq (%1), %%mm0 \n\t"
00036 "movq (%1, %3), %%mm1 \n\t"
00037 PAVGB " 1(%1), %%mm0 \n\t"
00038 PAVGB " 1(%1, %3), %%mm1 \n\t"
00039 "movq %%mm0, (%2) \n\t"
00040 "movq %%mm1, (%2, %3) \n\t"
00041 "add %%"REG_a", %1 \n\t"
00042 "add %%"REG_a", %2 \n\t"
00043 "movq (%1), %%mm0 \n\t"
00044 "movq (%1, %3), %%mm1 \n\t"
00045 PAVGB " 1(%1), %%mm0 \n\t"
00046 PAVGB " 1(%1, %3), %%mm1 \n\t"
00047 "add %%"REG_a", %1 \n\t"
00048 "movq %%mm0, (%2) \n\t"
00049 "movq %%mm1, (%2, %3) \n\t"
00050 "add %%"REG_a", %2 \n\t"
00051 "subl 4,ドル %0 \n\t"
00052 "jnz 1b \n\t"
00053 :"+g"(h), "+S"(pixels), "+D"(block)
00054 :"r" ((x86_reg)line_size)
00055 :"%"REG_a, "memory");
00056 }
00057 
00058 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00059 {
00060 __asm__ volatile(
00061 "testl 1,ドル %0 \n\t"
00062 " jz 1f \n\t"
00063 "movd (%1), %%mm0 \n\t"
00064 "movd (%2), %%mm1 \n\t"
00065 "add %4, %1 \n\t"
00066 "add 4,ドル %2 \n\t"
00067 PAVGB " %%mm1, %%mm0 \n\t"
00068 "movd %%mm0, (%3) \n\t"
00069 "add %5, %3 \n\t"
00070 "decl %0 \n\t"
00071 "1: \n\t"
00072 "movd (%1), %%mm0 \n\t"
00073 "add %4, %1 \n\t"
00074 "movd (%1), %%mm1 \n\t"
00075 "movd (%2), %%mm2 \n\t"
00076 "movd 4(%2), %%mm3 \n\t"
00077 "add %4, %1 \n\t"
00078 PAVGB " %%mm2, %%mm0 \n\t"
00079 PAVGB " %%mm3, %%mm1 \n\t"
00080 "movd %%mm0, (%3) \n\t"
00081 "add %5, %3 \n\t"
00082 "movd %%mm1, (%3) \n\t"
00083 "add %5, %3 \n\t"
00084 "movd (%1), %%mm0 \n\t"
00085 "add %4, %1 \n\t"
00086 "movd (%1), %%mm1 \n\t"
00087 "movd 8(%2), %%mm2 \n\t"
00088 "movd 12(%2), %%mm3 \n\t"
00089 "add %4, %1 \n\t"
00090 PAVGB " %%mm2, %%mm0 \n\t"
00091 PAVGB " %%mm3, %%mm1 \n\t"
00092 "movd %%mm0, (%3) \n\t"
00093 "add %5, %3 \n\t"
00094 "movd %%mm1, (%3) \n\t"
00095 "add %5, %3 \n\t"
00096 "add 16,ドル %2 \n\t"
00097 "subl 4,ドル %0 \n\t"
00098 "jnz 1b \n\t"
00099 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00100 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00101 #else
00102 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00103 #endif
00104 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00105 :"memory");
00106 }
00107 
00108 
00109 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00110 {
00111 __asm__ volatile(
00112 "testl 1,ドル %0 \n\t"
00113 " jz 1f \n\t"
00114 "movq (%1), %%mm0 \n\t"
00115 "movq (%2), %%mm1 \n\t"
00116 "add %4, %1 \n\t"
00117 "add 8,ドル %2 \n\t"
00118 PAVGB " %%mm1, %%mm0 \n\t"
00119 "movq %%mm0, (%3) \n\t"
00120 "add %5, %3 \n\t"
00121 "decl %0 \n\t"
00122 "1: \n\t"
00123 "movq (%1), %%mm0 \n\t"
00124 "add %4, %1 \n\t"
00125 "movq (%1), %%mm1 \n\t"
00126 "add %4, %1 \n\t"
00127 PAVGB " (%2), %%mm0 \n\t"
00128 PAVGB " 8(%2), %%mm1 \n\t"
00129 "movq %%mm0, (%3) \n\t"
00130 "add %5, %3 \n\t"
00131 "movq %%mm1, (%3) \n\t"
00132 "add %5, %3 \n\t"
00133 "movq (%1), %%mm0 \n\t"
00134 "add %4, %1 \n\t"
00135 "movq (%1), %%mm1 \n\t"
00136 "add %4, %1 \n\t"
00137 PAVGB " 16(%2), %%mm0 \n\t"
00138 PAVGB " 24(%2), %%mm1 \n\t"
00139 "movq %%mm0, (%3) \n\t"
00140 "add %5, %3 \n\t"
00141 "movq %%mm1, (%3) \n\t"
00142 "add %5, %3 \n\t"
00143 "add 32,ドル %2 \n\t"
00144 "subl 4,ドル %0 \n\t"
00145 "jnz 1b \n\t"
00146 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00147 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00148 #else
00149 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00150 #endif
00151 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00152 :"memory");
00153 //the following should be used, though better not with gcc ...
00154 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
00155  :"r"(src1Stride), "r"(dstStride)
00156  :"memory");*/
00157 }
00158 
00159 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00160 {
00161 __asm__ volatile(
00162 "pcmpeqb %%mm6, %%mm6 \n\t"
00163 "testl 1,ドル %0 \n\t"
00164 " jz 1f \n\t"
00165 "movq (%1), %%mm0 \n\t"
00166 "movq (%2), %%mm1 \n\t"
00167 "add %4, %1 \n\t"
00168 "add 8,ドル %2 \n\t"
00169 "pxor %%mm6, %%mm0 \n\t"
00170 "pxor %%mm6, %%mm1 \n\t"
00171 PAVGB " %%mm1, %%mm0 \n\t"
00172 "pxor %%mm6, %%mm0 \n\t"
00173 "movq %%mm0, (%3) \n\t"
00174 "add %5, %3 \n\t"
00175 "decl %0 \n\t"
00176 "1: \n\t"
00177 "movq (%1), %%mm0 \n\t"
00178 "add %4, %1 \n\t"
00179 "movq (%1), %%mm1 \n\t"
00180 "add %4, %1 \n\t"
00181 "movq (%2), %%mm2 \n\t"
00182 "movq 8(%2), %%mm3 \n\t"
00183 "pxor %%mm6, %%mm0 \n\t"
00184 "pxor %%mm6, %%mm1 \n\t"
00185 "pxor %%mm6, %%mm2 \n\t"
00186 "pxor %%mm6, %%mm3 \n\t"
00187 PAVGB " %%mm2, %%mm0 \n\t"
00188 PAVGB " %%mm3, %%mm1 \n\t"
00189 "pxor %%mm6, %%mm0 \n\t"
00190 "pxor %%mm6, %%mm1 \n\t"
00191 "movq %%mm0, (%3) \n\t"
00192 "add %5, %3 \n\t"
00193 "movq %%mm1, (%3) \n\t"
00194 "add %5, %3 \n\t"
00195 "movq (%1), %%mm0 \n\t"
00196 "add %4, %1 \n\t"
00197 "movq (%1), %%mm1 \n\t"
00198 "add %4, %1 \n\t"
00199 "movq 16(%2), %%mm2 \n\t"
00200 "movq 24(%2), %%mm3 \n\t"
00201 "pxor %%mm6, %%mm0 \n\t"
00202 "pxor %%mm6, %%mm1 \n\t"
00203 "pxor %%mm6, %%mm2 \n\t"
00204 "pxor %%mm6, %%mm3 \n\t"
00205 PAVGB " %%mm2, %%mm0 \n\t"
00206 PAVGB " %%mm3, %%mm1 \n\t"
00207 "pxor %%mm6, %%mm0 \n\t"
00208 "pxor %%mm6, %%mm1 \n\t"
00209 "movq %%mm0, (%3) \n\t"
00210 "add %5, %3 \n\t"
00211 "movq %%mm1, (%3) \n\t"
00212 "add %5, %3 \n\t"
00213 "add 32,ドル %2 \n\t"
00214 "subl 4,ドル %0 \n\t"
00215 "jnz 1b \n\t"
00216 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00217 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00218 #else
00219 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00220 #endif
00221 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00222 :"memory");
00223 //the following should be used, though better not with gcc ...
00224 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
00225  :"r"(src1Stride), "r"(dstStride)
00226  :"memory");*/
00227 }
00228 
00229 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00230 {
00231 __asm__ volatile(
00232 "testl 1,ドル %0 \n\t"
00233 " jz 1f \n\t"
00234 "movd (%1), %%mm0 \n\t"
00235 "movd (%2), %%mm1 \n\t"
00236 "add %4, %1 \n\t"
00237 "add 4,ドル %2 \n\t"
00238 PAVGB " %%mm1, %%mm0 \n\t"
00239 PAVGB " (%3), %%mm0 \n\t"
00240 "movd %%mm0, (%3) \n\t"
00241 "add %5, %3 \n\t"
00242 "decl %0 \n\t"
00243 "1: \n\t"
00244 "movd (%1), %%mm0 \n\t"
00245 "add %4, %1 \n\t"
00246 "movd (%1), %%mm1 \n\t"
00247 "add %4, %1 \n\t"
00248 PAVGB " (%2), %%mm0 \n\t"
00249 PAVGB " 4(%2), %%mm1 \n\t"
00250 PAVGB " (%3), %%mm0 \n\t"
00251 "movd %%mm0, (%3) \n\t"
00252 "add %5, %3 \n\t"
00253 PAVGB " (%3), %%mm1 \n\t"
00254 "movd %%mm1, (%3) \n\t"
00255 "add %5, %3 \n\t"
00256 "movd (%1), %%mm0 \n\t"
00257 "add %4, %1 \n\t"
00258 "movd (%1), %%mm1 \n\t"
00259 "add %4, %1 \n\t"
00260 PAVGB " 8(%2), %%mm0 \n\t"
00261 PAVGB " 12(%2), %%mm1 \n\t"
00262 PAVGB " (%3), %%mm0 \n\t"
00263 "movd %%mm0, (%3) \n\t"
00264 "add %5, %3 \n\t"
00265 PAVGB " (%3), %%mm1 \n\t"
00266 "movd %%mm1, (%3) \n\t"
00267 "add %5, %3 \n\t"
00268 "add 16,ドル %2 \n\t"
00269 "subl 4,ドル %0 \n\t"
00270 "jnz 1b \n\t"
00271 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00272 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00273 #else
00274 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00275 #endif
00276 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00277 :"memory");
00278 }
00279 
00280 
00281 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00282 {
00283 __asm__ volatile(
00284 "testl 1,ドル %0 \n\t"
00285 " jz 1f \n\t"
00286 "movq (%1), %%mm0 \n\t"
00287 "movq (%2), %%mm1 \n\t"
00288 "add %4, %1 \n\t"
00289 "add 8,ドル %2 \n\t"
00290 PAVGB " %%mm1, %%mm0 \n\t"
00291 PAVGB " (%3), %%mm0 \n\t"
00292 "movq %%mm0, (%3) \n\t"
00293 "add %5, %3 \n\t"
00294 "decl %0 \n\t"
00295 "1: \n\t"
00296 "movq (%1), %%mm0 \n\t"
00297 "add %4, %1 \n\t"
00298 "movq (%1), %%mm1 \n\t"
00299 "add %4, %1 \n\t"
00300 PAVGB " (%2), %%mm0 \n\t"
00301 PAVGB " 8(%2), %%mm1 \n\t"
00302 PAVGB " (%3), %%mm0 \n\t"
00303 "movq %%mm0, (%3) \n\t"
00304 "add %5, %3 \n\t"
00305 PAVGB " (%3), %%mm1 \n\t"
00306 "movq %%mm1, (%3) \n\t"
00307 "add %5, %3 \n\t"
00308 "movq (%1), %%mm0 \n\t"
00309 "add %4, %1 \n\t"
00310 "movq (%1), %%mm1 \n\t"
00311 "add %4, %1 \n\t"
00312 PAVGB " 16(%2), %%mm0 \n\t"
00313 PAVGB " 24(%2), %%mm1 \n\t"
00314 PAVGB " (%3), %%mm0 \n\t"
00315 "movq %%mm0, (%3) \n\t"
00316 "add %5, %3 \n\t"
00317 PAVGB " (%3), %%mm1 \n\t"
00318 "movq %%mm1, (%3) \n\t"
00319 "add %5, %3 \n\t"
00320 "add 32,ドル %2 \n\t"
00321 "subl 4,ドル %0 \n\t"
00322 "jnz 1b \n\t"
00323 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00324 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00325 #else
00326 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00327 #endif
00328 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00329 :"memory");
00330 //the following should be used, though better not with gcc ...
00331 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
00332  :"r"(src1Stride), "r"(dstStride)
00333  :"memory");*/
00334 }
00335 
00336 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00337 {
00338 __asm__ volatile(
00339 "lea (%3, %3), %%"REG_a" \n\t"
00340 "1: \n\t"
00341 "movq (%1), %%mm0 \n\t"
00342 "movq (%1, %3), %%mm1 \n\t"
00343 "movq 8(%1), %%mm2 \n\t"
00344 "movq 8(%1, %3), %%mm3 \n\t"
00345 PAVGB " 1(%1), %%mm0 \n\t"
00346 PAVGB " 1(%1, %3), %%mm1 \n\t"
00347 PAVGB " 9(%1), %%mm2 \n\t"
00348 PAVGB " 9(%1, %3), %%mm3 \n\t"
00349 "movq %%mm0, (%2) \n\t"
00350 "movq %%mm1, (%2, %3) \n\t"
00351 "movq %%mm2, 8(%2) \n\t"
00352 "movq %%mm3, 8(%2, %3) \n\t"
00353 "add %%"REG_a", %1 \n\t"
00354 "add %%"REG_a", %2 \n\t"
00355 "movq (%1), %%mm0 \n\t"
00356 "movq (%1, %3), %%mm1 \n\t"
00357 "movq 8(%1), %%mm2 \n\t"
00358 "movq 8(%1, %3), %%mm3 \n\t"
00359 PAVGB " 1(%1), %%mm0 \n\t"
00360 PAVGB " 1(%1, %3), %%mm1 \n\t"
00361 PAVGB " 9(%1), %%mm2 \n\t"
00362 PAVGB " 9(%1, %3), %%mm3 \n\t"
00363 "add %%"REG_a", %1 \n\t"
00364 "movq %%mm0, (%2) \n\t"
00365 "movq %%mm1, (%2, %3) \n\t"
00366 "movq %%mm2, 8(%2) \n\t"
00367 "movq %%mm3, 8(%2, %3) \n\t"
00368 "add %%"REG_a", %2 \n\t"
00369 "subl 4,ドル %0 \n\t"
00370 "jnz 1b \n\t"
00371 :"+g"(h), "+S"(pixels), "+D"(block)
00372 :"r" ((x86_reg)line_size)
00373 :"%"REG_a, "memory");
00374 }
00375 
00376 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00377 {
00378 __asm__ volatile(
00379 "testl 1,ドル %0 \n\t"
00380 " jz 1f \n\t"
00381 "movq (%1), %%mm0 \n\t"
00382 "movq 8(%1), %%mm1 \n\t"
00383 PAVGB " (%2), %%mm0 \n\t"
00384 PAVGB " 8(%2), %%mm1 \n\t"
00385 "add %4, %1 \n\t"
00386 "add 16,ドル %2 \n\t"
00387 "movq %%mm0, (%3) \n\t"
00388 "movq %%mm1, 8(%3) \n\t"
00389 "add %5, %3 \n\t"
00390 "decl %0 \n\t"
00391 "1: \n\t"
00392 "movq (%1), %%mm0 \n\t"
00393 "movq 8(%1), %%mm1 \n\t"
00394 "add %4, %1 \n\t"
00395 PAVGB " (%2), %%mm0 \n\t"
00396 PAVGB " 8(%2), %%mm1 \n\t"
00397 "movq %%mm0, (%3) \n\t"
00398 "movq %%mm1, 8(%3) \n\t"
00399 "add %5, %3 \n\t"
00400 "movq (%1), %%mm0 \n\t"
00401 "movq 8(%1), %%mm1 \n\t"
00402 "add %4, %1 \n\t"
00403 PAVGB " 16(%2), %%mm0 \n\t"
00404 PAVGB " 24(%2), %%mm1 \n\t"
00405 "movq %%mm0, (%3) \n\t"
00406 "movq %%mm1, 8(%3) \n\t"
00407 "add %5, %3 \n\t"
00408 "add 32,ドル %2 \n\t"
00409 "subl 2,ドル %0 \n\t"
00410 "jnz 1b \n\t"
00411 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00412 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00413 #else
00414 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00415 #endif
00416 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00417 :"memory");
00418 //the following should be used, though better not with gcc ...
00419 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
00420  :"r"(src1Stride), "r"(dstStride)
00421  :"memory");*/
00422 }
00423 
00424 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00425 {
00426 __asm__ volatile(
00427 "testl 1,ドル %0 \n\t"
00428 " jz 1f \n\t"
00429 "movq (%1), %%mm0 \n\t"
00430 "movq 8(%1), %%mm1 \n\t"
00431 PAVGB " (%2), %%mm0 \n\t"
00432 PAVGB " 8(%2), %%mm1 \n\t"
00433 "add %4, %1 \n\t"
00434 "add 16,ドル %2 \n\t"
00435 PAVGB " (%3), %%mm0 \n\t"
00436 PAVGB " 8(%3), %%mm1 \n\t"
00437 "movq %%mm0, (%3) \n\t"
00438 "movq %%mm1, 8(%3) \n\t"
00439 "add %5, %3 \n\t"
00440 "decl %0 \n\t"
00441 "1: \n\t"
00442 "movq (%1), %%mm0 \n\t"
00443 "movq 8(%1), %%mm1 \n\t"
00444 "add %4, %1 \n\t"
00445 PAVGB " (%2), %%mm0 \n\t"
00446 PAVGB " 8(%2), %%mm1 \n\t"
00447 PAVGB " (%3), %%mm0 \n\t"
00448 PAVGB " 8(%3), %%mm1 \n\t"
00449 "movq %%mm0, (%3) \n\t"
00450 "movq %%mm1, 8(%3) \n\t"
00451 "add %5, %3 \n\t"
00452 "movq (%1), %%mm0 \n\t"
00453 "movq 8(%1), %%mm1 \n\t"
00454 "add %4, %1 \n\t"
00455 PAVGB " 16(%2), %%mm0 \n\t"
00456 PAVGB " 24(%2), %%mm1 \n\t"
00457 PAVGB " (%3), %%mm0 \n\t"
00458 PAVGB " 8(%3), %%mm1 \n\t"
00459 "movq %%mm0, (%3) \n\t"
00460 "movq %%mm1, 8(%3) \n\t"
00461 "add %5, %3 \n\t"
00462 "add 32,ドル %2 \n\t"
00463 "subl 2,ドル %0 \n\t"
00464 "jnz 1b \n\t"
00465 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00466 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00467 #else
00468 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00469 #endif
00470 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00471 :"memory");
00472 //the following should be used, though better not with gcc ...
00473 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
00474  :"r"(src1Stride), "r"(dstStride)
00475  :"memory");*/
00476 }
00477 
00478 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00479 {
00480 __asm__ volatile(
00481 "pcmpeqb %%mm6, %%mm6 \n\t"
00482 "testl 1,ドル %0 \n\t"
00483 " jz 1f \n\t"
00484 "movq (%1), %%mm0 \n\t"
00485 "movq 8(%1), %%mm1 \n\t"
00486 "movq (%2), %%mm2 \n\t"
00487 "movq 8(%2), %%mm3 \n\t"
00488 "pxor %%mm6, %%mm0 \n\t"
00489 "pxor %%mm6, %%mm1 \n\t"
00490 "pxor %%mm6, %%mm2 \n\t"
00491 "pxor %%mm6, %%mm3 \n\t"
00492 PAVGB " %%mm2, %%mm0 \n\t"
00493 PAVGB " %%mm3, %%mm1 \n\t"
00494 "pxor %%mm6, %%mm0 \n\t"
00495 "pxor %%mm6, %%mm1 \n\t"
00496 "add %4, %1 \n\t"
00497 "add 16,ドル %2 \n\t"
00498 "movq %%mm0, (%3) \n\t"
00499 "movq %%mm1, 8(%3) \n\t"
00500 "add %5, %3 \n\t"
00501 "decl %0 \n\t"
00502 "1: \n\t"
00503 "movq (%1), %%mm0 \n\t"
00504 "movq 8(%1), %%mm1 \n\t"
00505 "add %4, %1 \n\t"
00506 "movq (%2), %%mm2 \n\t"
00507 "movq 8(%2), %%mm3 \n\t"
00508 "pxor %%mm6, %%mm0 \n\t"
00509 "pxor %%mm6, %%mm1 \n\t"
00510 "pxor %%mm6, %%mm2 \n\t"
00511 "pxor %%mm6, %%mm3 \n\t"
00512 PAVGB " %%mm2, %%mm0 \n\t"
00513 PAVGB " %%mm3, %%mm1 \n\t"
00514 "pxor %%mm6, %%mm0 \n\t"
00515 "pxor %%mm6, %%mm1 \n\t"
00516 "movq %%mm0, (%3) \n\t"
00517 "movq %%mm1, 8(%3) \n\t"
00518 "add %5, %3 \n\t"
00519 "movq (%1), %%mm0 \n\t"
00520 "movq 8(%1), %%mm1 \n\t"
00521 "add %4, %1 \n\t"
00522 "movq 16(%2), %%mm2 \n\t"
00523 "movq 24(%2), %%mm3 \n\t"
00524 "pxor %%mm6, %%mm0 \n\t"
00525 "pxor %%mm6, %%mm1 \n\t"
00526 "pxor %%mm6, %%mm2 \n\t"
00527 "pxor %%mm6, %%mm3 \n\t"
00528 PAVGB " %%mm2, %%mm0 \n\t"
00529 PAVGB " %%mm3, %%mm1 \n\t"
00530 "pxor %%mm6, %%mm0 \n\t"
00531 "pxor %%mm6, %%mm1 \n\t"
00532 "movq %%mm0, (%3) \n\t"
00533 "movq %%mm1, 8(%3) \n\t"
00534 "add %5, %3 \n\t"
00535 "add 32,ドル %2 \n\t"
00536 "subl 2,ドル %0 \n\t"
00537 "jnz 1b \n\t"
00538 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00539 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00540 #else
00541 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00542 #endif
00543 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00544 :"memory");
00545 //the following should be used, though better not with gcc ...
00546 /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
00547  :"r"(src1Stride), "r"(dstStride)
00548  :"memory");*/
00549 }
00550 
00551 /* GL: this function does incorrect rounding if overflow */
00552 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00553 {
00554 MOVQ_BONE(mm6);
00555 __asm__ volatile(
00556 "lea (%3, %3), %%"REG_a" \n\t"
00557 "1: \n\t"
00558 "movq (%1), %%mm0 \n\t"
00559 "movq (%1, %3), %%mm2 \n\t"
00560 "movq 1(%1), %%mm1 \n\t"
00561 "movq 1(%1, %3), %%mm3 \n\t"
00562 "add %%"REG_a", %1 \n\t"
00563 "psubusb %%mm6, %%mm0 \n\t"
00564 "psubusb %%mm6, %%mm2 \n\t"
00565 PAVGB " %%mm1, %%mm0 \n\t"
00566 PAVGB " %%mm3, %%mm2 \n\t"
00567 "movq %%mm0, (%2) \n\t"
00568 "movq %%mm2, (%2, %3) \n\t"
00569 "movq (%1), %%mm0 \n\t"
00570 "movq 1(%1), %%mm1 \n\t"
00571 "movq (%1, %3), %%mm2 \n\t"
00572 "movq 1(%1, %3), %%mm3 \n\t"
00573 "add %%"REG_a", %2 \n\t"
00574 "add %%"REG_a", %1 \n\t"
00575 "psubusb %%mm6, %%mm0 \n\t"
00576 "psubusb %%mm6, %%mm2 \n\t"
00577 PAVGB " %%mm1, %%mm0 \n\t"
00578 PAVGB " %%mm3, %%mm2 \n\t"
00579 "movq %%mm0, (%2) \n\t"
00580 "movq %%mm2, (%2, %3) \n\t"
00581 "add %%"REG_a", %2 \n\t"
00582 "subl 4,ドル %0 \n\t"
00583 "jnz 1b \n\t"
00584 :"+g"(h), "+S"(pixels), "+D"(block)
00585 :"r" ((x86_reg)line_size)
00586 :"%"REG_a, "memory");
00587 }
00588 
00589 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00590 {
00591 __asm__ volatile(
00592 "lea (%3, %3), %%"REG_a" \n\t"
00593 "movq (%1), %%mm0 \n\t"
00594 "sub %3, %2 \n\t"
00595 "1: \n\t"
00596 "movq (%1, %3), %%mm1 \n\t"
00597 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00598 "add %%"REG_a", %1 \n\t"
00599 PAVGB " %%mm1, %%mm0 \n\t"
00600 PAVGB " %%mm2, %%mm1 \n\t"
00601 "movq %%mm0, (%2, %3) \n\t"
00602 "movq %%mm1, (%2, %%"REG_a") \n\t"
00603 "movq (%1, %3), %%mm1 \n\t"
00604 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00605 "add %%"REG_a", %2 \n\t"
00606 "add %%"REG_a", %1 \n\t"
00607 PAVGB " %%mm1, %%mm2 \n\t"
00608 PAVGB " %%mm0, %%mm1 \n\t"
00609 "movq %%mm2, (%2, %3) \n\t"
00610 "movq %%mm1, (%2, %%"REG_a") \n\t"
00611 "add %%"REG_a", %2 \n\t"
00612 "subl 4,ドル %0 \n\t"
00613 "jnz 1b \n\t"
00614 :"+g"(h), "+S"(pixels), "+D" (block)
00615 :"r" ((x86_reg)line_size)
00616 :"%"REG_a, "memory");
00617 }
00618 
00619 /* GL: this function does incorrect rounding if overflow */
00620 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00621 {
00622 MOVQ_BONE(mm6);
00623 __asm__ volatile(
00624 "lea (%3, %3), %%"REG_a" \n\t"
00625 "movq (%1), %%mm0 \n\t"
00626 "sub %3, %2 \n\t"
00627 "1: \n\t"
00628 "movq (%1, %3), %%mm1 \n\t"
00629 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00630 "add %%"REG_a", %1 \n\t"
00631 "psubusb %%mm6, %%mm1 \n\t"
00632 PAVGB " %%mm1, %%mm0 \n\t"
00633 PAVGB " %%mm2, %%mm1 \n\t"
00634 "movq %%mm0, (%2, %3) \n\t"
00635 "movq %%mm1, (%2, %%"REG_a") \n\t"
00636 "movq (%1, %3), %%mm1 \n\t"
00637 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00638 "add %%"REG_a", %2 \n\t"
00639 "add %%"REG_a", %1 \n\t"
00640 "psubusb %%mm6, %%mm1 \n\t"
00641 PAVGB " %%mm1, %%mm2 \n\t"
00642 PAVGB " %%mm0, %%mm1 \n\t"
00643 "movq %%mm2, (%2, %3) \n\t"
00644 "movq %%mm1, (%2, %%"REG_a") \n\t"
00645 "add %%"REG_a", %2 \n\t"
00646 "subl 4,ドル %0 \n\t"
00647 "jnz 1b \n\t"
00648 :"+g"(h), "+S"(pixels), "+D" (block)
00649 :"r" ((x86_reg)line_size)
00650 :"%"REG_a, "memory");
00651 }
00652 
00653 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00654 {
00655 __asm__ volatile(
00656 "lea (%3, %3), %%"REG_a" \n\t"
00657 "1: \n\t"
00658 "movq (%2), %%mm0 \n\t"
00659 "movq (%2, %3), %%mm1 \n\t"
00660 PAVGB " (%1), %%mm0 \n\t"
00661 PAVGB " (%1, %3), %%mm1 \n\t"
00662 "movq %%mm0, (%2) \n\t"
00663 "movq %%mm1, (%2, %3) \n\t"
00664 "add %%"REG_a", %1 \n\t"
00665 "add %%"REG_a", %2 \n\t"
00666 "movq (%2), %%mm0 \n\t"
00667 "movq (%2, %3), %%mm1 \n\t"
00668 PAVGB " (%1), %%mm0 \n\t"
00669 PAVGB " (%1, %3), %%mm1 \n\t"
00670 "add %%"REG_a", %1 \n\t"
00671 "movq %%mm0, (%2) \n\t"
00672 "movq %%mm1, (%2, %3) \n\t"
00673 "add %%"REG_a", %2 \n\t"
00674 "subl 4,ドル %0 \n\t"
00675 "jnz 1b \n\t"
00676 :"+g"(h), "+S"(pixels), "+D"(block)
00677 :"r" ((x86_reg)line_size)
00678 :"%"REG_a, "memory");
00679 }
00680 
00681 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00682 {
00683 __asm__ volatile(
00684 "lea (%3, %3), %%"REG_a" \n\t"
00685 "1: \n\t"
00686 "movq (%1), %%mm0 \n\t"
00687 "movq (%1, %3), %%mm2 \n\t"
00688 PAVGB " 1(%1), %%mm0 \n\t"
00689 PAVGB " 1(%1, %3), %%mm2 \n\t"
00690 PAVGB " (%2), %%mm0 \n\t"
00691 PAVGB " (%2, %3), %%mm2 \n\t"
00692 "add %%"REG_a", %1 \n\t"
00693 "movq %%mm0, (%2) \n\t"
00694 "movq %%mm2, (%2, %3) \n\t"
00695 "movq (%1), %%mm0 \n\t"
00696 "movq (%1, %3), %%mm2 \n\t"
00697 PAVGB " 1(%1), %%mm0 \n\t"
00698 PAVGB " 1(%1, %3), %%mm2 \n\t"
00699 "add %%"REG_a", %2 \n\t"
00700 "add %%"REG_a", %1 \n\t"
00701 PAVGB " (%2), %%mm0 \n\t"
00702 PAVGB " (%2, %3), %%mm2 \n\t"
00703 "movq %%mm0, (%2) \n\t"
00704 "movq %%mm2, (%2, %3) \n\t"
00705 "add %%"REG_a", %2 \n\t"
00706 "subl 4,ドル %0 \n\t"
00707 "jnz 1b \n\t"
00708 :"+g"(h), "+S"(pixels), "+D"(block)
00709 :"r" ((x86_reg)line_size)
00710 :"%"REG_a, "memory");
00711 }
00712 
00713 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00714 {
00715 __asm__ volatile(
00716 "lea (%3, %3), %%"REG_a" \n\t"
00717 "movq (%1), %%mm0 \n\t"
00718 "sub %3, %2 \n\t"
00719 "1: \n\t"
00720 "movq (%1, %3), %%mm1 \n\t"
00721 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00722 "add %%"REG_a", %1 \n\t"
00723 PAVGB " %%mm1, %%mm0 \n\t"
00724 PAVGB " %%mm2, %%mm1 \n\t"
00725 "movq (%2, %3), %%mm3 \n\t"
00726 "movq (%2, %%"REG_a"), %%mm4 \n\t"
00727 PAVGB " %%mm3, %%mm0 \n\t"
00728 PAVGB " %%mm4, %%mm1 \n\t"
00729 "movq %%mm0, (%2, %3) \n\t"
00730 "movq %%mm1, (%2, %%"REG_a") \n\t"
00731 "movq (%1, %3), %%mm1 \n\t"
00732 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00733 PAVGB " %%mm1, %%mm2 \n\t"
00734 PAVGB " %%mm0, %%mm1 \n\t"
00735 "add %%"REG_a", %2 \n\t"
00736 "add %%"REG_a", %1 \n\t"
00737 "movq (%2, %3), %%mm3 \n\t"
00738 "movq (%2, %%"REG_a"), %%mm4 \n\t"
00739 PAVGB " %%mm3, %%mm2 \n\t"
00740 PAVGB " %%mm4, %%mm1 \n\t"
00741 "movq %%mm2, (%2, %3) \n\t"
00742 "movq %%mm1, (%2, %%"REG_a") \n\t"
00743 "add %%"REG_a", %2 \n\t"
00744 "subl 4,ドル %0 \n\t"
00745 "jnz 1b \n\t"
00746 :"+g"(h), "+S"(pixels), "+D"(block)
00747 :"r" ((x86_reg)line_size)
00748 :"%"REG_a, "memory");
00749 }
00750 
00751 /* Note this is not correctly rounded, but this function is only
00752  * used for B-frames so it does not matter. */
00753 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00754 {
00755 MOVQ_BONE(mm6);
00756 __asm__ volatile(
00757 "lea (%3, %3), %%"REG_a" \n\t"
00758 "movq (%1), %%mm0 \n\t"
00759 PAVGB " 1(%1), %%mm0 \n\t"
00760 ASMALIGN(3)
00761 "1: \n\t"
00762 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00763 "movq (%1, %3), %%mm1 \n\t"
00764 "psubusb %%mm6, %%mm2 \n\t"
00765 PAVGB " 1(%1, %3), %%mm1 \n\t"
00766 PAVGB " 1(%1, %%"REG_a"), %%mm2 \n\t"
00767 "add %%"REG_a", %1 \n\t"
00768 PAVGB " %%mm1, %%mm0 \n\t"
00769 PAVGB " %%mm2, %%mm1 \n\t"
00770 PAVGB " (%2), %%mm0 \n\t"
00771 PAVGB " (%2, %3), %%mm1 \n\t"
00772 "movq %%mm0, (%2) \n\t"
00773 "movq %%mm1, (%2, %3) \n\t"
00774 "movq (%1, %3), %%mm1 \n\t"
00775 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00776 PAVGB " 1(%1, %3), %%mm1 \n\t"
00777 PAVGB " 1(%1, %%"REG_a"), %%mm0 \n\t"
00778 "add %%"REG_a", %2 \n\t"
00779 "add %%"REG_a", %1 \n\t"
00780 PAVGB " %%mm1, %%mm2 \n\t"
00781 PAVGB " %%mm0, %%mm1 \n\t"
00782 PAVGB " (%2), %%mm2 \n\t"
00783 PAVGB " (%2, %3), %%mm1 \n\t"
00784 "movq %%mm2, (%2) \n\t"
00785 "movq %%mm1, (%2, %3) \n\t"
00786 "add %%"REG_a", %2 \n\t"
00787 "subl 4,ドル %0 \n\t"
00788 "jnz 1b \n\t"
00789 :"+g"(h), "+S"(pixels), "+D"(block)
00790 :"r" ((x86_reg)line_size)
00791 :"%"REG_a, "memory");
00792 }
00793 
00794 static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00795 {
00796 do {
00797 __asm__ volatile(
00798 "movd (%1), %%mm0 \n\t"
00799 "movd (%1, %2), %%mm1 \n\t"
00800 "movd (%1, %2, 2), %%mm2 \n\t"
00801 "movd (%1, %3), %%mm3 \n\t"
00802 PAVGB " (%0), %%mm0 \n\t"
00803 PAVGB " (%0, %2), %%mm1 \n\t"
00804 PAVGB " (%0, %2, 2), %%mm2 \n\t"
00805 PAVGB " (%0, %3), %%mm3 \n\t"
00806 "movd %%mm0, (%1) \n\t"
00807 "movd %%mm1, (%1, %2) \n\t"
00808 "movd %%mm2, (%1, %2, 2) \n\t"
00809 "movd %%mm3, (%1, %3) \n\t"
00810 ::"S"(pixels), "D"(block),
00811 "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
00812 :"memory");
00813 block += 4*line_size;
00814 pixels += 4*line_size;
00815 h -= 4;
00816 } while(h > 0);
00817 }
00818 
00819 //FIXME the following could be optimized too ...
00820 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00821 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
00822 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
00823 }
00824 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00825 DEF(put_pixels8_y2)(block , pixels , line_size, h);
00826 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
00827 }
00828 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00829 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
00830 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
00831 }
00832 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00833 DEF(avg_pixels8)(block , pixels , line_size, h);
00834 DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
00835 }
00836 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00837 DEF(avg_pixels8_x2)(block , pixels , line_size, h);
00838 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
00839 }
00840 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00841 DEF(avg_pixels8_y2)(block , pixels , line_size, h);
00842 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
00843 }
00844 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00845 DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
00846 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
00847 }
00848 
00849 #define QPEL_2TAP_L3(OPNAME) \
00850 static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
00851  __asm__ volatile(\
00852  "1: \n\t"\
00853  "movq (%1,%2), %%mm0 \n\t"\
00854  "movq 8(%1,%2), %%mm1 \n\t"\
00855  PAVGB" (%1,%3), %%mm0 \n\t"\
00856  PAVGB" 8(%1,%3), %%mm1 \n\t"\
00857  PAVGB" (%1), %%mm0 \n\t"\
00858  PAVGB" 8(%1), %%mm1 \n\t"\
00859  STORE_OP( (%1,%4),%%mm0)\
00860  STORE_OP(8(%1,%4),%%mm1)\
00861  "movq %%mm0, (%1,%4) \n\t"\
00862  "movq %%mm1, 8(%1,%4) \n\t"\
00863  "add %5, %1 \n\t"\
00864  "decl %0 \n\t"\
00865  "jnz 1b \n\t"\
00866  :"+g"(h), "+r"(src)\
00867  :"r"((x86_reg)off1), "r"((x86_reg)off2),\
00868  "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
00869  :"memory"\
00870  );\
00871 }\
00872 static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
00873  __asm__ volatile(\
00874  "1: \n\t"\
00875  "movq (%1,%2), %%mm0 \n\t"\
00876  PAVGB" (%1,%3), %%mm0 \n\t"\
00877  PAVGB" (%1), %%mm0 \n\t"\
00878  STORE_OP((%1,%4),%%mm0)\
00879  "movq %%mm0, (%1,%4) \n\t"\
00880  "add %5, %1 \n\t"\
00881  "decl %0 \n\t"\
00882  "jnz 1b \n\t"\
00883  :"+g"(h), "+r"(src)\
00884  :"r"((x86_reg)off1), "r"((x86_reg)off2),\
00885  "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
00886  :"memory"\
00887  );\
00888 }
00889 
00890 #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
00891 QPEL_2TAP_L3(avg_)
00892 #undef STORE_OP
00893 #define STORE_OP(a,b)
00894 QPEL_2TAP_L3(put_)
00895 #undef STORE_OP
00896 #undef QPEL_2TAP_L3

Generated on Fri Oct 26 02:35:40 2012 for FFmpeg by doxygen 1.5.8

AltStyle によって変換されたページ (->オリジナル) /