00001 /* 00002 * Alpha optimized DSP utils 00003 * Copyright (c) 2002 Falk Hueffner <falk@debian.org> 00004 * 00005 * This file is part of FFmpeg. 00006 * 00007 * FFmpeg is free software; you can redistribute it and/or 00008 * modify it under the terms of the GNU Lesser General Public 00009 * License as published by the Free Software Foundation; either 00010 * version 2.1 of the License, or (at your option) any later version. 00011 * 00012 * FFmpeg is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 * Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with FFmpeg; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00020 */ 00021 00022 #include "libavcodec/dsputil.h" 00023 #include "asm.h" 00024 00025 void ff_simple_idct_axp(DCTELEM *block); 00026 void ff_simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block); 00027 void ff_simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block); 00028 00029 void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, 00030 int line_size, int h); 00031 void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 00032 int line_size); 00033 void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 00034 int line_size); 00035 void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, 00036 int line_size); 00037 void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, 00038 int line_size); 00039 00040 void get_pixels_mvi(DCTELEM *restrict block, 00041 const uint8_t *restrict pixels, int line_size); 00042 void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, 00043 int stride); 00044 int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 00045 int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 00046 int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 00047 int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 00048 int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 00049 00050 #if 0 00051 /* These functions were the base for the optimized assembler routines, 00052 and remain here for documentation purposes. */ 00053 static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 00054 int line_size) 00055 { 00056 int i = 8; 00057 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */ 00058 00059 do { 00060 uint64_t shorts0, shorts1; 00061 00062 shorts0 = ldq(block); 00063 shorts0 = maxsw4(shorts0, 0); 00064 shorts0 = minsw4(shorts0, clampmask); 00065 stl(pkwb(shorts0), pixels); 00066 00067 shorts1 = ldq(block + 4); 00068 shorts1 = maxsw4(shorts1, 0); 00069 shorts1 = minsw4(shorts1, clampmask); 00070 stl(pkwb(shorts1), pixels + 4); 00071 00072 pixels += line_size; 00073 block += 8; 00074 } while (--i); 00075 } 00076 00077 void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 00078 int line_size) 00079 { 00080 int h = 8; 00081 /* Keep this function a leaf function by generating the constants 00082 manually (mainly for the hack value ;-). */ 00083 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */ 00084 uint64_t signmask = zap(-1, 0x33); 00085 signmask ^= signmask >> 1; /* 0x8000800080008000 */ 00086 00087 do { 00088 uint64_t shorts0, pix0, signs0; 00089 uint64_t shorts1, pix1, signs1; 00090 00091 shorts0 = ldq(block); 00092 shorts1 = ldq(block + 4); 00093 00094 pix0 = unpkbw(ldl(pixels)); 00095 /* Signed subword add (MMX paddw). */ 00096 signs0 = shorts0 & signmask; 00097 shorts0 &= ~signmask; 00098 shorts0 += pix0; 00099 shorts0 ^= signs0; 00100 /* Clamp. */ 00101 shorts0 = maxsw4(shorts0, 0); 00102 shorts0 = minsw4(shorts0, clampmask); 00103 00104 /* Next 4. */ 00105 pix1 = unpkbw(ldl(pixels + 4)); 00106 signs1 = shorts1 & signmask; 00107 shorts1 &= ~signmask; 00108 shorts1 += pix1; 00109 shorts1 ^= signs1; 00110 shorts1 = maxsw4(shorts1, 0); 00111 shorts1 = minsw4(shorts1, clampmask); 00112 00113 stl(pkwb(shorts0), pixels); 00114 stl(pkwb(shorts1), pixels + 4); 00115 00116 pixels += line_size; 00117 block += 8; 00118 } while (--h); 00119 } 00120 #endif 00121 00122 static void clear_blocks_axp(DCTELEM *blocks) { 00123 uint64_t *p = (uint64_t *) blocks; 00124 int n = sizeof(DCTELEM) * 6 * 64; 00125 00126 do { 00127 p[0] = 0; 00128 p[1] = 0; 00129 p[2] = 0; 00130 p[3] = 0; 00131 p[4] = 0; 00132 p[5] = 0; 00133 p[6] = 0; 00134 p[7] = 0; 00135 p += 8; 00136 n -= 8 * 8; 00137 } while (n); 00138 } 00139 00140 static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) 00141 { 00142 return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 00143 } 00144 00145 static inline uint64_t avg2(uint64_t a, uint64_t b) 00146 { 00147 return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 00148 } 00149 00150 #if 0 00151 /* The XY2 routines basically utilize this scheme, but reuse parts in 00152 each iteration. */ 00153 static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) 00154 { 00155 uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) 00156 + ((l2 & ~BYTE_VEC(0x03)) >> 2) 00157 + ((l3 & ~BYTE_VEC(0x03)) >> 2) 00158 + ((l4 & ~BYTE_VEC(0x03)) >> 2); 00159 uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) 00160 + (l2 & BYTE_VEC(0x03)) 00161 + (l3 & BYTE_VEC(0x03)) 00162 + (l4 & BYTE_VEC(0x03)) 00163 + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); 00164 return r1 + r2; 00165 } 00166 #endif 00167 00168 #define OP(LOAD, STORE) \ 00169 do { \ 00170 STORE(LOAD(pixels), block); \ 00171 pixels += line_size; \ 00172 block += line_size; \ 00173 } while (--h) 00174 00175 #define OP_X2(LOAD, STORE) \ 00176 do { \ 00177 uint64_t pix1, pix2; \ 00178 \ 00179 pix1 = LOAD(pixels); \ 00180 pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 00181 STORE(AVG2(pix1, pix2), block); \ 00182 pixels += line_size; \ 00183 block += line_size; \ 00184 } while (--h) 00185 00186 #define OP_Y2(LOAD, STORE) \ 00187 do { \ 00188 uint64_t pix = LOAD(pixels); \ 00189 do { \ 00190 uint64_t next_pix; \ 00191 \ 00192 pixels += line_size; \ 00193 next_pix = LOAD(pixels); \ 00194 STORE(AVG2(pix, next_pix), block); \ 00195 block += line_size; \ 00196 pix = next_pix; \ 00197 } while (--h); \ 00198 } while (0) 00199 00200 #define OP_XY2(LOAD, STORE) \ 00201 do { \ 00202 uint64_t pix1 = LOAD(pixels); \ 00203 uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 00204 uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \ 00205 + (pix2 & BYTE_VEC(0x03)); \ 00206 uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \ 00207 + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \ 00208 \ 00209 do { \ 00210 uint64_t npix1, npix2; \ 00211 uint64_t npix_l, npix_h; \ 00212 uint64_t avg; \ 00213 \ 00214 pixels += line_size; \ 00215 npix1 = LOAD(pixels); \ 00216 npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 00217 npix_l = (npix1 & BYTE_VEC(0x03)) \ 00218 + (npix2 & BYTE_VEC(0x03)); \ 00219 npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \ 00220 + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \ 00221 avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \ 00222 + pix_h + npix_h; \ 00223 STORE(avg, block); \ 00224 \ 00225 block += line_size; \ 00226 pix_l = npix_l; \ 00227 pix_h = npix_h; \ 00228 } while (--h); \ 00229 } while (0) 00230 00231 #define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \ 00232 static void OPNAME ## _pixels ## SUFF ## _axp \ 00233 (uint8_t *restrict block, const uint8_t *restrict pixels, \ 00234 int line_size, int h) \ 00235 { \ 00236 if ((size_t) pixels & 0x7) { \ 00237 OPKIND(uldq, STORE); \ 00238 } else { \ 00239 OPKIND(ldq, STORE); \ 00240 } \ 00241 } \ 00242 \ 00243 static void OPNAME ## _pixels16 ## SUFF ## _axp \ 00244 (uint8_t *restrict block, const uint8_t *restrict pixels, \ 00245 int line_size, int h) \ 00246 { \ 00247 OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \ 00248 OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \ 00249 } 00250 00251 #define PIXOP(OPNAME, STORE) \ 00252 MAKE_OP(OPNAME, , OP, STORE) \ 00253 MAKE_OP(OPNAME, _x2, OP_X2, STORE) \ 00254 MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \ 00255 MAKE_OP(OPNAME, _xy2, OP_XY2, STORE) 00256 00257 /* Rounding primitives. */ 00258 #define AVG2 avg2 00259 #define AVG4 avg4 00260 #define AVG4_ROUNDER BYTE_VEC(0x02) 00261 #define STORE(l, b) stq(l, b) 00262 PIXOP(put, STORE); 00263 00264 #undef STORE 00265 #define STORE(l, b) stq(AVG2(l, ldq(b)), b); 00266 PIXOP(avg, STORE); 00267 00268 /* Not rounding primitives. */ 00269 #undef AVG2 00270 #undef AVG4 00271 #undef AVG4_ROUNDER 00272 #undef STORE 00273 #define AVG2 avg2_no_rnd 00274 #define AVG4 avg4_no_rnd 00275 #define AVG4_ROUNDER BYTE_VEC(0x01) 00276 #define STORE(l, b) stq(l, b) 00277 PIXOP(put_no_rnd, STORE); 00278 00279 #undef STORE 00280 #define STORE(l, b) stq(AVG2(l, ldq(b)), b); 00281 PIXOP(avg_no_rnd, STORE); 00282 00283 void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, 00284 int line_size, int h) 00285 { 00286 put_pixels_axp_asm(block, pixels, line_size, h); 00287 put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); 00288 } 00289 00290 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx) 00291 { 00292 c->put_pixels_tab[0][0] = put_pixels16_axp_asm; 00293 c->put_pixels_tab[0][1] = put_pixels16_x2_axp; 00294 c->put_pixels_tab[0][2] = put_pixels16_y2_axp; 00295 c->put_pixels_tab[0][3] = put_pixels16_xy2_axp; 00296 00297 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; 00298 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp; 00299 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp; 00300 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp; 00301 00302 c->avg_pixels_tab[0][0] = avg_pixels16_axp; 00303 c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp; 00304 c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp; 00305 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp; 00306 00307 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp; 00308 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp; 00309 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp; 00310 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp; 00311 00312 c->put_pixels_tab[1][0] = put_pixels_axp_asm; 00313 c->put_pixels_tab[1][1] = put_pixels_x2_axp; 00314 c->put_pixels_tab[1][2] = put_pixels_y2_axp; 00315 c->put_pixels_tab[1][3] = put_pixels_xy2_axp; 00316 00317 c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm; 00318 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp; 00319 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp; 00320 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp; 00321 00322 c->avg_pixels_tab[1][0] = avg_pixels_axp; 00323 c->avg_pixels_tab[1][1] = avg_pixels_x2_axp; 00324 c->avg_pixels_tab[1][2] = avg_pixels_y2_axp; 00325 c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp; 00326 00327 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp; 00328 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp; 00329 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp; 00330 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp; 00331 00332 c->clear_blocks = clear_blocks_axp; 00333 00334 /* amask clears all bits that correspond to present features. */ 00335 if (amask(AMASK_MVI) == 0) { 00336 c->put_pixels_clamped = put_pixels_clamped_mvi_asm; 00337 c->add_pixels_clamped = add_pixels_clamped_mvi_asm; 00338 00339 c->get_pixels = get_pixels_mvi; 00340 c->diff_pixels = diff_pixels_mvi; 00341 c->sad[0] = pix_abs16x16_mvi_asm; 00342 c->sad[1] = pix_abs8x8_mvi; 00343 c->pix_abs[0][0] = pix_abs16x16_mvi_asm; 00344 c->pix_abs[1][0] = pix_abs8x8_mvi; 00345 c->pix_abs[0][1] = pix_abs16x16_x2_mvi; 00346 c->pix_abs[0][2] = pix_abs16x16_y2_mvi; 00347 c->pix_abs[0][3] = pix_abs16x16_xy2_mvi; 00348 } 00349 00350 put_pixels_clamped_axp_p = c->put_pixels_clamped; 00351 add_pixels_clamped_axp_p = c->add_pixels_clamped; 00352 00353 if (!avctx->lowres && 00354 (avctx->idct_algo == FF_IDCT_AUTO || 00355 avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) { 00356 c->idct_put = ff_simple_idct_put_axp; 00357 c->idct_add = ff_simple_idct_add_axp; 00358 c->idct = ff_simple_idct_axp; 00359 } 00360 }