/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#include <assert.h>
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

#define CHROMA_MC8_ALTIVEC_CORE \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

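/* 8x8 chroma motion compensation, i.e. the standard H.264 bilinear filter
 *   dst[i] = (A*(8-x)*(8-y) + B*x*(8-y) + C*(8-x)*y + D*x*y + 32) >> 6
 * over four neighbouring source pixels.  The unaligned source is loaded with
 * vec_lvsl/vec_perm (loadSecond/reallyBadAlign track whether the row spills
 * into a second 16-byte block), and fperm merges the 8 result bytes into the
 * correct half of the aligned 16-byte destination vector.  When x or y is 0
 * the filter degenerates to two taps and the _SIMPLE core is used instead. */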
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) { // -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) { // -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

#undef CHROMA_MC8_ALTIVEC_CORE

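/* 16x16 luma half-pel horizontal interpolation: each output pixel is the
 * six-tap filter (1, -5, 20, 20, -5, 1) applied to src[-2..+3], rounded with
 * (v + 16) >> 5 and saturated to 8 bits.  Rows are widened to 16-bit lanes
 * (the A vectors hold the first eight pixels of a row, the B vectors the
 * last eight), and the switch on 'align' picks the cheapest unaligned-load
 * strategy, loading a third 16-byte block only when the taps cross into it. */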
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

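/* 16x16 luma half-pel vertical interpolation: the same (1, -5, 20, 20, -5, 1)
 * filter applied down each column.  Six source rows are kept unpacked as
 * 16-bit lanes and the window is shifted down by one row per iteration, so
 * each pass through the loop only has to load and unpack the new bottom row. */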
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

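/* 16x16 luma 2D interpolation: a horizontal six-tap pass writes unrounded
 * 16-bit intermediates into tmp, then a vertical six-tap pass filters tmp in
 * 32-bit precision and rounds the final value with (v + 512) >> 10. */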
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
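    /* first pass: horizontal filter over 16 + 5 rows (two above and three
     * below the block, the context the vertical filter will need); results
     * are stored in tmp at full 16-bit precision, without rounding */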
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

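    /* second pass: vertical filter over the 16-bit intermediates, with a
     * sliding six-row window read back from tmp.  vec_mule/vec_mulo widen
     * the products to 32 bits (the even lanes of sum3 are widened by an
     * arithmetic shift of the reinterpreted vector instead of a multiply
     * by 1); the rounded, shifted sums are then packed back to 8 bits and
     * re-interleaved with mperm. */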
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}