/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavcodec/dsputil.h"
#include "util_altivec.h"
#include "types_altivec.h"
#include "dsputil_altivec.h"

/* Sum of absolute differences between a 16-wide block of pix1 and the
   horizontal half-pel interpolation (average of pix2[x] and pix2[x+1]) of
   pix2, over h rows.
   v is an opaque context pointer and is not used here.
   Returns the SAD as an int. */
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector (rounding up, matching x2 half-pel) */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector:
           |a - b| == max(a,b) - min(a,b) for unsigned values */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Sum of absolute differences between a 16-wide block of pix1 and the
   vertical half-pel interpolation (average of the row and the next row)
   of pix2, over h rows. */
static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15] */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;   /* reuse this row's bottom line as next row's top line */
        pix3 += line_size;

    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

/* Sum of absolute differences between a 16-wide block of pix1 and the
   diagonal half-pel interpolation ((a+b+c+d+2)>>2 of the four surrounding
   pixels) of pix2, over h rows. */
static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand. */

        /* Split the pixel vectors into shorts */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Plain sum of absolute differences between two 16-wide blocks over h rows. */
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, pix1v_low, pix1v_high, pix2v_low, pix2v_high;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);


    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v_high = vec_ld( 0, pix1);
        pix1v_low  = vec_ld(15, pix1);
        perm2 = vec_lvsl(0, pix2);
        pix2v_high = vec_ld( 0, pix2);
        pix2v_low  = vec_ld(15, pix2);
        t1 = vec_perm(pix1v_high, pix1v_low, perm1);
        t2 = vec_perm(pix2v_high, pix2v_low, perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Plain sum of absolute differences between two 8-wide blocks over h rows. */
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

/* Sum of squares of the pixels of a 16x16 block (used for block "energy"). */
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 8x8 block.
 * AltiVec-enhanced. It is the sad8_altivec code above with squaring added.
 */
static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};


    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced. It is the sad16_altivec code above with squaring added.
 */
static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/* Sum of all pixels of a 16x16 block. */
static int pix_sum_altivec(uint8_t *pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
00483 00484 static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) 00485 { 00486 int i; 00487 vector unsigned char perm, bytes, *pixv; 00488 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 00489 vector signed short shorts; 00490 00491 for (i = 0; i < 8; i++) { 00492 // Read potentially unaligned pixels. 00493 // We're reading 16 pixels, and actually only want 8, 00494 // but we simply ignore the extras. 00495 perm = vec_lvsl(0, pixels); 00496 pixv = (vector unsigned char *) pixels; 00497 bytes = vec_perm(pixv[0], pixv[1], perm); 00498 00499 // convert the bytes into shorts 00500 shorts = (vector signed short)vec_mergeh(zero, bytes); 00501 00502 // save the data to the block, we assume the block is 16-byte aligned 00503 vec_st(shorts, i*16, (vector signed short*)block); 00504 00505 pixels += line_size; 00506 } 00507 } 00508 00509 static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, 00510 const uint8_t *s2, int stride) 00511 { 00512 int i; 00513 vector unsigned char perm, bytes, *pixv; 00514 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 00515 vector signed short shorts1, shorts2; 00516 00517 for (i = 0; i < 4; i++) { 00518 // Read potentially unaligned pixels 00519 // We're reading 16 pixels, and actually only want 8, 00520 // but we simply ignore the extras. 
00521 perm = vec_lvsl(0, s1); 00522 pixv = (vector unsigned char *) s1; 00523 bytes = vec_perm(pixv[0], pixv[1], perm); 00524 00525 // convert the bytes into shorts 00526 shorts1 = (vector signed short)vec_mergeh(zero, bytes); 00527 00528 // Do the same for the second block of pixels 00529 perm = vec_lvsl(0, s2); 00530 pixv = (vector unsigned char *) s2; 00531 bytes = vec_perm(pixv[0], pixv[1], perm); 00532 00533 // convert the bytes into shorts 00534 shorts2 = (vector signed short)vec_mergeh(zero, bytes); 00535 00536 // Do the subtraction 00537 shorts1 = vec_sub(shorts1, shorts2); 00538 00539 // save the data to the block, we assume the block is 16-byte aligned 00540 vec_st(shorts1, 0, (vector signed short*)block); 00541 00542 s1 += stride; 00543 s2 += stride; 00544 block += 8; 00545 00546 00547 // The code below is a copy of the code above... This is a manual 00548 // unroll. 00549 00550 // Read potentially unaligned pixels 00551 // We're reading 16 pixels, and actually only want 8, 00552 // but we simply ignore the extras. 
00553 perm = vec_lvsl(0, s1); 00554 pixv = (vector unsigned char *) s1; 00555 bytes = vec_perm(pixv[0], pixv[1], perm); 00556 00557 // convert the bytes into shorts 00558 shorts1 = (vector signed short)vec_mergeh(zero, bytes); 00559 00560 // Do the same for the second block of pixels 00561 perm = vec_lvsl(0, s2); 00562 pixv = (vector unsigned char *) s2; 00563 bytes = vec_perm(pixv[0], pixv[1], perm); 00564 00565 // convert the bytes into shorts 00566 shorts2 = (vector signed short)vec_mergeh(zero, bytes); 00567 00568 // Do the subtraction 00569 shorts1 = vec_sub(shorts1, shorts2); 00570 00571 // save the data to the block, we assume the block is 16-byte aligned 00572 vec_st(shorts1, 0, (vector signed short*)block); 00573 00574 s1 += stride; 00575 s2 += stride; 00576 block += 8; 00577 } 00578 } 00579 00580 00581 static void clear_block_altivec(DCTELEM *block) { 00582 LOAD_ZERO; 00583 vec_st(zero_s16v, 0, block); 00584 vec_st(zero_s16v, 16, block); 00585 vec_st(zero_s16v, 32, block); 00586 vec_st(zero_s16v, 48, block); 00587 vec_st(zero_s16v, 64, block); 00588 vec_st(zero_s16v, 80, block); 00589 vec_st(zero_s16v, 96, block); 00590 vec_st(zero_s16v, 112, block); 00591 } 00592 00593 00594 static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { 00595 register int i; 00596 register vector unsigned char vdst, vsrc; 00597 00598 /* dst and src are 16 bytes-aligned (guaranteed) */ 00599 for (i = 0 ; (i + 15) < w ; i+=16) { 00600 vdst = vec_ld(i, (unsigned char*)dst); 00601 vsrc = vec_ld(i, (unsigned char*)src); 00602 vdst = vec_add(vsrc, vdst); 00603 vec_st(vdst, i, (unsigned char*)dst); 00604 } 00605 /* if w is not a multiple of 16 */ 00606 for (; (i < w) ; i++) { 00607 dst[i] = src[i]; 00608 } 00609 } 00610 00611 /* next one assumes that ((line_size % 16) == 0) */ 00612 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00613 { 00614 register vector unsigned char pixelsv1, pixelsv2; 00615 register vector unsigned char 
pixelsv1B, pixelsv2B; 00616 register vector unsigned char pixelsv1C, pixelsv2C; 00617 register vector unsigned char pixelsv1D, pixelsv2D; 00618 00619 register vector unsigned char perm = vec_lvsl(0, pixels); 00620 int i; 00621 register int line_size_2 = line_size << 1; 00622 register int line_size_3 = line_size + line_size_2; 00623 register int line_size_4 = line_size << 2; 00624 00625 // hand-unrolling the loop by 4 gains about 15% 00626 // mininum execution time goes from 74 to 60 cycles 00627 // it's faster than -funroll-loops, but using 00628 // -funroll-loops w/ this is bad - 74 cycles again. 00629 // all this is on a 7450, tuning for the 7450 00630 for (i = 0; i < h; i += 4) { 00631 pixelsv1 = vec_ld( 0, pixels); 00632 pixelsv2 = vec_ld(15, pixels); 00633 pixelsv1B = vec_ld(line_size, pixels); 00634 pixelsv2B = vec_ld(15 + line_size, pixels); 00635 pixelsv1C = vec_ld(line_size_2, pixels); 00636 pixelsv2C = vec_ld(15 + line_size_2, pixels); 00637 pixelsv1D = vec_ld(line_size_3, pixels); 00638 pixelsv2D = vec_ld(15 + line_size_3, pixels); 00639 vec_st(vec_perm(pixelsv1, pixelsv2, perm), 00640 0, (unsigned char*)block); 00641 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), 00642 line_size, (unsigned char*)block); 00643 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), 00644 line_size_2, (unsigned char*)block); 00645 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), 00646 line_size_3, (unsigned char*)block); 00647 pixels+=line_size_4; 00648 block +=line_size_4; 00649 } 00650 } 00651 00652 /* next one assumes that ((line_size % 16) == 0) */ 00653 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) 00654 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00655 { 00656 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 00657 register vector unsigned char perm = vec_lvsl(0, pixels); 00658 int i; 00659 00660 for (i = 0; i < h; i++) { 00661 pixelsv1 = vec_ld( 0, pixels); 00662 pixelsv2 = vec_ld(16,pixels); 00663 
blockv = vec_ld(0, block); 00664 pixelsv = vec_perm(pixelsv1, pixelsv2, perm); 00665 blockv = vec_avg(blockv,pixelsv); 00666 vec_st(blockv, 0, (unsigned char*)block); 00667 pixels+=line_size; 00668 block +=line_size; 00669 } 00670 } 00671 00672 /* next one assumes that ((line_size % 8) == 0) */ 00673 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 00674 { 00675 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 00676 int i; 00677 00678 for (i = 0; i < h; i++) { 00679 /* block is 8 bytes-aligned, so we're either in the 00680 left block (16 bytes-aligned) or in the right block (not) */ 00681 int rightside = ((unsigned long)block & 0x0000000F); 00682 00683 blockv = vec_ld(0, block); 00684 pixelsv1 = vec_ld( 0, pixels); 00685 pixelsv2 = vec_ld(16, pixels); 00686 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); 00687 00688 if (rightside) { 00689 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); 00690 } else { 00691 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); 00692 } 00693 00694 blockv = vec_avg(blockv, pixelsv); 00695 00696 vec_st(blockv, 0, block); 00697 00698 pixels += line_size; 00699 block += line_size; 00700 } 00701 } 00702 00703 /* next one assumes that ((line_size % 8) == 0) */ 00704 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00705 { 00706 register int i; 00707 register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 00708 register vector unsigned char blockv, temp1, temp2; 00709 register vector unsigned short pixelssum1, pixelssum2, temp3; 00710 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 00711 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 00712 00713 temp1 = vec_ld(0, pixels); 00714 temp2 = vec_ld(16, pixels); 00715 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 00716 if ((((unsigned long)pixels) & 0x0000000F) == 
0x0000000F) { 00717 pixelsv2 = temp2; 00718 } else { 00719 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); 00720 } 00721 pixelsv1 = vec_mergeh(vczero, pixelsv1); 00722 pixelsv2 = vec_mergeh(vczero, pixelsv2); 00723 pixelssum1 = vec_add((vector unsigned short)pixelsv1, 00724 (vector unsigned short)pixelsv2); 00725 pixelssum1 = vec_add(pixelssum1, vctwo); 00726 00727 for (i = 0; i < h ; i++) { 00728 int rightside = ((unsigned long)block & 0x0000000F); 00729 blockv = vec_ld(0, block); 00730 00731 temp1 = vec_ld(line_size, pixels); 00732 temp2 = vec_ld(line_size + 16, pixels); 00733 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 00734 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 00735 pixelsv2 = temp2; 00736 } else { 00737 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); 00738 } 00739 00740 pixelsv1 = vec_mergeh(vczero, pixelsv1); 00741 pixelsv2 = vec_mergeh(vczero, pixelsv2); 00742 pixelssum2 = vec_add((vector unsigned short)pixelsv1, 00743 (vector unsigned short)pixelsv2); 00744 temp3 = vec_add(pixelssum1, pixelssum2); 00745 temp3 = vec_sra(temp3, vctwo); 00746 pixelssum1 = vec_add(pixelssum2, vctwo); 00747 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 00748 00749 if (rightside) { 00750 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 00751 } else { 00752 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 00753 } 00754 00755 vec_st(blockv, 0, block); 00756 00757 block += line_size; 00758 pixels += line_size; 00759 } 00760 } 00761 00762 /* next one assumes that ((line_size % 8) == 0) */ 00763 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 00764 { 00765 register int i; 00766 register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 00767 register vector unsigned char blockv, temp1, temp2; 00768 register vector unsigned short pixelssum1, pixelssum2, temp3; 00769 register const vector unsigned char vczero 
= (const vector unsigned char)vec_splat_u8(0); 00770 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 00771 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 00772 00773 temp1 = vec_ld(0, pixels); 00774 temp2 = vec_ld(16, pixels); 00775 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 00776 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 00777 pixelsv2 = temp2; 00778 } else { 00779 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); 00780 } 00781 pixelsv1 = vec_mergeh(vczero, pixelsv1); 00782 pixelsv2 = vec_mergeh(vczero, pixelsv2); 00783 pixelssum1 = vec_add((vector unsigned short)pixelsv1, 00784 (vector unsigned short)pixelsv2); 00785 pixelssum1 = vec_add(pixelssum1, vcone); 00786 00787 for (i = 0; i < h ; i++) { 00788 int rightside = ((unsigned long)block & 0x0000000F); 00789 blockv = vec_ld(0, block); 00790 00791 temp1 = vec_ld(line_size, pixels); 00792 temp2 = vec_ld(line_size + 16, pixels); 00793 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 00794 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 00795 pixelsv2 = temp2; 00796 } else { 00797 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); 00798 } 00799 00800 pixelsv1 = vec_mergeh(vczero, pixelsv1); 00801 pixelsv2 = vec_mergeh(vczero, pixelsv2); 00802 pixelssum2 = vec_add((vector unsigned short)pixelsv1, 00803 (vector unsigned short)pixelsv2); 00804 temp3 = vec_add(pixelssum1, pixelssum2); 00805 temp3 = vec_sra(temp3, vctwo); 00806 pixelssum1 = vec_add(pixelssum2, vcone); 00807 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 00808 00809 if (rightside) { 00810 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 00811 } else { 00812 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 00813 } 00814 00815 vec_st(blockv, 0, block); 00816 00817 block += line_size; 00818 pixels += line_size; 00819 } 00820 } 00821 00822 /* 
 next one assumes that ((line_size % 16) == 0) */
/* Write a 16xh block interpolated at half-pel (x+1/2, y+1/2), rounding
   with +2 before >>2. High and low 8-pixel halves are processed in
   parallel as shorts, and the bottom-row sums are carried into the next
   iteration to save one unaligned load per row. */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        /* carry the bottom-row sums (plus rounding) into the next iteration */
        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 16) == 0) */
/* Same as put_pixels16_xy2_altivec but with no-rounding semantics
   (+1 instead of +2 before >>2). */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* NOTE(review): the remainder of this function continues past the end of
   this chunk and is left untouched. */
static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int sum;
    register const vector unsigned char vzero =
        (const vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
        temp5, temp6, temp7;
    {
        register const vector signed short vprod1 =(const vector signed short)
            { 1,-1, 1,-1, 1,-1, 1,-1 };
        register const vector signed short vprod2 =(const vector
signed short) 00969 { 1, 1,-1,-1, 1, 1,-1,-1 }; 00970 register const vector signed short vprod3 =(const vector signed short) 00971 { 1, 1, 1, 1,-1,-1,-1,-1 }; 00972 register const vector unsigned char perm1 = (const vector unsigned char) 00973 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 00974 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; 00975 register const vector unsigned char perm2 = (const vector unsigned char) 00976 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 00977 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; 00978 register const vector unsigned char perm3 = (const vector unsigned char) 00979 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 00980 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; 00981 00982 #define ONEITERBUTTERFLY(i, res) \ 00983 { \ 00984 register vector unsigned char src1, src2, srcO; \ 00985 register vector unsigned char dst1, dst2, dstO; \ 00986 register vector signed short srcV, dstV; \ 00987 register vector signed short but0, but1, but2, op1, op2, op3; \ 00988 src1 = vec_ld(stride * i, src); \ 00989 src2 = vec_ld((stride * i) + 15, src); \ 00990 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 00991 dst1 = vec_ld(stride * i, dst); \ 00992 dst2 = vec_ld((stride * i) + 15, dst); \ 00993 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 00994 /* promote the unsigned chars to signed shorts */ \ 00995 /* we're in the 8x8 function, we only care for the first 8 */ \ 00996 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 00997 (vector signed char)srcO); \ 00998 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 00999 (vector signed char)dstO); \ 01000 /* subtractions inside the first butterfly */ \ 01001 but0 = vec_sub(srcV, dstV); \ 01002 op1 = vec_perm(but0, but0, perm1); \ 01003 but1 = vec_mladd(but0, vprod1, op1); \ 01004 op2 = vec_perm(but1, but1, perm2); \ 01005 but2 = vec_mladd(but1, vprod2, op2); \ 01006 op3 = vec_perm(but2, but2, perm3); \ 01007 res = vec_mladd(but2, 
vprod3, op3); \ 01008 } 01009 ONEITERBUTTERFLY(0, temp0); 01010 ONEITERBUTTERFLY(1, temp1); 01011 ONEITERBUTTERFLY(2, temp2); 01012 ONEITERBUTTERFLY(3, temp3); 01013 ONEITERBUTTERFLY(4, temp4); 01014 ONEITERBUTTERFLY(5, temp5); 01015 ONEITERBUTTERFLY(6, temp6); 01016 ONEITERBUTTERFLY(7, temp7); 01017 } 01018 #undef ONEITERBUTTERFLY 01019 { 01020 register vector signed int vsum; 01021 register vector signed short line0 = vec_add(temp0, temp1); 01022 register vector signed short line1 = vec_sub(temp0, temp1); 01023 register vector signed short line2 = vec_add(temp2, temp3); 01024 register vector signed short line3 = vec_sub(temp2, temp3); 01025 register vector signed short line4 = vec_add(temp4, temp5); 01026 register vector signed short line5 = vec_sub(temp4, temp5); 01027 register vector signed short line6 = vec_add(temp6, temp7); 01028 register vector signed short line7 = vec_sub(temp6, temp7); 01029 01030 register vector signed short line0B = vec_add(line0, line2); 01031 register vector signed short line2B = vec_sub(line0, line2); 01032 register vector signed short line1B = vec_add(line1, line3); 01033 register vector signed short line3B = vec_sub(line1, line3); 01034 register vector signed short line4B = vec_add(line4, line6); 01035 register vector signed short line6B = vec_sub(line4, line6); 01036 register vector signed short line5B = vec_add(line5, line7); 01037 register vector signed short line7B = vec_sub(line5, line7); 01038 01039 register vector signed short line0C = vec_add(line0B, line4B); 01040 register vector signed short line4C = vec_sub(line0B, line4B); 01041 register vector signed short line1C = vec_add(line1B, line5B); 01042 register vector signed short line5C = vec_sub(line1B, line5B); 01043 register vector signed short line2C = vec_add(line2B, line6B); 01044 register vector signed short line6C = vec_sub(line2B, line6B); 01045 register vector signed short line3C = vec_add(line3B, line7B); 01046 register vector signed short line7C = vec_sub(line3B, 
line7B); 01047 01048 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); 01049 vsum = vec_sum4s(vec_abs(line1C), vsum); 01050 vsum = vec_sum4s(vec_abs(line2C), vsum); 01051 vsum = vec_sum4s(vec_abs(line3C), vsum); 01052 vsum = vec_sum4s(vec_abs(line4C), vsum); 01053 vsum = vec_sum4s(vec_abs(line5C), vsum); 01054 vsum = vec_sum4s(vec_abs(line6C), vsum); 01055 vsum = vec_sum4s(vec_abs(line7C), vsum); 01056 vsum = vec_sums(vsum, (vector signed int)vzero); 01057 vsum = vec_splat(vsum, 3); 01058 vec_ste(vsum, 0, &sum); 01059 } 01060 return sum; 01061 } 01062 01063 /* 01064 16x8 works with 16 elements; it allows to avoid replicating loads, and 01065 give the compiler more rooms for scheduling. It's only used from 01066 inside hadamard8_diff16_altivec. 01067 01068 Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a LOT 01069 of spill code, it seems gcc (unlike xlc) cannot keep everything in registers 01070 by itself. The following code include hand-made registers allocation. It's not 01071 clean, but on a 7450 the resulting code is much faster (best case fall from 01072 700+ cycles to 550). 01073 01074 xlc doesn't add spill code, but it doesn't know how to schedule for the 7450, 01075 and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% less 01076 instructions...) 01077 01078 On the 970, the hand-made RA is still a win (around 690 vs. around 780), but 01079 xlc goes to around 660 on the regular C code... 
*/

/* Two side-by-side 8x8 Hadamard SADs covering a 16-wide, 8-high block.
 * NOTE(review): every vector variable is pinned to a specific register with
 * __asm__("vN") to work around gcc-3.3 spill behaviour (see comment above);
 * several names deliberately alias the same register (e.g. src1/srcO on v22),
 * so statement ORDER inside the macro is load-bearing — do not reorder. */
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
    int sum;
    register vector signed short
        temp0 __asm__ ("v0"),
        temp1 __asm__ ("v1"),
        temp2 __asm__ ("v2"),
        temp3 __asm__ ("v3"),
        temp4 __asm__ ("v4"),
        temp5 __asm__ ("v5"),
        temp6 __asm__ ("v6"),
        temp7 __asm__ ("v7");
    register vector signed short
        temp0S __asm__ ("v8"),
        temp1S __asm__ ("v9"),
        temp2S __asm__ ("v10"),
        temp3S __asm__ ("v11"),
        temp4S __asm__ ("v12"),
        temp5S __asm__ ("v13"),
        temp6S __asm__ ("v14"),
        temp7S __asm__ ("v15");
    register const vector unsigned char vzero __asm__ ("v31") =
        (const vector unsigned char)vec_splat_u8(0);
    {
        register const vector signed short vprod1 __asm__ ("v16") =
            (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
        register const vector signed short vprod2 __asm__ ("v17") =
            (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
        register const vector signed short vprod3 __asm__ ("v18") =
            (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
        register const vector unsigned char perm1 __asm__ ("v19") =
            (const vector unsigned char)
            {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
             0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
        register const vector unsigned char perm2 __asm__ ("v20") =
            (const vector unsigned char)
            {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
             0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
        register const vector unsigned char perm3 __asm__ ("v21") =
            (const vector unsigned char)
            {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
             0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

/* Like the 8x8 macro, but processes all 16 pixels of the row: the low 8
   (mergeh) feed res1, the high 8 (mergel) feed res2. Registers v22-v30 are
   recycled across phases; each name becomes live only after its aliased
   predecessor is dead. */
#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
        register vector unsigned char src1 __asm__ ("v22"), \
                                      src2 __asm__ ("v23"), \
                                      dst1 __asm__ ("v24"), \
                                      dst2 __asm__ ("v25"), \
                                      srcO __asm__ ("v22"), \
                                      dstO __asm__ ("v23"); \
        \
        register vector signed short srcV __asm__ ("v24"), \
                                     dstV __asm__ ("v25"), \
                                     srcW __asm__ ("v26"), \
                                     dstW __asm__ ("v27"), \
                                     but0 __asm__ ("v28"), \
                                     but0S __asm__ ("v29"), \
                                     op1 __asm__ ("v30"), \
                                     but1 __asm__ ("v22"), \
                                     op1S __asm__ ("v23"), \
                                     but1S __asm__ ("v24"), \
                                     op2 __asm__ ("v25"), \
                                     but2 __asm__ ("v26"), \
                                     op2S __asm__ ("v27"), \
                                     but2S __asm__ ("v28"), \
                                     op3 __asm__ ("v29"), \
                                     op3S __asm__ ("v30"); \
        \
        src1 = vec_ld(stride * i, src); \
        src2 = vec_ld((stride * i) + 16, src); \
        srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
        dst1 = vec_ld(stride * i, dst); \
        dst2 = vec_ld((stride * i) + 16, dst); \
        dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
        /* promote the unsigned chars to signed shorts */ \
        srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
               (vector signed char)srcO); \
        dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
               (vector signed char)dstO); \
        srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
               (vector signed char)srcO); \
        dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
               (vector signed char)dstO); \
        /* subtractions inside the first butterfly */ \
        but0 = vec_sub(srcV, dstV); \
        but0S = vec_sub(srcW, dstW); \
        op1 = vec_perm(but0, but0, perm1); \
        but1 = vec_mladd(but0, vprod1, op1); \
        op1S = vec_perm(but0S, but0S, perm1); \
        but1S = vec_mladd(but0S, vprod1, op1S); \
        op2 = vec_perm(but1, but1, perm2); \
        but2 = vec_mladd(but1, vprod2, op2); \
        op2S = vec_perm(but1S, but1S, perm2); \
        but2S = vec_mladd(but1S, vprod2, op2S); \
        op3 = vec_perm(but2, but2, perm3); \
        res1 = vec_mladd(but2, vprod3, op3); \
        op3S = vec_perm(but2S, but2S, perm3); \
        res2 = vec_mladd(but2S, vprod3, op3S); \
    }
        ONEITERBUTTERFLY(0, temp0, temp0S);
        ONEITERBUTTERFLY(1, temp1, temp1S);
        ONEITERBUTTERFLY(2, temp2, temp2S);
        ONEITERBUTTERFLY(3, temp3, temp3S);
        ONEITERBUTTERFLY(4, temp4, temp4S);
        ONEITERBUTTERFLY(5, temp5, temp5S);
        ONEITERBUTTERFLY(6, temp6, temp6S);
        ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;
        register vector signed short line0S, line1S, line2S, line3S, line4S,
                                     line5S, line6S, line7S, line0BS,line2BS,
                                     line1BS,line3BS,line4BS,line6BS,line5BS,
                                     line7BS,line0CS,line4CS,line1CS,line5CS,
                                     line2CS,line6CS,line3CS,line7CS;

        /* vertical butterfly + abs-sum for the left (low) 8x8 half */
        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);

        /* same butterfly for the right (high) 8x8 half, accumulated into
           the same vsum */
        line0S = vec_add(temp0S, temp1S);
        line1S = vec_sub(temp0S, temp1S);
        line2S = vec_add(temp2S, temp3S);
        line3S = vec_sub(temp2S, temp3S);
        line4S = vec_add(temp4S, temp5S);
        line5S = vec_sub(temp4S, temp5S);
        line6S = vec_add(temp6S, temp7S);
        line7S = vec_sub(temp6S, temp7S);

        line0BS = vec_add(line0S, line2S);
        line2BS = vec_sub(line0S, line2S);
        line1BS = vec_add(line1S, line3S);
        line3BS = vec_sub(line1S, line3S);
        line4BS = vec_add(line4S, line6S);
        line6BS = vec_sub(line4S, line6S);
        line5BS = vec_add(line5S, line7S);
        line7BS = vec_sub(line5S, line7S);

        line0CS = vec_add(line0BS, line4BS);
        line4CS = vec_sub(line0BS, line4BS);
        line1CS = vec_add(line1BS, line5BS);
        line5CS = vec_sub(line1BS, line5BS);
        line2CS = vec_add(line2BS, line6BS);
        line6CS = vec_sub(line2BS, line6BS);
        line3CS = vec_add(line3BS, line7BS);
        line7CS = vec_sub(line3BS, line7BS);

        vsum = vec_sum4s(vec_abs(line0CS), vsum);
        vsum = vec_sum4s(vec_abs(line1CS), vsum);
        vsum = vec_sum4s(vec_abs(line2CS), vsum);
        vsum = vec_sum4s(vec_abs(line3CS), vsum);
        vsum = vec_sum4s(vec_abs(line4CS), vsum);
        vsum = vec_sum4s(vec_abs(line5CS), vsum);
        vsum = vec_sum4s(vec_abs(line6CS), vsum);
        vsum = vec_sum4s(vec_abs(line7CS), vsum);
        vsum = vec_sums(vsum, (vector signed int)vzero);
        vsum = vec_splat(vsum, 3);
        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

/* 16-wide Hadamard SAD: one 16x8 pass, plus a second pass for the lower
 * half when h == 16. */
static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int score;
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h==16) {
        dst += 8*stride;
        src += 8*stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    return score;
}

/* AltiVec Vorbis inverse channel coupling, 4 floats per iteration.
 * NOTE(review): assumes blocksize is a multiple of 4 and mag/ang are
 * 16-byte aligned (vec_ld/vec_stl) — matches the caller's contract in the
 * Vorbis decoder; confirm if reused elsewhere. */
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            int blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    /* v_31 = 31: shift count used to move the (m <= 0) mask into the float
       sign bit; built by 15+15+1 because vec_splat_u32 is limited to +/-16 */
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        /* flip the sign of ang where mag <= 0 */
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}

/* next one assumes that ((line_size % 8) == 0) */
/* 8-wide avg-with-halfpel-xy2: computes the rounded 4-tap average like
 * put_pixels8_xy2, then vec_avg()s it with the existing block contents.
 * Only the low/high 8 bytes of the 16-byte store are updated, selected by
 * the alignment of `block` (vcprm merge below). */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    /* prime row 0: pixelsv1 = pixels[0..7] widened, pixelsv2 = pixels[1..8] */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        /* which half of the aligned 16-byte quadword the 8-byte block
           occupies */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* splice the 8 interpolated bytes into the correct half of blockv */
        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        /* average with what is already in the destination block */
        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* Install the AltiVec implementations into the DSPContext function tables.
 * SAD/SSE/sum/Hadamard hooks are installed unconditionally; the pixel
 * put/avg tables are 8-bit-only and therefore gated on bits_per_raw_sample. */
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0]= sad16_altivec;
    c->sad[1]= sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1]= sse8_altivec;
    c->sse[0]= sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->add_bytes= add_bytes_altivec;
    if (!high_bit_depth) {
        c->get_pixels = get_pixels_altivec;
        c->clear_block = clear_block_altivec;
        c->put_pixels_tab[0][0] = put_pixels16_altivec;
        /* the two functions do the same thing, so use the same code */
        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
    }

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
    if (CONFIG_VORBIS_DECODER)
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
}