00001 /* 00002 * SPARC VIS optimized inverse DCT 00003 * Copyright (c) 2007 Denes Balatoni < dbalatoni XatX interware XdotX hu > 00004 * 00005 * I did consult the following fine web page about dct 00006 * http://www.geocities.com/ssavekar/dct.htm 00007 * 00008 * This file is part of FFmpeg. 00009 * 00010 * FFmpeg is free software; you can redistribute it and/or 00011 * modify it under the terms of the GNU Lesser General Public 00012 * License as published by the Free Software Foundation; either 00013 * version 2.1 of the License, or (at your option) any later version. 00014 * 00015 * FFmpeg is distributed in the hope that it will be useful, 00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 * Lesser General Public License for more details. 00019 * 00020 * You should have received a copy of the GNU Lesser General Public 00021 * License along with FFmpeg; if not, write to the Free Software 00022 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00023 */ 00024 00025 #include "libavcodec/dsputil.h" 00026 00027 static const DECLARE_ALIGNED_8(int16_t, coeffs[28]) = { 00028 - 1259,- 1259,- 1259,- 1259, 00029 - 4989,- 4989,- 4989,- 4989, 00030 -11045,-11045,-11045,-11045, 00031 -19195,-19195,-19195,-19195, 00032 -29126,-29126,-29126,-29126, 00033 25080, 25080, 25080, 25080, 00034 12785, 12785, 12785, 12785 00035 }; 00036 static const DECLARE_ALIGNED_8(uint16_t, scale[4]) = { 00037 65536>>6, 65536>>6, 65536>>6, 65536>>6 00038 }; 00039 static const DECLARE_ALIGNED_8(uint16_t, rounder[4]) = { 00040 1<<5, 1<<5, 1<<5, 1<<5 00041 }; 00042 static const DECLARE_ALIGNED_8(uint16_t, expand[4]) = { 00043 1<<14, 1<<14, 1<<14, 1<<14 00044 }; 00045 00046 #define INIT_IDCT \ 00047 "ldd [%1], %%f32 \n\t"\ 00048 "ldd [%1+8], %%f34 \n\t"\ 00049 "ldd [%1+16], %%f36 \n\t"\ 00050 "ldd [%1+24], %%f38 \n\t"\ 00051 "ldd [%1+32], %%f40 \n\t"\ 00052 "ldd [%1+40], %%f42 \n\t"\ 00053 "ldd [%1+48], %%f44 \n\t"\ 00054 "ldd [%0], %%f46 \n\t"\ 00055 "fzero %%f62 \n\t"\ 00056 00057 #define LOADSCALE(in) \ 00058 "ldd [" in "], %%f0 \n\t"\ 00059 "ldd [" in "+16], %%f2 \n\t"\ 00060 "ldd [" in "+32], %%f4 \n\t"\ 00061 "ldd [" in "+48], %%f6 \n\t"\ 00062 "ldd [" in "+64], %%f8 \n\t"\ 00063 "ldd [" in "+80], %%f10 \n\t"\ 00064 "ldd [" in "+96], %%f12 \n\t"\ 00065 "ldd [" in "+112], %%f14 \n\t"\ 00066 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ 00067 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ 00068 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ 00069 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ 00070 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ 00071 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ 00072 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ 00073 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ 00074 \ 00075 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ 00076 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ 00077 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ 00078 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ 00079 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ 00080 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ 00081 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ 00082 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ 00083 \ 00084 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ 00085 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ 00086 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ 00087 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ 00088 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ 00089 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ 00090 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ 00091 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ 00092 \ 00093 "fpadd16 %%f0, %%f0, %%f0 \n\t"\ 00094 "fpadd16 %%f2, %%f2, %%f2 \n\t"\ 00095 "fpadd16 %%f4, %%f4, %%f4 \n\t"\ 00096 "fpadd16 %%f6, %%f6, %%f6 \n\t"\ 00097 "fpadd16 %%f8, %%f8, %%f8 \n\t"\ 00098 "fpadd16 %%f10, %%f10, %%f10 \n\t"\ 00099 "fpadd16 %%f12, %%f12, %%f12 \n\t"\ 00100 "fpadd16 %%f14, %%f14, %%f14 \n\t"\ 00101 00102 #define LOAD(in) \ 00103 "ldd [" in "], %%f16 \n\t"\ 00104 "ldd [" in "+8], %%f18 \n\t"\ 00105 "ldd [" in "+16], %%f20 \n\t"\ 00106 "ldd [" in "+24], %%f22 \n\t"\ 00107 "ldd [" in "+32], %%f24 \n\t"\ 00108 "ldd [" in "+40], %%f26 \n\t"\ 00109 "ldd [" in "+48], %%f28 \n\t"\ 00110 "ldd [" in "+56], %%f30 \n\t"\ 00111 00112 #define TRANSPOSE \ 00113 "fpmerge %%f16, %%f24, %%f0 \n\t"\ 00114 "fpmerge %%f20, %%f28, %%f2 \n\t"\ 00115 "fpmerge %%f17, %%f25, %%f4 \n\t"\ 00116 "fpmerge %%f21, %%f29, %%f6 \n\t"\ 00117 "fpmerge %%f18, %%f26, %%f8 \n\t"\ 00118 "fpmerge %%f22, %%f30, %%f10 \n\t"\ 00119 "fpmerge %%f19, %%f27, %%f12 \n\t"\ 00120 "fpmerge %%f23, %%f31, %%f14 \n\t"\ 00121 \ 00122 "fpmerge %%f0, %%f2, %%f16 \n\t"\ 00123 "fpmerge %%f1, %%f3, %%f18 \n\t"\ 00124 "fpmerge %%f4, %%f6, %%f20 \n\t"\ 00125 "fpmerge %%f5, %%f7, %%f22 \n\t"\ 00126 "fpmerge %%f8, %%f10, %%f24 \n\t"\ 00127 "fpmerge %%f9, %%f11, %%f26 \n\t"\ 00128 "fpmerge %%f12, %%f14, %%f28 \n\t"\ 00129 "fpmerge %%f13, %%f15, %%f30 \n\t"\ 00130 \ 00131 "fpmerge %%f16, %%f17, %%f0 \n\t"\ 00132 "fpmerge %%f18, %%f19, %%f2 \n\t"\ 00133 "fpmerge %%f20, %%f21, %%f4 \n\t"\ 00134 "fpmerge %%f22, %%f23, %%f6 \n\t"\ 00135 "fpmerge %%f24, %%f25, %%f8 \n\t"\ 00136 "fpmerge %%f26, %%f27, %%f10 \n\t"\ 00137 "fpmerge %%f28, %%f29, %%f12 \n\t"\ 00138 "fpmerge %%f30, %%f31, %%f14 \n\t"\ 00139 00140 #define IDCT4ROWS \ 00141 /* 1. column */\ 00142 "fmul8ulx16 %%f0, %%f38, %%f28 \n\t"\ 00143 "for %%f4, %%f6, %%f60 \n\t"\ 00144 "fmul8ulx16 %%f2, %%f32, %%f18 \n\t"\ 00145 "fmul8ulx16 %%f2, %%f36, %%f22 \n\t"\ 00146 "fmul8ulx16 %%f2, %%f40, %%f26 \n\t"\ 00147 "fmul8ulx16 %%f2, %%f44, %%f30 \n\t"\ 00148 \ 00149 ADDROUNDER\ 00150 \ 00151 "fmul8sux16 %%f0, %%f38, %%f48 \n\t"\ 00152 "fcmpd %%fcc0, %%f62, %%f60 \n\t"\ 00153 "for %%f8, %%f10, %%f60 \n\t"\ 00154 "fmul8sux16 %%f2, %%f32, %%f50 \n\t"\ 00155 "fmul8sux16 %%f2, %%f36, %%f52 \n\t"\ 00156 "fmul8sux16 %%f2, %%f40, %%f54 \n\t"\ 00157 "fmul8sux16 %%f2, %%f44, %%f56 \n\t"\ 00158 \ 00159 "fpadd16 %%f48, %%f28, %%f28 \n\t"\ 00160 "fcmpd %%fcc1, %%f62, %%f60 \n\t"\ 00161 "for %%f12, %%f14, %%f60 \n\t"\ 00162 "fpadd16 %%f50, %%f18, %%f18 \n\t"\ 00163 "fpadd16 %%f52, %%f22, %%f22 \n\t"\ 00164 "fpadd16 %%f54, %%f26, %%f26 \n\t"\ 00165 "fpadd16 %%f56, %%f30, %%f30 \n\t"\ 00166 \ 00167 "fpadd16 %%f28, %%f0, %%f16 \n\t"\ 00168 "fcmpd %%fcc2, %%f62, %%f60 \n\t"\ 00169 "fpadd16 %%f28, %%f0, %%f20 \n\t"\ 00170 "fpadd16 %%f28, %%f0, %%f24 \n\t"\ 00171 "fpadd16 %%f28, %%f0, %%f28 \n\t"\ 00172 "fpadd16 %%f18, %%f2, %%f18 \n\t"\ 00173 "fpadd16 %%f22, %%f2, %%f22 \n\t"\ 00174 /* 2. column */\ 00175 "fbe %%fcc0, 3f \n\t"\ 00176 "fpadd16 %%f26, %%f2, %%f26 \n\t"\ 00177 "fmul8ulx16 %%f4, %%f34, %%f48 \n\t"\ 00178 "fmul8ulx16 %%f4, %%f42, %%f50 \n\t"\ 00179 "fmul8ulx16 %%f6, %%f36, %%f52 \n\t"\ 00180 "fmul8ulx16 %%f6, %%f44, %%f54 \n\t"\ 00181 "fmul8ulx16 %%f6, %%f32, %%f56 \n\t"\ 00182 "fmul8ulx16 %%f6, %%f40, %%f58 \n\t"\ 00183 \ 00184 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00185 "fpadd16 %%f20, %%f50, %%f20 \n\t"\ 00186 "fpsub16 %%f24, %%f50, %%f24 \n\t"\ 00187 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ 00188 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ 00189 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ 00190 "fpsub16 %%f26, %%f56, %%f26 \n\t"\ 00191 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ 00192 \ 00193 "fmul8sux16 %%f4, %%f34, %%f48 \n\t"\ 00194 "fmul8sux16 %%f4, %%f42, %%f50 \n\t"\ 00195 "fmul8sux16 %%f6, %%f36, %%f52 \n\t"\ 00196 "fmul8sux16 %%f6, %%f44, %%f54 \n\t"\ 00197 "fmul8sux16 %%f6, %%f32, %%f56 \n\t"\ 00198 "fmul8sux16 %%f6, %%f40, %%f58 \n\t"\ 00199 \ 00200 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00201 "fpadd16 %%f20, %%f50, %%f20 \n\t"\ 00202 "fpsub16 %%f24, %%f50, %%f24 \n\t"\ 00203 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ 00204 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ 00205 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ 00206 "fpsub16 %%f26, %%f56, %%f26 \n\t"\ 00207 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ 00208 \ 00209 "fpadd16 %%f16, %%f4, %%f16 \n\t"\ 00210 "fpsub16 %%f28, %%f4, %%f28 \n\t"\ 00211 "fpadd16 %%f18, %%f6, %%f18 \n\t"\ 00212 "fpsub16 %%f26, %%f6, %%f26 \n\t"\ 00213 /* 3. column */\ 00214 "3: \n\t"\ 00215 "fbe %%fcc1, 4f \n\t"\ 00216 "fpsub16 %%f30, %%f6, %%f30 \n\t"\ 00217 "fmul8ulx16 %%f8, %%f38, %%f48 \n\t"\ 00218 "fmul8ulx16 %%f10, %%f40, %%f50 \n\t"\ 00219 "fmul8ulx16 %%f10, %%f32, %%f52 \n\t"\ 00220 "fmul8ulx16 %%f10, %%f44, %%f54 \n\t"\ 00221 "fmul8ulx16 %%f10, %%f36, %%f56 \n\t"\ 00222 \ 00223 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00224 "fpsub16 %%f20, %%f48, %%f20 \n\t"\ 00225 "fpsub16 %%f24, %%f48, %%f24 \n\t"\ 00226 "fpadd16 %%f28, %%f48, %%f28 \n\t"\ 00227 "fpadd16 %%f18, %%f50, %%f18 \n\t"\ 00228 "fpsub16 %%f22, %%f52, %%f22 \n\t"\ 00229 "fpadd16 %%f26, %%f54, %%f26 \n\t"\ 00230 "fpadd16 %%f30, %%f56, %%f30 \n\t"\ 00231 \ 00232 "fmul8sux16 %%f8, %%f38, %%f48 \n\t"\ 00233 "fmul8sux16 %%f10, %%f40, %%f50 \n\t"\ 00234 "fmul8sux16 %%f10, %%f32, %%f52 \n\t"\ 00235 "fmul8sux16 %%f10, %%f44, %%f54 \n\t"\ 00236 "fmul8sux16 %%f10, %%f36, %%f56 \n\t"\ 00237 \ 00238 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00239 "fpsub16 %%f20, %%f48, %%f20 \n\t"\ 00240 "fpsub16 %%f24, %%f48, %%f24 \n\t"\ 00241 "fpadd16 %%f28, %%f48, %%f28 \n\t"\ 00242 "fpadd16 %%f18, %%f50, %%f18 \n\t"\ 00243 "fpsub16 %%f22, %%f52, %%f22 \n\t"\ 00244 "fpadd16 %%f26, %%f54, %%f26 \n\t"\ 00245 "fpadd16 %%f30, %%f56, %%f30 \n\t"\ 00246 \ 00247 "fpadd16 %%f16, %%f8, %%f16 \n\t"\ 00248 "fpsub16 %%f20, %%f8, %%f20 \n\t"\ 00249 "fpsub16 %%f24, %%f8, %%f24 \n\t"\ 00250 "fpadd16 %%f28, %%f8, %%f28 \n\t"\ 00251 "fpadd16 %%f18, %%f10, %%f18 \n\t"\ 00252 "fpsub16 %%f22, %%f10, %%f22 \n\t"\ 00253 /* 4. column */\ 00254 "4: \n\t"\ 00255 "fbe %%fcc2, 5f \n\t"\ 00256 "fpadd16 %%f30, %%f10, %%f30 \n\t"\ 00257 "fmul8ulx16 %%f12, %%f42, %%f48 \n\t"\ 00258 "fmul8ulx16 %%f12, %%f34, %%f50 \n\t"\ 00259 "fmul8ulx16 %%f14, %%f44, %%f52 \n\t"\ 00260 "fmul8ulx16 %%f14, %%f40, %%f54 \n\t"\ 00261 "fmul8ulx16 %%f14, %%f36, %%f56 \n\t"\ 00262 "fmul8ulx16 %%f14, %%f32, %%f58 \n\t"\ 00263 \ 00264 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00265 "fpsub16 %%f20, %%f50, %%f20 \n\t"\ 00266 "fpadd16 %%f24, %%f50, %%f24 \n\t"\ 00267 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ 00268 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ 00269 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ 00270 "fpadd16 %%f26, %%f56, %%f26 \n\t"\ 00271 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ 00272 \ 00273 "fmul8sux16 %%f12, %%f42, %%f48 \n\t"\ 00274 "fmul8sux16 %%f12, %%f34, %%f50 \n\t"\ 00275 "fmul8sux16 %%f14, %%f44, %%f52 \n\t"\ 00276 "fmul8sux16 %%f14, %%f40, %%f54 \n\t"\ 00277 "fmul8sux16 %%f14, %%f36, %%f56 \n\t"\ 00278 "fmul8sux16 %%f14, %%f32, %%f58 \n\t"\ 00279 \ 00280 "fpadd16 %%f16, %%f48, %%f16 \n\t"\ 00281 "fpsub16 %%f20, %%f50, %%f20 \n\t"\ 00282 "fpadd16 %%f24, %%f50, %%f24 \n\t"\ 00283 "fpsub16 %%f28, %%f48, %%f28 \n\t"\ 00284 "fpadd16 %%f18, %%f52, %%f18 \n\t"\ 00285 "fpsub16 %%f22, %%f54, %%f22 \n\t"\ 00286 "fpadd16 %%f26, %%f56, %%f26 \n\t"\ 00287 "fpsub16 %%f30, %%f58, %%f30 \n\t"\ 00288 \ 00289 "fpsub16 %%f20, %%f12, %%f20 \n\t"\ 00290 "fpadd16 %%f24, %%f12, %%f24 \n\t"\ 00291 "fpsub16 %%f22, %%f14, %%f22 \n\t"\ 00292 "fpadd16 %%f26, %%f14, %%f26 \n\t"\ 00293 "fpsub16 %%f30, %%f14, %%f30 \n\t"\ 00294 /* final butterfly */\ 00295 "5: \n\t"\ 00296 "fpsub16 %%f16, %%f18, %%f48 \n\t"\ 00297 "fpsub16 %%f20, %%f22, %%f50 \n\t"\ 00298 "fpsub16 %%f24, %%f26, %%f52 \n\t"\ 00299 "fpsub16 %%f28, %%f30, %%f54 \n\t"\ 00300 "fpadd16 %%f16, %%f18, %%f16 \n\t"\ 00301 "fpadd16 %%f20, %%f22, %%f20 \n\t"\ 00302 "fpadd16 %%f24, %%f26, %%f24 \n\t"\ 00303 "fpadd16 %%f28, %%f30, %%f28 \n\t"\ 00304 00305 #define STOREROWS(out) \ 00306 "std %%f48, [" out "+112] \n\t"\ 00307 "std %%f50, [" out "+96] \n\t"\ 00308 "std %%f52, [" out "+80] \n\t"\ 00309 "std %%f54, [" out "+64] \n\t"\ 00310 "std %%f16, [" out "] \n\t"\ 00311 "std %%f20, [" out "+16] \n\t"\ 00312 "std %%f24, [" out "+32] \n\t"\ 00313 "std %%f28, [" out "+48] \n\t"\ 00314 00315 #define SCALEROWS \ 00316 "fmul8sux16 %%f46, %%f48, %%f48 \n\t"\ 00317 "fmul8sux16 %%f46, %%f50, %%f50 \n\t"\ 00318 "fmul8sux16 %%f46, %%f52, %%f52 \n\t"\ 00319 "fmul8sux16 %%f46, %%f54, %%f54 \n\t"\ 00320 "fmul8sux16 %%f46, %%f16, %%f16 \n\t"\ 00321 "fmul8sux16 %%f46, %%f20, %%f20 \n\t"\ 00322 "fmul8sux16 %%f46, %%f24, %%f24 \n\t"\ 00323 "fmul8sux16 %%f46, %%f28, %%f28 \n\t"\ 00324 00325 #define PUTPIXELSCLAMPED(dest) \ 00326 "fpack16 %%f48, %%f14 \n\t"\ 00327 "fpack16 %%f50, %%f12 \n\t"\ 00328 "fpack16 %%f16, %%f0 \n\t"\ 00329 "fpack16 %%f20, %%f2 \n\t"\ 00330 "fpack16 %%f24, %%f4 \n\t"\ 00331 "fpack16 %%f28, %%f6 \n\t"\ 00332 "fpack16 %%f54, %%f8 \n\t"\ 00333 "fpack16 %%f52, %%f10 \n\t"\ 00334 "st %%f0, [%3+" dest "] \n\t"\ 00335 "st %%f2, [%5+" dest "] \n\t"\ 00336 "st %%f4, [%6+" dest "] \n\t"\ 00337 "st %%f6, [%7+" dest "] \n\t"\ 00338 "st %%f8, [%8+" dest "] \n\t"\ 00339 "st %%f10, [%9+" dest "] \n\t"\ 00340 "st %%f12, [%10+" dest "] \n\t"\ 00341 "st %%f14, [%11+" dest "] \n\t"\ 00342 00343 #define ADDPIXELSCLAMPED(dest) \ 00344 "ldd [%5], %%f18 \n\t"\ 00345 "ld [%3+" dest"], %%f0 \n\t"\ 00346 "ld [%6+" dest"], %%f2 \n\t"\ 00347 "ld [%7+" dest"], %%f4 \n\t"\ 00348 "ld [%8+" dest"], %%f6 \n\t"\ 00349 "ld [%9+" dest"], %%f8 \n\t"\ 00350 "ld [%10+" dest"], %%f10 \n\t"\ 00351 "ld [%11+" dest"], %%f12 \n\t"\ 00352 "ld [%12+" dest"], %%f14 \n\t"\ 00353 "fmul8x16 %%f0, %%f18, %%f0 \n\t"\ 00354 "fmul8x16 %%f2, %%f18, %%f2 \n\t"\ 00355 "fmul8x16 %%f4, %%f18, %%f4 \n\t"\ 00356 "fmul8x16 %%f6, %%f18, %%f6 \n\t"\ 00357 "fmul8x16 %%f8, %%f18, %%f8 \n\t"\ 00358 "fmul8x16 %%f10, %%f18, %%f10 \n\t"\ 00359 "fmul8x16 %%f12, %%f18, %%f12 \n\t"\ 00360 "fmul8x16 %%f14, %%f18, %%f14 \n\t"\ 00361 "fpadd16 %%f0, %%f16, %%f0 \n\t"\ 00362 "fpadd16 %%f2, %%f20, %%f2 \n\t"\ 00363 "fpadd16 %%f4, %%f24, %%f4 \n\t"\ 00364 "fpadd16 %%f6, %%f28, %%f6 \n\t"\ 00365 "fpadd16 %%f8, %%f54, %%f8 \n\t"\ 00366 "fpadd16 %%f10, %%f52, %%f10 \n\t"\ 00367 "fpadd16 %%f12, %%f50, %%f12 \n\t"\ 00368 "fpadd16 %%f14, %%f48, %%f14 \n\t"\ 00369 "fpack16 %%f0, %%f0 \n\t"\ 00370 "fpack16 %%f2, %%f2 \n\t"\ 00371 "fpack16 %%f4, %%f4 \n\t"\ 00372 "fpack16 %%f6, %%f6 \n\t"\ 00373 "fpack16 %%f8, %%f8 \n\t"\ 00374 "fpack16 %%f10, %%f10 \n\t"\ 00375 "fpack16 %%f12, %%f12 \n\t"\ 00376 "fpack16 %%f14, %%f14 \n\t"\ 00377 "st %%f0, [%3+" dest "] \n\t"\ 00378 "st %%f2, [%6+" dest "] \n\t"\ 00379 "st %%f4, [%7+" dest "] \n\t"\ 00380 "st %%f6, [%8+" dest "] \n\t"\ 00381 "st %%f8, [%9+" dest "] \n\t"\ 00382 "st %%f10, [%10+" dest "] \n\t"\ 00383 "st %%f12, [%11+" dest "] \n\t"\ 00384 "st %%f14, [%12+" dest "] \n\t"\ 00385 00386 00387 void ff_simple_idct_vis(DCTELEM *data) { 00388 int out1, out2, out3, out4; 00389 DECLARE_ALIGNED_8(int16_t, temp[8*8]); 00390 00391 __asm__ volatile( 00392 INIT_IDCT 00393 00394 #define ADDROUNDER 00395 00396 // shift right 16-4=12 00397 LOADSCALE("%2+8") 00398 IDCT4ROWS 00399 STOREROWS("%3+8") 00400 LOADSCALE("%2+0") 00401 IDCT4ROWS 00402 "std %%f48, [%3+112] \n\t" 00403 "std %%f50, [%3+96] \n\t" 00404 "std %%f52, [%3+80] \n\t" 00405 "std %%f54, [%3+64] \n\t" 00406 00407 // shift right 16+4 00408 "ldd [%3+8], %%f18 \n\t" 00409 "ldd [%3+24], %%f22 \n\t" 00410 "ldd [%3+40], %%f26 \n\t" 00411 "ldd [%3+56], %%f30 \n\t" 00412 TRANSPOSE 00413 IDCT4ROWS 00414 SCALEROWS 00415 STOREROWS("%2+0") 00416 LOAD("%3+64") 00417 TRANSPOSE 00418 IDCT4ROWS 00419 SCALEROWS 00420 STOREROWS("%2+8") 00421 00422 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4) 00423 : "0" (scale), "1" (coeffs), "2" (data), "3" (temp) 00424 ); 00425 } 00426 00427 void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data) { 00428 int out1, out2, out3, out4, out5; 00429 int r1, r2, r3, r4, r5, r6, r7; 00430 00431 __asm__ volatile( 00432 "wr %%g0, 0x8, %%gsr \n\t" 00433 00434 INIT_IDCT 00435 00436 "add %3, %4, %5 \n\t" 00437 "add %5, %4, %6 \n\t" 00438 "add %6, %4, %7 \n\t" 00439 "add %7, %4, %8 \n\t" 00440 "add %8, %4, %9 \n\t" 00441 "add %9, %4, %10 \n\t" 00442 "add %10, %4, %11 \n\t" 00443 00444 // shift right 16-4=12 00445 LOADSCALE("%2+8") 00446 IDCT4ROWS 00447 STOREROWS("%2+8") 00448 LOADSCALE("%2+0") 00449 IDCT4ROWS 00450 "std %%f48, [%2+112] \n\t" 00451 "std %%f50, [%2+96] \n\t" 00452 "std %%f52, [%2+80] \n\t" 00453 "std %%f54, [%2+64] \n\t" 00454 00455 #undef ADDROUNDER 00456 #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t" 00457 00458 // shift right 16+4 00459 "ldd [%2+8], %%f18 \n\t" 00460 "ldd [%2+24], %%f22 \n\t" 00461 "ldd [%2+40], %%f26 \n\t" 00462 "ldd [%2+56], %%f30 \n\t" 00463 TRANSPOSE 00464 IDCT4ROWS 00465 PUTPIXELSCLAMPED("0") 00466 LOAD("%2+64") 00467 TRANSPOSE 00468 IDCT4ROWS 00469 PUTPIXELSCLAMPED("4") 00470 00471 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), 00472 "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7) 00473 : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size) 00474 ); 00475 } 00476 00477 void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data) { 00478 int out1, out2, out3, out4, out5, out6; 00479 int r1, r2, r3, r4, r5, r6, r7; 00480 00481 __asm__ volatile( 00482 "wr %%g0, 0x8, %%gsr \n\t" 00483 00484 INIT_IDCT 00485 00486 "add %3, %4, %6 \n\t" 00487 "add %6, %4, %7 \n\t" 00488 "add %7, %4, %8 \n\t" 00489 "add %8, %4, %9 \n\t" 00490 "add %9, %4, %10 \n\t" 00491 "add %10, %4, %11 \n\t" 00492 "add %11, %4, %12 \n\t" 00493 00494 #undef ADDROUNDER 00495 #define ADDROUNDER 00496 00497 // shift right 16-4=12 00498 LOADSCALE("%2+8") 00499 IDCT4ROWS 00500 STOREROWS("%2+8") 00501 LOADSCALE("%2+0") 00502 IDCT4ROWS 00503 "std %%f48, [%2+112] \n\t" 00504 "std %%f50, [%2+96] \n\t" 00505 "std %%f52, [%2+80] \n\t" 00506 "std %%f54, [%2+64] \n\t" 00507 00508 #undef ADDROUNDER 00509 #define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t" 00510 00511 // shift right 16+4 00512 "ldd [%2+8], %%f18 \n\t" 00513 "ldd [%2+24], %%f22 \n\t" 00514 "ldd [%2+40], %%f26 \n\t" 00515 "ldd [%2+56], %%f30 \n\t" 00516 TRANSPOSE 00517 IDCT4ROWS 00518 ADDPIXELSCLAMPED("0") 00519 LOAD("%2+64") 00520 TRANSPOSE 00521 IDCT4ROWS 00522 ADDPIXELSCLAMPED("4") 00523 00524 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6), 00525 "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7) 00526 : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size), "5" (expand) 00527 ); 00528 }