FFmpeg: libswscale/x86/swscale_template.c Source File

FFmpeg

[フレーム]

swscale_template.c

Go to the documentation of this file.

/*
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* Drop definitions left over from an earlier expansion of this template so
 * the macros below can be redefined for the current variant. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH

/* PREFETCH: "prefetchnta" for the MMXEXT variant, otherwise the placeholder
 * string "# nop" (presumably an assembler comment, i.e. no instruction). */
#if COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#else
#define PREFETCH " # nop"
#endif

/* 64-bit store used by the output writers: "movntq" for the MMXEXT variant,
 * plain "movq" otherwise.
 *   REAL_MOVNTQ(a,b) - stringifies both operands into one instruction line.
 *   MOVNTQ2          - bare mnemonic, for call sites that spell the operands
 *                      themselves.
 *   MOVNTQ(a,b)      - extra expansion level so macro arguments are expanded
 *                      before REAL_MOVNTQ stringifies them. */
#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

41 #if !COMPILE_TEMPLATE_MMXEXT

42 static av_always_inline void

43 dither_8to16(const uint8_t *srcDither, int rot)

44 {

45 if (rot) {

46 __asm__ volatile("pxor %%mm0, %%mm0\n\t"

47 "movq (%0), %%mm3\n\t"

48 "movq %%mm3, %%mm4\n\t"

49 "psrlq 24,ドル %%mm3\n\t"

50 "psllq 40,ドル %%mm4\n\t"

51 "por %%mm4, %%mm3\n\t"

52 "movq %%mm3, %%mm4\n\t"

53 "punpcklbw %%mm0, %%mm3\n\t"

54 "punpckhbw %%mm0, %%mm4\n\t"

55 :: "r"(srcDither)

56 );

57 } else {

58 __asm__ volatile("pxor %%mm0, %%mm0\n\t"

59 "movq (%0), %%mm3\n\t"

60 "movq %%mm3, %%mm4\n\t"

61 "punpcklbw %%mm0, %%mm3\n\t"

62 "punpckhbw %%mm0, %%mm4\n\t"

63 :: "r"(srcDither)

64 );

65 }

66 }

67 #endif

69 static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,

70 const int16_t **src, uint8_t *dest, int dstW,

71 const uint8_t *dither, int offset)

72 {

73 dither_8to16(dither, offset);

74 filterSize--;

75 __asm__ volatile(

76 "movd %0, %%mm1\n\t"

77 "punpcklwd %%mm1, %%mm1\n\t"

78 "punpckldq %%mm1, %%mm1\n\t"

79 "psllw 3,ドル %%mm1\n\t"

80 "paddw %%mm1, %%mm3\n\t"

81 "paddw %%mm1, %%mm4\n\t"

82 "psraw 4,ドル %%mm3\n\t"

83 "psraw 4,ドル %%mm4\n\t"

84 ::"m"(filterSize)

85 );

87 __asm__ volatile(\

88 "movq %%mm3, %%mm6\n\t"

89 "movq %%mm4, %%mm7\n\t"

90 "movl %3, %%ecx\n\t"

91 "mov %0, %%"REG_d" \n\t"\

92 "mov (%%"REG_d"), %%"REG_S" \n\t"\

93 ".p2align 4 \n\t" /* FIXME Unroll? */\

94 "1: \n\t"\

95 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\

96 "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\

97 "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\

98 "add 16,ドル %%"REG_d" \n\t"\

99 "mov (%%"REG_d"), %%"REG_S" \n\t"\

100 "test %%"REG_S", %%"REG_S" \n\t"\

101 "pmulhw %%mm0, %%mm2 \n\t"\

102 "pmulhw %%mm0, %%mm5 \n\t"\

103 "paddw %%mm2, %%mm3 \n\t"\

104 "paddw %%mm5, %%mm4 \n\t"\

105 " jnz 1b \n\t"\

106 "psraw 3,ドル %%mm3 \n\t"\

107 "psraw 3,ドル %%mm4 \n\t"\

108 "packuswb %%mm4, %%mm3 \n\t"

109 MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"

110 "add 8,ドル %%"REG_c" \n\t"\

111 "cmp %2, %%"REG_c" \n\t"\

112 "movq %%mm6, %%mm3\n\t"

113 "movq %%mm7, %%mm4\n\t"

114 "mov %0, %%"REG_d" \n\t"\

115 "mov (%%"REG_d"), %%"REG_S" \n\t"\

116 "jb 1b \n\t"\

117 :: "g" (filter),

118 "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)

119 : "%"REG_d, "%"REG_S, "%"REG_c

120 );

121 }

122

/*
 * Opens an __asm__ volatile( statement and emits the chroma half of the
 * vertical filter for packed output (pmulhw variant).
 *
 * Operand %0 is the context base the *_OFFSET strings are relative to and
 * %6 is the byte distance from U to V samples (see YSCALEYUV2PACKEDX_END).
 * REG_a is zeroed and serves as the pixel index; label "1:" is the outer
 * loop head that the WRITE* macros jump back to. The filter table is walked
 * as 16-byte records (source pointer at +0, coefficients at +8) until a NULL
 * pointer is found.
 *
 * On exit: %mm3 = filtered U, %mm4 = filtered V (4 words each, including the
 * VROUNDER bias). The statement is left open; a later macro supplies the
 * operand list.
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"

/*
 * Luma/alpha half of the vertical filter for packed output (pmulhw variant).
 * Emits asm text only; it must appear inside an already opened asm statement
 * whose operand %0 is the context base and where REG_a is the pixel index.
 *
 *   offset     - string: offset of the filter table relative to %0
 *   coeff      - scratch register receiving each coefficient quadword
 *   src1, src2 - scratch registers for samples 0..3 and 4..7
 *   dst1, dst2 - accumulators; start from the VROUNDER bias and hold the
 *                filtered samples 0..3 / 4..7 on exit
 *
 * The table is walked as 16-byte records (pointer at +0, coefficients at +8)
 * until a NULL pointer is found. Uses local label "2:".
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"

/*
 * Full vertical filter prologue (pmulhw variant): chroma pass followed by
 * the luma pass. Leaves U in %mm3, V in %mm4, Y samples 0..3 in %mm1 and
 * 4..7 in %mm7, with the asm statement still open.
 */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)

/*
 * Closes the asm statement opened by YSCALEYUV2PACKEDX[_ACCURATE]_UV.
 * Expects locals c, dummy, dest, dstW_reg and uv_off in the expanding scope.
 *
 * Operand map: %0 = &c->redDither (base for all *_OFFSET strings),
 *              %1..%3 = dummy placeholders that keep the numbering fixed,
 *              %4 = dest, %5 = dstW_reg, %6 = uv_off.
 * Declared clobbers: REG_a, REG_d, REG_S.
 */
#define YSCALEYUV2PACKEDX_END \
        :: "r" (&c->redDither), \
           "m" (dummy), "m" (dummy), "m" (dummy),\
           "r" (dest), "m" (dstW_reg), "m"(uv_off) \
        : "%"REG_a, "%"REG_d, "%"REG_S \
    );

/*
 * Opens an __asm__ volatile( statement and emits the chroma half of the
 * vertical filter for packed output, accurate-rounding variant.
 *
 * Two source lines are consumed per iteration: their samples are interleaved
 * and multiplied by a coefficient pair with pmaddwd, accumulating in 32 bits
 * (mm4/mm5 for U, mm6/mm7 for V). Filter records are APCK_SIZE bytes long,
 * with the second line pointer at APCK_PTR2 and the coefficient pair at
 * APCK_COEF; a NULL first pointer ends the list.
 *
 * Operand %0 is the context base, %6 the U-to-V byte distance. REG_a is
 * zeroed and serves as the pixel index; label "1:" is the outer loop head.
 *
 * On exit the results (>>16, packed to words, plus the VROUNDER bias) are
 * stored to U_TEMP(%0) and V_TEMP(%0); they are also still in %mm4 (U) and
 * %mm6 (V). The statement is left open.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"

/*
 * Luma/alpha half of the accurate-rounding vertical filter. Emits asm text
 * only; it must appear inside an already opened asm statement whose operand
 * %0 is the context base and where REG_a is the pixel index.
 *
 *   offset - string: offset of the filter table relative to %0
 *
 * Two source lines are processed per iteration with pmaddwd and 32-bit
 * accumulators, using the same APCK_* record layout as the chroma pass.
 *
 * On exit: %mm1 = samples 0..3, %mm7 = samples 4..7 (>>16, packed to words,
 * plus the VROUNDER bias); %mm3 and %mm4 are reloaded from U_TEMP(%0) and
 * V_TEMP(%0). Uses local label "2:" and clobbers mm0, mm2, mm5, mm6.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"

/*
 * Full vertical filter prologue (accurate variant): chroma pass followed by
 * the luma pass. Leaves U in %mm3, V in %mm4, Y samples 0..3 in %mm1 and
 * 4..7 in %mm7, with the asm statement still open.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

/*
 * YUV -> RGB conversion for 8 pixels (asm text only, context base in %0).
 *
 * In : %mm3 = U, %mm4 = V (4 words each), %mm1 = Y for pixels 0..3,
 *      %mm7 = Y for pixels 4..7.
 * Out: %mm2 = B, %mm4 = G, %mm5 = R, each as 8 unsigned saturated bytes.
 *      mm0, mm1, mm3, mm6, mm7 are overwritten.
 *
 * Each chroma word is duplicated (punpck?wd reg,reg) so one U/V pair serves
 * two horizontally adjacent pixels.
 */
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

/*
 * Interleave four byte planes into 8 four-byte pixels, store them and close
 * the outer loop (asm text only).
 *
 *   dst, dstw, index - destination base, width operand and pixel index
 *   b, g, r, a       - registers holding 8 bytes of each component; b and r
 *                      are overwritten
 *   q0, q2, q3, t    - scratch registers
 *
 * 32 bytes are stored at dst + index*4, index is advanced by 8, and control
 * jumps back to label "1:" while index < dstw (unsigned compare).
 * WRITEBGR32 adds one expansion level so arguments are macro-expanded before
 * they are stringified.
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

339 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,

340 const int16_t **lumSrc, int lumFilterSize,

341 const int16_t *chrFilter, const int16_t **chrUSrc,

342 const int16_t **chrVSrc,

343 int chrFilterSize, const int16_t **alpSrc,

344 uint8_t *dest, int dstW, int dstY)

345 {

346 x86_reg dummy=0;

347 x86_reg dstW_reg = dstW;

348 x86_reg uv_off = c->uv_offx2;

349

350 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {

351 YSCALEYUV2PACKEDX_ACCURATE

352 YSCALEYUV2RGBX

353 "movq %%mm2, "U_TEMP "(%0) \n\t"

354 "movq %%mm4, "V_TEMP "(%0) \n\t"

355 "movq %%mm5, "Y_TEMP "(%0) \n\t"

356 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)

357 "movq "Y_TEMP "(%0), %%mm5 \n\t"

358 "psraw 3,ドル %%mm1 \n\t"

359 "psraw 3,ドル %%mm7 \n\t"

360 "packuswb %%mm7, %%mm1 \n\t"

361 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

362 YSCALEYUV2PACKEDX_END

363 } else {

364 YSCALEYUV2PACKEDX_ACCURATE

365 YSCALEYUV2RGBX

366 "pcmpeqd %%mm7, %%mm7 \n\t"

367 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

368 YSCALEYUV2PACKEDX_END

369 }

370 }

371

372 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,

373 const int16_t **lumSrc, int lumFilterSize,

374 const int16_t *chrFilter, const int16_t **chrUSrc,

375 const int16_t **chrVSrc,

376 int chrFilterSize, const int16_t **alpSrc,

377 uint8_t *dest, int dstW, int dstY)

378 {

379 x86_reg dummy=0;

380 x86_reg dstW_reg = dstW;

381 x86_reg uv_off = c->uv_offx2;

382

383 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {

384 YSCALEYUV2PACKEDX

385 YSCALEYUV2RGBX

386 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)

387 "psraw 3,ドル %%mm1 \n\t"

388 "psraw 3,ドル %%mm7 \n\t"

389 "packuswb %%mm7, %%mm1 \n\t"

390 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

391 YSCALEYUV2PACKEDX_END

392 } else {

393 YSCALEYUV2PACKEDX

394 YSCALEYUV2RGBX

395 "pcmpeqd %%mm7, %%mm7 \n\t"

396 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

397 YSCALEYUV2PACKEDX_END

398 }

399 }

400

/*
 * Pack 8 pixels to 16 bits each, store them and close the outer loop (asm
 * text only).
 *
 * In: %mm2 = B, %mm4 = G, %mm5 = R (8 bytes each), %mm7 = 0.
 * B and R are masked with bF8 and G with bFC (constants defined elsewhere;
 * presumably the top-5-bit / top-6-bit byte masks, giving a 5-6-5 layout).
 * 16 bytes are stored at dst + index*2, index is advanced by 8, and control
 * jumps back to label "1:" while index < dstw. mm1 and mm3 are scratch.
 * WRITERGB16 adds one expansion level so arguments are macro-expanded before
 * they are stringified.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

429 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,

430 const int16_t **lumSrc, int lumFilterSize,

431 const int16_t *chrFilter, const int16_t **chrUSrc,

432 const int16_t **chrVSrc,

433 int chrFilterSize, const int16_t **alpSrc,

434 uint8_t *dest, int dstW, int dstY)

435 {

436 x86_reg dummy=0;

437 x86_reg dstW_reg = dstW;

438 x86_reg uv_off = c->uv_offx2;

439

440 YSCALEYUV2PACKEDX_ACCURATE

441 YSCALEYUV2RGBX

442 "pxor %%mm7, %%mm7 \n\t"

443 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

444 #ifdef DITHER1XBPP

445 "paddusb "BLUE_DITHER "(%0), %%mm2\n\t"

446 "paddusb "GREEN_DITHER "(%0), %%mm4\n\t"

447 "paddusb "RED_DITHER "(%0), %%mm5\n\t"

448 #endif

449 WRITERGB16(%4, %5, %%REGa)

450 YSCALEYUV2PACKEDX_END

451 }

452

453 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,

454 const int16_t **lumSrc, int lumFilterSize,

455 const int16_t *chrFilter, const int16_t **chrUSrc,

456 const int16_t **chrVSrc,

457 int chrFilterSize, const int16_t **alpSrc,

458 uint8_t *dest, int dstW, int dstY)

459 {

460 x86_reg dummy=0;

461 x86_reg dstW_reg = dstW;

462 x86_reg uv_off = c->uv_offx2;

463

464 YSCALEYUV2PACKEDX

465 YSCALEYUV2RGBX

466 "pxor %%mm7, %%mm7 \n\t"

467 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

468 #ifdef DITHER1XBPP

469 "paddusb "BLUE_DITHER "(%0), %%mm2 \n\t"

470 "paddusb "GREEN_DITHER "(%0), %%mm4 \n\t"

471 "paddusb "RED_DITHER "(%0), %%mm5 \n\t"

472 #endif

473 WRITERGB16(%4, %5, %%REGa)

474 YSCALEYUV2PACKEDX_END

475 }

476

/*
 * Pack 8 pixels to 15-bit colour in 16-bit words, store them and close the
 * outer loop (asm text only).
 *
 * In: %mm2 = B, %mm4 = G, %mm5 = R (8 bytes each), %mm7 = 0.
 * All three components are masked with bF8 (constant defined elsewhere;
 * presumably the top-5-bit byte mask, giving a 5-5-5 layout).
 * 16 bytes are stored at dst + index*2, index is advanced by 8, and control
 * jumps back to label "1:" while index < dstw. mm1 and mm3 are scratch.
 * WRITERGB15 adds one expansion level so arguments are macro-expanded before
 * they are stringified.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

506 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,

507 const int16_t **lumSrc, int lumFilterSize,

508 const int16_t *chrFilter, const int16_t **chrUSrc,

509 const int16_t **chrVSrc,

510 int chrFilterSize, const int16_t **alpSrc,

511 uint8_t *dest, int dstW, int dstY)

512 {

513 x86_reg dummy=0;

514 x86_reg dstW_reg = dstW;

515 x86_reg uv_off = c->uv_offx2;

516

517 YSCALEYUV2PACKEDX_ACCURATE

518 YSCALEYUV2RGBX

519 "pxor %%mm7, %%mm7 \n\t"

520 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

521 #ifdef DITHER1XBPP

522 "paddusb "BLUE_DITHER "(%0), %%mm2\n\t"

523 "paddusb "GREEN_DITHER "(%0), %%mm4\n\t"

524 "paddusb "RED_DITHER "(%0), %%mm5\n\t"

525 #endif

526 WRITERGB15(%4, %5, %%REGa)

527 YSCALEYUV2PACKEDX_END

528 }

529

530 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,

531 const int16_t **lumSrc, int lumFilterSize,

532 const int16_t *chrFilter, const int16_t **chrUSrc,

533 const int16_t **chrVSrc,

534 int chrFilterSize, const int16_t **alpSrc,

535 uint8_t *dest, int dstW, int dstY)

536 {

537 x86_reg dummy=0;

538 x86_reg dstW_reg = dstW;

539 x86_reg uv_off = c->uv_offx2;

540

541 YSCALEYUV2PACKEDX

542 YSCALEYUV2RGBX

543 "pxor %%mm7, %%mm7 \n\t"

544 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

545 #ifdef DITHER1XBPP

546 "paddusb "BLUE_DITHER "(%0), %%mm2 \n\t"

547 "paddusb "GREEN_DITHER "(%0), %%mm4 \n\t"

548 "paddusb "RED_DITHER "(%0), %%mm5 \n\t"

549 #endif

550 WRITERGB15(%4, %5, %%REGa)

551 YSCALEYUV2PACKEDX_END

552 }

553

/*
 * Plain-MMX writer for 8 pixels of 24-bit output (asm text only).
 *
 * In: %mm2 = B, %mm4 = G, %mm5 = R (8 bytes each), %mm7 = 0.
 * The components are first interleaved into four quadwords of two 4-byte
 * pixels, then the padding byte of each pixel is squeezed out with shifts
 * and ORs to form three quadwords (24 bytes), stored at dst, dst+8, dst+16.
 * dst is then advanced by 24 and index by 8; control jumps back to label
 * "1:" while index < dstw. All of mm0..mm7 are overwritten.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

/*
 * MMXEXT writer for 8 pixels of 24-bit output (asm text only).
 *
 * In: %mm2 = B, %mm4 = G, %mm5 = R (8 bytes each). The incoming %mm7 is
 * overwritten with the ff_M24C mask.
 * Each of the three output quadwords is built by replicating the needed
 * words with pshufw, masking them with ff_M24A/B/C (constants defined
 * elsewhere) and ORing the three components together. 24 bytes are stored
 * at dst, dst+8, dst+16; dst is advanced by 24 and index by 8; control jumps
 * back to label "1:" while index < dstw.
 * %mm4 is shifted right by one byte part-way through, so G is not preserved.
 */
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

/* WRITEBGR24 resolves to the pshufw-based writer for the MMXEXT variant and
 * to the plain-MMX writer otherwise. The #undef lets the template redefine
 * it on each expansion. */
#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif

663 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,

664 const int16_t **lumSrc, int lumFilterSize,

665 const int16_t *chrFilter, const int16_t **chrUSrc,

666 const int16_t **chrVSrc,

667 int chrFilterSize, const int16_t **alpSrc,

668 uint8_t *dest, int dstW, int dstY)

669 {

670 x86_reg dummy=0;

671 x86_reg dstW_reg = dstW;

672 x86_reg uv_off = c->uv_offx2;

673

674 YSCALEYUV2PACKEDX_ACCURATE

675 YSCALEYUV2RGBX

676 "pxor %%mm7, %%mm7 \n\t"

677 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize

678 "add %4, %%"REG_c" \n\t"

679 WRITEBGR24(%%REGc, %5, %%REGa)

680 :: "r" (&c->redDither),

681 "m" (dummy), "m" (dummy), "m" (dummy),

682 "r" (dest), "m" (dstW_reg), "m"(uv_off)

683 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S

684 );

685 }

686

687 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,

688 const int16_t **lumSrc, int lumFilterSize,

689 const int16_t *chrFilter, const int16_t **chrUSrc,

690 const int16_t **chrVSrc,

691 int chrFilterSize, const int16_t **alpSrc,

692 uint8_t *dest, int dstW, int dstY)

693 {

694 x86_reg dummy=0;

695 x86_reg dstW_reg = dstW;

696 x86_reg uv_off = c->uv_offx2;

697

698 YSCALEYUV2PACKEDX

699 YSCALEYUV2RGBX

700 "pxor %%mm7, %%mm7 \n\t"

701 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize

702 "add %4, %%"REG_c" \n\t"

703 WRITEBGR24(%%REGc, %5, %%REGa)

704 :: "r" (&c->redDither),

705 "m" (dummy), "m" (dummy), "m" (dummy),

706 "r" (dest), "m" (dstW_reg), "m"(uv_off)

707 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S

708 );

709 }

710

/*
 * Pack 8 luma and 4+4 chroma words into YUYV byte order, store 16 bytes and
 * close the outer loop (asm text only).
 *
 * In: %mm1 = Y for pixels 0..3, %mm7 = Y for pixels 4..7,
 *     %mm3 = U, %mm4 = V (4 words each, already scaled to 8-bit range).
 * 16 bytes are stored at dst + index*2, index is advanced by 8, and control
 * jumps back to label "1:" while index < dstw.
 * WRITEYUY2 adds one expansion level so arguments are macro-expanded before
 * they are stringified.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t" /* interleave U and V bytes */\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t" /* Y/chroma interleave, pixels 0..3 */\
    "punpckhbw %%mm3, %%mm7 \n\t" /* Y/chroma interleave, pixels 4..7 */\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)

728 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,

729 const int16_t **lumSrc, int lumFilterSize,

730 const int16_t *chrFilter, const int16_t **chrUSrc,

731 const int16_t **chrVSrc,

732 int chrFilterSize, const int16_t **alpSrc,

733 uint8_t *dest, int dstW, int dstY)

734 {

735 x86_reg dummy=0;

736 x86_reg dstW_reg = dstW;

737 x86_reg uv_off = c->uv_offx2;

738

739 YSCALEYUV2PACKEDX_ACCURATE

740 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

741 "psraw 3,ドル %%mm3 \n\t"

742 "psraw 3,ドル %%mm4 \n\t"

743 "psraw 3,ドル %%mm1 \n\t"

744 "psraw 3,ドル %%mm7 \n\t"

745 WRITEYUY2(%4, %5, %%REGa)

746 YSCALEYUV2PACKEDX_END

747 }

748

749 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,

750 const int16_t **lumSrc, int lumFilterSize,

751 const int16_t *chrFilter, const int16_t **chrUSrc,

752 const int16_t **chrVSrc,

753 int chrFilterSize, const int16_t **alpSrc,

754 uint8_t *dest, int dstW, int dstY)

755 {

756 x86_reg dummy=0;

757 x86_reg dstW_reg = dstW;

758 x86_reg uv_off = c->uv_offx2;

759

760 YSCALEYUV2PACKEDX

761 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

762 "psraw 3,ドル %%mm3 \n\t"

763 "psraw 3,ドル %%mm4 \n\t"

764 "psraw 3,ドル %%mm1 \n\t"

765 "psraw 3,ドル %%mm7 \n\t"

766 WRITEYUY2(%4, %5, %%REGa)

767 YSCALEYUV2PACKEDX_END

768 }

769

/*
 * Chroma part of the two-line (bilinear) vertical blend, followed by the
 * first half of the colour conversion (asm text only).
 *
 *   index - register used as pixel index; zeroed here
 *   c     - register holding the context base for the *_OFFSET strings
 *
 * Operands %2 and %3 are the two chroma line pointers. V samples are reached
 * by temporarily adding UV_OFF_BYTE(c) to the index. The blend is computed
 * as line1/16 + ((line0 - line1) * coeff >> 16), with the coefficient taken
 * from CHR_MMX_FILTER_OFFSET+8(c). Defines the outer loop label "1:".
 *
 * On exit: %mm2 = U-offset, %mm5 = V-offset (before UB/VR scaling),
 *          %mm3 = ug, %mm4 = vg.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

/*
 * Luma (or alpha) part of the two-line vertical blend (asm text only).
 *
 *   index  - pixel index register
 *   c      - register holding the context base
 *   b1, b2 - registers/operands pointing at the two source lines
 *
 * Computes b2/16 + ((b1 - b2) * coeff >> 16) for 8 samples, with the
 * coefficient taken from LUM_MMX_FILTER_OFFSET+8(c).
 * On exit: %mm1 = samples 0..3, %mm7 = samples 4..7; mm0 and mm6 are
 * overwritten.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

/*
 * Second half of the colour conversion for the two-line path (asm text
 * only); c is the register holding the context base.
 *
 * In : %mm2 = U-offset, %mm5 = V-offset, %mm3 = ug, %mm4 = vg,
 *      %mm1 = Y for pixels 0..3, %mm7 = Y for pixels 4..7.
 * Out: %mm2 = B, %mm4 = G, %mm5 = R, each as 8 unsigned saturated bytes.
 *      mm0, mm1, mm3, mm6, mm7 are overwritten.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

/* Extra expansion level so the arguments are macro-expanded before the
 * REAL_ macro stringifies them. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Complete two-line blend + YUV->RGB conversion for 8 pixels: chroma blend,
 * luma blend from operands %0/%1, then the coefficient stage. Leaves
 * B/G/R bytes in %mm2/%mm4/%mm5 and defines outer loop label "1:". */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

847 /**

848 * vertical bilinear scale YV12 to RGB

849 */

850 static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],

851 const int16_t *ubuf[2], const int16_t *vbuf[2],

852 const int16_t *abuf[2], uint8_t *dest,

853 int dstW, int yalpha, int uvalpha, int y)

854 {

855 const int16_t *buf0 = buf[0], *buf1 = buf[1],

856 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

857

858 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {

859 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];

860 #if ARCH_X86_64

861 __asm__ volatile(

862 YSCALEYUV2RGB(%%r8, %5)

863 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)

864 "psraw 3,ドル %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/

865 "psraw 3,ドル %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/

866 "packuswb %%mm7, %%mm1 \n\t"

867 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

868 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),

869 "a" (&c->redDither),

870 "r" (abuf0), "r" (abuf1)

871 : "%r8"

872 );

873 #else

874 c->u_temp=(intptr_t)abuf0;

875 c->v_temp=(intptr_t)abuf1;

876 __asm__ volatile(

877 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

878 "mov %4, %%"REG_b" \n\t"

879 "push %%"REG_BP" \n\t"

880 YSCALEYUV2RGB(%%REGBP, %5)

881 "push %0 \n\t"

882 "push %1 \n\t"

883 "mov "U_TEMP "(%5), %0 \n\t"

884 "mov "V_TEMP "(%5), %1 \n\t"

885 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)

886 "psraw 3,ドル %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/

887 "psraw 3,ドル %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/

888 "packuswb %%mm7, %%mm1 \n\t"

889 "pop %1 \n\t"

890 "pop %0 \n\t"

891 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

892 "pop %%"REG_BP" \n\t"

893 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

894 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

895 "a" (&c->redDither)

896 );

897 #endif

898 } else {

899 __asm__ volatile(

900 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

901 "mov %4, %%"REG_b" \n\t"

902 "push %%"REG_BP" \n\t"

903 YSCALEYUV2RGB(%%REGBP, %5)

904 "pcmpeqd %%mm7, %%mm7 \n\t"

905 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

906 "pop %%"REG_BP" \n\t"

907 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

908 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

909 "a" (&c->redDither)

910 );

911 }

912 }

913

914 static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],

915 const int16_t *ubuf[2], const int16_t *vbuf[2],

916 const int16_t *abuf[2], uint8_t *dest,

917 int dstW, int yalpha, int uvalpha, int y)

918 {

919 const int16_t *buf0 = buf[0], *buf1 = buf[1],

920 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

921

922 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(

923 __asm__ volatile(

924 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

925 "mov %4, %%"REG_b" \n\t"

926 "push %%"REG_BP" \n\t"

927 YSCALEYUV2RGB(%%REGBP, %5)

928 "pxor %%mm7, %%mm7 \n\t"

929 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)

930 "pop %%"REG_BP" \n\t"

931 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

932 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

933 "a" (&c->redDither)

934 );

935 }

936

937 static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],

938 const int16_t *ubuf[2], const int16_t *vbuf[2],

939 const int16_t *abuf[2], uint8_t *dest,

940 int dstW, int yalpha, int uvalpha, int y)

941 {

942 const int16_t *buf0 = buf[0], *buf1 = buf[1],

943 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

944

945 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(

946 __asm__ volatile(

947 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

948 "mov %4, %%"REG_b" \n\t"

949 "push %%"REG_BP" \n\t"

950 YSCALEYUV2RGB(%%REGBP, %5)

951 "pxor %%mm7, %%mm7 \n\t"

952 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

953 #ifdef DITHER1XBPP

954 "paddusb "BLUE_DITHER "(%5), %%mm2 \n\t"

955 "paddusb "GREEN_DITHER "(%5), %%mm4 \n\t"

956 "paddusb "RED_DITHER "(%5), %%mm5 \n\t"

957 #endif

958 WRITERGB15(%%REGb, 8280(%5), %%REGBP)

959 "pop %%"REG_BP" \n\t"

960 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

961 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

962 "a" (&c->redDither)

963 );

964 }

965

966 static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],

967 const int16_t *ubuf[2], const int16_t *vbuf[2],

968 const int16_t *abuf[2], uint8_t *dest,

969 int dstW, int yalpha, int uvalpha, int y)

970 {

971 const int16_t *buf0 = buf[0], *buf1 = buf[1],

972 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

973

974 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(

975 __asm__ volatile(

976 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

977 "mov %4, %%"REG_b" \n\t"

978 "push %%"REG_BP" \n\t"

979 YSCALEYUV2RGB(%%REGBP, %5)

980 "pxor %%mm7, %%mm7 \n\t"

981 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

982 #ifdef DITHER1XBPP

983 "paddusb "BLUE_DITHER "(%5), %%mm2 \n\t"

984 "paddusb "GREEN_DITHER "(%5), %%mm4 \n\t"

985 "paddusb "RED_DITHER "(%5), %%mm5 \n\t"

986 #endif

987 WRITERGB16(%%REGb, 8280(%5), %%REGBP)

988 "pop %%"REG_BP" \n\t"

989 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

990 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

991 "a" (&c->redDither)

992 );

993 }

994

/*
 * Asm fragment: loop head for the two-line packed (non-RGB) writers.
 *
 * Expects asm operands %0/%1 = the two luma lines and %2/%3 = the two chroma
 * lines (as passed by yuv2yuyv422_2). `index` is the register used as byte
 * index, `c` the operand addressing the context.
 *
 * Before the loop, the words at CHR/LUM_MMX_FILTER_OFFSET+8 in the context are
 * shifted right by 3 in place. Each iteration then leaves:
 *   mm3 / mm4 = weighted mix of the two chroma lines (first / second plane)
 *   mm1 / mm7 = weighted mix of the two luma lines (words 0-3 / 4-7)
 * The loop label "1:" is left open for the following store fragment.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" /* step to the second chroma plane */\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index + uv_off] */\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" /* back to the first plane */\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[index] - uvbuf1[index] */\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[index + uv_off] - uvbuf1[index + uv_off] */\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* ((uvbuf0 - uvbuf1) * coeff) >> 16 */\
    "pmulhw %%mm0, %%mm5 \n\t" /* same for the second plane */\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[index] >> 7 */\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[index + uv_off] >> 7 */\
    "paddw %%mm2, %%mm3 \n\t" /* mm3 = (uvbuf1 >> 7) + scaled difference */\
    "paddw %%mm5, %%mm4 \n\t" /* mm4 = same for the second plane */\
    "movq (%0, "#index", 2), %%mm0 \n\t" /* buf0[index], words 0-3 */\
    "movq (%1, "#index", 2), %%mm1 \n\t" /* buf1[index], words 0-3 */\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /* buf0[index], words 4-7 */\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /* buf1[index], words 4-7 */\
    "psubw %%mm1, %%mm0 \n\t" /* buf0 - buf1 */\
    "psubw %%mm7, %%mm6 \n\t" /* buf0 - buf1 */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* ((buf0 - buf1) * coeff) >> 16 */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* ((buf0 - buf1) * coeff) >> 16 */\
    "psraw $7, %%mm1 \n\t" /* buf1 >> 7 */\
    "psraw $7, %%mm7 \n\t" /* buf1 >> 7 */\
    "paddw %%mm0, %%mm1 \n\t" /* mm1 = (buf1 >> 7) + scaled difference */\
    "paddw %%mm6, %%mm7 \n\t" /* mm7 = (buf1 >> 7) + scaled difference */

/* Indirection so that macro arguments are expanded before REAL_* stringifies them. */
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)

1034

1035 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],

1036 const int16_t *ubuf[2], const int16_t *vbuf[2],

1037 const int16_t *abuf[2], uint8_t *dest,

1038 int dstW, int yalpha, int uvalpha, int y)

1039 {

1040 const int16_t *buf0 = buf[0], *buf1 = buf[1],

1041 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

1042

1043 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(

1044 __asm__ volatile(

1045 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1046 "mov %4, %%"REG_b" \n\t"

1047 "push %%"REG_BP" \n\t"

1048 YSCALEYUV2PACKED(%%REGBP, %5)

1049 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)

1050 "pop %%"REG_BP" \n\t"

1051 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1052 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1053 "a" (&c->redDither)

1054 );

1055 }

1056

/*
 * Asm fragment: loop head for the single-line YUV->RGB writers that read
 * chroma from one line only.
 *
 * Expects asm operands %0 = luma line and %2 = chroma line. `index` is the
 * register used as byte index, `c` the operand addressing the context.
 * Each iteration leaves packed bytes in mm2 (B), mm4 (G) and mm5 (R); the
 * loop label "1:" is left open for the following store fragment.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" /* step to the second chroma plane */\
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" /* back to the first plane */\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[index] >> 4 */\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[index + uv_off] >> 4 */\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[index], words 0-3 */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[index], words 4-7 */\
    "psraw $4, %%mm1 \n\t" /* buf0 >> 4 */\
    "psraw $4, %%mm7 \n\t" /* buf0 >> 4 */\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

/* Indirection so that macro arguments are expanded before REAL_* stringifies them. */
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)

1107

// do vertical chrominance interpolation
/*
 * Asm fragment: loop head for the single-line YUV->RGB writers that combine
 * the two chroma lines (sum of both, logical shift right by 5).
 *
 * Expects asm operands %0 = luma line and %2/%3 = the two chroma lines.
 * `index` is the register used as byte index, `c` the operand addressing the
 * context. Each iteration leaves packed bytes in mm2 (B), mm4 (G) and mm5 (R);
 * the loop label "1:" is left open for the following store fragment.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" /* step to the second chroma plane */\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index + uv_off] */\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" /* back to the first plane */\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[index] + uvbuf1[index] */\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[index + uv_off] + uvbuf1[index + uv_off] */\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[index], words 0-3 */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[index], words 4-7 */\
    "psraw $4, %%mm1 \n\t" /* buf0 >> 4 */\
    "psraw $4, %%mm7 \n\t" /* buf0 >> 4 */\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

/* Indirection so that macro arguments are expanded before REAL_* stringifies them. */
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)

1163

/*
 * Asm fragment: loads eight 16-bit alpha samples from the line addressed by
 * asm operand %1, shifts them right by 7 (arithmetic) and packs them to
 * unsigned bytes in mm7. mm1 is used as scratch.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
/* Indirection so that the macro argument is expanded before REAL_* stringifies it. */
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

1171

1172 /**

1173 * YV12 to RGB without scaling or interpolating

1174 */

1175 static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,

1176 const int16_t *ubuf[2], const int16_t *vbuf[2],

1177 const int16_t *abuf0, uint8_t *dest,

1178 int dstW, int uvalpha, int y)

1179 {

1180 const int16_t *ubuf0 = ubuf[0];

1181 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

1182

1183 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster

1184 const int16_t *ubuf1 = ubuf[0];

1185 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {

1186 __asm__ volatile(

1187 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1188 "mov %4, %%"REG_b" \n\t"

1189 "push %%"REG_BP" \n\t"

1190 YSCALEYUV2RGB1(%%REGBP, %5)

1191 YSCALEYUV2RGB1_ALPHA(%%REGBP)

1192 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

1193 "pop %%"REG_BP" \n\t"

1194 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1195 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1196 "a" (&c->redDither)

1197 );

1198 } else {

1199 __asm__ volatile(

1200 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1201 "mov %4, %%"REG_b" \n\t"

1202 "push %%"REG_BP" \n\t"

1203 YSCALEYUV2RGB1(%%REGBP, %5)

1204 "pcmpeqd %%mm7, %%mm7 \n\t"

1205 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

1206 "pop %%"REG_BP" \n\t"

1207 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1208 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1209 "a" (&c->redDither)

1210 );

1211 }

1212 } else {

1213 const int16_t *ubuf1 = ubuf[1];

1214 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {

1215 __asm__ volatile(

1216 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1217 "mov %4, %%"REG_b" \n\t"

1218 "push %%"REG_BP" \n\t"

1219 YSCALEYUV2RGB1b(%%REGBP, %5)

1220 YSCALEYUV2RGB1_ALPHA(%%REGBP)

1221 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

1222 "pop %%"REG_BP" \n\t"

1223 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1224 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1225 "a" (&c->redDither)

1226 );

1227 } else {

1228 __asm__ volatile(

1229 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1230 "mov %4, %%"REG_b" \n\t"

1231 "push %%"REG_BP" \n\t"

1232 YSCALEYUV2RGB1b(%%REGBP, %5)

1233 "pcmpeqd %%mm7, %%mm7 \n\t"

1234 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

1235 "pop %%"REG_BP" \n\t"

1236 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1237 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1238 "a" (&c->redDither)

1239 );

1240 }

1241 }

1242 }

1243

1244 static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,

1245 const int16_t *ubuf[2], const int16_t *vbuf[2],

1246 const int16_t *abuf0, uint8_t *dest,

1247 int dstW, int uvalpha, int y)

1248 {

1249 const int16_t *ubuf0 = ubuf[0];

1250 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

1251

1252 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster

1253 const int16_t *ubuf1 = ubuf[0];

1254 __asm__ volatile(

1255 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1256 "mov %4, %%"REG_b" \n\t"

1257 "push %%"REG_BP" \n\t"

1258 YSCALEYUV2RGB1(%%REGBP, %5)

1259 "pxor %%mm7, %%mm7 \n\t"

1260 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)

1261 "pop %%"REG_BP" \n\t"

1262 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1263 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1264 "a" (&c->redDither)

1265 );

1266 } else {

1267 const int16_t *ubuf1 = ubuf[1];

1268 __asm__ volatile(

1269 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1270 "mov %4, %%"REG_b" \n\t"

1271 "push %%"REG_BP" \n\t"

1272 YSCALEYUV2RGB1b(%%REGBP, %5)

1273 "pxor %%mm7, %%mm7 \n\t"

1274 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)

1275 "pop %%"REG_BP" \n\t"

1276 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1277 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1278 "a" (&c->redDither)

1279 );

1280 }

1281 }

1282

1283 static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,

1284 const int16_t *ubuf[2], const int16_t *vbuf[2],

1285 const int16_t *abuf0, uint8_t *dest,

1286 int dstW, int uvalpha, int y)

1287 {

1288 const int16_t *ubuf0 = ubuf[0];

1289 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

1290

1291 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster

1292 const int16_t *ubuf1 = ubuf[0];

1293 __asm__ volatile(

1294 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1295 "mov %4, %%"REG_b" \n\t"

1296 "push %%"REG_BP" \n\t"

1297 YSCALEYUV2RGB1(%%REGBP, %5)

1298 "pxor %%mm7, %%mm7 \n\t"

1299 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

1300 #ifdef DITHER1XBPP

1301 "paddusb "BLUE_DITHER "(%5), %%mm2 \n\t"

1302 "paddusb "GREEN_DITHER "(%5), %%mm4 \n\t"

1303 "paddusb "RED_DITHER "(%5), %%mm5 \n\t"

1304 #endif

1305 WRITERGB15(%%REGb, 8280(%5), %%REGBP)

1306 "pop %%"REG_BP" \n\t"

1307 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1308 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1309 "a" (&c->redDither)

1310 );

1311 } else {

1312 const int16_t *ubuf1 = ubuf[1];

1313 __asm__ volatile(

1314 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1315 "mov %4, %%"REG_b" \n\t"

1316 "push %%"REG_BP" \n\t"

1317 YSCALEYUV2RGB1b(%%REGBP, %5)

1318 "pxor %%mm7, %%mm7 \n\t"

1319 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

1320 #ifdef DITHER1XBPP

1321 "paddusb "BLUE_DITHER "(%5), %%mm2 \n\t"

1322 "paddusb "GREEN_DITHER "(%5), %%mm4 \n\t"

1323 "paddusb "RED_DITHER "(%5), %%mm5 \n\t"

1324 #endif

1325 WRITERGB15(%%REGb, 8280(%5), %%REGBP)

1326 "pop %%"REG_BP" \n\t"

1327 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1328 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1329 "a" (&c->redDither)

1330 );

1331 }

1332 }

1333

1334 static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,

1335 const int16_t *ubuf[2], const int16_t *vbuf[2],

1336 const int16_t *abuf0, uint8_t *dest,

1337 int dstW, int uvalpha, int y)

1338 {

1339 const int16_t *ubuf0 = ubuf[0];

1340 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

1341

1342 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster

1343 const int16_t *ubuf1 = ubuf[0];

1344 __asm__ volatile(

1345 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1346 "mov %4, %%"REG_b" \n\t"

1347 "push %%"REG_BP" \n\t"

1348 YSCALEYUV2RGB1(%%REGBP, %5)

1349 "pxor %%mm7, %%mm7 \n\t"

1350 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

1351 #ifdef DITHER1XBPP

1352 "paddusb "BLUE_DITHER "(%5), %%mm2 \n\t"

1353 "paddusb "GREEN_DITHER "(%5), %%mm4 \n\t"

1354 "paddusb "RED_DITHER "(%5), %%mm5 \n\t"

1355 #endif

1356 WRITERGB16(%%REGb, 8280(%5), %%REGBP)

1357 "pop %%"REG_BP" \n\t"

1358 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1359 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1360 "a" (&c->redDither)

1361 );

1362 } else {

1363 const int16_t *ubuf1 = ubuf[1];

1364 __asm__ volatile(

1365 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1366 "mov %4, %%"REG_b" \n\t"

1367 "push %%"REG_BP" \n\t"

1368 YSCALEYUV2RGB1b(%%REGBP, %5)

1369 "pxor %%mm7, %%mm7 \n\t"

1370 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

1371 #ifdef DITHER1XBPP

1372 "paddusb "BLUE_DITHER "(%5), %%mm2 \n\t"

1373 "paddusb "GREEN_DITHER "(%5), %%mm4 \n\t"

1374 "paddusb "RED_DITHER "(%5), %%mm5 \n\t"

1375 #endif

1376 WRITERGB16(%%REGb, 8280(%5), %%REGBP)

1377 "pop %%"REG_BP" \n\t"

1378 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1379 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1380 "a" (&c->redDither)

1381 );

1382 }

1383 }

1384

/*
 * Asm fragment: loop head for the single-line packed (non-RGB) writer that
 * reads chroma from one line only.
 *
 * Expects asm operands %0 = luma line and %2 = chroma line. Each iteration
 * leaves the samples, shifted right by 7 (arithmetic), in
 *   mm3 / mm4 = chroma (first / second plane)
 *   mm1 / mm7 = luma (words 0-3 / 4-7)
 * The loop label "1:" is left open for the following store fragment.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" /* step to the second chroma plane */\
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" /* back to the first plane */\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[index], words 0-3 */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[index], words 4-7 */\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

/* Indirection so that macro arguments are expanded before REAL_* stringifies them. */
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

1401

/*
 * Asm fragment: loop head for the single-line packed (non-RGB) writer that
 * combines the two chroma lines (sum of both, logical shift right by 8).
 *
 * Expects asm operands %0 = luma line and %2/%3 = the two chroma lines.
 * Each iteration leaves
 *   mm3 / mm4 = combined chroma (first / second plane)
 *   mm1 / mm7 = luma >> 7 (words 0-3 / 4-7)
 * The loop label "1:" is left open for the following store fragment.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[index] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[index] */\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" /* step to the second chroma plane */\
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[index + uv_off] */\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[index + uv_off] */\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" /* back to the first plane */\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[index] + uvbuf1[index] */\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[index + uv_off] + uvbuf1[index + uv_off] */\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[index], words 0-3 */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[index], words 4-7 */\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
/* Indirection so that macro arguments are expanded before REAL_* stringifies them. */
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)

1421

1422 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,

1423 const int16_t *ubuf[2], const int16_t *vbuf[2],

1424 const int16_t *abuf0, uint8_t *dest,

1425 int dstW, int uvalpha, int y)

1426 {

1427 const int16_t *ubuf0 = ubuf[0];

1428 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

1429

1430 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster

1431 const int16_t *ubuf1 = ubuf[0];

1432 __asm__ volatile(

1433 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1434 "mov %4, %%"REG_b" \n\t"

1435 "push %%"REG_BP" \n\t"

1436 YSCALEYUV2PACKED1(%%REGBP, %5)

1437 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)

1438 "pop %%"REG_BP" \n\t"

1439 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1440 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1441 "a" (&c->redDither)

1442 );

1443 } else {

1444 const int16_t *ubuf1 = ubuf[1];

1445 __asm__ volatile(

1446 "mov %%"REG_b", "ESP_OFFSET "(%5) \n\t"

1447 "mov %4, %%"REG_b" \n\t"

1448 "push %%"REG_BP" \n\t"

1449 YSCALEYUV2PACKED1b(%%REGBP, %5)

1450 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)

1451 "pop %%"REG_BP" \n\t"

1452 "mov "ESP_OFFSET "(%5), %%"REG_b" \n\t"

1453 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),

1454 "a" (&c->redDither)

1455 );

1456 }

1457 }

1458

1459 #if COMPILE_TEMPLATE_MMXEXT

1460 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,

1461 int dstWidth, const uint8_t *src,

1462 int srcW, int xInc)

1463 {

1464 int32_t *filterPos = c->hLumFilterPos;

1465 int16_t *filter = c->hLumFilter;

1466 void *mmxextFilterCode = c->lumMmxextFilterCode;

1467 int i;

1468 #if defined(PIC)

1469 uint64_t ebxsave;

1470 #endif

1471 #if ARCH_X86_64

1472 uint64_t retsave;

1473 #endif

1474

1475 __asm__ volatile(

1476 #if defined(PIC)

1477 "mov %%"REG_b", %5 \n\t"

1478 #if ARCH_X86_64

1479 "mov -8(%%rsp), %%"REG_a" \n\t"

1480 "mov %%"REG_a", %6 \n\t"

1481 #endif

1482 #else

1483 #if ARCH_X86_64

1484 "mov -8(%%rsp), %%"REG_a" \n\t"

1485 "mov %%"REG_a", %5 \n\t"

1486 #endif

1487 #endif

1488 "pxor %%mm7, %%mm7 \n\t"

1489 "mov %0, %%"REG_c" \n\t"

1490 "mov %1, %%"REG_D" \n\t"

1491 "mov %2, %%"REG_d" \n\t"

1492 "mov %3, %%"REG_b" \n\t"

1493 "xor %%"REG_a", %%"REG_a" \n\t" // i

1494 PREFETCH " (%%"REG_c") \n\t"

1495 PREFETCH " 32(%%"REG_c") \n\t"

1496 PREFETCH " 64(%%"REG_c") \n\t"

1497

1498 #if ARCH_X86_64

1499 #define CALL_MMXEXT_FILTER_CODE \

1500 "movl (%%"REG_b"), %%esi \n\t"\

1501 "call *%4 \n\t"\

1502 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\

1503 "add %%"REG_S", %%"REG_c" \n\t"\

1504 "add %%"REG_a", %%"REG_D" \n\t"\

1505 "xor %%"REG_a", %%"REG_a" \n\t"\

1506

1507 #else

1508 #define CALL_MMXEXT_FILTER_CODE \

1509 "movl (%%"REG_b"), %%esi \n\t"\

1510 "call *%4 \n\t"\

1511 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\

1512 "add %%"REG_a", %%"REG_D" \n\t"\

1513 "xor %%"REG_a", %%"REG_a" \n\t"\

1514

1515 #endif /* ARCH_X86_64 */

1516

1517 CALL_MMXEXT_FILTER_CODE

1518 CALL_MMXEXT_FILTER_CODE

1519 CALL_MMXEXT_FILTER_CODE

1520 CALL_MMXEXT_FILTER_CODE

1521 CALL_MMXEXT_FILTER_CODE

1522 CALL_MMXEXT_FILTER_CODE

1523 CALL_MMXEXT_FILTER_CODE

1524 CALL_MMXEXT_FILTER_CODE

1525

1526 #if defined(PIC)

1527 "mov %5, %%"REG_b" \n\t"

1528 #if ARCH_X86_64

1529 "mov %6, %%"REG_a" \n\t"

1530 "mov %%"REG_a", -8(%%rsp) \n\t"

1531 #endif

1532 #else

1533 #if ARCH_X86_64

1534 "mov %5, %%"REG_a" \n\t"

1535 "mov %%"REG_a", -8(%%rsp) \n\t"

1536 #endif

1537 #endif

1538 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),

1539 "m" (mmxextFilterCode)

1540 #if defined(PIC)

1541 ,"m" (ebxsave)

1542 #endif

1543 #if ARCH_X86_64

1544 ,"m"(retsave)

1545 #endif

1546 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D

1547 #if !defined(PIC)

1548 ,"%"REG_b

1549 #endif

1550 );

1551

1552 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)

1553 dst[i] = src[srcW-1]*128;

1554 }

1555

1556 static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,

1557 int dstWidth, const uint8_t *src1,

1558 const uint8_t *src2, int srcW, int xInc)

1559 {

1560 int32_t *filterPos = c->hChrFilterPos;

1561 int16_t *filter = c->hChrFilter;

1562 void *mmxextFilterCode = c->chrMmxextFilterCode;

1563 int i;

1564 #if defined(PIC)

1565 DECLARE_ALIGNED(8, uint64_t, ebxsave);

1566 #endif

1567 #if ARCH_X86_64

1568 DECLARE_ALIGNED(8, uint64_t, retsave);

1569 #endif

1570

1571 __asm__ volatile(

1572 #if defined(PIC)

1573 "mov %%"REG_b", %7 \n\t"

1574 #if ARCH_X86_64

1575 "mov -8(%%rsp), %%"REG_a" \n\t"

1576 "mov %%"REG_a", %8 \n\t"

1577 #endif

1578 #else

1579 #if ARCH_X86_64

1580 "mov -8(%%rsp), %%"REG_a" \n\t"

1581 "mov %%"REG_a", %7 \n\t"

1582 #endif

1583 #endif

1584 "pxor %%mm7, %%mm7 \n\t"

1585 "mov %0, %%"REG_c" \n\t"

1586 "mov %1, %%"REG_D" \n\t"

1587 "mov %2, %%"REG_d" \n\t"

1588 "mov %3, %%"REG_b" \n\t"

1589 "xor %%"REG_a", %%"REG_a" \n\t" // i

1590 PREFETCH " (%%"REG_c") \n\t"

1591 PREFETCH " 32(%%"REG_c") \n\t"

1592 PREFETCH " 64(%%"REG_c") \n\t"

1593

1594 CALL_MMXEXT_FILTER_CODE

1595 CALL_MMXEXT_FILTER_CODE

1596 CALL_MMXEXT_FILTER_CODE

1597 CALL_MMXEXT_FILTER_CODE

1598 "xor %%"REG_a", %%"REG_a" \n\t" // i

1599 "mov %5, %%"REG_c" \n\t" // src

1600 "mov %6, %%"REG_D" \n\t" // buf2

1601 PREFETCH " (%%"REG_c") \n\t"

1602 PREFETCH " 32(%%"REG_c") \n\t"

1603 PREFETCH " 64(%%"REG_c") \n\t"

1604

1605 CALL_MMXEXT_FILTER_CODE

1606 CALL_MMXEXT_FILTER_CODE

1607 CALL_MMXEXT_FILTER_CODE

1608 CALL_MMXEXT_FILTER_CODE

1609

1610 #if defined(PIC)

1611 "mov %7, %%"REG_b" \n\t"

1612 #if ARCH_X86_64

1613 "mov %8, %%"REG_a" \n\t"

1614 "mov %%"REG_a", -8(%%rsp) \n\t"

1615 #endif

1616 #else

1617 #if ARCH_X86_64

1618 "mov %7, %%"REG_a" \n\t"

1619 "mov %%"REG_a", -8(%%rsp) \n\t"

1620 #endif

1621 #endif

1622 :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),

1623 "m" (mmxextFilterCode), "m" (src2), "m"(dst2)

1624 #if defined(PIC)

1625 ,"m" (ebxsave)

1626 #endif

1627 #if ARCH_X86_64

1628 ,"m"(retsave)

1629 #endif

1630 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D

1631 #if !defined(PIC)

1632 ,"%"REG_b

1633 #endif

1634 );

1635

1636 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {

1637 dst1[i] = src1[srcW-1]*128;

1638 dst2[i] = src2[srcW-1]*128;

1639 }

1640 }

1641 #endif /* COMPILE_TEMPLATE_MMXEXT */

1642

1643 static av_cold void RENAME(sws_init_swScale)(SwsContext *c)

1644 {

1645 enum AVPixelFormat dstFormat = c->dstFormat;

1646

1647 c->use_mmx_vfilter= 0;

1648 if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12

1649 && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {

1650 if (c->flags & SWS_ACCURATE_RND) {

1651 if (!(c->flags & SWS_FULL_CHR_H_INT)) {

1652 switch (c->dstFormat) {

1653 case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;

1654 case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;

1655 case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;

1656 case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;

1657 case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;

1658 default: break;

1659 }

1660 }

1661 } else {

1662 c->use_mmx_vfilter= 1;

1663 c->yuv2planeX = RENAME(yuv2yuvX );

1664 if (!(c->flags & SWS_FULL_CHR_H_INT)) {

1665 switch (c->dstFormat) {

1666 case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;

1667 case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;

1668 case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;

1669 case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;

1670 case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;

1671 default: break;

1672 }

1673 }

1674 }

1675 if (!(c->flags & SWS_FULL_CHR_H_INT)) {

1676 switch (c->dstFormat) {

1677 case AV_PIX_FMT_RGB32:

1678 c->yuv2packed1 = RENAME(yuv2rgb32_1);

1679 c->yuv2packed2 = RENAME(yuv2rgb32_2);

1680 break;

1681 case AV_PIX_FMT_BGR24:

1682 c->yuv2packed1 = RENAME(yuv2bgr24_1);

1683 c->yuv2packed2 = RENAME(yuv2bgr24_2);

1684 break;

1685 case AV_PIX_FMT_RGB555:

1686 c->yuv2packed1 = RENAME(yuv2rgb555_1);

1687 c->yuv2packed2 = RENAME(yuv2rgb555_2);

1688 break;

1689 case AV_PIX_FMT_RGB565:

1690 c->yuv2packed1 = RENAME(yuv2rgb565_1);

1691 c->yuv2packed2 = RENAME(yuv2rgb565_2);

1692 break;

1693 case AV_PIX_FMT_YUYV422:

1694 c->yuv2packed1 = RENAME(yuv2yuyv422_1);

1695 c->yuv2packed2 = RENAME(yuv2yuyv422_2);

1696 break;

1697 default:

1698 break;

1699 }

1700 }

1701 }

1702

1703 if (c->srcBpc == 8 && c->dstBpc <= 14) {

1704 // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).

1705 #if COMPILE_TEMPLATE_MMXEXT

1706 if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {

1707 c->hyscale_fast = RENAME(hyscale_fast);

1708 c->hcscale_fast = RENAME(hcscale_fast);

1709 } else {

1710 #endif /* COMPILE_TEMPLATE_MMXEXT */

1711 c->hyscale_fast = NULL;

1712 c->hcscale_fast = NULL;

1713 #if COMPILE_TEMPLATE_MMXEXT

1714 }

1715 #endif /* COMPILE_TEMPLATE_MMXEXT */

1716 }

1717 }

Generated on Wed Jul 10 2013 23:48:17 for FFmpeg by doxygen 1.8.2