/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "dsputil_alpha.h"
#include "asm.h"

void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
                                 int line_size);
void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
                                 int line_size);

#if 0
/* These functions were the base for the optimized assembler routines,
   and remain here for documentation purposes. */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i = 8;
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--i);
}
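
/* A minimal illustrative sketch (not from the original file): the clamp
 * above boils down to this per-quad helper.  zap(-1, 0xaa) zeroes bytes
 * 1, 3, 5 and 7 of an all-ones quadword, yielding 0x00ff00ff00ff00ff, so
 * maxsw4()/minsw4() clamp each signed 16-bit lane to [0, 255] before
 * pkwb() packs the four words down to four bytes.  clamp4_to_u8 is a
 * hypothetical name used only for illustration. */
static inline uint64_t clamp4_to_u8(uint64_t four_shorts)
{
    uint64_t clampmask = zap(-1, 0xaa);           /* 0x00ff00ff00ff00ff    */
    four_shorts = maxsw4(four_shorts, 0);         /* per-word max with 0   */
    four_shorts = minsw4(four_shorts, clampmask); /* per-word min with 255 */
    return pkwb(four_shorts);                     /* pack words to bytes   */
}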

void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-). */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1; /* 0x8000800080008000 */

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        pix0 = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw). */
        signs0 = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4. */
        pix1 = unpkbw(ldl(pixels + 4));
        signs1 = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
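
/* A minimal illustrative sketch (not from the original file): the sign
 * trick above is a general "MMX paddw" emulation.  Clearing bit 15 of
 * each 16-bit lane before the 64-bit add keeps carries from crossing lane
 * boundaries, and XOR-ing the sign bits back in is addition modulo 2^16
 * for that top bit.  The helper name paddw4 is hypothetical; the code
 * above can skip masking the second operand because unpkbw() only yields
 * words in the range 0..255. */
static inline uint64_t paddw4(uint64_t a, uint64_t b)
{
    uint64_t signmask = zap(-1, 0x33);
    signmask ^= signmask >> 1;           /* 0x8000800080008000 */
    return ((a & ~signmask) + (b & ~signmask)) ^ ((a ^ b) & signmask);
}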
#endif

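/* clear_blocks_axp() zeroes six 8x8 blocks of DCT coefficients in one go.
 * With 16-bit DCTELEMs that is sizeof(DCTELEM) * 6 * 64 = 768 bytes, i.e.
 * 96 quadwords, written 8 quadwords (64 bytes) per loop iteration, so the
 * loop runs 12 times. */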
static void clear_blocks_axp(DCTELEM *blocks) {
    uint64_t *p = (uint64_t *) blocks;
    int n = sizeof(DCTELEM) * 6 * 64;

    do {
        p[0] = 0;
        p[1] = 0;
        p[2] = 0;
        p[3] = 0;
        p[4] = 0;
        p[5] = 0;
        p[6] = 0;
        p[7] = 0;
        p += 8;
        n -= 8 * 8;
    } while (n);
}

static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
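
/* A minimal reference sketch (not from the original file): both helpers
 * above compute a per-byte mean of two quadwords without letting carries
 * cross byte lanes.  avg2() rounds up ((a + b + 1) >> 1 per byte, via
 * a + b = (a | b) + (a & b)); avg2_no_rnd() rounds down ((a + b) >> 1,
 * via a + b = 2 * (a & b) + (a ^ b)).  The mask 0xfe keeps the shifted
 * XOR from borrowing into the neighbouring byte.  byte_avg_ref is a
 * hypothetical helper, useful only for checking the identities. */
#if 0
static inline uint64_t byte_avg_ref(uint64_t a, uint64_t b, int round_up)
{
    uint64_t r = 0;
    int i;
    for (i = 0; i < 8; i++) {
        unsigned ba = (a >> (8 * i)) & 0xff;
        unsigned bb = (b >> (8 * i)) & 0xff;
        r |= (uint64_t) ((ba + bb + round_up) >> 1) << (8 * i);
    }
    return r; /* matches avg2() with round_up = 1, avg2_no_rnd() with 0 */
}
#endif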

#if 0
/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration. */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif
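
/* Worked example of the split used by avg4() and the XY2 macros below, for
 * one byte lane with the rounding constant 0x02: averaging 3, 5, 6 and 9
 * gives (3 + 5 + 6 + 9 + 2) >> 2 = 6.  The high parts (li >> 2) sum to
 * 0 + 1 + 1 + 2 = 4 and cannot overflow the byte; the low two-bit parts
 * 3 + 1 + 2 + 1 plus the rounder 2 give 9, and (9 >> 2) & 0x03 = 2, so the
 * recombined result is 4 + 2 = 6. */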

#define OP(LOAD, STORE)                         \
    do {                                        \
        STORE(LOAD(pixels), block);             \
        pixels += line_size;                    \
        block += line_size;                     \
    } while (--h)

#define OP_X2(LOAD, STORE)                                      \
    do {                                                        \
        uint64_t pix1, pix2;                                    \
                                                                \
        pix1 = LOAD(pixels);                                    \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
        STORE(AVG2(pix1, pix2), block);                         \
        pixels += line_size;                                    \
        block += line_size;                                     \
    } while (--h)

#define OP_Y2(LOAD, STORE)                      \
    do {                                        \
        uint64_t pix = LOAD(pixels);            \
        do {                                    \
            uint64_t next_pix;                  \
                                                \
            pixels += line_size;                \
            next_pix = LOAD(pixels);            \
            STORE(AVG2(pix, next_pix), block);  \
            block += line_size;                 \
            pix = next_pix;                     \
        } while (--h);                          \
    } while (0)

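/* OP_XY2 below does half-pel interpolation in both directions: each output
 * byte is the average of four neighbouring input bytes, rounded according
 * to AVG4_ROUNDER.  It follows the avg4() decomposition above, but the
 * per-row partial sums pix_l (low two bits) and pix_h (high bits, already
 * shifted) are computed once per source row and carried over to the next
 * iteration, so every input row is loaded and split only once. */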
#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t pix1  = LOAD(pixels);                                      \
        uint64_t pix2  = pix1 >> 8 | ((uint64_t) pixels[8] << 56);          \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
                       + (pix2 & BYTE_VEC(0x03));                           \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
                                                                            \
        do {                                                                \
            uint64_t npix1, npix2;                                          \
            uint64_t npix_l, npix_h;                                        \
            uint64_t avg;                                                   \
                                                                            \
            pixels += line_size;                                            \
            npix1  = LOAD(pixels);                                          \
            npix2  = npix1 >> 8 | ((uint64_t) pixels[8] << 56);             \
            npix_l = (npix1 & BYTE_VEC(0x03))                               \
                   + (npix2 & BYTE_VEC(0x03));                              \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
            avg    = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
                   + pix_h + npix_h;                                        \
            STORE(avg, block);                                              \
                                                                            \
            block += line_size;                                             \
            pix_l = npix_l;                                                 \
            pix_h = npix_h;                                                 \
        } while (--h);                                                      \
    } while (0)

#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *av_restrict block, const uint8_t *av_restrict pixels,     \
         int line_size, int h)                                              \
{                                                                           \
    if ((size_t) pixels & 0x7) {                                            \
        OPKIND(uldq, STORE);                                                \
    } else {                                                                \
        OPKIND(ldq, STORE);                                                 \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *av_restrict block, const uint8_t *av_restrict pixels,     \
         int line_size, int h)                                              \
{                                                                           \
    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}

#define PIXOP(OPNAME, STORE)                    \
    MAKE_OP(OPNAME,     , OP,     STORE)        \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
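
/* Each PIXOP(OPNAME, STORE) expansion generates eight functions through
 * MAKE_OP: OPNAME_pixels_axp / OPNAME_pixels16_axp plus the _x2, _y2 and
 * _xy2 variants of both, each choosing uldq() or ldq() at run time based
 * on the source alignment. */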

/* Rounding primitives. */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg, STORE);

/* Non-rounding primitives. */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg_no_rnd, STORE);

static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 int line_size, int h)
{
    put_pixels_axp_asm(block,     pixels,     line_size, h);
    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
}
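
/* ff_dsputil_init_alpha() wires the routines above, plus the hand-written
 * assembler versions declared in dsputil_alpha.h, into the DSPContext
 * function-pointer tables.  The MVI routines are installed only when
 * amask(AMASK_MVI) returns 0, i.e. when the motion-video extension is
 * actually present on the running CPU. */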

av_cold void ff_dsputil_init_alpha(DSPContext *c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
        c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
        c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;

        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;

        c->avg_pixels_tab[0][0] = avg_pixels16_axp;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;

        c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp;
        c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp;
        c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp;
        c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp;

        c->put_pixels_tab[1][0] = put_pixels_axp_asm;
        c->put_pixels_tab[1][1] = put_pixels_x2_axp;
        c->put_pixels_tab[1][2] = put_pixels_y2_axp;
        c->put_pixels_tab[1][3] = put_pixels_xy2_axp;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;

        c->avg_pixels_tab[1][0] = avg_pixels_axp;
        c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
        c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
        c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;

        c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp;
        c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp;
        c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp;
        c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp;

        c->clear_blocks = clear_blocks_axp;
    }

    /* amask clears all bits that correspond to present features. */
    if (amask(AMASK_MVI) == 0) {
        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;

        if (!high_bit_depth)
            c->get_pixels = get_pixels_mvi;
        c->diff_pixels = diff_pixels_mvi;
        c->sad[0] = pix_abs16x16_mvi_asm;
        c->sad[1] = pix_abs8x8_mvi;
        c->pix_abs[0][0] = pix_abs16x16_mvi_asm;
        c->pix_abs[1][0] = pix_abs8x8_mvi;
        c->pix_abs[0][1] = pix_abs16x16_x2_mvi;
        c->pix_abs[0][2] = pix_abs16x16_y2_mvi;
        c->pix_abs[0][3] = pix_abs16x16_xy2_mvi;
    }

    put_pixels_clamped_axp_p = c->put_pixels_clamped;
    add_pixels_clamped_axp_p = c->add_pixels_clamped;

    if (!avctx->lowres && !high_bit_depth &&
        (avctx->idct_algo == FF_IDCT_AUTO ||
         avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) {
        c->idct_put = ff_simple_idct_put_axp;
        c->idct_add = ff_simple_idct_add_axp;
        c->idct     = ff_simple_idct_axp;
    }
}