FFmpeg: libswscale/ppc/swscale_altivec.c Source File

FFmpeg

[フレーム]

swscale_altivec.c

Go to the documentation of this file.

1 /*

2 * AltiVec-enhanced yuv2yuvX

3 *

5 * based on the equivalent C code in swscale.c

6 *

7 * This file is part of FFmpeg.

8 *

9 * FFmpeg is free software; you can redistribute it and/or

10 * modify it under the terms of the GNU Lesser General Public

11 * License as published by the Free Software Foundation; either

12 * version 2.1 of the License, or (at your option) any later version.

13 *

14 * FFmpeg is distributed in the hope that it will be useful,

15 * but WITHOUT ANY WARRANTY; without even the implied warranty of

16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

17 * Lesser General Public License for more details.

18 *

19 * You should have received a copy of the GNU Lesser General Public

20 * License along with FFmpeg; if not, write to the Free Software

21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

22 */

24 #include <inttypes.h>

26 #include "config.h"

27 #include "libswscale/swscale.h"

28 #include "libswscale/swscale_internal.h"

29 #include "libavutil/attributes.h"

30 #include "libavutil/cpu.h"

31 #include "yuv2rgb_altivec.h"

32 #include "libavutil/ppc/util_altivec.h"

34 #if HAVE_ALTIVEC

35 #if HAVE_BIGENDIAN

/* All-zero vector of 32-bit integers. */
#define vzero vec_splat_s32(0)

/*
 * Fetch the next 16-byte chunk of s (byte offset ((b) << 1) + 16) and
 * combine it with the previously loaded chunk a through the permute
 * vector c. The result is written to a variable named ls that must
 * exist in the expanding scope; a is then advanced to the new chunk.
 */
#define GET_LS(a,b,c,s) {\
        vector signed short ls_next_chunk = vec_ld(((b) << 1) + 16, s);\
        ls = vec_perm(a, ls_next_chunk, c);\
        a  = ls_next_chunk;\
    }

/*
 * Multiply eight 16-bit source samples by the filter vector and add the
 * 32-bit products to the accumulators d1 and d2.
 * vec_mule/vec_mulo yield the products of the even/odd lanes; merging
 * them high/low interleaves the two product vectors again.
 */
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
        vector signed short ls;\
        vector signed int   prod_even, prod_odd;\
        GET_LS(l1, x, perm, src);\
        prod_even = vec_mule(filter, ls);\
        prod_odd  = vec_mulo(filter, ls);\
        d1 = vec_add(d1, vec_mergeh(prod_even, prod_odd));\
        d2 = vec_add(d2, vec_mergel(prod_even, prod_odd));\
    } while (0)

/*
 * Load the 16-byte chunk of f that contains byte offset joffset and
 * rotate it with the vec_lvsl() permute so that the addressed element
 * comes first. joffset is taken from the expanding scope.
 */
#define LOAD_FILTER(vf,f) {\
        vector unsigned char filter_rot = vec_lvsl(joffset, f);\
        vf = vec_ld(joffset, f);\
        vf = vec_perm(vf, vf, filter_rot);\
    }

/*
 * Load the first chunk of s at byte offset xoffset into ll1 and store
 * the matching vec_lvsl() permute vector in p. xoffset is taken from
 * the expanding scope.
 */
#define LOAD_L1(ll1,s,p){\
        ll1 = vec_ld(xoffset, s);\
        p   = vec_lvsl(xoffset, s);\
    }

// The 3 in "(a) << 3" below is 2 (filterSize == 4) + 1 (sizeof(short) == 2).

// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
//
// a  - filter index; the byte offset into f is (a) << 3
// vf - receives the selected coefficients interleaved with zeros
// f  - base pointer of the coefficient table
// The argument a is parenthesized so that any expression may be passed.
#define GET_VF4(a, vf, f) {\
    vf = vec_ld((a) << 3, f);\
    if (((a) << 3) % 16)\
        vf = vec_mergel(vf, (vector signed short)vzero);\
    else\
        vf = vec_mergeh(vf, (vector signed short)vzero);\
}

/*
 * Load the chunk of s containing byte offset pos into sv and store the
 * matching vec_lvsl() permute vector in per.
 */
#define FIRST_LOAD(sv, pos, s, per) {\
    sv = vec_ld(pos, s);\
    per = vec_lvsl(pos, s);\
}

/* Copy s0 into d0 and s1 into d1. */
#define UPDATE_PTR(s0, d0, s1, d1) {\
    d0 = s0;\
    d1 = s1;\
}

/*
 * Load the chunk following v0 (byte offset pos + a + 16) into v1 and
 * permute v0/v1 with per to form vf.
 */
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
    v1 = vec_ld((pos) + (a) + 16, s);\
    vf = vec_perm(v0, v1, per);\
}

/*
 * Variant of LOAD_SRCV that loads the second chunk only when the
 * address s + pos lies more than 8 bytes into its 16-byte chunk;
 * otherwise v1 is used as it is.
 * The permute uses the v1 argument (not a caller variable of a fixed
 * name), so the macro works whatever the caller's variable is called.
 */
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
    if ((((uintptr_t)(s) + (pos)) % 16) > 8) {\
        v1 = vec_ld((pos) + (a) + 16, s);\
    }\
    vf = vec_perm(v0, v1, per);\
}

/*
 * Load the filter chunk following vf0 into vf1 and permute vf0/vf1
 * with per to form vf. The byte offset is built from the filter index
 * a, the tap index b and the extra offset off; filterSize is taken
 * from the expanding scope.
 */
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
    vf1 = vec_ld(((a) * 2 * filterSize) + ((b) * 2) + 16 + (off), f);\
    vf  = vec_perm(vf0, vf1, per);\
}

101

102 #define FUNC(name) name ## _altivec

103 #include "swscale_ppc_template.c"

104 #undef FUNC

105

106 #undef vzero

107

108 #endif /* HAVE_BIGENDIAN */

109

/*
 * Clip (val >> shift) to a 16-bit range, add bias and store the result
 * as a 16-bit value at pos, big-endian when big_endian is non-zero and
 * little-endian otherwise.
 *
 * signedness selects the clip helper (av_clip_<signedness>16).
 * big_endian and shift are read from the expanding scope.
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement, including inside an unbraced if/else.
 */
#define output_pixel(pos, val, bias, signedness) \
    do { \
        if (big_endian) { \
            AV_WB16(pos, (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } else { \
            AV_WL16(pos, (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } \
    } while (0)

116

117 static void

118 yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start)

119 {

120 static const int big_endian = HAVE_BIGENDIAN;

121 static const int shift = 3;

122 static const float float_mult = 1.0f / 65535.0f;

123 int i, val;

124 uint16_t val_uint;

125

126 for (i = start; i < dstW; ++i){

127 val = src[i] + (1 << (shift - 1));

128 output_pixel(&val_uint, val, 0, uint);

129 dest[i] = float_mult * (float)val_uint;

130 }

131 }

132

133 static void

134 yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int start)

135 {

136 static const int big_endian = HAVE_BIGENDIAN;

137 static const int shift = 3;

138 static const float float_mult = 1.0f / 65535.0f;

139 int i, val;

140 uint16_t val_uint;

141

142 for (i = start; i < dstW; ++i){

143 val = src[i] + (1 << (shift - 1));

144 output_pixel(&val_uint, val, 0, uint);

145 dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint));

146 }

147 }

148

149 static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW)

150 {

151 const int dst_u = -(uintptr_t)dest & 3;

152 const int shift = 3;

153 const int add = (1 << (shift - 1));

154 const int clip = (1 << 16) - 1;

155 const float fmult = 1.0f / 65535.0f;

156 const vec_u32 vadd = (vec_u32) {add, add, add, add};

157 const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);

158 const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};

159 const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};

160 const vec_f vzero = (vec_f) {0, 0, 0, 0};

161 vec_u32 v;

162 vec_f vd;

163 int i;

164

165 yuv2plane1_float_u(src, dest, dst_u, 0);

166

167 for (i = dst_u; i < dstW - 3; i += 4) {

168 v = vec_ld(0, (const uint32_t *) &src[i]);

169 v = vec_add(v, vadd);

170 v = vec_sr(v, vshift);

171 v = vec_min(v, vlargest);

172

173 vd = vec_ctf(v, 0);

174 vd = vec_madd(vd, vmul, vzero);

175

176 vec_st(vd, 0, &dest[i]);

177 }

178

179 yuv2plane1_float_u(src, dest, dstW, i);

180 }

181

182 static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, int dstW)

183 {

184 const int dst_u = -(uintptr_t)dest & 3;

185 const int shift = 3;

186 const int add = (1 << (shift - 1));

187 const int clip = (1 << 16) - 1;

188 const float fmult = 1.0f / 65535.0f;

189 const vec_u32 vadd = (vec_u32) {add, add, add, add};

190 const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);

191 const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};

192 const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};

193 const vec_f vzero = (vec_f) {0, 0, 0, 0};

194 const vec_u32 vswapbig = (vec_u32) {16, 16, 16, 16};

195 const vec_u16 vswapsmall = vec_splat_u16(8);

196 vec_u32 v;

197 vec_f vd;

198 int i;

199

200 yuv2plane1_float_bswap_u(src, dest, dst_u, 0);

201

202 for (i = dst_u; i < dstW - 3; i += 4) {

203 v = vec_ld(0, (const uint32_t *) &src[i]);

204 v = vec_add(v, vadd);

205 v = vec_sr(v, vshift);

206 v = vec_min(v, vlargest);

207

208 vd = vec_ctf(v, 0);

209 vd = vec_madd(vd, vmul, vzero);

210

211 vd = (vec_f) vec_rl((vec_u32) vd, vswapbig);

212 vd = (vec_f) vec_rl((vec_u16) vd, vswapsmall);

213

214 vec_st(vd, 0, (float *) &dest[i]);

215 }

216

217 yuv2plane1_float_bswap_u(src, dest, dstW, i);

218 }

219

/*
 * Generate the entry point yuv2plane1_float<endian>_altivec.
 * The generated function casts src to const int32_t * and dest to
 * out_type * and forwards them with dstW to impl; dither and offset
 * are accepted but not used.
 */
#define yuv2plane1_float(impl, out_type, endian) \
static void yuv2plane1_float ## endian ## _altivec(const int16_t *src, uint8_t *dest, \
                                                   int dstW, \
                                                   const uint8_t *dither, int offset) \
{ \
    impl((const int32_t *)src, (out_type *)dest, dstW); \
}

/*
 * The native-order writer serves the format matching the host byte
 * order; the byte-swapping writer serves the opposite one.
 */
#if HAVE_BIGENDIAN
yuv2plane1_float(yuv2plane1_float_altivec,       float,    BE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, LE)
#else
yuv2plane1_float(yuv2plane1_float_altivec,       float,    LE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, BE)
#endif

235

236 #endif /* HAVE_ALTIVEC */

237

238 av_cold void ff_sws_init_swscale_ppc(SwsContext *c)

239 {

240 #if HAVE_ALTIVEC

241 enum AVPixelFormat dstFormat = c->dstFormat;

242

243 if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))

244 return;

245

246 #if HAVE_BIGENDIAN

247 if (c->srcBpc == 8 && c->dstBpc <= 14) {

248 c->hyScale = c->hcScale = hScale_real_altivec;

249 }

250 if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&

251 dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&

252 !c->needAlpha) {

253 c->yuv2planeX = yuv2planeX_altivec;

254 }

255 #endif

256

257 if (dstFormat == AV_PIX_FMT_GRAYF32BE) {

258 c->yuv2plane1 = yuv2plane1_floatBE_altivec;

259 } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {

260 c->yuv2plane1 = yuv2plane1_floatLE_altivec;

261 }

262

263 /* The following list of supported dstFormat values should

264 * match what's found in the body of ff_yuv2packedX_altivec() */

265 if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {

266 switch (c->dstFormat) {

267 case AV_PIX_FMT_ABGR:

268 c->yuv2packedX = ff_yuv2abgr_X_altivec;

269 break;

270 case AV_PIX_FMT_BGRA:

271 c->yuv2packedX = ff_yuv2bgra_X_altivec;

272 break;

273 case AV_PIX_FMT_ARGB:

274 c->yuv2packedX = ff_yuv2argb_X_altivec;

275 break;

276 case AV_PIX_FMT_RGBA:

277 c->yuv2packedX = ff_yuv2rgba_X_altivec;

278 break;

279 case AV_PIX_FMT_BGR24:

280 c->yuv2packedX = ff_yuv2bgr24_X_altivec;

281 break;

282 case AV_PIX_FMT_RGB24:

283 c->yuv2packedX = ff_yuv2rgb24_X_altivec;

284 break;

285 }

286 }

287 #endif /* HAVE_ALTIVEC */

288

289 ff_sws_init_swscale_vsx(c);

290 }

AVPixelFormat

Pixel format.

Definition: pixfmt.h:64

AV_PIX_FMT_BGR24

@ AV_PIX_FMT_BGR24

packed RGB 8:8:8, 24bpp, BGRBGR...

Definition: pixfmt.h:69

AV_PIX_FMT_BGRA

@ AV_PIX_FMT_BGRA

packed BGRA 8:8:8:8, 32bpp, BGRABGRA...

Definition: pixfmt.h:95

av_get_cpu_flags

int av_get_cpu_flags(void)

Return the flags which specify extensions supported by the CPU.

Definition: cpu.c:98

av_float2int

static av_always_inline uint32_t av_float2int(float f)

Reinterpret a float as a 32-bit integer.

Definition: intfloat.h:50

AV_PIX_FMT_GRAYF32LE

@ AV_PIX_FMT_GRAYF32LE

IEEE-754 single precision Y, 32bpp, little-endian.

Definition: pixfmt.h:331

is16BPS

static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)

Definition: swscale_internal.h:696

SWS_BITEXACT

#define SWS_BITEXACT

Definition: swscale.h:85

val

static double val(void *priv, double ch)

Definition: aeval.c:76

isNBPS

static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)

Definition: swscale_internal.h:710

av_bswap32

#define av_bswap32

Definition: bswap.h:33

av_cold

#define av_cold

Definition: attributes.h:90

AV_PIX_FMT_RGBA

@ AV_PIX_FMT_RGBA

packed RGBA 8:8:8:8, 32bpp, RGBARGBA...

Definition: pixfmt.h:93

isSemiPlanarYUV

static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)

Definition: swscale_internal.h:742

yuv2plane1_float

yuv2plane1_float(yuv2plane1_float_c_template, yuv2plane1_float(float, LE)

Definition: output.c:306

src

#define src

Definition: vp8dsp.c:255

AV_CPU_FLAG_ALTIVEC

#define AV_CPU_FLAG_ALTIVEC

standard

Definition: cpu.h:59

AV_PIX_FMT_ABGR

@ AV_PIX_FMT_ABGR

packed ABGR 8:8:8:8, 32bpp, ABGRABGR...

Definition: pixfmt.h:94

Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can and has in some cases led to effects beyond the output of computations. The signed integer overflow problem in speed critical code: Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c[...]

Definition: undefined.txt:32

AV_PIX_FMT_RGB24

@ AV_PIX_FMT_RGB24

packed RGB 8:8:8, 24bpp, RGBRGB...

Definition: pixfmt.h:68

vec_u32

#define vec_u32

Definition: util_altivec.h:38

cpu.h

SWS_FULL_CHR_H_INT

#define SWS_FULL_CHR_H_INT

Definition: swscale.h:80