FFmpeg: tests/checkasm/vp9dsp.c Source File

FFmpeg

[フレーム]

vp9dsp.c

Go to the documentation of this file.

1 /*

3 *

4 * This file is part of FFmpeg.

5 *

6 * FFmpeg is free software; you can redistribute it and/or modify

7 * it under the terms of the GNU General Public License as published by

8 * the Free Software Foundation; either version 2 of the License, or

9 * (at your option) any later version.

10 *

11 * FFmpeg is distributed in the hope that it will be useful,

12 * but WITHOUT ANY WARRANTY; without even the implied warranty of

13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14 * GNU General Public License for more details.

15 *

16 * You should have received a copy of the GNU General Public License along

17 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,

18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

19 */

21 #include <math.h>

22 #include <string.h>

23 #include "checkasm.h"

24 #include "libavcodec/vp9data.h"

25 #include "libavcodec/vp9.h"

26 #include "libavutil/common.h"

27 #include "libavutil/internal.h"

28 #include "libavutil/intreadwrite.h"

29 #include "libavutil/mathematics.h"

30 #include "libavutil/mem_internal.h"

/* Masks applied to every random 32-bit word so that the pixels packed in it
 * stay in range; indexed by (bit_depth - 8) >> 1, i.e. 8, 10 and 12 bits. */
static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };

/* Bytes per pixel; reads a variable named bit_depth from the expanding scope. */
#define SIZEOF_PIXEL ((bit_depth + 7) / 8)

/* Fill the two intra-prediction edge buffers with masked random words.
 * Reads a, l, size and bit_depth from the expanding scope: a is written from
 * byte offset -4 up to SIZEOF_PIXEL * FFMAX(8, size), l from byte offset 0 up
 * to size * SIZEOF_PIXEL. */
#define randomize_buffers()                                                   \
    do {                                                                      \
        const uint32_t px_mask = pixel_mask[(bit_depth - 8) >> 1];            \
        int off;                                                              \
        for (off = -4; off < SIZEOF_PIXEL * FFMAX(8, size); off += 4) {       \
            uint32_t word = rnd() & px_mask;                                  \
            AV_WN32A(a + off, word);                                          \
        }                                                                     \
        for (off = 0; off < size * SIZEOF_PIXEL; off += 4) {                  \
            uint32_t word = rnd() & px_mask;                                  \
            AV_WN32A(l + off, word);                                          \
        }                                                                     \
    } while (0)

49 static void check_ipred(void)

50 {

51 LOCAL_ALIGNED_32(uint8_t, a_buf, [64 * 2]);

52 uint8_t *a = &a_buf[32 * 2];

53 LOCAL_ALIGNED_32(uint8_t, l, [32 * 2]);

54 LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);

55 LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);

56 VP9DSPContext dsp;

57 int tx, mode, bit_depth;

58 declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride,

59 const uint8_t *left, const uint8_t *top);

60 static const char *const mode_names[N_INTRA_PRED_MODES] = {

61 [VERT_PRED] = "vert",

62 [HOR_PRED] = "hor",

63 [DC_PRED] = "dc",

64 [DIAG_DOWN_LEFT_PRED] = "diag_downleft",

65 [DIAG_DOWN_RIGHT_PRED] = "diag_downright",

66 [VERT_RIGHT_PRED] = "vert_right",

67 [HOR_DOWN_PRED] = "hor_down",

68 [VERT_LEFT_PRED] = "vert_left",

69 [HOR_UP_PRED] = "hor_up",

70 [TM_VP8_PRED] = "tm",

71 [LEFT_DC_PRED] = "dc_left",

72 [TOP_DC_PRED] = "dc_top",

73 [DC_128_PRED] = "dc_128",

74 [DC_127_PRED] = "dc_127",

75 [DC_129_PRED] = "dc_129",

76 };

78 for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {

79 ff_vp9dsp_init(&dsp, bit_depth, 0);

80 for (tx = 0; tx < 4; tx++) {

81 int size = 4 << tx;

83 for (mode = 0; mode < N_INTRA_PRED_MODES; mode++) {

84 if (check_func(dsp.intra_pred[tx][mode], "vp9_%s_%dx%d_%dbpp",

85 mode_names[mode], size, size, bit_depth)) {

86 randomize_buffers();

87 call_ref(dst0, size * SIZEOF_PIXEL, l, a);

88 call_new(dst1, size * SIZEOF_PIXEL, l, a);

89 if (memcmp(dst0, dst1, size * size * SIZEOF_PIXEL))

90 fail();

91 bench_new(dst1, size * SIZEOF_PIXEL,l, a);

92 }

93 }

94 }

95 }

96 report("ipred");

97 }

#undef randomize_buffers

/* Fill dst and src (sz x sz pixels each, tightly packed) with masked random
 * words and store the per-pixel difference src - dst in coef: as int16_t
 * entries at 8 bits per pixel, as int32_t entries otherwise.
 * Reads y, x, sz, bit_depth, dst, src and coef from the expanding scope. */
#define randomize_buffers()                                                   \
    do {                                                                      \
        const uint32_t px_mask = pixel_mask[(bit_depth - 8) >> 1];            \
        for (y = 0; y < sz; y++) {                                            \
            for (x = 0; x < sz * SIZEOF_PIXEL; x += 4) {                      \
                uint32_t dst_word = rnd() & px_mask;                          \
                uint32_t src_word;                                            \
                AV_WN32A(dst + y * sz * SIZEOF_PIXEL + x, dst_word);          \
                src_word = rnd() & px_mask;                                   \
                AV_WN32A(src + y * sz * SIZEOF_PIXEL + x, src_word);          \
            }                                                                 \
            for (x = 0; x < sz; x++) {                                        \
                if (bit_depth != 8) {                                         \
                    ((int32_t *) coef)[y * sz + x] =                          \
                        ((uint16_t *) src)[y * sz + x] -                      \
                        ((uint16_t *) dst)[y * sz + x];                       \
                } else {                                                      \
                    coef[y * sz + x] = src[y * sz + x] - dst[y * sz + x];     \
                }                                                             \
            }                                                                 \
        }                                                                     \
    } while(0)

121

122 // wht function copied from libvpx

123 static void fwht_1d(double *out, const double *in, int sz)

124 {

125 double t0 = in[0] + in[1];

126 double t3 = in[3] - in[2];

127 double t4 = trunc((t0 - t3) * 0.5);

128 double t1 = t4 - in[1];

129 double t2 = t4 - in[2];

130

131 out[0] = t0 - t2;

132 out[1] = t2;

133 out[2] = t3 + t1;

134 out[3] = t1;

135 }

136

137 // standard DCT-II

138 static void fdct_1d(double *out, const double *in, int sz)

139 {

140 int k, n;

141

142 for (k = 0; k < sz; k++) {

143 out[k] = 0.0;

144 for (n = 0; n < sz; n++)

145 out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (sz * 2.0));

146 }

147 out[0] *= M_SQRT1_2;

148 }

149

150 // see "Towards jointly optimal spatial prediction and adaptive transform in

151 // video/image coding", by J. Han, A. Saxena, and K. Rose

152 // IEEE Proc. ICASSP, pp. 726-729, Mar. 2010.

153 static void fadst4_1d(double *out, const double *in, int sz)

154 {

155 int k, n;

156

157 for (k = 0; k < sz; k++) {

158 out[k] = 0.0;

159 for (n = 0; n < sz; n++)

160 out[k] += in[n] * sin(M_PI * (n + 1) * (2 * k + 1) / (sz * 2.0 + 1.0));

161 }

162 }

163

164 // see "A Butterfly Structured Design of The Hybrid Transform Coding Scheme",

165 // by Jingning Han, Yaowu Xu, and Debargha Mukherjee

166 // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41418.pdf

167 static void fadst_1d(double *out, const double *in, int sz)

168 {

169 int k, n;

170

171 for (k = 0; k < sz; k++) {

172 out[k] = 0.0;

173 for (n = 0; n < sz; n++)

174 out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (sz * 4.0));

175 }

176 }

177

178 typedef void (*ftx1d_fn)(double *out, const double *in, int sz);

179 static void ftx_2d(double *out, const double *in, enum TxfmMode tx,

180 enum TxfmType txtp, int sz)

181 {

182 static const double scaling_factors[5][4] = {

183 { 4.0, 16.0 * M_SQRT1_2 / 3.0, 16.0 * M_SQRT1_2 / 3.0, 32.0 / 9.0 },

184 { 2.0, 2.0, 2.0, 2.0 },

185 { 1.0, 1.0, 1.0, 1.0 },

186 { 0.25 },

187 { 4.0 }

188 };

189 static const ftx1d_fn ftx1d_tbl[5][4][2] = {

190 {

191 { fdct_1d, fdct_1d },

192 { fadst4_1d, fdct_1d },

193 { fdct_1d, fadst4_1d },

194 { fadst4_1d, fadst4_1d },

195 }, {

196 { fdct_1d, fdct_1d },

197 { fadst_1d, fdct_1d },

198 { fdct_1d, fadst_1d },

199 { fadst_1d, fadst_1d },

200 }, {

201 { fdct_1d, fdct_1d },

202 { fadst_1d, fdct_1d },

203 { fdct_1d, fadst_1d },

204 { fadst_1d, fadst_1d },

205 }, {

206 { fdct_1d, fdct_1d },

207 }, {

208 { fwht_1d, fwht_1d },

209 },

210 };

211 double temp[1024];

212 double scaling_factor = scaling_factors[tx][txtp];

213 int i, j;

214

215 // cols

216 for (i = 0; i < sz; ++i) {

217 double temp_out[32];

218

219 ftx1d_tbl[tx][txtp][0](temp_out, &in[i * sz], sz);

220 // scale and transpose

221 for (j = 0; j < sz; ++j)

222 temp[j * sz + i] = temp_out[j] * scaling_factor;

223 }

224

225 // rows

226 for (i = 0; i < sz; i++)

227 ftx1d_tbl[tx][txtp][1](&out[i * sz], &temp[i * sz], sz);

228 }

229

230 static void ftx(int16_t *buf, enum TxfmMode tx,

231 enum TxfmType txtp, int sz, int bit_depth)

232 {

233 double ind[1024], outd[1024];

234 int n;

235

236 emms_c();

237 for (n = 0; n < sz * sz; n++) {

238 if (bit_depth == 8)

239 ind[n] = buf[n];

240 else

241 ind[n] = ((int32_t *) buf)[n];

242 }

243 ftx_2d(outd, ind, tx, txtp, sz);

244 for (n = 0; n < sz * sz; n++) {

245 if (bit_depth == 8)

246 buf[n] = lrint(outd[n]);

247 else

248 ((int32_t *) buf)[n] = lrint(outd[n]);

249 }

250 }

251

252 static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx,

253 enum TxfmType txtp, int sz, int sub, int bit_depth)

254 {

255 // copy the topleft coefficients such that the return value (being the

256 // coefficient scantable index for the eob token) guarantees that only

257 // the topleft $sub out of $sz (where $sz >= $sub) coefficients in both

258 // dimensions are non-zero. This leads to braching to specific optimized

259 // simd versions (e.g. dc-only) so that we get full asm coverage in this

260 // test

261

262 int n;

263 const int16_t *scan = ff_vp9_scans[tx][txtp];

264 int eob;

265

266 for (n = 0; n < sz * sz; n++) {

267 int rc = scan[n], rcx = rc % sz, rcy = rc / sz;

268

269 // find eob for this sub-idct

270 if (rcx >= sub || rcy >= sub)

271 break;

272

273 // copy coef

274 if (bit_depth == 8) {

275 out[rc] = in[rc];

276 } else {

277 AV_COPY32(&out[rc * 2], &in[rc * 2]);

278 }

279 }

280

281 eob = n;

282

283 for (; n < sz * sz; n++) {

284 int rc = scan[n];

285

286 // zero

287 if (bit_depth == 8) {

288 out[rc] = 0;

289 } else {

290 AV_ZERO32(&out[rc * 2]);

291 }

292 }

293

294 return eob;

295 }

296

/**
 * Check whether a coefficient buffer contains only zero bytes.
 *
 * @param c  start of the buffer
 * @param sz size of the buffer in bytes; values <= 0 are treated as empty
 * @return 1 if all sz bytes are zero (or the buffer is empty), 0 otherwise
 *
 * The buffer is scanned byte by byte, so exactly sz bytes are read whatever
 * the size or alignment; a size that is not a multiple of four bytes is not
 * rounded up, and a negative size does not turn into a huge unsigned bound.
 */
static int is_zero(const int16_t *c, int sz)
{
    /* character-typed access is valid for any underlying coefficient width */
    const unsigned char *bytes = (const unsigned char *) c;
    int n;

    for (n = 0; n < sz; n++)
        if (bytes[n])
            return 0;

    return 1;
}

307

308 #define SIZEOF_COEF (2 * ((bit_depth + 7) / 8))

309

310 static void check_itxfm(void)

311 {

312 LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]);

313 LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]);

314 LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);

315 LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);

316 LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]);

317 LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]);

318 LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]);

319 declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);

320 VP9DSPContext dsp;

321 int y, x, tx, txtp, bit_depth, sub;

322 static const char *const txtp_types[N_TXFM_TYPES] = {

323 [DCT_DCT] = "dct_dct", [DCT_ADST] = "adst_dct",

324 [ADST_DCT] = "dct_adst", [ADST_ADST] = "adst_adst"

325 };

326

327 for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {

328 ff_vp9dsp_init(&dsp, bit_depth, 0);

329

330 for (tx = TX_4X4; tx <= N_TXFM_SIZES /* 4 = lossless */; tx++) {

331 int sz = 4 << (tx & 3);

332 int n_txtps = tx < TX_32X32 ? N_TXFM_TYPES : 1;

333

334 for (txtp = 0; txtp < n_txtps; txtp++) {

335 // skip testing sub-IDCTs for WHT or ADST since they don't

336 // implement it in any of the SIMD functions. If they do,

337 // consider changing this to ensure we have complete test

338 // coverage. Test sub=1 for dc-only, then 2, 4, 8, 12, etc,

339 // since the arm version can distinguish them at that level.

340 for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;

341 sub < 4 ? (sub <<= 1) : (sub += 4)) {

342 if (check_func(dsp.itxfm_add[tx][txtp],

343 "vp9_inv_%s_%dx%d_sub%d_add_%d",

344 tx == 4 ? "wht_wht" : txtp_types[txtp],

345 sz, sz, sub, bit_depth)) {

346 int eob;

347

348 randomize_buffers();

349 ftx(coef, tx, txtp, sz, bit_depth);

350

351 if (sub < sz) {

352 eob = copy_subcoefs(subcoef0, coef, tx, txtp,

353 sz, sub, bit_depth);

354 } else {

355 eob = sz * sz;

356 memcpy(subcoef0, coef, sz * sz * SIZEOF_COEF);

357 }

358

359 memcpy(dst0, dst, sz * sz * SIZEOF_PIXEL);

360 memcpy(dst1, dst, sz * sz * SIZEOF_PIXEL);

361 memcpy(subcoef1, subcoef0, sz * sz * SIZEOF_COEF);

362 call_ref(dst0, sz * SIZEOF_PIXEL, subcoef0, eob);

363 call_new(dst1, sz * SIZEOF_PIXEL, subcoef1, eob);

364 if (memcmp(dst0, dst1, sz * sz * SIZEOF_PIXEL) ||

365 !is_zero(subcoef0, sz * sz * SIZEOF_COEF) ||

366 !is_zero(subcoef1, sz * sz * SIZEOF_COEF))

367 fail();

368

369 bench_new(dst, sz * SIZEOF_PIXEL, coef, eob);

370 }

371 }

372 }

373 }

374 }

375 report("itxfm");

376 }

377

#undef randomize_buffers

/* Store val, clipped to the pixel range, at element (pos) + (line) * jstride
 * of buf0; buf0 is accessed as uint8_t when SIZEOF_PIXEL is 1 and as uint16_t
 * otherwise. Reads buf0, jstride and bit_depth from the expanding scope. */
#define setpx(pos, line, val)                                                 \
    do {                                                                      \
        if (SIZEOF_PIXEL == 1) {                                              \
            buf0[(pos) + (line) * jstride] = av_clip_uint8(val);              \
        } else {                                                              \
            ((uint16_t *)buf0)[(pos) + (line) * jstride] =                    \
                av_clip_uintp2(val, bit_depth);                               \
        }                                                                     \
    } while (0)

// val can be an assignment and must not be put under ()
/* setdx stores val plus a random offset in [-delta, +delta]. */
#define setdx(pos, line, val, delta)                                          \
    setpx(pos, line, val-(delta)+(rnd()%((delta)*2+1)))
/* setsx is setdx with delta scaled up by (bit_depth - 8) bits. */
#define setsx(pos, line, val, delta)                                          \
    setdx(pos, line, val, (delta) << (bit_depth - 8))

/* Fill eight consecutive lines of buf0 across a filter edge with pixel
 * patterns labelled flat16, flat8, regular and off (two lines each).
 * Taps 0..7 lie on the q side of the edge and taps -1..-8 on the p side.
 * Line spacing is 1 element when dir is set and 16 otherwise; tap spacing is
 * str when dir is set and 1 otherwise. Deviations between neighbouring taps
 * are bounded using E[bidx] >> 2, F[bidx] and I[bidx]. H and buf1 are accepted
 * but not used here. */
static void randomize_loopfilter_buffers(int bidx, int lineoff, int str,
                                         int bit_depth, int dir, const int *E,
                                         const int *F, const int *H, const int *I,
                                         uint8_t *buf0, uint8_t *buf1)
{
    const uint32_t px_max = (1 << bit_depth) - 1;
    const int off     = dir ? lineoff : lineoff * 16;
    const int istride = dir ? 1 : 16;
    const int jstride = dir ? str : 1;
    const int edge    = E[bidx] >> 2;
    const int flat    = F[bidx];
    const int inner   = I[bidx];
    int line, tap;

    for (line = 0; line < 2; line++) /* flat16 */ {
        int pos = off + line * istride, p0, q0;

        setpx(pos, 0, q0 = rnd() & px_max);
        setsx(pos, -1, p0 = q0, edge);
        for (tap = 1; tap < 8; tap++) {
            setsx(pos, -1 - tap, p0, flat);
            setsx(pos, tap, q0, flat);
        }
    }

    for (line = 2; line < 4; line++) /* flat8 */ {
        int pos = off + line * istride, p0, q0;

        setpx(pos, 0, q0 = rnd() & px_max);
        setsx(pos, -1, p0 = q0, edge);
        for (tap = 1; tap < 4; tap++) {
            setsx(pos, -1 - tap, p0, flat);
            setsx(pos, tap, q0, flat);
        }
        /* outer four taps on each side are unconstrained */
        for (tap = 4; tap < 8; tap++) {
            setpx(pos, -1 - tap, rnd() & px_max);
            setpx(pos, tap, rnd() & px_max);
        }
    }

    for (line = 4; line < 6; line++) /* regular */ {
        int pos = off + line * istride, p2, p1, p0, q0, q1, q2;

        setpx(pos, 0, q0 = rnd() & px_max);
        setsx(pos, 1, q1 = q0, inner);
        setsx(pos, 2, q2 = q1, inner);
        setsx(pos, 3, q2, inner);
        setsx(pos, -1, p0 = q0, edge);
        setsx(pos, -2, p1 = p0, inner);
        setsx(pos, -3, p2 = p1, inner);
        setsx(pos, -4, p2, inner);
        for (tap = 4; tap < 8; tap++) {
            setpx(pos, -1 - tap, rnd() & px_max);
            setpx(pos, tap, rnd() & px_max);
        }
    }

    for (line = 6; line < 8; line++) /* off */ {
        int pos = off + line * istride;

        for (tap = 0; tap < 8; tap++) {
            setpx(pos, -1 - tap, rnd() & px_max);
            setpx(pos, tap, rnd() & px_max);
        }
    }
}

/* Shorthand that forwards bit_depth, dir, E, F, H, I, buf0 and buf1 from the
 * expanding scope. */
#define randomize_buffers(bidx, lineoff, str)                                 \
    randomize_loopfilter_buffers(bidx, lineoff, str, bit_depth, dir,          \
                                 E, F, H, I, buf0, buf1)

450

451 static void check_loopfilter(void)

452 {

453 LOCAL_ALIGNED_32(uint8_t, base0, [32 + 16 * 16 * 2]);

454 LOCAL_ALIGNED_32(uint8_t, base1, [32 + 16 * 16 * 2]);

455 VP9DSPContext dsp;

456 int dir, wd, wd2, bit_depth;

457 static const char *const dir_name[2] = { "h", "v" };

458 static const int E[2] = { 20, 28 }, I[2] = { 10, 16 };

459 static const int H[2] = { 7, 11 }, F[2] = { 1, 1 };

460 declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int E, int I, int H);

461

462 for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {

463 ff_vp9dsp_init(&dsp, bit_depth, 0);

464

465 for (dir = 0; dir < 2; dir++) {

466 int midoff = (dir ? 8 * 8 : 8) * SIZEOF_PIXEL;

467 int midoff_aligned = (dir ? 8 * 8 : 16) * SIZEOF_PIXEL;

468 uint8_t *buf0 = base0 + midoff_aligned;

469 uint8_t *buf1 = base1 + midoff_aligned;

470

471 for (wd = 0; wd < 3; wd++) {

472 // 4/8/16wd_8px

473 if (check_func(dsp.loop_filter_8[wd][dir],

474 "vp9_loop_filter_%s_%d_8_%dbpp",

475 dir_name[dir], 4 << wd, bit_depth)) {

476 randomize_buffers(0, 0, 8);

477 memcpy(buf1 - midoff, buf0 - midoff,

478 16 * 8 * SIZEOF_PIXEL);

479 call_ref(buf0, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);

480 call_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);

481 if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 8 * SIZEOF_PIXEL))

482 fail();

483 bench_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);

484 }

485 }

486

487 midoff = (dir ? 16 * 8 : 8) * SIZEOF_PIXEL;

488 midoff_aligned = (dir ? 16 * 8 : 16) * SIZEOF_PIXEL;

489

490 buf0 = base0 + midoff_aligned;

491 buf1 = base1 + midoff_aligned;

492

493 // 16wd_16px loopfilter

494 if (check_func(dsp.loop_filter_16[dir],

495 "vp9_loop_filter_%s_16_16_%dbpp",

496 dir_name[dir], bit_depth)) {

497 randomize_buffers(0, 0, 16);

498 randomize_buffers(0, 8, 16);

499 memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);

500 call_ref(buf0, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);

501 call_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);

502 if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))

503 fail();

504 bench_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);

505 }

506

507 for (wd = 0; wd < 2; wd++) {

508 for (wd2 = 0; wd2 < 2; wd2++) {

509 // mix2 loopfilter

510 if (check_func(dsp.loop_filter_mix2[wd][wd2][dir],

511 "vp9_loop_filter_mix2_%s_%d%d_16_%dbpp",

512 dir_name[dir], 4 << wd, 4 << wd2, bit_depth)) {

513 randomize_buffers(0, 0, 16);

514 randomize_buffers(1, 8, 16);

515 memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);

516 #define M(a) (((a)[1] << 8) | (a)[0])

517 call_ref(buf0, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));

518 call_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));

519 if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))

520 fail();

521 bench_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));

522 #undef M

523 }

524 }

525 }

526 }

527 }

528 report("loopfilter");

529 }

530

#undef setsx
#undef setpx
#undef setdx
#undef randomize_buffers

/* Buffer geometry for the motion-compensation test; these read size, buf and
 * bit_depth from the expanding scope. */
#define DST_BUF_SIZE (size * size * SIZEOF_PIXEL)
#define SRC_BUF_STRIDE 72
#define SRC_BUF_SIZE ((size + 7) * SRC_BUF_STRIDE * SIZEOF_PIXEL)
/* source pointer handed to the functions: 3 rows and 3 pixels into buf */
#define src (buf + 3 * SIZEOF_PIXEL * (SRC_BUF_STRIDE + 1))

/* Fill the first SRC_BUF_SIZE bytes of buf with masked random words; when op
 * is 1, additionally fill dst0 and dst1 with identical random content. */
#define randomize_buffers()                                                   \
    do {                                                                      \
        const uint32_t px_mask = pixel_mask[(bit_depth - 8) >> 1];            \
        int pos;                                                              \
        for (pos = 0; pos < SRC_BUF_SIZE; pos += 4) {                         \
            uint32_t word = rnd() & px_mask;                                  \
            AV_WN32A(buf + pos, word);                                        \
        }                                                                     \
        if (op == 1) {                                                        \
            for (pos = 0; pos < DST_BUF_SIZE; pos += 4) {                     \
                uint32_t word = rnd() & px_mask;                              \
                AV_WN32A(dst0 + pos, word);                                   \
                AV_WN32A(dst1 + pos, word);                                   \
            }                                                                 \
        }                                                                     \
    } while (0)

557

558 static void check_mc(void)

559 {

560 LOCAL_ALIGNED_32(uint8_t, buf, [72 * 72 * 2]);

561 LOCAL_ALIGNED_32(uint8_t, dst0, [64 * 64 * 2]);

562 LOCAL_ALIGNED_32(uint8_t, dst1, [64 * 64 * 2]);

563 VP9DSPContext dsp;

564 int op, hsize, bit_depth, filter, dx, dy;

565 declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,

566 const uint8_t *ref, ptrdiff_t ref_stride,

567 int h, int mx, int my);

568 static const char *const filter_names[4] = {

569 "8tap_smooth", "8tap_regular", "8tap_sharp", "bilin"

570 };

571 static const char *const subpel_names[2][2] = { { "", "h" }, { "v", "hv" } };

572 static const char *const op_names[2] = { "put", "avg" };

573 char str[256];

574

575 for (op = 0; op < 2; op++) {

576 for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {

577 ff_vp9dsp_init(&dsp, bit_depth, 0);

578 for (hsize = 0; hsize < 5; hsize++) {

579 int size = 64 >> hsize;

580

581 for (filter = 0; filter < 4; filter++) {

582 for (dx = 0; dx < 2; dx++) {

583 for (dy = 0; dy < 2; dy++) {

584 if (dx || dy) {

585 snprintf(str, sizeof(str),

586 "%s_%s_%d%s", op_names[op],

587 filter_names[filter], size,

588 subpel_names[dy][dx]);

589 } else {

590 snprintf(str, sizeof(str),

591 "%s%d", op_names[op], size);

592 }

593 if (check_func(dsp.mc[hsize][filter][op][dx][dy],

594 "vp9_%s_%dbpp", str, bit_depth)) {

595 int mx = dx ? 1 + (rnd() % 14) : 0;

596 int my = dy ? 1 + (rnd() % 14) : 0;

597 randomize_buffers();

598 call_ref(dst0, size * SIZEOF_PIXEL,

599 src, SRC_BUF_STRIDE * SIZEOF_PIXEL,

600 size, mx, my);

601 call_new(dst1, size * SIZEOF_PIXEL,

602 src, SRC_BUF_STRIDE * SIZEOF_PIXEL,

603 size, mx, my);

604 if (memcmp(dst0, dst1, DST_BUF_SIZE))

605 fail();

606

607 // simd implementations for each filter of subpel

608 // functions are identical

609 if (filter >= 1 && filter <= 2) continue;

610 // 10/12 bpp for bilin are identical

611 if (bit_depth == 12 && filter == 3) continue;

612

613 bench_new(dst1, size * SIZEOF_PIXEL,

614 src, SRC_BUF_STRIDE * SIZEOF_PIXEL,

615 size, mx, my);

616 }

617 }

618 }

619 }

620 }

621 }

622 }

623 report("mc");

624 }

625

626 void checkasm_check_vp9dsp(void)

627 {

628 check_ipred();

629 check_itxfm();

630 check_loopfilter();

631 check_mc();

632 }

declare_func_emms

#define declare_func_emms(cpu_flags, ret,...)

Definition: checkasm.h:128

bit_depth

static void bit_depth(AudioStatsContext *s, uint64_t mask, uint64_t imask, AVRational *depth)

Definition: af_astats.c:226

static const uint8_t q1[256]

Definition: twofish.c:100

fwht_1d

static void fwht_1d(double *out, const double *in, int sz)

Definition: vp9dsp.c:123

setpx

#define setpx(a, b, c)

Definition: vp9dsp.c:380

mem_internal.h

DC_128_PRED

@ DC_128_PRED

Definition: vp9.h:58

out

FILE * out

Definition: movenc.c:54

N_TXFM_TYPES

@ N_TXFM_TYPES

Definition: vp9.h:42

sub

static float sub(float src0, float src1)

Definition: dnn_backend_native_layer_mathbinary.c:31

VP9DSPContext::loop_filter_8

void(* loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)

Definition: vp9dsp.h:80

TM_VP8_PRED

@ TM_VP8_PRED

Definition: vp9.h:55

DC_PRED

@ DC_PRED

Definition: vp9.h:48

VP9DSPContext

Definition: vp9dsp.h:39

check_func

#define check_func(func,...)

Definition: checkasm.h:122

#define t0

Definition: regdef.h:28

randomize_loopfilter_buffers

static void randomize_loopfilter_buffers(int bidx, int lineoff, int str, int bit_depth, int dir, const int *E, const int *F, const int *H, const int *I, uint8_t *buf0, uint8_t *buf1)

Definition: vp9dsp.c:392

VERT_LEFT_PRED

@ VERT_LEFT_PRED

Definition: vp9.h:53

fadst4_1d

static void fadst4_1d(double *out, const double *in, int sz)

Definition: vp9dsp.c:153

#define F(x)

#define t1

Definition: regdef.h:29

mathematics.h

filter

filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter

Definition: filter_design.txt:228

call_ref

#define call_ref(...)

Definition: checkasm.h:137

check_itxfm

static void check_itxfm(void)

Definition: vp9dsp.c:310

N_TXFM_SIZES

@ N_TXFM_SIZES

Definition: vp9.h:32

DC_127_PRED

@ DC_127_PRED

Definition: vp9.h:59

VP9DSPContext::loop_filter_mix2

void(* loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)

Definition: vp9dsp.h:102

fail

#define fail()

Definition: checkasm.h:131

VERT_PRED

@ VERT_PRED

Definition: vp9.h:46

trunc

static __device__ float trunc(float a)

Definition: cuda_runtime.h:179

SIZEOF_PIXEL

#define SIZEOF_PIXEL

Definition: vp9dsp.c:33

checkasm.h

DIAG_DOWN_RIGHT_PRED

@ DIAG_DOWN_RIGHT_PRED

Definition: vp9.h:50

copy_subcoefs

static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx, enum TxfmType txtp, int sz, int sub, int bit_depth)

Definition: vp9dsp.c:252

check_mc

static void check_mc(void)

Definition: vp9dsp.c:558

fadst_1d

static void fadst_1d(double *out, const double *in, int sz)

Definition: vp9dsp.c:167

setsx

#define setsx(a, b, c, d)

Definition: vp9dsp.c:391

lrint

#define lrint

Definition: tablegen.h:53

rnd

#define rnd()

Definition: checkasm.h:115

SIZEOF_COEF

#define SIZEOF_COEF

Definition: vp9dsp.c:308

HOR_PRED

@ HOR_PRED

Definition: vp9.h:47

mask

static const uint16_t mask[17]

Definition: lzw.c:38

intreadwrite.h

src

#define src

Definition: vp9dsp.c:539

AV_ZERO32

#define AV_ZERO32(d)

Definition: intreadwrite.h:629

static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)

Perform decode operation.

Definition: anm.c:76

ff_vp9_scans

const int16_t *const ff_vp9_scans[5][4]

Definition: vp9data.c:600

vp9data.h

SRC_BUF_STRIDE

#define SRC_BUF_STRIDE

Definition: vp9dsp.c:537

LEFT_DC_PRED

@ LEFT_DC_PRED

Definition: vp9.h:56

check_loopfilter

static void check_loopfilter(void)

Definition: vp9dsp.c:451

ff_vp9dsp_init

av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)

Definition: vp9dsp.c:88

ftx_2d

static void ftx_2d(double *out, const double *in, enum TxfmMode tx, enum TxfmType txtp, int sz)

Definition: vp9dsp.c:179

ftx

static void ftx(int16_t *buf, enum TxfmMode tx, enum TxfmType txtp, int sz, int bit_depth)

Definition: vp9dsp.c:230

static const uint8_t q0[256]

Definition: twofish.c:81

#define E

Definition: avdct.c:32

DCT_ADST

@ DCT_ADST

Definition: vp9.h:39

call_new

#define call_new(...)

Definition: checkasm.h:209

LOCAL_ALIGNED_32

#define LOCAL_ALIGNED_32(t, v,...)

Definition: mem_internal.h:136

#define M(a)

Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c

Definition: undefined.txt:32

VP9DSPContext::itxfm_add

void(* itxfm_add[N_TXFM_SIZES+1][N_TXFM_TYPES])(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob)

Definition: vp9dsp.h:70

VP9DSPContext::intra_pred

void(* intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)

Definition: vp9dsp.h:51

TxfmMode

Definition: vp9.h:27

vp9.h

DCT_DCT

@ DCT_DCT

Definition: vp9.h:38

TxfmType

Definition: vp9.h:37

pixel_mask

static const uint32_t pixel_mask[3]

Definition: vp9dsp.c:32

N_INTRA_PRED_MODES

@ N_INTRA_PRED_MODES

Definition: vp9.h:61

size

int size

Definition: twinvq_data.h:10344

VERT_RIGHT_PRED

@ VERT_RIGHT_PRED

Definition: vp9.h:51

VP9DSPContext::mc

vp9_mc_func mc[5][N_FILTERS][2][2][2]

Definition: vp9dsp.h:114

TX_4X4

@ TX_4X4

Definition: vp9.h:28

The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a

Definition: undefined.txt:41

#define H

Definition: pixlet.c:40

M_PI

#define M_PI

Definition: mathematics.h:52

ftx1d_fn

void(* ftx1d_fn)(double *out, const double *in, int sz)

Definition: vp9dsp.c:178

randomize_buffers

#define randomize_buffers()

Definition: vp9dsp.c:541

report

#define report

Definition: checkasm.h:134

is_zero

static int is_zero(const int16_t *c, int sz)

Definition: vp9dsp.c:297

bench_new

#define bench_new(...)

Definition: checkasm.h:272

#define i(width, name, range_min, range_max)

Definition: cbs_h2645.c:269

fdct_1d

static void fdct_1d(double *out, const double *in, int sz)

Definition: vp9dsp.c:138

DC_129_PRED

@ DC_129_PRED

Definition: vp9.h:60

internal.h

#define t4

Definition: regdef.h:32

#define t3

Definition: regdef.h:31

common.h

ADST_ADST

@ ADST_ADST

Definition: vp9.h:41

AV_COPY32

#define AV_COPY32(d, s)

Definition: intreadwrite.h:601

AV_RN32A

#define AV_RN32A(p)

Definition: intreadwrite.h:526

stride

#define stride

Definition: h264pred_template.c:537

left

Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left

Definition: snow.txt:386

M_SQRT1_2

#define M_SQRT1_2

Definition: mathematics.h:58

AV_CPU_FLAG_MMX

#define AV_CPU_FLAG_MMX

standard MMX

Definition: cpu.h:29

#define t2

Definition: regdef.h:30

HOR_UP_PRED

@ HOR_UP_PRED