FFmpeg: tests/checkasm/vp9dsp.c Source File

FFmpeg

[フレーム]

vp9dsp.c

Go to the documentation of this file.

1 /*

3 *

4 * This file is part of FFmpeg.

5 *

6 * FFmpeg is free software; you can redistribute it and/or modify

7 * it under the terms of the GNU General Public License as published by

8 * the Free Software Foundation; either version 2 of the License, or

9 * (at your option) any later version.

10 *

11 * FFmpeg is distributed in the hope that it will be useful,

12 * but WITHOUT ANY WARRANTY; without even the implied warranty of

13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

14 * GNU General Public License for more details.

15 *

16 * You should have received a copy of the GNU General Public License along

17 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,

18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

19 */

21 #include <math.h>

22 #include <string.h>

23 #include "checkasm.h"

24 #include "libavcodec/vp9data.h"

25 #include "libavcodec/vp9.h"

26 #include "libavutil/common.h"

27 #include "libavutil/emms.h"

28 #include "libavutil/internal.h"

29 #include "libavutil/intreadwrite.h"

30 #include "libavutil/mathematics.h"

31 #include "libavutil/mem_internal.h"

/* Masks ANDed onto each random 32-bit word before it is stored into a pixel
 * buffer. Indexed by (bit_depth - 8) >> 1, i.e. one entry each for 8, 10 and
 * 12 bits; the 10/12-bit entries keep 10 resp. 12 bits in each 16-bit half. */
33 static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };

/* Bytes per pixel for the `bit_depth` variable visible at the point of use:
 * 1 for 8 bits, 2 for 10 and 12 bits. */
34 #define SIZEOF_PIXEL ((bit_depth + 7) / 8)

/*
 * Fill the intra-prediction edge buffers with random pixels.
 *
 * Expands in a scope that provides `a` (top edge), `l` (left edge), `size`
 * and `bit_depth`. The top edge is written starting 4 bytes before `a` and
 * covers at least 8 pixels; the left edge covers `size` pixels. Values are
 * stored 32 bits at a time with aligned writes.
 */
#define randomize_buffers() \
    do { \
        const uint32_t depth_mask = pixel_mask[(bit_depth - 8) >> 1]; \
        int pos; \
        for (pos = -4; pos < SIZEOF_PIXEL * FFMAX(8, size); pos += 4) { \
            const uint32_t word = rnd() & depth_mask; \
            AV_WN32A(a + pos, word); \
        } \
        for (pos = 0; pos < size * SIZEOF_PIXEL; pos += 4) { \
            const uint32_t word = rnd() & depth_mask; \
            AV_WN32A(l + pos, word); \
        } \
    } while (0)

50 static void check_ipred(void)

51 {

52 LOCAL_ALIGNED_32(uint8_t, a_buf, [64 * 2]);

53 uint8_t *a = &a_buf[32 * 2];

54 LOCAL_ALIGNED_32(uint8_t, l, [32 * 2]);

55 LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);

56 LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);

57 VP9DSPContext dsp;

58 int tx, mode, bit_depth;

59 declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride,

60 const uint8_t *left, const uint8_t *top);

61 static const char *const mode_names[N_INTRA_PRED_MODES] = {

62 [VERT_PRED] = "vert",

63 [HOR_PRED] = "hor",

64 [DC_PRED] = "dc",

65 [DIAG_DOWN_LEFT_PRED] = "diag_downleft",

66 [DIAG_DOWN_RIGHT_PRED] = "diag_downright",

67 [VERT_RIGHT_PRED] = "vert_right",

68 [HOR_DOWN_PRED] = "hor_down",

69 [VERT_LEFT_PRED] = "vert_left",

70 [HOR_UP_PRED] = "hor_up",

71 [TM_VP8_PRED] = "tm",

72 [LEFT_DC_PRED] = "dc_left",

73 [TOP_DC_PRED] = "dc_top",

74 [DC_128_PRED] = "dc_128",

75 [DC_127_PRED] = "dc_127",

76 [DC_129_PRED] = "dc_129",

77 };

79 for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {

80 ff_vp9dsp_init(&dsp, bit_depth, 0);

81 for (tx = 0; tx < 4; tx++) {

82 int size = 4 << tx;

84 for (mode = 0; mode < N_INTRA_PRED_MODES; mode++) {

85 if (check_func(dsp.intra_pred[tx][mode], "vp9_%s_%dx%d_%dbpp",

86 mode_names[mode], size, size, bit_depth)) {

87 randomize_buffers();

88 call_ref(dst0, size * SIZEOF_PIXEL, l, a);

89 call_new(dst1, size * SIZEOF_PIXEL, l, a);

90 if (memcmp(dst0, dst1, size * size * SIZEOF_PIXEL))

91 fail();

92 bench_new(dst1, size * SIZEOF_PIXEL,l, a);

93 }

94 }

95 }

96 }

97 report("ipred");

98 }

100 #undef randomize_buffers

101

/*
 * Prepare inputs for the inverse-transform test.
 *
 * Expands in a scope that provides `src`, `dst`, `coef`, `sz`, `bit_depth`
 * and the loop counters `x` and `y`. Both sz x sz pixel blocks are filled
 * with random pixels, and `coef` receives the per-pixel residual src - dst:
 * 16-bit entries at 8 bits per pixel, 32-bit entries otherwise.
 */
#define randomize_buffers() \
    do { \
        const uint32_t depth_mask = pixel_mask[(bit_depth - 8) >> 1]; \
        for (y = 0; y < sz; y++) { \
            for (x = 0; x < sz * SIZEOF_PIXEL; x += 4) { \
                const uint32_t dst_word = rnd() & depth_mask; \
                AV_WN32A(dst + y * sz * SIZEOF_PIXEL + x, dst_word); \
                AV_WN32A(src + y * sz * SIZEOF_PIXEL + x, rnd() & depth_mask); \
            } \
            for (x = 0; x < sz; x++) { \
                if (bit_depth != 8) { \
                    ((int32_t *) coef)[y * sz + x] = \
                        ((uint16_t *) src)[y * sz + x] - \
                        ((uint16_t *) dst)[y * sz + x]; \
                } else { \
                    coef[y * sz + x] = src[y * sz + x] - dst[y * sz + x]; \
                } \
            } \
        } \
    } while(0)

122

123 // wht function copied from libvpx

124 static void fwht_1d(double *out, const double *in, int sz)

125 {

126 double t0 = in[0] + in[1];

127 double t3 = in[3] - in[2];

128 double t4 = trunc((t0 - t3) * 0.5);

129 double t1 = t4 - in[1];

130 double t2 = t4 - in[2];

131

132 out[0] = t0 - t2;

133 out[1] = t2;

134 out[2] = t3 + t1;

135 out[3] = t1;

136 }

137

138 // standard DCT-II

139 static void fdct_1d(double *out, const double *in, int sz)

140 {

141 int k, n;

142

143 for (k = 0; k < sz; k++) {

144 out[k] = 0.0;

145 for (n = 0; n < sz; n++)

146 out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (sz * 2.0));

147 }

148 out[0] *= M_SQRT1_2;

149 }

150

151 // see "Towards jointly optimal spatial prediction and adaptive transform in

152 // video/image coding", by J. Han, A. Saxena, and K. Rose

153 // IEEE Proc. ICASSP, pp. 726-729, Mar. 2010.

154 static void fadst4_1d(double *out, const double *in, int sz)

155 {

156 int k, n;

157

158 for (k = 0; k < sz; k++) {

159 out[k] = 0.0;

160 for (n = 0; n < sz; n++)

161 out[k] += in[n] * sin(M_PI * (n + 1) * (2 * k + 1) / (sz * 2.0 + 1.0));

162 }

163 }

164

165 // see "A Butterfly Structured Design of The Hybrid Transform Coding Scheme",

166 // by Jingning Han, Yaowu Xu, and Debargha Mukherjee

167 // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41418.pdf

168 static void fadst_1d(double *out, const double *in, int sz)

169 {

170 int k, n;

171

172 for (k = 0; k < sz; k++) {

173 out[k] = 0.0;

174 for (n = 0; n < sz; n++)

175 out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (sz * 4.0));

176 }

177 }

178

179 typedef void (*ftx1d_fn)(double *out, const double *in, int sz);

180 static void ftx_2d(double *out, const double *in, enum TxfmMode tx,

181 enum TxfmType txtp, int sz)

182 {

183 static const double scaling_factors[5][4] = {

184 { 4.0, 16.0 * M_SQRT1_2 / 3.0, 16.0 * M_SQRT1_2 / 3.0, 32.0 / 9.0 },

185 { 2.0, 2.0, 2.0, 2.0 },

186 { 1.0, 1.0, 1.0, 1.0 },

187 { 0.25 },

188 { 4.0 }

189 };

190 static const ftx1d_fn ftx1d_tbl[5][4][2] = {

191 {

192 { fdct_1d, fdct_1d },

193 { fadst4_1d, fdct_1d },

194 { fdct_1d, fadst4_1d },

195 { fadst4_1d, fadst4_1d },

196 }, {

197 { fdct_1d, fdct_1d },

198 { fadst_1d, fdct_1d },

199 { fdct_1d, fadst_1d },

200 { fadst_1d, fadst_1d },

201 }, {

202 { fdct_1d, fdct_1d },

203 { fadst_1d, fdct_1d },

204 { fdct_1d, fadst_1d },

205 { fadst_1d, fadst_1d },

206 }, {

207 { fdct_1d, fdct_1d },

208 }, {

209 { fwht_1d, fwht_1d },

210 },

211 };

212 double temp[1024];

213 double scaling_factor = scaling_factors[tx][txtp];

214 int i, j;

215

216 // cols

217 for (i = 0; i < sz; ++i) {

218 double temp_out[32];

219

220 ftx1d_tbl[tx][txtp][0](temp_out, &in[i * sz], sz);

221 // scale and transpose

222 for (j = 0; j < sz; ++j)

223 temp[j * sz + i] = temp_out[j] * scaling_factor;

224 }

225

226 // rows

227 for (i = 0; i < sz; i++)

228 ftx1d_tbl[tx][txtp][1](&out[i * sz], &temp[i * sz], sz);

229 }

230

231 static void ftx(int16_t *buf, enum TxfmMode tx,

232 enum TxfmType txtp, int sz, int bit_depth)

233 {

234 double ind[1024], outd[1024];

235 int n;

236

237 emms_c();

238 for (n = 0; n < sz * sz; n++) {

239 if (bit_depth == 8)

240 ind[n] = buf[n];

241 else

242 ind[n] = ((int32_t *) buf)[n];

243 }

244 ftx_2d(outd, ind, tx, txtp, sz);

245 for (n = 0; n < sz * sz; n++) {

246 if (bit_depth == 8)

247 buf[n] = lrint(outd[n]);

248 else

249 ((int32_t *) buf)[n] = lrint(outd[n]);

250 }

251 }

252

253 static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx,

254 enum TxfmType txtp, int sz, int sub, int bit_depth)

255 {

256 // copy the topleft coefficients such that the return value (being the

257 // coefficient scantable index for the eob token) guarantees that only

258 // the topleft $sub out of $sz (where $sz >= $sub) coefficients in both

259 // dimensions are non-zero. This leads to braching to specific optimized

260 // simd versions (e.g. dc-only) so that we get full asm coverage in this

261 // test

262

263 int n;

264 const int16_t *scan = ff_vp9_scans[tx][txtp];

265 int eob;

266

267 for (n = 0; n < sz * sz; n++) {

268 int rc = scan[n], rcx = rc % sz, rcy = rc / sz;

269

270 // find eob for this sub-idct

271 if (rcx >= sub || rcy >= sub)

272 break;

273

274 // copy coef

275 if (bit_depth == 8) {

276 out[rc] = in[rc];

277 } else {

278 AV_COPY32(&out[rc * 2], &in[rc * 2]);

279 }

280 }

281

282 eob = n;

283

284 for (; n < sz * sz; n++) {

285 int rc = scan[n];

286

287 // zero

288 if (bit_depth == 8) {

289 out[rc] = 0;

290 } else {

291 AV_ZERO32(&out[rc * 2]);

292 }

293 }

294

295 return eob;

296 }

297

// Return 1 if the first `sz` bytes of `c` are all zero, 0 otherwise.
// The buffer is scanned with aligned 32-bit reads, two int16_t at a time.
static int is_zero(const int16_t *c, int sz)
{
    const size_t n_coefs = sz / sizeof(int16_t);
    size_t pos = 0;

    while (pos < n_coefs) {
        if (AV_RN32A(&c[pos]))
            return 0;
        pos += 2;
    }

    return 1;
}

308

/* Bytes per transform coefficient for the `bit_depth` variable visible at the
 * point of use: 2 at 8 bits per pixel, 4 at 10 and 12 bits. */
309 #define SIZEOF_COEF (2 * ((bit_depth + 7) / 8))

310

311 static void check_itxfm(void)

312 {

313 LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]);

314 LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]);

315 LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);

316 LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);

317 LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]);

318 LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]);

319 LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]);

320 declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);

321 VP9DSPContext dsp;

322 int y, x, tx, txtp, bit_depth, sub;

323 static const char *const txtp_types[N_TXFM_TYPES] = {

324 [DCT_DCT] = "dct_dct", [DCT_ADST] = "adst_dct",

325 [ADST_DCT] = "dct_adst", [ADST_ADST] = "adst_adst"

326 };

327

328 for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {

329 ff_vp9dsp_init(&dsp, bit_depth, 0);

330

331 for (tx = TX_4X4; tx <= N_TXFM_SIZES /* 4 = lossless */; tx++) {

332 int sz = 4 << (tx & 3);

333 int n_txtps = tx < TX_32X32 ? N_TXFM_TYPES : 1;

334

335 for (txtp = 0; txtp < n_txtps; txtp++) {

336 // skip testing sub-IDCTs for WHT or ADST since they don't

337 // implement it in any of the SIMD functions. If they do,

338 // consider changing this to ensure we have complete test

339 // coverage. Test sub=1 for dc-only, then 2, 4, 8, 12, etc,

340 // since the arm version can distinguish them at that level.

341 for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;

342 sub < 4 ? (sub <<= 1) : (sub += 4)) {

343 if (check_func(dsp.itxfm_add[tx][txtp],

344 "vp9_inv_%s_%dx%d_sub%d_add_%d",

345 tx == 4 ? "wht_wht" : txtp_types[txtp],

346 sz, sz, sub, bit_depth)) {

347 int eob;

348

349 randomize_buffers();

350 ftx(coef, tx, txtp, sz, bit_depth);

351

352 if (sub < sz) {

353 eob = copy_subcoefs(subcoef0, coef, tx, txtp,

354 sz, sub, bit_depth);

355 } else {

356 eob = sz * sz;

357 memcpy(subcoef0, coef, sz * sz * SIZEOF_COEF);

358 }

359

360 memcpy(dst0, dst, sz * sz * SIZEOF_PIXEL);

361 memcpy(dst1, dst, sz * sz * SIZEOF_PIXEL);

362 memcpy(subcoef1, subcoef0, sz * sz * SIZEOF_COEF);

363 call_ref(dst0, sz * SIZEOF_PIXEL, subcoef0, eob);

364 call_new(dst1, sz * SIZEOF_PIXEL, subcoef1, eob);

365 if (memcmp(dst0, dst1, sz * sz * SIZEOF_PIXEL) ||

366 !is_zero(subcoef0, sz * sz * SIZEOF_COEF) ||

367 !is_zero(subcoef1, sz * sz * SIZEOF_COEF))

368 fail();

369

370 bench_new(dst, sz * SIZEOF_PIXEL, coef, eob);

371 }

372 }

373 }

374 }

375 }

376 report("itxfm");

377 }

378

379 #undef randomize_buffers

380

/*
 * Store one clipped pixel into buf0 at element index pos + tap * jstride.
 * Expands in a scope that provides `buf0`, `jstride` and `bit_depth`; the
 * value is clipped to 8 bits for one-byte pixels and to `bit_depth` bits
 * otherwise.
 */
#define setpx(pos, tap, val) \
    do { \
        if (SIZEOF_PIXEL == 1) { \
            buf0[(pos) + (tap) * jstride] = av_clip_uint8(val); \
        } else { \
            ((uint16_t *)buf0)[(pos) + (tap) * jstride] = \
                av_clip_uintp2(val, bit_depth); \
        } \
    } while (0)

// setdx: like setpx, but the stored value is val - range plus a random
// amount in [0, 2 * range].
// setsx: like setdx, with range scaled up by (bit_depth - 8) bits.
// val can be an assignment and must not be put under ()
#define setdx(pos, tap, val, range) \
    setpx(pos, tap, val-(range)+(rnd()%((range)*2+1)))
#define setsx(pos, tap, val, range) \
    setdx(pos, tap, val, (range) << (bit_depth - 8))

/*
 * Fill 8 lines across a loop-filter edge in buf0 with pixel patterns chosen
 * to reach the different filter paths (two lines each):
 *   lines 0-1 "flat16":  all 8 pixels on each side stay within F of p0/q0
 *   lines 2-3 "flat8":   the 4 nearest pixels stay within F, the rest random
 *   lines 4-5 "regular": neighbouring pixels differ by at most I, rest random
 *   lines 6-7 "off":     fully random pixels
 * In the first three groups p0 is kept within E >> 2 of q0. All thresholds
 * are scaled by (bit_depth - 8) bits via setsx().
 *
 * bidx selects the entry of E/F/I to use, lineoff is the first line to fill,
 * and str is the step between taps when dir is non-zero. H and buf1 are
 * accepted but not used here.
 */
static void randomize_loopfilter_buffers(int bidx, int lineoff, int str,
                                         int bit_depth, int dir, const int *E,
                                         const int *F, const int *H, const int *I,
                                         uint8_t *buf0, uint8_t *buf1)
{
    const uint32_t px_mask = (1 << bit_depth) - 1;
    const int first   = dir ? lineoff : lineoff * 16;
    const int istride = dir ? 1 : 16;   // step between lines
    const int jstride = dir ? str : 1;  // step between taps (used by setpx)

    for (int line = 0; line < 8; line++) {
        const int pos = first + line * istride;
        int tap;

        if (line < 2) { /* flat16 */
            int p0, q0;

            setpx(pos,  0, q0 = rnd() & px_mask);
            setsx(pos, -1, p0 = q0, E[bidx] >> 2);
            for (tap = 1; tap < 8; tap++) {
                setsx(pos, -1 - tap, p0, F[bidx]);
                setsx(pos, tap, q0, F[bidx]);
            }
        } else if (line < 4) { /* flat8 */
            int p0, q0;

            setpx(pos,  0, q0 = rnd() & px_mask);
            setsx(pos, -1, p0 = q0, E[bidx] >> 2);
            for (tap = 1; tap < 4; tap++) {
                setsx(pos, -1 - tap, p0, F[bidx]);
                setsx(pos, tap, q0, F[bidx]);
            }
            for (tap = 4; tap < 8; tap++) {
                setpx(pos, -1 - tap, rnd() & px_mask);
                setpx(pos, tap, rnd() & px_mask);
            }
        } else if (line < 6) { /* regular */
            int p2, p1, p0, q0, q1, q2;

            setpx(pos,  0, q0 = rnd() & px_mask);
            setsx(pos,  1, q1 = q0, I[bidx]);
            setsx(pos,  2, q2 = q1, I[bidx]);
            setsx(pos,  3, q2,      I[bidx]);
            setsx(pos, -1, p0 = q0, E[bidx] >> 2);
            setsx(pos, -2, p1 = p0, I[bidx]);
            setsx(pos, -3, p2 = p1, I[bidx]);
            setsx(pos, -4, p2,      I[bidx]);
            for (tap = 4; tap < 8; tap++) {
                setpx(pos, -1 - tap, rnd() & px_mask);
                setpx(pos, tap, rnd() & px_mask);
            }
        } else { /* off */
            for (tap = 0; tap < 8; tap++) {
                setpx(pos, -1 - tap, rnd() & px_mask);
                setpx(pos, tap, rnd() & px_mask);
            }
        }
    }
}

// Convenience wrapper: picks up bit_depth, dir, the threshold tables and the
// two buffers from the scope where it is expanded.
#define randomize_buffers(bidx, lineoff, str) \
    randomize_loopfilter_buffers(bidx, lineoff, str, bit_depth, dir, \
                                 E, F, H, I, buf0, buf1)

451

452 static void check_loopfilter(void)

453 {

454 LOCAL_ALIGNED_32(uint8_t, base0, [32 + 16 * 16 * 2]);

455 LOCAL_ALIGNED_32(uint8_t, base1, [32 + 16 * 16 * 2]);

456 VP9DSPContext dsp;

457 int dir, wd, wd2, bit_depth;

458 static const char *const dir_name[2] = { "h", "v" };

459 static const int E[2] = { 20, 28 }, I[2] = { 10, 16 };

460 static const int H[2] = { 7, 11 }, F[2] = { 1, 1 };

461 declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int E, int I, int H);

462

463 for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {

464 ff_vp9dsp_init(&dsp, bit_depth, 0);

465

466 for (dir = 0; dir < 2; dir++) {

467 int midoff = (dir ? 8 * 8 : 8) * SIZEOF_PIXEL;

468 int midoff_aligned = (dir ? 8 * 8 : 16) * SIZEOF_PIXEL;

469 uint8_t *buf0 = base0 + midoff_aligned;

470 uint8_t *buf1 = base1 + midoff_aligned;

471

472 for (wd = 0; wd < 3; wd++) {

473 // 4/8/16wd_8px

474 if (check_func(dsp.loop_filter_8[wd][dir],

475 "vp9_loop_filter_%s_%d_8_%dbpp",

476 dir_name[dir], 4 << wd, bit_depth)) {

477 randomize_buffers(0, 0, 8);

478 memcpy(buf1 - midoff, buf0 - midoff,

479 16 * 8 * SIZEOF_PIXEL);

480 call_ref(buf0, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);

481 call_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);

482 if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 8 * SIZEOF_PIXEL))

483 fail();

484 bench_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);

485 }

486 }

487

488 midoff = (dir ? 16 * 8 : 8) * SIZEOF_PIXEL;

489 midoff_aligned = (dir ? 16 * 8 : 16) * SIZEOF_PIXEL;

490

491 buf0 = base0 + midoff_aligned;

492 buf1 = base1 + midoff_aligned;

493

494 // 16wd_16px loopfilter

495 if (check_func(dsp.loop_filter_16[dir],

496 "vp9_loop_filter_%s_16_16_%dbpp",

497 dir_name[dir], bit_depth)) {

498 randomize_buffers(0, 0, 16);

499 randomize_buffers(0, 8, 16);

500 memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);

501 call_ref(buf0, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);

502 call_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);

503 if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))

504 fail();

505 bench_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);

506 }

507

508 for (wd = 0; wd < 2; wd++) {

509 for (wd2 = 0; wd2 < 2; wd2++) {

510 // mix2 loopfilter

511 if (check_func(dsp.loop_filter_mix2[wd][wd2][dir],

512 "vp9_loop_filter_mix2_%s_%d%d_16_%dbpp",

513 dir_name[dir], 4 << wd, 4 << wd2, bit_depth)) {

514 randomize_buffers(0, 0, 16);

515 randomize_buffers(1, 8, 16);

516 memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);

517 #define M(a) (((a)[1] << 8) | (a)[0])

518 call_ref(buf0, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));

519 call_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));

520 if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))

521 fail();

522 bench_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));

523 #undef M

524 }

525 }

526 }

527 }

528 }

529 report("loopfilter");

530 }

531

532 #undef setsx

533 #undef setpx

534 #undef setdx

535 #undef randomize_buffers

536

/* Bytes in one size x size destination block (rows packed back to back). */
537 #define DST_BUF_SIZE (size * size * SIZEOF_PIXEL)

/* Source buffer row pitch, in pixels. */
538 #define SRC_BUF_STRIDE 72

/* Bytes of source buffer that get randomized: size + 7 full rows. */
539 #define SRC_BUF_SIZE ((size + 7) * SRC_BUF_STRIDE * SIZEOF_PIXEL)

/* Source pointer handed to the mc functions: 3 rows and 3 pixels into `buf`.
 * NOTE(review): presumably the margin needed by the 8-tap filters - confirm. */
540 #define src (buf + 3 * SIZEOF_PIXEL * (SRC_BUF_STRIDE + 1))

541

/*
 * Fill the motion-compensation source buffer with random pixels.
 *
 * Expands in a scope that provides `buf`, `dst0`, `dst1`, `op`, `size` and
 * `bit_depth`. When op == 1 ("avg") both destination buffers additionally
 * receive the same random contents, since that operation reads the
 * destination.
 */
#define randomize_buffers() \
    do { \
        const uint32_t depth_mask = pixel_mask[(bit_depth - 8) >> 1]; \
        int pos; \
        for (pos = 0; pos < SRC_BUF_SIZE; pos += 4) { \
            const uint32_t word = rnd() & depth_mask; \
            AV_WN32A(buf + pos, word); \
        } \
        if (op == 1) { \
            for (pos = 0; pos < DST_BUF_SIZE; pos += 4) { \
                const uint32_t word = rnd() & depth_mask; \
                AV_WN32A(dst0 + pos, word); \
                AV_WN32A(dst1 + pos, word); \
            } \
        } \
    } while (0)

558

559 static void check_mc(void)

560 {

561 LOCAL_ALIGNED_32(uint8_t, buf, [72 * 72 * 2]);

562 LOCAL_ALIGNED_32(uint8_t, dst0, [64 * 64 * 2]);

563 LOCAL_ALIGNED_32(uint8_t, dst1, [64 * 64 * 2]);

564 VP9DSPContext dsp;

565 int op, hsize, bit_depth, filter, dx, dy;

566 declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,

567 const uint8_t *ref, ptrdiff_t ref_stride,

568 int h, int mx, int my);

569 static const char *const filter_names[4] = {

570 "8tap_smooth", "8tap_regular", "8tap_sharp", "bilin"

571 };

572 static const char *const subpel_names[2][2] = { { "", "h" }, { "v", "hv" } };

573 static const char *const op_names[2] = { "put", "avg" };

574 char str[256];

575

576 for (op = 0; op < 2; op++) {

577 for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {

578 ff_vp9dsp_init(&dsp, bit_depth, 0);

579 for (hsize = 0; hsize < 5; hsize++) {

580 int size = 64 >> hsize;

581

582 for (filter = 0; filter < 4; filter++) {

583 for (dx = 0; dx < 2; dx++) {

584 for (dy = 0; dy < 2; dy++) {

585 if (dx || dy) {

586 snprintf(str, sizeof(str),

587 "%s_%s_%d%s", op_names[op],

588 filter_names[filter], size,

589 subpel_names[dy][dx]);

590 } else {

591 snprintf(str, sizeof(str),

592 "%s%d", op_names[op], size);

593 }

594 if (check_func(dsp.mc[hsize][filter][op][dx][dy],

595 "vp9_%s_%dbpp", str, bit_depth)) {

596 int mx = dx ? 1 + (rnd() % 14) : 0;

597 int my = dy ? 1 + (rnd() % 14) : 0;

598 randomize_buffers();

599 call_ref(dst0, size * SIZEOF_PIXEL,

600 src, SRC_BUF_STRIDE * SIZEOF_PIXEL,

601 size, mx, my);

602 call_new(dst1, size * SIZEOF_PIXEL,

603 src, SRC_BUF_STRIDE * SIZEOF_PIXEL,

604 size, mx, my);

605 if (memcmp(dst0, dst1, DST_BUF_SIZE))

606 fail();

607

608 // simd implementations for each filter of subpel

609 // functions are identical

610 if (filter >= 1 && filter <= 2) continue;

611 // 10/12 bpp for bilin are identical

612 if (bit_depth == 12 && filter == 3) continue;

613

614 bench_new(dst1, size * SIZEOF_PIXEL,

615 src, SRC_BUF_STRIDE * SIZEOF_PIXEL,

616 size, mx, my);

617 }

618 }

619 }

620 }

621 }

622 }

623 }

624 report("mc");

625 }

626

627 void checkasm_check_vp9dsp(void)

628 {

629 check_ipred();

630 check_itxfm();

631 check_loopfilter();

632 check_mc();

633 }

declare_func_emms

#define declare_func_emms(cpu_flags, ret,...)

Definition: checkasm.h:185

static const uint8_t q1[256]

Definition: twofish.c:100

fwht_1d

static void fwht_1d(double *out, const double *in, int sz)

Definition: vp9dsp.c:124

setpx

#define setpx(a, b, c)

Definition: vp9dsp.c:381

mem_internal.h

DC_128_PRED

@ DC_128_PRED

Definition: vp9.h:58

out

FILE * out

Definition: movenc.c:55

N_TXFM_TYPES

@ N_TXFM_TYPES

Definition: vp9.h:42

mask

int mask

Definition: mediacodecdec_common.c:154

VP9DSPContext::loop_filter_8

void(* loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)

Definition: vp9dsp.h:81

TM_VP8_PRED

@ TM_VP8_PRED

Definition: vp9.h:55

DC_PRED

@ DC_PRED

Definition: vp9.h:48

VP9DSPContext

Definition: vp9dsp.h:40

check_func

#define check_func(func,...)

Definition: checkasm.h:179

randomize_loopfilter_buffers

static void randomize_loopfilter_buffers(int bidx, int lineoff, int str, int bit_depth, int dir, const int *E, const int *F, const int *H, const int *I, uint8_t *buf0, uint8_t *buf1)

Definition: vp9dsp.c:393

VERT_LEFT_PRED

@ VERT_LEFT_PRED

Definition: vp9.h:53

fadst4_1d

static void fadst4_1d(double *out, const double *in, int sz)

Definition: vp9dsp.c:154

#define F(x)

filter

void(* filter)(uint8_t *src, int stride, int qscale)

Definition: h263dsp.c:29

mathematics.h

call_ref

#define call_ref(...)

Definition: checkasm.h:194

bit_depth

static void bit_depth(AudioStatsContext *s, const uint64_t *const mask, uint8_t *depth)

Definition: af_astats.c:246

check_itxfm

static void check_itxfm(void)

Definition: vp9dsp.c:311

N_TXFM_SIZES

@ N_TXFM_SIZES

Definition: vp9.h:32

uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t mx

Definition: dsp.h:53

DC_127_PRED

@ DC_127_PRED

Definition: vp9.h:59

VP9DSPContext::loop_filter_mix2

void(* loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)

Definition: vp9dsp.h:103

fail

#define fail()

Definition: checkasm.h:188

VERT_PRED

@ VERT_PRED

Definition: vp9.h:46

trunc

static __device__ float trunc(float a)

Definition: cuda_runtime.h:179

SIZEOF_PIXEL

#define SIZEOF_PIXEL

Definition: vp9dsp.c:34

checkasm.h

DIAG_DOWN_RIGHT_PRED

@ DIAG_DOWN_RIGHT_PRED

Definition: vp9.h:50

copy_subcoefs

static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx, enum TxfmType txtp, int sz, int sub, int bit_depth)

Definition: vp9dsp.c:253

check_mc

static void check_mc(void)

Definition: vp9dsp.c:559

fadst_1d

static void fadst_1d(double *out, const double *in, int sz)

Definition: vp9dsp.c:168

setsx

#define setsx(a, b, c, d)

Definition: vp9dsp.c:392

lrint

#define lrint

Definition: tablegen.h:53

rnd

#define rnd()

Definition: checkasm.h:172

SIZEOF_COEF

#define SIZEOF_COEF

Definition: vp9dsp.c:309

HOR_PRED

@ HOR_PRED

Definition: vp9.h:47

emms_c

#define emms_c()

Definition: emms.h:63

intreadwrite.h

src

#define src

Definition: vp9dsp.c:540

AV_ZERO32

#define AV_ZERO32(d)

Definition: intreadwrite.h:662

static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)

Perform decode operation.

Definition: anm.c:76

ff_vp9_scans

const int16_t *const ff_vp9_scans[5][4]

Definition: vp9data.c:600

vp9data.h

SRC_BUF_STRIDE

#define SRC_BUF_STRIDE

Definition: vp9dsp.c:538

LEFT_DC_PRED

@ LEFT_DC_PRED

Definition: vp9.h:56

check_loopfilter

static void check_loopfilter(void)

Definition: vp9dsp.c:452

ff_vp9dsp_init

av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)

Definition: vp9dsp.c:88

ftx_2d

static void ftx_2d(double *out, const double *in, enum TxfmMode tx, enum TxfmType txtp, int sz)

Definition: vp9dsp.c:180

ftx

static void ftx(int16_t *buf, enum TxfmMode tx, enum TxfmType txtp, int sz, int bit_depth)

Definition: vp9dsp.c:231

static const uint8_t q0[256]

Definition: twofish.c:81

#define E

Definition: avdct.c:33

uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t my

Definition: dsp.h:53

DCT_ADST

@ DCT_ADST

Definition: vp9.h:39

call_new

#define call_new(...)

Definition: checkasm.h:297

LOCAL_ALIGNED_32

#define LOCAL_ALIGNED_32(t, v,...)

Definition: mem_internal.h:156

#define M(a)

Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c

Definition: undefined.txt:32

VP9DSPContext::itxfm_add

void(* itxfm_add[N_TXFM_SIZES+1][N_TXFM_TYPES])(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob)

Definition: vp9dsp.h:71

VP9DSPContext::intra_pred

void(* intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)

Definition: vp9dsp.h:52

TxfmMode

Definition: vp9.h:27

vp9.h

DCT_DCT

@ DCT_DCT

Definition: vp9.h:38

TxfmType

Definition: vp9.h:37

pixel_mask

static const uint32_t pixel_mask[3]

Definition: vp9dsp.c:33

N_INTRA_PRED_MODES

@ N_INTRA_PRED_MODES

Definition: vp9.h:61

dst

uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst

Definition: dsp.h:83

size

int size

Definition: twinvq_data.h:10344

VERT_RIGHT_PRED

@ VERT_RIGHT_PRED

Definition: vp9.h:51

VP9DSPContext::mc

vp9_mc_func mc[5][N_FILTERS][2][2][2]

Definition: vp9dsp.h:115

TX_4X4

@ TX_4X4

Definition: vp9.h:28

The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a

Definition: undefined.txt:41

#define H

Definition: pixlet.c:39

M_PI

#define M_PI

Definition: mathematics.h:67

ftx1d_fn

void(* ftx1d_fn)(double *out, const double *in, int sz)

Definition: vp9dsp.c:179

emms.h

randomize_buffers

#define randomize_buffers()

Definition: vp9dsp.c:542

report

#define report

Definition: checkasm.h:191

is_zero

static int is_zero(const int16_t *c, int sz)

Definition: vp9dsp.c:298

bench_new

#define bench_new(...)

Definition: checkasm.h:368

#define i(width, name, range_min, range_max)

Definition: cbs_h2645.c:256

fdct_1d

static void fdct_1d(double *out, const double *in, int sz)

Definition: vp9dsp.c:139

DC_129_PRED

@ DC_129_PRED

Definition: vp9.h:60

internal.h

common.h

ADST_ADST

@ ADST_ADST

Definition: vp9.h:41

AV_COPY32

#define AV_COPY32(d, s)

Definition: intreadwrite.h:634

AV_RN32A

#define AV_RN32A(p)

Definition: intreadwrite.h:522

stride

#define stride

Definition: h264pred_template.c:537

left

Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left

Definition: snow.txt:386

M_SQRT1_2

#define M_SQRT1_2

Definition: mathematics.h:103

AV_CPU_FLAG_MMX

#define AV_CPU_FLAG_MMX

standard MMX

Definition: cpu.h:30

HOR_UP_PRED

@ HOR_UP_PRED

Definition: vp9.h:54

mode