1 /*
2 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
23
/*
 * Pack the even-indexed bytes of in0 and in1 into a single vector with
 * __msa_pckev_b(), average that vector byte-wise (unsigned) with dst using
 * __msa_aver_u_b(), and store the v16u8 result through ST_UB() at pdst.
 *
 * in0, in1 - vectors to pack; handed to __msa_pckev_b() in this order
 * dst      - vector the packed bytes are averaged against
 * pdst     - destination pointer passed to ST_UB()
 *
 * Each argument is parenthesized before it is cast, so an expression
 * argument is cast as a whole instead of only its first operand.
 * The brace-block form is kept so existing call sites remain valid.
 */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                          \
{                                                                     \
    v16u8 tmp_m;                                                      \
                                                                      \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) (in0), (v16i8) (in1));      \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) (dst));                     \
    ST_UB(tmp_m, (pdst));                                             \
}
32
/*
 * Pack four pairs of input vectors with PCKEV_B4_SB() and write the four
 * resulting v16i8 vectors with ST_SB4(), starting at pdst and using stride.
 *
 * in0..in7 - input vectors, forwarded to PCKEV_B4_SB() in the given order
 * pdst     - destination pointer (converted to uint8_t * once, up front)
 * stride   - stride forwarded to ST_SB4()
 */
#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
    v16i8 pck0_m, pck1_m, pck2_m, pck3_m;                                   \
                                                                            \
    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
                pck0_m, pck1_m, pck2_m, pck3_m);                            \
    ST_SB4(pck0_m, pck1_m, pck2_m, pck3_m, pdst_m, stride);                 \
}
42
/*
 * Combine four computed rows with four rows already held in dst0..dst3 and
 * store the averaged result.
 *
 * Steps, as performed by the helper macros invoked below:
 *   1. PCKEV_B2_UB() packs (in2, in1) and (in4, in3) into two vectors.
 *   2. PCKEV_D2_UB() packs (dst1, dst0) and (dst3, dst2) into two vectors.
 *   3. AVER_UB2_UB() averages the vectors from step 1 with those from
 *      step 2, overwriting the step-1 vectors with the result.
 *   4. ST_D4() stores from the two result vectors using the index
 *      arguments 0, 1, 0, 1 (presumably one doubleword per row, four
 *      rows in total - confirm against the ST_D4 definition).
 *
 * pdst   - destination pointer (converted to uint8_t * once, up front)
 * stride - stride forwarded to ST_D4()
 */
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride)                                \
{                                                                       \
    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
    v16u8 new0_m, new1_m, old0_m, old1_m;                               \
                                                                        \
    PCKEV_B2_UB(in2, in1, in4, in3, new0_m, new1_m);                    \
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, old0_m, old1_m);                \
    AVER_UB2_UB(new0_m, old0_m, new1_m, old1_m, new0_m, new1_m);        \
    ST_D4(new0_m, new1_m, 0, 1, 0, 1, pdst_m, stride);                  \
}
54
58 {
59 uint8_t loop_cnt;
60 uint32_t out0, out1;
61 v16u8
src0,
src1, src0_sld1, src1_sld1, res0, res1;
62 v16i8 zeros = { 0 };
63
64 for (loop_cnt = (
height >> 1); loop_cnt--;) {
66 src += (2 * src_stride);
67
70
71 out0 = __msa_copy_u_w((v4i32) res0, 0);
72 out1 = __msa_copy_u_w((v4i32) res1, 0);
77 }
78 }
79
83 {
84 uint8_t loop_cnt;
85 v16i8
src0,
src1,
src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
86 v16i8 zeros = { 0 };
87
88 for (loop_cnt = (
height >> 2); loop_cnt--;) {
90 src += (4 * src_stride);
91
93 src0_sld1, src1_sld1, src2_sld1, src3_sld1);
95 src2, src2_sld1, src3, src3_sld1,
dst, dst_stride);
96 dst += (4 * dst_stride);
97 }
98 }
99
103 {
104 uint8_t loop_cnt;
106 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
107
108 for (loop_cnt = (
height >> 3); loop_cnt--;) {
111 src8, src9, src10, src11, src12, src13, src14, src15);
112 src += (8 * src_stride);
113
116 dst += (4 * dst_stride);
117
118 AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
120 dst += (4 * dst_stride);
121 }
122 }
123
126 {
128 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
129 v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
130 v16i8 zeros = { 0 };
131
133 src += (8 * src_stride);
134
136 src0_sld1, src1_sld1, src2_sld1, src3_sld1);
137 SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
138 src4_sld1, src5_sld1, src6_sld1, src7_sld1);
139
141 src2, src2_sld1, src3, src3_sld1,
dst, dst_stride);
142 dst += (4 * dst_stride);
144 src6, src6_sld1, src7, src7_sld1,
dst, dst_stride);
145 }
146
149 {
150 v16i8
src0,
src1,
src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
151 v16i8 zeros = { 0 };
152
155 src0_sld1, src1_sld1, src2_sld1, src3_sld1);
157 src2, src2_sld1, src3, src3_sld1,
dst, dst_stride);
158 }
159
163 {
164 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
165 v16u8 src9, src10, src11, src12, src13, src14, src15;
166
169 src8, src9, src10, src11, src12, src13, src14, src15);
170 src += (8 * src_stride);
171
174 dst += (4 * dst_stride);
175
177 LD_UB4((
src + 1), src_stride, src8, src9, src10, src11);
178 src += (4 * src_stride);
179
180 AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
182 dst += (4 * dst_stride);
183
184 LD_UB4(
src, src_stride, src4, src5, src6, src7);
185 LD_UB4((
src + 1), src_stride, src12, src13, src14, src15);
186 src += (4 * src_stride);
187
190 dst += (4 * dst_stride);
191 AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
193 }
194
198 {
199 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
200 v16u8 src9, src10, src11, src12, src13, src14, src15;
201
204 src8, src9, src10, src11, src12, src13, src14, src15);
205
208 dst += (4 * dst_stride);
209 AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
211 }
212
217 {
218 uint8_t loop_cnt;
219 uint32_t dst0, dst1, out0, out1;
220 v16u8
src0,
src1, src0_sld1, src1_sld1, res0, res1;
221 v16u8 tmp0 = { 0 };
222 v16u8 tmp1 = { 0 };
223 v16i8 zeros = { 0 };
224
225 for (loop_cnt = (
height >> 1); loop_cnt--;) {
227 src += (2 * src_stride);
228
230
232 dst1 =
LW(
dst + dst_stride);
233 tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
234 tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
235
238
239 out0 = __msa_copy_u_w((v4i32) res0, 0);
240 out1 = __msa_copy_u_w((v4i32) res1, 0);
245 }
246 }
247
252 {
253 uint8_t loop_cnt;
254 v16i8
src0,
src1,
src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
255 v16i8 zeros = { 0 };
256
257 for (loop_cnt = (
height >> 2); loop_cnt--;) {
259 src += (4 * src_stride);
260
262 src0_sld1, src1_sld1, src2_sld1, src3_sld1);
263
265 src3, src3_sld1,
dst, dst_stride);
266 dst += (4 * dst_stride);
267 }
268 }
269
274 {
275 uint8_t loop_cnt;
276 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
277 v16u8 src9, src10, src11, src12, src13, src14, src15;
278
279 for (loop_cnt = (
height >> 3); loop_cnt--;) {
282 src8, src9, src10, src11, src12, src13, src14, src15);
283 src += (8 * src_stride);
284
287 dst += (4 * dst_stride);
290 dst += (4 * dst_stride);
291 }
292 }
293
297 {
298 uint8_t loop_cnt;
299 uint32_t out0, out1;
301
304
305 for (loop_cnt = (
height >> 1); loop_cnt--;) {
307 src += (2 * src_stride);
308
310
311 out0 = __msa_copy_u_w((v4i32) res0, 0);
312 out1 = __msa_copy_u_w((v4i32) res1, 0);
317
319 }
320 }
321
325 {
326 uint8_t loop_cnt;
328
331
332 for (loop_cnt = (
height >> 2); loop_cnt--;) {
334 src += (4 * src_stride);
335
338 dst += (4 * dst_stride);
339
341 }
342 }
343
347 {
348 uint8_t loop_cnt;
349 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
350
353
354 for (loop_cnt = (
height >> 3); loop_cnt--;) {
356 src += (8 * src_stride);
357
360 dst += (4 * dst_stride);
363 dst += (4 * dst_stride);
364
366 }
367 }
368
371 {
372 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
373
375 src += (8 * src_stride);
377
380 dst += (4 * dst_stride);
381
382 AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
384 }
385
388 {
390
394 }
395
399 {
400 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
401 v16u8 src9, src10, src11, src12, src13, src14, src15, src16;
402
404 src += (8 * src_stride);
406 src8, src9, src10, src11, src12, src13, src14, src15);
407 src += (8 * src_stride);
409
412 dst += (4 * dst_stride);
413 AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
415 dst += (4 * dst_stride);
416 AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
418 dst += (4 * dst_stride);
420 src14, src15, src15, src16,
dst, dst_stride);
421 }
422
426 {
427 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
428
430 src += (8 * src_stride);
432
435 dst += (4 * dst_stride);
436 AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
438 }
439
444 {
445 uint8_t loop_cnt;
446 uint32_t out0, out1, dst0, dst1;
448 v16u8 tmp0 = { 0 };
449 v16u8 tmp1 = { 0 };
450 v16u8 res0, res1;
451
454
455 for (loop_cnt = (
height >> 1); loop_cnt--;) {
457 src += (2 * src_stride);
459 dst1 =
LW(
dst + dst_stride);
460 tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
461 tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
464 out0 = __msa_copy_u_w((v4i32) res0, 0);
465 out1 = __msa_copy_u_w((v4i32) res1, 0);
471 }
472 }
473
478 {
479 uint8_t loop_cnt;
481
484
485 for (loop_cnt = (
height >> 2); loop_cnt--;) {
487 src += (4 * src_stride);
488
491 dst += (4 * dst_stride);
493 }
494 }
495
500 {
501 uint8_t loop_cnt;
502 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
503 v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
504 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
505
508
509 for (loop_cnt = (
height >> 3); loop_cnt--;) {
511 src += (8 * src_stride);
513 res0, res1, res2, res3);
514 AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
515 res4, res5, res6, res7);
516
517 LD_UB8(
dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
518 AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
519 res0, res1, res2, res3);
520 AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
521 res4, res5, res6, res7);
522 ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7,
dst, dst_stride);
523 dst += (8 * dst_stride);
524
526 }
527 }
528
532 {
533 uint8_t loop_cnt;
534 uint32_t res0, res1;
535 v16i8
src0,
src1,
src2, src0_sld1, src1_sld1, src2_sld1;
536 v16u8 src0_r, src1_r, src2_r, res;
537 v8u16 add0, add1, add2, sum0, sum1;
538 v16i8 zeros = { 0 };
539
542
543 for (loop_cnt = (
height >> 1); loop_cnt--;) {
545 src += (2 * src_stride);
546
548 src1_sld1, src2_sld1);
550 src0_r, src1_r, src2_r);
551 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
552 ADD2(add0, add1, add1, add2, sum0, sum1);
554 res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
555 res0 = __msa_copy_u_w((v4i32) res, 0);
556 res1 = __msa_copy_u_w((v4i32) res, 2);
561
563 }
564 }
565
569 {
570 uint8_t loop_cnt;
572 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
573 v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
574 v8u16 add0, add1, add2, add3, add4;
575 v8u16 sum0, sum1, sum2, sum3;
576 v16i8 zeros = { 0 };
577
580
581 for (loop_cnt = (
height >> 2); loop_cnt--;) {
583 src += (4 * src_stride);
584
586 src1_sld1, src2_sld1);
587 SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
589 src1_r, src2_r);
590 ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
591 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
593 ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
594 sum0, sum1, sum2, sum3);
598 dst += (4 * dst_stride);
600 }
601 }
602
606 {
607 uint8_t loop_cnt;
608 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9;
609 v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
610 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
611 v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
612 v8u16 src7_l, src8_l;
613 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
614 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
615
616 for (loop_cnt = (
height >> 3); loop_cnt--;) {
619 src9, src10, src11, src12, src13, src14, src15, src16);
620 src += (8 * src_stride);
621
624
634 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
635 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
636 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
637 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
638 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
639 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
640 ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
641 sum0_r, sum1_r, sum2_r, sum3_r);
642 ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
643 sum4_r, sum5_r, sum6_r, sum7_r);
644 ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
645 sum0_l, sum1_l, sum2_l, sum3_l);
646 ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
647 sum4_l, sum5_l, sum6_l, sum7_l);
652 PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
653 sum3_l, sum3_r,
dst, dst_stride);
654 dst += (4 * dst_stride);
655 PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
656 sum7_l, sum7_r,
dst, dst_stride);
657 dst += (4 * dst_stride);
658 }
659 }
660
663 {
664 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
665 v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
666 v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
667 v8u16 src0_r, src1_r, src2_r, src3_r;
668 v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
669 v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
670 v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
671 v16i8 out0, out1;
672 v16i8 zeros = { 0 };
673
675 src += (8 * src_stride);
677
679 src0_sld1, src1_sld1, src2_sld1, src3_sld1);
680 SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1,
681 src5_sld1, src6_sld1);
682 SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1);
684 src3, src0_r, src1_r, src2_r, src3_r);
685 ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
686 src5_r, src6_r);
687 ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
688 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
689 HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
690 HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
691
692 sum0 = add0 + add1 + 1;
693 sum1 = add1 + add2 + 1;
694 sum2 = add2 + add3 + 1;
695 sum3 = add3 + add4 + 1;
696 sum4 = add4 + add5 + 1;
697 sum5 = add5 + add6 + 1;
698 sum6 = add6 + add7 + 1;
699 sum7 = add7 + add8 + 1;
700
701 SRA_4V(sum0, sum1, sum2, sum3, 2);
702 SRA_4V(sum4, sum5, sum6, sum7, 2);
704 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
706 ST_D4(out0, out1, 0, 1, 0, 1,
dst + 4 * dst_stride, dst_stride);
707 }
708
711 {
713 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
714 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
715 v8u16 add0, add1, add2, add3, add4;
716 v8u16 sum0, sum1, sum2, sum3;
717 v16i8 out0, out1;
718 v16i8 zeros = { 0 };
719
721 src += (4 * src_stride);
723
725 src1_sld1, src2_sld1);
726 SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
728 src1_r, src2_r);
729 ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
730 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
732
733 sum0 = add0 + add1 + 1;
734 sum1 = add1 + add2 + 1;
735 sum2 = add2 + add3 + 1;
736 sum3 = add3 + add4 + 1;
737
738 SRA_4V(sum0, sum1, sum2, sum3, 2);
740 ST_D4(out0, out1, 0, 1, 0, 1,
dst, dst_stride);
741 }
742
746 {
747 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9;
748 v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
749 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
750 v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
751 v8u16 src7_l, src8_l;
752 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
753 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
754
757 src9, src10, src11, src12, src13, src14, src15, src16);
758 src += (8 * src_stride);
761
771
772 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
773 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
774 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
775 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
776 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
777 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
778
779 sum0_r = src0_r + src1_r + 1;
780 sum1_r = src1_r + src2_r + 1;
781 sum2_r = src2_r + src3_r + 1;
782 sum3_r = src3_r + src4_r + 1;
783 sum4_r = src4_r + src5_r + 1;
784 sum5_r = src5_r + src6_r + 1;
785 sum6_r = src6_r + src7_r + 1;
786 sum7_r = src7_r + src8_r + 1;
787 sum0_l = src0_l + src1_l + 1;
788 sum1_l = src1_l + src2_l + 1;
789 sum2_l = src2_l + src3_l + 1;
790 sum3_l = src3_l + src4_l + 1;
791 sum4_l = src4_l + src5_l + 1;
792 sum5_l = src5_l + src6_l + 1;
793 sum6_l = src6_l + src7_l + 1;
794 sum7_l = src7_l + src8_l + 1;
795
796 SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
797 SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
798 SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
799 SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
801 sum2_l, sum2_r, sum3_l, sum3_r,
dst, dst_stride);
802 dst += (4 * dst_stride);
803
806 src9, src10, src11, src12, src13, src14, src15, src16);
807 src += (8 * src_stride);
810
812 sum6_l, sum6_r, sum7_l, sum7_r,
dst, dst_stride);
813 dst += (4 * dst_stride);
814
824
825 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
826 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
827 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
828 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
829 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
830 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
831
832 sum0_r = src0_r + src1_r + 1;
833 sum1_r = src1_r + src2_r + 1;
834 sum2_r = src2_r + src3_r + 1;
835 sum3_r = src3_r + src4_r + 1;
836 sum4_r = src4_r + src5_r + 1;
837 sum5_r = src5_r + src6_r + 1;
838 sum6_r = src6_r + src7_r + 1;
839 sum7_r = src7_r + src8_r + 1;
840 sum0_l = src0_l + src1_l + 1;
841 sum1_l = src1_l + src2_l + 1;
842 sum2_l = src2_l + src3_l + 1;
843 sum3_l = src3_l + src4_l + 1;
844 sum4_l = src4_l + src5_l + 1;
845 sum5_l = src5_l + src6_l + 1;
846 sum6_l = src6_l + src7_l + 1;
847 sum7_l = src7_l + src8_l + 1;
848
849 SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
850 SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
851 SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
852 SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
854 sum2_l, sum2_r, sum3_l, sum3_r,
dst, dst_stride);
855 dst += (4 * dst_stride);
857 sum6_l, sum6_r, sum7_l, sum7_r,
dst, dst_stride);
858 }
859
863 {
864 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9;
865 v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
866 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
867 v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
868 v8u16 src7_l, src8_l;
869 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
870 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
871
874 src9, src10, src11, src12, src13, src14, src15, src16);
875 src += (8 * src_stride);
878
888
889 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
890 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
891 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
892 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
893 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
894 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
895
896 sum0_r = src0_r + src1_r + 1;
897 sum1_r = src1_r + src2_r + 1;
898 sum2_r = src2_r + src3_r + 1;
899 sum3_r = src3_r + src4_r + 1;
900 sum4_r = src4_r + src5_r + 1;
901 sum5_r = src5_r + src6_r + 1;
902 sum6_r = src6_r + src7_r + 1;
903 sum7_r = src7_r + src8_r + 1;
904 sum0_l = src0_l + src1_l + 1;
905 sum1_l = src1_l + src2_l + 1;
906 sum2_l = src2_l + src3_l + 1;
907 sum3_l = src3_l + src4_l + 1;
908 sum4_l = src4_l + src5_l + 1;
909 sum5_l = src5_l + src6_l + 1;
910 sum6_l = src6_l + src7_l + 1;
911 sum7_l = src7_l + src8_l + 1;
912
913 SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
914 SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
915 SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
916 SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
918 sum2_l, sum2_r, sum3_l, sum3_r,
dst, dst_stride);
919 dst += (4 * dst_stride);
921 sum6_l, sum6_r, sum7_l, sum7_r,
dst, dst_stride);
922 }
923
928 {
929 uint8_t loop_cnt;
930 uint32_t out0, out1;
931 v16i8
src0,
src1,
src2, src0_sld1, src1_sld1, src2_sld1;
932 v16u8 src0_r, src1_r, src2_r;
933 v8u16 add0, add1, add2, sum0, sum1;
934 v16u8 dst0, dst1, res0, res1;
935 v16i8 zeros = { 0 };
936
939
940 for (loop_cnt = (
height >> 1); loop_cnt--;) {
942 src += (2 * src_stride);
943
946 src1_sld1, src2_sld1);
948 src1_r, src2_r);
949 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
950 ADD2(add0, add1, add1, add2, sum0, sum1);
954
955 out0 = __msa_copy_u_w((v4i32) res0, 0);
956 out1 = __msa_copy_u_w((v4i32) res1, 0);
961
963 }
964 }
965
970 {
971 uint8_t loop_cnt;
973 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
974 v16u8 dst0, dst1, dst2, dst3;
975 v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
976 v8u16 add0, add1, add2, add3, add4;
977 v8u16 sum0, sum1, sum2, sum3;
978 v16i8 zeros = { 0 };
979
982
983 for (loop_cnt = (
height >> 2); loop_cnt--;) {
985 src += (4 * src_stride);
986
987 LD_UB4(
dst, dst_stride, dst0, dst1, dst2, dst3);
989 src1_sld1, src2_sld1);
990 SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
992 src1_r, src2_r);
993 ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
994 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
996 ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
997 sum0, sum1, sum2, sum3);
1000 sum2, dst2, sum3, dst3,
dst, dst_stride);
1001 dst += (4 * dst_stride);
1003 }
1004 }
1005
1010 {
1011 uint8_t loop_cnt;
1012 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, src9, src10;
1013 v16u8 src11, src12, src13, src14, src15, src16, src17;
1014 v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
1015 v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
1016 v16u8 src7_l, src8_l;
1017 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1018 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
1019 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
1020 v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
1021
1022 for (loop_cnt = (
height >> 3); loop_cnt--;) {
1025 src9, src10, src11, src12, src13, src14, src15, src16);
1026 src += (8 * src_stride);
1027
1030
1040 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
1041 HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
1042 HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
1043 ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
1044 sum2_r, sum3_r);
1045 ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
1046 sum6_r, sum7_r);
1047 HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
1048 HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
1049 HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
1050 ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
1051 sum2_l, sum3_l);
1052 ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
1053 sum6_l, sum7_l);
1058 LD_UB8(
dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1075 }
1076 }
1077
1081 {
1083 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
1085
1087 for (cnt = (
height / 12); cnt--;) {
1090 src += (8 * src_stride);
1091
1092 out0 = __msa_copy_u_d((v2i64)
src0, 0);
1093 out1 = __msa_copy_u_d((v2i64)
src1, 0);
1094 out2 = __msa_copy_u_d((v2i64)
src2, 0);
1095 out3 = __msa_copy_u_d((v2i64) src3, 0);
1096 out4 = __msa_copy_u_d((v2i64) src4, 0);
1097 out5 = __msa_copy_u_d((v2i64) src5, 0);
1098 out6 = __msa_copy_u_d((v2i64) src6, 0);
1099 out7 = __msa_copy_u_d((v2i64) src7, 0);
1100
1101 SD4(out0, out1, out2, out3,
dst, dst_stride);
1102 dst += (4 * dst_stride);
1103 SD4(out4, out5, out6, out7,
dst, dst_stride);
1104 dst += (4 * dst_stride);
1105
1107 src += (4 * src_stride);
1108
1109 out0 = __msa_copy_u_d((v2i64)
src0, 0);
1110 out1 = __msa_copy_u_d((v2i64)
src1, 0);
1111 out2 = __msa_copy_u_d((v2i64)
src2, 0);
1112 out3 = __msa_copy_u_d((v2i64) src3, 0);
1113
1114 SD4(out0, out1, out2, out3,
dst, dst_stride);
1115 dst += (4 * dst_stride);
1116 }
1117 }
else if (0 ==
height % 8) {
1118 for (cnt =
height >> 3; cnt--;) {
1121 src += (8 * src_stride);
1122
1123 out0 = __msa_copy_u_d((v2i64)
src0, 0);
1124 out1 = __msa_copy_u_d((v2i64)
src1, 0);
1125 out2 = __msa_copy_u_d((v2i64)
src2, 0);
1126 out3 = __msa_copy_u_d((v2i64) src3, 0);
1127 out4 = __msa_copy_u_d((v2i64) src4, 0);
1128 out5 = __msa_copy_u_d((v2i64) src5, 0);
1129 out6 = __msa_copy_u_d((v2i64) src6, 0);
1130 out7 = __msa_copy_u_d((v2i64) src7, 0);
1131
1132 SD4(out0, out1, out2, out3,
dst, dst_stride);
1133 dst += (4 * dst_stride);
1134 SD4(out4, out5, out6, out7,
dst, dst_stride);
1135 dst += (4 * dst_stride);
1136 }
1137 }
else if (0 ==
height % 4) {
1138 for (cnt = (
height / 4); cnt--;) {
1140 src += (4 * src_stride);
1141 out0 = __msa_copy_u_d((v2i64)
src0, 0);
1142 out1 = __msa_copy_u_d((v2i64)
src1, 0);
1143 out2 = __msa_copy_u_d((v2i64)
src2, 0);
1144 out3 = __msa_copy_u_d((v2i64) src3, 0);
1145
1146 SD4(out0, out1, out2, out3,
dst, dst_stride);
1147 dst += (4 * dst_stride);
1148 }
1149 }
else if (0 ==
height % 2) {
1150 for (cnt = (
height / 2); cnt--;) {
1152 src += (2 * src_stride);
1153 out0 = __msa_copy_u_d((v2i64)
src0, 0);
1154 out1 = __msa_copy_u_d((v2i64)
src1, 0);
1155
1160 }
1161 }
1162 }
1163
1167 {
1169 const uint8_t *src_tmp;
1170 uint8_t *dst_tmp;
1172
1173 for (cnt = (
width >> 4); cnt--;) {
1176
1177 for (loop_cnt = (
height >> 3); loop_cnt--;) {
1178 LD_UB8(src_tmp, src_stride,
1180 src_tmp += (8 * src_stride);
1181
1183 dst_tmp, dst_stride);
1184 dst_tmp += (8 * dst_stride);
1185 }
1186
1189 }
1190 }
1191
1195 {
1198
1200 for (cnt = (
height / 12); cnt--;) {
1203 src += (8 * src_stride);
1206 dst += (8 * dst_stride);
1207
1209 src += (4 * src_stride);
1211 dst += (4 * dst_stride);
1212 }
1213 }
else if (0 ==
height % 8) {
1215 }
else if (0 ==
height % 4) {
1216 for (cnt = (
height >> 2); cnt--;) {
1218 src += (4 * src_stride);
1219
1221 dst += (4 * dst_stride);
1222 }
1223 }
1224 }
1225
1229 {
1231 uint32_t out0, out1, out2, out3;
1233 v16u8 dst0, dst1, dst2, dst3;
1234
1236 for (cnt = (
height / 4); cnt--;) {
1238 src += (4 * src_stride);
1239
1240 LD_UB4(
dst, dst_stride, dst0, dst1, dst2, dst3);
1241
1243 dst0, dst1, dst2, dst3);
1244
1245 out0 = __msa_copy_u_w((v4i32) dst0, 0);
1246 out1 = __msa_copy_u_w((v4i32) dst1, 0);
1247 out2 = __msa_copy_u_w((v4i32) dst2, 0);
1248 out3 = __msa_copy_u_w((v4i32) dst3, 0);
1249 SW4(out0, out1, out2, out3,
dst, dst_stride);
1250 dst += (4 * dst_stride);
1251 }
1252 }
else if (0 == (
height % 2)) {
1253 for (cnt = (
height / 2); cnt--;) {
1255 src += (2 * src_stride);
1256
1258
1260
1261 out0 = __msa_copy_u_w((v4i32) dst0, 0);
1262 out1 = __msa_copy_u_w((v4i32) dst1, 0);
1267 }
1268 }
1269 }
1270
1274 {
1276 uint64_t out0, out1, out2, out3;
1278 v16u8 dst0, dst1, dst2, dst3;
1279
1280 for (cnt = (
height / 4); cnt--;) {
1282 src += (4 * src_stride);
1283 LD_UB4(
dst, dst_stride, dst0, dst1, dst2, dst3);
1284
1286 dst0, dst1, dst2, dst3);
1287
1288 out0 = __msa_copy_u_d((v2i64) dst0, 0);
1289 out1 = __msa_copy_u_d((v2i64) dst1, 0);
1290 out2 = __msa_copy_u_d((v2i64) dst2, 0);
1291 out3 = __msa_copy_u_d((v2i64) dst3, 0);
1292 SD4(out0, out1, out2, out3,
dst, dst_stride);
1293 dst += (4 * dst_stride);
1294 }
1295 }
1296
1300 {
1303 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1304
1305 for (cnt = (
height / 8); cnt--;) {
1307 src += (8 * src_stride);
1308 LD_UB8(
dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1309
1311 dst0, dst1, dst2, dst3);
1312 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
1313 dst4, dst5, dst6, dst7);
1314 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
dst, dst_stride);
1315 dst += (8 * dst_stride);
1316 }
1317 }
1318
1320 ptrdiff_t line_size,
int h)
1321 {
1323 }
1324
1326 ptrdiff_t line_size,
int h)
1327 {
1329 }
1330
1332 ptrdiff_t line_size,
int h)
1333 {
1335 }
1336
1338 ptrdiff_t line_size,
int h)
1339 {
1341 }
1342
1344 ptrdiff_t line_size,
int h)
1345 {
1347 }
1348
1350 ptrdiff_t line_size,
int h)
1351 {
1353 }
1354
1356 ptrdiff_t line_size,
int h)
1357 {
1359 }
1360
1362 ptrdiff_t line_size,
int h)
1363 {
1365 }
1366
1368 ptrdiff_t line_size,
int h)
1369 {
1371 }
1372
1374 ptrdiff_t line_size,
int h)
1375 {
1377 }
1378
1380 ptrdiff_t line_size,
int h)
1381 {
1383 }
1384
1386 ptrdiff_t line_size,
int h)
1387 {
1390 }
else if (
h == 8) {
1392 }
1393 }
1394
1396 ptrdiff_t line_size,
int h)
1397 {
1400 }
else if (
h == 8) {
1402 }
1403 }
1404
1406 const uint8_t *pixels,
1407 ptrdiff_t line_size,
int h)
1408 {
1411 }
else if (
h == 8) {
1413 }
1414 }
1415
1417 ptrdiff_t line_size,
int h)
1418 {
1421 }
else if (
h == 4) {
1423 }
1424 }
1425
1427 ptrdiff_t line_size,
int h)
1428 {
1431 }
else if (
h == 4) {
1433 }
1434 }
1435
1437 ptrdiff_t line_size,
int h)
1438 {
1441 }
else if (
h == 4) {
1443 }
1444 }
1445
1447 ptrdiff_t line_size,
int h)
1448 {
1450 }
1451
1453 ptrdiff_t line_size,
int h)
1454 {
1456 }
1457
1459 ptrdiff_t line_size,
int h)
1460 {
1462 }
1463
1465 ptrdiff_t line_size,
int h)
1466 {
1468 }
1469
1471 ptrdiff_t line_size,
int h)
1472 {
1474 }
1475
1477 ptrdiff_t line_size,
int h)
1478 {
1480 }
1481
1483 ptrdiff_t line_size,
int h)
1484 {
1486 }
1487
1489 ptrdiff_t line_size,
int h)
1490 {
1492 }
1493
1495 ptrdiff_t line_size,
int h)
1496 {
1498 }
1499
1501 ptrdiff_t line_size,
int h)
1502 {
1504 }
1505
1507 ptrdiff_t line_size,
int h)
1508 {
1510 }
1511
1513 ptrdiff_t line_size,
int h)
1514 {
1516 }