1 /*
2 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
23
27 {
28 uint32_t tp0, tp1, offset_val;
31 v8i16 src0_r, tmp0, wgt, denom, offset;
32
33 offset_val = (unsigned) offset_in << log2_denom;
34
35 wgt = __msa_fill_h(src_weight);
36 offset = __msa_fill_h(offset_val);
37 denom = __msa_fill_h(log2_denom);
38
41 src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42 tmp0 = wgt * src0_r;
43 tmp0 = __msa_adds_s_h(tmp0, offset);
44 tmp0 = __msa_maxi_s_h(tmp0, 0);
45 tmp0 = __msa_srlr_h(tmp0, denom);
46 tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47 src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
49 }
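/*
 * The avc_wgt_* routines in this block implement H.264 explicit weighted
 * prediction for a single reference. Scalar sketch of the per-pixel math
 * (reference only, not part of the MSA path):
 *
 *     int round = log2_denom ? (1 << (log2_denom - 1)) : 0;
 *     dst = av_clip_uint8(((src * src_weight + round) >> log2_denom) + offset_in);
 *
 * The vector code folds the offset in as offset_val = offset_in << log2_denom
 * before the single rounding shift (__msa_srlr_h), which gives the same
 * result for the clamped, non-negative intermediate values.
 */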
50
54 {
55 uint32_t tp0, tp1, tp2, tp3, offset_val;
57 v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
58
59 offset_val = (unsigned) offset_in << log2_denom;
60
61 wgt = __msa_fill_h(src_weight);
62 offset = __msa_fill_h(offset_val);
63 denom = __msa_fill_h(log2_denom);
64
68 MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
71 tmp0 = __msa_srlr_h(tmp0, denom);
72 tmp1 = __msa_srlr_h(tmp1, denom);
74 src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
76 }
77
81 {
82 uint32_t tp0, tp1, tp2, tp3, offset_val;
84 v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
86
87 offset_val = (unsigned) offset_in << log2_denom;
88
89 wgt = __msa_fill_h(src_weight);
90 offset = __msa_fill_h(offset_val);
91 denom = __msa_fill_h(log2_denom);
92
99 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
100 tmp3);
102 tmp1, tmp2, tmp3);
107 ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
108 }
109
113 {
114 uint32_t offset_val;
115 uint64_t tp0, tp1, tp2, tp3;
117 v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
119
120 offset_val = (unsigned) offset_in << log2_denom;
121
122 wgt = __msa_fill_h(src_weight);
123 offset = __msa_fill_h(offset_val);
124 denom = __msa_fill_h(log2_denom);
125
131 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
132 tmp3);
134 tmp1, tmp2, tmp3);
140 }
141
144 {
145 uint32_t offset_val;
146 uint64_t tp0, tp1, tp2, tp3;
147 v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
148 v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
149 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
151
152 offset_val = (unsigned) offset_in << log2_denom;
153
154 wgt = __msa_fill_h(src_weight);
155 offset = __msa_fill_h(offset_val);
156 denom = __msa_fill_h(log2_denom);
157
168 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
169 tmp3);
170 MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
171 tmp7);
173 tmp1, tmp2, tmp3);
175 tmp5, tmp6, tmp7);
176 MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
177 SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
178 SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
179 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1, src2, src3);
181 ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
182 }
183
187 {
188 uint32_t offset_val, cnt;
189 uint64_t tp0, tp1, tp2, tp3;
190 v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
191 v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
192 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
194
195 offset_val = (unsigned) offset_in << log2_denom;
196
197 wgt = __msa_fill_h(src_weight);
198 offset = __msa_fill_h(offset_val);
199 denom = __msa_fill_h(log2_denom);
200
201 for (cnt = 2; cnt--;) {
212 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
213 tmp2, tmp3);
214 MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
215 tmp6, tmp7);
217 tmp0, tmp1, tmp2, tmp3);
219 tmp4, tmp5, tmp6, tmp7);
220 MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
221 SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
222 SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
223 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1, src2, src3);
225 ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
227 }
228 }
229
233 {
234 uint32_t tp0, tp1;
235 v16i8 src_wgt, dst_wgt, wgt, vec0;
236 v16u8 src0 = { 0 }, dst0 = { 0 };
237 v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
238
239 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
240 offset_in += (128 * (src_weight + dst_weight));
241
242 src_wgt = __msa_fill_b(src_weight);
243 dst_wgt = __msa_fill_b(dst_weight);
244 offset = __msa_fill_h(offset_in);
245 denom = __msa_fill_h(log2_denom + 1);
246
247 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
248
254 vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
255 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
256 tmp0 >>= denom;
257 tmp0 = __msa_maxi_s_h(tmp0, 0);
258 tmp0 = __msa_min_s_h(max255, tmp0);
259 dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
261 }
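/*
 * The avc_biwgt_* routines blend two predictions with explicit weights.
 * Scalar sketch of the per-pixel math (reference only, mirroring the C
 * fallback for bidirectional weighted prediction):
 *
 *     int off = ((offset_in + 1) | 1) << log2_denom;
 *     dst = av_clip_uint8((src * src_weight + dst * dst_weight + off)
 *                         >> (log2_denom + 1));
 *
 * The extra 128 * (src_weight + dst_weight) added to the offset compensates
 * for the pixels being handled as signed, 128-biased bytes inside the
 * __msa_dpadd_s_h dot product.
 */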
262
266 {
267 uint32_t tp0, tp1, tp2, tp3;
268 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
270 v8i16 tmp0, tmp1, denom, offset;
271
272 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
273 offset_in += (128 * (src_weight + dst_weight));
274
275 src_wgt = __msa_fill_b(src_weight);
276 dst_wgt = __msa_fill_b(dst_weight);
277 offset = __msa_fill_h(offset_in);
278 denom = __msa_fill_h(log2_denom + 1);
279
280 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
281
288 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
289 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
290 tmp0 >>= denom;
291 tmp1 >>= denom;
293 dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
295 }
296
300 {
301 uint32_t tp0, tp1, tp2, tp3;
302 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
304 v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
305
306 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
307 offset_in += (128 * (src_weight + dst_weight));
308
309 src_wgt = __msa_fill_b(src_weight);
310 dst_wgt = __msa_fill_b(dst_weight);
311 offset = __msa_fill_h(offset_in);
312 denom = __msa_fill_h(log2_denom + 1);
313 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
314
327 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
328 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
329 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
330 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
331 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
334 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
335 }
336
340 {
341 uint64_t tp0, tp1, tp2, tp3;
342 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
344 v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
345
346 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
347 offset_in += (128 * (src_weight + dst_weight));
348
349 src_wgt = __msa_fill_b(src_weight);
350 dst_wgt = __msa_fill_b(dst_weight);
351 offset = __msa_fill_h(offset_in);
352 denom = __msa_fill_h(log2_denom + 1);
353
354 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
355
365 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
366 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
367 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
368 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
369 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
373 }
374
378 {
379 uint64_t tp0, tp1, tp2, tp3;
380 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
382 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
383
384 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
385 offset_in += (128 * (src_weight + dst_weight));
386
387 src_wgt = __msa_fill_b(src_weight);
388 dst_wgt = __msa_fill_b(dst_weight);
389 offset = __msa_fill_h(offset_in);
390 denom = __msa_fill_h(log2_denom + 1);
391 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
392
410 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
411 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
412 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
413 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
414 tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
415 tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
416 tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
417 tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
418 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
419 SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
423 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
424 }
425
429 {
430 uint8_t cnt;
431 uint64_t tp0, tp1, tp2, tp3;
432 v16i8 src_wgt, dst_wgt, wgt;
434 v16u8 dst0, dst1, dst2, dst3;
435 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
436 v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
438
439 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
440 offset_in += (128 * (src_weight + dst_weight));
441
442 src_wgt = __msa_fill_b(src_weight);
443 dst_wgt = __msa_fill_b(dst_weight);
444 offset = __msa_fill_h(offset_in);
445 denom = __msa_fill_h(log2_denom + 1);
446 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
447
448 for (cnt = 2; cnt--;) {
466 vec0, vec2, vec4, vec6);
468 vec1, vec3, vec5, vec7);
469
470 temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
471 temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
472 temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
473 temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
474 temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
475 temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
476 temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
477 temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
478
479 SRA_4V(temp0, temp1, temp2, temp3, denom);
480 SRA_4V(temp4, temp5, temp6, temp7, denom);
481 CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
482 PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
483 dst0, dst1, dst2, dst3);
484 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
486 }
487 }
488
489 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \
490 q3_or_p3_org_in, p1_or_q1_org_in, \
491 p2_or_q2_org_in, q1_or_p1_org_in, \
492 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \
493 { \
494 v8i16 threshold; \
495 v8i16 const3 = __msa_ldi_h(3); \
496 \
497 threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \
498 threshold += (p1_or_q1_org_in); \
499 \
500 (p0_or_q0_out) = threshold << 1; \
501 (p0_or_q0_out) += (p2_or_q2_org_in); \
502 (p0_or_q0_out) += (q1_or_p1_org_in); \
503 (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \
504 \
505 (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \
506 (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \
507 \
508 (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \
509 (p2_or_q2_out) += (p3_or_q3_org_in); \
510 (p2_or_q2_out) += (p3_or_q3_org_in); \
511 (p2_or_q2_out) += threshold; \
512 (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \
513 }
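/*
 * Scalar form of the strong (bS = 4) luma filter computed by the macro above,
 * written for the P side; the Q side is symmetric, hence the "_or_" argument
 * names:
 *
 *     p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
 *     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *     p2' = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3
 *
 * "threshold" carries the shared sub-expression p1 + p0 + q0.
 */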
514
515 /* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
516 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \
517 p1_or_q1_org_in, p0_or_q0_out) \
518 { \
519 (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \
520 (p0_or_q0_out) += (p1_or_q1_org_in); \
521 (p0_or_q0_out) += (p1_or_q1_org_in); \
522 (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \
523 }
524
525 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
526 p1_or_q1_org_in, p2_or_q2_org_in, \
527 negate_tc_in, tc_in, p1_or_q1_out) \
528 { \
529 v8i16 clip3, temp; \
530 \
531 clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \
532 (v8u16) q0_or_p0_org_in); \
533 temp = p1_or_q1_org_in << 1; \
534 clip3 = clip3 - temp; \
535 clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \
536 CLIP_SH(clip3, negate_tc_in, tc_in); \
537 p1_or_q1_out = p1_or_q1_org_in + clip3; \
538 }
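/*
 * Scalar form of the p1/q1 update for the normal (bS < 4) filter:
 *
 *     p1' = p1 + av_clip((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc0, tc0)
 *
 * __msa_aver_u_h provides the rounded average (p0 + q0 + 1) >> 1 and
 * __msa_ave_s_h the halving add with p2, so the macro matches the expression
 * above before the +/- tc clip.
 */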
539
540 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \
541 p1_or_q1_org_in, q1_or_p1_org_in, \
542 negate_threshold_in, threshold_in, \
543 p0_or_q0_out, q0_or_p0_out) \
544 { \
545 v8i16 q0_sub_p0, p1_sub_q1, delta; \
546 \
547 q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
548 p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
549 q0_sub_p0 <<= 2; \
550 p1_sub_q1 += 4; \
551 delta = q0_sub_p0 + p1_sub_q1; \
552 delta >>= 3; \
553 \
554 CLIP_SH(delta, negate_threshold_in, threshold_in); \
555 \
556 p0_or_q0_out = p0_or_q0_org_in + delta; \
557 q0_or_p0_out = q0_or_p0_org_in - delta; \
558 \
559 CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \
560 }
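/*
 * Scalar form of the p0/q0 update for the normal (bS < 4) filter:
 *
 *     delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
 *     p0'   = av_clip_uint8(p0 + delta)
 *     q0'   = av_clip_uint8(q0 - delta)
 *
 * where tc is tc0, possibly already incremented for the p1/q1 updates, as the
 * callers below do.
 */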
561
562 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
563 { \
564 uint32_t load0, load1, load2, load3; \
565 v16u8 src0 = { 0 }; \
566 v16u8 src1 = { 0 }; \
567 v16u8 src2 = { 0 }; \
568 v16u8 src3 = { 0 }; \
569 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
570 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
571 v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \
572 v8i16 res0_r, res1_r; \
573 v16i8 zeros = { 0 }; \
574 v16u8 res0, res1; \
575 \
576 LW4((src - 2), stride, load0, load1, load2, load3); \
577 src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
578 src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
579 src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \
580 src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \
581 \
582 TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \
583 \
584 p0_asub_q0 = __msa_asub_u_b(src2, src1); \
585 p1_asub_p0 = __msa_asub_u_b(src1, src0); \
586 q1_asub_q0 = __msa_asub_u_b(src2, src3); \
587 \
588 tc = __msa_fill_h(tc_val); \
589 \
590 is_less_than_alpha = (p0_asub_q0 < alpha); \
591 is_less_than_beta = (p1_asub_p0 < beta); \
592 is_less_than = is_less_than_alpha & is_less_than_beta; \
593 is_less_than_beta = (q1_asub_q0 < beta); \
594 is_less_than = is_less_than_beta & is_less_than; \
595 \
596 ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
597 HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
598 \
599 q0_sub_p0 <<= 2; \
600 delta = q0_sub_p0 + p1_sub_q1; \
601 delta = __msa_srari_h(delta, 3); \
602 \
603 CLIP_SH(delta, -tc, tc); \
604 \
605 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
606 \
607 res0_r += delta; \
608 res1_r -= delta; \
609 \
610 CLIP_SH2_0_255(res0_r, res1_r); \
611 PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
612 \
613 res0 = __msa_bmnz_v(src1, res0, is_less_than); \
614 res1 = __msa_bmnz_v(src2, res1, is_less_than); \
615 \
616 res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
617 }
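/*
 * AVC_LPF_H_CHROMA_422 filters a vertical chroma edge over four rows for
 * 4:2:2 content: it gathers a 4x4 block around the edge, transposes it so
 * p1/p0/q0/q1 become vectors, applies the same p0/q0 delta as above under the
 * alpha/beta masks, and leaves the filtered p0'/q0' pairs interleaved in
 * "res" for the caller to store two bytes per row.
 */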
618
619 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \
620 { \
621 v16i8 zero_m = { 0 }; \
622 \
623 out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \
624 out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \
625 SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \
626 }
627
628 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
629 { \
630 uint32_t load0, load1; \
631 v16u8 src0 = { 0 }; \
632 v16u8 src1 = { 0 }; \
633 v16u8 src2 = { 0 }; \
634 v16u8 src3 = { 0 }; \
635 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
636 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
637 v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \
638 v16i8 zeros = { 0 }; \
639 v16u8 res0, res1; \
640 \
641 load0 = LW(src - 2); \
642 load1 = LW(src - 2 + stride); \
643 \
644 src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
645 src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
646 \
647 TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \
648 \
649 p0_asub_q0 = __msa_asub_u_b(src2, src1); \
650 p1_asub_p0 = __msa_asub_u_b(src1, src0); \
651 q1_asub_q0 = __msa_asub_u_b(src2, src3); \
652 \
653 tc = __msa_fill_h(tc_val); \
654 \
655 is_less_than_alpha = (p0_asub_q0 < alpha); \
656 is_less_than_beta = (p1_asub_p0 < beta); \
657 is_less_than = is_less_than_alpha & is_less_than_beta; \
658 is_less_than_beta = (q1_asub_q0 < beta); \
659 is_less_than = is_less_than_beta & is_less_than; \
660 \
661 ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
662 HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
663 \
664 q0_sub_p0 <<= 2; \
665 delta = q0_sub_p0 + p1_sub_q1; \
666 delta = __msa_srari_h(delta, 3); \
667 CLIP_SH(delta, -tc, tc); \
668 \
669 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
670 \
671 res0_r += delta; \
672 res1_r -= delta; \
673 \
674 CLIP_SH2_0_255(res0_r, res1_r); \
675 PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
676 \
677 res0 = __msa_bmnz_v(src1, res0, is_less_than); \
678 res1 = __msa_bmnz_v(src2, res1, is_less_than); \
679 \
680 res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
681 }
682
684 uint8_t alpha_in,
685 uint8_t beta_in,
686 ptrdiff_t img_width)
687 {
688 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
689 v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
690 v16u8 p1_org, p0_org, q0_org, q1_org;
691
692 LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
693
694 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
695 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
696 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
697
698 is_less_than_alpha = (p0_asub_q0 < alpha_in);
699 is_less_than_beta = (p1_asub_p0 < beta_in);
700 is_less_than = is_less_than_beta & is_less_than_alpha;
701 is_less_than_beta = (q1_asub_q0 < beta_in);
702 is_less_than = is_less_than_beta & is_less_than;
703
704 if (!__msa_test_bz_v(is_less_than)) {
705 v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
706 v8i16 p0_r = { 0 };
707 v8i16 q0_r = { 0 };
708 v8i16 p0_l = { 0 };
709 v8i16 q0_l = { 0 };
711 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
712 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
713 v16u8 q2_org = LD_UB(data + (2 * img_width));
714 v16u8 p2_org = LD_UB(data - (3 * img_width));
715 v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
716
720
721 tmp_flag = (p0_asub_q0 < tmp_flag);
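/* Strong-filter gate from the spec: the stronger smoothing is only allowed
 * where abs(p0 - q0) < (alpha >> 2) + 2, which tmp_flag now holds per byte. */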
722
723 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
724 is_less_than_beta = (p2_asub_p0 < beta_in);
725 is_less_than_beta = is_less_than_beta & tmp_flag;
726 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
727 is_less_than_beta = is_less_than_beta & is_less_than;
728 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
729
730 q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
731 q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
732
733 /* combine and store */
734 if (!__msa_test_bz_v(is_less_than_beta)) {
735 v8i16 p3_org_l, p3_org_r;
736 v16u8 p3_org = LD_UB(data - (img_width << 2));
737 v16u8 p2, p1;
738 v8i16 p2_r = { 0 };
739 v8i16 p2_l = { 0 };
740 v8i16 p1_r = { 0 };
741 v8i16 p1_l = { 0 };
742
745 p2_r, q1_org_r, p0_r, p1_r, p2_r);
746
749 p2_l, q1_org_l, p0_l, p1_l, p2_l);
750
751 PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
752
753 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
754 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
755 p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
756
759 }
760
763
764 /* combine */
765 p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
766 p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
767
769
770 /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
771 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
772 is_less_than_beta = (q2_asub_q0 < beta_in);
773 is_less_than_beta = is_less_than_beta & tmp_flag;
774 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
775 is_less_than_beta = is_less_than_beta & is_less_than;
776 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
777
778 /* combine and store */
779 if (!__msa_test_bz_v(is_less_than_beta)) {
780 v8i16 q3_org_r, q3_org_l;
781 v16u8 q3_org = LD_UB(data + (3 * img_width));
783 v8i16 q2_r = { 0 };
784 v8i16 q2_l = { 0 };
785 v8i16 q1_r = { 0 };
786 v8i16 q1_l = { 0 };
787
790 q2_r, p1_org_r, q0_r, q1_r, q2_r);
791
794 q2_l, p1_org_l, q0_l, q1_l, q2_l);
795
797 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
798 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
799 q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
800
803 }
804
807
808 /* combine */
809 q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
810 q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
811
813 }
814 }
815
817 uint8_t alpha_in,
818 uint8_t beta_in,
819 ptrdiff_t img_width)
820 {
822 v16u8 alpha, beta, p0_asub_q0;
823 v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
824 v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
825 v16u8 p1_asub_p0, q1_asub_q0;
826
827
828 {
829 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
830 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
831
832 LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
834 row8, row9, row10, row11, row12, row13, row14, row15);
835
837 row4, row5, row6, row7,
838 row8, row9, row10, row11,
839 row12, row13, row14, row15,
840 p3_org, p2_org, p1_org, p0_org,
841 q0_org, q1_org, q2_org, q3_org);
842 }
843
844 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
845 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
846 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
847
848 alpha = (v16u8) __msa_fill_b(alpha_in);
849 beta = (v16u8) __msa_fill_b(beta_in);
850
851 is_less_than_alpha = (p0_asub_q0 < alpha);
852 is_less_than_beta = (p1_asub_p0 < beta);
853 is_less_than = is_less_than_beta & is_less_than_alpha;
854 is_less_than_beta = (q1_asub_q0 < beta);
855 is_less_than = is_less_than_beta & is_less_than;
856
857 if (!__msa_test_bz_v(is_less_than)) {
858 v8i16 p0_r = { 0 };
859 v8i16 q0_r = { 0 };
860 v8i16 p0_l = { 0 };
861 v8i16 q0_l = { 0 };
863 v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
864 v16u8 negate_is_less_than_beta;
865 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
866 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
867
872
873 tmp_flag = alpha >> 2;
874 tmp_flag = tmp_flag + 2;
875 tmp_flag = (p0_asub_q0 < tmp_flag);
876
877 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
878 is_less_than_beta = (p2_asub_p0 < beta);
879 is_less_than_beta = tmp_flag & is_less_than_beta;
880 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
881 is_less_than_beta = is_less_than_beta & is_less_than;
882 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
883
884 if (!__msa_test_bz_v(is_less_than_beta)) {
885 v16u8 p2, p1;
886 v8i16 p3_org_r, p3_org_l;
887 v8i16 p2_l = { 0 };
888 v8i16 p2_r = { 0 };
889 v8i16 p1_l = { 0 };
890 v8i16 p1_r = { 0 };
891
894 p2_r, q1_org_r, p0_r, p1_r, p2_r);
895
898 p2_l, q1_org_l, p0_l, p1_l, p2_l);
899
900 PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
901 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
902 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
903 p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
904 }
905
908
909 p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
910 p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
911
912 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
913 is_less_than_beta = (q2_asub_q0 < beta);
914
915 is_less_than_beta = is_less_than_beta & tmp_flag;
916 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
917
918 is_less_than_beta = is_less_than_beta & is_less_than;
919 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
920
921 if (!__msa_test_bz_v(is_less_than_beta)) {
923 v8i16 q3_org_r, q3_org_l;
924 v8i16 q1_l = { 0 };
925 v8i16 q1_r = { 0 };
926 v8i16 q2_l = { 0 };
927 v8i16 q2_r = { 0 };
928
931 q2_r, p1_org_r, q0_r, q1_r, q2_r);
932
935 q2_l, p1_org_l, q0_l, q1_l, q2_l);
936
938 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
939 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
940 q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
941 }
942
945
946 q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
947 q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
948
949 {
950 v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
951
955
958
960 ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
961 ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
962 src += 4 * img_width;
963 ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
964 ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
965 src += 4 * img_width;
966
967 ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
968 ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
969 src += 4 * img_width;
970 ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
971 ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
972 }
973 }
974 }
975
980 {
981 uint64_t load0, load1;
982 uint32_t out0, out2;
983 uint16_t out1, out3;
984 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
985 v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
986 v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
987 v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
988 v8i16 tmp0, tmp1, tmp2, tmp3;
990 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
991 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
992 v16u8 is_less_than_beta1, is_less_than_beta2;
996 v16i8 src3 = { 0 };
997 v16i8 src4 = { 0 };
998 v16i8 src5 = { 0 };
999 v16i8 src6 = { 0 };
1000 v16i8 src7 = { 0 };
1001 v16i8 zeros = { 0 };
1002
1003 load0 = LD(src - 4);
1005 src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1006 src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1007
1010 src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1011 src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1012
1015 src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1016 src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1017
1020 src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1021 src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1022
1025
1028
1029 ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1033
1034 p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1035 p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1036 q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1037
1038 alpha = (v16u8) __msa_fill_b(alpha_in);
1039 beta = (v16u8) __msa_fill_b(beta_in);
1040
1041 is_less_than_alpha = (p0_asub_q0 < alpha);
1042 is_less_than_beta = (p1_asub_p0 < beta);
1043 is_less_than = is_less_than_alpha & is_less_than_beta;
1044 is_less_than_beta = (q1_asub_q0 < beta);
1045 is_less_than = is_less_than & is_less_than_beta;
1046
1049
1050 is_less_than_alpha = (p0_asub_q0 < alpha);
1051
1052 p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1053 is_less_than_beta1 = (p2_asub_p0 < beta);
1054 q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1055 is_less_than_beta2 = (q2_asub_q0 < beta);
1056
1058 src0_r, src1_r, src2_r, src3_r);
1059 ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1060 src4_r, src5_r, src6_r, src7_r);
1061
1062 dst2_x_r = src1_r + src2_r + src3_r;
1063 dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1064 dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1065 dst1_r = src0_r + src1_r + src2_r + src3_r;
1066 dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1067
1068 dst0_r = (2 * src6_r) + (3 * src0_r);
1069 dst0_r += src1_r + src2_r + src3_r;
1070 dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1071 dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1072 dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1073
1074 PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1075 dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1076
1077 dst3_x_r = src2_r + src3_r + src4_r;
1078 dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1079 dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1080 dst4_r = src2_r + src3_r + src4_r + src5_r;
1081 dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1082
1083 dst5_r = (2 * src7_r) + (3 * src5_r);
1084 dst5_r += src4_r + src3_r + src2_r;
1085 dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1086 dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1087 dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1088
1089 PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1090 dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1091
1092 dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1093 dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1094 dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1095 dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1096
1097 PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1098
1099 dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1100 dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1101 dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1102 dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1103
1104 is_less_than = is_less_than_alpha & is_less_than;
1105 dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1106 is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1107 dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1108
1109 dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1110 dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1111 dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1112 is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1113 dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1114 dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1115 dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1116
1117 ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1118 dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1121
1122 ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1123 SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
1124 dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1125 dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1126 SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
1127
1128 out0 = __msa_copy_u_w((v4i32) dst0, 0);
1129 out1 = __msa_copy_u_h((v8i16) dst0, 2);
1130 out2 = __msa_copy_u_w((v4i32) dst1, 0);
1131 out3 = __msa_copy_u_h((v8i16) dst1, 2);
1132
1133 SW(out0, (src - 3));
1134 SH(out1, (src + 1));
1136 SW(out2, (src - 3));
1137 SH(out3, (src + 1));
1139
1140 out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1141 out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1142 out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1143 out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1144
1145 SW(out0, (src - 3));
1146 SH(out1, (src + 1));
1148 SW(out2, (src - 3));
1149 SH(out3, (src + 1));
1151
1152 out0 = __msa_copy_u_w((v4i32) dst4, 0);
1153 out1 = __msa_copy_u_h((v8i16) dst4, 2);
1154 out2 = __msa_copy_u_w((v4i32) dst5, 0);
1155 out3 = __msa_copy_u_h((v8i16) dst5, 2);
1156
1157 SW(out0, (src - 3));
1158 SH(out1, (src + 1));
1160 SW(out2, (src - 3));
1161 SH(out3, (src + 1));
1163
1164 out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1165 out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1166 out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1167 out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1168
1169 SW(out0, (src - 3));
1170 SH(out1, (src + 1));
1172 SW(out2, (src - 3));
1173 SH(out3, (src + 1));
1174 }
1175
1177 uint8_t alpha_in,
1178 uint8_t beta_in,
1179 ptrdiff_t img_width)
1180 {
1182 v16u8 is_less_than;
1183 v8i16 p0_or_q0, q0_or_p0;
1184 v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1186 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1187 v16u8 is_less_than_alpha, is_less_than_beta;
1188 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1189
1190 alpha = (v16u8) __msa_fill_b(alpha_in);
1191 beta = (v16u8) __msa_fill_b(beta_in);
1192
1193 LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1194 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1195
1196 p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1197 p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1198 q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1199
1200 is_less_than_alpha = (p0_asub_q0 < alpha);
1201 is_less_than_beta = (p1_asub_p0 < beta);
1202 is_less_than = is_less_than_beta & is_less_than_alpha;
1203 is_less_than_beta = (q1_asub_q0 < beta);
1204 is_less_than = is_less_than_beta & is_less_than;
1205
1206 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1207
1208 if (!__msa_test_bz_v(is_less_than)) {
1210 zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1214
1215 p0_or_q0_org =
1216 __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1217 q0_or_p0_org =
1218 __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1219
1220 ST_UB(q0_or_p0_org, data_cb_or_cr);
1221 ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1222 }
1223 }
1224
1226 uint8_t alpha_in,
1227 uint8_t beta_in,
1228 ptrdiff_t img_width)
1229 {
1230 v8i16 tmp1;
1231 v16u8 alpha, beta, is_less_than;
1232 v8i16 p0_or_q0, q0_or_p0;
1233 v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1235 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1236 v16u8 is_less_than_alpha, is_less_than_beta;
1237 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1238
1239 {
1240 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1241
1242 LD_UB8((data_cb_or_cr - 2), img_width,
1243 row0, row1, row2, row3, row4, row5, row6, row7);
1244
1246 p1_or_q1_org, p0_or_q0_org,
1247 q0_or_p0_org, q1_or_p1_org);
1248 }
1249
1250 alpha = (v16u8) __msa_fill_b(alpha_in);
1251 beta = (v16u8) __msa_fill_b(beta_in);
1252
1253 p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1254 p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1255 q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1256
1257 is_less_than_alpha = (p0_asub_q0 < alpha);
1258 is_less_than_beta = (p1_asub_p0 < beta);
1259 is_less_than = is_less_than_beta & is_less_than_alpha;
1260 is_less_than_beta = (q1_asub_q0 < beta);
1261 is_less_than = is_less_than_beta & is_less_than;
1262 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1263
1264 if (!__msa_test_bz_v(is_less_than)) {
1266 zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1267
1270
1271 /* convert 16 bit output into 8 bit output */
1273
1274 p0_or_q0_org =
1275 __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1276 q0_or_p0_org =
1277 __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1278 tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1279
1280 data_cb_or_cr -= 1;
1281 ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
1282 data_cb_or_cr += 4 * img_width;
1283 ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
1284 }
1285 }
1286
1288 uint8_t iAlpha, uint8_t iBeta,
1289 uint8_t* pTc)
1290 {
1291 v16u8 p0, p1, p2, q0, q1, q2;
1292 v16i8 iTc, negiTc, negTc, flags, f;
1293 v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r;
1294 v8i16 tc_l, tc_r, negTc_l, negTc_r;
1295 v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r;
1296 // Temporary variables
1297 v8i16 t0, t1, t2, t3;
1299 v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
1300 v16i8 const_1_b = __msa_ldi_b(1);
1301 v8i16 const_1_h = __msa_ldi_h(1);
1302 v8i16 const_4_h = __msa_ldi_h(4);
1303 v8i16 const_not_255_h = __msa_ldi_h(~255);
1305 v16i8 tc = { pTc[0 >> 2], pTc[1 >> 2], pTc[2 >> 2], pTc[3 >> 2],
1306 pTc[4 >> 2], pTc[5 >> 2], pTc[6 >> 2], pTc[7 >> 2],
1307 pTc[8 >> 2], pTc[9 >> 2], pTc[10 >> 2], pTc[11 >> 2],
1308 pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] };
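/* pTc[i >> 2]: each of the four tc values covers four rows of the 16-row
 * edge, so it is replicated into four consecutive byte lanes. */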
1310 iTc = tc;
1311
1312 // Load data from pPix
1313 LD_SH8(pPix - 3, iStride, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r);
1314 LD_SH8(pPix + 8 * iStride - 3, iStride, p0_l, p0_r, p1_l, p1_r,
1315 p2_l, p2_r, q0_l, q0_r);
1317 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r,
1319
1320 alpha = (v16u8)__msa_fill_b(iAlpha);
1321 beta = (v16u8)__msa_fill_b(iBeta);
1322
1323 bDetaP0Q0 = __msa_asub_u_b(p0, q0);
1324 bDetaP1P0 = __msa_asub_u_b(p1, p0);
1325 bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
1326 bDetaP2P0 = __msa_asub_u_b(p2, p0);
1327 bDetaQ2Q0 = __msa_asub_u_b(q2, q0);
1328 bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
1329 bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
1330 bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
1331 bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
1332 bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);
1333
1334 // Unsigned extend p0, p1, p2, q0, q1, q2 from 8 bits to 16 bits
1341 // Signed extend tc, negTc from 8 bits to 16 bits
1346
1347 f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;
1348 flags = f & (v16i8)bDetaP2P0;
1350 iTc += ((~flags) & const_1_b);
1351 flags = f & (v16i8)bDetaQ2Q0;
1353 iTc += ((~flags) & const_1_b);
1354 negiTc = zero - iTc;
1355 // Signed extend iTc, negiTc from 8 bits to 16 bits
1360
1361 // Calculate the left part
1362 // p1
1363 t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1;
1364 t0 = __msa_max_s_h(negTc_l, t0);
1365 t0 = __msa_min_s_h(tc_l, t0);
1366 t1 = p1_l + t0;
1367 // q1
1368 t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1;
1369 t0 = __msa_max_s_h(negTc_l, t0);
1370 t0 = __msa_min_s_h(tc_l, t0);
1371 t2 = q1_l + t0;
1372 // iDeta
1373 t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3;
1374 t0 = __msa_max_s_h(negiTc_l, t0);
1375 t0 = __msa_min_s_h(iTc_l, t0);
1376 p1_l = t1;
1377 q1_l = t2;
1378 // p0
1379 t1 = p0_l + t0;
1380 t2 = t1 & const_not_255_h;
1381 t3 = __msa_cle_s_h((v8i16) zero, t1);
1382 flags = (v16i8)__msa_ceq_h(t2, (v8i16) zero);
1383 p0_l = (t1 & (v8i16) flags) + (t3 & (v8i16)(~flags));
1384 // q0
1385 t1 = q0_l - t0;
1386 t2 = t1 & const_not_255_h;
1387 t3 = __msa_cle_s_h((v8i16) zero, t1);
1388 flags = (v16i8)__msa_ceq_h(t2, (v8i16) zero);
1389 q0_l = (t1 & (v8i16) flags) + (t3 & (v8i16)(~flags));
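/* The masking above clips p0/q0 to [0, 255] without a saturating add: values
 * already in range keep t1, negative values select 0, and values above 255
 * become 0xffff, whose low byte packs to 255 in the PCKEV_B step below. */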
1390
1391 // Calculate the right part
1392 // p1
1393 t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1;
1394 t0 = __msa_max_s_h(negTc_r, t0);
1395 t0 = __msa_min_s_h(tc_r, t0);
1396 t1 = p1_r + t0;
1397 // q1
1398 t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1;
1399 t0 = __msa_max_s_h(negTc_r, t0);
1400 t0 = __msa_min_s_h(tc_r, t0);
1401 t2 = q1_r + t0;
1402 // iDeta
1403 t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3;
1404 t0 = __msa_max_s_h(negiTc_r, t0);
1405 t0 = __msa_min_s_h(iTc_r, t0);
1406 p1_r = t1;
1407 q1_r = t2;
1408 // p0
1409 t1 = p0_r + t0;
1410 t2 = t1 & const_not_255_h;
1411 t3 = __msa_cle_s_h((v8i16) zero, t1);
1412 flags = (v16i8)__msa_ceq_h(t2, (v8i16) zero);
1413 p0_r = (t1 & (v8i16) flags) + (t3 & (v8i16)(~flags));
1414 // q0
1415 t1 = q0_r - t0;
1416 t2 = t1 & const_not_255_h;
1417 t3 = __msa_cle_s_h((v8i16) zero, t1);
1418 flags = (v16i8)__msa_ceq_h(t2, (v8i16) zero);
1419 q0_r = (t1 & (v8i16) flags) + (t3 & (v8i16)(~flags));
1420
1421 // Combined left and right
1422 PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r,
1423 t0, t1, t2, t3);
1426 p0 = (v16u8)(((v16i8)t1 & flags) + (p0 & (~flags)));
1428 // Using t1, t2 as temporary flags
1429 t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero))));
1430 p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1));
1431 t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero))));
1432 q1 = (v16u8)(t3 & t2) + (q1 & (v16u8)(~t2));
1433
1438 // Store data to pPix
1439 ST_W8(p1, p0, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 2, iStride);
1440 ST_W8(q0, q1, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 2, iStride);
1441 }
1442
1444 uint8_t bs0, uint8_t bs1,
1445 uint8_t bs2, uint8_t bs3,
1446 uint8_t tc0, uint8_t tc1,
1447 uint8_t tc2, uint8_t tc3,
1448 uint8_t alpha_in,
1449 uint8_t beta_in,
1450 ptrdiff_t image_width)
1451 {
1452 v16u8 tmp_vec;
1453 v16u8 bs = { 0 };
1454
1455 tmp_vec = (v16u8) __msa_fill_b(bs0);
1456 bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1457 tmp_vec = (v16u8) __msa_fill_b(bs1);
1458 bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1459 tmp_vec = (v16u8) __msa_fill_b(bs2);
1460 bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1461 tmp_vec = (v16u8) __msa_fill_b(bs3);
1462 bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1463
1464 if (!__msa_test_bz_v(bs)) {
1465 v16u8 alpha, beta, is_less_than, is_less_than_beta;
1466 v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1467 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1468 v16u8 is_less_than_alpha, is_bs_greater_than0;
1469 v8i16 p0_r, q0_r, p0_l, q0_l;
1470 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1471 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1473 v16i8 tc = { 0 };
1474
1475 tmp_vec = (v16u8) __msa_fill_b(tc0);
1476 tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1477 tmp_vec = (v16u8) __msa_fill_b(tc1);
1478 tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1479 tmp_vec = (v16u8) __msa_fill_b(tc2);
1480 tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1481 tmp_vec = (v16u8) __msa_fill_b(tc3);
1482 tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1483
1484 alpha = (v16u8) __msa_fill_b(alpha_in);
1485 beta = (v16u8) __msa_fill_b(beta_in);
1486
1487 LD_UB5(data - (3 * image_width), image_width,
1488 p2_org, p1_org, p0_org, q0_org, q1_org);
1489
1490 is_bs_greater_than0 = ((v16u8) zero < bs);
1491 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1492 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1493 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1494
1495 is_less_than_alpha = (p0_asub_q0 < alpha);
1496 is_less_than_beta = (p1_asub_p0 < beta);
1497 is_less_than = is_less_than_beta & is_less_than_alpha;
1498 is_less_than_beta = (q1_asub_q0 < beta);
1499 is_less_than = is_less_than_beta & is_less_than;
1500 is_less_than = is_less_than & is_bs_greater_than0;
1501
1502 if (!__msa_test_bz_v(is_less_than)) {
1503 v16i8 sign_negate_tc, negate_tc;
1504 v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1505 v16u8 p2_asub_p0, q2_asub_q0;
1506
1507 q2_org = LD_UB(data + (2 * image_width));
1508 negate_tc = zero - tc;
1509 sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1510
1511 ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1512
1517
1518 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1519 is_less_than_beta = (p2_asub_p0 < beta);
1520 is_less_than_beta = is_less_than_beta & is_less_than;
1521
1522 if (!__msa_test_bz_v(is_less_than_beta)) {
1523 v16u8 p1;
1524 v8i16 p1_r = { 0 };
1525 v8i16 p1_l = { 0 };
1526 v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1527 v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1528
1530 negate_tc_r, tc_r, p1_r);
1532 i16_negatetc_l, tc_l, p1_l);
1533
1534 p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1535 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1536 ST_UB(p1_org, data - (2 * image_width));
1537
1538 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1539 tc = tc + (v16i8) is_less_than_beta;
1540 }
1541
1542 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1543 is_less_than_beta = (q2_asub_q0 < beta);
1544 is_less_than_beta = is_less_than_beta & is_less_than;
1545
1546 q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1547 q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1548
1549 if (!__msa_test_bz_v(is_less_than_beta)) {
1551 v8i16 q1_r = { 0 };
1552 v8i16 q1_l = { 0 };
1553 v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1554 v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1555
1557 negate_tc_r, tc_r, q1_r);
1559 i16_negatetc_l, tc_l, q1_l);
1560
1561 q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1562 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1564
1565 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1566 tc = tc + (v16i8) is_less_than_beta;
1567 }
1568 {
1569 v16i8 negate_thresh, sign_negate_thresh;
1570 v8i16 threshold_r, threshold_l;
1571 v8i16 negate_thresh_l, negate_thresh_r;
1572
1573 negate_thresh = zero - tc;
1574 sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1575
1577 threshold_r, negate_thresh_r);
1579 negate_thresh_r, threshold_r, p0_r, q0_r);
1580
1581 threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1582 negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1583 negate_thresh);
1585 negate_thresh_l, threshold_l, p0_l, q0_l);
1586 }
1587
1589
1590 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1591 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1592
1595 }
1596 }
1597 }
1598
1601 int8_t *tc0)
1602 {
1604 uint32_t out0, out1, out2, out3;
1605 uint64_t load;
1606 uint32_t tc_val;
1608 v16i8 inp0 = { 0 };
1609 v16i8 inp1 = { 0 };
1610 v16i8 inp2 = { 0 };
1611 v16i8 inp3 = { 0 };
1612 v16i8 inp4 = { 0 };
1613 v16i8 inp5 = { 0 };
1614 v16i8 inp6 = { 0 };
1615 v16i8 inp7 = { 0 };
1617 v8i16 src4, src5, src6, src7;
1618 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1619 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1620 v16u8 is_less_than_beta1, is_less_than_beta2;
1621 v8i16 tc, tc_orig_r, tc_plus1;
1622 v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1623 v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1624 v8i16 src2_r, src3_r;
1625 v8i16 p2_r, p1_r, q2_r, q1_r;
1626 v16u8 p2, q2, p0, q0;
1627 v4i32 dst0, dst1;
1628 v16i8 zeros = { 0 };
1629
1630 alpha = (v16u8) __msa_fill_b(alpha_in);
1631 beta = (v16u8) __msa_fill_b(beta_in);
1632
1633 if (tc0[0] < 0) {
1635 } else {
1637 inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1639 inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1641 }
1642
1643 if (tc0[1] < 0) {
1645 } else {
1647 inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1649 inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1651 }
1652
1653 if (tc0[2] < 0) {
1655 } else {
1657 inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1659 inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1661 }
1662
1663 if (tc0[3] < 0) {
1665 } else {
1667 inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1669 inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1671 }
1672
1673 ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1675
1678
1679 src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1680 src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1681 src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1682 src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1683 src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1684 src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1685
1686 p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1687 p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1688 q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1689 p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1690 q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1691
1692 is_less_than_alpha = (p0_asub_q0 < alpha);
1693 is_less_than_beta = (p1_asub_p0 < beta);
1694 is_less_than = is_less_than_alpha & is_less_than_beta;
1695 is_less_than_beta = (q1_asub_q0 < beta);
1696 is_less_than = is_less_than_beta & is_less_than;
1697
1698 is_less_than_beta1 = (p2_asub_p0 < beta);
1699 is_less_than_beta2 = (q2_asub_q0 < beta);
1700
1701 p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1702 p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1703 p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1704
1706 p2_r += p0_add_q0;
1707 p2_r >>= 1;
1708 p2_r -= p1_r;
1709 ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1710 q2_r += p0_add_q0;
1711 q2_r >>= 1;
1712 q2_r -= q1_r;
1713
1715 tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1716 tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1717 is_tc_orig1 = tc_orig;
1718 is_tc_orig2 = tc_orig;
1719 tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1720 tc = tc_orig_r;
1721
1722 CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1723 CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1724
1725 p2_r += p1_r;
1726 q2_r += q1_r;
1727
1729
1730 is_tc_orig1 = (zeros < is_tc_orig1);
1731 is_tc_orig2 = is_tc_orig1;
1732 is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1733 is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1734 is_tc_orig1 = is_less_than & is_tc_orig1;
1735 is_tc_orig2 = is_less_than & is_tc_orig2;
1736
1737 p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1738 q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1739
1740 q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1741 q0_sub_p0 <<= 2;
1742 p1_sub_q1 = p1_r - q1_r;
1743 q0_sub_p0 += p1_sub_q1;
1744 q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1745
1746 tc_plus1 = tc + 1;
1747 is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1748 (v16i8) is_less_than_beta1);
1749 tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1750 tc_plus1 = tc + 1;
1751 is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1752 (v16i8) is_less_than_beta2);
1753 tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1754
1756
1758 src2_r += q0_sub_p0;
1759 src3_r -= q0_sub_p0;
1760
1762
1764
1765 p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1766 q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1767
1769
1771
1773
1774 out0 = __msa_copy_u_w(dst0, 0);
1775 out1 = __msa_copy_u_w(dst0, 1);
1776 out2 = __msa_copy_u_w(dst0, 2);
1777 out3 = __msa_copy_u_w(dst0, 3);
1778
1779 if (tc0[0] < 0) {
1781 } else {
1786 }
1787
1788 if (tc0[1] < 0) {
1790 } else {
1795 }
1796
1797 out0 = __msa_copy_u_w(dst1, 0);
1798 out1 = __msa_copy_u_w(dst1, 1);
1799 out2 = __msa_copy_u_w(dst1, 2);
1800 out3 = __msa_copy_u_w(dst1, 3);
1801
1802 if (tc0[2] < 0) {
1804 } else {
1809 }
1810
1811 if (tc0[3] >= 0) {
1815 }
1816 }
1817
1819 uint8_t bs0, uint8_t bs1,
1820 uint8_t bs2, uint8_t bs3,
1821 uint8_t tc0, uint8_t tc1,
1822 uint8_t tc2, uint8_t tc3,
1823 uint8_t alpha_in,
1824 uint8_t beta_in,
1825 ptrdiff_t img_width)
1826 {
1828 v8i16 tmp_vec;
1829 v8i16 bs = { 0 };
1830 v8i16 tc = { 0 };
1831 v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1832 v16u8 is_less_than;
1833 v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1834 v8i16 p0_r, q0_r;
1835 v16u8 p1_org, p0_org, q0_org, q1_org;
1836 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1837 v16i8 negate_tc, sign_negate_tc;
1838 v8i16 tc_r, negate_tc_r;
1840
1841 tmp_vec = (v8i16) __msa_fill_b(bs0);
1842 bs = __msa_insve_h(bs, 0, tmp_vec);
1843 tmp_vec = (v8i16) __msa_fill_b(bs1);
1844 bs = __msa_insve_h(bs, 1, tmp_vec);
1845 tmp_vec = (v8i16) __msa_fill_b(bs2);
1846 bs = __msa_insve_h(bs, 2, tmp_vec);
1847 tmp_vec = (v8i16) __msa_fill_b(bs3);
1848 bs = __msa_insve_h(bs, 3, tmp_vec);
1849
1850 if (!__msa_test_bz_v((v16u8) bs)) {
1851 tmp_vec = (v8i16) __msa_fill_b(tc0);
1852 tc = __msa_insve_h(tc, 0, tmp_vec);
1853 tmp_vec = (v8i16) __msa_fill_b(tc1);
1854 tc = __msa_insve_h(tc, 1, tmp_vec);
1855 tmp_vec = (v8i16) __msa_fill_b(tc2);
1856 tc = __msa_insve_h(tc, 2, tmp_vec);
1857 tmp_vec = (v8i16) __msa_fill_b(tc3);
1858 tc = __msa_insve_h(tc, 3, tmp_vec);
1859
1860 is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1861
1862 alpha = (v16u8) __msa_fill_b(alpha_in);
1863 beta = (v16u8) __msa_fill_b(beta_in);
1864
1866 p1_org, p0_org, q0_org, q1_org);
1867
1868 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1869 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1870 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1871
1872 is_less_than_alpha = (p0_asub_q0 < alpha);
1873 is_less_than_beta = (p1_asub_p0 < beta);
1874 is_less_than = is_less_than_beta & is_less_than_alpha;
1875 is_less_than_beta = (q1_asub_q0 < beta);
1876 is_less_than = is_less_than_beta & is_less_than;
1877 is_less_than = is_less_than & is_bs_greater_than0;
1878
1879 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1880
1881 if (!__msa_test_bz_v(is_less_than)) {
1882 negate_tc = zero - (v16i8) tc;
1883 sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1884
1885 ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
1886
1888 p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1889
1890 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1891 tc_r, p0_r, q0_r);
1892
1894
1895 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1896 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1897
1900 }
1901 }
1902 }
1903
1905 uint8_t bs0, uint8_t bs1,
1906 uint8_t bs2, uint8_t bs3,
1907 uint8_t tc0, uint8_t tc1,
1908 uint8_t tc2, uint8_t tc3,
1909 uint8_t alpha_in,
1910 uint8_t beta_in,
1911 ptrdiff_t img_width)
1912 {
1915 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1916 v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
1918 v8i16 p0_r = { 0 };
1919 v8i16 q0_r = { 0 };
1920 v16u8 p1_org, p0_org, q0_org, q1_org;
1921 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1922 v16u8 is_bs_greater_than0;
1923 v8i16 tc_r, negate_tc_r;
1924 v16i8 negate_tc, sign_negate_tc;
1926 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1927 v8i16 tmp1, tmp_vec, bs = { 0 };
1928 v8i16 tc = { 0 };
1929
1930 tmp_vec = (v8i16) __msa_fill_b(bs0);
1931 bs = __msa_insve_h(bs, 0, tmp_vec);
1932 tmp_vec = (v8i16) __msa_fill_b(bs1);
1933 bs = __msa_insve_h(bs, 1, tmp_vec);
1934 tmp_vec = (v8i16) __msa_fill_b(bs2);
1935 bs = __msa_insve_h(bs, 2, tmp_vec);
1936 tmp_vec = (v8i16) __msa_fill_b(bs3);
1937 bs = __msa_insve_h(bs, 3, tmp_vec);
1938
1939 if (!__msa_test_bz_v((v16u8) bs)) {
1940 tmp_vec = (v8i16) __msa_fill_b(tc0);
1941 tc = __msa_insve_h(tc, 0, tmp_vec);
1942 tmp_vec = (v8i16) __msa_fill_b(tc1);
1943 tc = __msa_insve_h(tc, 1, tmp_vec);
1944 tmp_vec = (v8i16) __msa_fill_b(tc2);
1945 tc = __msa_insve_h(tc, 2, tmp_vec);
1946 tmp_vec = (v8i16) __msa_fill_b(tc3);
1947 tc = __msa_insve_h(tc, 3, tmp_vec);
1948
1949 is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1950
1952 row0, row1, row2, row3, row4, row5, row6, row7);
1953
1955 row4, row5, row6, row7,
1956 p1_org, p0_org, q0_org, q1_org);
1957
1958 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1959 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1960 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1961
1962 alpha = (v16u8) __msa_fill_b(alpha_in);
1963 beta = (v16u8) __msa_fill_b(beta_in);
1964
1965 is_less_than_alpha = (p0_asub_q0 < alpha);
1966 is_less_than_beta = (p1_asub_p0 < beta);
1967 is_less_than = is_less_than_beta & is_less_than_alpha;
1968 is_less_than_beta = (q1_asub_q0 < beta);
1969 is_less_than = is_less_than_beta & is_less_than;
1970 is_less_than = is_bs_greater_than0 & is_less_than;
1971
1972 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1973
1974 if (!__msa_test_bz_v(is_less_than)) {
1976 p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1977
1978 negate_tc = zero - (v16i8) tc;
1979 sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1980
1981 ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
1982
1983 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1984 tc_r, p0_r, q0_r);
1985
1987
1988 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1989 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1990 tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
1992 ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
1993 src += 4 * img_width;
1994 ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
1995 }
1996 }
1997 }
1998
2001 int8_t *tc0)
2002 {
2004 v16u8 alpha, beta, res;
2005
2006 alpha = (v16u8) __msa_fill_b(alpha_in);
2007 beta = (v16u8) __msa_fill_b(beta_in);
2008
2009 for (col = 0; col < 4; col++) {
2010 tc_val = (tc0[col] - 1) + 1;
2011
2012 if (tc_val <= 0) {
2014 continue;
2015 }
2016
2020 }
2021 }
2022
2027 int8_t *tc0)
2028 {
2030 int16_t out0, out1;
2031 v16u8 alpha, beta, res;
2032
2033 alpha = (v16u8) __msa_fill_b(alpha_in);
2034 beta = (v16u8) __msa_fill_b(beta_in);
2035
2036 for (col = 0; col < 4; col++) {
2037 tc_val = (tc0[col] - 1) + 1;
2038
2039 if (tc_val <= 0) {
2041 continue;
2042 }
2043
2045
2046 out0 = __msa_copy_s_h((v8i16) res, 0);
2047 out1 = __msa_copy_s_h((v8i16) res, 1);
2048
2049 SH(out0, (src - 1));
2051 SH(out1, (src - 1));
2053 }
2054 }
2055
2057 int alpha, int beta, int8_t *tc)
2058 {
2059 // uint8_t bs0 = 1;
2060 // uint8_t bs1 = 1;
2061 // uint8_t bs2 = 1;
2062 // uint8_t bs3 = 1;
2063 //
2064 // if (tc[0] < 0)
2065 // bs0 = 0;
2066 // if (tc[1] < 0)
2067 // bs1 = 0;
2068 // if (tc[2] < 0)
2069 // bs2 = 0;
2070 // if (tc[3] < 0)
2071 // bs3 = 0;
2072 //
2073 // avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2074 // tc[0], tc[1], tc[2], tc[3],
2075 // alpha, beta, img_width);
2077 }
2078
2080 int alpha, int beta, int8_t *tc)
2081 {
2082
2083 uint8_t bs0 = 1;
2084 uint8_t bs1 = 1;
2085 uint8_t bs2 = 1;
2086 uint8_t bs3 = 1;
2087
2088 if (tc[0] < 0)
2089 bs0 = 0;
2090 if (tc[1] < 0)
2091 bs1 = 0;
2092 if (tc[2] < 0)
2093 bs2 = 0;
2094 if (tc[3] < 0)
2095 bs3 = 0;
2096
2098 tc[0], tc[1], tc[2], tc[3],
2099 alpha, beta, img_width);
2100 }
2101
2103 int alpha, int beta, int8_t *tc)
2104 {
2105 uint8_t bs0 = 1;
2106 uint8_t bs1 = 1;
2107 uint8_t bs2 = 1;
2108 uint8_t bs3 = 1;
2109
2110 if (tc[0] < 0)
2111 bs0 = 0;
2112 if (tc[1] < 0)
2113 bs1 = 0;
2114 if (tc[2] < 0)
2115 bs2 = 0;
2116 if (tc[3] < 0)
2117 bs3 = 0;
2118
2120 tc[0], tc[1], tc[2], tc[3],
2121 alpha, beta, img_width);
2122 }

                                  int alpha, int beta, int8_t *tc)
{
    uint8_t bs0 = 1;
    uint8_t bs1 = 1;
    uint8_t bs2 = 1;
    uint8_t bs3 = 1;

    if (tc[0] < 0)
        bs0 = 0;
    if (tc[1] < 0)
        bs1 = 0;
    if (tc[2] < 0)
        bs2 = 0;
    if (tc[3] < 0)
        bs3 = 0;

                                            tc[0], tc[1], tc[2], tc[3],
                                            alpha, beta, img_width);
}

                                  int alpha, int beta)
{
                                            (uint8_t) beta,
                                            img_width);
}

                                  int alpha, int beta)
{
                                            (uint8_t) beta,
                                            img_width);
}

                                  int alpha, int beta)
{
                                            (uint8_t) beta,
                                            img_width);
}

                                  int alpha, int beta)
{
                                            (uint8_t) beta,
                                            img_width);
}

                                   ptrdiff_t ystride,
                                   int8_t *tc0)
{
}

                                   ptrdiff_t ystride,
                                   int8_t *tc0)
{
}

                                   ptrdiff_t ystride,
                                   int8_t *tc0)
{
}

                                   ptrdiff_t ystride,
{
}

                              int height, int log2_denom,
                              int weight_src, int offset_in)
{
    uint32_t offset_val;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
    v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
    v8i16 wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __msa_fill_h(weight_src);
    offset = __msa_fill_h(offset_val);
    denom = __msa_fill_h(log2_denom);

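    /* unidirectional weighted prediction: widen each pixel to 16 bits,
       multiply by weight_src, add the pre-scaled offset, clamp at zero,
       rounding-shift right by log2_denom and saturate back to 8 bits */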
    ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
               src2_r, src3_r);
    ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
               src2_l, src3_l);
    ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
               src6_r, src7_r);
    ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
               src6_l, src7_l);
    MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
         tmp3);
    MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
         tmp7);
    MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
         tmp11);
    MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
         tmp14, tmp15);
                 tmp1, tmp2, tmp3);
                 tmp5, tmp6, tmp7);
                 tmp9, tmp10, tmp11);
                 tmp12, tmp13, tmp14, tmp15);
    MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
    MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
    SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
    SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
    SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
    SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
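    /* pack the right/left 16-bit halves of each row back into 16 unsigned
       bytes per destination vector */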
    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                dst2, dst3);
    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                dst5, dst6, dst7);

                   src1_r, src2_r, src3_r);
                   src1_l, src2_l, src3_l);
                   src5_r, src6_r, src7_r);
                   src5_l, src6_l, src7_l);
        MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
             tmp2, tmp3);
        MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
             tmp6, tmp7);
        MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
             tmp10, tmp11);
        MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
             tmp14, tmp15);
                     tmp0, tmp1, tmp2, tmp3);
                     tmp4, tmp5, tmp6, tmp7);
                     tmp8, tmp9, tmp10, tmp11);
                     tmp12, tmp13, tmp14, tmp15);
        MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
        MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
        SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
        SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
        SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
        SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                    dst2, dst3);
        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                    dst5, dst6, dst7);
    }
}

                               int height, int log2_denom,
                               int weight_src, int offset)
{
    } else if (8 == height) {
    } else {
    }
}

                               int height, int log2_denom,
                               int weight_src, int offset)
{
    } else if (4 == height) {
    } else {
    }
}

                                 int log2_denom, int weight_dst,
                                 int weight_src, int offset_in)
{
    v16i8 src_wgt, dst_wgt, wgt;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += (128 * (weight_src + weight_dst));

    src_wgt = __msa_fill_b(weight_src);
    dst_wgt = __msa_fill_b(weight_dst);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

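    /* bidirectional weighting: the byte-interleaved weights let one signed
       dot-product per 16-bit lane produce dst * weight_dst + src * weight_src
       on top of the rounding offset; the 128 * (weight_src + weight_dst) term
       pre-biases the offset so the signed byte arithmetic still yields the
       correct unsigned result */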
    ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
               vec6);
    ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
               vec7);
    ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
               vec12, vec14);
    ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
               vec13, vec15);
    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
    tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
    tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
    tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
    tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
    tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
    tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
    tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
    tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
    tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
    tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
    tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
    tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
    SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
    SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
    SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
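    /* clamp every 16-bit lane to the [0, 255] pixel range before the
       results are repacked to bytes */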
    CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                dst2, dst3);
    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                dst5, dst6, dst7);

        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
                   vec4, vec6);
        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
                   vec5, vec7);
        ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
                   vec12, vec14);
        ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
                   vec13, vec15);
        tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
        tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
        tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
        tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
        tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
        tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
        tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
        tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
        tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
        tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
        tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
        tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
        tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
        tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
        tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
        tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
        SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
        SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
        SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
        SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
        CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                    dst2, dst3);
        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                    dst5, dst6, dst7);
    }
}

                                 int log2_denom, int weight_dst,
                                 int weight_src, int offset)
{
    } else if (8 == height) {
    } else {
    }
}

                                 int log2_denom, int weight_dst,
                                 int weight_src, int offset)
{
    } else if (4 == height) {
    } else {
    }
}