1 /*
2 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavcodec/vp9dsp.h"
22 #include "libavutil/mips/generic_macros_msa.h"
23 #include "vp9dsp_mips.h"
24
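/* VP9 filter4: conditionally adjusts p1/p0/q0/q1 under the hev and filter
 * masks. This variant widens only the low (right) 8 bytes for the
 * 3 * (q0 - p0) term, so it filters 8 pixels. */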
25 #define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
26 p1_out, p0_out, q0_out, q1_out) \
27 { \
28 v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
29 v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
30 v8i16 q0_sub_p0_r, filt_r, cnst3h; \
31 \
32 p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \
33 p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \
34 q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \
35 q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \
36 \
37 filt = __msa_subs_s_b(p1_m, q1_m); \
38 filt = filt & (v16i8) hev_in; \
39 q0_sub_p0 = q0_m - p0_m; \
40 filt_sign = __msa_clti_s_b(filt, 0); \
41 \
42 cnst3h = __msa_ldi_h(3); \
43 q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
44 q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h); \
45 filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \
46 filt_r += q0_sub_p0_r; \
47 filt_r = __msa_sat_s_h(filt_r, 7); \
48 \
49 /* combine left and right part */ \
50 filt = __msa_pckev_b((v16i8) filt_r, (v16i8) filt_r); \
51 \
52 filt = filt & (v16i8) mask_in; \
53 cnst4b = __msa_ldi_b(4); \
54 filt1 = __msa_adds_s_b(filt, cnst4b); \
55 filt1 >>= 3; \
56 \
57 cnst3b = __msa_ldi_b(3); \
58 filt2 = __msa_adds_s_b(filt, cnst3b); \
59 filt2 >>= 3; \
60 \
61 q0_m = __msa_subs_s_b(q0_m, filt1); \
62 q0_out = __msa_xori_b((v16u8) q0_m, 0x80); \
63 p0_m = __msa_adds_s_b(p0_m, filt2); \
64 p0_out = __msa_xori_b((v16u8) p0_m, 0x80); \
65 \
66 filt = __msa_srari_b(filt1, 1); \
67 hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \
68 filt = filt & (v16i8) hev_in; \
69 \
70 q1_m = __msa_subs_s_b(q1_m, filt); \
71 q1_out = __msa_xori_b((v16u8) q1_m, 0x80); \
72 p1_m = __msa_adds_s_b(p1_m, filt); \
73 p1_out = __msa_xori_b((v16u8) p1_m, 0x80); \
74 }
75
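/* Same filter4 step, but both the low and high halves are widened, so all
 * 16 bytes of each vector are filtered. */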
76 #define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
77 p1_out, p0_out, q0_out, q1_out) \
78 { \
79 v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
80 v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
81 v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
82 \
83 p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \
84 p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \
85 q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \
86 q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \
87 \
88 filt = __msa_subs_s_b(p1_m, q1_m); \
89 \
90 filt = filt & (v16i8) hev_in; \
91 \
92 q0_sub_p0 = q0_m - p0_m; \
93 filt_sign = __msa_clti_s_b(filt, 0); \
94 \
95 cnst3h = __msa_ldi_h(3); \
96 q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
97 q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h); \
98 filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \
99 filt_r += q0_sub_p0_r; \
100 filt_r = __msa_sat_s_h(filt_r, 7); \
101 \
102 q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
103 q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h); \
104 filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \
105 filt_l += q0_sub_p0_l; \
106 filt_l = __msa_sat_s_h(filt_l, 7); \
107 \
108 filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \
109 filt = filt & (v16i8) mask_in; \
110 \
111 cnst4b = __msa_ldi_b(4); \
112 filt1 = __msa_adds_s_b(filt, cnst4b); \
113 filt1 >>= 3; \
114 \
115 cnst3b = __msa_ldi_b(3); \
116 filt2 = __msa_adds_s_b(filt, cnst3b); \
117 filt2 >>= 3; \
118 \
119 q0_m = __msa_subs_s_b(q0_m, filt1); \
120 q0_out = __msa_xori_b((v16u8) q0_m, 0x80); \
121 p0_m = __msa_adds_s_b(p0_m, filt2); \
122 p0_out = __msa_xori_b((v16u8) p0_m, 0x80); \
123 \
124 filt = __msa_srari_b(filt1, 1); \
125 hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \
126 filt = filt & (v16i8) hev_in; \
127 \
128 q1_m = __msa_subs_s_b(q1_m, filt); \
129 q1_out = __msa_xori_b((v16u8) q1_m, 0x80); \
130 p1_m = __msa_adds_s_b(p1_m, filt); \
131 p1_out = __msa_xori_b((v16u8) p1_m, 0x80); \
132 }
133
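/* flat mask for the 8-tap filter: set where |p2-p0|, |q2-q0|, |p3-p0| and
 * |q3-q0| are all <= 1; the result is also ANDed with the caller's mask. */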
134 #define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
135 { \
136 v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
137 v16u8 zero_in = { 0 }; \
138 \
139 tmp = __msa_ori_b(zero_in, 1); \
140 p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
141 q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
142 p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
143 q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
144 \
145 p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
146 flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
147 p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
148 flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
149 \
150 flat_out = (tmp < (v16u8) flat_out); \
151 flat_out = __msa_xori_b(flat_out, 0xff); \
152 flat_out = flat_out & (mask); \
153 }
154
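/* flat2 mask for the 16-tap filter: set where |p4..p7 - p0| and
 * |q4..q7 - q0| are all <= 1, ANDed with flat_in. */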
155 #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \
156 q5_in, q6_in, q7_in, flat_in, flat2_out) \
157 { \
158 v16u8 tmp, zero_in = { 0 }; \
159 v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
160 v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
161 \
162 tmp = __msa_ori_b(zero_in, 1); \
163 p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
164 q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
165 p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
166 q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
167 p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
168 q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
169 p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
170 q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
171 \
172 p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
173 flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
174 flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
175 p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
176 flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
177 p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
178 flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
179 \
180 flat2_out = (tmp < (v16u8) flat2_out); \
181 flat2_out = __msa_xori_b(flat2_out, 0xff); \
182 flat2_out = flat2_out & flat_in; \
183 }
184
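/* 8-tap smoothing filter: each of the p2..q2 outputs is
 * (sum of eight taps + 4) >> 3, computed in 16-bit lanes. */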
185 #define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \
186 q0_in, q1_in, q2_in, q3_in, \
187 p2_filt8_out, p1_filt8_out, p0_filt8_out, \
188 q0_filt8_out, q1_filt8_out, q2_filt8_out) \
189 { \
190 v8u16 tmp0, tmp1, tmp2; \
191 \
192 tmp2 = p2_in + p1_in + p0_in; \
193 tmp0 = p3_in << 1; \
194 \
195 tmp0 = tmp0 + tmp2 + q0_in; \
196 tmp1 = tmp0 + p3_in + p2_in; \
197 p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
198 \
199 tmp1 = tmp0 + p1_in + q1_in; \
200 p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
201 \
202 tmp1 = q2_in + q1_in + q0_in; \
203 tmp2 = tmp2 + tmp1; \
204 tmp0 = tmp2 + (p0_in); \
205 tmp0 = tmp0 + (p3_in); \
206 p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3); \
207 \
208 tmp0 = q2_in + q3_in; \
209 tmp0 = p0_in + tmp1 + tmp0; \
210 tmp1 = q3_in + q3_in; \
211 tmp1 = tmp1 + tmp0; \
212 q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
213 \
214 tmp0 = tmp2 + q3_in; \
215 tmp1 = tmp0 + q0_in; \
216 q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
217 \
218 tmp1 = tmp0 - p2_in; \
219 tmp0 = q1_in + q3_in; \
220 tmp1 = tmp0 + tmp1; \
221 q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
222 }
223
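/* Derives the loop-filter decision masks: hev (high edge variance) where
 * |p1-p0| or |q1-q0| exceeds thresh, and the filter on/off mask from limit
 * and b_limit; flat_out returns max(|p1-p0|, |q1-q0|) for reuse. */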
224 #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
225 q0_in, q1_in, q2_in, q3_in, \
226 limit_in, b_limit_in, thresh_in, \
227 hev_out, mask_out, flat_out) \
228 { \
229 v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
230 v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
231 \
232 /* absolute subtraction of pixel values */ \
233 p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
234 p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
235 p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
236 q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
237 q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
238 q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
239 p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
240 p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
241 \
242 /* calculation of hev */ \
243 flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
244 hev_out = thresh_in < (v16u8) flat_out; \
245 \
246 /* calculation of mask */ \
247 p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
248 p1_asub_q1_m >>= 1; \
249 p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
250 \
251 mask_out = b_limit_in < p0_asub_q0_m; \
252 mask_out = __msa_max_u_b(flat_out, mask_out); \
253 p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
254 mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
255 q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
256 mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
257 \
258 mask_out = limit_in < (v16u8) mask_out; \
259 mask_out = __msa_xori_b(mask_out, 0xff); \
260 }
261
262 void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
263                               int32_t b_limit_ptr,
264                               int32_t limit_ptr,
265                               int32_t thresh_ptr)
266 {
267 uint64_t p1_d, p0_d, q0_d, q1_d;
268 v16u8 mask, hev, flat, thresh, b_limit, limit;
269 v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
270
271 /* load vector elements */
272 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
273
274 thresh = (v16u8) __msa_fill_b(thresh_ptr);
275 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
276 limit = (v16u8) __msa_fill_b(limit_ptr);
277
278 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
279 hev, mask, flat);
280 VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
281 q1_out);
282
283 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
284 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
285 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
286 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
287 SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
288 }
289
290
291 void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
292                                 int32_t b_limit_ptr,
293                                 int32_t limit_ptr,
294                                 int32_t thresh_ptr)
295 {
296 v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
297 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
298
299 /* load vector elements */
300 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
301
302 thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
303 thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
304 thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
305
306 b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
307 b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
308 b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
309
310 limit0 = (v16u8) __msa_fill_b(limit_ptr);
311 limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
312 limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
313
314 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
315 hev, mask, flat);
316 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
317
318 ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
319 }
320
321 void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
322                               int32_t b_limit_ptr,
323                               int32_t limit_ptr,
324                               int32_t thresh_ptr)
325 {
326 uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
327 v16u8 mask, hev, flat, thresh, b_limit, limit;
328 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
329 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
330 v8i16 p2_filter8, p1_filter8, p0_filter8;
331 v8i16 q0_filter8, q1_filter8, q2_filter8;
332 v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
333 v16u8 zero = { 0 };
334
335 /* load vector elements */
336 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
337
338 thresh = (v16u8) __msa_fill_b(thresh_ptr);
339 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
340 limit = (v16u8) __msa_fill_b(limit_ptr);
341
342 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
343 hev, mask, flat);
344 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
345 VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
346 q1_out);
347
348 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
349
350 /* if flat is zero for all pixels, then no need to calculate other filter */
351 if (__msa_test_bz_v(flat)) {
352 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
353 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
354 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
355 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
356 SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
357 } else {
358 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
359 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
360 q2_r, q3_r);
361 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
362 p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
363
364 /* convert 16 bit output data into 8 bit */
365 PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
366 zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
367 q0_filter8);
368 PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
369
370 /* store pixel values */
371 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
372 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
373 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
374 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
375 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
376 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
377
378 p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
379 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
380 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
381 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
382 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
383 q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
384
385 src -= 3 * pitch;
386
387 SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
388 src += (4 * pitch);
389 SD(q1_d, src);
390 src += pitch;
391 SD(q2_d, src);
392 }
393 }
394
395 void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
396                                 int32_t b_limit_ptr,
397                                 int32_t limit_ptr,
398                                 int32_t thresh_ptr)
399 {
400 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
401 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
402 v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
403 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
404 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
405 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
406 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
407 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
408 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
409 v16u8 zero = { 0 };
410
411 /* load vector elements */
412 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
413
414 thresh = (v16u8) __msa_fill_b(thresh_ptr);
415 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
416 thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
417
418 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
419 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
420 b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
421
422 limit = (v16u8) __msa_fill_b(limit_ptr);
423 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
424 limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
425
426 /* mask and hev */
427 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
428 hev, mask, flat);
429 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
430 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
431 q1_out);
432
433 /* if flat is zero for all pixels, then no need to calculate other filter */
434 if (__msa_test_bz_v(flat)) {
435 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
436 } else {
437 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
438 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
439 q2_r, q3_r);
440 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
441 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
442
443 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
444 p0_l);
445 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
446 q3_l);
447 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
448 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
449
450 /* convert 16 bit output data into 8 bit */
451 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
452 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
453 p0_filt8_r, q0_filt8_r);
454 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
455 q1_filt8_r, q2_filt8_r);
456
457 /* store pixel values */
458 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
459 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
460 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
461 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
462 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
463 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
464
465 src -= 3 * pitch;
466
467 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
468 src += (4 * pitch);
469 ST_UB2(q1_out, q2_out, src, pitch);
470 src += (2 * pitch);
471 }
472 }
473
474 void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
475                                 int32_t b_limit_ptr,
476                                 int32_t limit_ptr,
477                                 int32_t thresh_ptr)
478 {
479 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
480 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
481 v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
482 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
483 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
484 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
485 v16u8 zero = { 0 };
486
487 /* load vector elements */
488 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
489
490 thresh = (v16u8) __msa_fill_b(thresh_ptr);
491 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
492 thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
493
494 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
495 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
496 b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
497
498 limit = (v16u8) __msa_fill_b(limit_ptr);
499 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
500 limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
501
502 /* mask and hev */
503 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
504 hev, mask, flat);
505 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
506 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
507 q1_out);
508
509 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
510
511 /* if flat is zero for all pixels, then no need to calculate other filter */
512 if (__msa_test_bz_v(flat)) {
513 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
514 } else {
515 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
516 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
517 q2_r, q3_r);
518 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
519 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
520
521 /* convert 16 bit output data into 8 bit */
522 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
523 p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
524 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
525 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
526 q1_filt8_r, q2_filt8_r);
527
528 /* store pixel values */
529 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
530 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
531 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
532 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
533 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
534 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
535
536 src -= 3 * pitch;
537
538 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
539 src += (4 * pitch);
540 ST_UB2(q1_out, q2_out, src, pitch);
541 src += (2 * pitch);
542 }
543 }
544
545 void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
546                                 int32_t b_limit_ptr,
547                                 int32_t limit_ptr,
548                                 int32_t thresh_ptr)
549 {
550 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
551 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
552 v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
553 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
554 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
555 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
556 v16u8 zero = { 0 };
557
558 /* load vector elements */
559 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
560
561 thresh = (v16u8) __msa_fill_b(thresh_ptr);
562 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
563 thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
564
565 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
566 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
567 b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
568
569 limit = (v16u8) __msa_fill_b(limit_ptr);
570 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
571 limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
572
573 /* mask and hev */
574 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
575 hev, mask, flat);
576 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
577 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
578 q1_out);
579
580 flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
581
582 /* if flat is zero for all pixels, then no need to calculate other filter */
583 if (__msa_test_bz_v(flat)) {
584 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
585 } else {
586 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
587 p0_l);
588 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
589 q3_l);
590 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
591 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
592
593 /* convert 16 bit output data into 8 bit */
594 PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
595 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
596 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
597 PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
598 q1_filt8_l, q2_filt8_l);
599
600 /* store pixel values */
601 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
602 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
603 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
604 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
605 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
606 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
607
608 src -= 3 * pitch;
609
610 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
611 src += (4 * pitch);
612 ST_UB2(q1_out, q2_out, src, pitch);
613 src += (2 * pitch);
614 }
615 }
616
617 static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
618                                         uint8_t *filter48,
619                                         int32_t b_limit_ptr,
620                                         int32_t limit_ptr,
621                                         int32_t thresh_ptr)
622 {
623 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
624 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
625 v16u8 flat, mask, hev, thresh, b_limit, limit;
626 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
627 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
628 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
629 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
630 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
631 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
632 v16u8 zero = { 0 };
633
634 /* load vector elements */
635 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
636
637 thresh = (v16u8) __msa_fill_b(thresh_ptr);
638 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
639 limit = (v16u8) __msa_fill_b(limit_ptr);
640
641 /* mask and hev */
642 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
643 hev, mask, flat);
644 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
645 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
646 q1_out);
647
648 /* if flat is zero for all pixels, then no need to calculate other filter */
649 if (__msa_test_bz_v(flat)) {
650 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
651
652 return 1;
653 } else {
654 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
655 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
656 q2_r, q3_r);
657 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
658 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
659
660 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
661 p0_l);
662 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
663 q3_l);
664 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
665 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
666
667 /* convert 16 bit output data into 8 bit */
668 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
669 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
670 p0_filt8_r, q0_filt8_r);
671 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
672 q2_filt8_r);
673
674 /* store pixel values */
675 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
676 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
677 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
678 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
679 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
680 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
681
682 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
683 filter48 += (4 * 16);
684 ST_UB2(q1_out, q2_out, filter48, 16);
685 filter48 += (2 * 16);
686 ST_UB(flat, filter48);
687
688 return 0;
689 }
690 }
691
692 static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
693 {
694 v16u8 flat, flat2, filter8;
695 v16u8 zero = { 0 };
696 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
697 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
698 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
699 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
700 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
701 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
702 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
703 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
704 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
705 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
706 v8i16 l_out, r_out;
707
708 flat = LD_UB(filter48 + 96);
709
710 LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
711 LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
712 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
713
714 /* if flat2 is zero for all pixels, then no need to calculate other filter */
715 if (__msa_test_bz_v(flat2)) {
716 LD_UB4(filter48, 16, p2, p1, p0, q0);
717 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
718
719 src -= 3 * pitch;
720 ST_UB4(p2, p1, p0, q0, src, pitch);
721 src += (4 * pitch);
722 ST_UB2(q1, q2, src, pitch);
723 } else {
724 src -= 7 * pitch;
725
726 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
727 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
728 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
729
730 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
731
732 tmp0_r = p7_r_in << 3;
733 tmp0_r -= p7_r_in;
734 tmp0_r += p6_r_in;
735 tmp0_r += q0_r_in;
736 tmp1_r = p6_r_in + p5_r_in;
737 tmp1_r += p4_r_in;
738 tmp1_r += p3_r_in;
739 tmp1_r += p2_r_in;
740 tmp1_r += p1_r_in;
741 tmp1_r += p0_r_in;
742 tmp1_r += tmp0_r;
743 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
744
745 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
746 p5_l_in, p4_l_in);
747 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
748 p1_l_in, p0_l_in);
749 q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
750
751 tmp0_l = p7_l_in << 3;
752 tmp0_l -= p7_l_in;
753 tmp0_l += p6_l_in;
754 tmp0_l += q0_l_in;
755 tmp1_l = p6_l_in + p5_l_in;
756 tmp1_l += p4_l_in;
757 tmp1_l += p3_l_in;
758 tmp1_l += p2_l_in;
759 tmp1_l += p1_l_in;
760 tmp1_l += p0_l_in;
761 tmp1_l += tmp0_l;
762 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
763
764 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
765 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
766 ST_UB(p6, src);
767 src += pitch;
768
769 /* p5 */
770 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
771 tmp0_r = p5_r_in - p6_r_in;
772 tmp0_r += q1_r_in;
773 tmp0_r -= p7_r_in;
774 tmp1_r += tmp0_r;
775 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
776
777 q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
778 tmp0_l = p5_l_in - p6_l_in;
779 tmp0_l += q1_l_in;
780 tmp0_l -= p7_l_in;
781 tmp1_l += tmp0_l;
782 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
783
784 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
785 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
786 ST_UB(p5, src);
787 src += pitch;
788
789 /* p4 */
790 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
791 tmp0_r = p4_r_in - p5_r_in;
792 tmp0_r += q2_r_in;
793 tmp0_r -= p7_r_in;
794 tmp1_r += tmp0_r;
795 r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
796
797 q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
798 tmp0_l = p4_l_in - p5_l_in;
799 tmp0_l += q2_l_in;
800 tmp0_l -= p7_l_in;
801 tmp1_l += tmp0_l;
802 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
803
804 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
805 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
806 ST_UB(p4, src);
807 src += pitch;
808
809 /* p3 */
810 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
811 tmp0_r = p3_r_in - p4_r_in;
812 tmp0_r += q3_r_in;
813 tmp0_r -= p7_r_in;
814 tmp1_r += tmp0_r;
815 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
816
817 q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
818 tmp0_l = p3_l_in - p4_l_in;
819 tmp0_l += q3_l_in;
820 tmp0_l -= p7_l_in;
821 tmp1_l += tmp0_l;
822 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
823
824 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
825 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
826 ST_UB(p3, src);
827 src += pitch;
828
829 /* p2 */
830 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
831 filter8 = LD_UB(filter48);
832 tmp0_r = p2_r_in - p3_r_in;
833 tmp0_r += q4_r_in;
834 tmp0_r -= p7_r_in;
835 tmp1_r += tmp0_r;
836 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
837
838 q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
839 tmp0_l = p2_l_in - p3_l_in;
840 tmp0_l += q4_l_in;
841 tmp0_l -= p7_l_in;
842 tmp1_l += tmp0_l;
843 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
844
845 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
846 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
847 ST_UB(filter8, src);
848 src += pitch;
849
850 /* p1 */
851 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
852 filter8 = LD_UB(filter48 + 16);
853 tmp0_r = p1_r_in - p2_r_in;
854 tmp0_r += q5_r_in;
855 tmp0_r -= p7_r_in;
856 tmp1_r += tmp0_r;
857 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
858
859 q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
860 tmp0_l = p1_l_in - p2_l_in;
861 tmp0_l += q5_l_in;
862 tmp0_l -= p7_l_in;
863 tmp1_l += tmp0_l;
864 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
865
866 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
867 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
868 ST_UB(filter8, src);
869 src += pitch;
870
871 /* p0 */
872 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
873 filter8 = LD_UB(filter48 + 32);
874 tmp0_r = p0_r_in - p1_r_in;
875 tmp0_r += q6_r_in;
876 tmp0_r -= p7_r_in;
877 tmp1_r += tmp0_r;
878 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
879
880 q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
881 tmp0_l = p0_l_in - p1_l_in;
882 tmp0_l += q6_l_in;
883 tmp0_l -= p7_l_in;
884 tmp1_l += tmp0_l;
885 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
886
887 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
888 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
889 ST_UB(filter8, src);
890 src += pitch;
891
892 /* q0 */
893 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
894 filter8 = LD_UB(filter48 + 48);
895 tmp0_r = q7_r_in - p0_r_in;
896 tmp0_r += q0_r_in;
897 tmp0_r -= p7_r_in;
898 tmp1_r += tmp0_r;
899 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
900
901 q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
902 tmp0_l = q7_l_in - p0_l_in;
903 tmp0_l += q0_l_in;
904 tmp0_l -= p7_l_in;
905 tmp1_l += tmp0_l;
906 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
907
908 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
909 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
910 ST_UB(filter8, src);
911 src += pitch;
912
913 /* q1 */
914 filter8 = LD_UB(filter48 + 64);
915 tmp0_r = q7_r_in - q0_r_in;
916 tmp0_r += q1_r_in;
917 tmp0_r -= p6_r_in;
918 tmp1_r += tmp0_r;
919 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
920
921 tmp0_l = q7_l_in - q0_l_in;
922 tmp0_l += q1_l_in;
923 tmp0_l -= p6_l_in;
924 tmp1_l += tmp0_l;
925 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
926
927 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
928 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
929 ST_UB(filter8, src);
930 src += pitch;
931
932 /* q2 */
933 filter8 = LD_UB(filter48 + 80);
934 tmp0_r = q7_r_in - q1_r_in;
935 tmp0_r += q2_r_in;
936 tmp0_r -= p5_r_in;
937 tmp1_r += tmp0_r;
938 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
939
940 tmp0_l = q7_l_in - q1_l_in;
941 tmp0_l += q2_l_in;
942 tmp0_l -= p5_l_in;
943 tmp1_l += tmp0_l;
944 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
945
946 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
947 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
948 ST_UB(filter8, src);
949 src += pitch;
950
951 /* q3 */
952 tmp0_r = q7_r_in - q2_r_in;
953 tmp0_r += q3_r_in;
954 tmp0_r -= p4_r_in;
955 tmp1_r += tmp0_r;
956 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
957
958 tmp0_l = q7_l_in - q2_l_in;
959 tmp0_l += q3_l_in;
960 tmp0_l -= p4_l_in;
961 tmp1_l += tmp0_l;
962 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
963
964 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
965 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
966 ST_UB(q3, src);
967 src += pitch;
968
969 /* q4 */
970 tmp0_r = q7_r_in - q3_r_in;
971 tmp0_r += q4_r_in;
972 tmp0_r -= p3_r_in;
973 tmp1_r += tmp0_r;
974 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
975
976 tmp0_l = q7_l_in - q3_l_in;
977 tmp0_l += q4_l_in;
978 tmp0_l -= p3_l_in;
979 tmp1_l += tmp0_l;
980 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
981
982 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
983 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
984 ST_UB(q4, src);
985 src += pitch;
986
987 /* q5 */
988 tmp0_r = q7_r_in - q4_r_in;
989 tmp0_r += q5_r_in;
990 tmp0_r -= p2_r_in;
991 tmp1_r += tmp0_r;
992 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
993
994 tmp0_l = q7_l_in - q4_l_in;
995 tmp0_l += q5_l_in;
996 tmp0_l -= p2_l_in;
997 tmp1_l += tmp0_l;
998 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
999
1000 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
1001 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
1002 ST_UB(q5, src);
1003 src += pitch;
1004
1005 /* q6 */
1006 tmp0_r = q7_r_in - q5_r_in;
1007 tmp0_r += q6_r_in;
1008 tmp0_r -= p1_r_in;
1009 tmp1_r += tmp0_r;
1010 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1011
1012 tmp0_l = q7_l_in - q5_l_in;
1013 tmp0_l += q6_l_in;
1014 tmp0_l -= p1_l_in;
1015 tmp1_l += tmp0_l;
1016 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
1017
1018 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
1019 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
1020 ST_UB(q6, src);
1021 }
1022 }
1023
1024 void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
1025                                 int32_t b_limit_ptr,
1026                                 int32_t limit_ptr,
1027                                 int32_t thresh_ptr)
1028 {
1029 uint8_t early_exit = 0;
1030 uint8_t filter48[16 * 8] __attribute__ ((aligned(16)));
1031
1032 early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
1033 b_limit_ptr, limit_ptr, thresh_ptr);
1034
1035 if (0 == early_exit) {
1036 vp9_hz_lpf_t16_16w(src, pitch, filter48);
1037 }
1038 }
1039
1040 void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
1041                                int32_t b_limit_ptr,
1042                                int32_t limit_ptr,
1043                                int32_t thresh_ptr)
1044 {
1045 uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
1046 uint64_t dword0, dword1;
1047 v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
1048 v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
1049 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1050 v16u8 p0_filter16, p1_filter16;
1051 v8i16 p2_filter8, p1_filter8, p0_filter8;
1052 v8i16 q0_filter8, q1_filter8, q2_filter8;
1053 v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
1054 v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
1055 v16u8 zero = { 0 };
1056 v8u16 tmp0, tmp1, tmp2;
1057
1058 /* load vector elements */
1059 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1060
1061 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1062 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1063 limit = (v16u8) __msa_fill_b(limit_ptr);
1064
1065 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1066 hev, mask, flat);
1067 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1068 VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1069 q1_out);
1070
1071 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1072
1073 /* if flat is zero for all pixels, then no need to calculate other filter */
1074 if (__msa_test_bz_v(flat)) {
1075 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1076 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1077 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1078 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1079 SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
1080 } else {
1081 /* convert 8 bit input data into 16 bit */
1082 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
1083 q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
1084 q1_r, q2_r, q3_r);
1085 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
1086 p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1087 q1_filter8, q2_filter8);
1088
1089 /* convert 16 bit output data into 8 bit */
1090 PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
1091 zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
1092 q0_filter8);
1093 PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
1094 q2_filter8);
1095
1096 /* store pixel values */
1097 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
1098 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
1099 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
1100 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
1101 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
1102 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
1103
1104 /* load 16 vector elements */
1105 LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
1106 LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
1107
1108 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1109
1110 /* if flat2 is zero for all pixels, then no need to calculate other filter */
1111 if (__msa_test_bz_v(flat2)) {
1112 p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
1113 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1114 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1115 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1116 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1117 q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
1118
1119 SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
1120 SD(q1_d, src + pitch);
1121 SD(q2_d, src + 2 * pitch);
1122 } else {
1123 /* LSB(right) 8 pixel operation */
1124 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
1125 zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
1126 q4_r, q5_r, q6_r, q7_r);
1127
1128 tmp0 = p7_r << 3;
1129 tmp0 -= p7_r;
1130 tmp0 += p6_r;
1131 tmp0 += q0_r;
1132
1133 src -= 7 * pitch;
1134
1135 /* calculation of p6 and p5 */
1136 tmp1 = p6_r + p5_r + p4_r + p3_r;
1137 tmp1 += (p2_r + p1_r + p0_r);
1138 tmp1 += tmp0;
1139 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1140 tmp0 = p5_r - p6_r + q1_r - p7_r;
1141 tmp1 += tmp0;
1142 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1143 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1144 p0_filter16, p1_filter16);
1145 p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
1146 p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
1147 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1148 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1149 SD(dword0, src);
1150 src += pitch;
1151 SD(dword1, src);
1152 src += pitch;
1153
1154 /* calculation of p4 and p3 */
1155 tmp0 = p4_r - p5_r + q2_r - p7_r;
1156 tmp2 = p3_r - p4_r + q3_r - p7_r;
1157 tmp1 += tmp0;
1158 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1159 tmp1 += tmp2;
1160 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1161 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1162 p0_filter16, p1_filter16);
1163 p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
1164 p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
1165 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1166 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1167 SD(dword0, src);
1168 src += pitch;
1169 SD(dword1, src);
1170 src += pitch;
1171
1172 /* calculation of p2 and p1 */
1173 tmp0 = p2_r - p3_r + q4_r - p7_r;
1174 tmp2 = p1_r - p2_r + q5_r - p7_r;
1175 tmp1 += tmp0;
1176 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1177 tmp1 += tmp2;
1178 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1179 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1180 p0_filter16, p1_filter16);
1181 p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
1182 p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
1183 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1184 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1185 SD(dword0, src);
1186 src += pitch;
1187 SD(dword1, src);
1188 src += pitch;
1189
1190 /* calculation of p0 and q0 */
1191 tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
1192 tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
1193 tmp1 += tmp0;
1194 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1195 tmp1 += tmp2;
1196 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1197 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1198 p0_filter16, p1_filter16);
1199 p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
1200 p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
1201 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1202 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1203 SD(dword0, src);
1204 src += pitch;
1205 SD(dword1, src);
1206 src += pitch;
1207
1208 /* calculation of q1 and q2 */
1209 tmp0 = q7_r - q0_r + q1_r - p6_r;
1210 tmp2 = q7_r - q1_r + q2_r - p5_r;
1211 tmp1 += tmp0;
1212 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1213 tmp1 += tmp2;
1214 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1215 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1216 p0_filter16, p1_filter16);
1217 p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
1218 p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
1219 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1220 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1221 SD(dword0, src);
1222 src += pitch;
1223 SD(dword1, src);
1224 src += pitch;
1225
1226 /* calculation of q3 and q4 */
1227 tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
1228 tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
1229 tmp1 += tmp0;
1230 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1231 tmp1 += tmp2;
1232 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1233 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1234 p0_filter16, p1_filter16);
1235 p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
1236 p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
1237 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1238 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1239 SD(dword0, src);
1240 src += pitch;
1241 SD(dword1, src);
1242 src += pitch;
1243
1244 /* calculation of q5 and q6 */
1245 tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
1246 tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
1247 tmp1 += tmp0;
1248 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1249 tmp1 += tmp2;
1250 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1251 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1252 p0_filter16, p1_filter16);
1253 p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
1254 p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
1255 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1256 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1257 SD(dword0, src);
1258 src += pitch;
1259 SD(dword1, src);
1260 }
1261 }
1262 }
1263
1264 void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
1265                               int32_t b_limit_ptr,
1266                               int32_t limit_ptr,
1267                               int32_t thresh_ptr)
1268 {
1269 v16u8 mask, hev, flat, thresh, b_limit, limit;
1270 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1271 v8i16 vec0, vec1, vec2, vec3;
1272
1273 LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1274
1275 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1276 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1277 limit = (v16u8) __msa_fill_b(limit_ptr);
1278
1278
1279 TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1280 p3, p2, p1, p0, q0, q1, q2, q3);
1281 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1282 hev, mask, flat);
1283 VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1284 ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
1285 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1286
1287 src -= 2;
1288 ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1289 src += 4 * pitch;
1290 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1291 }
1292
1293 void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
1294                                 int32_t b_limit_ptr,
1295                                 int32_t limit_ptr,
1296                                 int32_t thresh_ptr)
1297 {
1298 v16u8 mask, hev, flat;
1299 v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1300 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1301 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1302 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1303 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1304
1305 LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1306 LD_UB8(src - 4 + (8 * pitch), pitch,
1307 row8, row9, row10, row11, row12, row13, row14, row15);
1308
1309 TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1310 row8, row9, row10, row11, row12, row13, row14, row15,
1311 p3, p2, p1, p0, q0, q1, q2, q3);
1312
1313 thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1314 thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1315 thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1316
1317 b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1318 b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1319 b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1320
1321 limit0 = (v16u8) __msa_fill_b(limit_ptr);
1322 limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1323 limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1324
1325 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1326 hev, mask, flat);
1327 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1328 ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1329 ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
1330 ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1331 ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
1332
1333 src -= 2;
1334
1335 ST4x8_UB(tmp2, tmp3, src, pitch);
1336 src += (8 * pitch);
1337 ST4x8_UB(tmp4, tmp5, src, pitch);
1338 }
1339
1340 void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
1341                               int32_t b_limit_ptr,
1342                               int32_t limit_ptr,
1343                               int32_t thresh_ptr)
1344 {
1345 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1346 v16u8 p1_out, p0_out, q0_out, q1_out;
1347 v16u8 flat, mask, hev, thresh, b_limit, limit;
1348 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1349 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1350 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1351 v16u8 zero = { 0 };
1352 v8i16 vec0, vec1, vec2, vec3, vec4;
1353
1354 /* load vector elements */
1355 LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1356
1357 TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1358 p3, p2, p1, p0, q0, q1, q2, q3);
1359
1360 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1361 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1362 limit = (v16u8) __msa_fill_b(limit_ptr);
1363
1364 /* mask and hev */
1365 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1366 hev, mask, flat);
1367 /* flat4 */
1368 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1369 /* filter4 */
1370 VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1371 q1_out);
1372
1373 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1374
1375 /* if flat is zero for all pixels, then no need to calculate other filter */
1376 if (__msa_test_bz_v(flat)) {
1377 /* Store 4 pixels p1 - q1 */
1378 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1379 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1380
1381 src -= 2;
1382 ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1383 src += 4 * pitch;
1384 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1385 } else {
1386 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1387 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1388 q3_r);
1389 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1390 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1391 /* convert 16 bit output data into 8 bit */
1392 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1393 p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1394 p0_filt8_r, q0_filt8_r);
1395 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1396 q2_filt8_r);
1397
1398 /* store pixel values */
1399 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1400 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1401 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1402 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1403 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1404 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1405
1406 /* Store 6 pixels p2 - q2 */
1407 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1408 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1409 vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1410
1411 src -= 3;
1412 ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1413 ST2x4_UB(vec4, 0, src + 4, pitch);
1414 src += (4 * pitch);
1415 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1416 ST2x4_UB(vec4, 4, src + 4, pitch);
1417 }
1418 }
1419
1420 void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
1421                                 int32_t b_limit_ptr,
1422                                 int32_t limit_ptr,
1423                                 int32_t thresh_ptr)
1424 {
1425 uint8_t *temp_src;
1426 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1427 v16u8 p1_out, p0_out, q0_out, q1_out;
1428 v16u8 flat, mask, hev, thresh, b_limit, limit;
1429 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1430 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1431 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1432 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1433 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1434 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1435 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1436 v16u8 zero = { 0 };
1437 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1438
1439 temp_src = src - 4;
1440
1441 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1442 temp_src += (8 * pitch);
1443 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1444
1445 /* transpose 16x8 matrix into 8x16 */
1446 TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1447 q3, q2, q1, q0, row12, row13, row14, row15,
1448 p3, p2, p1, p0, q0, q1, q2, q3);
1449
1450 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1451 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1452 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1453
1454 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1455 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1456 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1457
1458 limit = (v16u8) __msa_fill_b(limit_ptr);
1459 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1460 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1461
1462 /* mask and hev */
1463 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1464 hev, mask, flat);
1465 /* flat4 */
1466 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1467 /* filter4 */
1468 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1469 q1_out);
1470
1471 /* if flat is zero for all pixels, then no need to calculate other filter */
1472 if (__msa_test_bz_v(flat)) {
1473 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1474 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1475 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1476 ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1477
1478 src -= 2;
1479 ST4x8_UB(vec2, vec3, src, pitch);
1480 src += 8 * pitch;
1481 ST4x8_UB(vec4, vec5, src, pitch);
1482 } else {
1483 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1484 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1485 q3_r);
1486 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1487 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1488
1489 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1490 p0_l);
1491 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1492 q3_l);
1493
1494 /* filter8 */
1495 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1496 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1497
1498 /* convert 16 bit output data into 8 bit */
1499 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1500 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1501 p0_filt8_r, q0_filt8_r);
1502 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1503 q2_filt8_r);
1504
1505 /* store pixel values */
1506 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1507 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1508 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1509 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1510 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1511 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1512
1513 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1514 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1515 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1516 ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1517 ILVRL_B2_SH(q2, q1, vec2, vec5);
1518
1519 src -= 3;
1520 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1521 ST2x4_UB(vec2, 0, src + 4, pitch);
1522 src += (4 * pitch);
1523 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1524 ST2x4_UB(vec2, 4, src + 4, pitch);
1525 src += (4 * pitch);
1526 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1527 ST2x4_UB(vec5, 0, src + 4, pitch);
1528 src += (4 * pitch);
1529 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1530 ST2x4_UB(vec5, 4, src + 4, pitch);
1531 }
1532 }
1533
1534 void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
1535                                 int32_t b_limit_ptr,
1536                                 int32_t limit_ptr,
1537                                 int32_t thresh_ptr)
1538 {
1539 uint8_t *temp_src;
1540 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1541 v16u8 p1_out, p0_out, q0_out, q1_out;
1542 v16u8 flat, mask, hev, thresh, b_limit, limit;
1543 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1544 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1545 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1546 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1547 v16u8 zero = { 0 };
1548 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1549
1550 temp_src = src - 4;
1551
1552 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1553 temp_src += (8 * pitch);
1554 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1555
1556 /* transpose 16x8 matrix into 8x16 */
1557 TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1558 q3, q2, q1, q0, row12, row13, row14, row15,
1559 p3, p2, p1, p0, q0, q1, q2, q3);
1560
1561 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1562 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1563 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1564
1565 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1566 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1567 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1568
1569 limit = (v16u8) __msa_fill_b(limit_ptr);
1570 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1571 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1572
1573 /* mask and hev */
1574 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1575 hev, mask, flat);
1576 /* flat4 */
1577 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1578 /* filter4 */
1579 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1580 q1_out);
1581
1582 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1583
1584 /* if flat is zero for all pixels, then no need to calculate other filter */
1585 if (__msa_test_bz_v(flat)) {
1586 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1587 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1588 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1589 ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1590
1591 src -= 2;
1592 ST4x8_UB(vec2, vec3, src, pitch);
1593 src += 8 * pitch;
1594 ST4x8_UB(vec4, vec5, src, pitch);
1595 } else {
1596 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1597 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1598 q3_r);
1599 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1600 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1601
1602 /* convert 16 bit output data into 8 bit */
1603 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1604 p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1605 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1606 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1607 q1_filt8_r, q2_filt8_r);
1608
1609 /* store pixel values */
1610 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1611 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1612 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1613 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1614 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1615 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1616
1617 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1618 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1619 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1620 ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1621 ILVRL_B2_SH(q2, q1, vec2, vec5);
1622
1623 src -= 3;
1624 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1625 ST2x4_UB(vec2, 0, src + 4, pitch);
1626 src += (4 * pitch);
1627 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1628 ST2x4_UB(vec2, 4, src + 4, pitch);
1629 src += (4 * pitch);
1630 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1631 ST2x4_UB(vec5, 0, src + 4, pitch);
1632 src += (4 * pitch);
1633 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1634 ST2x4_UB(vec5, 4, src + 4, pitch);
1635 }
1636 }
1637
1638 void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
1639                                 int32_t b_limit_ptr,
1640                                 int32_t limit_ptr,
1641                                 int32_t thresh_ptr)
1642 {
1643 uint8_t *temp_src;
1644 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1645 v16u8 p1_out, p0_out, q0_out, q1_out;
1646 v16u8 flat, mask, hev, thresh, b_limit, limit;
1647 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1648 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1649 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1650 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1651 v16u8 zero = { 0 };
1652 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1653
1654 temp_src = src - 4;
1655
1656 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1657 temp_src += (8 * pitch);
1658 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1659
1660 /* transpose 16x8 matrix into 8x16 */
1661 TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1662 q3, q2, q1, q0, row12, row13, row14, row15,
1663 p3, p2, p1, p0, q0, q1, q2, q3);
1664
1665 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1666 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1667 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1668
1669 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1670 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1671 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1672
1673 limit = (v16u8) __msa_fill_b(limit_ptr);
1674 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1675 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1676
1677 /* mask and hev */
1678 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1679 hev, mask, flat);
1680 /* flat4 */
1681 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1682 /* filter4 */
1683 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1684 q1_out);
1685
1686 flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
1687
1688 /* if flat is zero for all pixels, then no need to calculate other filter */
1689 if (__msa_test_bz_v(flat)) {
1690 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1691 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1692 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1693 ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1694
1695 src -= 2;
1696 ST4x8_UB(vec2, vec3, src, pitch);
1697 src += 8 * pitch;
1698 ST4x8_UB(vec4, vec5, src, pitch);
1699 } else {
1700 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1701 p0_l);
1702 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1703 q3_l);
1704
1705 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1706 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1707
1708 /* convert 16 bit output data into 8 bit */
1709 PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1710 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1711 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1712 PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1713 q1_filt8_l, q2_filt8_l);
1714
1715 /* store pixel values */
1716 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1717 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1718 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1719 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1720 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1721 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1722
1723 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1724 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1725 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1726 ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1727 ILVRL_B2_SH(q2, q1, vec2, vec5);
1728
1729 src -= 3;
1730 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1731 ST2x4_UB(vec2, 0, src + 4, pitch);
1732 src += (4 * pitch);
1733 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1734 ST2x4_UB(vec2, 4, src + 4, pitch);
1735 src += (4 * pitch);
1736 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1737 ST2x4_UB(vec5, 0, src + 4, pitch);
1738 src += (4 * pitch);
1739 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1740 ST2x4_UB(vec5, 4, src + 4, pitch);
1741 }
1742 }
1743
1746 {
1747 v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1748 v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1749 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1750
1751 LD_UB8(input, in_pitch,
1752 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1753 /* 8x8 transpose */
1754 TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
1755 p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1756 /* 8x8 transpose */
1757 ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1758 tmp0, tmp1, tmp2, tmp3);
1759 ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1760 ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1764
1765 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1766 output += (8 * out_pitch);
1767 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1768 }
1769
1772 {
1773 v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1774 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1775
1776 LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1777 LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1778 TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1779 q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1780 ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1781 }
1782
1785 {
1786 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1787 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1788 v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1789 v4i32 tmp2, tmp3;
1790 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1791
1792 LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1793 input += (8 * in_pitch);
1794 LD_UB8(input, in_pitch,
1795 row8, row9, row10, row11, row12, row13, row14, row15);
1796
1797 TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1798 row8, row9, row10, row11, row12, row13, row14, row15,
1799 p7, p6, p5, p4, p3, p2, p1, p0);
1800
1801 /* transpose 16x8 matrix into 8x16 */
1802 /* total 8 intermediate registers and 32 instructions */
1803 q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1804 q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1805 q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1806 q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1807 q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1808 q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1809 q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1810 q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1811
1813 tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1814 tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1815
1817 tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1818 tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
1819
1821 q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1822 q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1823
1824 tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1825 tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1826 q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1827 q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1828
1830 q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1831 q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1832
1833 tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1834 tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1835 q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1836 q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1837
1838 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1839 output += (8 * out_pitch);
1840 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1841 }
1842
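/*
 * The *_t4_and_t8_* helpers compute the basic 4-tap filter for every column
 * and the 8-tap "filter8" variant only when the flat mask is set somewhere;
 * __msa_bmnz_v then merges the two per byte.  Conceptually (illustrative
 * scalar form only, not code from this file):
 *
 *     out[i] = flat[i] ? filt8[i] : filt4[i];
 *
 * The six merged outputs (p2..q2) and the flat mask are parked in the
 * caller-provided filter48 scratch area for the wide 16-tap stage.
 */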
1843 static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
1844 uint8_t *src_org, int32_t pitch_org,
1845 int32_t b_limit_ptr,
1846 int32_t limit_ptr,
1847 int32_t thresh_ptr)
1848 {
1849 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1850 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1852 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1853 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1854 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1856 v8i16 vec0, vec1, vec2, vec3;
1857
1858 /* load vector elements */
1859 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1860
1861 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1862 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1863 limit = (v16u8) __msa_fill_b(limit_ptr);
1864
1865 /* mask and hev */
1866 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1867 hev, mask, flat);
1868 /* flat4 */
1869 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1870 /* filter4 */
1871 VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1872 q1_out);
1873
1874 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1875
1876 /* if flat is zero for all pixels, then no need to calculate other filter */
1877 if (__msa_test_bz_v(flat)) {
1878 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1880 ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
1881 return 1;
1882 } else {
1883 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1884 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1885 q3_r);
1886 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1887 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1888
1889 /* convert 16 bit output data into 8 bit */
1890 p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1891 p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1892 p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1893 q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1894 q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1895 q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1896
1897 /* store pixel values */
1898 p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1899 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1900 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1901 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1902 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1903 q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1904
1905 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1906 filter48 += (4 * 16);
1907 ST_UB2(q1_out, q2_out, filter48, 16);
1908 filter48 += (2 * 16);
1909 ST_UB(flat, filter48);
1910
1911 return 0;
1912 }
1913 }
1914
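/*
 * Layout of the filter48 scratch area written above and read back below
 * (one 16-byte vector per slot):
 *
 *     filter48 + 0*16 .. 5*16 : filter4/filter8 results for p2, p1, p0, q0, q1, q2
 *     filter48 + 6*16         : flat mask
 *
 * The wide-filter stage reloads these as the fallback values for columns
 * where flat2 is zero, and reuses the stored flat mask when deriving flat2.
 */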
1915 static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org,
1916 ptrdiff_t pitch, uint8_t *filter48)
1917 {
1919 v16u8 filter8, flat, flat2;
1920 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1921 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1922 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1923 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1924 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1925 v8u16 tmp0_r, tmp1_r;
1926 v8i16 r_out;
1927
1928 flat = LD_UB(filter48 + 6 * 16);
1929
1930 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1931 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1932
1933 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1934
1935 /* if flat2 is zero for all pixels, then no need to calculate other filter */
1936 if (__msa_test_bz_v(flat2)) {
1937 v8i16 vec0, vec1, vec2, vec3, vec4;
1938
1939 LD_UB4(filter48, 16, p2, p1, p0, q0);
1940 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1941
1944 vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1945
1946 src_org -= 3;
1947 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
1948 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
1949 src_org += (4 * pitch);
1950 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
1951 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
1952
1953 return 1;
1954 } else {
1955 src -= 7 * 16;
1956
1957 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1958 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1959 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1960 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
1961
1962 tmp0_r = p7_r_in << 3;
1963 tmp0_r -= p7_r_in;
1964 tmp0_r += p6_r_in;
1965 tmp0_r += q0_r_in;
1966 tmp1_r = p6_r_in + p5_r_in;
1967 tmp1_r += p4_r_in;
1968 tmp1_r += p3_r_in;
1969 tmp1_r += p2_r_in;
1970 tmp1_r += p1_r_in;
1971 tmp1_r += p0_r_in;
1972 tmp1_r += tmp0_r;
1973
1974 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1975 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1976 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1977 ST8x1_UB(p6, src);
1978 src += 16;
1979
1980 /* p5 */
1981 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
1982 tmp0_r = p5_r_in - p6_r_in;
1983 tmp0_r += q1_r_in;
1984 tmp0_r -= p7_r_in;
1985 tmp1_r += tmp0_r;
1986 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1987 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1988 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1989 ST8x1_UB(p5, src);
1990 src += 16;
1991
1992 /* p4 */
1993 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1994 tmp0_r = p4_r_in - p5_r_in;
1995 tmp0_r += q2_r_in;
1996 tmp0_r -= p7_r_in;
1997 tmp1_r += tmp0_r;
1998 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1999 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2000 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2001 ST8x1_UB(p4, src);
2002 src += 16;
2003
2004 /* p3 */
2005 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2006 tmp0_r = p3_r_in - p4_r_in;
2007 tmp0_r += q3_r_in;
2008 tmp0_r -= p7_r_in;
2009 tmp1_r += tmp0_r;
2010 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2011 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2012 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2013 ST8x1_UB(p3, src);
2014 src += 16;
2015
2016 /* p2 */
2017 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2018 filter8 = LD_UB(filter48);
2019 tmp0_r = p2_r_in - p3_r_in;
2020 tmp0_r += q4_r_in;
2021 tmp0_r -= p7_r_in;
2022 tmp1_r += tmp0_r;
2023 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2024 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2025 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2026 ST8x1_UB(filter8, src);
2027 src += 16;
2028
2029 /* p1 */
2030 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2031 filter8 = LD_UB(filter48 + 16);
2032 tmp0_r = p1_r_in - p2_r_in;
2033 tmp0_r += q5_r_in;
2034 tmp0_r -= p7_r_in;
2035 tmp1_r += tmp0_r;
2036 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2037 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2038 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2039 ST8x1_UB(filter8, src);
2040 src += 16;
2041
2042 /* p0 */
2043 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2044 filter8 = LD_UB(filter48 + 32);
2045 tmp0_r = p0_r_in - p1_r_in;
2046 tmp0_r += q6_r_in;
2047 tmp0_r -= p7_r_in;
2048 tmp1_r += tmp0_r;
2049 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2050 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2051 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2052 ST8x1_UB(filter8, src);
2053 src += 16;
2054
2055 /* q0 */
2056 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2057 filter8 = LD_UB(filter48 + 48);
2058 tmp0_r = q7_r_in - p0_r_in;
2059 tmp0_r += q0_r_in;
2060 tmp0_r -= p7_r_in;
2061 tmp1_r += tmp0_r;
2062 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2063 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2064 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2065 ST8x1_UB(filter8, src);
2066 src += 16;
2067
2068 /* q1 */
2069 filter8 = LD_UB(filter48 + 64);
2070 tmp0_r = q7_r_in - q0_r_in;
2071 tmp0_r += q1_r_in;
2072 tmp0_r -= p6_r_in;
2073 tmp1_r += tmp0_r;
2074 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2075 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2076 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2077 ST8x1_UB(filter8, src);
2078 src += 16;
2079
2080 /* q2 */
2081 filter8 = LD_UB(filter48 + 80);
2082 tmp0_r = q7_r_in - q1_r_in;
2083 tmp0_r += q2_r_in;
2084 tmp0_r -= p5_r_in;
2085 tmp1_r += tmp0_r;
2086 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2087 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2088 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2089 ST8x1_UB(filter8, src);
2090 src += 16;
2091
2092 /* q3 */
2093 tmp0_r = q7_r_in - q2_r_in;
2094 tmp0_r += q3_r_in;
2095 tmp0_r -= p4_r_in;
2096 tmp1_r += tmp0_r;
2097 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2098 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2099 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2100 ST8x1_UB(q3, src);
2101 src += 16;
2102
2103 /* q4 */
2104 tmp0_r = q7_r_in - q3_r_in;
2105 tmp0_r += q4_r_in;
2106 tmp0_r -= p3_r_in;
2107 tmp1_r += tmp0_r;
2108 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2109 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2110 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2111 ST8x1_UB(q4, src);
2112 src += 16;
2113
2114 /* q5 */
2115 tmp0_r = q7_r_in - q4_r_in;
2116 tmp0_r += q5_r_in;
2117 tmp0_r -= p2_r_in;
2118 tmp1_r += tmp0_r;
2119 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2120 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2121 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2122 ST8x1_UB(q5, src);
2123 src += 16;
2124
2125 /* q6 */
2126 tmp0_r = q7_r_in - q5_r_in;
2127 tmp0_r += q6_r_in;
2128 tmp0_r -= p1_r_in;
2129 tmp1_r += tmp0_r;
2130 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2131 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2132 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2133 ST8x1_UB(q6, src);
2134
2135 return 0;
2136 }
2137 }
2138
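/*
 * Scalar model of the wide filter computed in vp9_vt_lpf_t16_8w() above and
 * vp9_vt_lpf_t16_16w() below: each output is the rounded mean of 16 values,
 * a 15-tap window clamped at p7/q7 with the centre sample counted twice.
 * The MSA code keeps the running sum in tmp1_r/tmp1_l and updates it by one
 * add and one subtract per output row.  Reference sketch only; this helper
 * is unused and its name is not taken from the rest of the file.
 */
static void vp9_wide_filter_ref(const uint8_t pix[16], uint8_t out[14])
{
    int i, k;

    /* pix[0..15] = p7..p0, q0..q7; out[0..13] = p6..p0, q0..q6 */
    for (i = 1; i <= 14; i++) {
        int sum = pix[i] + 8;               /* centre tap twice + rounding */

        for (k = i - 7; k <= i + 7; k++) {
            int idx = k < 0 ? 0 : (k > 15 ? 15 : k);

            sum += pix[idx];
        }
        out[i - 1] = (uint8_t) (sum >> 4);
    }
}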
2139 void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
2140 int32_t b_limit_ptr,
2141 int32_t limit_ptr,
2142 int32_t thresh_ptr)
2143 {
2146 uint8_t *filter48 = &transposed_input[16 * 16];
2147
2149
2151 &filter48[0], src, pitch,
2152 b_limit_ptr, limit_ptr, thresh_ptr);
2153
2154 if (0 == early_exit) {
2156 &filter48[0]);
2157
2158 if (0 == early_exit) {
2160 }
2161 }
2162 }
2163
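/*
 * 16-column variants: same structure as the 8-column helpers above, but the
 * intermediate 16-bit arithmetic is carried out twice, on the right (_r,
 * ILVR) and left (_l, ILVL) halves of each vector, and the halves are
 * re-packed with PCKEV before the per-byte bmnz selection.
 */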
2164 static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
2165 uint8_t *src_org, ptrdiff_t pitch,
2166 int32_t b_limit_ptr,
2167 int32_t limit_ptr,
2168 int32_t thresh_ptr)
2169 {
2170 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
2171 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2173 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2174 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2175 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2176 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2177 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2178 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2180 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2181
2182 /* load vector elements */
2183 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2184
2185 thresh = (v16u8) __msa_fill_b(thresh_ptr);
2186 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2187 limit = (v16u8) __msa_fill_b(limit_ptr);
2188
2189 /* mask and hev */
2190 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2191 hev, mask, flat);
2192 /* flat4 */
2193 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2194 /* filter4 */
2195 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2196 q1_out);
2197
2198 /* if flat is zero for all pixels, then no need to calculate other filter */
2199 if (__msa_test_bz_v(flat)) {
2200 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2202 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2204
2205 src_org -= 2;
2206 ST4x8_UB(vec2, vec3, src_org, pitch);
2207 src_org += 8 * pitch;
2208 ST4x8_UB(vec4, vec5, src_org, pitch);
2209
2210 return 1;
2211 } else {
2212 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2213 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2214 q3_r);
2215 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2216 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2217 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2218 p0_l);
2219 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2220 q3_l);
2221 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2222 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2223
2224 /* convert 16 bit output data into 8 bit */
2225 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2226 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2227 p0_filt8_r, q0_filt8_r);
2228 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2229 q2_filt8_r);
2230
2231 /* store pixel values */
2232 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2233 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2234 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2235 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2236 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2237 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2238
2239 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2240 filter48 += (4 * 16);
2241 ST_UB2(q1_out, q2_out, filter48, 16);
2242 filter48 += (2 * 16);
2243 ST_UB(flat, filter48);
2244
2245 return 0;
2246 }
2247 }
2248
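/*
 * Wide-filter stage for all 16 columns: the same running-sum update as in
 * vp9_vt_lpf_t16_8w(), kept in tmp1_r/tmp1_l for the two halves, with full
 * 16-byte stores back into the transposed scratch buffer.
 */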
2249 static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org,
2250 ptrdiff_t pitch, uint8_t *filter48)
2251 {
2252 v16u8 flat, flat2, filter8;
2254 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2255 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2256 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2257 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2258 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2259 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2260 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2261 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2262 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2263 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2264 v8i16 l_out, r_out;
2265
2266 flat = LD_UB(filter48 + 6 * 16);
2267
2268 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2269 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2270
2271 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2272
2273 /* if flat2 is zero for all pixels, then no need to calculate other filter */
2274 if (__msa_test_bz_v(flat2)) {
2275 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2276
2277 LD_UB4(filter48, 16, p2, p1, p0, q0);
2278 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2279
2285
2286 src_org -= 3;
2287 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
2288 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
2289 src_org += (4 * pitch);
2290 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
2291 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
2292 src_org += (4 * pitch);
2293 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
2294 ST2x4_UB(vec5, 0, (src_org + 4), pitch);
2295 src_org += (4 * pitch);
2296 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
2297 ST2x4_UB(vec5, 4, (src_org + 4), pitch);
2298
2299 return 1;
2300 } else {
2301 src -= 7 * 16;
2302
2303 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2304 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2305 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2306 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
2307
2308 tmp0_r = p7_r_in << 3;
2309 tmp0_r -= p7_r_in;
2310 tmp0_r += p6_r_in;
2311 tmp0_r += q0_r_in;
2312 tmp1_r = p6_r_in + p5_r_in;
2313 tmp1_r += p4_r_in;
2314 tmp1_r += p3_r_in;
2315 tmp1_r += p2_r_in;
2316 tmp1_r += p1_r_in;
2317 tmp1_r += p0_r_in;
2318 tmp1_r += tmp0_r;
2319 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2320
2321 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2322 p5_l_in, p4_l_in);
2323 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2324 p1_l_in, p0_l_in);
2325 q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
2326
2327 tmp0_l = p7_l_in << 3;
2328 tmp0_l -= p7_l_in;
2329 tmp0_l += p6_l_in;
2330 tmp0_l += q0_l_in;
2331 tmp1_l = p6_l_in + p5_l_in;
2332 tmp1_l += p4_l_in;
2333 tmp1_l += p3_l_in;
2334 tmp1_l += p2_l_in;
2335 tmp1_l += p1_l_in;
2336 tmp1_l += p0_l_in;
2337 tmp1_l += tmp0_l;
2338 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2339
2340 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2341 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2342 ST_UB(p6, src);
2343 src += 16;
2344
2345 /* p5 */
2346 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
2347 tmp0_r = p5_r_in - p6_r_in;
2348 tmp0_r += q1_r_in;
2349 tmp0_r -= p7_r_in;
2350 tmp1_r += tmp0_r;
2351 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2352 q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
2353 tmp0_l = p5_l_in - p6_l_in;
2354 tmp0_l += q1_l_in;
2355 tmp0_l -= p7_l_in;
2356 tmp1_l += tmp0_l;
2357 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2358 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2359 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2360 ST_UB(p5, src);
2361 src += 16;
2362
2363 /* p4 */
2364 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2365 tmp0_r = p4_r_in - p5_r_in;
2366 tmp0_r += q2_r_in;
2367 tmp0_r -= p7_r_in;
2368 tmp1_r += tmp0_r;
2369 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2370 q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2371 tmp0_l = p4_l_in - p5_l_in;
2372 tmp0_l += q2_l_in;
2373 tmp0_l -= p7_l_in;
2374 tmp1_l += tmp0_l;
2375 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2376 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2377 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2378 ST_UB(p4, src);
2379 src += 16;
2380
2381 /* p3 */
2382 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2383 tmp0_r = p3_r_in - p4_r_in;
2384 tmp0_r += q3_r_in;
2385 tmp0_r -= p7_r_in;
2386 tmp1_r += tmp0_r;
2387 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2388 q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2389 tmp0_l = p3_l_in - p4_l_in;
2390 tmp0_l += q3_l_in;
2391 tmp0_l -= p7_l_in;
2392 tmp1_l += tmp0_l;
2393 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2394 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2395 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2396 ST_UB(p3, src);
2397 src += 16;
2398
2399 /* p2 */
2400 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2401 filter8 = LD_UB(filter48);
2402 tmp0_r = p2_r_in - p3_r_in;
2403 tmp0_r += q4_r_in;
2404 tmp0_r -= p7_r_in;
2405 tmp1_r += tmp0_r;
2406 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2407 q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2408 tmp0_l = p2_l_in - p3_l_in;
2409 tmp0_l += q4_l_in;
2410 tmp0_l -= p7_l_in;
2411 tmp1_l += tmp0_l;
2412 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2413 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2414 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2415 ST_UB(filter8, src);
2416 src += 16;
2417
2418 /* p1 */
2419 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2420 filter8 = LD_UB(filter48 + 16);
2421 tmp0_r = p1_r_in - p2_r_in;
2422 tmp0_r += q5_r_in;
2423 tmp0_r -= p7_r_in;
2424 tmp1_r += tmp0_r;
2425 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2426 q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2427 tmp0_l = p1_l_in - p2_l_in;
2428 tmp0_l += q5_l_in;
2429 tmp0_l -= p7_l_in;
2430 tmp1_l += tmp0_l;
2431 l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
2432 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2433 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2434 ST_UB(filter8, src);
2435 src += 16;
2436
2437 /* p0 */
2438 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2439 filter8 = LD_UB(filter48 + 32);
2440 tmp0_r = p0_r_in - p1_r_in;
2441 tmp0_r += q6_r_in;
2442 tmp0_r -= p7_r_in;
2443 tmp1_r += tmp0_r;
2444 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2445 q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2446 tmp0_l = p0_l_in - p1_l_in;
2447 tmp0_l += q6_l_in;
2448 tmp0_l -= p7_l_in;
2449 tmp1_l += tmp0_l;
2450 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2451 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2452 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2453 ST_UB(filter8, src);
2454 src += 16;
2455
2456 /* q0 */
2457 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2458 filter8 = LD_UB(filter48 + 48);
2459 tmp0_r = q7_r_in - p0_r_in;
2460 tmp0_r += q0_r_in;
2461 tmp0_r -= p7_r_in;
2462 tmp1_r += tmp0_r;
2463 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2464 q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2465 tmp0_l = q7_l_in - p0_l_in;
2466 tmp0_l += q0_l_in;
2467 tmp0_l -= p7_l_in;
2468 tmp1_l += tmp0_l;
2469 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2470 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2471 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2472 ST_UB(filter8, src);
2473 src += 16;
2474
2475 /* q1 */
2476 filter8 = LD_UB(filter48 + 64);
2477 tmp0_r = q7_r_in - q0_r_in;
2478 tmp0_r += q1_r_in;
2479 tmp0_r -= p6_r_in;
2480 tmp1_r += tmp0_r;
2481 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2482 tmp0_l = q7_l_in - q0_l_in;
2483 tmp0_l += q1_l_in;
2484 tmp0_l -= p6_l_in;
2485 tmp1_l += tmp0_l;
2486 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2487 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2488 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2489 ST_UB(filter8, src);
2490 src += 16;
2491
2492 /* q2 */
2493 filter8 = LD_UB(filter48 + 80);
2494 tmp0_r = q7_r_in - q1_r_in;
2495 tmp0_r += q2_r_in;
2496 tmp0_r -= p5_r_in;
2497 tmp1_r += tmp0_r;
2498 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2499 tmp0_l = q7_l_in - q1_l_in;
2500 tmp0_l += q2_l_in;
2501 tmp0_l -= p5_l_in;
2502 tmp1_l += tmp0_l;
2503 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2504 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2505 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2506 ST_UB(filter8, src);
2507 src += 16;
2508
2509 /* q3 */
2510 tmp0_r = q7_r_in - q2_r_in;
2511 tmp0_r += q3_r_in;
2512 tmp0_r -= p4_r_in;
2513 tmp1_r += tmp0_r;
2514 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2515 tmp0_l = q7_l_in - q2_l_in;
2516 tmp0_l += q3_l_in;
2517 tmp0_l -= p4_l_in;
2518 tmp1_l += tmp0_l;
2519 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2520 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2521 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2522 ST_UB(q3, src);
2523 src += 16;
2524
2525 /* q4 */
2526 tmp0_r = q7_r_in - q3_r_in;
2527 tmp0_r += q4_r_in;
2528 tmp0_r -= p3_r_in;
2529 tmp1_r += tmp0_r;
2530 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2531 tmp0_l = q7_l_in - q3_l_in;
2532 tmp0_l += q4_l_in;
2533 tmp0_l -= p3_l_in;
2534 tmp1_l += tmp0_l;
2535 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2536 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2537 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2538 ST_UB(q4, src);
2539 src += 16;
2540
2541 /* q5 */
2542 tmp0_r = q7_r_in - q4_r_in;
2543 tmp0_r += q5_r_in;
2544 tmp0_r -= p2_r_in;
2545 tmp1_r += tmp0_r;
2546 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2547 tmp0_l = q7_l_in - q4_l_in;
2548 tmp0_l += q5_l_in;
2549 tmp0_l -= p2_l_in;
2550 tmp1_l += tmp0_l;
2551 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2552 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2553 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2554 ST_UB(q5, src);
2555 src += 16;
2556
2557 /* q6 */
2558 tmp0_r = q7_r_in - q5_r_in;
2559 tmp0_r += q6_r_in;
2560 tmp0_r -= p1_r_in;
2561 tmp1_r += tmp0_r;
2562 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2563 tmp0_l = q7_l_in - q5_l_in;
2564 tmp0_l += q6_l_in;
2565 tmp0_l -= p1_l_in;
2566 tmp1_l += tmp0_l;
2567 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2568 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2569 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2570 ST_UB(q6, src);
2571
2572 return 0;
2573 }
2574 }
2575
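/*
 * Top-level 16x16 vertical-edge filter: the driver keeps a local scratch
 * buffer with the filter48 area at offset 16 * 16, transposes the pixel
 * neighbourhood into it, runs the t4/t8 stage and, if that stage did not
 * take its early exit, the t16 stage, and finally transposes the filtered
 * block back to the frame.
 */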
2576 void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
2577 int32_t b_limit_ptr,
2578 int32_t limit_ptr,
2579 int32_t thresh_ptr)
2580 {
2583 uint8_t *filter48 = &transposed_input[16 * 16];
2584
2586
2588 &filter48[0], src, pitch,
2589 b_limit_ptr, limit_ptr, thresh_ptr);
2590
2591 if (0 == early_exit) {
2593 &filter48[0]);
2594
2595 if (0 == early_exit) {
2597 }
2598 }
2599 }