// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA version of rescaling functions
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
13
14 #include "src/dsp/dsp.h"
15
16 #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
17
18 #include <assert.h>
19
20 #include "src/utils/rescaler_utils.h"
21 #include "src/dsp/msa_macro.h"
22
// Fixed-point helpers (WEBP_RESCALER_RFIX fractional bits).
// ROUNDER is +0.5 in that fixed-point format.
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
// Rounded fixed-point multiply: round_to_nearest(x * y / 2^RFIX).
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
// Truncating (floor) fixed-point multiply: floor(x * y / 2^RFIX).
#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
26
// Rounded fixed-point scaling of 16 uint32 lanes (in0..in3, 4 lanes each) by
// the splatted 'scale' — i.e. MULT_FIX per lane — packed down to 16 bytes in
// 'dst'.  Each group of 4 words is widened to u64 (zero-interleave followed
// by a dot-product with 'scale'), shifted right with rounding (SRAR) by
// 'shift', then the even bytes are successively packed together.
// NOTE: expects a v4i32 'zero' to be in scope at the expansion site.
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
  v4u32 tmp0, tmp1, tmp2, tmp3;                                       \
  v16u8 t0, t1, t2, t3, t4, t5;                                       \
  v2u64 out0, out1, out2, out3;                                       \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                                 \
  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                                 \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
  PCKEV_B2_UB(out1, out0, out3, out2, t0, t1);                        \
  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                                 \
  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                                 \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
  PCKEV_B2_UB(out1, out0, out3, out2, t2, t3);                        \
  PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);                                \
  dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);                   \
} while (0)
46
// MULT_FIX (rounded) of 4 uint32 lanes in 'in0' by 'scale'; the 4 resulting
// bytes are packed into the low 32-bit word and copied out into 'dst'
// (a uint32_t lvalue), ready for a 4-byte store.
// NOTE: expects a v4i32 'zero' to be in scope at the expansion site.
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {      \
  v4u32 tmp0, tmp1;                                       \
  v16i8 t0, t1;                                           \
  v2u64 out0, out1;                                       \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                     \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);      \
  SRAR_D2_UD(out0, out1, shift);                          \
  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);           \
  t1 = __msa_pckev_b(t0, t0);                             \
  t0 = __msa_pckev_b(t1, t1);                             \
  dst = __msa_copy_s_w((v4i32)t0, 0);                     \
} while (0)
59
// MULT_FIX (rounded) of 16 uint32 lanes by 'fyscale', keeping the full
// 32-bit results: dst0..dst3 each receive 4 scaled uint32 values (the even
// words of the 64-bit intermediates).  Used to compute the 'frac' vectors
// that are both subtracted from irow and stored back into it.
// NOTE: expects a v4i32 'zero' to be in scope at the expansion site.
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
                          dst0, dst1, dst2, dst3) do {         \
  v4u32 tmp0, tmp1, tmp2, tmp3;                                \
  v2u64 out0, out1, out2, out3;                                \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                          \
  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                          \
  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
  PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1);             \
  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                          \
  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                          \
  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
  PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3);             \
} while (0)
77
// MULT_FIX (rounded) of 4 uint32 lanes by 'scale', keeping full 32-bit
// results in the v4u32 'dst' (even-word pack of the 64-bit intermediates).
// NOTE: expects a v4i32 'zero' to be in scope at the expansion site.
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {     \
  v4u32 tmp0, tmp1;                                       \
  v2u64 out0, out1;                                       \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                     \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);      \
  SRAR_D2_UD(out0, out1, shift);                          \
  dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0);   \
} while (0)
86
// Two-stage fixed-point blend of 16 lanes: interleaves in0/in2 and in1/in3
// word-wise and dot-multiplies each interleaved pair against the interleaved
// coefficient vector 'mult' (built by the caller from A and B), then rounds/
// shifts, rescales the blend by 'scale', rounds/shifts again and packs the
// byte results into dst0/dst1 (8 results each, in even-byte positions).
// NOTE(review): lane pairing is assumed to produce A*frow + B*irow per
// 64-bit lane — verify against ILVRL_W2_UW/DOTP_UW2_UD in msa_macro.h.
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
                          dst0, dst1) do {                         \
  v4u32 tmp0, tmp1, tmp2, tmp3;                                    \
  v2u64 out0, out1, out2, out3;                                    \
  ILVRL_W2_UW(in0, in2, tmp0, tmp1);                               \
  ILVRL_W2_UW(in1, in3, tmp2, tmp3);                               \
  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                 \
  DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3);                 \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);               \
  DOTP_UW2_UD(out2, out3, scale, scale, out2, out3);               \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
  PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1);                 \
} while (0)
101
// 4-lane version of CALC_MULT_FIX2_16: blends in0/in1 with the interleaved
// coefficients in 'mult', rescales by 'scale' (both stages with rounded
// shifts), and packs the 4 byte results into the uint32_t lvalue 'dst'.
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
  v4u32 tmp0, tmp1;                                               \
  v2u64 out0, out1;                                               \
  v16i8 t0, t1;                                                   \
  ILVRL_W2_UW(in0, in1, tmp0, tmp1);                              \
  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                \
  SRAR_D2_UD(out0, out1, shift);                                  \
  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);              \
  SRAR_D2_UD(out0, out1, shift);                                  \
  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);                   \
  t1 = __msa_pckev_b(t0, t0);                                     \
  t0 = __msa_pckev_b(t1, t1);                                     \
  dst = __msa_copy_s_w((v4i32)t0, 0);                             \
} while (0)
116
ExportRowExpand_0(const uint32_t * frow,uint8_t * dst,int length,WebPRescaler * const wrk)117 static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
118 int length,
119 WebPRescaler* const wrk) {
120 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
121 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
122 const v4i32 zero = { 0 };
123
124 while (length >= 16) {
125 v4u32 src0, src1, src2, src3;
126 v16u8 out;
127 LD_UW4(frow, 4, src0, src1, src2, src3);
128 CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
129 ST_UB(out, dst);
130 length -= 16;
131 frow += 16;
132 dst += 16;
133 }
134 if (length > 0) {
135 int x_out;
136 if (length >= 12) {
137 uint32_t val0_m, val1_m, val2_m;
138 v4u32 src0, src1, src2;
139 LD_UW3(frow, 4, src0, src1, src2);
140 CALC_MULT_FIX_4(src0, scale, shift, val0_m);
141 CALC_MULT_FIX_4(src1, scale, shift, val1_m);
142 CALC_MULT_FIX_4(src2, scale, shift, val2_m);
143 SW3(val0_m, val1_m, val2_m, dst, 4);
144 length -= 12;
145 frow += 12;
146 dst += 12;
147 } else if (length >= 8) {
148 uint32_t val0_m, val1_m;
149 v4u32 src0, src1;
150 LD_UW2(frow, 4, src0, src1);
151 CALC_MULT_FIX_4(src0, scale, shift, val0_m);
152 CALC_MULT_FIX_4(src1, scale, shift, val1_m);
153 SW2(val0_m, val1_m, dst, 4);
154 length -= 8;
155 frow += 8;
156 dst += 8;
157 } else if (length >= 4) {
158 uint32_t val0_m;
159 const v4u32 src0 = LD_UW(frow);
160 CALC_MULT_FIX_4(src0, scale, shift, val0_m);
161 SW(val0_m, dst);
162 length -= 4;
163 frow += 4;
164 dst += 4;
165 }
166 for (x_out = 0; x_out < length; ++x_out) {
167 const uint32_t J = frow[x_out];
168 const int v = (int)MULT_FIX(J, wrk->fy_scale);
169 assert(v >= 0 && v <= 255);
170 dst[x_out] = v;
171 }
172 }
173 }
174
ExportRowExpand_1(const uint32_t * frow,uint32_t * irow,uint8_t * dst,int length,WebPRescaler * const wrk)175 static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
176 uint8_t* dst, int length,
177 WebPRescaler* const wrk) {
178 const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
179 const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
180 const v4i32 B1 = __msa_fill_w(B);
181 const v4i32 A1 = __msa_fill_w(A);
182 const v4i32 AB = __msa_ilvr_w(A1, B1);
183 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
184 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
185
186 while (length >= 16) {
187 v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
188 v16u8 t0, t1, t2, t3, t4, t5;
189 LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
190 LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
191 CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
192 CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
193 PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
194 t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
195 ST_UB(t0, dst);
196 frow += 16;
197 irow += 16;
198 dst += 16;
199 length -= 16;
200 }
201 if (length > 0) {
202 int x_out;
203 if (length >= 12) {
204 uint32_t val0_m, val1_m, val2_m;
205 v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
206 LD_UW3(frow, 4, frow0, frow1, frow2);
207 LD_UW3(irow, 4, irow0, irow1, irow2);
208 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
209 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
210 CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
211 SW3(val0_m, val1_m, val2_m, dst, 4);
212 frow += 12;
213 irow += 12;
214 dst += 12;
215 length -= 12;
216 } else if (length >= 8) {
217 uint32_t val0_m, val1_m;
218 v4u32 frow0, frow1, irow0, irow1;
219 LD_UW2(frow, 4, frow0, frow1);
220 LD_UW2(irow, 4, irow0, irow1);
221 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
222 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
223 SW2(val0_m, val1_m, dst, 4);
224 frow += 4;
225 irow += 4;
226 dst += 4;
227 length -= 4;
228 } else if (length >= 4) {
229 uint32_t val0_m;
230 const v4u32 frow0 = LD_UW(frow + 0);
231 const v4u32 irow0 = LD_UW(irow + 0);
232 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
233 SW(val0_m, dst);
234 frow += 4;
235 irow += 4;
236 dst += 4;
237 length -= 4;
238 }
239 for (x_out = 0; x_out < length; ++x_out) {
240 const uint64_t I = (uint64_t)A * frow[x_out]
241 + (uint64_t)B * irow[x_out];
242 const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
243 const int v = (int)MULT_FIX(J, wrk->fy_scale);
244 assert(v >= 0 && v <= 255);
245 dst[x_out] = v;
246 }
247 }
248 }
249
// Dispatches the export of one vertically-expanded output row: when the row
// lands exactly on a source row (y_accum == 0) only 'frow' is needed,
// otherwise the row is interpolated between 'frow' and 'irow'.
// NOTE(review): the _MIPSdspR2 suffix in this MSA file looks copy-pasted
// from the mips_dsp_r2 version — presumably _MSA was intended; renaming
// would need the assignment in WebPRescalerDspInitMSA updated too.
static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
  uint8_t* const out = wrk->dst;
  const rescaler_t* const accum_frow = wrk->frow;
  rescaler_t* const accum_irow = wrk->irow;
  const int total = wrk->dst_width * wrk->num_channels;
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(wrk->y_expand);
  assert(wrk->y_sub != 0);
  if (wrk->y_accum != 0) {
    ExportRowExpand_1(accum_frow, accum_irow, out, total, wrk);
  } else {
    ExportRowExpand_0(accum_frow, out, total, wrk);
  }
}
265
266 #if 0 // disabled for now. TODO(skal): make match the C-code
// Vertical-shrink export, fractional case (yscale != 0):
//   frac = frow * yscale            (rounded, kept at 32 bits)
//   dst  = (irow - frac) * fxy_scale, clipped to 8 bits
//   irow = frac                      (carry for the next row)
// Compiled out via the enclosing '#if 0': the vector path uses rounded
// shifts (SRAR) for the final scale while the scalar tail below uses
// MULT_FIX_FLOOR — NOTE(review): presumably this rounding mismatch is the
// "make match the C-code" TODO; confirm against the C reference.
static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
                                          uint8_t* dst, int length,
                                          const uint32_t yscale,
                                          WebPRescaler* const wrk) {
  const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
  const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
  const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
  const v4i32 zero = { 0 };  // consumed implicitly by the CALC_MULT_FIX* macros

  // 16 pixels per iteration.
  while (length >= 16) {
    v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
    v16u8 out;
    LD_UW4(frow, 4, src0, src1, src2, src3);
    CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
                      frac0, frac1, frac2, frac3);
    LD_UW4(irow, 4, src0, src1, src2, src3);
    SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
         src0, src1, src2, src3);
    CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
    ST_UB(out, dst);
    ST_UW4(frac0, frac1, frac2, frac3, irow, 4);  // write back the carry
    frow += 16;
    irow += 16;
    dst += 16;
    length -= 16;
  }
  if (length > 0) {
    int x_out;
    if (length >= 12) {
      uint32_t val0_m, val1_m, val2_m;
      v4u32 src0, src1, src2, frac0, frac1, frac2;
      LD_UW3(frow, 4, src0, src1, src2);
      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
      CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
      LD_UW3(irow, 4, src0, src1, src2);
      SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
      CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
      SW3(val0_m, val1_m, val2_m, dst, 4);
      ST_UW3(frac0, frac1, frac2, irow, 4);
      frow += 12;
      irow += 12;
      dst += 12;
      length -= 12;
    } else if (length >= 8) {
      uint32_t val0_m, val1_m;
      v4u32 src0, src1, frac0, frac1;
      LD_UW2(frow, 4, src0, src1);
      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
      LD_UW2(irow, 4, src0, src1);
      SUB2(src0, frac0, src1, frac1, src0, src1);
      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
      SW2(val0_m, val1_m, dst, 4);
      ST_UW2(frac0, frac1, irow, 4);
      frow += 8;
      irow += 8;
      dst += 8;
      length -= 8;
    } else if (length >= 4) {
      uint32_t val0_m;
      v4u32 frac0;
      v4u32 src0 = LD_UW(frow);
      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
      src0 = LD_UW(irow);
      src0 = src0 - frac0;
      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
      SW(val0_m, dst);
      ST_UW(frac0, irow);
      frow += 4;
      irow += 4;
      dst += 4;
      length -= 4;
    }
    // Scalar tail: note the floor (not rounded) final multiply.
    for (x_out = 0; x_out < length; ++x_out) {
      const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
      const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale);
      assert(v >= 0 && v <= 255);
      dst[x_out] = v;
      irow[x_out] = frac;
    }
  }
}
353
// Vertical-shrink export, integral case (yscale == 0): the accumulated row
// 'irow' is scaled by wrk->fxy_scale down to 8-bit pixels and then cleared
// to zero for the next accumulation pass.
// Compiled out via the enclosing '#if 0' together with ExportRowShrink_0
// (see the TODO above this section).
static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
                                          int length,
                                          WebPRescaler* const wrk) {
  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
  const v4i32 zero = { 0 };  // also used to clear irow below

  // 16 pixels per iteration.
  while (length >= 16) {
    v4u32 src0, src1, src2, src3;
    v16u8 dst0;
    LD_UW4(irow, 4, src0, src1, src2, src3);
    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
    ST_UB(dst0, dst);
    ST_SW4(zero, zero, zero, zero, irow, 4);  // reset the accumulator row
    length -= 16;
    irow += 16;
    dst += 16;
  }
  if (length > 0) {
    int x_out;
    if (length >= 12) {
      uint32_t val0_m, val1_m, val2_m;
      v4u32 src0, src1, src2;
      LD_UW3(irow, 4, src0, src1, src2);
      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
      SW3(val0_m, val1_m, val2_m, dst, 4);
      ST_SW3(zero, zero, zero, irow, 4);
      length -= 12;
      irow += 12;
      dst += 12;
    } else if (length >= 8) {
      uint32_t val0_m, val1_m;
      v4u32 src0, src1;
      LD_UW2(irow, 4, src0, src1);
      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
      SW2(val0_m, val1_m, dst, 4);
      ST_SW2(zero, zero, irow, 4);
      length -= 8;
      irow += 8;
      dst += 8;
    } else if (length >= 4) {
      uint32_t val0_m;
      const v4u32 src0 = LD_UW(irow + 0);
      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
      SW(val0_m, dst);
      ST_SW(zero, irow);
      length -= 4;
      irow += 4;
      dst += 4;
    }
    // Scalar tail for the remaining 0..3 pixels.
    for (x_out = 0; x_out < length; ++x_out) {
      const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
      assert(v >= 0 && v <= 255);
      dst[x_out] = v;
      irow[x_out] = 0;
    }
  }
}
415
// Dispatches the export of one vertically-shrunk output row to the
// fractional (yscale != 0) or integral variant.  Compiled out via the
// enclosing '#if 0' until the vector code bit-matches the C reference.
// NOTE(review): as with the expand dispatcher, the _MIPSdspR2 suffix
// appears copy-pasted from the mips_dsp_r2 file — presumably _MSA intended.
static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
  uint8_t* dst = wrk->dst;
  rescaler_t* irow = wrk->irow;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* frow = wrk->frow;
  // yscale encodes the leftover fraction of the last source row.
  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(!wrk->y_expand);
  if (yscale) {
    ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
  } else {
    ExportRowShrink_1(irow, dst, x_out_max, wrk);
  }
}
431 #endif // 0
432
433 //------------------------------------------------------------------------------
434 // Entry point
435
436 extern void WebPRescalerDspInitMSA(void);
437
WebPRescalerDspInitMSA(void)438 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
439 WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
440 // WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
441 }
442
443 #else // !WEBP_USE_MSA
444
445 WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
446
447 #endif // WEBP_USE_MSA
448