1 // Copyright 2016 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // MSA version of rescaling functions
11 //
12 // Author: Prashant Patil (prashant.patil@imgtec.com)
13 
14 #include "src/dsp/dsp.h"
15 
16 #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
17 
18 #include <assert.h>
19 
20 #include "src/utils/rescaler_utils.h"
21 #include "src/dsp/msa_macro.h"
22 
// Fixed-point multiply helpers. The shift amount (WEBP_RESCALER_RFIX) and the
// fixed-point unit (WEBP_RESCALER_ONE) are defined outside this file.

// Half of the fixed-point unit: adding it before the right shift rounds the
// result to nearest instead of truncating.
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
// x * y evaluated in 64 bits, then shifted down (truncating).
#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
// Same product as MULT_FIX_FLOOR, but rounded to nearest.
#define MULT_FIX(x, y) ((ROUNDER + (uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
26 
// Scales 16 32-bit samples by 'scale' and packs the results into 16 bytes.
//   in0..in3: four v4u32 input vectors (4 samples each).
//   scale:    multiplier vector, applied to every sample.
//   shift:    shift-amount vector handed to the SRAR_* helper.
//   dst:      v16u8 lvalue receiving the 16 packed results.
// Steps, two input vectors at a time: pair each sample with 'zero'
// (ILVRL_W2_UW), multiply with the 64-bit dot-product helper (DOTP_UW2_UD),
// shift (SRAR_D4_UD -- presumably a rounding shift, see msa_macro.h), then
// narrow with the pack-even helpers down to bytes.
// NOTE: 'zero' is not a parameter; a zero vector with that exact name must be
// in scope wherever this macro is expanded.
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
  v4u32 tmp0, tmp1, tmp2, tmp3;                                       \
  v16u8 t0, t1, t2, t3, t4, t5;                                       \
  v2u64 out0, out1, out2, out3;                                       \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                                 \
  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                                 \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
  PCKEV_B2_UB(out1, out0, out3, out2, t0, t1);                        \
  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                                 \
  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                                 \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);                  \
  DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3);                  \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                          \
  PCKEV_B2_UB(out1, out0, out3, out2, t2, t3);                        \
  PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);                                \
  dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);                   \
} while (0)
46 
// Single-vector counterpart of CALC_MULT_FIX_16.
//   in0:   one v4u32 input vector (4 samples).
//   scale: multiplier vector.
//   shift: shift-amount vector handed to the SRAR_* helper.
//   dst:   32-bit scalar lvalue; receives word 0 of the packed vector, i.e.
//          the four result bytes (callers store it with SW*).
// The three successive __msa_pckev_b steps narrow the 64-bit products down to
// bytes before word 0 is extracted with __msa_copy_s_w.
// NOTE: 'zero' is not a parameter; a zero vector with that exact name must be
// in scope wherever this macro is expanded.
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {  \
  v4u32 tmp0, tmp1;                                   \
  v16i8 t0, t1;                                       \
  v2u64 out0, out1;                                   \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                 \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);  \
  SRAR_D2_UD(out0, out1, shift);                      \
  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);       \
  t1 = __msa_pckev_b(t0, t0);                         \
  t0 = __msa_pckev_b(t1, t1);                         \
  dst = __msa_copy_s_w((v4i32)t0, 0);                 \
} while (0)
59 
// Scales 16 32-bit samples by 'fyscale' and keeps the results as 32-bit words.
//   in0..in3:   four v4u32 input vectors (4 samples each).
//   fyscale:    multiplier vector.
//   shift:      shift-amount vector handed to the SRAR_* helper.
//   dst0..dst3: v4u32 lvalues receiving the results for in0..in3 respectively.
// Same multiply/shift sequence as CALC_MULT_FIX_16, but the 64-bit products
// are narrowed only to words (PCKEV_W2_UW), not to bytes.
// NOTE: 'zero' is not a parameter; a zero vector with that exact name must be
// in scope wherever this macro is expanded.
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
                          dst0, dst1, dst2, dst3) do {         \
  v4u32 tmp0, tmp1, tmp2, tmp3;                                \
  v2u64 out0, out1, out2, out3;                                \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                          \
  ILVRL_W2_UW(zero, in1, tmp2, tmp3);                          \
  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
  PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1);             \
  ILVRL_W2_UW(zero, in2, tmp0, tmp1);                          \
  ILVRL_W2_UW(zero, in3, tmp2, tmp3);                          \
  DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1);       \
  DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3);       \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                   \
  PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3);             \
} while (0)
77 
// Single-vector counterpart of CALC_MULT_FIX1_16.
//   in0:   one v4u32 input vector (4 samples).
//   scale: multiplier vector.
//   shift: shift-amount vector handed to the SRAR_* helper.
//   dst:   v4u32 lvalue receiving the four 32-bit results.
// NOTE: 'zero' is not a parameter; a zero vector with that exact name must be
// in scope wherever this macro is expanded.
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {    \
  v4u32 tmp0, tmp1;                                      \
  v2u64 out0, out1;                                      \
  ILVRL_W2_UW(zero, in0, tmp0, tmp1);                    \
  DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1);     \
  SRAR_D2_UD(out0, out1, shift);                         \
  dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0);  \
} while (0)
86 
// Two-stage multiply for 8 samples taken from two pairs of vectors.
//   in0, in1:   first source, two v4u32 vectors (call sites pass 'frow' data).
//   in2, in3:   second source, two v4u32 vectors (call sites pass 'irow' data).
//   mult:       weight vector applied to the interleaved (in0,in2) and
//               (in1,in3) pairs; call sites pass the interleaved A/B weights.
//   scale:      multiplier vector for the second stage.
//   shift:      shift-amount vector used after each stage (SRAR_* helper).
//   dst0, dst1: v16u8 lvalues receiving the partially packed results; the
//               call sites finish the packing with further pack-even steps.
// Unlike the CALC_MULT_FIX_* / CALC_MULT_FIX1_* macros, this one does not
// reference a 'zero' vector.
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
                          dst0, dst1) do {                         \
  v4u32 tmp0, tmp1, tmp2, tmp3;                                    \
  v2u64 out0, out1, out2, out3;                                    \
  ILVRL_W2_UW(in0, in2, tmp0, tmp1);                               \
  ILVRL_W2_UW(in1, in3, tmp2, tmp3);                               \
  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                 \
  DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3);                 \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);               \
  DOTP_UW2_UD(out2, out3, scale, scale, out2, out3);               \
  SRAR_D4_UD(out0, out1, out2, out3, shift);                       \
  PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1);                 \
} while (0)
101 
// Single-vector counterpart of CALC_MULT_FIX2_16 (4 samples).
//   in0, in1: the two v4u32 sources that get interleaved (call sites pass
//             'frow' and 'irow' data respectively).
//   mult:     weight vector applied to the interleaved pairs.
//   scale:    multiplier vector for the second stage.
//   shift:    shift-amount vector used after each stage (SRAR_* helper).
//   dst:      32-bit scalar lvalue; receives word 0 of the packed vector, i.e.
//             the four result bytes (callers store it with SW*).
// This macro does not reference a 'zero' vector.
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
  v4u32 tmp0, tmp1;                                               \
  v2u64 out0, out1;                                               \
  v16i8 t0, t1;                                                   \
  ILVRL_W2_UW(in0, in1, tmp0, tmp1);                              \
  DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1);                \
  SRAR_D2_UD(out0, out1, shift);                                  \
  DOTP_UW2_UD(out0, out1, scale, scale, out0, out1);              \
  SRAR_D2_UD(out0, out1, shift);                                  \
  t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0);                   \
  t1 = __msa_pckev_b(t0, t0);                                     \
  t0 = __msa_pckev_b(t1, t1);                                     \
  dst = __msa_copy_s_w((v4i32)t0, 0);                             \
} while (0)
116 
ExportRowExpand_0(const uint32_t * frow,uint8_t * dst,int length,WebPRescaler * const wrk)117 static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
118                                           int length,
119                                           WebPRescaler* const wrk) {
120   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
121   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
122   const v4i32 zero = { 0 };
123 
124   while (length >= 16) {
125     v4u32 src0, src1, src2, src3;
126     v16u8 out;
127     LD_UW4(frow, 4, src0, src1, src2, src3);
128     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
129     ST_UB(out, dst);
130     length -= 16;
131     frow   += 16;
132     dst    += 16;
133   }
134   if (length > 0) {
135     int x_out;
136     if (length >= 12) {
137       uint32_t val0_m, val1_m, val2_m;
138       v4u32 src0, src1, src2;
139       LD_UW3(frow, 4, src0, src1, src2);
140       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
141       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
142       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
143       SW3(val0_m, val1_m, val2_m, dst, 4);
144       length -= 12;
145       frow   += 12;
146       dst    += 12;
147     } else if (length >= 8) {
148       uint32_t val0_m, val1_m;
149       v4u32 src0, src1;
150       LD_UW2(frow, 4, src0, src1);
151       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
152       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
153       SW2(val0_m, val1_m, dst, 4);
154       length -= 8;
155       frow   += 8;
156       dst    += 8;
157     } else if (length >= 4) {
158       uint32_t val0_m;
159       const v4u32 src0 = LD_UW(frow);
160       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
161       SW(val0_m, dst);
162       length -= 4;
163       frow   += 4;
164       dst    += 4;
165     }
166     for (x_out = 0; x_out < length; ++x_out) {
167       const uint32_t J = frow[x_out];
168       const int v = (int)MULT_FIX(J, wrk->fy_scale);
169       assert(v >= 0 && v <= 255);
170       dst[x_out] = v;
171     }
172   }
173 }
174 
ExportRowExpand_1(const uint32_t * frow,uint32_t * irow,uint8_t * dst,int length,WebPRescaler * const wrk)175 static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
176                                           uint8_t* dst, int length,
177                                           WebPRescaler* const wrk) {
178   const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
179   const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
180   const v4i32 B1 = __msa_fill_w(B);
181   const v4i32 A1 = __msa_fill_w(A);
182   const v4i32 AB = __msa_ilvr_w(A1, B1);
183   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
184   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
185 
186   while (length >= 16) {
187     v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
188     v16u8 t0, t1, t2, t3, t4, t5;
189     LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
190     LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
191     CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
192     CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
193     PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
194     t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
195     ST_UB(t0, dst);
196     frow   += 16;
197     irow   += 16;
198     dst    += 16;
199     length -= 16;
200   }
201   if (length > 0) {
202     int x_out;
203     if (length >= 12) {
204       uint32_t val0_m, val1_m, val2_m;
205       v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
206       LD_UW3(frow, 4, frow0, frow1, frow2);
207       LD_UW3(irow, 4, irow0, irow1, irow2);
208       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
209       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
210       CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
211       SW3(val0_m, val1_m, val2_m, dst, 4);
212       frow   += 12;
213       irow   += 12;
214       dst    += 12;
215       length -= 12;
216     } else if (length >= 8) {
217       uint32_t val0_m, val1_m;
218       v4u32 frow0, frow1, irow0, irow1;
219       LD_UW2(frow, 4, frow0, frow1);
220       LD_UW2(irow, 4, irow0, irow1);
221       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
222       CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
223       SW2(val0_m, val1_m, dst, 4);
224       frow   += 4;
225       irow   += 4;
226       dst    += 4;
227       length -= 4;
228     } else if (length >= 4) {
229       uint32_t val0_m;
230       const v4u32 frow0 = LD_UW(frow + 0);
231       const v4u32 irow0 = LD_UW(irow + 0);
232       CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
233       SW(val0_m, dst);
234       frow   += 4;
235       irow   += 4;
236       dst    += 4;
237       length -= 4;
238     }
239     for (x_out = 0; x_out < length; ++x_out) {
240       const uint64_t I = (uint64_t)A * frow[x_out]
241                        + (uint64_t)B * irow[x_out];
242       const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
243       const int v = (int)MULT_FIX(J, wrk->fy_scale);
244       assert(v >= 0 && v <= 255);
245       dst[x_out] = v;
246     }
247   }
248 }
249 
// Export entry point for the vertical "expand" case (installed by
// WebPRescalerDspInitMSA). Dispatches on wrk->y_accum:
//   == 0: the row is taken from wrk->frow alone (ExportRowExpand_0);
//   != 0: wrk->frow and wrk->irow are blended (ExportRowExpand_1).
// wrk->dst_width * wrk->num_channels bytes are written to wrk->dst.
// NOTE(review): despite the "_MIPSdspR2" suffix this is the MSA
// implementation; the name is kept because the init function refers to it.
static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const rescaler_t* const src_row = wrk->frow;
  uint8_t* const out = wrk->dst;
  assert(!WebPRescalerOutputDone(wrk));
  assert(wrk->y_accum <= 0);
  assert(wrk->y_expand);
  assert(wrk->y_sub != 0);
  if (wrk->y_accum != 0) {
    ExportRowExpand_1(src_row, wrk->irow, out, x_out_max, wrk);
    return;
  }
  ExportRowExpand_0(src_row, out, x_out_max, wrk);
}
265 
266 #if 0  // disabled for now. TODO(skal): make match the C-code
267 static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
268                                           uint8_t* dst, int length,
269                                           const uint32_t yscale,
270                                           WebPRescaler* const wrk) {
271   const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
272   const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
273   const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
274   const v4i32 zero = { 0 };
275 
276   while (length >= 16) {
277     v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
278     v16u8 out;
279     LD_UW4(frow, 4, src0, src1, src2, src3);
280     CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
281                       frac0, frac1, frac2, frac3);
282     LD_UW4(irow, 4, src0, src1, src2, src3);
283     SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
284          src0, src1, src2, src3);
285     CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
286     ST_UB(out, dst);
287     ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
288     frow   += 16;
289     irow   += 16;
290     dst    += 16;
291     length -= 16;
292   }
293   if (length > 0) {
294     int x_out;
295     if (length >= 12) {
296       uint32_t val0_m, val1_m, val2_m;
297       v4u32 src0, src1, src2, frac0, frac1, frac2;
298       LD_UW3(frow, 4, src0, src1, src2);
299       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
300       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
301       CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
302       LD_UW3(irow, 4, src0, src1, src2);
303       SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
304       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
305       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
306       CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
307       SW3(val0_m, val1_m, val2_m, dst, 4);
308       ST_UW3(frac0, frac1, frac2, irow, 4);
309       frow   += 12;
310       irow   += 12;
311       dst    += 12;
312       length -= 12;
313     } else if (length >= 8) {
314       uint32_t val0_m, val1_m;
315       v4u32 src0, src1, frac0, frac1;
316       LD_UW2(frow, 4, src0, src1);
317       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
318       CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
319       LD_UW2(irow, 4, src0, src1);
320       SUB2(src0, frac0, src1, frac1, src0, src1);
321       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
322       CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
323       SW2(val0_m, val1_m, dst, 4);
324       ST_UW2(frac0, frac1, irow, 4);
325       frow   += 8;
326       irow   += 8;
327       dst    += 8;
328       length -= 8;
329     } else if (length >= 4) {
330       uint32_t val0_m;
331       v4u32 frac0;
332       v4u32 src0 = LD_UW(frow);
333       CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
334       src0 = LD_UW(irow);
335       src0 = src0 - frac0;
336       CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
337       SW(val0_m, dst);
338       ST_UW(frac0, irow);
339       frow   += 4;
340       irow   += 4;
341       dst    += 4;
342       length -= 4;
343     }
344     for (x_out = 0; x_out < length; ++x_out) {
345       const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
346       const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale);
347       assert(v >= 0 && v <= 255);
348       dst[x_out] = v;
349       irow[x_out] = frac;
350     }
351   }
352 }
353 
354 static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
355                                           int length,
356                                           WebPRescaler* const wrk) {
357   const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
358   const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
359   const v4i32 zero = { 0 };
360 
361   while (length >= 16) {
362     v4u32 src0, src1, src2, src3;
363     v16u8 dst0;
364     LD_UW4(irow, 4, src0, src1, src2, src3);
365     CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
366     ST_UB(dst0, dst);
367     ST_SW4(zero, zero, zero, zero, irow, 4);
368     length -= 16;
369     irow   += 16;
370     dst    += 16;
371   }
372   if (length > 0) {
373     int x_out;
374     if (length >= 12) {
375       uint32_t val0_m, val1_m, val2_m;
376       v4u32 src0, src1, src2;
377       LD_UW3(irow, 4, src0, src1, src2);
378       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
379       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
380       CALC_MULT_FIX_4(src2, scale, shift, val2_m);
381       SW3(val0_m, val1_m, val2_m, dst, 4);
382       ST_SW3(zero, zero, zero, irow, 4);
383       length -= 12;
384       irow   += 12;
385       dst    += 12;
386     } else if (length >= 8) {
387       uint32_t val0_m, val1_m;
388       v4u32 src0, src1;
389       LD_UW2(irow, 4, src0, src1);
390       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
391       CALC_MULT_FIX_4(src1, scale, shift, val1_m);
392       SW2(val0_m, val1_m, dst, 4);
393       ST_SW2(zero, zero, irow, 4);
394       length -= 8;
395       irow   += 8;
396       dst    += 8;
397     } else if (length >= 4) {
398       uint32_t val0_m;
399       const v4u32 src0 = LD_UW(irow + 0);
400       CALC_MULT_FIX_4(src0, scale, shift, val0_m);
401       SW(val0_m, dst);
402       ST_SW(zero, irow);
403       length -= 4;
404       irow   += 4;
405       dst    += 4;
406     }
407     for (x_out = 0; x_out < length; ++x_out) {
408       const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
409       assert(v >= 0 && v <= 255);
410       dst[x_out] = v;
411       irow[x_out] = 0;
412     }
413   }
414 }
415 
416 static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
417   uint8_t* dst = wrk->dst;
418   rescaler_t* irow = wrk->irow;
419   const int x_out_max = wrk->dst_width * wrk->num_channels;
420   const rescaler_t* frow = wrk->frow;
421   const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
422   assert(!WebPRescalerOutputDone(wrk));
423   assert(wrk->y_accum <= 0);
424   assert(!wrk->y_expand);
425   if (yscale) {
426     ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
427   } else {
428     ExportRowShrink_1(irow, dst, x_out_max, wrk);
429   }
430 }
431 #endif  // 0
432 
433 //------------------------------------------------------------------------------
434 // Entry point
435 
436 extern void WebPRescalerDspInitMSA(void);
437 
// Installs the MSA implementation into the rescaler function pointers.
// Only the "expand" export is hooked up; the "shrink" variant is compiled out
// above (#if 0), so its assignment stays commented out until that code is
// re-enabled.
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
  WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
//  WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
}
442 
443 #else     // !WEBP_USE_MSA
444 
// Fallback when the MSA code above is compiled out (WEBP_USE_MSA undefined or
// WEBP_REDUCE_SIZE defined). WEBP_DSP_INIT_STUB is defined outside this file;
// presumably it provides an empty WebPRescalerDspInitMSA() -- TODO confirm.
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
446 
447 #endif    // WEBP_USE_MSA
448