1 /* ----------------------------------------------------------------------
2  * Title:        csky_vdsp2_nnfunctions.h
3  * Description:  Public header file for CSI NN Library
4  *
5  * -------------------------------------------------------------------- */
6 
7 #ifndef _CSKY_VDSP2_NNFUNCTIONS_H
8 #define _CSKY_VDSP2_NNFUNCTIONS_H
9 
10 #ifdef __cplusplus
11 extern    "C"
12 {
13 #endif
14 
15 #ifdef CSKY_VDSP2_MATH_DSP
16 #include "csky_vdsp2_math.h"
17 #include "csky_vdsp2_nnsupportfunctions.h"
18 #endif
19 
/**
 * @brief Enumeration selecting the activation function type
 *
 * Used as the `type` argument of csky_vdsp2_nn_activations_direct_q7() and
 * csky_vdsp2_nn_activations_direct_q15().
 */
typedef enum
{
    /** Sigmoid activation function */
    CSKY_SIGMOID = 0,
    /** Tanh activation function */
    CSKY_TANH = 1,
} csky_vdsp2_nn_activation_type;
29 
30   /**
31    * @brief Basic Q7 convolution function
32    * @param[in]       Im_in       pointer to input tensor
33    * @param[in]       dim_im_in   input tensor dimention
34    * @param[in]       ch_im_in    number of input tensor channels
35    * @param[in]       wt          pointer to kernel weights
36    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
37    * @param[in]       dim_kernel  filter kernel size
38    * @param[in]       padding     padding sizes
39    * @param[in]       stride      convolution stride
40    * @param[in]       bias        pointer to bias
41    * @param[in]       bias_shift  amount of left-shift for bias
42    * @param[in]       out_shift   amount of right-shift for output
43    * @param[in,out]   Im_out      pointer to output tensor
44    * @param[in]       dim_im_out  output tensor dimension
45    * @param[in,out]   bufferA     pointer to buffer space for input
46    * @return          none.
47    *
48    */
49 
50 void csky_vdsp2_convolve_HWC_q7_basic(const q7_t * Im_in,
51                                        const uint16_t dim_im_in,
52                                        const uint16_t ch_im_in,
53                                        const q7_t * wt,
54                                        const uint16_t ch_im_out,
55                                        const uint16_t dim_kernel,
56                                        const uint16_t padding,
57                                        const uint16_t stride,
58                                        const q7_t * bias,
59                                        const uint16_t bias_shift,
60                                        const uint16_t out_shift,
61                                        q7_t * Im_out,
62                                        const uint16_t dim_im_out,
63                                        q15_t * bufferA);
64 
65   /**
66    * @brief Basic Q15 convolution function
67    * @param[in]       Im_in       pointer to input tensor
68    * @param[in]       dim_im_in   input tensor dimention
69    * @param[in]       ch_im_in    number of input tensor channels
70    * @param[in]       wt          pointer to kernel weights
71    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
72    * @param[in]       dim_kernel  filter kernel size
73    * @param[in]       padding     padding sizes
74    * @param[in]       stride      convolution stride
75    * @param[in]       bias        pointer to bias
76    * @param[in]       bias_shift  amount of left-shift for bias
77    * @param[in]       out_shift   amount of right-shift for output
78    * @param[in,out]   Im_out      pointer to output tensor
79    * @param[in]       dim_im_out  output tensor dimension
80    * @param[in,out]   bufferA     pointer to buffer space for input
81    * @return          none.
82    *
83    */
84 
85 void csky_vdsp2_convolve_HWC_q15_basic(const q15_t * Im_in,
86                                         const uint16_t dim_im_in,
87                                         const uint16_t ch_im_in,
88                                         const q15_t * wt,
89                                         const uint16_t ch_im_out,
90                                         const uint16_t dim_kernel,
91                                         const uint16_t padding,
92                                         const uint16_t stride,
93                                         const q15_t * bias,
94                                         const uint16_t bias_shift,
95                                         const uint16_t out_shift,
96                                         q15_t * Im_out,
97                                         const uint16_t dim_im_out,
98                                         q15_t * bufferA);
99 
100 
101   /**
102    * @brief Fast Q7 convolution function (non-sqaure shape)
103    * @param[in]       Im_in        pointer to input tensor
104    * @param[in]       dim_im_in_x  input tensor dimention x
105    * @param[in]       dim_im_in_y  input tensor dimention y
106    * @param[in]       ch_im_in     number of input tensor channels
107    * @param[in]       wt           pointer to kernel weights
108    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
109    * @param[in]       dim_kernel_x filter kernel size x
110    * @param[in]       dim_kernel_y filter kernel size y
111    * @param[in]       padding_x    padding size x
112    * @param[in]       padding_y    padding size y
113    * @param[in]       stride_x     convolution stride x
114    * @param[in]       stride_y     convolution stride y
115    * @param[in]       bias         pointer to bias
116    * @param[in]       bias_shift   amount of left-shift for bias
117    * @param[in]       out_shift    amount of right-shift for output
118    * @param[in,out]   Im_out       pointer to output tensor
119    * @param[in]       dim_im_out_x output tensor dimension x
120    * @param[in]       dim_im_out_y output tensor dimension y
121    * @param[in,out]   bufferA      pointer to buffer space for input
122    * @return          none.
123    *
124    * This function is the version with full list of optimization tricks, but with
125    * some contraints:
126    *   ch_im_in is multiple of 4
127    *   ch_im_out is multiple of 2
128    */
129 
130 void csky_vdsp2_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
131                                                 const uint16_t dim_im_in_x,
132                                                 const uint16_t dim_im_in_y,
133                                                 const uint16_t ch_im_in,
134                                                 const q7_t * wt,
135                                                 const uint16_t ch_im_out,
136                                                 const uint16_t dim_kernel_x,
137                                                 const uint16_t dim_kernel_y,
138                                                 const uint16_t padding_x,
139                                                 const uint16_t padding_y,
140                                                 const uint16_t stride_x,
141                                                 const uint16_t stride_y,
142                                                 const q7_t * bias,
143                                                 const uint16_t bias_shift,
144                                                 const uint16_t out_shift,
145                                                 q7_t * Im_out,
146                                                 const uint16_t dim_im_out_x,
147                                                 const uint16_t dim_im_out_y,
148                                                 q15_t * bufferA);
149 
150   /**
151    * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape)
152    * @param[in]       Im_in        pointer to input tensor
153    * @param[in]       dim_im_in_x  input tensor dimention x
154    * @param[in]       dim_im_in_y  input tensor dimention y
155    * @param[in]       ch_im_in     number of input tensor channels
156    * @param[in]       wt           pointer to kernel weights
157    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
158    * @param[in]       dim_kernel_x filter kernel size x
159    * @param[in]       dim_kernel_y filter kernel size y
160    * @param[in]       padding_x    padding size x
161    * @param[in]       padding_y    padding size y
162    * @param[in]       stride_x     convolution stride x
163    * @param[in]       stride_y     convolution stride y
164    * @param[in]       bias         pointer to bias
165    * @param[in]       bias_shift   amount of left-shift for bias
166    * @param[in]       out_shift    amount of right-shift for output
167    * @param[in,out]   Im_out       pointer to output tensor
168    * @param[in]       dim_im_out_x output tensor dimension x
169    * @param[in]       dim_im_out_y output tensor dimension y
170    * @param[in,out]   bufferA      pointer to buffer space for input
171    * @return          none.
172    *
173    * This function implement convolution with 1x1 kernel size (i.e., dim_kernel_x=1
174    * and dim_kernel_y=1). It can be used for
175    * second half of MobileNets after depthwise separable convolution.
176    *
177    * This function is the version with full list of optimization tricks, but with
178    * some contraints:
179    *   ch_im_in is multiple of 4
180    *   ch_im_out is multiple of 2
181    */
182 void csky_vdsp2_convolve_1x1_HWC_q7_fast(const q7_t * Im_in,
183                                          const uint16_t dim_im_in_x,
184                                          const uint16_t dim_im_in_y,
185                                          const uint16_t ch_im_in,
186                                          const q7_t * wt,
187                                          const uint16_t ch_im_out,
188                                          const q7_t * bias,
189                                          const uint16_t bias_shift,
190                                          const uint16_t out_shift,
191                                          q7_t * Im_out,
192                                          const uint16_t dim_im_out_x,
193                                          const uint16_t dim_im_out_y,
194                                          q15_t * bufferA);
195 
196   /**
197    * @brief Q7 version of convolution for RGB image
198    * @param[in]       Im_in       pointer to input tensor
199    * @param[in]       dim_im_in   input tensor dimention
200    * @param[in]       ch_im_in    number of input tensor channels
201    * @param[in]       wt          pointer to kernel weights
202    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
203    * @param[in]       dim_kernel  filter kernel size
204    * @param[in]       padding     padding sizes
205    * @param[in]       stride      convolution stride
206    * @param[in]       bias        pointer to bias
207    * @param[in]       bias_shift  amount of left-shift for bias
208    * @param[in]       out_shift   amount of right-shift for output
209    * @param[in,out]   Im_out      pointer to output tensor
210    * @param[in]       dim_im_out  output tensor dimension
211    * @param[in,out]   bufferA     pointer to buffer space for input
212    * @return          none.
213    *
214    * This kernel is written exclusively for convolution with ch_im_in
215    * equals 3. This applies on the first layer of CNNs which has input
216    * image with RGB format.
217    */
218 
219 void csky_vdsp2_convolve_HWC_q7_RGB(const q7_t * Im_in,
220                                      const uint16_t dim_im_in,
221                                      const q7_t * wt,
222                                      const uint16_t ch_im_out,
223                                      const uint16_t dim_kernel,
224                                      const uint16_t padding,
225                                      const uint16_t stride,
226                                      const q7_t * bias,
227                                      const uint16_t bias_shift,
228                                      const uint16_t out_shift,
229                                      q7_t * Im_out,
230                                      const uint16_t dim_im_out,
231                                      q15_t * bufferA);
232 
233 
234   /**
235    * @brief Q7 depthwise separable convolution function
236    * @param[in]       Im_in       pointer to input tensor
237    * @param[in]       dim_im_in   input tensor dimention
238    * @param[in]       ch_im_in    number of input tensor channels
239    * @param[in]       wt          pointer to kernel weights
240    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
241    * @param[in]       dim_kernel  filter kernel size
242    * @param[in]       padding     padding sizes
243    * @param[in]       stride      convolution stride
244    * @param[in]       bias        pointer to bias
245    * @param[in]       bias_shift  amount of left-shift for bias
246    * @param[in]       out_shift   amount of right-shift for output
247    * @param[in,out]   Im_out      pointer to output tensor
248    * @param[in]       dim_im_out  output tensor dimension
249    * @param[in,out]   bufferA     pointer to buffer space for input
250    * @return          none.
251    *
252    * This function is the version with full list of optimization tricks, but with
253    * some contraints:
254    *   ch_im_in is multiple of 2
255    *   ch_im_out is multiple of 2
256    */
257 
258 void csky_vdsp2_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
259                                                  const uint16_t dim_im_in,
260                                                  const uint16_t ch_im_in,
261                                                  const q7_t * wt,
262                                                  const uint16_t ch_im_out,
263                                                  const uint16_t dim_kernel,
264                                                  const uint16_t padding,
265                                                  const uint16_t stride,
266                                                  const q7_t * bias,
267                                                  const uint16_t bias_shift,
268                                                  const uint16_t out_shift,
269                                                  q7_t * Im_out,
270                                                  const uint16_t dim_im_out,
271                                                  q15_t * bufferA);
272 
273   /**
274    * @brief Q7 depthwise separable convolution function (non-square shape)
275    * @param[in]       Im_in         pointer to input tensor
276    * @param[in]       dim_im_in_x   input tensor dimention x
277    * @param[in]       dim_im_in_y   input tensor dimention y
278    * @param[in]       ch_im_in      number of input tensor channels
279    * @param[in]       wt            pointer to kernel weights
280    * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
281    * @param[in]       dim_kernel_x  filter kernel size x
282    * @param[in]       dim_kernel_y  filter kernel size y
283    * @param[in]       padding_x     padding sizes x
284    * @param[in]       padding_y     padding sizes y
285    * @param[in]       stride_x      convolution stride x
286    * @param[in]       stride_y      convolution stride y
287    * @param[in]       bias          pointer to bias
288    * @param[in]       bias_shift    amount of left-shift for bias
289    * @param[in]       out_shift     amount of right-shift for output
290    * @param[in,out]   Im_out        pointer to output tensor
291    * @param[in]       dim_im_out_x  output tensor dimension x
292    * @param[in]       dim_im_out_y  output tensor dimension y
293    * @param[in,out]   bufferA       pointer to buffer space for input
294    * @return          none.
295    *
296    * This function is the version with full list of optimization tricks, but with
297    * some contraints:
298    *   ch_im_in is multiple of 2
299    *   ch_im_out is multiple of 2
300    */
301 void csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
302                                                  const uint16_t dim_im_in_x,
303                                                  const uint16_t dim_im_in_y,
304                                                  const uint16_t ch_im_in,
305                                                  const q7_t * wt,
306                                                  const uint16_t ch_im_out,
307                                                  const uint16_t dim_kernel_x,
308                                                  const uint16_t dim_kernel_y,
309                                                  const uint16_t padding_x,
310                                                  const uint16_t padding_y,
311                                                  const uint16_t stride_x,
312                                                  const uint16_t stride_y,
313                                                  const q7_t * bias,
314                                                  const uint16_t bias_shift,
315                                                  const uint16_t out_shift,
316                                                  q7_t * Im_out,
317                                                  const uint16_t dim_im_out_x,
318                                                  const uint16_t dim_im_out_y,
319                                                  q15_t * bufferA);
320 
321 
322   /**
323    * @brief Q7 basic fully-connected layer function
324    * @param[in]       pV          pointer to input vector
325    * @param[in]       pM          pointer to matrix weights
326    * @param[in]       dim_vec     length of the vector
327    * @param[in]       num_of_rows number of rows in weight matrix
328    * @param[in]       bias_shift  amount of left-shift for bias
329    * @param[in]       out_shift   amount of right-shift for output
330    * @param[in]       bias        pointer to bias
331    * @param[in,out]   pOut        pointer to output vector
332    * @return          none.
333    */
334 
335 void csky_vdsp2_fully_connected_q7(const q7_t * pV,
336                                     const q7_t * pM,
337                                     const uint16_t dim_vec,
338                                     const uint16_t num_of_rows,
339                                     const uint16_t bias_shift,
340                                     const uint16_t out_shift,
341                                     const q7_t * bias,
342                                     q7_t * pOut);
343 
344 
345   /**
346    * @brief Q15 basic fully-connected layer function
347    * @param[in]       pV          pointer to input vector
348    * @param[in]       pM          pointer to matrix weights
349    * @param[in]       dim_vec     length of the vector
350    * @param[in]       num_of_rows number of rows in weight matrix
351    * @param[in]       bias_shift  amount of left-shift for bias
352    * @param[in]       out_shift   amount of right-shift for output
353    * @param[in]       bias        pointer to bias
354    * @param[in,out]   pOut        pointer to output vector
355    * @return          none.
356    *
357    */
358 
359 void csky_vdsp2_fully_connected_q15(const q15_t * pV,
360                                      const q15_t * pM,
361                                      const uint16_t dim_vec,
362                                      const uint16_t num_of_rows,
363                                      const uint16_t bias_shift,
364                                      const uint16_t out_shift,
365                                      const q15_t * bias,
366                                      q15_t * pOut);
367 
368 
369   /**
370    * @brief Mixed Q15-Q7 fully-connected layer function
371    * @param[in]       pV          pointer to input vector
372    * @param[in]       pM          pointer to matrix weights
373    * @param[in]       dim_vec     length of the vector
374    * @param[in]       num_of_rows number of rows in weight matrix
375    * @param[in]       bias_shift  amount of left-shift for bias
376    * @param[in]       out_shift   amount of right-shift for output
377    * @param[in]       bias        pointer to bias
378    * @param[in,out]   pOut        pointer to output vector
379    * @return          none.
380    *
381    */
382 
383 void csky_vdsp2_fully_connected_mat_q7_vec_q15(const q15_t * pV,
384                                                 const q7_t * pM,
385                                                 const uint16_t dim_vec,
386                                                 const uint16_t num_of_rows,
387                                                 const uint16_t bias_shift,
388                                                 const uint16_t out_shift,
389                                                 const q7_t * bias,
390                                                 q15_t * pOut);
391 
392 
393 
394   /**
395    * @brief Q7 RELU function
396    * @param[in,out]   data        pointer to input
397    * @param[in]       size        number of elements
398    * @return none.
399    */
400 
401 void csky_vdsp2_relu_q7(q7_t * data, uint16_t size);
402 
403   /**
404    * @brief Q15 RELU function
405    * @param[in,out]   data        pointer to input
406    * @param[in]       size        number of elements
407    * @return none.
408    */
409 
410 void csky_vdsp2_relu_q15(q15_t * data, uint16_t size);
411 
412   /**
413    * @brief Q7 neural network activation function using direct table look-up
414    * @param[in,out]   data        pointer to input
415    * @param[in]       size        number of elements
416    * @param[in]       int_width   bit-width of the integer part, assume to be smaller than 3
417    * @param[in]       type        type of activation functions
418    * @return none.
419    */
420 
421 void csky_vdsp2_nn_activations_direct_q7(q7_t * data, uint16_t size,
422                                    uint16_t int_width,
423                                    csky_vdsp2_nn_activation_type type);
424 
425   /**
426    * @brief Q15 neural network activation function using direct table look-up
427    * @param[in,out]   data        pointer to input
428    * @param[in]       size        number of elements
429    * @param[in]       int_width   bit-width of the integer part, assume to be smaller than 3
430    * @param[in]       type        type of activation functions
431    * @return none.
432    */
433 
434 void csky_vdsp2_nn_activations_direct_q15(q15_t * data, uint16_t size,
435                                     uint16_t int_width,
436                                     csky_vdsp2_nn_activation_type type);
437 
438   /**
439    * @brief Q7 max pooling function
440    * @param[in]       Im_in       pointer to input tensor
441    * @param[in]       dim_im_in   input tensor dimention
442    * @param[in]       ch_im_in    number of input tensor channels
443    * @param[in]       dim_kernel  filter kernel size
444    * @param[in]       padding     padding sizes
445    * @param[in]       stride      convolution stride
446    * @param[in]       dim_im_out  output tensor dimension
447    * @param[in,out]   bufferA     pointer to buffer space for input
448    * @param[in,out]   Im_out      pointer to output tensor
449    * @return none.
450    *
451    */
452 
453 void csky_vdsp2_maxpool_q7_HWC(q7_t * Im_in,
454                          const uint16_t dim_im_in,
455                          const uint16_t ch_im_in,
456                          const uint16_t dim_kernel,
457                          const uint16_t padding,
458                          const uint16_t stride,
459                          const uint16_t dim_im_out,
460                          q7_t * bufferA,
461                          q7_t * Im_out);
462 
463   /**
464    * @brief Q7 average pooling function
465    * @param[in]       Im_in       pointer to input tensor
466    * @param[in]       dim_im_in   input tensor dimention
467    * @param[in]       ch_im_in    number of input tensor channels
468    * @param[in]       dim_kernel  filter kernel size
469    * @param[in]       padding     padding sizes
470    * @param[in]       stride      convolution stride
471    * @param[in]       dim_im_out  output tensor dimension
472    * @param[in,out]   bufferA     pointer to buffer space for input
473    * @param[in,out]   Im_out      pointer to output tensor
474    * @return none.
475    *
476    */
477 
478 void csky_vdsp2_avepool_q7_HWC(q7_t * Im_in,
479                          const uint16_t dim_im_in,
480                          const uint16_t ch_im_in,
481                          const uint16_t dim_kernel,
482                          const uint16_t padding,
483                          const uint16_t stride,
484                          const uint16_t dim_im_out,
485                          q7_t * bufferA,
486                          q7_t * Im_out);
487 
488 
489   /**
490    * @brief Q7 softmax function
491    * @param[in]       vec_in      pointer to input vector
492    * @param[in]       dim_vec     input vector dimention
493    * @param[out]      p_out       pointer to output vector
494    * @return none.
495    *
496    */
497 
498 void csky_vdsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out);
499 
500   /**
501    * @brief Q15 softmax function
502    * @param[in]       vec_in      pointer to input vector
503    * @param[in]       dim_vec     input vector dimention
504    * @param[out]      p_out       pointer to output vector
505    * @return none.
506    *
507    */
508 
509 void csky_vdsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec,
510                       q15_t *p_out);
511 
512 #ifdef __cplusplus
513 }
514 #endif
515 
516 #endif
517