1 /* ---------------------------------------------------------------------- 2 * Title: csky_vdsp2_nnfunctions.h 3 * Description: Public header file for CSI NN Library 4 * 5 * -------------------------------------------------------------------- */ 6 7 #ifndef _CSKY_VDSP2_NNFUNCTIONS_H 8 #define _CSKY_VDSP2_NNFUNCTIONS_H 9 10 #ifdef __cplusplus 11 extern "C" 12 { 13 #endif 14 15 #ifdef CSKY_VDSP2_MATH_DSP 16 #include "csky_vdsp2_math.h" 17 #include "csky_vdsp2_nnsupportfunctions.h" 18 #endif 19 20 /** 21 * @brief Struct for specifying activation function types 22 * 23 */ 24 typedef enum 25 { 26 CSKY_SIGMOID = 0, /**< Sigmoid activation function */ 27 CSKY_TANH = 1, /**< Tanh activation function */ 28 } csky_vdsp2_nn_activation_type; 29 30 /** 31 * @brief Basic Q7 convolution function 32 * @param[in] Im_in pointer to input tensor 33 * @param[in] dim_im_in input tensor dimention 34 * @param[in] ch_im_in number of input tensor channels 35 * @param[in] wt pointer to kernel weights 36 * @param[in] ch_im_out number of filters, i.e., output tensor channels 37 * @param[in] dim_kernel filter kernel size 38 * @param[in] padding padding sizes 39 * @param[in] stride convolution stride 40 * @param[in] bias pointer to bias 41 * @param[in] bias_shift amount of left-shift for bias 42 * @param[in] out_shift amount of right-shift for output 43 * @param[in,out] Im_out pointer to output tensor 44 * @param[in] dim_im_out output tensor dimension 45 * @param[in,out] bufferA pointer to buffer space for input 46 * @return none. 47 * 48 */ 49 50 void csky_vdsp2_convolve_HWC_q7_basic(const q7_t * Im_in, 51 const uint16_t dim_im_in, 52 const uint16_t ch_im_in, 53 const q7_t * wt, 54 const uint16_t ch_im_out, 55 const uint16_t dim_kernel, 56 const uint16_t padding, 57 const uint16_t stride, 58 const q7_t * bias, 59 const uint16_t bias_shift, 60 const uint16_t out_shift, 61 q7_t * Im_out, 62 const uint16_t dim_im_out, 63 q15_t * bufferA); 64 65 /** 66 * @brief Basic Q15 convolution function 67 * @param[in] Im_in pointer to input tensor 68 * @param[in] dim_im_in input tensor dimention 69 * @param[in] ch_im_in number of input tensor channels 70 * @param[in] wt pointer to kernel weights 71 * @param[in] ch_im_out number of filters, i.e., output tensor channels 72 * @param[in] dim_kernel filter kernel size 73 * @param[in] padding padding sizes 74 * @param[in] stride convolution stride 75 * @param[in] bias pointer to bias 76 * @param[in] bias_shift amount of left-shift for bias 77 * @param[in] out_shift amount of right-shift for output 78 * @param[in,out] Im_out pointer to output tensor 79 * @param[in] dim_im_out output tensor dimension 80 * @param[in,out] bufferA pointer to buffer space for input 81 * @return none. 82 * 83 */ 84 85 void csky_vdsp2_convolve_HWC_q15_basic(const q15_t * Im_in, 86 const uint16_t dim_im_in, 87 const uint16_t ch_im_in, 88 const q15_t * wt, 89 const uint16_t ch_im_out, 90 const uint16_t dim_kernel, 91 const uint16_t padding, 92 const uint16_t stride, 93 const q15_t * bias, 94 const uint16_t bias_shift, 95 const uint16_t out_shift, 96 q15_t * Im_out, 97 const uint16_t dim_im_out, 98 q15_t * bufferA); 99 100 101 /** 102 * @brief Fast Q7 convolution function (non-sqaure shape) 103 * @param[in] Im_in pointer to input tensor 104 * @param[in] dim_im_in_x input tensor dimention x 105 * @param[in] dim_im_in_y input tensor dimention y 106 * @param[in] ch_im_in number of input tensor channels 107 * @param[in] wt pointer to kernel weights 108 * @param[in] ch_im_out number of filters, i.e., output tensor channels 109 * @param[in] dim_kernel_x filter kernel size x 110 * @param[in] dim_kernel_y filter kernel size y 111 * @param[in] padding_x padding size x 112 * @param[in] padding_y padding size y 113 * @param[in] stride_x convolution stride x 114 * @param[in] stride_y convolution stride y 115 * @param[in] bias pointer to bias 116 * @param[in] bias_shift amount of left-shift for bias 117 * @param[in] out_shift amount of right-shift for output 118 * @param[in,out] Im_out pointer to output tensor 119 * @param[in] dim_im_out_x output tensor dimension x 120 * @param[in] dim_im_out_y output tensor dimension y 121 * @param[in,out] bufferA pointer to buffer space for input 122 * @return none. 123 * 124 * This function is the version with full list of optimization tricks, but with 125 * some contraints: 126 * ch_im_in is multiple of 4 127 * ch_im_out is multiple of 2 128 */ 129 130 void csky_vdsp2_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, 131 const uint16_t dim_im_in_x, 132 const uint16_t dim_im_in_y, 133 const uint16_t ch_im_in, 134 const q7_t * wt, 135 const uint16_t ch_im_out, 136 const uint16_t dim_kernel_x, 137 const uint16_t dim_kernel_y, 138 const uint16_t padding_x, 139 const uint16_t padding_y, 140 const uint16_t stride_x, 141 const uint16_t stride_y, 142 const q7_t * bias, 143 const uint16_t bias_shift, 144 const uint16_t out_shift, 145 q7_t * Im_out, 146 const uint16_t dim_im_out_x, 147 const uint16_t dim_im_out_y, 148 q15_t * bufferA); 149 150 /** 151 * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) 152 * @param[in] Im_in pointer to input tensor 153 * @param[in] dim_im_in_x input tensor dimention x 154 * @param[in] dim_im_in_y input tensor dimention y 155 * @param[in] ch_im_in number of input tensor channels 156 * @param[in] wt pointer to kernel weights 157 * @param[in] ch_im_out number of filters, i.e., output tensor channels 158 * @param[in] dim_kernel_x filter kernel size x 159 * @param[in] dim_kernel_y filter kernel size y 160 * @param[in] padding_x padding size x 161 * @param[in] padding_y padding size y 162 * @param[in] stride_x convolution stride x 163 * @param[in] stride_y convolution stride y 164 * @param[in] bias pointer to bias 165 * @param[in] bias_shift amount of left-shift for bias 166 * @param[in] out_shift amount of right-shift for output 167 * @param[in,out] Im_out pointer to output tensor 168 * @param[in] dim_im_out_x output tensor dimension x 169 * @param[in] dim_im_out_y output tensor dimension y 170 * @param[in,out] bufferA pointer to buffer space for input 171 * @return none. 172 * 173 * This function implement convolution with 1x1 kernel size (i.e., dim_kernel_x=1 174 * and dim_kernel_y=1). It can be used for 175 * second half of MobileNets after depthwise separable convolution. 176 * 177 * This function is the version with full list of optimization tricks, but with 178 * some contraints: 179 * ch_im_in is multiple of 4 180 * ch_im_out is multiple of 2 181 */ 182 void csky_vdsp2_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, 183 const uint16_t dim_im_in_x, 184 const uint16_t dim_im_in_y, 185 const uint16_t ch_im_in, 186 const q7_t * wt, 187 const uint16_t ch_im_out, 188 const q7_t * bias, 189 const uint16_t bias_shift, 190 const uint16_t out_shift, 191 q7_t * Im_out, 192 const uint16_t dim_im_out_x, 193 const uint16_t dim_im_out_y, 194 q15_t * bufferA); 195 196 /** 197 * @brief Q7 version of convolution for RGB image 198 * @param[in] Im_in pointer to input tensor 199 * @param[in] dim_im_in input tensor dimention 200 * @param[in] ch_im_in number of input tensor channels 201 * @param[in] wt pointer to kernel weights 202 * @param[in] ch_im_out number of filters, i.e., output tensor channels 203 * @param[in] dim_kernel filter kernel size 204 * @param[in] padding padding sizes 205 * @param[in] stride convolution stride 206 * @param[in] bias pointer to bias 207 * @param[in] bias_shift amount of left-shift for bias 208 * @param[in] out_shift amount of right-shift for output 209 * @param[in,out] Im_out pointer to output tensor 210 * @param[in] dim_im_out output tensor dimension 211 * @param[in,out] bufferA pointer to buffer space for input 212 * @return none. 213 * 214 * This kernel is written exclusively for convolution with ch_im_in 215 * equals 3. This applies on the first layer of CNNs which has input 216 * image with RGB format. 217 */ 218 219 void csky_vdsp2_convolve_HWC_q7_RGB(const q7_t * Im_in, 220 const uint16_t dim_im_in, 221 const q7_t * wt, 222 const uint16_t ch_im_out, 223 const uint16_t dim_kernel, 224 const uint16_t padding, 225 const uint16_t stride, 226 const q7_t * bias, 227 const uint16_t bias_shift, 228 const uint16_t out_shift, 229 q7_t * Im_out, 230 const uint16_t dim_im_out, 231 q15_t * bufferA); 232 233 234 /** 235 * @brief Q7 depthwise separable convolution function 236 * @param[in] Im_in pointer to input tensor 237 * @param[in] dim_im_in input tensor dimention 238 * @param[in] ch_im_in number of input tensor channels 239 * @param[in] wt pointer to kernel weights 240 * @param[in] ch_im_out number of filters, i.e., output tensor channels 241 * @param[in] dim_kernel filter kernel size 242 * @param[in] padding padding sizes 243 * @param[in] stride convolution stride 244 * @param[in] bias pointer to bias 245 * @param[in] bias_shift amount of left-shift for bias 246 * @param[in] out_shift amount of right-shift for output 247 * @param[in,out] Im_out pointer to output tensor 248 * @param[in] dim_im_out output tensor dimension 249 * @param[in,out] bufferA pointer to buffer space for input 250 * @return none. 251 * 252 * This function is the version with full list of optimization tricks, but with 253 * some contraints: 254 * ch_im_in is multiple of 2 255 * ch_im_out is multiple of 2 256 */ 257 258 void csky_vdsp2_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, 259 const uint16_t dim_im_in, 260 const uint16_t ch_im_in, 261 const q7_t * wt, 262 const uint16_t ch_im_out, 263 const uint16_t dim_kernel, 264 const uint16_t padding, 265 const uint16_t stride, 266 const q7_t * bias, 267 const uint16_t bias_shift, 268 const uint16_t out_shift, 269 q7_t * Im_out, 270 const uint16_t dim_im_out, 271 q15_t * bufferA); 272 273 /** 274 * @brief Q7 depthwise separable convolution function (non-square shape) 275 * @param[in] Im_in pointer to input tensor 276 * @param[in] dim_im_in_x input tensor dimention x 277 * @param[in] dim_im_in_y input tensor dimention y 278 * @param[in] ch_im_in number of input tensor channels 279 * @param[in] wt pointer to kernel weights 280 * @param[in] ch_im_out number of filters, i.e., output tensor channels 281 * @param[in] dim_kernel_x filter kernel size x 282 * @param[in] dim_kernel_y filter kernel size y 283 * @param[in] padding_x padding sizes x 284 * @param[in] padding_y padding sizes y 285 * @param[in] stride_x convolution stride x 286 * @param[in] stride_y convolution stride y 287 * @param[in] bias pointer to bias 288 * @param[in] bias_shift amount of left-shift for bias 289 * @param[in] out_shift amount of right-shift for output 290 * @param[in,out] Im_out pointer to output tensor 291 * @param[in] dim_im_out_x output tensor dimension x 292 * @param[in] dim_im_out_y output tensor dimension y 293 * @param[in,out] bufferA pointer to buffer space for input 294 * @return none. 295 * 296 * This function is the version with full list of optimization tricks, but with 297 * some contraints: 298 * ch_im_in is multiple of 2 299 * ch_im_out is multiple of 2 300 */ 301 void csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, 302 const uint16_t dim_im_in_x, 303 const uint16_t dim_im_in_y, 304 const uint16_t ch_im_in, 305 const q7_t * wt, 306 const uint16_t ch_im_out, 307 const uint16_t dim_kernel_x, 308 const uint16_t dim_kernel_y, 309 const uint16_t padding_x, 310 const uint16_t padding_y, 311 const uint16_t stride_x, 312 const uint16_t stride_y, 313 const q7_t * bias, 314 const uint16_t bias_shift, 315 const uint16_t out_shift, 316 q7_t * Im_out, 317 const uint16_t dim_im_out_x, 318 const uint16_t dim_im_out_y, 319 q15_t * bufferA); 320 321 322 /** 323 * @brief Q7 basic fully-connected layer function 324 * @param[in] pV pointer to input vector 325 * @param[in] pM pointer to matrix weights 326 * @param[in] dim_vec length of the vector 327 * @param[in] num_of_rows number of rows in weight matrix 328 * @param[in] bias_shift amount of left-shift for bias 329 * @param[in] out_shift amount of right-shift for output 330 * @param[in] bias pointer to bias 331 * @param[in,out] pOut pointer to output vector 332 * @return none. 333 */ 334 335 void csky_vdsp2_fully_connected_q7(const q7_t * pV, 336 const q7_t * pM, 337 const uint16_t dim_vec, 338 const uint16_t num_of_rows, 339 const uint16_t bias_shift, 340 const uint16_t out_shift, 341 const q7_t * bias, 342 q7_t * pOut); 343 344 345 /** 346 * @brief Q15 basic fully-connected layer function 347 * @param[in] pV pointer to input vector 348 * @param[in] pM pointer to matrix weights 349 * @param[in] dim_vec length of the vector 350 * @param[in] num_of_rows number of rows in weight matrix 351 * @param[in] bias_shift amount of left-shift for bias 352 * @param[in] out_shift amount of right-shift for output 353 * @param[in] bias pointer to bias 354 * @param[in,out] pOut pointer to output vector 355 * @return none. 356 * 357 */ 358 359 void csky_vdsp2_fully_connected_q15(const q15_t * pV, 360 const q15_t * pM, 361 const uint16_t dim_vec, 362 const uint16_t num_of_rows, 363 const uint16_t bias_shift, 364 const uint16_t out_shift, 365 const q15_t * bias, 366 q15_t * pOut); 367 368 369 /** 370 * @brief Mixed Q15-Q7 fully-connected layer function 371 * @param[in] pV pointer to input vector 372 * @param[in] pM pointer to matrix weights 373 * @param[in] dim_vec length of the vector 374 * @param[in] num_of_rows number of rows in weight matrix 375 * @param[in] bias_shift amount of left-shift for bias 376 * @param[in] out_shift amount of right-shift for output 377 * @param[in] bias pointer to bias 378 * @param[in,out] pOut pointer to output vector 379 * @return none. 380 * 381 */ 382 383 void csky_vdsp2_fully_connected_mat_q7_vec_q15(const q15_t * pV, 384 const q7_t * pM, 385 const uint16_t dim_vec, 386 const uint16_t num_of_rows, 387 const uint16_t bias_shift, 388 const uint16_t out_shift, 389 const q7_t * bias, 390 q15_t * pOut); 391 392 393 394 /** 395 * @brief Q7 RELU function 396 * @param[in,out] data pointer to input 397 * @param[in] size number of elements 398 * @return none. 399 */ 400 401 void csky_vdsp2_relu_q7(q7_t * data, uint16_t size); 402 403 /** 404 * @brief Q15 RELU function 405 * @param[in,out] data pointer to input 406 * @param[in] size number of elements 407 * @return none. 408 */ 409 410 void csky_vdsp2_relu_q15(q15_t * data, uint16_t size); 411 412 /** 413 * @brief Q7 neural network activation function using direct table look-up 414 * @param[in,out] data pointer to input 415 * @param[in] size number of elements 416 * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 417 * @param[in] type type of activation functions 418 * @return none. 419 */ 420 421 void csky_vdsp2_nn_activations_direct_q7(q7_t * data, uint16_t size, 422 uint16_t int_width, 423 csky_vdsp2_nn_activation_type type); 424 425 /** 426 * @brief Q15 neural network activation function using direct table look-up 427 * @param[in,out] data pointer to input 428 * @param[in] size number of elements 429 * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 430 * @param[in] type type of activation functions 431 * @return none. 432 */ 433 434 void csky_vdsp2_nn_activations_direct_q15(q15_t * data, uint16_t size, 435 uint16_t int_width, 436 csky_vdsp2_nn_activation_type type); 437 438 /** 439 * @brief Q7 max pooling function 440 * @param[in] Im_in pointer to input tensor 441 * @param[in] dim_im_in input tensor dimention 442 * @param[in] ch_im_in number of input tensor channels 443 * @param[in] dim_kernel filter kernel size 444 * @param[in] padding padding sizes 445 * @param[in] stride convolution stride 446 * @param[in] dim_im_out output tensor dimension 447 * @param[in,out] bufferA pointer to buffer space for input 448 * @param[in,out] Im_out pointer to output tensor 449 * @return none. 450 * 451 */ 452 453 void csky_vdsp2_maxpool_q7_HWC(q7_t * Im_in, 454 const uint16_t dim_im_in, 455 const uint16_t ch_im_in, 456 const uint16_t dim_kernel, 457 const uint16_t padding, 458 const uint16_t stride, 459 const uint16_t dim_im_out, 460 q7_t * bufferA, 461 q7_t * Im_out); 462 463 /** 464 * @brief Q7 average pooling function 465 * @param[in] Im_in pointer to input tensor 466 * @param[in] dim_im_in input tensor dimention 467 * @param[in] ch_im_in number of input tensor channels 468 * @param[in] dim_kernel filter kernel size 469 * @param[in] padding padding sizes 470 * @param[in] stride convolution stride 471 * @param[in] dim_im_out output tensor dimension 472 * @param[in,out] bufferA pointer to buffer space for input 473 * @param[in,out] Im_out pointer to output tensor 474 * @return none. 475 * 476 */ 477 478 void csky_vdsp2_avepool_q7_HWC(q7_t * Im_in, 479 const uint16_t dim_im_in, 480 const uint16_t ch_im_in, 481 const uint16_t dim_kernel, 482 const uint16_t padding, 483 const uint16_t stride, 484 const uint16_t dim_im_out, 485 q7_t * bufferA, 486 q7_t * Im_out); 487 488 489 /** 490 * @brief Q7 softmax function 491 * @param[in] vec_in pointer to input vector 492 * @param[in] dim_vec input vector dimention 493 * @param[out] p_out pointer to output vector 494 * @return none. 495 * 496 */ 497 498 void csky_vdsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); 499 500 /** 501 * @brief Q15 softmax function 502 * @param[in] vec_in pointer to input vector 503 * @param[in] dim_vec input vector dimention 504 * @param[out] p_out pointer to output vector 505 * @return none. 506 * 507 */ 508 509 void csky_vdsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, 510 q15_t *p_out); 511 512 #ifdef __cplusplus 513 } 514 #endif 515 516 #endif 517