1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_f32.c
4  * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
5  *
6  * $Date:        27. January 2017
7  * $Revision:    V.1.5.1
8  *
9  * Target Processor: Cortex-M cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "arm_math.h"
30 #include "arm_common_tables.h"
31 
32 extern void arm_radix8_butterfly_f32(
33     float32_t * pSrc,
34     uint16_t fftLen,
35     const float32_t * pCoef,
36     uint16_t twidCoefModifier);
37 
38 extern void arm_bitreversal_32(
39     uint32_t * pSrc,
40     const uint16_t bitRevLen,
41     const uint16_t * pBitRevTable);
42 
43 /**
44 * @ingroup groupTransforms
45 */
46 
47 /**
48 * @defgroup ComplexFFT Complex FFT Functions
49 *
50 * \par
51 * The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
52 * Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
53 * than the DFT, especially for long lengths.
54 * The algorithms described in this section
55 * operate on complex data.  A separate set of functions is devoted to handling
56 * of real sequences.
57 * \par
58 * There are separate algorithms for handling floating-point, Q15, and Q31 data
59 * types.  The algorithms available for each data type are described next.
60 * \par
61 * The FFT functions operate in-place.  That is, the array holding the input data
62 * will also be used to hold the corresponding result.  The input data is complex
63 * and contains <code>2*fftLen</code> interleaved values as shown below.
64 * <pre> {real[0], imag[0], real[1], imag[1],..} </pre>
65 * The FFT result will be contained in the same array and the frequency domain
66 * values will have the same interleaving.
67 *
68 * \par Floating-point
69 * The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
70 * stages are performed along with a single radix-2 or radix-4 stage, as needed.
71 * The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
72 * a different twiddle factor table.
73 * \par
74 * The function uses the standard FFT definition and output values may grow by a
75 * factor of <code>fftLen</code> when computing the forward transform.  The
76 * inverse transform includes a scale of <code>1/fftLen</code> as part of the
77 * calculation and this matches the textbook definition of the inverse FFT.
78 * \par
79 * Pre-initialized data structures containing twiddle factors and bit reversal
80 * tables are provided and defined in <code>arm_const_structs.h</code>.  Include
81 * this header in your function and then pass one of the constant structures as
82 * an argument to arm_cfft_f32.  For example:
83 * \par
84 * <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
85 * \par
86 * computes a 64-point inverse complex FFT including bit reversal.
87 * The data structures are treated as constant data and not modified during the
88 * calculation.  The same data structure can be reused for multiple transforms
89 * including mixing forward and inverse transforms.
90 * \par
91 * Earlier releases of the library provided separate radix-2 and radix-4
92 * algorithms that operated on floating-point data.  These functions are still
93 * provided but are deprecated.  The older functions are slower and less general
94 * than the new functions.
95 * \par
96 * An example of initialization of the constants for the arm_cfft_f32 function follows:
97 * \code
98 * const static arm_cfft_instance_f32 *S;
99 * ...
100 *   switch (length) {
101 *     case 16:
102 *       S = &arm_cfft_sR_f32_len16;
103 *       break;
104 *     case 32:
105 *       S = &arm_cfft_sR_f32_len32;
106 *       break;
107 *     case 64:
108 *       S = &arm_cfft_sR_f32_len64;
109 *       break;
110 *     case 128:
111 *       S = &arm_cfft_sR_f32_len128;
112 *       break;
113 *     case 256:
114 *       S = &arm_cfft_sR_f32_len256;
115 *       break;
116 *     case 512:
117 *       S = &arm_cfft_sR_f32_len512;
118 *       break;
119 *     case 1024:
120 *       S = &arm_cfft_sR_f32_len1024;
121 *       break;
122 *     case 2048:
123 *       S = &arm_cfft_sR_f32_len2048;
124 *       break;
125 *     case 4096:
126 *       S = &arm_cfft_sR_f32_len4096;
127 *       break;
128 *   }
129 * \endcode
130 * \par Q15 and Q31
131 * The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
132 * stages are performed along with a single radix-2 stage, as needed.
133 * The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
134 * a different twiddle factor table.
135 * \par
136 * The function uses the standard FFT definition and output values may grow by a
137 * factor of <code>fftLen</code> when computing the forward transform.  The
138 * inverse transform includes a scale of <code>1/fftLen</code> as part of the
139 * calculation and this matches the textbook definition of the inverse FFT.
140 * \par
141 * Pre-initialized data structures containing twiddle factors and bit reversal
142 * tables are provided and defined in <code>arm_const_structs.h</code>.  Include
143 * this header in your function and then pass one of the constant structures as
144 * an argument to arm_cfft_q31.  For example:
145 * \par
146 * <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
147 * \par
148 * computes a 64-point inverse complex FFT including bit reversal.
149 * The data structures are treated as constant data and not modified during the
150 * calculation.  The same data structure can be reused for multiple transforms
151 * including mixing forward and inverse transforms.
152 * \par
153 * Earlier releases of the library provided separate radix-2 and radix-4
154 * algorithms that operated on floating-point data.  These functions are still
155 * provided but are deprecated.  The older functions are slower and less general
156 * than the new functions.
157 * \par
158 * An example of initialization of the constants for the arm_cfft_q31 function follows:
159 * \code
160 * const static arm_cfft_instance_q31 *S;
161 * ...
162 *   switch (length) {
163 *     case 16:
164 *       S = &arm_cfft_sR_q31_len16;
165 *       break;
166 *     case 32:
167 *       S = &arm_cfft_sR_q31_len32;
168 *       break;
169 *     case 64:
170 *       S = &arm_cfft_sR_q31_len64;
171 *       break;
172 *     case 128:
173 *       S = &arm_cfft_sR_q31_len128;
174 *       break;
175 *     case 256:
176 *       S = &arm_cfft_sR_q31_len256;
177 *       break;
178 *     case 512:
179 *       S = &arm_cfft_sR_q31_len512;
180 *       break;
181 *     case 1024:
182 *       S = &arm_cfft_sR_q31_len1024;
183 *       break;
184 *     case 2048:
185 *       S = &arm_cfft_sR_q31_len2048;
186 *       break;
187 *     case 4096:
188 *       S = &arm_cfft_sR_q31_len4096;
189 *       break;
190 *   }
191 * \endcode
192 *
193 */
194 
arm_cfft_radix8by2_f32(arm_cfft_instance_f32 * S,float32_t * p1)195 void arm_cfft_radix8by2_f32( arm_cfft_instance_f32 * S, float32_t * p1)
196 {
197     uint32_t    L  = S->fftLen;
198     float32_t * pCol1, * pCol2, * pMid1, * pMid2;
199     float32_t * p2 = p1 + L;
200     const float32_t * tw = (float32_t *) S->pTwiddle;
201     float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
202     float32_t m0, m1, m2, m3;
203     uint32_t l;
204 
205     pCol1 = p1;
206     pCol2 = p2;
207 
208     //    Define new length
209     L >>= 1;
210     //    Initialize mid pointers
211     pMid1 = p1 + L;
212     pMid2 = p2 + L;
213 
214     // do two dot Fourier transform
215     for ( l = L >> 2; l > 0; l-- )
216     {
217         t1[0] = p1[0];
218         t1[1] = p1[1];
219         t1[2] = p1[2];
220         t1[3] = p1[3];
221 
222         t2[0] = p2[0];
223         t2[1] = p2[1];
224         t2[2] = p2[2];
225         t2[3] = p2[3];
226 
227         t3[0] = pMid1[0];
228         t3[1] = pMid1[1];
229         t3[2] = pMid1[2];
230         t3[3] = pMid1[3];
231 
232         t4[0] = pMid2[0];
233         t4[1] = pMid2[1];
234         t4[2] = pMid2[2];
235         t4[3] = pMid2[3];
236 
237         *p1++ = t1[0] + t2[0];
238         *p1++ = t1[1] + t2[1];
239         *p1++ = t1[2] + t2[2];
240         *p1++ = t1[3] + t2[3];    // col 1
241 
242         t2[0] = t1[0] - t2[0];
243         t2[1] = t1[1] - t2[1];
244         t2[2] = t1[2] - t2[2];
245         t2[3] = t1[3] - t2[3];    // for col 2
246 
247         *pMid1++ = t3[0] + t4[0];
248         *pMid1++ = t3[1] + t4[1];
249         *pMid1++ = t3[2] + t4[2];
250         *pMid1++ = t3[3] + t4[3]; // col 1
251 
252         t4[0] = t4[0] - t3[0];
253         t4[1] = t4[1] - t3[1];
254         t4[2] = t4[2] - t3[2];
255         t4[3] = t4[3] - t3[3];    // for col 2
256 
257         twR = *tw++;
258         twI = *tw++;
259 
260         // multiply by twiddle factors
261         m0 = t2[0] * twR;
262         m1 = t2[1] * twI;
263         m2 = t2[1] * twR;
264         m3 = t2[0] * twI;
265 
266         // R  =  R  *  Tr - I * Ti
267         *p2++ = m0 + m1;
268         // I  =  I  *  Tr + R * Ti
269         *p2++ = m2 - m3;
270 
271         // use vertical symmetry
272         //  0.9988 - 0.0491i <==> -0.0491 - 0.9988i
273         m0 = t4[0] * twI;
274         m1 = t4[1] * twR;
275         m2 = t4[1] * twI;
276         m3 = t4[0] * twR;
277 
278         *pMid2++ = m0 - m1;
279         *pMid2++ = m2 + m3;
280 
281         twR = *tw++;
282         twI = *tw++;
283 
284         m0 = t2[2] * twR;
285         m1 = t2[3] * twI;
286         m2 = t2[3] * twR;
287         m3 = t2[2] * twI;
288 
289         *p2++ = m0 + m1;
290         *p2++ = m2 - m3;
291 
292         m0 = t4[2] * twI;
293         m1 = t4[3] * twR;
294         m2 = t4[3] * twI;
295         m3 = t4[2] * twR;
296 
297         *pMid2++ = m0 - m1;
298         *pMid2++ = m2 + m3;
299     }
300 
301     // first col
302     arm_radix8_butterfly_f32( pCol1, L, (float32_t *) S->pTwiddle, 2u);
303     // second col
304     arm_radix8_butterfly_f32( pCol2, L, (float32_t *) S->pTwiddle, 2u);
305 }
306 
arm_cfft_radix8by4_f32(arm_cfft_instance_f32 * S,float32_t * p1)307 void arm_cfft_radix8by4_f32( arm_cfft_instance_f32 * S, float32_t * p1)
308 {
309     uint32_t    L  = S->fftLen >> 1;
310     float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
311     const float32_t *tw2, *tw3, *tw4;
312     float32_t * p2 = p1 + L;
313     float32_t * p3 = p2 + L;
314     float32_t * p4 = p3 + L;
315     float32_t t2[4], t3[4], t4[4], twR, twI;
316     float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
317     float32_t m0, m1, m2, m3;
318     uint32_t l, twMod2, twMod3, twMod4;
319 
320     pCol1 = p1;         // points to real values by default
321     pCol2 = p2;
322     pCol3 = p3;
323     pCol4 = p4;
324     pEnd1 = p2 - 1;     // points to imaginary values by default
325     pEnd2 = p3 - 1;
326     pEnd3 = p4 - 1;
327     pEnd4 = pEnd3 + L;
328 
329     tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;
330 
331     L >>= 1;
332 
333     // do four dot Fourier transform
334 
335     twMod2 = 2;
336     twMod3 = 4;
337     twMod4 = 6;
338 
339     // TOP
340     p1ap3_0 = p1[0] + p3[0];
341     p1sp3_0 = p1[0] - p3[0];
342     p1ap3_1 = p1[1] + p3[1];
343     p1sp3_1 = p1[1] - p3[1];
344 
345     // col 2
346     t2[0] = p1sp3_0 + p2[1] - p4[1];
347     t2[1] = p1sp3_1 - p2[0] + p4[0];
348     // col 3
349     t3[0] = p1ap3_0 - p2[0] - p4[0];
350     t3[1] = p1ap3_1 - p2[1] - p4[1];
351     // col 4
352     t4[0] = p1sp3_0 - p2[1] + p4[1];
353     t4[1] = p1sp3_1 + p2[0] - p4[0];
354     // col 1
355     *p1++ = p1ap3_0 + p2[0] + p4[0];
356     *p1++ = p1ap3_1 + p2[1] + p4[1];
357 
358     // Twiddle factors are ones
359     *p2++ = t2[0];
360     *p2++ = t2[1];
361     *p3++ = t3[0];
362     *p3++ = t3[1];
363     *p4++ = t4[0];
364     *p4++ = t4[1];
365 
366     tw2 += twMod2;
367     tw3 += twMod3;
368     tw4 += twMod4;
369 
370     for (l = (L - 2) >> 1; l > 0; l-- )
371     {
372         // TOP
373         p1ap3_0 = p1[0] + p3[0];
374         p1sp3_0 = p1[0] - p3[0];
375         p1ap3_1 = p1[1] + p3[1];
376         p1sp3_1 = p1[1] - p3[1];
377         // col 2
378         t2[0] = p1sp3_0 + p2[1] - p4[1];
379         t2[1] = p1sp3_1 - p2[0] + p4[0];
380         // col 3
381         t3[0] = p1ap3_0 - p2[0] - p4[0];
382         t3[1] = p1ap3_1 - p2[1] - p4[1];
383         // col 4
384         t4[0] = p1sp3_0 - p2[1] + p4[1];
385         t4[1] = p1sp3_1 + p2[0] - p4[0];
386         // col 1 - top
387         *p1++ = p1ap3_0 + p2[0] + p4[0];
388         *p1++ = p1ap3_1 + p2[1] + p4[1];
389 
390         // BOTTOM
391         p1ap3_1 = pEnd1[-1] + pEnd3[-1];
392         p1sp3_1 = pEnd1[-1] - pEnd3[-1];
393         p1ap3_0 = pEnd1[0] + pEnd3[0];
394         p1sp3_0 = pEnd1[0] - pEnd3[0];
395         // col 2
396         t2[2] = pEnd2[0]  - pEnd4[0] + p1sp3_1;
397         t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
398         // col 3
399         t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
400         t3[3] = p1ap3_0 - pEnd2[0]  - pEnd4[0];
401         // col 4
402         t4[2] = pEnd2[0]  - pEnd4[0]  - p1sp3_1;
403         t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
404         // col 1 - Bottom
405         *pEnd1-- = p1ap3_0 + pEnd2[0] + pEnd4[0];
406         *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];
407 
408         // COL 2
409         // read twiddle factors
410         twR = *tw2++;
411         twI = *tw2++;
412         // multiply by twiddle factors
413         //  let    Z1 = a + i(b),   Z2 = c + i(d)
414         //   =>  Z1 * Z2  =  (a*c - b*d) + i(b*c + a*d)
415 
416         // Top
417         m0 = t2[0] * twR;
418         m1 = t2[1] * twI;
419         m2 = t2[1] * twR;
420         m3 = t2[0] * twI;
421 
422         *p2++ = m0 + m1;
423         *p2++ = m2 - m3;
424         // use vertical symmetry col 2
425         // 0.9997 - 0.0245i  <==>  0.0245 - 0.9997i
426         // Bottom
427         m0 = t2[3] * twI;
428         m1 = t2[2] * twR;
429         m2 = t2[2] * twI;
430         m3 = t2[3] * twR;
431 
432         *pEnd2-- = m0 - m1;
433         *pEnd2-- = m2 + m3;
434 
435         // COL 3
436         twR = tw3[0];
437         twI = tw3[1];
438         tw3 += twMod3;
439         // Top
440         m0 = t3[0] * twR;
441         m1 = t3[1] * twI;
442         m2 = t3[1] * twR;
443         m3 = t3[0] * twI;
444 
445         *p3++ = m0 + m1;
446         *p3++ = m2 - m3;
447         // use vertical symmetry col 3
448         // 0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i
449         // Bottom
450         m0 = -t3[3] * twR;
451         m1 = t3[2] * twI;
452         m2 = t3[2] * twR;
453         m3 = t3[3] * twI;
454 
455         *pEnd3-- = m0 - m1;
456         *pEnd3-- = m3 - m2;
457 
458         // COL 4
459         twR = tw4[0];
460         twI = tw4[1];
461         tw4 += twMod4;
462         // Top
463         m0 = t4[0] * twR;
464         m1 = t4[1] * twI;
465         m2 = t4[1] * twR;
466         m3 = t4[0] * twI;
467 
468         *p4++ = m0 + m1;
469         *p4++ = m2 - m3;
470         // use vertical symmetry col 4
471         // 0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i
472         // Bottom
473         m0 = t4[3] * twI;
474         m1 = t4[2] * twR;
475         m2 = t4[2] * twI;
476         m3 = t4[3] * twR;
477 
478         *pEnd4-- = m0 - m1;
479         *pEnd4-- = m2 + m3;
480     }
481 
482     //MIDDLE
483     // Twiddle factors are
484     //  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i
485     p1ap3_0 = p1[0] + p3[0];
486     p1sp3_0 = p1[0] - p3[0];
487     p1ap3_1 = p1[1] + p3[1];
488     p1sp3_1 = p1[1] - p3[1];
489 
490     // col 2
491     t2[0] = p1sp3_0 + p2[1] - p4[1];
492     t2[1] = p1sp3_1 - p2[0] + p4[0];
493     // col 3
494     t3[0] = p1ap3_0 - p2[0] - p4[0];
495     t3[1] = p1ap3_1 - p2[1] - p4[1];
496     // col 4
497     t4[0] = p1sp3_0 - p2[1] + p4[1];
498     t4[1] = p1sp3_1 + p2[0] - p4[0];
499     // col 1 - Top
500     *p1++ = p1ap3_0 + p2[0] + p4[0];
501     *p1++ = p1ap3_1 + p2[1] + p4[1];
502 
503     // COL 2
504     twR = tw2[0];
505     twI = tw2[1];
506 
507     m0 = t2[0] * twR;
508     m1 = t2[1] * twI;
509     m2 = t2[1] * twR;
510     m3 = t2[0] * twI;
511 
512     *p2++ = m0 + m1;
513     *p2++ = m2 - m3;
514     // COL 3
515     twR = tw3[0];
516     twI = tw3[1];
517 
518     m0 = t3[0] * twR;
519     m1 = t3[1] * twI;
520     m2 = t3[1] * twR;
521     m3 = t3[0] * twI;
522 
523     *p3++ = m0 + m1;
524     *p3++ = m2 - m3;
525     // COL 4
526     twR = tw4[0];
527     twI = tw4[1];
528 
529     m0 = t4[0] * twR;
530     m1 = t4[1] * twI;
531     m2 = t4[1] * twR;
532     m3 = t4[0] * twI;
533 
534     *p4++ = m0 + m1;
535     *p4++ = m2 - m3;
536 
537     // first col
538     arm_radix8_butterfly_f32( pCol1, L, (float32_t *) S->pTwiddle, 4u);
539     // second col
540     arm_radix8_butterfly_f32( pCol2, L, (float32_t *) S->pTwiddle, 4u);
541     // third col
542     arm_radix8_butterfly_f32( pCol3, L, (float32_t *) S->pTwiddle, 4u);
543     // fourth col
544     arm_radix8_butterfly_f32( pCol4, L, (float32_t *) S->pTwiddle, 4u);
545 }
546 
547 /**
548 * @addtogroup ComplexFFT
549 * @{
550 */
551 
552 /**
553 * @details
554 * @brief       Processing function for the floating-point complex FFT.
555 * @param[in]      *S    points to an instance of the floating-point CFFT structure.
556 * @param[in, out] *p1   points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.
557 * @param[in]     ifftFlag       flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
558 * @param[in]     bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
559 * @return none.
560 */
561 
arm_cfft_f32(const arm_cfft_instance_f32 * S,float32_t * p1,uint8_t ifftFlag,uint8_t bitReverseFlag)562 void arm_cfft_f32(
563     const arm_cfft_instance_f32 * S,
564     float32_t * p1,
565     uint8_t ifftFlag,
566     uint8_t bitReverseFlag)
567 {
568     uint32_t  L = S->fftLen, l;
569     float32_t invL, * pSrc;
570 
571     if (ifftFlag == 1u)
572     {
573         /*  Conjugate input data  */
574         pSrc = p1 + 1;
575         for(l=0; l<L; l++)
576         {
577             *pSrc = -*pSrc;
578             pSrc += 2;
579         }
580     }
581 
582     switch (L)
583     {
584     case 16:
585     case 128:
586     case 1024:
587         arm_cfft_radix8by2_f32  ( (arm_cfft_instance_f32 *) S, p1);
588         break;
589     case 32:
590     case 256:
591     case 2048:
592         arm_cfft_radix8by4_f32  ( (arm_cfft_instance_f32 *) S, p1);
593         break;
594     case 64:
595     case 512:
596     case 4096:
597         arm_radix8_butterfly_f32( p1, L, (float32_t *) S->pTwiddle, 1);
598         break;
599     }
600 
601     if ( bitReverseFlag )
602         arm_bitreversal_32((uint32_t*)p1,S->bitRevLength,S->pBitRevTable);
603 
604     if (ifftFlag == 1u)
605     {
606         invL = 1.0f/(float32_t)L;
607         /*  Conjugate and scale output data */
608         pSrc = p1;
609         for(l=0; l<L; l++)
610         {
611             *pSrc++ *=   invL ;
612             *pSrc  = -(*pSrc) * invL;
613             pSrc++;
614         }
615     }
616 }
617 
618 /**
619 * @} end of ComplexFFT group
620 */
621