1/*
2 * Copyright (C) 2017 Hangzhou C-SKY Microsystems co.,ltd.
3 *
4 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
5 * in this tarball.
6 */
7
/*
 * GET_FRONT_BITS reg, amt
 *
 * Shift \reg by \amt bits so that the bytes which come FIRST in memory
 * move toward bit 0 of the register.  On a little-endian core the
 * lower-addressed bytes live in the low-order bits, so the register is
 * shifted right; on big-endian they live in the high-order bits, so it
 * is shifted left.  \amt is a register holding the shift count.
 */
.macro  GET_FRONT_BITS reg amt
#ifdef  __cskyLE__
    lsr     \reg, \amt
#else
    lsl     \reg, \amt
#endif
.endm
15
/*
 * GET_AFTER_BITS reg, amt
 *
 * Complement of GET_FRONT_BITS: shift \reg by \amt bits so that the
 * bytes which come LATER in memory move toward the opposite end of the
 * register (left shift on little-endian, right shift on big-endian).
 * Used with a (32 - front_shift) count so a front-shifted previous word
 * and an after-shifted next word can be OR-combined into one aligned
 * output word.
 */
.macro  GET_AFTER_BITS reg amt
#ifdef  __cskyLE__
    lsl     \reg, \amt
#else
    lsr     \reg, \amt
#endif
.endm
23
24
/* Select the exported name: this file assembles as wmemcpy() when
   WANT_WIDE is defined, otherwise as plain memcpy().
   NOTE(review): the code below counts r4 in bytes, while wmemcpy()'s
   count argument is in wide characters -- presumably the WANT_WIDE
   build scales the count elsewhere; confirm before relying on it. */
#ifdef WANT_WIDE
# define Wmemcpy wmemcpy
#else
# define Wmemcpy memcpy
#endif
30
31/* void *memcpy(void *dest, const void *src, size_t n); */
32
/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * C-SKY ABIV1 register contract as used here:
 *   r2 = dest  -- never modified, so it doubles as the return value
 *   r3 = src   -- advanced as bytes/words are consumed
 *   r4 = n     -- remaining byte count
 *   r7 = working copy of dest, advanced in step with r3
 *   r1, r5, r6 = scratch
 *   r8/r9 (aligned path) and r8-r13 (misaligned path) are scratch too,
 *   but saved/restored on the stack around the loops that use them.
 *
 * Strategy: tiny copies go byte-by-byte; otherwise dest is first
 * byte-aligned, then either a 16-bytes-per-iteration word loop runs
 * (src aligned) or words are spliced from two misaligned source words
 * using the GET_FRONT_BITS/GET_AFTER_BITS shift macros (src misaligned).
 */
	.text
	.align 2
	.global Wmemcpy
	.type   Wmemcpy, @function
Wmemcpy:
    mov     r7, r2                                  /* r7 = working dest; r2 stays intact as return value */
    cmplti  r4, 4                                   /* If len less than 4 bytes */
    jbt     .L_copy_by_byte

    mov     r6, r2                                  /* r6 = dest & 3 */
    andi    r6, 3
    cmpnei  r6, 0
    jbt     .L_dest_not_aligned                     /* If dest is not 4 bytes aligned */
.L0:                                                /* dest is now word aligned */
    mov     r6, r3                                  /* r6 = src & 3 */
    andi    r6, 3
    cmpnei  r6, 0
    jbt     .L_dest_aligned_but_src_not_aligned     /* If dest is aligned, but src is not aligned */

    cmplti  r4, 16                                  /* dest and src are all aligned */
    jbt     .L_aligned_and_len_less_16bytes         /* If len less than 16 bytes */

    subi    sp, 8                                   /* save callee-saved r8/r9 used by the bulk loop */
    stw     r8, (sp, 0)
    stw     r9, (sp, 4)
.L_aligned_and_len_larger_16bytes:                  /* src and dst are all aligned, and len > 16 bytes */
    ldw     r1, (r3, 0)                             /* copy 16 bytes per iteration via 4 word loads */
    ldw     r5, (r3, 4)
    ldw     r8, (r3, 8)
    ldw     r9, (r3, 12)
    stw     r1, (r7, 0)
    stw     r5, (r7, 4)
    stw     r8, (r7, 8)
    stw     r9, (r7, 12)
    subi    r4, 16
    addi    r3, 16
    addi    r7, 16
    cmplti  r4, 16
    jbf     .L_aligned_and_len_larger_16bytes
    ldw     r8, (sp, 0)                             /* restore r8/r9, drop the 8-byte frame */
    ldw     r9, (sp, 4)
    addi    sp, 8

.L_aligned_and_len_less_16bytes:                    /* 0 <= len < 16, both pointers aligned: word copy */
    cmplti  r4, 4
    jbt     .L_copy_by_byte
    ldw     r1, (r3, 0)
    stw     r1, (r7, 0)
    subi    r4, 4
    addi    r3, 4
    addi    r7, 4
    jbr     .L_aligned_and_len_less_16bytes

.L_copy_by_byte:                                    /* len less than 4 bytes (also the generic tail loop) */
    cmpnei  r4, 0
    jbf     .L_return
    ldb     r1, (r3, 0)
    stb     r1, (r7, 0)
    subi    r4, 1
    addi    r3, 1
    addi    r7, 1
    jbr     .L_copy_by_byte

.L_return:
    rts                                             /* return dest, still in r2 */

/* If dest is not aligned, we copy some bytes to make dest align.
   Then we should judge whether src is aligned. */

.L_dest_not_aligned:
    mov     r5, r3                                  /* best-effort overlap guard: if |dest - src| < len,   */
    rsub    r5, r5, r7                              /* fall back to a plain forward byte copy.             */
    abs     r5, r5                                  /* NOTE(review): memcpy gives no overlap guarantee and */
    cmplt   r5, r4                                  /* a forward copy is still wrong for dest > src; this  */
    jbt     .L_copy_by_byte                         /* only avoids the block loops clobbering source data. */

.L1:
    ldb     r1, (r3, 0)                             /* makes the dest align: copy (4 - (dest & 3)) bytes, */
    stb     r1, (r7, 0)                             /* counting r6 up from dest&3 until it reaches 4      */
    addi    r6, 1
    subi    r4, 1
    addi    r3, 1
    addi    r7, 1
    cmpnei  r6, 4
    jbt     .L1
    cmplti  r4, 4
    jbt     .L_copy_by_byte
    jbf     .L0                                     /* judge whether the src is aligned. */

.L_dest_aligned_but_src_not_aligned:
    mov     r5, r3                                  /* same best-effort overlap guard as above */
    rsub    r5, r5, r7
    abs     r5, r5
    cmplt   r5, r4
    jbt     .L_copy_by_byte

    bclri   r3, 0                                   /* r3 &= ~3: round src down to a word boundary */
    bclri   r3, 1
    ldw     r1, (r3, 0)                             /* prime the carry word r1 with the first (partial) word */
    addi    r3, 4

    subi    sp, 16                                  /* frame: r11@0, r12@4, r13@8, slot 12 reserved for r8 */
    stw     r11, (sp,0)
    stw     r12, (sp,4)
    stw     r13, (sp,8)
    movi    r5, 8
    mult    r5, r6                                  /* r6 is used to store the misaligned bits; r5 = 8*(src&3) */
    mov     r12, r5                                 /* r12 = front shift count (bits of r1 already consumed) */
    rsubi   r5, 31                                  /* r5 = 31 - r5 ... */
    addi    r5, 1                                   /* ... + 1, i.e. r13 = 32 - r12 */
    mov     r13, r5                                 /* r13 = after shift count */

    cmplti  r4, 16
    jbt     .L_not_aligned_and_len_less_16bytes

    stw     r8, (sp, 12)                            /* save r8 in the reserved slot of the outer frame */
    subi    sp, 8                                   /* extra 8-byte frame for r9/r10 */
    stw     r9, (sp, 0)
    stw     r10, (sp, 4)
.L_not_aligned_and_len_larger_16bytes:              /* splice 16 output bytes from 4 loads + carry word r1 */
    ldw     r5, (r3, 0)
    ldw     r11, (r3, 4)
    ldw     r8, (r3, 8)
    ldw     r9, (r3, 12)

    GET_FRONT_BITS r1 r12                           /* little or big endian? out0 = front(r1) | after(r5) */
    mov     r10, r5                                 /* r10 keeps raw r5 as the next carry */
    GET_AFTER_BITS r5 r13
    or      r5, r1

    GET_FRONT_BITS r10 r12                          /* out1 = front(old r5) | after(r11) */
    mov     r1, r11
    GET_AFTER_BITS r11 r13
    or      r11, r10

    GET_FRONT_BITS r1 r12                           /* out2 = front(old r11) | after(r8) */
    mov     r10, r8
    GET_AFTER_BITS r8 r13
    or      r8, r1

    GET_FRONT_BITS r10 r12                          /* out3 = front(old r8) | after(r9); r1 = raw r9 carries on */
    mov     r1, r9
    GET_AFTER_BITS r9 r13
    or      r9, r10

    stw     r5, (r7, 0)
    stw     r11, (r7, 4)
    stw     r8, (r7, 8)
    stw     r9, (r7, 12)
    subi    r4, 16
    addi    r3, 16
    addi    r7, 16
    cmplti  r4, 16
    jbf     .L_not_aligned_and_len_larger_16bytes
    ldw     r9, (sp, 0)                             /* unwind the inner frame first ... */
    ldw     r10, (sp, 4)
    addi    sp, 8
    ldw     r8, (sp,12)                             /* ... then r8 from the outer frame's reserved slot */

.L_not_aligned_and_len_less_16bytes:
    cmplti  r4, 4
    jbf     .L2
    rsubi   r6, 4                                   /* r6 = 4 - (src & 3): bytes r3 has over-advanced */
    subu    r3, r6                                  /* rewind r3 to the first uncopied source byte */
    ldw     r11, (sp, 0)                            /* restore r11-r13 and drop the 16-byte frame */
    ldw     r12, (sp, 4)
    ldw     r13, (sp, 8)
    addi    sp, 16
    jbr     .L_copy_by_byte
.L2:                                                /* splice one output word per iteration */
    ldw     r5, (r3, 0)
    GET_FRONT_BITS r1 r12
    mov     r11, r1
    mov     r1, r5                                  /* raw new word becomes the next carry */
    GET_AFTER_BITS r5 r13
    or      r5, r11
    stw     r5, (r7, 0)
    subi    r4, 4
    addi    r3, 4
    addi    r7, 4
    jbr     .L_not_aligned_and_len_less_16bytes

.size   Wmemcpy, .-Wmemcpy
216
/* libc_hidden_def: uClibc macro emitting the libc-internal hidden alias
   for this symbol; .weak marks the public name as a weak symbol
   (presumably so another definition can take precedence -- confirm
   against the library's linking conventions). */
libc_hidden_def(Wmemcpy)
.weak Wmemcpy
219