/*
 * Copyright (C) 2017 Hangzhou C-SKY Microsystems co.,ltd.
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
 * in this tarball.
 */

.macro      GET_FRONT_BITS rx ry
#ifdef      __cskyLE__
    lsr     \rx, \ry
#else
    lsl     \rx, \ry
#endif
.endm

.macro      GET_AFTER_BITS rx ry
#ifdef      __cskyLE__
    lsl     \rx, \ry
#else
    lsr     \rx, \ry
#endif
.endm
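
/* GET_FRONT_BITS keeps the part of an aligned source word that lands at the
   front of the destination word, GET_AFTER_BITS the part that follows it;
   the shift directions swap between little- and big-endian builds.
   Roughly, for the little-endian case (illustrative C only, using the shift
   amounts later placed in r24 and r25):

       // merged = (prev >> r24) | (next << r25), where r25 == 32 - r24
*/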


#ifdef WANT_WIDE
# define Wmemcpy wmemcpy
#else
# define Wmemcpy memcpy
#endif

/* void *memcpy(void *dest, const void *src, size_t n); */
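
/* Overall strategy, shown as an illustrative C sketch.  This is not the code
   assembled below, just a plain-C outline of the same idea; the helper name
   and variable names are made up, and the length is in bytes:

       #include <stddef.h>
       #include <stdint.h>

       static void *memcpy_sketch(void *dest, const void *src, size_t n)
       {
           unsigned char *d = dest;
           const unsigned char *s = src;

           if (n >= 4) {
               // align dest first, one byte at a time
               while (((uintptr_t)d & 3) && n) { *d++ = *s++; n--; }
               if (((uintptr_t)s & 3) == 0) {
                   // both aligned: 16 bytes per iteration, then 4, then bytes
                   while (n >= 16) {
                       ((uint32_t *)d)[0] = ((const uint32_t *)s)[0];
                       ((uint32_t *)d)[1] = ((const uint32_t *)s)[1];
                       ((uint32_t *)d)[2] = ((const uint32_t *)s)[2];
                       ((uint32_t *)d)[3] = ((const uint32_t *)s)[3];
                       d += 16; s += 16; n -= 16;
                   }
                   while (n >= 4) {
                       *(uint32_t *)d = *(const uint32_t *)s;
                       d += 4; s += 4; n -= 4;
                   }
               }
               // dest aligned but src not: the assembly below loads aligned
               // words from src and merges neighbours with shifts; in this
               // sketch those lengths simply fall through to the byte loop
           }
           while (n--) *d++ = *s++;
           return dest;
       }
*/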


    .text
    .align 2
    .global Wmemcpy
    .type   Wmemcpy, @function
Wmemcpy:
    mov     r3, r0
    cmplti  r2, 4                                            /* If len is less than 4 bytes */
    jbt     .L_copy_by_byte

    mov     r12, r0
    andi    r12, 3
    bnez    r12, .L_dest_not_aligned                         /* If dest is not 4-byte aligned */
.L0:
    mov     r12, r1
    andi    r12, 3
    bnez    r12, .L_dest_aligned_but_src_not_aligned         /* If dest is aligned but src is not */

    cmplti  r2, 16                                           /* dest and src are both aligned */
    jbt     .L_aligned_and_len_less_16bytes                  /* If len is less than 16 bytes */

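/* Bulk copy for the easy case: dest and src are both word-aligned, so each
   pass of the loop below moves four 32-bit words (16 bytes).  The leftover
   is finished in 4-byte steps and finally byte by byte. */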
.L_aligned_and_len_larger_16bytes:                           /* src and dest are both aligned, and len >= 16 bytes */
    ldw     r18, (r1, 0)
    ldw     r19, (r1, 4)
    ldw     r20, (r1, 8)
    ldw     r21, (r1, 12)
    stw     r18, (r3, 0)
    stw     r19, (r3, 4)
    stw     r20, (r3, 8)
    stw     r21, (r3, 12)
    subi    r2, 16
    addi    r1, 16
    addi    r3, 16
    cmplti  r2, 16
    jbf     .L_aligned_and_len_larger_16bytes

.L_aligned_and_len_less_16bytes:
    cmplti  r2, 4
    jbt     .L_copy_by_byte
    ldw     r18, (r1, 0)
    stw     r18, (r3, 0)
    subi    r2, 4
    addi    r1, 4
    addi    r3, 4
    jbr     .L_aligned_and_len_less_16bytes

.L_copy_by_byte:                                    /* copy the remaining bytes one at a time */
    cmpnei  r2, 0
    jbf     .L_return
    ldb     r18, (r1, 0)
    stb     r18, (r3, 0)
    subi    r2, 1
    addi    r1, 1
    addi    r3, 1
    jbr     .L_copy_by_byte

.L_return:
    rts

/* If dest is not aligned, copy bytes one at a time until dest is aligned,
   then check whether src is aligned as well. */

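/* Both misaligned paths below begin with the same guard: when the two
   regions overlap within the remaining length, the word-sized loops are
   skipped and everything is copied byte by byte.  Roughly (illustrative C,
   with d and s as byte pointers and a hypothetical label for .L_copy_by_byte):

       size_t dist = (d > s) ? (size_t)(d - s) : (size_t)(s - d);
       if (dist < n)
           goto copy_by_byte;
*/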
.L_dest_not_aligned:
    rsub    r13, r1, r3                              /* consider the overlapped case */
    abs     r13, r13
    cmplt   r13, r2
    jbt     .L_copy_by_byte

.L1:
    ldb     r18, (r1, 0)                             /* copy bytes until dest is aligned */
    stb     r18, (r3, 0)
    addi    r12, 1
    subi    r2, 1
    addi    r1, 1
    addi    r3, 1
    cmpnei  r12, 4
    jbt     .L1
    cmplti  r2, 4
    jbt     .L_copy_by_byte
    jbf     .L0                                     /* now check whether src is aligned */

.L_dest_aligned_but_src_not_aligned:
    rsub    r13, r1, r3                             /* same overlap check as above */
    abs     r13, r13
    cmplt   r13, r2
    jbt     .L_copy_by_byte

    bclri   r1, 0                                   /* round src down to a word boundary */
    bclri   r1, 1
    ldw     r18, (r1, 0)                            /* preload the first aligned word */
    addi    r1, 4

    movi    r13, 8
    mult    r13, r12                                /* r12 holds the src misalignment in bytes */
    mov     r24, r13                                /* r24 = misalignment in bits */
    rsubi   r13, 32
    mov     r25, r13                                /* r25 = 32 - r24 */

    cmplti  r2, 16
    jbt     .L_not_aligned_and_len_less_16bytes

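/* Misaligned bulk loop: r18 carries the aligned word loaded last time
   around (inside the body r18 and r19 alternate as that carry).  Each
   output word is the tail of the previous source word merged with the
   head of the next one, e.g. on little-endian (illustrative C, with the
   r24/r25 shift amounts set up above):

       out  = (prev >> r24) | (next << r25);   // r25 == 32 - r24
       prev = next;

   Four merged words are built and stored per iteration. */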
.L_not_aligned_and_len_larger_16bytes:
    ldw     r20, (r1, 0)
    ldw     r21, (r1, 4)
    ldw     r22, (r1, 8)
    ldw     r23, (r1, 12)

    GET_FRONT_BITS r18 r24                          /* shift direction depends on endianness */
    mov     r19, r20
    GET_AFTER_BITS r20 r25
    or      r20, r18

    GET_FRONT_BITS r19 r24
    mov     r18, r21
    GET_AFTER_BITS r21 r13                          /* r13 still holds the same value as r25 */
    or      r21, r19

    GET_FRONT_BITS r18 r24
    mov     r19, r22
    GET_AFTER_BITS r22 r25
    or      r22, r18

    GET_FRONT_BITS r19 r24
    mov     r18, r23
    GET_AFTER_BITS r23 r25
    or      r23, r19

    stw     r20, (r3, 0)
    stw     r21, (r3, 4)
    stw     r22, (r3, 8)
    stw     r23, (r3, 12)
    subi    r2, 16
    addi    r1, 16
    addi    r3, 16
    cmplti  r2, 16
    jbf     .L_not_aligned_and_len_larger_16bytes

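/* Tail of the misaligned path: while at least 4 bytes remain, keep emitting
   one merged word at a time.  Once fewer than 4 bytes are left, src is moved
   back by (4 - misalignment) bytes so it points at the next byte that has
   not been copied yet, and the byte loop finishes the job. */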
.L_not_aligned_and_len_less_16bytes:
    cmplti  r2, 4
    jbf     .L2
    rsubi   r12, 4                                   /* r12 = 4 - (src misalignment in bytes) */
    subu    r1, r12                                  /* move src back to the next byte still to be copied */
    jbr     .L_copy_by_byte
.L2:
    ldw     r21, (r1, 0)
    GET_FRONT_BITS r18 r24
    mov     r19, r18
    mov     r18, r21
    GET_AFTER_BITS r21 r25
    or      r21, r19
    stw     r21, (r3, 0)
    subi    r2, 4
    addi    r1, 4
    addi    r3, 4
    jbr     .L_not_aligned_and_len_less_16bytes

.size   Wmemcpy, .-Wmemcpy

libc_hidden_def(Wmemcpy)
.weak Wmemcpy