/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated chacha20 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# do rounds, 8 quarter rounds
# 1.  a += b; d ^= a; d <<<= 16;
# 2.  c += d; b ^= c; b <<<= 12;
# 3.  a += b; d ^= a; d <<<= 8;
# 4.  c += d; b ^= c; b <<<= 7;
#
# row1 = (row1 + row2),  row4 = row1 xor row4,  row4 rotate each word by 16
# row3 = (row3 + row4),  row2 = row3 xor row2,  row2 rotate each word by 12
# row1 = (row1 + row2),  row4 = row1 xor row4,  row4 rotate each word by 8
# row3 = (row3 + row4),  row2 = row3 xor row2,  row2 rotate each word by 7
#
# 4 blocks (a b c d)
#
# a0 b0 c0 d0
# a1 b1 c1 d1
# ...
# a4 b4 c4 d4
# ...
# a8 b8 c8 d8
# ...
# a12 b12 c12 d12
# a13 ...
# a14 ...
# a15 b15 c15 d15
#
# Column round (v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
#
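# For reference only, a minimal C-like sketch of one quarter round using the
# rotation counts listed above.  QR and rol32 here are illustrative names, not
# something defined by this file:
#
#	#define QR(a, b, c, d) do {			\
#		a += b; d ^= a; d = rol32(d, 16);	\
#		c += d; b ^= c; b = rol32(b, 12);	\
#		a += b; d ^= a; d = rol32(d, 8);	\
#		c += d; b ^= c; b = rol32(b, 7);	\
#	} while (0)
#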

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine	"any"
.text

.macro	SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm

.macro	SAVE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	stvx	\VRS, 16, \FRAME
.endm

.macro	SAVE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	stxvx	\VSX, 16, \FRAME
.endm

.macro	RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm

.macro	RESTORE_VRS VRS OFFSET FRAME
	li	16, \OFFSET
	lvx	\VRS, 16, \FRAME
.endm

.macro	RESTORE_VSX VSX OFFSET FRAME
	li	16, \OFFSET
	lxvx	\VSX, 16, \FRAME
.endm
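# SAVE_REGS/RESTORE_REGS spill and reload the non-volatile registers this
# routine clobbers: GPRs r14-r31 at 112(r1)..248(r1), and VRs v20-v31 plus
# VSRs vs14-vs31 in the area starting at 256(r1), all within the 752-byte
# frame created by the stdu below.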
.macro SAVE_REGS
	mflr 0
	std 0, 16(1)
	stdu 1,-752(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1
	SAVE_GPR 25, 200, 1
	SAVE_GPR 26, 208, 1
	SAVE_GPR 27, 216, 1
	SAVE_GPR 28, 224, 1
	SAVE_GPR 29, 232, 1
	SAVE_GPR 30, 240, 1
	SAVE_GPR 31, 248, 1

	addi	9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9

	SAVE_VSX 14, 192, 9
	SAVE_VSX 15, 208, 9
	SAVE_VSX 16, 224, 9
	SAVE_VSX 17, 240, 9
	SAVE_VSX 18, 256, 9
	SAVE_VSX 19, 272, 9
	SAVE_VSX 20, 288, 9
	SAVE_VSX 21, 304, 9
	SAVE_VSX 22, 320, 9
	SAVE_VSX 23, 336, 9
	SAVE_VSX 24, 352, 9
	SAVE_VSX 25, 368, 9
	SAVE_VSX 26, 384, 9
	SAVE_VSX 27, 400, 9
	SAVE_VSX 28, 416, 9
	SAVE_VSX 29, 432, 9
	SAVE_VSX 30, 448, 9
	SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
	addi	9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_VSX 14, 192, 9
	RESTORE_VSX 15, 208, 9
	RESTORE_VSX 16, 224, 9
	RESTORE_VSX 17, 240, 9
	RESTORE_VSX 18, 256, 9
	RESTORE_VSX 19, 272, 9
	RESTORE_VSX 20, 288, 9
	RESTORE_VSX 21, 304, 9
	RESTORE_VSX 22, 320, 9
	RESTORE_VSX 23, 336, 9
	RESTORE_VSX 24, 352, 9
	RESTORE_VSX 25, 368, 9
	RESTORE_VSX 26, 384, 9
	RESTORE_VSX 27, 400, 9
	RESTORE_VSX 28, 416, 9
	RESTORE_VSX 29, 432, 9
	RESTORE_VSX 30, 448, 9
	RESTORE_VSX 31, 464, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1
	RESTORE_GPR 25, 200, 1
	RESTORE_GPR 26, 208, 1
	RESTORE_GPR 27, 216, 1
	RESTORE_GPR 28, 224, 1
	RESTORE_GPR 29, 232, 1
	RESTORE_GPR 30, 240, 1
	RESTORE_GPR 31, 248, 1

	addi    1, 1, 752
	ld 0, 16(1)
	mtlr 0
.endm # RESTORE_REGS

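# QT_loop_8x runs one double round (column + diagonal) over two interleaved
# groups of 4 blocks: v0-v15 and v16-v31.  The <<<16 and <<<8 rotates are done
# with vpermxor and byte-permute masks, the <<<12 and <<<7 rotates with vrlw.
# Since all 32 VRs hold state, the masks and rotate counts live in vs20-vs23
# and are swapped into v25/v28 as needed, using vs0 as scratch to preserve the
# overwritten vector register.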
.macro QT_loop_8x
	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	xxlor	0, 32+25, 32+25
	xxlor	32+25, 20, 20
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	  vadduwm 16, 16, 20
	  vadduwm 17, 17, 21
	  vadduwm 18, 18, 22
	  vadduwm 19, 19, 23

	  vpermxor 12, 12, 0, 25
	  vpermxor 13, 13, 1, 25
	  vpermxor 14, 14, 2, 25
	  vpermxor 15, 15, 3, 25
	  vpermxor 28, 28, 16, 25
	  vpermxor 29, 29, 17, 25
	  vpermxor 30, 30, 18, 25
	  vpermxor 31, 31, 19, 25
	xxlor	32+25, 0, 0
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	  vadduwm 24, 24, 28
	  vadduwm 25, 25, 29
	  vadduwm 26, 26, 30
	  vadduwm 27, 27, 31
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	  vxor 20, 20, 24
	  vxor 21, 21, 25
	  vxor 22, 22, 26
	  vxor 23, 23, 27

	xxlor	0, 32+25, 32+25
	xxlor	32+25, 21, 21
	vrlw 4, 4, 25  # <<< 12
	vrlw 5, 5, 25
	vrlw 6, 6, 25
	vrlw 7, 7, 25
	  vrlw 20, 20, 25  # <<< 12
	  vrlw 21, 21, 25
	  vrlw 22, 22, 25
	  vrlw 23, 23, 25
	xxlor	32+25, 0, 0
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	  vadduwm 16, 16, 20
	  vadduwm 17, 17, 21
	  vadduwm 18, 18, 22
	  vadduwm 19, 19, 23

	xxlor	0, 32+25, 32+25
	xxlor	32+25, 22, 22
	  vpermxor 12, 12, 0, 25
	  vpermxor 13, 13, 1, 25
	  vpermxor 14, 14, 2, 25
	  vpermxor 15, 15, 3, 25
	  vpermxor 28, 28, 16, 25
	  vpermxor 29, 29, 17, 25
	  vpermxor 30, 30, 18, 25
	  vpermxor 31, 31, 19, 25
	xxlor	32+25, 0, 0
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	  vadduwm 24, 24, 28
	  vadduwm 25, 25, 29
	  vadduwm 26, 26, 30
	  vadduwm 27, 27, 31
	xxlor	0, 32+28, 32+28
	xxlor	32+28, 23, 23
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	  vxor 20, 20, 24
	  vxor 21, 21, 25
	  vxor 22, 22, 26
	  vxor 23, 23, 27
	vrlw 4, 4, 28  # <<< 7
	vrlw 5, 5, 28
	vrlw 6, 6, 28
	vrlw 7, 7, 28
	  vrlw 20, 20, 28  # <<< 7
	  vrlw 21, 21, 28
	  vrlw 22, 22, 28
	  vrlw 23, 23, 28
	xxlor	32+28, 0, 0

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
	xxlor	0, 32+25, 32+25
	xxlor	32+25, 20, 20
	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	  vadduwm 16, 16, 21
	  vadduwm 17, 17, 22
	  vadduwm 18, 18, 23
	  vadduwm 19, 19, 20

	  vpermxor 15, 15, 0, 25
	  vpermxor 12, 12, 1, 25
	  vpermxor 13, 13, 2, 25
	  vpermxor 14, 14, 3, 25
	  vpermxor 31, 31, 16, 25
	  vpermxor 28, 28, 17, 25
	  vpermxor 29, 29, 18, 25
	  vpermxor 30, 30, 19, 25

	xxlor	32+25, 0, 0
	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	  vadduwm 26, 26, 31
	  vadduwm 27, 27, 28
	  vadduwm 24, 24, 29
	  vadduwm 25, 25, 30
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	  vxor 21, 21, 26
	  vxor 22, 22, 27
	  vxor 23, 23, 24
	  vxor 20, 20, 25

	xxlor	0, 32+25, 32+25
	xxlor	32+25, 21, 21
	vrlw 5, 5, 25
	vrlw 6, 6, 25
	vrlw 7, 7, 25
	vrlw 4, 4, 25
	  vrlw 21, 21, 25
	  vrlw 22, 22, 25
	  vrlw 23, 23, 25
	  vrlw 20, 20, 25
	xxlor	32+25, 0, 0

	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	  vadduwm 16, 16, 21
	  vadduwm 17, 17, 22
	  vadduwm 18, 18, 23
	  vadduwm 19, 19, 20

	xxlor	0, 32+25, 32+25
	xxlor	32+25, 22, 22
	  vpermxor 15, 15, 0, 25
	  vpermxor 12, 12, 1, 25
	  vpermxor 13, 13, 2, 25
	  vpermxor 14, 14, 3, 25
	  vpermxor 31, 31, 16, 25
	  vpermxor 28, 28, 17, 25
	  vpermxor 29, 29, 18, 25
	  vpermxor 30, 30, 19, 25
	xxlor	32+25, 0, 0

	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	  vadduwm 26, 26, 31
	  vadduwm 27, 27, 28
	  vadduwm 24, 24, 29
	  vadduwm 25, 25, 30

	xxlor	0, 32+28, 32+28
	xxlor	32+28, 23, 23
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	  vxor 21, 21, 26
	  vxor 22, 22, 27
	  vxor 23, 23, 24
	  vxor 20, 20, 25
	vrlw 5, 5, 28
	vrlw 6, 6, 28
	vrlw 7, 7, 28
	vrlw 4, 4, 28
	  vrlw 21, 21, 28
	  vrlw 22, 22, 28
	  vrlw 23, 23, 28
	  vrlw 20, 20, 28
	xxlor	32+28, 0, 0
.endm

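# QT_loop_4x is the single-group variant of the double round above, working on
# v0-v15 only.  Here the permute masks and rotate counts stay resident in
# v20-v23 (v20 = <<<16 mask, v21 = 12, v22 = <<<8 mask, v23 = 7), so no VSX
# shuffling is needed.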
.macro QT_loop_4x
	# QR(v0, v4,  v8, v12, v1, v5,  v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	  vpermxor 12, 12, 0, 20
	  vpermxor 13, 13, 1, 20
	  vpermxor 14, 14, 2, 20
	  vpermxor 15, 15, 3, 20
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vrlw 4, 4, 21
	vrlw 5, 5, 21
	vrlw 6, 6, 21
	vrlw 7, 7, 21
	vadduwm 0, 0, 4
	vadduwm 1, 1, 5
	vadduwm 2, 2, 6
	vadduwm 3, 3, 7
	  vpermxor 12, 12, 0, 22
	  vpermxor 13, 13, 1, 22
	  vpermxor 14, 14, 2, 22
	  vpermxor 15, 15, 3, 22
	vadduwm 8, 8, 12
	vadduwm 9, 9, 13
	vadduwm 10, 10, 14
	vadduwm 11, 11, 15
	vxor 4, 4, 8
	vxor 5, 5, 9
	vxor 6, 6, 10
	vxor 7, 7, 11
	vrlw 4, 4, 23
	vrlw 5, 5, 23
	vrlw 6, 6, 23
	vrlw 7, 7, 23

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7,  v8, v13, v3, v4,  v9, v14)
	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	  vpermxor 15, 15, 0, 20
	  vpermxor 12, 12, 1, 20
	  vpermxor 13, 13, 2, 20
	  vpermxor 14, 14, 3, 20
	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vrlw 5, 5, 21
	vrlw 6, 6, 21
	vrlw 7, 7, 21
	vrlw 4, 4, 21
	vadduwm 0, 0, 5
	vadduwm 1, 1, 6
	vadduwm 2, 2, 7
	vadduwm 3, 3, 4
	  vpermxor 15, 15, 0, 22
	  vpermxor 12, 12, 1, 22
	  vpermxor 13, 13, 2, 22
	  vpermxor 14, 14, 3, 22
	vadduwm 10, 10, 15
	vadduwm 11, 11, 12
	vadduwm 8, 8, 13
	vadduwm 9, 9, 14
	vxor 5, 5, 10
	vxor 6, 6, 11
	vxor 7, 7, 8
	vxor 4, 4, 9
	vrlw 5, 5, 23
	vrlw 6, 6, 23
	vrlw 7, 7, 23
	vrlw 4, 4, 23
.endm

# Transpose
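#
# TP_4x treats \a0..\a3 as the rows of a 4x4 matrix of 32-bit words and
# transposes it in place, so each output vector ends up holding four
# consecutive state words of a single block (interleaved lane layout ->
# blockwise layout, ready for Add_state/Write_256).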
.macro TP_4x a0 a1 a2 a3
	xxmrghw  10, 32+\a0, 32+\a1	# a0, a1, b0, b1
	xxmrghw  11, 32+\a2, 32+\a3	# a2, a3, b2, b3
	xxmrglw  12, 32+\a0, 32+\a1	# c0, c1, d0, d1
	xxmrglw  13, 32+\a2, 32+\a3	# c2, c3, d2, d3
	xxpermdi	32+\a0, 10, 11, 0	# a0, a1, a2, a3
	xxpermdi	32+\a1, 10, 11, 3	# b0, b1, b2, b3
	xxpermdi	32+\a2, 12, 13, 0	# c0, c1, c2, c3
	xxpermdi	32+\a3, 12, 13, 3	# d0, d1, d2, d3
.endm

# key stream = working state + state
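# Add_state is invoked with \S = 0 (first 4-block group, original state in
# v16-v19) and \S = 16 (second group, original state reloaded into v0-v3);
# the "16-\S" operand picks the right copy of the state in both cases.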
.macro Add_state S
	vadduwm \S+0, \S+0, 16-\S
	vadduwm \S+4, \S+4, 17-\S
	vadduwm \S+8, \S+8, 18-\S
	vadduwm \S+12, \S+12, 19-\S

	vadduwm \S+1, \S+1, 16-\S
	vadduwm \S+5, \S+5, 17-\S
	vadduwm \S+9, \S+9, 18-\S
	vadduwm \S+13, \S+13, 19-\S

	vadduwm \S+2, \S+2, 16-\S
	vadduwm \S+6, \S+6, 17-\S
	vadduwm \S+10, \S+10, 18-\S
	vadduwm \S+14, \S+14, 19-\S

	vadduwm	\S+3, \S+3, 16-\S
	vadduwm	\S+7, \S+7, 17-\S
	vadduwm	\S+11, \S+11, 18-\S
	vadduwm	\S+15, \S+15, 19-\S
.endm

#
# write 256 bytes
#
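# Write_256 loads 256 bytes of input from src + offset (r5 + r14) using the
# 16/32/.../240 byte offsets preloaded into r17-r31, XORs them with the 16 key
# stream vectors of group \S, and stores the result to dst + offset (r4 + r14).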
.macro Write_256 S
	add 9, 14, 5
	add 16, 14, 4
	lxvw4x 0, 0, 9
	lxvw4x 1, 17, 9
	lxvw4x 2, 18, 9
	lxvw4x 3, 19, 9
	lxvw4x 4, 20, 9
	lxvw4x 5, 21, 9
	lxvw4x 6, 22, 9
	lxvw4x 7, 23, 9
	lxvw4x 8, 24, 9
	lxvw4x 9, 25, 9
	lxvw4x 10, 26, 9
	lxvw4x 11, 27, 9
	lxvw4x 12, 28, 9
	lxvw4x 13, 29, 9
	lxvw4x 14, 30, 9
	lxvw4x 15, 31, 9

	xxlxor \S+32, \S+32, 0
	xxlxor \S+36, \S+36, 1
	xxlxor \S+40, \S+40, 2
	xxlxor \S+44, \S+44, 3
	xxlxor \S+33, \S+33, 4
	xxlxor \S+37, \S+37, 5
	xxlxor \S+41, \S+41, 6
	xxlxor \S+45, \S+45, 7
	xxlxor \S+34, \S+34, 8
	xxlxor \S+38, \S+38, 9
	xxlxor \S+42, \S+42, 10
	xxlxor \S+46, \S+46, 11
	xxlxor \S+35, \S+35, 12
	xxlxor \S+39, \S+39, 13
	xxlxor \S+43, \S+43, 14
	xxlxor \S+47, \S+47, 15

	stxvw4x \S+32, 0, 16
	stxvw4x \S+36, 17, 16
	stxvw4x \S+40, 18, 16
	stxvw4x \S+44, 19, 16

	stxvw4x \S+33, 20, 16
	stxvw4x \S+37, 21, 16
	stxvw4x \S+41, 22, 16
	stxvw4x \S+45, 23, 16

	stxvw4x \S+34, 24, 16
	stxvw4x \S+38, 25, 16
	stxvw4x \S+42, 26, 16
	stxvw4x \S+46, 27, 16

	stxvw4x \S+35, 28, 16
	stxvw4x \S+39, 29, 16
	stxvw4x \S+43, 30, 16
	stxvw4x \S+47, 31, 16

.endm

#
# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
#		       unsigned int len, int nrounds);
#
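# Per the ppc64le ELFv2 calling convention the arguments arrive as:
#   r3 = state, r4 = dst, r5 = src, r6 = len, r7 = nrounds.
# r14 tracks the current offset into src/dst and r15 the remaining length;
# data is processed in 512-byte (8-block) chunks, falling back to 256-byte
# (4-block) chunks, so the caller handles any shorter tail.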
SYM_FUNC_START(chacha_p10le_8x)
.align 5
	cmpdi	6, 0
	ble	Out_no_chacha

	SAVE_REGS

	# r17 - r31 hold the 16..240 byte offsets used by the Write_256 macro.
	li	17, 16
	li	18, 32
	li	19, 48
	li	20, 64
	li	21, 80
	li	22, 96
	li	23, 112
	li	24, 128
	li	25, 144
	li	26, 160
	li	27, 176
	li	28, 192
	li	29, 208
	li	30, 224
	li	31, 240

	mr 15, 6			# len
	li 14, 0			# offset to inp and outp

	lxvw4x	48, 0, 3		#  vr16, constants
	lxvw4x	49, 17, 3		#  vr17, key 1
	lxvw4x	50, 18, 3		#  vr18, key 2
	lxvw4x	51, 19, 3		#  vr19, counter, nonce

	# create (0, 1, 2, 3) counters
	vspltisw 0, 0
	vspltisw 1, 1
	vspltisw 2, 2
	vspltisw 3, 3
	vmrghw	4, 0, 1
	vmrglw	5, 2, 3
	vsldoi	30, 4, 5, 8		# vr30 counter, 4 (0, 1, 2, 3)

	vspltisw 21, 12
	vspltisw 23, 7

	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11
	lxvw4x	32+22, 17, 11

	sradi	8, 7, 1			# nrounds/2 (one double round per iteration)

	mtctr 8

	# save constants to vsx
	xxlor	16, 48, 48
	xxlor	17, 49, 49
	xxlor	18, 50, 50
	xxlor	19, 51, 51

	vspltisw 25, 4
	vspltisw 26, 8

	xxlor	25, 32+26, 32+26
	xxlor	24, 32+25, 32+25

	vadduwm	31, 30, 25		# counter = (0, 1, 2, 3) + (4, 4, 4, 4)
	xxlor	30, 32+30, 32+30
	xxlor	31, 32+31, 32+31

	xxlor	20, 32+20, 32+20
	xxlor	21, 32+21, 32+21
	xxlor	22, 32+22, 32+22
	xxlor	23, 32+23, 32+23

	cmpdi	6, 512
	blt	Loop_last

# Process 8 blocks (512 bytes) per iteration.
Loop_8x:
	xxspltw  32+0, 16, 0
	xxspltw  32+1, 16, 1
	xxspltw  32+2, 16, 2
	xxspltw  32+3, 16, 3

	xxspltw  32+4, 17, 0
	xxspltw  32+5, 17, 1
	xxspltw  32+6, 17, 2
	xxspltw  32+7, 17, 3
	xxspltw  32+8, 18, 0
	xxspltw  32+9, 18, 1
	xxspltw  32+10, 18, 2
	xxspltw  32+11, 18, 3
	xxspltw  32+12, 19, 0
	xxspltw  32+13, 19, 1
	xxspltw  32+14, 19, 2
	xxspltw  32+15, 19, 3
	vadduwm	12, 12, 30	# increase counter

	xxspltw  32+16, 16, 0
	xxspltw  32+17, 16, 1
	xxspltw  32+18, 16, 2
	xxspltw  32+19, 16, 3

	xxspltw  32+20, 17, 0
	xxspltw  32+21, 17, 1
	xxspltw  32+22, 17, 2
	xxspltw  32+23, 17, 3
	xxspltw  32+24, 18, 0
	xxspltw  32+25, 18, 1
	xxspltw  32+26, 18, 2
	xxspltw  32+27, 18, 3
	xxspltw  32+28, 19, 0
	xxspltw  32+29, 19, 1
	vadduwm	28, 28, 31	# increase counter
	xxspltw  32+30, 19, 2
	xxspltw  32+31, 19, 3

.align 5
quarter_loop_8x:
	QT_loop_8x

	bdnz	quarter_loop_8x

	xxlor	0, 32+30, 32+30
	xxlor	32+30, 30, 30
	vadduwm	12, 12, 30
	xxlor	32+30, 0, 0
	TP_4x 0, 1, 2, 3
	TP_4x 4, 5, 6, 7
	TP_4x 8, 9, 10, 11
	TP_4x 12, 13, 14, 15

	xxlor	0, 48, 48
	xxlor	1, 49, 49
	xxlor	2, 50, 50
	xxlor	3, 51, 51
	xxlor	48, 16, 16
	xxlor	49, 17, 17
	xxlor	50, 18, 18
	xxlor	51, 19, 19
	Add_state 0
	xxlor	48, 0, 0
	xxlor	49, 1, 1
	xxlor	50, 2, 2
	xxlor	51, 3, 3
	Write_256 0
	addi	14, 14, 256	# offset += 256
	addi	15, 15, -256	# len -= 256

	xxlor	5, 32+31, 32+31
	xxlor	32+31, 31, 31
	vadduwm	28, 28, 31
	xxlor	32+31, 5, 5
	TP_4x 16+0, 16+1, 16+2, 16+3
	TP_4x 16+4, 16+5, 16+6, 16+7
	TP_4x 16+8, 16+9, 16+10, 16+11
	TP_4x 16+12, 16+13, 16+14, 16+15

	xxlor	32, 16, 16
	xxlor	33, 17, 17
	xxlor	34, 18, 18
	xxlor	35, 19, 19
	Add_state 16
	Write_256 16
	addi	14, 14, 256	# offset += 256
	addi	15, 15, -256	# len -= 256

	xxlor	32+24, 24, 24
	xxlor	32+25, 25, 25
	xxlor	32+30, 30, 30
	vadduwm	30, 30, 25
	vadduwm	31, 30, 24
	xxlor	30, 32+30, 32+30
	xxlor	31, 32+31, 32+31

	cmpdi	15, 0
	beq	Out_loop

	cmpdi	15, 512
	blt	Loop_last

	mtctr 8
	b Loop_8x

# Fewer than 512 bytes remaining: process 4 blocks (256 bytes) per iteration.
Loop_last:
	lxvw4x	48, 0, 3		#  vr16, constants
	lxvw4x	49, 17, 3		#  vr17, key 1
	lxvw4x	50, 18, 3		#  vr18, key 2
	lxvw4x	51, 19, 3		#  vr19, counter, nonce

	vspltisw 21, 12
	vspltisw 23, 7
	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11
	lxvw4x	32+22, 17, 11

	sradi	8, 7, 1
	mtctr 8

Loop_4x:
	vspltw  0, 16, 0
	vspltw  1, 16, 1
	vspltw  2, 16, 2
	vspltw  3, 16, 3

	vspltw  4, 17, 0
	vspltw  5, 17, 1
	vspltw  6, 17, 2
	vspltw  7, 17, 3
	vspltw  8, 18, 0
	vspltw  9, 18, 1
	vspltw  10, 18, 2
	vspltw  11, 18, 3
	vspltw  12, 19, 0
	vadduwm	12, 12, 30	# increase counter
	vspltw  13, 19, 1
	vspltw  14, 19, 2
	vspltw  15, 19, 3

.align 5
quarter_loop:
	QT_loop_4x

	bdnz	quarter_loop

	vadduwm	12, 12, 30
	TP_4x 0, 1, 2, 3
	TP_4x 4, 5, 6, 7
	TP_4x 8, 9, 10, 11
	TP_4x 12, 13, 14, 15

	Add_state 0
	Write_256 0
	addi	14, 14, 256	# offset += 256
	addi	15, 15, -256	# len -= 256

	# Update state counter
	vspltisw 25, 4
	vadduwm	30, 30, 25

	cmpdi	15, 0
	beq	Out_loop
	cmpdi	15, 256
	blt	Out_loop

	mtctr 8
	b Loop_4x

Out_loop:
	RESTORE_REGS
	blr

Out_no_chacha:
	li	3, 0
	blr
SYM_FUNC_END(chacha_p10le_8x)

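# Byte-permute masks consumed by vpermxor: the first quadword rotates each
# 32-bit word left by 16 bits, the second by 8 bits.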
SYM_DATA_START_LOCAL(PERMX)
.align 5
permx:
.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
SYM_DATA_END(PERMX)