1/* SPDX-License-Identifier: BSD-2-Clause */
2/*
3 * Copyright (c) 2016, 2020 Linaro Limited
4 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
5 *
6 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
7 */
8
9#include <asm.S>
10
11	.fpu		crypto-neon-fp-armv8
12
	/*
	 * enc_round - one inner AES encryption round on state:
	 * aese = AddRoundKey + SubBytes + ShiftRows, then aesmc = MixColumns.
	 */
	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm
17
	/*
	 * dec_round - one inner AES decryption round on state:
	 * aesd = AddRoundKey + InvSubBytes + InvShiftRows, then
	 * aesimc = InvMixColumns.
	 */
	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm
22
	/*
	 * enc_dround - two consecutive inner encryption rounds on q0,
	 * using round keys key1 then key2.
	 */
	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm
27
	/*
	 * dec_dround - two consecutive inner decryption rounds on q0,
	 * using round keys key1 then key2.
	 */
	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm
32
	/*
	 * enc_fround - final two encryption rounds on q0: one full inner
	 * round with key1, then the last round with key2 (aese only, no
	 * MixColumns) and the final AddRoundKey as a veor with key3.
	 */
	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm
38
	/*
	 * dec_fround - final two decryption rounds on q0: one full inner
	 * round with key1, then the last round with key2 (aesd only, no
	 * InvMixColumns) and the final AddRoundKey as a veor with key3.
	 */
	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm
44
	/*
	 * enc_dround_3x - two inner encryption rounds on three blocks
	 * (q0-q2) in parallel; instructions are interleaved across the
	 * blocks so independent aese/aesmc pairs can pipeline.
	 */
	.macro		enc_dround_3x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	.endm
53
	/*
	 * dec_dround_3x - two inner decryption rounds on three blocks
	 * (q0-q2) in parallel; instructions are interleaved across the
	 * blocks so independent aesd/aesimc pairs can pipeline.
	 */
	.macro		dec_dround_3x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	.endm
62
	/*
	 * enc_fround_3x - final two encryption rounds on q0-q2: one full
	 * inner round with key1, then the last round with key2 (aese
	 * only, no MixColumns) and the final AddRoundKey with key3,
	 * interleaved across the three blocks.
	 */
	.macro		enc_fround_3x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm
74
	/*
	 * dec_fround_3x - final two decryption rounds on q0-q2: one full
	 * inner round with key1, then the last round with key2 (aesd
	 * only, no InvMixColumns) and the final AddRoundKey with key3,
	 * interleaved across the three blocks.
	 */
	.macro		dec_fround_3x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm
86
	/*
	 * do_block - run a complete AES en/decryption on the block(s)
	 * already loaded in q0(-q2).
	 *
	 * On entry q8/q9 hold round keys 1-2, q14 holds the last round
	 * key (both set up by prepare_key), ip points at round key 3 and
	 * r3 holds the number of rounds (10, 12 or 14).  The remaining
	 * round keys are loaded on the fly, alternating between q10/q11
	 * and q12/q13 so the loads can overlap the AES rounds.  dround
	 * performs two inner rounds, fround the final two rounds.
	 */
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.8		{q10-q11}, [ip]!	@ round keys 3-4
	\dround		q8, q9			@ rounds 1-2
	vld1.8		{q12-q13}, [ip]!	@ round keys 5-6
	\dround		q10, q11		@ rounds 3-4
	vld1.8		{q10-q11}, [ip]!	@ round keys 7-8
	\dround		q12, q13		@ rounds 5-6
	vld1.8		{q12-q13}, [ip]!	@ round keys 9-10
	\dround		q10, q11		@ rounds 7-8
	blo		0f			@ AES-128: 10 rounds
	vld1.8		{q10-q11}, [ip]!	@ round keys 11-12
	\dround		q12, q13		@ rounds 9-10
	beq		1f			@ AES-192: 12 rounds
	vld1.8		{q12-q13}, [ip]		@ round keys 13-14 (no writeback)
	\dround		q10, q11		@ rounds 11-12
0:	\fround		q12, q13, q14		@ final two rounds + last key
	bx		lr

1:	\fround		q10, q11, q14		@ final two rounds + last key
	bx		lr
	.endm
109
110	/*
111	 * Internal, non-AAPCS compliant functions that implement the core
112	 * AES transforms. These should preserve all registers except q0 -
113	 * q2 and ip.
114	 * Arguments:
115	 *   q0        : first in/output block
116	 *   q1        : second in/output block (_3x version only)
117	 *   q2        : third in/output block (_3x version only)
118	 *   q8        : first round key
	 *   q9        : second round key
120	 *   q14       : final round key
121	 *   r2        : address of round key array
122	 *   r3        : number of rounds
123	 */
124	.section	.text.ce_aes_helpers
125	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	@ alternate entry with ip already set up; used by
	@ ce_aes_xts_init to run the second key schedule (r6) instead
	@ of the one at r2
	do_block	enc_dround, enc_fround
130
	.align		6
	@ single-block AES decryption of q0 (register contract above)
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
135
	.align		6
	@ three-block AES encryption of q0-q2 (register contract above)
aes_encrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_3x, enc_fround_3x
140
	.align		6
	@ three-block AES decryption of q0-q2 (register contract above)
aes_decrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_3x, dec_fround_3x
145
	/*
	 * prepare_key - load the round keys the aes_* helpers expect in
	 * registers: q8/q9 = first two round keys, q14 = last round key
	 * (at byte offset rounds * 16 into the schedule).  Clobbers ip.
	 */
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4	@ ip = rk + rounds * 16
	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.8		{q14}, [ip]		@ load last round key
	.endm
151
152	/*
153	 * void ce_aes_ecb_encrypt(uint8_t out[], uint8_t const in[],
154	 *			   uint8_t const rk[], int rounds, int blocks,
155	 *			   int first)
156	 */
FUNC ce_aes_ecb_encrypt , :
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th argument)
	@ the 'first' argument (6th) is not referenced here
	prepare_key	r2, r3
.Lecbencloop3x:
	subs		r4, r4, #3
	bmi		.Lecbenc1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!		@ load 3 pt blocks
	vld1.8		{q2}, [r1]!
	bl		aes_encrypt_3x
	vst1.8		{q0-q1}, [r0]!		@ store 3 ct blocks
	vst1.8		{q2}, [r0]!
	b		.Lecbencloop3x
.Lecbenc1x:
	adds		r4, r4, #3		@ undo bias; 0..2 blocks left
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1]!		@ one block at a time
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
END_FUNC ce_aes_ecb_encrypt
182
183	/*
184	 * void ce_aes_ecb_decrypt(uint8_t out[], uint8_t const in[],
185	 *			   uint8_t const rk[], int rounds, int blocks,
186	 *			   int first)
187	 */
FUNC ce_aes_ecb_decrypt , :
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th argument)
	@ the 'first' argument (6th) is not referenced here
	prepare_key	r2, r3
.Lecbdecloop3x:
	subs		r4, r4, #3
	bmi		.Lecbdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!		@ load 3 ct blocks
	vld1.8		{q2}, [r1]!
	bl		aes_decrypt_3x
	vst1.8		{q0-q1}, [r0]!		@ store 3 pt blocks
	vst1.8		{q2}, [r0]!
	b		.Lecbdecloop3x
.Lecbdec1x:
	adds		r4, r4, #3		@ undo bias; 0..2 blocks left
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1]!		@ one block at a time
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
END_FUNC ce_aes_ecb_decrypt
213
214	/*
215	 * void ce_aes_cbc_encrypt(uint8_t out[], uint8_t const in[],
216	 * 			   uint8_t const rk[], int rounds, int blocks,
217	 *			   uint8_t iv[])
218	 */
FUNC ce_aes_cbc_encrypt , :
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q0}, [r5]		@ q0 = iv / previous ct block
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ last ct back to iv buffer
	pop		{r4-r6, pc}
END_FUNC ce_aes_cbc_encrypt
234
235	/*
236	 * void ce_aes_cbc_decrypt(uint8_t out[], uint8_t const in[],
237	 *			   uint8_t const rk[], int rounds, int blocks,
238	 *			   uint8_t iv[])
239	 */
FUNC ce_aes_cbc_decrypt , :
	@ NOTE(review): q4-q6 (d8-d15, callee-saved per AAPCS) are
	@ clobbered below without being preserved; callers are expected
	@ to manage the NEON/VFP context around these calls -- confirm.
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q6}, [r5]		@ keep iv in q6
	prepare_key	r2, r3
.Lcbcdecloop3x:
	subs		r4, r4, #3
	bmi		.Lcbcdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!		@ load 3 ct blocks
	vld1.8		{q2}, [r1]!
	vmov		q3, q0			@ save ct: q0-q2 are decrypted
	vmov		q4, q1			@ in place and each block needs
	vmov		q5, q2			@ the previous ct for the xor
	bl		aes_decrypt_3x
	veor		q0, q0, q6		@ xor with iv / prev ct
	veor		q1, q1, q3
	veor		q2, q2, q4
	vmov		q6, q5			@ last ct becomes the next iv
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	b		.Lcbcdecloop3x
.Lcbcdec1x:
	adds		r4, r4, #3		@ undo bias; 0..2 blocks left
	beq		.Lcbcdecout
	vmov		q15, q14		@ preserve last round key
.Lcbcdecloop:
	@ Fold the CBC xor with the previous ct block into the final
	@ AddRoundKey: q14 = real last key (q15) ^ prev ct, so the
	@ final veor inside dec_fround does both operations at once.
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q6, q0			@ this ct is the next prev ct
	bl		aes_decrypt
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q6}, [r5]		@ keep iv in q6
	pop		{r4-r6, pc}
END_FUNC ce_aes_cbc_decrypt
277
278	/*
279	 * void ce_aes_ctr_encrypt(uint8_t out[], uint8_t const in[],
280	 *			   uint8_t const rk[], int rounds, int blocks,
281	 *			   uint8_t ctr[], int first)
282	 */
FUNC ce_aes_ctr_encrypt , :
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = ctr
	vld1.8		{q6}, [r5]		@ load ctr
	prepare_key	r2, r3
	@ s27 holds the last 4 bytes of the counter block, i.e. the
	@ least significant word of the big-endian counter
	vmov		r6, s27			@ keep swabbed ctr in r6
	rev		r6, r6			@ byte-swap to host order
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop		@ low word may wrap during this
						@ request: use the carry-aware
						@ one-block path throughout
.Lctrloop3x:
	subs		r4, r4, #3
	bmi		.Lctr1x			@ fewer than 3 blocks left
	@ Build ctr, ctr+1, ctr+2 in q0-q2.  Only the low counter word
	@ is patched, which is safe because the check above guarantees
	@ it cannot wrap within this request.
	add		r6, r6, #1
	vmov		q0, q6			@ q0 = ctr
	vmov		q1, q6
	rev		ip, r6			@ ip = BE(ctr+1)
	add		r6, r6, #1
	vmov		q2, q6
	vmov		s7, ip			@ patch low word: q1 = ctr+1
	rev		ip, r6			@ ip = BE(ctr+2)
	add		r6, r6, #1
	vmov		s11, ip			@ patch low word: q2 = ctr+2
	vld1.8		{q3-q4}, [r1]!		@ load 3 pt blocks
	vld1.8		{q5}, [r1]!
	bl		aes_encrypt_3x
	veor		q0, q0, q3		@ xor keystream with pt
	veor		q1, q1, q4
	veor		q2, q2, q5
	rev		ip, r6			@ ip = BE(ctr+3)
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2}, [r0]!
	vmov		s27, ip			@ q6 = ctr+3
	b		.Lctrloop3x
.Lctr1x:
	adds		r4, r4, #3		@ undo bias; 0..2 blocks left
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q6			@ encrypt the counter block
	bl		aes_encrypt
	subs		r4, r4, #1
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vld1.8		{q3}, [r1]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0]!

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s27, ip
	bcs		.Lctrcarry		@ low word wrapped: propagate
	teq		r4, #0
	bne		.Lctrloop
.Lctrout:
	vst1.8		{q6}, [r5]		@ return next ctr to caller
	pop		{r4-r6, pc}

.Lctrtailblock:
	@ Final partial block: hand the raw keystream back and let the
	@ caller perform the xor itself.
	vst1.8		{q0}, [r0, :64]		@ return just the key stream
	pop		{r4-r6, pc}

.Lctrcarry:
	@ Ripple the carry into the more significant counter words
	@ (s26, s25, s24), stopping at the first word that does not
	@ wrap.  The forward label 0 is defined once, after .endr.
	.irp		sreg, s26, s25, s24
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		0f
	.endr
0:	teq		r4, #0
	beq		.Lctrout
	b		.Lctrloop
END_FUNC ce_aes_ctr_encrypt
355
356	/*
357	 * void ce_aes_xts_encrypt(uint8_t out[], uint8_t const in[],
358	 *			   uint8_t const rk1[], int rounds, int blocks,
359	 *			   uint8_t const rk2[], uint8_t iv[])
360	 * void ce_aes_xts_decrypt(uint8_t out[], uint8_t const in[],
361	 *			   uint8_t const rk1[], int rounds, int blocks,
362	 *			   uint8_t const rk2[], uint8_t iv[]);
363	 */
364
	/*
	 * next_tweak - multiply the XTS tweak by x in GF(2^128):
	 * out = 2 * in, reduced by the polynomial 0x87 when the top bit
	 * shifts out.  const must hold {1, 0x87} (q7); tmp is scratch.
	 * vadd.u64 doubles the two 64-bit halves independently; the
	 * masked, halves-swapped tmp supplies both the carry from the
	 * low half into the high half and the 0x87 reduction.
	 */
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63		@ replicate the sign bits
	vand		\tmp, \tmp, \const	@ mask to {1, 0x87}
	vadd.u64	\out, \in, \in		@ shift each half left by 1
	vext.8		\tmp, \tmp, \tmp, #8	@ swap halves of tmp
	veor		\out, \out, \tmp	@ apply carry + reduction
	.endm
372
	/*
	 * ce_aes_xts_init - shared prologue for the XTS entry points.
	 *
	 * Loads the tweak constant {1, 0x87} into q7, fetches the
	 * stacked arguments (r4 = blocks, r5 = iv, r6 = key schedule 2;
	 * offsets assume the caller pushed {r4-r6, lr}) and computes the
	 * initial tweak in q0 by encrypting the iv with key 2.  The
	 * final branch is a tail call: .Laes_encrypt_tweak ends in
	 * bx lr, which returns directly to this function's caller.
	 * Clobbers r4-r6, ip, q0, q7 and q8-q14 (via prepare_key).
	 */
LOCAL_FUNC ce_aes_xts_init , :
	vldr		d14, .Lxts_mul_x	@ q7 = {1, 0x87}
	vldr		d15, .Lxts_mul_x + 8

	ldr		r4, [sp, #16]		@ load args: r4 = blocks
	ldr		r5, [sp, #24]		@ r5 = iv
	vld1.8		{q0}, [r5]		@ load iv

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #20]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call

	.align		3
.Lxts_mul_x:
	.quad		1, 0x87
END_FUNC ce_aes_xts_init
392
FUNC ce_aes_xts_encrypt , :
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	@ register roles from here on:
	@   q0-q2 data blocks, q3 current tweak, q4/q5 further tweaks,
	@   q6 scratch, q7 = {1, 0x87}, r4 = blocks, r5 = iv pointer
	prepare_key	r2, r3
	vmov		q3, q0			@ q3 = encrypted iv = tweak 1

	@ NOTE(review): r6 holds the key-2 pointer loaded inside
	@ ce_aes_xts_init, so it is normally non-zero and this branch is
	@ effectively always taken (the fresh tweak must not be advanced
	@ before the first block) -- confirm intended semantics.
	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc3x

.Lxtsencloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsenc3x:
	subs		r4, r4, #3
	bmi		.Lxtsenc1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!		@ get 3 pt blocks
	vld1.8		{q2}, [r1]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3		@ pre-whiten with the tweaks
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_encrypt_3x
	veor		q0, q0, q3		@ post-whiten with the tweaks
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0]!		@ write 3 ct blocks
	vst1.8		{q2}, [r0]!
	vmov		q3, q5			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsencout
	b		.Lxtsencloop3x
.Lxtsenc1x:
	adds		r4, r4, #3		@ undo bias; 0..2 blocks left
	beq		.Lxtsencout
.Lxtsencloop:
	vld1.8		{q0}, [r1]!
	veor		q0, q0, q3		@ pre-whiten
	bl		aes_encrypt
	veor		q0, q0, q3		@ post-whiten
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	beq		.Lxtsencout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsencloop
.Lxtsencout:
	next_tweak	q3, q3, q7, q6		@ advance and store the tweak
	vst1.8		{q3}, [r5]		@ back through the iv pointer
	pop		{r4-r6, pc}
END_FUNC ce_aes_xts_encrypt
443
FUNC ce_aes_xts_decrypt , :
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	@ register roles from here on:
	@   q0-q2 data blocks, q3 current tweak, q4/q5 further tweaks,
	@   q6 scratch, q7 = {1, 0x87}, r4 = blocks, r5 = iv pointer
	prepare_key	r2, r3
	vmov		q3, q0			@ q3 = encrypted iv = tweak 1

	@ NOTE(review): r6 holds the key-2 pointer loaded inside
	@ ce_aes_xts_init, so it is normally non-zero and this branch is
	@ effectively always taken (the fresh tweak must not be advanced
	@ before the first block) -- confirm intended semantics.
	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec3x

.Lxtsdecloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsdec3x:
	subs		r4, r4, #3
	bmi		.Lxtsdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1]!		@ get 3 ct blocks
	vld1.8		{q2}, [r1]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3		@ pre-whiten with the tweaks
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_decrypt_3x
	veor		q0, q0, q3		@ post-whiten with the tweaks
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0]!		@ write 3 pt blocks
	vst1.8		{q2}, [r0]!
	vmov		q3, q5			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop3x
.Lxtsdec1x:
	adds		r4, r4, #3		@ undo bias; 0..2 blocks left
	beq		.Lxtsdecout
.Lxtsdecloop:
	vld1.8		{q0}, [r1]!
	veor		q0, q0, q3		@ pre-whiten
	@ (a redundant 'add ip, r2, #32' was removed here: aes_decrypt
	@ sets up ip itself as its first instruction)
	bl		aes_decrypt
	veor		q0, q0, q3		@ post-whiten
	vst1.8		{q0}, [r0]!
	subs		r4, r4, #1
	beq		.Lxtsdecout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	next_tweak	q3, q3, q7, q6		@ advance and store the tweak
	vst1.8		{q3}, [r5]		@ back through the iv pointer
	pop		{r4-r6, pc}
END_FUNC ce_aes_xts_decrypt
495
496	/*
497	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
498	 *                             AES sbox substitution on each byte in
499	 *                             'input'
500	 */
FUNC ce_aes_sub , :
	@ Apply the AES sbox to each byte of r0 by running one aese
	@ against an all-zero state: aese does AddRoundKey + SubBytes +
	@ ShiftRows, and with all four words identical (vdup) ShiftRows
	@ leaves word 0 unchanged, so word 0 is SubBytes(input).
	veor		q2, q2, q2		@ q2 = all-zero state
	vdup.32		q3, r0			@ broadcast input to all words
	aese.8		q2, q3
	vmov		r0, s8			@ return word 0 of the result
	bx		lr
END_FUNC ce_aes_sub
508
509	/*
510	 * void ce_aes_invert(void *dst, const void *src)
511	 *
512	 * perform the Inverse MixColumns operation on round key in
513	 */
FUNC ce_aes_invert , :
	@ Run the Inverse MixColumns transform on the 16-byte round key
	@ at src (r1) and write the result to dst (r0).
	vld1.8		{q1}, [r1]		@ q1 = src round key
	aesimc.8	q1, q1			@ InvMixColumns
	vst1.8		{q1}, [r0]		@ store to dst
	bx		lr
END_FUNC ce_aes_invert
520
521	/*
522	 * void ce_aes_xor_block(uint8_t out[], uint8_t const op1[],
523	 *			 uint8_t const op2[]);
524	 */
FUNC ce_aes_xor_block , :
	@ out = op1 ^ op2 for one 16-byte block.
	vld1.8		{q2}, [r2]		@ q2 = op2
	vld1.8		{q3}, [r1]		@ q3 = op1
	veor		q2, q2, q3		@ xor the two blocks
	vst1.8		{q2}, [r0]		@ store to out
	bx		lr
END_FUNC ce_aes_xor_block
532