/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * Copyright (c) 2016, 2020 Linaro Limited
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 */

#include <asm.S>

        .fpu            crypto-neon-fp-armv8

        .macro          enc_round, state, key
        aese.8          \state, \key
        aesmc.8         \state, \state
        .endm

        .macro          dec_round, state, key
        aesd.8          \state, \key
        aesimc.8        \state, \state
        .endm

        .macro          enc_dround, key1, key2
        enc_round       q0, \key1
        enc_round       q0, \key2
        .endm

        .macro          dec_dround, key1, key2
        dec_round       q0, \key1
        dec_round       q0, \key2
        .endm

        .macro          enc_fround, key1, key2, key3
        enc_round       q0, \key1
        aese.8          q0, \key2
        veor            q0, q0, \key3
        .endm

        .macro          dec_fround, key1, key2, key3
        dec_round       q0, \key1
        aesd.8          q0, \key2
        veor            q0, q0, \key3
        .endm

        .macro          enc_dround_3x, key1, key2
        enc_round       q0, \key1
        enc_round       q1, \key1
        enc_round       q2, \key1
        enc_round       q0, \key2
        enc_round       q1, \key2
        enc_round       q2, \key2
        .endm

        .macro          dec_dround_3x, key1, key2
        dec_round       q0, \key1
        dec_round       q1, \key1
        dec_round       q2, \key1
        dec_round       q0, \key2
        dec_round       q1, \key2
        dec_round       q2, \key2
        .endm

        .macro          enc_fround_3x, key1, key2, key3
        enc_round       q0, \key1
        enc_round       q1, \key1
        enc_round       q2, \key1
        aese.8          q0, \key2
        aese.8          q1, \key2
        aese.8          q2, \key2
        veor            q0, q0, \key3
        veor            q1, q1, \key3
        veor            q2, q2, \key3
        .endm

        .macro          dec_fround_3x, key1, key2, key3
        dec_round       q0, \key1
        dec_round       q1, \key1
        dec_round       q2, \key1
        aesd.8          q0, \key2
        aesd.8          q1, \key2
        aesd.8          q2, \key2
        veor            q0, q0, \key3
        veor            q1, q1, \key3
        veor            q2, q2, \key3
        .endm

        .macro          do_block, dround, fround
        cmp             r3, #12                 @ which key size?
        vld1.8          {q10-q11}, [ip]!
        \dround         q8, q9
        vld1.8          {q12-q13}, [ip]!
        \dround         q10, q11
        vld1.8          {q10-q11}, [ip]!
        \dround         q12, q13
        vld1.8          {q12-q13}, [ip]!
        \dround         q10, q11
        blo             0f                      @ AES-128: 10 rounds
        vld1.8          {q10-q11}, [ip]!
        \dround         q12, q13
        beq             1f                      @ AES-192: 12 rounds
        vld1.8          {q12-q13}, [ip]
        \dround         q10, q11
0:      \fround         q12, q13, q14
        bx              lr

1:      \fround         q10, q11, q14
        bx              lr
        .endm
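
        /*
         * Note on the round key layout assumed by the do_block macro above
         * and the prepare_key macro below: the schedule is (rounds + 1)
         * consecutive 16-byte round keys, with rounds = 10, 12 or 14 for
         * AES-128/192/256 (hence the cmp/blo/beq dispatch on r3 above).
         * A minimal C-style sketch of that layout, for illustration only
         * (the struct and field names are hypothetical and not used by
         * this file or its callers):
         *
         *      struct aes_ce_key {
         *              uint8_t rk[(14 + 1) * 16];      // up to 15 round keys
         *              int rounds;                     // 10, 12 or 14
         *      };
         *
         * Callers pass r2 = key->rk and r3 = key->rounds; prepare_key then
         * loads q8/q9 from rk[0..31] and q14 from rk[16 * rounds].
         */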
        /*
         * Internal, non-AAPCS compliant functions that implement the core
         * AES transforms. These should preserve all registers except
         * q0 - q2 and ip.
         *
         * Arguments:
         * q0        : first in/output block
         * q1        : second in/output block (_3x version only)
         * q2        : third in/output block (_3x version only)
         * q8        : first round key
         * q9        : second round key
         * q14       : final round key
         * r2        : address of round key array
         * r3        : number of rounds
         */
        .section        .text.ce_aes_helpers
        .align          6
aes_encrypt:
        add             ip, r2, #32             @ 3rd round key
.Laes_encrypt_tweak:
        do_block        enc_dround, enc_fround

        .align          6
aes_decrypt:
        add             ip, r2, #32             @ 3rd round key
        do_block        dec_dround, dec_fround

        .align          6
aes_encrypt_3x:
        add             ip, r2, #32             @ 3rd round key
        do_block        enc_dround_3x, enc_fround_3x

        .align          6
aes_decrypt_3x:
        add             ip, r2, #32             @ 3rd round key
        do_block        dec_dround_3x, dec_fround_3x

        .macro          prepare_key, rk, rounds
        add             ip, \rk, \rounds, lsl #4
        vld1.8          {q8-q9}, [\rk]          @ load first 2 round keys
        vld1.8          {q14}, [ip]             @ load last round key
        .endm

        /*
         * void ce_aes_ecb_encrypt(uint8_t out[], uint8_t const in[],
         *                         uint8_t const rk[], int rounds, int blocks,
         *                         int first)
         */
FUNC ce_aes_ecb_encrypt , :
        push            {r4, lr}
        ldr             r4, [sp, #8]
        prepare_key     r2, r3
.Lecbencloop3x:
        subs            r4, r4, #3
        bmi             .Lecbenc1x
        vld1.8          {q0-q1}, [r1]!
        vld1.8          {q2}, [r1]!
        bl              aes_encrypt_3x
        vst1.8          {q0-q1}, [r0]!
        vst1.8          {q2}, [r0]!
        b               .Lecbencloop3x
.Lecbenc1x:
        adds            r4, r4, #3
        beq             .Lecbencout
.Lecbencloop:
        vld1.8          {q0}, [r1]!
        bl              aes_encrypt
        vst1.8          {q0}, [r0]!
        subs            r4, r4, #1
        bne             .Lecbencloop
.Lecbencout:
        pop             {r4, pc}
END_FUNC ce_aes_ecb_encrypt

        /*
         * void ce_aes_ecb_decrypt(uint8_t out[], uint8_t const in[],
         *                         uint8_t const rk[], int rounds, int blocks,
         *                         int first)
         */
FUNC ce_aes_ecb_decrypt , :
        push            {r4, lr}
        ldr             r4, [sp, #8]
        prepare_key     r2, r3
.Lecbdecloop3x:
        subs            r4, r4, #3
        bmi             .Lecbdec1x
        vld1.8          {q0-q1}, [r1]!
        vld1.8          {q2}, [r1]!
        bl              aes_decrypt_3x
        vst1.8          {q0-q1}, [r0]!
        vst1.8          {q2}, [r0]!
        b               .Lecbdecloop3x
.Lecbdec1x:
        adds            r4, r4, #3
        beq             .Lecbdecout
.Lecbdecloop:
        vld1.8          {q0}, [r1]!
        bl              aes_decrypt
        vst1.8          {q0}, [r0]!
        subs            r4, r4, #1
        bne             .Lecbdecloop
.Lecbdecout:
        pop             {r4, pc}
END_FUNC ce_aes_ecb_decrypt

        /*
         * void ce_aes_cbc_encrypt(uint8_t out[], uint8_t const in[],
         *                         uint8_t const rk[], int rounds, int blocks,
         *                         uint8_t iv[])
         */
FUNC ce_aes_cbc_encrypt , :
        push            {r4-r6, lr}
        ldrd            r4, r5, [sp, #16]
        vld1.8          {q0}, [r5]
        prepare_key     r2, r3
.Lcbcencloop:
        vld1.8          {q1}, [r1]!             @ get next pt block
        veor            q0, q0, q1              @ ..and xor with iv
        bl              aes_encrypt
        vst1.8          {q0}, [r0]!
        subs            r4, r4, #1
        bne             .Lcbcencloop
        vst1.8          {q0}, [r5]
        pop             {r4-r6, pc}
END_FUNC ce_aes_cbc_encrypt
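
        /*
         * For reference, ce_aes_cbc_encrypt above implements the usual CBC
         * chaining: each plaintext block is xored with the previous
         * ciphertext block (the IV for the first one) before encryption,
         * and the final ciphertext block is written back to iv[] so a
         * follow-up call can continue the chain.  A minimal C model, for
         * illustration only (aes_enc_block() is a hypothetical single-block
         * ECB encryption standing in for aes_encrypt above):
         *
         *      void cbc_encrypt_model(uint8_t *out, const uint8_t *in,
         *                             int blocks, uint8_t iv[16])
         *      {
         *              for (int i = 0; i < blocks; i++, in += 16, out += 16) {
         *                      for (int j = 0; j < 16; j++)
         *                              out[j] = in[j] ^ iv[j];
         *                      aes_enc_block(out);     // encrypt in place
         *                      for (int j = 0; j < 16; j++)
         *                              iv[j] = out[j]; // ct becomes next iv
         *              }
         *      }
         */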
        /*
         * void ce_aes_cbc_decrypt(uint8_t out[], uint8_t const in[],
         *                         uint8_t const rk[], int rounds, int blocks,
         *                         uint8_t iv[])
         */
FUNC ce_aes_cbc_decrypt , :
        push            {r4-r6, lr}
        ldrd            r4, r5, [sp, #16]
        vld1.8          {q6}, [r5]              @ keep iv in q6
        prepare_key     r2, r3
.Lcbcdecloop3x:
        subs            r4, r4, #3
        bmi             .Lcbcdec1x
        vld1.8          {q0-q1}, [r1]!
        vld1.8          {q2}, [r1]!
        vmov            q3, q0
        vmov            q4, q1
        vmov            q5, q2
        bl              aes_decrypt_3x
        veor            q0, q0, q6
        veor            q1, q1, q3
        veor            q2, q2, q4
        vmov            q6, q5
        vst1.8          {q0-q1}, [r0]!
        vst1.8          {q2}, [r0]!
        b               .Lcbcdecloop3x
.Lcbcdec1x:
        adds            r4, r4, #3
        beq             .Lcbcdecout
        vmov            q15, q14                @ preserve last round key
.Lcbcdecloop:
        vld1.8          {q0}, [r1]!             @ get next ct block
        veor            q14, q15, q6            @ combine prev ct with last key
        vmov            q6, q0
        bl              aes_decrypt
        vst1.8          {q0}, [r0]!
        subs            r4, r4, #1
        bne             .Lcbcdecloop
.Lcbcdecout:
        vst1.8          {q6}, [r5]              @ keep iv in q6
        pop             {r4-r6, pc}
END_FUNC ce_aes_cbc_decrypt

        /*
         * void ce_aes_ctr_encrypt(uint8_t out[], uint8_t const in[],
         *                         uint8_t const rk[], int rounds, int blocks,
         *                         uint8_t ctr[], int first)
         */
FUNC ce_aes_ctr_encrypt , :
        push            {r4-r6, lr}
        ldrd            r4, r5, [sp, #16]
        vld1.8          {q6}, [r5]              @ load ctr
        prepare_key     r2, r3
        vmov            r6, s27                 @ keep swabbed ctr in r6
        rev             r6, r6
        cmn             r6, r4                  @ 32 bit overflow?
        bcs             .Lctrloop
.Lctrloop3x:
        subs            r4, r4, #3
        bmi             .Lctr1x
        add             r6, r6, #1
        vmov            q0, q6
        vmov            q1, q6
        rev             ip, r6
        add             r6, r6, #1
        vmov            q2, q6
        vmov            s7, ip
        rev             ip, r6
        add             r6, r6, #1
        vmov            s11, ip
        vld1.8          {q3-q4}, [r1]!
        vld1.8          {q5}, [r1]!
        bl              aes_encrypt_3x
        veor            q0, q0, q3
        veor            q1, q1, q4
        veor            q2, q2, q5
        rev             ip, r6
        vst1.8          {q0-q1}, [r0]!
        vst1.8          {q2}, [r0]!
        vmov            s27, ip
        b               .Lctrloop3x
.Lctr1x:
        adds            r4, r4, #3
        beq             .Lctrout
.Lctrloop:
        vmov            q0, q6
        bl              aes_encrypt
        subs            r4, r4, #1
        bmi             .Lctrtailblock          @ blocks < 0 means tail block
        vld1.8          {q3}, [r1]!
        veor            q3, q0, q3
        vst1.8          {q3}, [r0]!

        adds            r6, r6, #1              @ increment BE ctr
        rev             ip, r6
        vmov            s27, ip
        bcs             .Lctrcarry
        teq             r4, #0
        bne             .Lctrloop
.Lctrout:
        vst1.8          {q6}, [r5]
        pop             {r4-r6, pc}

.Lctrtailblock:
        vst1.8          {q0}, [r0, :64]         @ return just the key stream
        pop             {r4-r6, pc}

.Lctrcarry:
        .irp            sreg, s26, s25, s24
        vmov            ip, \sreg               @ load next word of ctr
        rev             ip, ip                  @ ... to handle the carry
        adds            ip, ip, #1
        rev             ip, ip
        vmov            \sreg, ip
        bcc             0f
        .endr
0:      teq             r4, #0
        beq             .Lctrout
        b               .Lctrloop
END_FUNC ce_aes_ctr_encrypt
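
        /*
         * ce_aes_ctr_encrypt above keeps the low 32 bits of the big-endian
         * counter swabbed in r6 and only takes the .Lctrcarry path when
         * those 32 bits wrap, rippling the carry into the higher counter
         * words (s26, s25, s24) of q6.  A minimal C model of the overall
         * increment, for illustration only (it operates on the 16-byte
         * big-endian counter block as the loop above effectively does):
         *
         *      static void ctr_increment_model(uint8_t ctr[16])
         *      {
         *              // add 1 to the 128-bit big-endian value, rippling
         *              // the carry from the last byte towards the first
         *              for (int i = 15; i >= 0; i--)
         *                      if (++ctr[i] != 0)
         *                              break;
         *      }
         */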
        /*
         * void ce_aes_xts_encrypt(uint8_t out[], uint8_t const in[],
         *                         uint8_t const rk1[], int rounds, int blocks,
         *                         uint8_t const rk2[], uint8_t iv[])
         * void ce_aes_xts_decrypt(uint8_t out[], uint8_t const in[],
         *                         uint8_t const rk1[], int rounds, int blocks,
         *                         uint8_t const rk2[], uint8_t iv[]);
         */

        .macro          next_tweak, out, in, const, tmp
        vshr.s64        \tmp, \in, #63
        vand            \tmp, \tmp, \const
        vadd.u64        \out, \in, \in
        vext.8          \tmp, \tmp, \tmp, #8
        veor            \out, \out, \tmp
        .endm

LOCAL_FUNC ce_aes_xts_init , :
        vldr            d14, .Lxts_mul_x
        vldr            d15, .Lxts_mul_x + 8

        ldr             r4, [sp, #16]           @ load args
        ldr             r5, [sp, #24]
        vld1.8          {q0}, [r5]              @ load iv

        @ Encrypt the IV in q0 with the second AES key. This should only
        @ be done at the start of a block.
        ldr             r6, [sp, #20]           @ load AES key 2
        prepare_key     r6, r3
        add             ip, r6, #32             @ 3rd round key of key 2
        b               .Laes_encrypt_tweak     @ tail call

        .align          3
.Lxts_mul_x:
        .quad           1, 0x87
END_FUNC ce_aes_xts_init

FUNC ce_aes_xts_encrypt , :
        push            {r4-r6, lr}

        bl              ce_aes_xts_init         @ run shared prologue
        prepare_key     r2, r3
        vmov            q3, q0

        teq             r6, #0                  @ start of a block?
        bne             .Lxtsenc3x

.Lxtsencloop3x:
        next_tweak      q3, q3, q7, q6
.Lxtsenc3x:
        subs            r4, r4, #3
        bmi             .Lxtsenc1x
        vld1.8          {q0-q1}, [r1]!          @ get 3 pt blocks
        vld1.8          {q2}, [r1]!
        next_tweak      q4, q3, q7, q6
        veor            q0, q0, q3
        next_tweak      q5, q4, q7, q6
        veor            q1, q1, q4
        veor            q2, q2, q5
        bl              aes_encrypt_3x
        veor            q0, q0, q3
        veor            q1, q1, q4
        veor            q2, q2, q5
        vst1.8          {q0-q1}, [r0]!          @ write 3 ct blocks
        vst1.8          {q2}, [r0]!
        vmov            q3, q5
        teq             r4, #0
        beq             .Lxtsencout
        b               .Lxtsencloop3x
.Lxtsenc1x:
        adds            r4, r4, #3
        beq             .Lxtsencout
.Lxtsencloop:
        vld1.8          {q0}, [r1]!
        veor            q0, q0, q3
        bl              aes_encrypt
        veor            q0, q0, q3
        vst1.8          {q0}, [r0]!
        subs            r4, r4, #1
        beq             .Lxtsencout
        next_tweak      q3, q3, q7, q6
        b               .Lxtsencloop
.Lxtsencout:
        next_tweak      q3, q3, q7, q6
        vst1.8          {q3}, [r5]
        pop             {r4-r6, pc}
END_FUNC ce_aes_xts_encrypt

FUNC ce_aes_xts_decrypt , :
        push            {r4-r6, lr}

        bl              ce_aes_xts_init         @ run shared prologue
        prepare_key     r2, r3
        vmov            q3, q0

        teq             r6, #0                  @ start of a block?
        bne             .Lxtsdec3x

.Lxtsdecloop3x:
        next_tweak      q3, q3, q7, q6
.Lxtsdec3x:
        subs            r4, r4, #3
        bmi             .Lxtsdec1x
        vld1.8          {q0-q1}, [r1]!          @ get 3 ct blocks
        vld1.8          {q2}, [r1]!
        next_tweak      q4, q3, q7, q6
        veor            q0, q0, q3
        next_tweak      q5, q4, q7, q6
        veor            q1, q1, q4
        veor            q2, q2, q5
        bl              aes_decrypt_3x
        veor            q0, q0, q3
        veor            q1, q1, q4
        veor            q2, q2, q5
        vst1.8          {q0-q1}, [r0]!          @ write 3 pt blocks
        vst1.8          {q2}, [r0]!
        vmov            q3, q5
        teq             r4, #0
        beq             .Lxtsdecout
        b               .Lxtsdecloop3x
.Lxtsdec1x:
        adds            r4, r4, #3
        beq             .Lxtsdecout
.Lxtsdecloop:
        vld1.8          {q0}, [r1]!
        veor            q0, q0, q3
        add             ip, r2, #32             @ 3rd round key
        bl              aes_decrypt
        veor            q0, q0, q3
        vst1.8          {q0}, [r0]!
        subs            r4, r4, #1
        beq             .Lxtsdecout
        next_tweak      q3, q3, q7, q6
        b               .Lxtsdecloop
.Lxtsdecout:
        next_tweak      q3, q3, q7, q6
        vst1.8          {q3}, [r5]
        pop             {r4-r6, pc}
END_FUNC ce_aes_xts_decrypt

        /*
         * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
         *                             AES sbox substitution on each byte in
         *                             'input'
         */
FUNC ce_aes_sub , :
        vdup.32         q1, r0
        veor            q0, q0, q0
        aese.8          q0, q1
        vmov            r0, s0
        bx              lr
END_FUNC ce_aes_sub

        /*
         * void ce_aes_invert(void *dst, const void *src)
         *
         * Perform the Inverse MixColumns operation on the AES round key
         * in 'src' and store the result in 'dst'.
         */
FUNC ce_aes_invert , :
        vld1.8          {q0}, [r1]
        aesimc.8        q0, q0
        vst1.8          {q0}, [r0]
        bx              lr
END_FUNC ce_aes_invert

        /*
         * void ce_aes_xor_block(uint8_t out[], uint8_t const op1[],
         *                       uint8_t const op2[]);
         */
FUNC ce_aes_xor_block , :
        vld1.8          {q0}, [r1]
        vld1.8          {q1}, [r2]
        veor            q0, q0, q1
        vst1.8          {q0}, [r0]
        bx              lr
END_FUNC ce_aes_xor_block
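
        /*
         * ce_aes_sub() and ce_aes_invert() above are building blocks for
         * the AES key schedule, which lives in the C glue code rather than
         * in this file.  As an illustration only (expand_step_128(), ror32()
         * and rcon[] are hypothetical helper names, following the standard
         * FIPS-197 expansion), one AES-128 expansion step might use
         * ce_aes_sub() like this:
         *
         *      static const uint32_t rcon[] = {
         *              0x01, 0x02, 0x04, 0x08, 0x10,
         *              0x20, 0x40, 0x80, 0x1b, 0x36,
         *      };
         *
         *      // derive round key i + 1 (4 words) from round key i,
         *      // with the key words held in little-endian byte order
         *      static void expand_step_128(const uint32_t rki[4],
         *                                  uint32_t rko[4], int i)
         *      {
         *              // ce_aes_sub() applies SubBytes to each byte of the
         *              // word; ror32(..., 8) performs the RotWord step
         *              rko[0] = ror32(ce_aes_sub(rki[3]), 8) ^ rcon[i] ^ rki[0];
         *              rko[1] = rko[0] ^ rki[1];
         *              rko[2] = rko[1] ^ rki[2];
         *              rko[3] = rko[2] ^ rki[3];
         *      }
         *
         * A decryption key schedule can then be derived by running
         * ce_aes_invert() over the middle round keys, as usual for the
         * equivalent inverse cipher.
         */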