/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated chacha20 implementation for ppc64le.
#
# Copyright 2023- IBM Corp. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# do rounds, 8 quarter rounds
# 1. a += b; d ^= a; d <<<= 16;
# 2. c += d; b ^= c; b <<<= 12;
# 3. a += b; d ^= a; d <<<= 8;
# 4. c += d; b ^= c; b <<<= 7
#
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
#
# 4 blocks (a b c d)
#
# a0 b0 c0 d0
# a1 b1 c1 d1
# ...
# a4 b4 c4 d4
# ...
# a8 b8 c8 d8
# ...
# a12 b12 c12 d12
# a13 ...
# a14 ...
# a15 b15 c15 d15
#
# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
#
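#
# For reference only (not assembled): a plain C sketch of the quarter round
# and one double round that the vector code below implements, using u32 and
# rol32() as in <linux/types.h> and <linux/bitops.h>:
#
#   static inline void qr(u32 *a, u32 *b, u32 *c, u32 *d)
#   {
#           *a += *b; *d ^= *a; *d = rol32(*d, 16);
#           *c += *d; *b ^= *c; *b = rol32(*b, 12);
#           *a += *b; *d ^= *a; *d = rol32(*d, 8);
#           *c += *d; *b ^= *c; *b = rol32(*b, 7);
#   }
#
#   /* one double round over the 16-word state x[] */
#   qr(&x[0], &x[4], &x[8],  &x[12]);   /* column rounds   */
#   qr(&x[1], &x[5], &x[9],  &x[13]);
#   qr(&x[2], &x[6], &x[10], &x[14]);
#   qr(&x[3], &x[7], &x[11], &x[15]);
#   qr(&x[0], &x[5], &x[10], &x[15]);   /* diagonal rounds */
#   qr(&x[1], &x[6], &x[11], &x[12]);
#   qr(&x[2], &x[7], &x[8],  &x[13]);
#   qr(&x[3], &x[4], &x[9],  &x[14]);
#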

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/asm-compat.h>
#include <linux/linkage.h>

.machine "any"
.text

.macro SAVE_GPR GPR OFFSET FRAME
        std \GPR,\OFFSET(\FRAME)
.endm

.macro SAVE_VRS VRS OFFSET FRAME
        li 16, \OFFSET
        stvx \VRS, 16, \FRAME
.endm

.macro SAVE_VSX VSX OFFSET FRAME
        li 16, \OFFSET
        stxvx \VSX, 16, \FRAME
.endm

.macro RESTORE_GPR GPR OFFSET FRAME
        ld \GPR,\OFFSET(\FRAME)
.endm

.macro RESTORE_VRS VRS OFFSET FRAME
        li 16, \OFFSET
        lvx \VRS, 16, \FRAME
.endm

.macro RESTORE_VSX VSX OFFSET FRAME
        li 16, \OFFSET
        lxvx \VSX, 16, \FRAME
.endm

.macro SAVE_REGS
        mflr 0
        std 0, 16(1)
        stdu 1,-752(1)

        SAVE_GPR 14, 112, 1
        SAVE_GPR 15, 120, 1
        SAVE_GPR 16, 128, 1
        SAVE_GPR 17, 136, 1
        SAVE_GPR 18, 144, 1
        SAVE_GPR 19, 152, 1
        SAVE_GPR 20, 160, 1
        SAVE_GPR 21, 168, 1
        SAVE_GPR 22, 176, 1
        SAVE_GPR 23, 184, 1
        SAVE_GPR 24, 192, 1
        SAVE_GPR 25, 200, 1
        SAVE_GPR 26, 208, 1
        SAVE_GPR 27, 216, 1
        SAVE_GPR 28, 224, 1
        SAVE_GPR 29, 232, 1
        SAVE_GPR 30, 240, 1
        SAVE_GPR 31, 248, 1

        addi 9, 1, 256
        SAVE_VRS 20, 0, 9
        SAVE_VRS 21, 16, 9
        SAVE_VRS 22, 32, 9
        SAVE_VRS 23, 48, 9
        SAVE_VRS 24, 64, 9
        SAVE_VRS 25, 80, 9
        SAVE_VRS 26, 96, 9
        SAVE_VRS 27, 112, 9
        SAVE_VRS 28, 128, 9
        SAVE_VRS 29, 144, 9
        SAVE_VRS 30, 160, 9
        SAVE_VRS 31, 176, 9

        SAVE_VSX 14, 192, 9
        SAVE_VSX 15, 208, 9
        SAVE_VSX 16, 224, 9
        SAVE_VSX 17, 240, 9
        SAVE_VSX 18, 256, 9
        SAVE_VSX 19, 272, 9
        SAVE_VSX 20, 288, 9
        SAVE_VSX 21, 304, 9
        SAVE_VSX 22, 320, 9
        SAVE_VSX 23, 336, 9
        SAVE_VSX 24, 352, 9
        SAVE_VSX 25, 368, 9
        SAVE_VSX 26, 384, 9
        SAVE_VSX 27, 400, 9
        SAVE_VSX 28, 416, 9
        SAVE_VSX 29, 432, 9
        SAVE_VSX 30, 448, 9
        SAVE_VSX 31, 464, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
        addi 9, 1, 256
        RESTORE_VRS 20, 0, 9
        RESTORE_VRS 21, 16, 9
        RESTORE_VRS 22, 32, 9
        RESTORE_VRS 23, 48, 9
        RESTORE_VRS 24, 64, 9
        RESTORE_VRS 25, 80, 9
        RESTORE_VRS 26, 96, 9
        RESTORE_VRS 27, 112, 9
        RESTORE_VRS 28, 128, 9
        RESTORE_VRS 29, 144, 9
        RESTORE_VRS 30, 160, 9
        RESTORE_VRS 31, 176, 9

        RESTORE_VSX 14, 192, 9
        RESTORE_VSX 15, 208, 9
        RESTORE_VSX 16, 224, 9
        RESTORE_VSX 17, 240, 9
        RESTORE_VSX 18, 256, 9
        RESTORE_VSX 19, 272, 9
        RESTORE_VSX 20, 288, 9
        RESTORE_VSX 21, 304, 9
        RESTORE_VSX 22, 320, 9
        RESTORE_VSX 23, 336, 9
        RESTORE_VSX 24, 352, 9
        RESTORE_VSX 25, 368, 9
        RESTORE_VSX 26, 384, 9
        RESTORE_VSX 27, 400, 9
        RESTORE_VSX 28, 416, 9
        RESTORE_VSX 29, 432, 9
        RESTORE_VSX 30, 448, 9
        RESTORE_VSX 31, 464, 9

        RESTORE_GPR 14, 112, 1
        RESTORE_GPR 15, 120, 1
        RESTORE_GPR 16, 128, 1
        RESTORE_GPR 17, 136, 1
        RESTORE_GPR 18, 144, 1
        RESTORE_GPR 19, 152, 1
        RESTORE_GPR 20, 160, 1
        RESTORE_GPR 21, 168, 1
        RESTORE_GPR 22, 176, 1
        RESTORE_GPR 23, 184, 1
        RESTORE_GPR 24, 192, 1
        RESTORE_GPR 25, 200, 1
        RESTORE_GPR 26, 208, 1
        RESTORE_GPR 27, 216, 1
        RESTORE_GPR 28, 224, 1
        RESTORE_GPR 29, 232, 1
        RESTORE_GPR 30, 240, 1
        RESTORE_GPR 31, 248, 1

        addi 1, 1, 752
        ld 0, 16(1)
        mtlr 0
.endm # RESTORE_REGS

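#
# QT_loop_8x runs one double round over eight interleaved blocks: v0-v15
# carry blocks 0-3 and v16-v31 carry blocks 4-7, one state word per vector
# register with one block per lane.  Since every vector register is live,
# the rotate counts and vpermxor masks are kept in vs20-vs23 and swapped
# into v25/v28 around the instructions that need them, with vs0 used as
# scratch to preserve the displaced state word (the xxlor pairs below).
# The rotations by 16 and by 8 are fused with the xor via vpermxor, while
# the rotations by 12 and by 7 use vrlw.
#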
.macro QT_loop_8x
        # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
        xxlor 0, 32+25, 32+25
        xxlor 32+25, 20, 20
        vadduwm 0, 0, 4
        vadduwm 1, 1, 5
        vadduwm 2, 2, 6
        vadduwm 3, 3, 7
        vadduwm 16, 16, 20
        vadduwm 17, 17, 21
        vadduwm 18, 18, 22
        vadduwm 19, 19, 23

        vpermxor 12, 12, 0, 25
        vpermxor 13, 13, 1, 25
        vpermxor 14, 14, 2, 25
        vpermxor 15, 15, 3, 25
        vpermxor 28, 28, 16, 25
        vpermxor 29, 29, 17, 25
        vpermxor 30, 30, 18, 25
        vpermxor 31, 31, 19, 25
        xxlor 32+25, 0, 0
        vadduwm 8, 8, 12
        vadduwm 9, 9, 13
        vadduwm 10, 10, 14
        vadduwm 11, 11, 15
        vadduwm 24, 24, 28
        vadduwm 25, 25, 29
        vadduwm 26, 26, 30
        vadduwm 27, 27, 31
        vxor 4, 4, 8
        vxor 5, 5, 9
        vxor 6, 6, 10
        vxor 7, 7, 11
        vxor 20, 20, 24
        vxor 21, 21, 25
        vxor 22, 22, 26
        vxor 23, 23, 27

        xxlor 0, 32+25, 32+25
        xxlor 32+25, 21, 21
        vrlw 4, 4, 25                   # rotate left by 12
        vrlw 5, 5, 25
        vrlw 6, 6, 25
        vrlw 7, 7, 25
        vrlw 20, 20, 25                 # rotate left by 12
        vrlw 21, 21, 25
        vrlw 22, 22, 25
        vrlw 23, 23, 25
        xxlor 32+25, 0, 0
        vadduwm 0, 0, 4
        vadduwm 1, 1, 5
        vadduwm 2, 2, 6
        vadduwm 3, 3, 7
        vadduwm 16, 16, 20
        vadduwm 17, 17, 21
        vadduwm 18, 18, 22
        vadduwm 19, 19, 23

        xxlor 0, 32+25, 32+25
        xxlor 32+25, 22, 22
        vpermxor 12, 12, 0, 25
        vpermxor 13, 13, 1, 25
        vpermxor 14, 14, 2, 25
        vpermxor 15, 15, 3, 25
        vpermxor 28, 28, 16, 25
        vpermxor 29, 29, 17, 25
        vpermxor 30, 30, 18, 25
        vpermxor 31, 31, 19, 25
        xxlor 32+25, 0, 0
        vadduwm 8, 8, 12
        vadduwm 9, 9, 13
        vadduwm 10, 10, 14
        vadduwm 11, 11, 15
        vadduwm 24, 24, 28
        vadduwm 25, 25, 29
        vadduwm 26, 26, 30
        vadduwm 27, 27, 31
        xxlor 0, 32+28, 32+28
        xxlor 32+28, 23, 23
        vxor 4, 4, 8
        vxor 5, 5, 9
        vxor 6, 6, 10
        vxor 7, 7, 11
        vxor 20, 20, 24
        vxor 21, 21, 25
        vxor 22, 22, 26
        vxor 23, 23, 27
        vrlw 4, 4, 28                   # rotate left by 7
        vrlw 5, 5, 28
        vrlw 6, 6, 28
        vrlw 7, 7, 28
        vrlw 20, 20, 28                 # rotate left by 7
        vrlw 21, 21, 28
        vrlw 22, 22, 28
        vrlw 23, 23, 28
        xxlor 32+28, 0, 0

        # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
        xxlor 0, 32+25, 32+25
        xxlor 32+25, 20, 20
        vadduwm 0, 0, 5
        vadduwm 1, 1, 6
        vadduwm 2, 2, 7
        vadduwm 3, 3, 4
        vadduwm 16, 16, 21
        vadduwm 17, 17, 22
        vadduwm 18, 18, 23
        vadduwm 19, 19, 20

        vpermxor 15, 15, 0, 25
        vpermxor 12, 12, 1, 25
        vpermxor 13, 13, 2, 25
        vpermxor 14, 14, 3, 25
        vpermxor 31, 31, 16, 25
        vpermxor 28, 28, 17, 25
        vpermxor 29, 29, 18, 25
        vpermxor 30, 30, 19, 25

        xxlor 32+25, 0, 0
        vadduwm 10, 10, 15
        vadduwm 11, 11, 12
        vadduwm 8, 8, 13
        vadduwm 9, 9, 14
        vadduwm 26, 26, 31
        vadduwm 27, 27, 28
        vadduwm 24, 24, 29
        vadduwm 25, 25, 30
        vxor 5, 5, 10
        vxor 6, 6, 11
        vxor 7, 7, 8
        vxor 4, 4, 9
        vxor 21, 21, 26
        vxor 22, 22, 27
        vxor 23, 23, 24
        vxor 20, 20, 25

        xxlor 0, 32+25, 32+25
        xxlor 32+25, 21, 21
        vrlw 5, 5, 25
        vrlw 6, 6, 25
        vrlw 7, 7, 25
        vrlw 4, 4, 25
        vrlw 21, 21, 25
        vrlw 22, 22, 25
        vrlw 23, 23, 25
        vrlw 20, 20, 25
        xxlor 32+25, 0, 0

        vadduwm 0, 0, 5
        vadduwm 1, 1, 6
        vadduwm 2, 2, 7
        vadduwm 3, 3, 4
        vadduwm 16, 16, 21
        vadduwm 17, 17, 22
        vadduwm 18, 18, 23
        vadduwm 19, 19, 20

        xxlor 0, 32+25, 32+25
        xxlor 32+25, 22, 22
        vpermxor 15, 15, 0, 25
        vpermxor 12, 12, 1, 25
        vpermxor 13, 13, 2, 25
        vpermxor 14, 14, 3, 25
        vpermxor 31, 31, 16, 25
        vpermxor 28, 28, 17, 25
        vpermxor 29, 29, 18, 25
        vpermxor 30, 30, 19, 25
        xxlor 32+25, 0, 0

        vadduwm 10, 10, 15
        vadduwm 11, 11, 12
        vadduwm 8, 8, 13
        vadduwm 9, 9, 14
        vadduwm 26, 26, 31
        vadduwm 27, 27, 28
        vadduwm 24, 24, 29
        vadduwm 25, 25, 30

        xxlor 0, 32+28, 32+28
        xxlor 32+28, 23, 23
        vxor 5, 5, 10
        vxor 6, 6, 11
        vxor 7, 7, 8
        vxor 4, 4, 9
        vxor 21, 21, 26
        vxor 22, 22, 27
        vxor 23, 23, 24
        vxor 20, 20, 25
        vrlw 5, 5, 28
        vrlw 6, 6, 28
        vrlw 7, 7, 28
        vrlw 4, 4, 28
        vrlw 21, 21, 28
        vrlw 22, 22, 28
        vrlw 23, 23, 28
        vrlw 20, 20, 28
        xxlor 32+28, 0, 0
.endm

.macro QT_loop_4x
        # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
        vadduwm 0, 0, 4
        vadduwm 1, 1, 5
        vadduwm 2, 2, 6
        vadduwm 3, 3, 7
        vpermxor 12, 12, 0, 20
        vpermxor 13, 13, 1, 20
        vpermxor 14, 14, 2, 20
        vpermxor 15, 15, 3, 20
        vadduwm 8, 8, 12
        vadduwm 9, 9, 13
        vadduwm 10, 10, 14
        vadduwm 11, 11, 15
        vxor 4, 4, 8
        vxor 5, 5, 9
        vxor 6, 6, 10
        vxor 7, 7, 11
        vrlw 4, 4, 21
        vrlw 5, 5, 21
        vrlw 6, 6, 21
        vrlw 7, 7, 21
        vadduwm 0, 0, 4
        vadduwm 1, 1, 5
        vadduwm 2, 2, 6
        vadduwm 3, 3, 7
        vpermxor 12, 12, 0, 22
        vpermxor 13, 13, 1, 22
        vpermxor 14, 14, 2, 22
        vpermxor 15, 15, 3, 22
        vadduwm 8, 8, 12
        vadduwm 9, 9, 13
        vadduwm 10, 10, 14
        vadduwm 11, 11, 15
        vxor 4, 4, 8
        vxor 5, 5, 9
        vxor 6, 6, 10
        vxor 7, 7, 11
        vrlw 4, 4, 23
        vrlw 5, 5, 23
        vrlw 6, 6, 23
        vrlw 7, 7, 23

        # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
        vadduwm 0, 0, 5
        vadduwm 1, 1, 6
        vadduwm 2, 2, 7
        vadduwm 3, 3, 4
        vpermxor 15, 15, 0, 20
        vpermxor 12, 12, 1, 20
        vpermxor 13, 13, 2, 20
        vpermxor 14, 14, 3, 20
        vadduwm 10, 10, 15
        vadduwm 11, 11, 12
        vadduwm 8, 8, 13
        vadduwm 9, 9, 14
        vxor 5, 5, 10
        vxor 6, 6, 11
        vxor 7, 7, 8
        vxor 4, 4, 9
        vrlw 5, 5, 21
        vrlw 6, 6, 21
        vrlw 7, 7, 21
        vrlw 4, 4, 21
        vadduwm 0, 0, 5
        vadduwm 1, 1, 6
        vadduwm 2, 2, 7
        vadduwm 3, 3, 4
        vpermxor 15, 15, 0, 22
        vpermxor 12, 12, 1, 22
        vpermxor 13, 13, 2, 22
        vpermxor 14, 14, 3, 22
        vadduwm 10, 10, 15
        vadduwm 11, 11, 12
        vadduwm 8, 8, 13
        vadduwm 9, 9, 14
        vxor 5, 5, 10
        vxor 6, 6, 11
        vxor 7, 7, 8
        vxor 4, 4, 9
        vrlw 5, 5, 23
        vrlw 6, 6, 23
        vrlw 7, 7, 23
        vrlw 4, 4, 23
.endm

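#
# TP_4x transposes a 4x4 block of 32-bit words: on entry each of the four
# vector registers a0-a3 holds one state word of four different blocks
# (one block per lane); on exit each register holds four consecutive state
# words of a single block, the layout needed to xor against the input and
# store the key stream.  vs10-vs13 are used as scratch.
#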
# Transpose
.macro TP_4x a0 a1 a2 a3
        xxmrghw 10, 32+\a0, 32+\a1      # a0, a1, b0, b1
        xxmrghw 11, 32+\a2, 32+\a3      # a2, a3, b2, b3
        xxmrglw 12, 32+\a0, 32+\a1      # c0, c1, d0, d1
        xxmrglw 13, 32+\a2, 32+\a3      # c2, c3, d2, d3
        xxpermdi 32+\a0, 10, 11, 0      # a0, a1, a2, a3
        xxpermdi 32+\a1, 10, 11, 3      # b0, b1, b2, b3
        xxpermdi 32+\a2, 12, 13, 0      # c0, c1, c2, c3
        xxpermdi 32+\a3, 12, 13, 3      # d0, d1, d2, d3
.endm

# key stream = working state + state
.macro Add_state S
        vadduwm \S+0, \S+0, 16-\S
        vadduwm \S+4, \S+4, 17-\S
        vadduwm \S+8, \S+8, 18-\S
        vadduwm \S+12, \S+12, 19-\S

        vadduwm \S+1, \S+1, 16-\S
        vadduwm \S+5, \S+5, 17-\S
        vadduwm \S+9, \S+9, 18-\S
        vadduwm \S+13, \S+13, 19-\S

        vadduwm \S+2, \S+2, 16-\S
        vadduwm \S+6, \S+6, 17-\S
        vadduwm \S+10, \S+10, 18-\S
        vadduwm \S+14, \S+14, 19-\S

        vadduwm \S+3, \S+3, 16-\S
        vadduwm \S+7, \S+7, 17-\S
        vadduwm \S+11, \S+11, 18-\S
        vadduwm \S+15, \S+15, 19-\S
.endm

#
# write 256 bytes
#
.macro Write_256 S
        add 9, 14, 5
        add 16, 14, 4
        lxvw4x 0, 0, 9
        lxvw4x 1, 17, 9
        lxvw4x 2, 18, 9
        lxvw4x 3, 19, 9
        lxvw4x 4, 20, 9
        lxvw4x 5, 21, 9
        lxvw4x 6, 22, 9
        lxvw4x 7, 23, 9
        lxvw4x 8, 24, 9
        lxvw4x 9, 25, 9
        lxvw4x 10, 26, 9
        lxvw4x 11, 27, 9
        lxvw4x 12, 28, 9
        lxvw4x 13, 29, 9
        lxvw4x 14, 30, 9
        lxvw4x 15, 31, 9

        xxlxor \S+32, \S+32, 0
        xxlxor \S+36, \S+36, 1
        xxlxor \S+40, \S+40, 2
        xxlxor \S+44, \S+44, 3
        xxlxor \S+33, \S+33, 4
        xxlxor \S+37, \S+37, 5
        xxlxor \S+41, \S+41, 6
        xxlxor \S+45, \S+45, 7
        xxlxor \S+34, \S+34, 8
        xxlxor \S+38, \S+38, 9
        xxlxor \S+42, \S+42, 10
        xxlxor \S+46, \S+46, 11
        xxlxor \S+35, \S+35, 12
        xxlxor \S+39, \S+39, 13
        xxlxor \S+43, \S+43, 14
        xxlxor \S+47, \S+47, 15

        stxvw4x \S+32, 0, 16
        stxvw4x \S+36, 17, 16
        stxvw4x \S+40, 18, 16
        stxvw4x \S+44, 19, 16

        stxvw4x \S+33, 20, 16
        stxvw4x \S+37, 21, 16
        stxvw4x \S+41, 22, 16
        stxvw4x \S+45, 23, 16

        stxvw4x \S+34, 24, 16
        stxvw4x \S+38, 25, 16
        stxvw4x \S+42, 26, 16
        stxvw4x \S+46, 27, 16

        stxvw4x \S+35, 28, 16
        stxvw4x \S+39, 29, 16
        stxvw4x \S+43, 30, 16
        stxvw4x \S+47, 31, 16

.endm

#
# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src,
#                      unsigned int len, int nrounds);
#
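#
# Register usage in the function body, as set up below: r3 = state,
# r4 = dst, r5 = src, r6 = len, r7 = nrounds; r14 is the running byte
# offset into src/dst, r15 the remaining length, r17-r31 hold the
# constants 16, 32, ..., 240 used as indexed offsets by Write_256, and
# CTR counts nrounds/2 double rounds.  While at least 512 bytes remain,
# Loop_8x produces 512 bytes of key stream per pass (eight blocks); the
# rest is handled 256 bytes at a time by Loop_4x, and any remainder
# shorter than 256 bytes is left untouched for the caller.
#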
SYM_FUNC_START(chacha_p10le_8x)
.align 5
        cmpdi 6, 0
        ble Out_no_chacha

        SAVE_REGS

        # r17 - r31 mainly for Write_256 macro.
        li 17, 16
        li 18, 32
        li 19, 48
        li 20, 64
        li 21, 80
        li 22, 96
        li 23, 112
        li 24, 128
        li 25, 144
        li 26, 160
        li 27, 176
        li 28, 192
        li 29, 208
        li 30, 224
        li 31, 240

        mr 15, 6                        # len
        li 14, 0                        # offset to inp and outp

        lxvw4x 48, 0, 3                 # vr16, constants
        lxvw4x 49, 17, 3                # vr17, key 1
        lxvw4x 50, 18, 3                # vr18, key 2
        lxvw4x 51, 19, 3                # vr19, counter, nonce

        # create (0, 1, 2, 3) counters
        vspltisw 0, 0
        vspltisw 1, 1
        vspltisw 2, 2
        vspltisw 3, 3
        vmrghw 4, 0, 1
        vmrglw 5, 2, 3
        vsldoi 30, 4, 5, 8              # vr30 counter, 4 (0, 1, 2, 3)

        vspltisw 21, 12
        vspltisw 23, 7

        addis 11, 2, permx@toc@ha
        addi 11, 11, permx@toc@l
        lxvw4x 32+20, 0, 11
        lxvw4x 32+22, 17, 11

        sradi 8, 7, 1

        mtctr 8

        # save constants to vsx
        xxlor 16, 48, 48
        xxlor 17, 49, 49
        xxlor 18, 50, 50
        xxlor 19, 51, 51

        vspltisw 25, 4
        vspltisw 26, 8

        xxlor 25, 32+26, 32+26
        xxlor 24, 32+25, 32+25

        vadduwm 31, 30, 25              # counter = (0, 1, 2, 3) + (4, 4, 4, 4)
        xxlor 30, 32+30, 32+30
        xxlor 31, 32+31, 32+31

        xxlor 20, 32+20, 32+20
        xxlor 21, 32+21, 32+21
        xxlor 22, 32+22, 32+22
        xxlor 23, 32+23, 32+23

        cmpdi 6, 512
        blt Loop_last

Loop_8x:
        xxspltw 32+0, 16, 0
        xxspltw 32+1, 16, 1
        xxspltw 32+2, 16, 2
        xxspltw 32+3, 16, 3

        xxspltw 32+4, 17, 0
        xxspltw 32+5, 17, 1
        xxspltw 32+6, 17, 2
        xxspltw 32+7, 17, 3
        xxspltw 32+8, 18, 0
        xxspltw 32+9, 18, 1
        xxspltw 32+10, 18, 2
        xxspltw 32+11, 18, 3
        xxspltw 32+12, 19, 0
        xxspltw 32+13, 19, 1
        xxspltw 32+14, 19, 2
        xxspltw 32+15, 19, 3
        vadduwm 12, 12, 30              # increase counter

        xxspltw 32+16, 16, 0
        xxspltw 32+17, 16, 1
        xxspltw 32+18, 16, 2
        xxspltw 32+19, 16, 3

        xxspltw 32+20, 17, 0
        xxspltw 32+21, 17, 1
        xxspltw 32+22, 17, 2
        xxspltw 32+23, 17, 3
        xxspltw 32+24, 18, 0
        xxspltw 32+25, 18, 1
        xxspltw 32+26, 18, 2
        xxspltw 32+27, 18, 3
        xxspltw 32+28, 19, 0
        xxspltw 32+29, 19, 1
        vadduwm 28, 28, 31              # increase counter
        xxspltw 32+30, 19, 2
        xxspltw 32+31, 19, 3

.align 5
quarter_loop_8x:
        QT_loop_8x

        bdnz quarter_loop_8x

        xxlor 0, 32+30, 32+30
        xxlor 32+30, 30, 30
        vadduwm 12, 12, 30
        xxlor 32+30, 0, 0
        TP_4x 0, 1, 2, 3
        TP_4x 4, 5, 6, 7
        TP_4x 8, 9, 10, 11
        TP_4x 12, 13, 14, 15

        xxlor 0, 48, 48
        xxlor 1, 49, 49
        xxlor 2, 50, 50
        xxlor 3, 51, 51
        xxlor 48, 16, 16
        xxlor 49, 17, 17
        xxlor 50, 18, 18
        xxlor 51, 19, 19
        Add_state 0
        xxlor 48, 0, 0
        xxlor 49, 1, 1
        xxlor 50, 2, 2
        xxlor 51, 3, 3
        Write_256 0
        addi 14, 14, 256                # offset += 256
        addi 15, 15, -256               # len -= 256

        xxlor 5, 32+31, 32+31
        xxlor 32+31, 31, 31
        vadduwm 28, 28, 31
        xxlor 32+31, 5, 5
        TP_4x 16+0, 16+1, 16+2, 16+3
        TP_4x 16+4, 16+5, 16+6, 16+7
        TP_4x 16+8, 16+9, 16+10, 16+11
        TP_4x 16+12, 16+13, 16+14, 16+15

        xxlor 32, 16, 16
        xxlor 33, 17, 17
        xxlor 34, 18, 18
        xxlor 35, 19, 19
        Add_state 16
        Write_256 16
        addi 14, 14, 256                # offset += 256
        addi 15, 15, -256               # len -= 256

        xxlor 32+24, 24, 24
        xxlor 32+25, 25, 25
        xxlor 32+30, 30, 30
        vadduwm 30, 30, 25
        vadduwm 31, 30, 24
        xxlor 30, 32+30, 32+30
        xxlor 31, 32+31, 32+31

        cmpdi 15, 0
        beq Out_loop

        cmpdi 15, 512
        blt Loop_last

        mtctr 8
        b Loop_8x

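#
# Fewer than 512 bytes remain: reload the state and constants and fall
# back to the four-block path, which emits 256 bytes of key stream per
# iteration until less than 256 bytes are left.
#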
Loop_last:
        lxvw4x 48, 0, 3                 # vr16, constants
        lxvw4x 49, 17, 3                # vr17, key 1
        lxvw4x 50, 18, 3                # vr18, key 2
        lxvw4x 51, 19, 3                # vr19, counter, nonce

        vspltisw 21, 12
        vspltisw 23, 7
        addis 11, 2, permx@toc@ha
        addi 11, 11, permx@toc@l
        lxvw4x 32+20, 0, 11
        lxvw4x 32+22, 17, 11

        sradi 8, 7, 1
        mtctr 8

Loop_4x:
        vspltw 0, 16, 0
        vspltw 1, 16, 1
        vspltw 2, 16, 2
        vspltw 3, 16, 3

        vspltw 4, 17, 0
        vspltw 5, 17, 1
        vspltw 6, 17, 2
        vspltw 7, 17, 3
        vspltw 8, 18, 0
        vspltw 9, 18, 1
        vspltw 10, 18, 2
        vspltw 11, 18, 3
        vspltw 12, 19, 0
        vadduwm 12, 12, 30              # increase counter
        vspltw 13, 19, 1
        vspltw 14, 19, 2
        vspltw 15, 19, 3

.align 5
quarter_loop:
        QT_loop_4x

        bdnz quarter_loop

        vadduwm 12, 12, 30
        TP_4x 0, 1, 2, 3
        TP_4x 4, 5, 6, 7
        TP_4x 8, 9, 10, 11
        TP_4x 12, 13, 14, 15

        Add_state 0
        Write_256 0
        addi 14, 14, 256                # offset += 256
        addi 15, 15, -256               # len -= 256

        # Update state counter
        vspltisw 25, 4
        vadduwm 30, 30, 25

        cmpdi 15, 0
        beq Out_loop
        cmpdi 15, 256
        blt Out_loop

        mtctr 8
        b Loop_4x

Out_loop:
        RESTORE_REGS
        blr

Out_no_chacha:
        li 3, 0
        blr
SYM_FUNC_END(chacha_p10le_8x)

SYM_DATA_START_LOCAL(PERMX)
.align 5
permx:
.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
SYM_DATA_END(PERMX)
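#
# Note on the permx patterns: vpermxor builds each result byte as one byte
# of its first source xor'ed with one byte of its second, selected by the
# two nibbles of the corresponding pattern byte.  With the first pattern
# the xor of two 32-bit words comes out rotated left by 16 bits, with the
# second rotated left by 8 bits, so the "d ^= a; d <<<= 16" and
# "d ^= a; d <<<= 8" quarter-round steps each take a single instruction.
#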