| @@ -1,14 +1,14 @@ | | | @@ -1,14 +1,14 @@ |
1 | /* $NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $ */ | | 1 | /* $NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $ */ |
2 | | | 2 | |
3 | /*- | | 3 | /*- |
4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. | | 4 | * Copyright (c) 2020 The NetBSD Foundation, Inc. |
5 | * All rights reserved. | | 5 | * All rights reserved. |
6 | * | | 6 | * |
7 | * Redistribution and use in source and binary forms, with or without | | 7 | * Redistribution and use in source and binary forms, with or without |
8 | * modification, are permitted provided that the following conditions | | 8 | * modification, are permitted provided that the following conditions |
9 | * are met: | | 9 | * are met: |
10 | * 1. Redistributions of source code must retain the above copyright | | 10 | * 1. Redistributions of source code must retain the above copyright |
11 | * notice, this list of conditions and the following disclaimer. | | 11 | * notice, this list of conditions and the following disclaimer. |
12 | * 2. Redistributions in binary form must reproduce the above copyright | | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
13 | * notice, this list of conditions and the following disclaimer in the | | 13 | * notice, this list of conditions and the following disclaimer in the |
14 | * documentation and/or other materials provided with the distribution. | | 14 | * documentation and/or other materials provided with the distribution. |
| @@ -18,27 +18,27 @@ | | | @@ -18,27 +18,27 @@ |
18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | | 18 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS | | 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | | 20 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | | 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | | 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | | 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | | 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | | 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
26 | * POSSIBILITY OF SUCH DAMAGE. | | 26 | * POSSIBILITY OF SUCH DAMAGE. |
27 | */ | | 27 | */ |
28 | | | 28 | |
29 | #include <machine/asm.h> | | 29 | #include <machine/asm.h> |
30 | | | 30 | |
31 | RCSID("$NetBSD: chacha_neon_32.S,v 1.1 2020/07/28 20:08:48 riastradh Exp $") | | 31 | RCSID("$NetBSD: chacha_neon_32.S,v 1.2 2020/07/29 14:23:59 riastradh Exp $") |
32 | | | 32 | |
33 | .fpu neon | | 33 | .fpu neon |
34 | | | 34 | |
35 | /* | | 35 | /* |
36 | * ChaCha round, split up so we can interleave the quarterrounds on | | 36 | * ChaCha round, split up so we can interleave the quarterrounds on |
37 | * independent rows/diagonals to maximize pipeline efficiency, with | | 37 | * independent rows/diagonals to maximize pipeline efficiency, with |
38 | * spills to deal with the scarcity of registers. Reference: | | 38 | * spills to deal with the scarcity of registers. Reference: |
39 | * | | 39 | * |
40 | * Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop | | 40 | * Daniel J. Bernstein, `ChaCha, a variant of Salsa20', Workshop |
41 | * Record of the State of the Art in Stream Ciphers -- SASC 2008. | | 41 | * Record of the State of the Art in Stream Ciphers -- SASC 2008. |
42 | * https://cr.yp.to/papers.html#chacha | | 42 | * https://cr.yp.to/papers.html#chacha |
43 | * | | 43 | * |
44 | * a += b; d ^= a; d <<<= 16; | | 44 | * a += b; d ^= a; d <<<= 16; |
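For reference, the quarterround steps listed in the comment above correspond to the following scalar C model (a minimal sketch; rol32 and chacha_quarterround are illustrative names, not identifiers from this file, and the rotation counts 16/12/8/7 are the standard ChaCha constants):

#include <stdint.h>

static inline uint32_t
rol32(uint32_t x, unsigned n)
{
	/* rotate left by n bits, for 0 < n < 32 */
	return (x << n) | (x >> (32 - n));
}

/* One quarterround: the a += b; d ^= a; d <<<= r steps from the comment. */
static inline void
chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = rol32(*d, 16);
	*c += *d; *b ^= *c; *b = rol32(*b, 12);
	*a += *b; *d ^= *a; *d = rol32(*d, 8);
	*c += *d; *b ^= *c; *b = rol32(*b, 7);
}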
| @@ -295,41 +295,49 @@ ENTRY(chacha_stream256_neon) | | | @@ -295,41 +295,49 @@ ENTRY(chacha_stream256_neon) |
295 | /* | | 295 | /* |
296 | * At this point, the blocks are: | | 296 | * At this point, the blocks are: |
297 | * | | 297 | * |
298 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) | | 298 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) |
299 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) | | 299 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) |
300 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) | | 300 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) |
301 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) | | 301 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) |
302 | * q4 = (x0[4], x0[5]; x0[6], x0[7]) | | 302 | * q4 = (x0[4], x0[5]; x0[6], x0[7]) |
303 | * q5 = (x2[4], x2[5]; x2[6], x2[7]) | | 303 | * q5 = (x2[4], x2[5]; x2[6], x2[7]) |
304 | * q6 = (x1[4], x1[5]; x1[6], x1[7]) | | 304 | * q6 = (x1[4], x1[5]; x1[6], x1[7]) |
305 | * q7 = (x3[4], x3[5]; x3[6], x3[7]) | | 305 | * q7 = (x3[4], x3[5]; x3[6], x3[7]) |
306 | * | | 306 | * |
307 | * The first two rows to write out are q0 = x0[0:4) and q4 = | | 307 | * The first two rows to write out are q0 = x0[0:4) and q4 = |
308 | * x0[4:8). If we first swap q1 and q4, then once we've | | 308 | * x0[4:8). Swapping q1<->q4, q3<->q6, q9<->q12, and q11<->q14 |
309 | * written them out we free up consecutive registers q0-q1 for | | 309 | * enables us to issue all stores in consecutive pairs: |
310 | * store-multiple. | | 310 | * x0 in q0-q1 |
| | | 311 | * x1 in q8-q9 |
| | | 312 | * x2 in q2-q3 |
| | | 313 | * x3 in q10-q11 |
| | | 314 | * x4 in q4-q5 |
| | | 315 | * x5 in q12-q13 |
| | | 316 | * x6 in q6-q7 |
| | | 317 | * x7 in q14-q15 |
311 | */ | | 318 | */ |
312 | | | 319 | |
313 | vswp q1, q4 | | 320 | vswp q1, q4 |
| | | 321 | vswp q3, q6 |
314 | | | 322 | |
315 | vadd.u32 q0, q0, q9 | | 323 | vadd.u32 q0, q0, q9 |
316 | vadd.u32 q4, q4, q9 | | 324 | vadd.u32 q4, q4, q9 |
317 | vadd.u32 q2, q2, q9 | | 325 | vadd.u32 q2, q2, q9 |
318 | vadd.u32 q3, q3, q9 | | 326 | vadd.u32 q6, q6, q9 |
319 | | | 327 | |
320 | vadd.u32 q1, q1, q8 | | 328 | vadd.u32 q1, q1, q8 |
321 | vadd.u32 q5, q5, q8 | | 329 | vadd.u32 q5, q5, q8 |
322 | vadd.u32 q6, q6, q8 | | 330 | vadd.u32 q3, q3, q8 |
323 | vadd.u32 q7, q7, q8 | | 331 | vadd.u32 q7, q7, q8 |
324 | | | 332 | |
325 | vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ | | 333 | vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ |
326 | | | 334 | |
327 | LE32TOH(q0) | | 335 | LE32TOH(q0) |
328 | LE32TOH(q1) | | 336 | LE32TOH(q1) |
329 | LE32TOH(q2) | | 337 | LE32TOH(q2) |
330 | LE32TOH(q3) | | 338 | LE32TOH(q3) |
331 | LE32TOH(q4) | | 339 | LE32TOH(q4) |
332 | LE32TOH(q5) | | 340 | LE32TOH(q5) |
333 | LE32TOH(q6) | | 341 | LE32TOH(q6) |
334 | LE32TOH(q7) | | 342 | LE32TOH(q7) |
335 | | | 343 | |
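For reference, the point of the q1<->q4, q3<->q6, q9<->q12, and q11<->q14 swaps is that afterwards each 64-byte output block sits in two consecutive q-register pairs, so every store can be a 32-byte vst1.32 of a qN-q(N+1) pair. A scalar C model of the resulting store schedule follows (store256_model and the q[16][4] array are illustrative, not part of this file; the byte layout assumes the words are already little-endian):

#include <stdint.h>
#include <string.h>

/* q[n] models NEON register qn as four 32-bit words. */
static void
store256_model(uint8_t out[256], const uint32_t q[16][4])
{
	/* q-register pairs in store order; each pair covers 32 bytes. */
	static const unsigned order[16] = {
		0, 1,  8,  9,	/* block 0, bytes [0:64) */
		2, 3, 10, 11,	/* block 1, bytes [64:128) */
		4, 5, 12, 13,	/* block 2, bytes [128:192) */
		6, 7, 14, 15,	/* block 3, bytes [192:256) */
	};
	unsigned i;

	for (i = 0; i < 16; i++)
		memcpy(out + 16*i, q[order[i]], 16);
}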
| @@ -339,67 +347,60 @@ ENTRY(chacha_stream256_neon) | | | @@ -339,67 +347,60 @@ ENTRY(chacha_stream256_neon) |
339 | vmov d2, r3, r6 | | 347 | vmov d2, r3, r6 |
340 | vmov d3, r8, r10 | | 348 | vmov d3, r8, r10 |
341 | | | 349 | |
342 | vzip.32 q8, q9 | | 350 | vzip.32 q8, q9 |
343 | vzip.32 q10, q11 | | 351 | vzip.32 q10, q11 |
344 | vzip.32 q12, q13 | | 352 | vzip.32 q12, q13 |
345 | vzip.32 q14, q15 | | 353 | vzip.32 q14, q15 |
346 | | | 354 | |
347 | vswp d17, d20 | | 355 | vswp d17, d20 |
348 | vswp d25, d28 | | 356 | vswp d25, d28 |
349 | vswp d19, d22 | | 357 | vswp d19, d22 |
350 | vswp d27, d30 | | 358 | vswp d27, d30 |
351 | | | 359 | |
| | | 360 | vswp q9, q12 |
| | | 361 | vswp q11, q14 |
| | | 362 | |
352 | vadd.u32 q8, q8, q0 | | 363 | vadd.u32 q8, q8, q0 |
353 | vadd.u32 q9, q9, q0 | | 364 | vadd.u32 q12, q12, q0 |
354 | vadd.u32 q10, q10, q0 | | 365 | vadd.u32 q10, q10, q0 |
355 | vadd.u32 q11, q11, q0 | | 366 | vadd.u32 q14, q14, q0 |
356 | | | 367 | |
357 | vadd.u32 q12, q12, q1 | | 368 | vadd.u32 q9, q9, q1 |
358 | vadd.u32 q13, q13, q1 | | 369 | vadd.u32 q13, q13, q1 |
359 | vadd.u32 q14, q14, q1 | | 370 | vadd.u32 q11, q11, q1 |
360 | vadd.u32 q15, q15, q1 | | 371 | vadd.u32 q15, q15, q1 |
361 | | | 372 | |
362 | LE32TOH(q8) | | 373 | LE32TOH(q8) |
363 | LE32TOH(q9) | | 374 | LE32TOH(q9) |
364 | LE32TOH(q10) | | 375 | LE32TOH(q10) |
365 | LE32TOH(q11) | | 376 | LE32TOH(q11) |
366 | LE32TOH(q12) | | 377 | LE32TOH(q12) |
367 | LE32TOH(q13) | | 378 | LE32TOH(q13) |
368 | LE32TOH(q14) | | 379 | LE32TOH(q14) |
369 | LE32TOH(q15) | | 380 | LE32TOH(q15) |
370 | | | 381 | |
371 | /* prepare to zero temporary space on stack */ | | 382 | /* vst1.32 {q0-q1}, [r0]! */ |
372 | vmov.i32 q0, #0 | | 383 | vst1.32 {q8-q9}, [r0]! |
373 | vmov.i32 q1, #0 | | 384 | vst1.32 {q2-q3}, [r0]! |
374 | | | 385 | vst1.32 {q10-q11}, [r0]! |
375 | /* vst1.32 {q0}, [r0]! */ | | 386 | vst1.32 {q4-q5}, [r0]! |
376 | /* vst1.32 {q1}, [r0]! */ /* (was q4 before vswp) */ | | 387 | vst1.32 {q12-q13}, [r0]! |
377 | vst1.32 {q8}, [r0]! | | 388 | vst1.32 {q6-q7}, [r0]! |
378 | vst1.32 {q12}, [r0]! | | 389 | vst1.32 {q14-q15}, [r0] |
379 | vst1.32 {q2}, [r0]! | | | |
380 | vst1.32 {q6}, [r0]! | | | |
381 | vst1.32 {q10}, [r0]! | | | |
382 | vst1.32 {q14}, [r0]! | | | |
383 | vst1.32 {q4}, [r0]! /* (was q1 before vswp) */ | | | |
384 | vst1.32 {q5}, [r0]! | | | |
385 | vst1.32 {q9}, [r0]! | | | |
386 | vst1.32 {q13}, [r0]! | | | |
387 | vst1.32 {q3}, [r0]! | | | |
388 | vst1.32 {q7}, [r0]! | | | |
389 | vst1.32 {q11}, [r0]! | | | |
390 | vst1.32 {q15}, [r0] | | | |
391 | | | 390 | |
392 | /* zero temporary space on the stack */ | | 391 | /* zero temporary space on the stack */ |
| | | 392 | vmov.i32 q0, #0 |
| | | 393 | vmov.i32 q1, #0 |
393 | vst1.8 {q0-q1}, [fp, :256] | | 394 | vst1.8 {q0-q1}, [fp, :256] |
394 | | | 395 | |
395 | /* restore callee-saves registers and stack */ | | 396 | /* restore callee-saves registers and stack */ |
396 | vpop {d8-d15} | | 397 | vpop {d8-d15} |
397 | pop {r4, r5, r6, r7, r8, r10, fp, lr} | | 398 | pop {r4, r5, r6, r7, r8, r10, fp, lr} |
398 | bx lr | | 399 | bx lr |
399 | END(chacha_stream256_neon) | | 400 | END(chacha_stream256_neon) |
400 | | | 401 | |
401 | /* | | 402 | /* |
402 | * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1, | | 403 | * chacha_stream_xor256_neon(uint8_t s[256]@r0, const uint8_t p[256]@r1, |
403 | * uint32_t blkno@r2, | | 404 | * uint32_t blkno@r2, |
404 | * const uint8_t nonce[12]@r3, | | 405 | * const uint8_t nonce[12]@r3, |
405 | * const uint8_t key[32]@sp[0], | | 406 | * const uint8_t key[32]@sp[0], |
| @@ -471,111 +472,60 @@ ENTRY(chacha_stream_xor256_neon) | | | @@ -471,111 +472,60 @@ ENTRY(chacha_stream_xor256_neon) |
471 | 2: subs ip, ip, #2 | | 472 | 2: subs ip, ip, #2 |
472 | ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \ | | 473 | ROUND q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15, \ |
473 | d16, d24,d25, d26,d27, d28,d29, d30,d31 | | 474 | d16, d24,d25, d26,d27, d28,d29, d30,d31 |
474 | ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15 | | 475 | ROUNDLD q0,q1,q2,q3, q4,q5,q6,q7, q8,q9,q10,q11, q12,q13,q14,q15 |
475 | ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \ | | 476 | ROUND q0,q1,q2,q3, q5,q6,q7,q4, q10,q11,q8,q9, q15,q12,q13,q14, \ |
476 | d20, d30,d31, d24,d25, d26,d27, d28,d29 | | 477 | d20, d30,d31, d24,d25, d26,d27, d28,d29 |
477 | bne 1b | | 478 | bne 1b |
478 | | | 479 | |
479 | /* | | 480 | /* |
480 | * q8-q9 are free / saved on the stack. Now for the real fun: | | 481 | * q8-q9 are free / saved on the stack. Now for the real fun: |
481 | * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in | | 482 | * in only 16 registers, compute p[i] ^ (y[i] + x[i]) for i in |
482 | * {0,1,2,...,15}. The twist is that the p[i] and the y[i] are | | 483 | * {0,1,2,...,15}. The twist is that the p[i] and the y[i] are |
483 | * transposed from one another, and the x[i] are in general | | 484 | * transposed from one another, and the x[i] are in general |
484 | * registers and memory. So we have: | | 485 | * registers and memory. See comments in chacha_stream256_neon |
485 | * | | 486 | * for the layout with swaps. |
486 | * q0 = (x0[0], x1[0]; x2[0], x3[0]) | | | |
487 | * q1 = (x0[1], x1[1]; x2[1], x3[1]) | | | |
488 | * q2 = (x0[2], x1[2]; x2[2], x3[2]) | | | |
489 | * q3 = (x0[3], x1[3]; x2[3], x3[3]) | | | |
490 | * ... | | | |
491 | * q15 = (x0[15], x1[15]; x2[15], x3[15]) | | | |
492 | * | | | |
493 | * where xi[j] is the jth word of the ith 16-word block. Zip | | | |
494 | * consecutive pairs with vzip.32, and you get: | | | |
495 | * | | | |
496 | * q0 = (x0[0], x0[1]; x1[0], x1[1]) | | | |
497 | * q1 = (x2[0], x2[1]; x3[0], x3[1]) | | | |
498 | * q2 = (x0[2], x0[3]; x1[2], x1[3]) | | | |
499 | * q3 = (x2[2], x2[3]; x3[2], x3[3]) | | | |
500 | * ... | | | |
501 | * q15 = (x2[14], x2[15]; x3[14], x3[15]) | | | |
502 | * | | | |
503 | * As 64-bit d registers, this is: | | | |
504 | * | | | |
505 | * d0 = (x0[0], x0[1]) d1 = (x1[0], x1[1]) | | | |
506 | * d2 = (x2[0], x2[1]) d3 = (x3[0], x3[1]) | | | |
507 | * d4 = (x0[2], x0[3]) d5 = (x1[2], x1[3]) | | | |
508 | * d6 = (x2[2], x2[3]) d7 = (x3[2], x3[3]) | | | |
509 | * ... | | | |
510 | * d30 = (x2[14], x2[15]) d31 = (x3[14], x3[15]) | | | |
511 | * | | | |
512 | * Swap d1<->d4, d3<->d6, ..., and you get: | | | |
513 | * | | | |
514 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) | | | |
515 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) | | | |
516 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) | | | |
517 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) | | | |
518 | * ... | | | |
519 | * q15 = (x3[12], x3[13]; x3[14], x3[15]) | | | |
520 | */ | | 487 | */ |
521 | | | 488 | |
522 | sub r7, r7, #0x10 | | 489 | sub r7, r7, #0x10 |
523 | vdup.32 q8, r2 /* q8 := (blkno, blkno, blkno, blkno) */ | | 490 | vdup.32 q8, r2 /* q8 := (blkno, blkno, blkno, blkno) */ |
524 | vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */ | | 491 | vld1.32 {q9}, [r7, :128] /* q9 := (0, 1, 2, 3) */ |
525 | | | 492 | |
526 | vzip.32 q0, q1 | | 493 | vzip.32 q0, q1 |
527 | vzip.32 q2, q3 | | 494 | vzip.32 q2, q3 |
528 | vzip.32 q4, q5 | | 495 | vzip.32 q4, q5 |
529 | vzip.32 q6, q7 | | 496 | vzip.32 q6, q7 |
530 | | | 497 | |
531 | vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ | | 498 | vadd.u32 q8, q8, q9 /* q8 := (blkno,blkno+1,blkno+2,blkno+3) */ |
532 | vld1.32 {q9}, [r5] /* q9 := constant */ | | 499 | vld1.32 {q9}, [r5] /* q9 := constant */ |
533 | vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ | | 500 | vadd.u32 q12, q12, q8 /* q12 += (blkno,blkno+1,blkno+2,blkno+3) */ |
534 | vld1.32 {q8}, [r4]! /* q8 := key[0:16) */ | | 501 | vld1.32 {q8}, [r4]! /* q8 := key[0:16) */ |
535 | | | 502 | |
536 | vswp d1, d4 | | | |
537 | vswp d9, d12 | | | |
538 | vswp d3, d6 | | 503 | vswp d3, d6 |
| | | 504 | vswp d9, d12 |
| | | 505 | vswp d1, d4 |
539 | vswp d11, d14 | | 506 | vswp d11, d14 |
540 | | | 507 | |
541 | /* | | | |
542 | * At this point, the blocks are: | | | |
543 | * | | | |
544 | * q0 = (x0[0], x0[1]; x0[2], x0[3]) | | | |
545 | * q1 = (x2[0], x2[1]; x2[2], x2[3]) | | | |
546 | * q2 = (x1[0], x1[1]; x1[2], x1[3]) | | | |
547 | * q3 = (x3[0], x3[1]; x3[2], x3[3]) | | | |
548 | * q4 = (x0[4], x0[5]; x0[6], x0[7]) | | | |
549 | * q5 = (x2[4], x2[5]; x2[6], x2[7]) | | | |
550 | * q6 = (x1[4], x1[5]; x1[6], x1[7]) | | | |
551 | * q7 = (x3[4], x3[5]; x3[6], x3[7]) | | | |
552 | * | | | |
553 | * The first two rows to write out are q0 = x0[0:4) and q4 = | | | |
554 | * x0[4:8). If we first swap q1 and q4, then once we've | | | |
555 | * written them out we free up consecutive registers q0-q1 for | | | |
556 | * store-multiple. | | | |
557 | */ | | | |
558 | | | | |
559 | vswp q1, q4 | | 508 | vswp q1, q4 |
| | | 509 | vswp q3, q6 |
560 | | | 510 | |
561 | vadd.u32 q0, q0, q9 | | 511 | vadd.u32 q0, q0, q9 |
562 | vadd.u32 q4, q4, q9 | | 512 | vadd.u32 q4, q4, q9 |
563 | vadd.u32 q2, q2, q9 | | 513 | vadd.u32 q2, q2, q9 |
564 | vadd.u32 q3, q3, q9 | | 514 | vadd.u32 q6, q6, q9 |
565 | | | 515 | |
566 | vadd.u32 q1, q1, q8 | | 516 | vadd.u32 q1, q1, q8 |
567 | vadd.u32 q5, q5, q8 | | 517 | vadd.u32 q5, q5, q8 |
568 | vadd.u32 q6, q6, q8 | | 518 | vadd.u32 q3, q3, q8 |
569 | vadd.u32 q7, q7, q8 | | 519 | vadd.u32 q7, q7, q8 |
570 | | | 520 | |
571 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */ | | 521 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [0:32) */ |
572 | | | 522 | |
573 | LE32TOH(q0) | | 523 | LE32TOH(q0) |
574 | LE32TOH(q1) | | 524 | LE32TOH(q1) |
575 | LE32TOH(q2) | | 525 | LE32TOH(q2) |
576 | LE32TOH(q6) | | 526 | LE32TOH(q6) |
577 | LE32TOH(q4) | | 527 | LE32TOH(q4) |
578 | LE32TOH(q5) | | 528 | LE32TOH(q5) |
579 | LE32TOH(q3) | | 529 | LE32TOH(q3) |
580 | LE32TOH(q7) | | 530 | LE32TOH(q7) |
581 | | | 531 | |
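For reference, the vzip.32/vswp transpose that the deleted comment walks through can be modelled in scalar C as below (struct q128, vzip32_model, vswp64_model, and transpose4_model are illustrative names, not identifiers from this file; the real code applies this pattern to q0-q7 and q8-q15 in two batches):

#include <stdint.h>

struct q128 { uint32_t w[4]; };	/* models one NEON q register */

/* vzip.32 qa, qb: a := (a0, b0, a1, b1), b := (a2, b2, a3, b3) */
static void
vzip32_model(struct q128 *a, struct q128 *b)
{
	struct q128 ta = *a, tb = *b;

	a->w[0] = ta.w[0]; a->w[1] = tb.w[0];
	a->w[2] = ta.w[1]; a->w[3] = tb.w[1];
	b->w[0] = ta.w[2]; b->w[1] = tb.w[2];
	b->w[2] = ta.w[3]; b->w[3] = tb.w[3];
}

/* vswp dA, dB: exchange two 64-bit halves (two 32-bit words each). */
static void
vswp64_model(uint32_t *x, uint32_t *y)
{
	uint32_t t;
	unsigned i;

	for (i = 0; i < 2; i++) {
		t = x[i]; x[i] = y[i]; y[i] = t;
	}
}

/*
 * On entry q[j].w[i] is word j of block i; on return q[0] holds block
 * 0 words 0-3, q[1] block 2, q[2] block 1, q[3] block 3, matching the
 * layout described in the comments (blocks 1 and 2 come out swapped
 * relative to register order, which the store schedule accounts for).
 */
static void
transpose4_model(struct q128 q[4])
{
	vzip32_model(&q[0], &q[1]);		/* vzip.32 q0, q1 */
	vzip32_model(&q[2], &q[3]);		/* vzip.32 q2, q3 */
	vswp64_model(&q[0].w[2], &q[2].w[0]);	/* vswp d1, d4 */
	vswp64_model(&q[1].w[2], &q[3].w[0]);	/* vswp d3, d6 */
}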
| @@ -585,96 +535,95 @@ ENTRY(chacha_stream_xor256_neon) | | | @@ -585,96 +535,95 @@ ENTRY(chacha_stream_xor256_neon) |
585 | vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ | | 535 | vld1.32 {q8-q9}, [fp, :256] /* restore q8-q9 */ |
586 | | | 536 | |
587 | vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */ | | 537 | vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [0:32) */ |
588 | vld1.32 {q0}, [r4] /* q0 := key[16:32) */ | | 538 | vld1.32 {q0}, [r4] /* q0 := key[16:32) */ |
589 | mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ | | 539 | mov r3, #0 /* q1 = (0, nonce[0:4), ..., nonce[8:12)) */ |
590 | vmov d2, r3, r6 | | 540 | vmov d2, r3, r6 |
591 | vmov d3, r8, r10 | | 541 | vmov d3, r8, r10 |
592 | | | 542 | |
593 | vzip.32 q8, q9 | | 543 | vzip.32 q8, q9 |
594 | vzip.32 q10, q11 | | 544 | vzip.32 q10, q11 |
595 | vzip.32 q12, q13 | | 545 | vzip.32 q12, q13 |
596 | vzip.32 q14, q15 | | 546 | vzip.32 q14, q15 |
597 | | | 547 | |
598 | vswp d17, d20 | | | |
599 | vswp d25, d28 | | | |
600 | vswp d19, d22 | | 548 | vswp d19, d22 |
| | | 549 | vswp d25, d28 |
| | | 550 | vswp d17, d20 |
601 | vswp d27, d30 | | 551 | vswp d27, d30 |
602 | | | 552 | |
603 | vswp q9, q12 /* free up q9 earlier for consecutive q8-q9 */ | | 553 | vswp q9, q12 /* free up q9 earlier for consecutive q8-q9 */ |
| | | 554 | vswp q11, q14 |
604 | | | 555 | |
605 | vadd.u32 q8, q8, q0 | | 556 | vadd.u32 q8, q8, q0 |
606 | vadd.u32 q12, q12, q0 | | 557 | vadd.u32 q12, q12, q0 |
607 | vadd.u32 q10, q10, q0 | | 558 | vadd.u32 q10, q10, q0 |
608 | vadd.u32 q11, q11, q0 | | 559 | vadd.u32 q14, q14, q0 |
609 | | | 560 | |
610 | vadd.u32 q9, q9, q1 | | 561 | vadd.u32 q9, q9, q1 |
611 | vadd.u32 q13, q13, q1 | | 562 | vadd.u32 q13, q13, q1 |
612 | vadd.u32 q14, q14, q1 | | 563 | vadd.u32 q11, q11, q1 |
613 | vadd.u32 q15, q15, q1 | | 564 | vadd.u32 q15, q15, q1 |
614 | | | 565 | |
615 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */ | | 566 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [32:64) */ |
616 | | | 567 | |
617 | LE32TOH(q8) | | 568 | LE32TOH(q8) |
618 | LE32TOH(q9) | | 569 | LE32TOH(q9) |
619 | LE32TOH(q10) | | 570 | LE32TOH(q10) |
620 | LE32TOH(q14) | | 571 | LE32TOH(q11) |
621 | LE32TOH(q12) | | 572 | LE32TOH(q12) |
622 | LE32TOH(q13) | | 573 | LE32TOH(q13) |
623 | LE32TOH(q11) | | 574 | LE32TOH(q14) |
624 | LE32TOH(q15) | | 575 | LE32TOH(q15) |
625 | | | 576 | |
626 | veor q0, q0, q8 /* compute ciphertext bytes [32:64) */ | | 577 | veor q0, q0, q8 /* compute ciphertext bytes [32:64) */ |
627 | veor q1, q1, q9 | | 578 | veor q1, q1, q9 |
628 | | | 579 | |
629 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [64:96) */ | | 580 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [64:96) */ |
630 | vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */ | | 581 | vst1.32 {q0-q1}, [r0]! /* store ciphertext bytes [32:64) */ |
631 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */ | | 582 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [96:128) */ |
632 | | | 583 | |
633 | veor q2, q2, q8 /* compute ciphertext bytes [64:96) */ | | 584 | veor q2, q2, q8 /* compute ciphertext bytes [64:96) */ |
634 | veor q6, q6, q9 | | 585 | veor q3, q3, q9 |
635 | | | 586 | |
636 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */ | | 587 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [128:160) */ |
637 | vst1.32 {q2}, [r0]! /* store ciphertext bytes [64:80) */ | | 588 | vst1.32 {q2-q3}, [r0]! /* store ciphertext bytes [64:96) */ |
638 | | | 589 | |
639 | veor q10, q10, q0 /* compute ciphertext bytes [96:128) */ | | 590 | veor q10, q10, q0 /* compute ciphertext bytes [96:128) */ |
640 | veor q14, q14, q1 | | 591 | veor q11, q11, q1 |
641 | | | 592 | |
642 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */ | | 593 | vld1.32 {q0-q1}, [r1]! /* load plaintext bytes [160:192) */ |
643 | vst1.32 {q6}, [r0]! /* store ciphertext bytes [80:96) */ | | 594 | vst1.32 {q10-q11}, [r0]! /* store ciphertext bytes [96:128) */ |
644 | | | 595 | |
645 | veor q4, q4, q8 /* compute ciphertext bytes [128:160) */ | | 596 | veor q4, q4, q8 /* compute ciphertext bytes [128:160) */ |
646 | veor q5, q5, q9 | | 597 | veor q5, q5, q9 |
647 | | | 598 | |
648 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */ | | 599 | vld1.32 {q8-q9}, [r1]! /* load plaintext bytes [192:224) */ |
649 | vst1.32 {q10}, [r0]! /* store ciphertext bytes [96:112) */ | | 600 | vst1.32 {q4-q5}, [r0]! /* store ciphertext bytes [128:160) */ |
650 | | | 601 | |
651 | veor q12, q12, q0 /* compute ciphertext bytes [160:192) */ | | 602 | veor q12, q12, q0 /* compute ciphertext bytes [160:192) */ |
652 | veor q13, q13, q1 | | 603 | veor q13, q13, q1 |
653 | | | 604 | |
654 | vld1.32 {q0-q1}, [r1] /* load plaintext bytes [224:256) */ | | 605 | vld1.32 {q0-q1}, [r1] /* load plaintext bytes [224:256) */ |
655 | vst1.32 {q14}, [r0]! /* store ciphertext bytes [112:128) */ | | 606 | vst1.32 {q12-q13}, [r0]! /* store ciphertext bytes [160:192) */ |
656 | | | 607 | |
657 | veor q8, q3, q8 /* compute ciphertext bytes [192:224) */ | | 608 | veor q6, q6, q8 /* compute ciphertext bytes [192:224) */ |
658 | veor q9, q7, q9 | | 609 | veor q7, q7, q9 |
659 | | | 610 | |
660 | vst1.32 {q4-q5}, [r0]! /* store ciphertext bytes [128:160) */ | | 611 | vst1.32 {q6-q7}, [r0]! /* store ciphertext bytes [192:224) */ |
661 | vst1.32 {q12-q13}, [r0]! /* store ciphertext bytes [160:192) */ | | | |
662 | | | 612 | |
663 | veor q0, q11, q0 /* compute ciphertext bytes [224:256) */ | | 613 | veor q14, q14, q0 /* compute ciphertext bytes [224:256) */ |
664 | veor q1, q15, q1 | | 614 | veor q15, q15, q1 |
665 | | | 615 | |
666 | vst1.32 {q8-q9}, [r0]! /* store ciphertext bytes [192:224) */ | | 616 | vst1.32 {q14-q15}, [r0] /* store ciphertext bytes [224:256) */ |
667 | vst1.32 {q0-q1}, [r0] /* store ciphertext bytes [224:256) */ | | | |
668 | | | 617 | |
669 | /* zero temporary space on the stack */ | | 618 | /* zero temporary space on the stack */ |
670 | vmov.i32 q0, #0 | | 619 | vmov.i32 q0, #0 |
671 | vmov.i32 q1, #0 | | 620 | vmov.i32 q1, #0 |
672 | vst1.8 {q0-q1}, [fp, :256] | | 621 | vst1.8 {q0-q1}, [fp, :256] |
673 | | | 622 | |
674 | /* restore callee-saves registers and stack */ | | 623 | /* restore callee-saves registers and stack */ |
675 | vpop {d8-d15} | | 624 | vpop {d8-d15} |
676 | pop {r4, r5, r6, r7, r8, r10, fp, lr} | | 625 | pop {r4, r5, r6, r7, r8, r10, fp, lr} |
677 | bx lr | | 626 | bx lr |
678 | END(chacha_stream_xor256_neon) | | 627 | END(chacha_stream_xor256_neon) |
679 | | | 628 | |
680 | .section .rodata | | 629 | .section .rodata |
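For reference, the net effect of the interleaved load/xor/store schedule in chacha_stream_xor256_neon is the usual stream-cipher xor of the plaintext with four consecutive 64-byte keystream blocks. A scalar C model (stream_xor256_model and its keystream argument are illustrative, standing in for the vectorized block computation above):

#include <stdint.h>
#include <stddef.h>

static void
stream_xor256_model(uint8_t s[256], const uint8_t p[256],
    const uint8_t keystream[256])
{
	size_t i;

	/* s[i] = p[i] ^ keystream[i] over all 256 bytes */
	for (i = 0; i < 256; i++)
		s[i] = p[i] ^ keystream[i];
}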