// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl _bn_mul_mont
.private_extern _bn_mul_mont

.align 5
_bn_mul_mont:
 AARCH64_SIGN_LINK_REGISTER
 tst x5,#7
 b.eq __bn_sqr8x_mont
 tst x5,#3
 b.eq __bn_mul4x_mont
Lmul_mont:
 stp x29,x30,[sp,#-64]!
 add x29,sp,#0
 stp x19,x20,[sp,#16]
 stp x21,x22,[sp,#32]
 stp x23,x24,[sp,#48]

 ldr x9,[x2],#8 // bp[0]
 sub x22,sp,x5,lsl#3
 ldp x7,x8,[x1],#16 // ap[0..1]
 lsl x5,x5,#3
 ldr x4,[x4] // *n0
 and x22,x22,#-16 // ABI says so
 ldp x13,x14,[x3],#16 // np[0..1]

 mul x6,x7,x9 // ap[0]*bp[0]
 sub x21,x5,#16 // j=num-2
 umulh x7,x7,x9
 mul x10,x8,x9 // ap[1]*bp[0]
 umulh x11,x8,x9

 mul x15,x6,x4 // "tp[0]"*n0
 mov sp,x22 // alloca

 // (*) mul x12,x13,x15 // np[0]*m1
 umulh x13,x13,x15
 mul x16,x14,x15 // np[1]*m1
 // (*) adds x12,x12,x6 // discarded
 // (*) On the removal of the first multiplication and addition
 // instructions: the outcome of the first addition is
 // guaranteed to be zero, which leaves two computationally
 // significant outcomes: it either carries or it doesn't. The
 // question is when it carries, and whether there is an
 // alternative way to deduce that. If you follow the operations,
 // you can see that the condition for carry is quite simple:
 // x6 being non-zero. So the carry can be calculated
 // by adding -1 to x6, which is what the next instruction does.
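 // In equations, with B = 2^64: n0 = -n[0]^-1 mod B and
 // m1 = tp[0]*n0 mod B, so np[0]*m1 + tp[0] == 0 (mod B). The low
 // limb of that sum is therefore always zero, and the 64-bit
 // addition carries exactly when tp[0] (x6) is non-zero;
 // "subs xzr,x6,#1" sets the carry flag iff x6 >= 1, reproducing
 // that carry without the discarded mul/adds pair.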
 subs xzr,x6,#1 // (*)
 umulh x17,x14,x15
 adc x13,x13,xzr
 cbz x21,L1st_skip

L1st:
 ldr x8,[x1],#8
 adds x6,x10,x7
 sub x21,x21,#8 // j--
 adc x7,x11,xzr

 ldr x14,[x3],#8
 adds x12,x16,x13
 mul x10,x8,x9 // ap[j]*bp[0]
 adc x13,x17,xzr
 umulh x11,x8,x9

 adds x12,x12,x6
 mul x16,x14,x15 // np[j]*m1
 adc x13,x13,xzr
 umulh x17,x14,x15
 str x12,[x22],#8 // tp[j-1]
 cbnz x21,L1st

L1st_skip:
 adds x6,x10,x7
 sub x1,x1,x5 // rewind x1
 adc x7,x11,xzr

 adds x12,x16,x13
 sub x3,x3,x5 // rewind x3
 adc x13,x17,xzr

 adds x12,x12,x6
 sub x20,x5,#8 // i=num-1
 adcs x13,x13,x7

 adc x19,xzr,xzr // upmost overflow bit
 stp x12,x13,[x22]

Louter:
 ldr x9,[x2],#8 // bp[i]
 ldp x7,x8,[x1],#16
 ldr x23,[sp] // tp[0]
 add x22,sp,#8

 mul x6,x7,x9 // ap[0]*bp[i]
 sub x21,x5,#16 // j=num-2
 umulh x7,x7,x9
 ldp x13,x14,[x3],#16
 mul x10,x8,x9 // ap[1]*bp[i]
 adds x6,x6,x23
 umulh x11,x8,x9
 adc x7,x7,xzr

 mul x15,x6,x4
 sub x20,x20,#8 // i--

 // (*) mul x12,x13,x15 // np[0]*m1
 umulh x13,x13,x15
 mul x16,x14,x15 // np[1]*m1
 // (*) adds x12,x12,x6
 subs xzr,x6,#1 // (*)
 umulh x17,x14,x15
 cbz x21,Linner_skip

Linner:
 ldr x8,[x1],#8
 adc x13,x13,xzr
 ldr x23,[x22],#8 // tp[j]
 adds x6,x10,x7
 sub x21,x21,#8 // j--
 adc x7,x11,xzr

 adds x12,x16,x13
 ldr x14,[x3],#8
 adc x13,x17,xzr

 mul x10,x8,x9 // ap[j]*bp[i]
 adds x6,x6,x23
 umulh x11,x8,x9
 adc x7,x7,xzr

 mul x16,x14,x15 // np[j]*m1
 adds x12,x12,x6
 umulh x17,x14,x15
 str x12,[x22,#-16] // tp[j-1]
 cbnz x21,Linner

Linner_skip:
 ldr x23,[x22],#8 // tp[j]
 adc x13,x13,xzr
 adds x6,x10,x7
 sub x1,x1,x5 // rewind x1
 adc x7,x11,xzr

 adds x12,x16,x13
 sub x3,x3,x5 // rewind x3
 adcs x13,x17,x19
 adc x19,xzr,xzr

 adds x6,x6,x23
 adc x7,x7,xzr

 adds x12,x12,x6
 adcs x13,x13,x7
 adc x19,x19,xzr // upmost overflow bit
 stp x12,x13,[x22,#-16]

 cbnz x20,Louter

 // Final step. We check whether the result is larger than the
 // modulus, and if it is, subtract the modulus. But comparison
 // implies subtraction, so we subtract the modulus, check whether
 // the subtraction borrowed, and conditionally copy the original value.
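 // Concretely: after the Lsub chain computes tp - np, the carry
 // flag is clear ("lo") iff the subtraction borrowed, i.e. iff
 // tp < np once the upmost overflow bit in x19 is folded in via
 // "sbcs x19,x19,xzr". Lcond_copy then selects tp itself in that
 // case and tp - np otherwise; csel reads both values, keeping
 // the copy branch-free.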
 ldr x23,[sp] // tp[0]
 add x22,sp,#8
 ldr x14,[x3],#8 // np[0]
 subs x21,x5,#8 // j=num-1 and clear borrow
 mov x1,x0
Lsub:
 sbcs x8,x23,x14 // tp[j]-np[j]
 ldr x23,[x22],#8
 sub x21,x21,#8 // j--
 ldr x14,[x3],#8
 str x8,[x1],#8 // rp[j]=tp[j]-np[j]
 cbnz x21,Lsub

 sbcs x8,x23,x14
 sbcs x19,x19,xzr // did it borrow?
 str x8,[x1],#8 // rp[num-1]

 ldr x23,[sp] // tp[0]
 add x22,sp,#8
 ldr x8,[x0],#8 // rp[0]
 sub x5,x5,#8 // num--
 nop
Lcond_copy:
 sub x5,x5,#8 // num--
 csel x14,x23,x8,lo // did it borrow?
 ldr x23,[x22],#8
 ldr x8,[x0],#8
 str xzr,[x22,#-16] // wipe tp
 str x14,[x0,#-16]
 cbnz x5,Lcond_copy

 csel x14,x23,x8,lo
 str xzr,[x22,#-8] // wipe tp
 str x14,[x0,#-8]

 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
 mov x0,#1
 ldp x23,x24,[x29,#48]
 ldr x29,[sp],#64
 AARCH64_VALIDATE_LINK_REGISTER
 ret


.align 5
__bn_sqr8x_mont:
 // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
 // only from bn_mul_mont which has already signed the return address.
 cmp x1,x2
 b.ne __bn_mul4x_mont
Lsqr8x_mont:
 stp x29,x30,[sp,#-128]!
 add x29,sp,#0
 stp x19,x20,[sp,#16]
 stp x21,x22,[sp,#32]
 stp x23,x24,[sp,#48]
 stp x25,x26,[sp,#64]
 stp x27,x28,[sp,#80]
 stp x0,x3,[sp,#96] // offload rp and np

 ldp x6,x7,[x1,#8*0]
 ldp x8,x9,[x1,#8*2]
 ldp x10,x11,[x1,#8*4]
 ldp x12,x13,[x1,#8*6]

 sub x2,sp,x5,lsl#4
 lsl x5,x5,#3
 ldr x4,[x4] // *n0
 mov sp,x2 // alloca
 sub x27,x5,#8*8
 b Lsqr8x_zero_start

Lsqr8x_zero:
 sub x27,x27,#8*8
 stp xzr,xzr,[x2,#8*0]
 stp xzr,xzr,[x2,#8*2]
 stp xzr,xzr,[x2,#8*4]
 stp xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
 stp xzr,xzr,[x2,#8*8]
 stp xzr,xzr,[x2,#8*10]
 stp xzr,xzr,[x2,#8*12]
 stp xzr,xzr,[x2,#8*14]
 add x2,x2,#8*16
 cbnz x27,Lsqr8x_zero

 add x3,x1,x5
 add x1,x1,#8*8
 mov x19,xzr
 mov x20,xzr
 mov x21,xzr
 mov x22,xzr
 mov x23,xzr
 mov x24,xzr
 mov x25,xzr
 mov x26,xzr
 mov x2,sp
 str x4,[x29,#112] // offload n0

 // Multiply everything but a[i]*a[i]
.align 4
Lsqr8x_outer_loop:
 // a[1]a[0] (i)
 // a[2]a[0]
 // a[3]a[0]
 // a[4]a[0]
 // a[5]a[0]
 // a[6]a[0]
 // a[7]a[0]
 // a[2]a[1] (ii)
 // a[3]a[1]
 // a[4]a[1]
 // a[5]a[1]
 // a[6]a[1]
 // a[7]a[1]
 // a[3]a[2] (iii)
 // a[4]a[2]
 // a[5]a[2]
 // a[6]a[2]
 // a[7]a[2]
 // a[4]a[3] (iv)
 // a[5]a[3]
 // a[6]a[3]
 // a[7]a[3]
 // a[5]a[4] (v)
 // a[6]a[4]
 // a[7]a[4]
 // a[6]a[5] (vi)
 // a[7]a[5]
 // a[7]a[6] (vii)
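 // i.e. the 28 = 8*7/2 distinct cross products a[i]*a[j], i<j, of
 // an eight-limb window, each computed exactly once; they are
 // doubled and combined with the squares a[i]*a[i] at
 // Lsqr8x_outer_break below.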

 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
 mul x15,x8,x6
 mul x16,x9,x6
 mul x17,x10,x6
 adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
 mul x14,x11,x6
 adcs x21,x21,x15
 mul x15,x12,x6
 adcs x22,x22,x16
 mul x16,x13,x6
 adcs x23,x23,x17
 umulh x17,x7,x6 // hi(a[1..7]*a[0])
 adcs x24,x24,x14
 umulh x14,x8,x6
 adcs x25,x25,x15
 umulh x15,x9,x6
 adcs x26,x26,x16
 umulh x16,x10,x6
 stp x19,x20,[x2],#8*2 // t[0..1]
 adc x19,xzr,xzr // t[8]
 adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
 umulh x17,x11,x6
 adcs x22,x22,x14
 umulh x14,x12,x6
 adcs x23,x23,x15
 umulh x15,x13,x6
 adcs x24,x24,x16
 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
 adcs x25,x25,x17
 mul x17,x9,x7
 adcs x26,x26,x14
 mul x14,x10,x7
 adc x19,x19,x15

 mul x15,x11,x7
 adds x22,x22,x16
 mul x16,x12,x7
 adcs x23,x23,x17
 mul x17,x13,x7
 adcs x24,x24,x14
 umulh x14,x8,x7 // hi(a[2..7]*a[1])
 adcs x25,x25,x15
 umulh x15,x9,x7
 adcs x26,x26,x16
 umulh x16,x10,x7
 adcs x19,x19,x17
 umulh x17,x11,x7
 stp x21,x22,[x2],#8*2 // t[2..3]
 adc x20,xzr,xzr // t[9]
 adds x23,x23,x14
 umulh x14,x12,x7
 adcs x24,x24,x15
 umulh x15,x13,x7
 adcs x25,x25,x16
 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
 adcs x26,x26,x17
 mul x17,x10,x8
 adcs x19,x19,x14
 mul x14,x11,x8
 adc x20,x20,x15

 mul x15,x12,x8
 adds x24,x24,x16
 mul x16,x13,x8
 adcs x25,x25,x17
 umulh x17,x9,x8 // hi(a[3..7]*a[2])
 adcs x26,x26,x14
 umulh x14,x10,x8
 adcs x19,x19,x15
 umulh x15,x11,x8
 adcs x20,x20,x16
 umulh x16,x12,x8
 stp x23,x24,[x2],#8*2 // t[4..5]
 adc x21,xzr,xzr // t[10]
 adds x25,x25,x17
 umulh x17,x13,x8
 adcs x26,x26,x14
 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
 adcs x19,x19,x15
 mul x15,x11,x9
 adcs x20,x20,x16
 mul x16,x12,x9
 adc x21,x21,x17

 mul x17,x13,x9
 adds x26,x26,x14
 umulh x14,x10,x9 // hi(a[4..7]*a[3])
 adcs x19,x19,x15
 umulh x15,x11,x9
 adcs x20,x20,x16
 umulh x16,x12,x9
 adcs x21,x21,x17
 umulh x17,x13,x9
 stp x25,x26,[x2],#8*2 // t[6..7]
 adc x22,xzr,xzr // t[11]
 adds x19,x19,x14
 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
 adcs x20,x20,x15
 mul x15,x12,x10
 adcs x21,x21,x16
 mul x16,x13,x10
 adc x22,x22,x17

 umulh x17,x11,x10 // hi(a[5..7]*a[4])
 adds x20,x20,x14
 umulh x14,x12,x10
 adcs x21,x21,x15
 umulh x15,x13,x10
 adcs x22,x22,x16
 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
 adc x23,xzr,xzr // t[12]
 adds x21,x21,x17
 mul x17,x13,x11
 adcs x22,x22,x14
 umulh x14,x12,x11 // hi(a[6..7]*a[5])
 adc x23,x23,x15

 umulh x15,x13,x11
 adds x22,x22,x16
 mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
 adcs x23,x23,x17
 umulh x17,x13,x12 // hi(a[7]*a[6])
 adc x24,xzr,xzr // t[13]
 adds x23,x23,x14
 sub x27,x3,x1 // done yet?
 adc x24,x24,x15

 adds x24,x24,x16
 sub x14,x3,x5 // rewound ap
 adc x25,xzr,xzr // t[14]
 add x25,x25,x17

 cbz x27,Lsqr8x_outer_break

 mov x4,x6
 ldp x6,x7,[x2,#8*0]
 ldp x8,x9,[x2,#8*2]
 ldp x10,x11,[x2,#8*4]
 ldp x12,x13,[x2,#8*6]
 adds x19,x19,x6
 adcs x20,x20,x7
 ldp x6,x7,[x1,#8*0]
 adcs x21,x21,x8
 adcs x22,x22,x9
 ldp x8,x9,[x1,#8*2]
 adcs x23,x23,x10
 adcs x24,x24,x11
 ldp x10,x11,[x1,#8*4]
 adcs x25,x25,x12
 mov x0,x1
 adcs x26,xzr,x13
 ldp x12,x13,[x1,#8*6]
 add x1,x1,#8*8
 //adc x28,xzr,xzr // moved below
 mov x27,#-8*8

 // a[8]a[0]
 // a[9]a[0]
 // a[a]a[0]
 // a[b]a[0]
 // a[c]a[0]
 // a[d]a[0]
 // a[e]a[0]
 // a[f]a[0]
 // a[8]a[1]
 // a[f]a[1]........................
 // a[8]a[2]
 // a[f]a[2]........................
 // a[8]a[3]
 // a[f]a[3]........................
 // a[8]a[4]
 // a[f]a[4]........................
 // a[8]a[5]
 // a[f]a[5]........................
 // a[8]a[6]
 // a[f]a[6]........................
 // a[8]a[7]
 // a[f]a[7]........................
Lsqr8x_mul:
 mul x14,x6,x4
 adc x28,xzr,xzr // carry bit, modulo-scheduled
 mul x15,x7,x4
 add x27,x27,#8
 mul x16,x8,x4
 mul x17,x9,x4
 adds x19,x19,x14
 mul x14,x10,x4
 adcs x20,x20,x15
 mul x15,x11,x4
 adcs x21,x21,x16
 mul x16,x12,x4
 adcs x22,x22,x17
 mul x17,x13,x4
 adcs x23,x23,x14
 umulh x14,x6,x4
 adcs x24,x24,x15
 umulh x15,x7,x4
 adcs x25,x25,x16
 umulh x16,x8,x4
 adcs x26,x26,x17
 umulh x17,x9,x4
 adc x28,x28,xzr
 str x19,[x2],#8
 adds x19,x20,x14
 umulh x14,x10,x4
 adcs x20,x21,x15
 umulh x15,x11,x4
 adcs x21,x22,x16
 umulh x16,x12,x4
 adcs x22,x23,x17
 umulh x17,x13,x4
 ldr x4,[x0,x27]
 adcs x23,x24,x14
 adcs x24,x25,x15
 adcs x25,x26,x16
 adcs x26,x28,x17
 //adc x28,xzr,xzr // moved above
 cbnz x27,Lsqr8x_mul
 // note that carry flag is guaranteed
 // to be zero at this point
 cmp x1,x3 // done yet?
 b.eq Lsqr8x_break

 ldp x6,x7,[x2,#8*0]
 ldp x8,x9,[x2,#8*2]
 ldp x10,x11,[x2,#8*4]
 ldp x12,x13,[x2,#8*6]
 adds x19,x19,x6
 ldr x4,[x0,#-8*8]
 adcs x20,x20,x7
 ldp x6,x7,[x1,#8*0]
 adcs x21,x21,x8
 adcs x22,x22,x9
 ldp x8,x9,[x1,#8*2]
 adcs x23,x23,x10
 adcs x24,x24,x11
 ldp x10,x11,[x1,#8*4]
 adcs x25,x25,x12
 mov x27,#-8*8
 adcs x26,x26,x13
 ldp x12,x13,[x1,#8*6]
 add x1,x1,#8*8
 //adc x28,xzr,xzr // moved above
 b Lsqr8x_mul

.align 4
Lsqr8x_break:
 ldp x6,x7,[x0,#8*0]
 add x1,x0,#8*8
 ldp x8,x9,[x0,#8*2]
 sub x14,x3,x1 // is it last iteration?
 ldp x10,x11,[x0,#8*4]
 sub x15,x2,x14
 ldp x12,x13,[x0,#8*6]
 cbz x14,Lsqr8x_outer_loop

 stp x19,x20,[x2,#8*0]
 ldp x19,x20,[x15,#8*0]
 stp x21,x22,[x2,#8*2]
 ldp x21,x22,[x15,#8*2]
 stp x23,x24,[x2,#8*4]
 ldp x23,x24,[x15,#8*4]
 stp x25,x26,[x2,#8*6]
 mov x2,x15
 ldp x25,x26,[x15,#8*6]
 b Lsqr8x_outer_loop

.align 4
Lsqr8x_outer_break:
 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
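 // The doubling below relies on "extr Xd,Xn,Xm,#63", which yields
 // (Xn<<1)|(Xm>>63), i.e. a one-bit left shift across adjacent
 // limbs; Lsqr4x_shift_n_add walks the whole cross-product vector
 // this way while folding in the a[i]*a[i] terms.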
 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
 ldp x15,x16,[sp,#8*1]
 ldp x11,x13,[x14,#8*2]
 add x1,x14,#8*4
 ldp x17,x14,[sp,#8*3]

 stp x19,x20,[x2,#8*0]
 mul x19,x7,x7
 stp x21,x22,[x2,#8*2]
 umulh x7,x7,x7
 stp x23,x24,[x2,#8*4]
 mul x8,x9,x9
 stp x25,x26,[x2,#8*6]
 mov x2,sp
 umulh x9,x9,x9
 adds x20,x7,x15,lsl#1
 extr x15,x16,x15,#63
 sub x27,x5,#8*4

Lsqr4x_shift_n_add:
 adcs x21,x8,x15
 extr x16,x17,x16,#63
 sub x27,x27,#8*4
 adcs x22,x9,x16
 ldp x15,x16,[x2,#8*5]
 mul x10,x11,x11
 ldp x7,x9,[x1],#8*2
 umulh x11,x11,x11
 mul x12,x13,x13
 umulh x13,x13,x13
 extr x17,x14,x17,#63
 stp x19,x20,[x2,#8*0]
 adcs x23,x10,x17
 extr x14,x15,x14,#63
 stp x21,x22,[x2,#8*2]
 adcs x24,x11,x14
 ldp x17,x14,[x2,#8*7]
 extr x15,x16,x15,#63
 adcs x25,x12,x15
 extr x16,x17,x16,#63
 adcs x26,x13,x16
 ldp x15,x16,[x2,#8*9]
 mul x6,x7,x7
 ldp x11,x13,[x1],#8*2
 umulh x7,x7,x7
 mul x8,x9,x9
 umulh x9,x9,x9
 stp x23,x24,[x2,#8*4]
 extr x17,x14,x17,#63
 stp x25,x26,[x2,#8*6]
 add x2,x2,#8*8
 adcs x19,x6,x17
 extr x14,x15,x14,#63
 adcs x20,x7,x14
 ldp x17,x14,[x2,#8*3]
 extr x15,x16,x15,#63
 cbnz x27,Lsqr4x_shift_n_add
 ldp x1,x4,[x29,#104] // pull np and n0

 adcs x21,x8,x15
 extr x16,x17,x16,#63
 adcs x22,x9,x16
 ldp x15,x16,[x2,#8*5]
 mul x10,x11,x11
 umulh x11,x11,x11
 stp x19,x20,[x2,#8*0]
 mul x12,x13,x13
 umulh x13,x13,x13
 stp x21,x22,[x2,#8*2]
 extr x17,x14,x17,#63
 adcs x23,x10,x17
 extr x14,x15,x14,#63
 ldp x19,x20,[sp,#8*0]
 adcs x24,x11,x14
 extr x15,x16,x15,#63
 ldp x6,x7,[x1,#8*0]
 adcs x25,x12,x15
 extr x16,xzr,x16,#63
 ldp x8,x9,[x1,#8*2]
 adc x26,x13,x16
 ldp x10,x11,[x1,#8*4]

 // Reduce by 512 bits per iteration
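 // Each Lsqr8x_reduction pass adds n[0..7]*x28, where
 // x28 = t[0]*n0 mod 2^64, so the low limb of t becomes zero (the
 // (*) subs trick explained earlier supplies the carry) and the
 // window slides down one limb; eight passes retire 512 bits, with
 // x30 accumulating the top-most carry for the tail.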
 mul x28,x4,x19 // t[0]*n0
 ldp x12,x13,[x1,#8*6]
 add x3,x1,x5
 ldp x21,x22,[sp,#8*2]
 stp x23,x24,[x2,#8*4]
 ldp x23,x24,[sp,#8*4]
 stp x25,x26,[x2,#8*6]
 ldp x25,x26,[sp,#8*6]
 add x1,x1,#8*8
 mov x30,xzr // initial top-most carry
 mov x2,sp
 mov x27,#8

Lsqr8x_reduction:
 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
 mul x15,x7,x28
 sub x27,x27,#1
 mul x16,x8,x28
 str x28,[x2],#8 // put aside t[0]*n0 for tail processing
 mul x17,x9,x28
 // (*) adds xzr,x19,x14
 subs xzr,x19,#1 // (*)
 mul x14,x10,x28
 adcs x19,x20,x15
 mul x15,x11,x28
 adcs x20,x21,x16
 mul x16,x12,x28
 adcs x21,x22,x17
 mul x17,x13,x28
 adcs x22,x23,x14
 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
 adcs x23,x24,x15
 umulh x15,x7,x28
 adcs x24,x25,x16
 umulh x16,x8,x28
 adcs x25,x26,x17
 umulh x17,x9,x28
 adc x26,xzr,xzr
 adds x19,x19,x14
 umulh x14,x10,x28
 adcs x20,x20,x15
 umulh x15,x11,x28
 adcs x21,x21,x16
 umulh x16,x12,x28
 adcs x22,x22,x17
 umulh x17,x13,x28
 mul x28,x4,x19 // next t[0]*n0
 adcs x23,x23,x14
 adcs x24,x24,x15
 adcs x25,x25,x16
 adc x26,x26,x17
 cbnz x27,Lsqr8x_reduction

 ldp x14,x15,[x2,#8*0]
 ldp x16,x17,[x2,#8*2]
 mov x0,x2
 sub x27,x3,x1 // done yet?
 adds x19,x19,x14
 adcs x20,x20,x15
 ldp x14,x15,[x2,#8*4]
 adcs x21,x21,x16
 adcs x22,x22,x17
 ldp x16,x17,[x2,#8*6]
 adcs x23,x23,x14
 adcs x24,x24,x15
 adcs x25,x25,x16
 adcs x26,x26,x17
 //adc x28,xzr,xzr // moved below
 cbz x27,Lsqr8x8_post_condition

 ldr x4,[x2,#-8*8]
 ldp x6,x7,[x1,#8*0]
 ldp x8,x9,[x1,#8*2]
 ldp x10,x11,[x1,#8*4]
 mov x27,#-8*8
 ldp x12,x13,[x1,#8*6]
 add x1,x1,#8*8

Lsqr8x_tail:
 mul x14,x6,x4
 adc x28,xzr,xzr // carry bit, modulo-scheduled
 mul x15,x7,x4
 add x27,x27,#8
 mul x16,x8,x4
 mul x17,x9,x4
 adds x19,x19,x14
 mul x14,x10,x4
 adcs x20,x20,x15
 mul x15,x11,x4
 adcs x21,x21,x16
 mul x16,x12,x4
 adcs x22,x22,x17
 mul x17,x13,x4
 adcs x23,x23,x14
 umulh x14,x6,x4
 adcs x24,x24,x15
 umulh x15,x7,x4
 adcs x25,x25,x16
 umulh x16,x8,x4
 adcs x26,x26,x17
 umulh x17,x9,x4
 adc x28,x28,xzr
 str x19,[x2],#8
 adds x19,x20,x14
 umulh x14,x10,x4
 adcs x20,x21,x15
 umulh x15,x11,x4
 adcs x21,x22,x16
 umulh x16,x12,x4
 adcs x22,x23,x17
 umulh x17,x13,x4
 ldr x4,[x0,x27]
 adcs x23,x24,x14
 adcs x24,x25,x15
 adcs x25,x26,x16
 adcs x26,x28,x17
 //adc x28,xzr,xzr // moved above
 cbnz x27,Lsqr8x_tail
 // note that carry flag is guaranteed
 // to be zero at this point
 ldp x6,x7,[x2,#8*0]
 sub x27,x3,x1 // done yet?
 sub x16,x3,x5 // rewound np
 ldp x8,x9,[x2,#8*2]
 ldp x10,x11,[x2,#8*4]
 ldp x12,x13,[x2,#8*6]
 cbz x27,Lsqr8x_tail_break

 ldr x4,[x0,#-8*8]
 adds x19,x19,x6
 adcs x20,x20,x7
 ldp x6,x7,[x1,#8*0]
 adcs x21,x21,x8
 adcs x22,x22,x9
 ldp x8,x9,[x1,#8*2]
 adcs x23,x23,x10
 adcs x24,x24,x11
 ldp x10,x11,[x1,#8*4]
 adcs x25,x25,x12
 mov x27,#-8*8
 adcs x26,x26,x13
 ldp x12,x13,[x1,#8*6]
 add x1,x1,#8*8
 //adc x28,xzr,xzr // moved above
 b Lsqr8x_tail

.align 4
Lsqr8x_tail_break:
 ldr x4,[x29,#112] // pull n0
 add x27,x2,#8*8 // end of current t[num] window

 subs xzr,x30,#1 // "move" top-most carry to carry bit
 adcs x14,x19,x6
 adcs x15,x20,x7
 ldp x19,x20,[x0,#8*0]
 adcs x21,x21,x8
 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
 adcs x22,x22,x9
 ldp x8,x9,[x16,#8*2]
 adcs x23,x23,x10
 adcs x24,x24,x11
 ldp x10,x11,[x16,#8*4]
 adcs x25,x25,x12
 adcs x26,x26,x13
 ldp x12,x13,[x16,#8*6]
 add x1,x16,#8*8
 adc x30,xzr,xzr // top-most carry
 mul x28,x4,x19
 stp x14,x15,[x2,#8*0]
 stp x21,x22,[x2,#8*2]
 ldp x21,x22,[x0,#8*2]
 stp x23,x24,[x2,#8*4]
 ldp x23,x24,[x0,#8*4]
 cmp x27,x29 // did we hit the bottom?
 stp x25,x26,[x2,#8*6]
 mov x2,x0 // slide the window
 ldp x25,x26,[x0,#8*6]
 mov x27,#8
 b.ne Lsqr8x_reduction

 // Final step. We check whether the result is larger than the
 // modulus, and if it is, subtract the modulus. But comparison
 // implies subtraction, so we subtract the modulus, check whether
 // the subtraction borrowed, and conditionally copy the original value.
 ldr x0,[x29,#96] // pull rp
 add x2,x2,#8*8
 subs x14,x19,x6
 sbcs x15,x20,x7
 sub x27,x5,#8*8
 mov x3,x0 // x0 copy

Lsqr8x_sub:
 sbcs x16,x21,x8
 ldp x6,x7,[x1,#8*0]
 sbcs x17,x22,x9
 stp x14,x15,[x0,#8*0]
 sbcs x14,x23,x10
 ldp x8,x9,[x1,#8*2]
 sbcs x15,x24,x11
 stp x16,x17,[x0,#8*2]
 sbcs x16,x25,x12
 ldp x10,x11,[x1,#8*4]
 sbcs x17,x26,x13
 ldp x12,x13,[x1,#8*6]
 add x1,x1,#8*8
 ldp x19,x20,[x2,#8*0]
 sub x27,x27,#8*8
 ldp x21,x22,[x2,#8*2]
 ldp x23,x24,[x2,#8*4]
 ldp x25,x26,[x2,#8*6]
 add x2,x2,#8*8
 stp x14,x15,[x0,#8*4]
 sbcs x14,x19,x6
 stp x16,x17,[x0,#8*6]
 add x0,x0,#8*8
 sbcs x15,x20,x7
 cbnz x27,Lsqr8x_sub

 sbcs x16,x21,x8
 mov x2,sp
 add x1,sp,x5
 ldp x6,x7,[x3,#8*0]
 sbcs x17,x22,x9
 stp x14,x15,[x0,#8*0]
 sbcs x14,x23,x10
 ldp x8,x9,[x3,#8*2]
 sbcs x15,x24,x11
 stp x16,x17,[x0,#8*2]
 sbcs x16,x25,x12
 ldp x19,x20,[x1,#8*0]
 sbcs x17,x26,x13
 ldp x21,x22,[x1,#8*2]
 sbcs xzr,x30,xzr // did it borrow?
 ldr x30,[x29,#8] // pull return address
 stp x14,x15,[x0,#8*4]
 stp x16,x17,[x0,#8*6]

 sub x27,x5,#8*4
Lsqr4x_cond_copy:
 sub x27,x27,#8*4
 csel x14,x19,x6,lo
 stp xzr,xzr,[x2,#8*0]
 csel x15,x20,x7,lo
 ldp x6,x7,[x3,#8*4]
 ldp x19,x20,[x1,#8*4]
 csel x16,x21,x8,lo
 stp xzr,xzr,[x2,#8*2]
 add x2,x2,#8*4
 csel x17,x22,x9,lo
 ldp x8,x9,[x3,#8*6]
 ldp x21,x22,[x1,#8*6]
 add x1,x1,#8*4
 stp x14,x15,[x3,#8*0]
 stp x16,x17,[x3,#8*2]
 add x3,x3,#8*4
 stp xzr,xzr,[x1,#8*0]
 stp xzr,xzr,[x1,#8*2]
 cbnz x27,Lsqr4x_cond_copy

 csel x14,x19,x6,lo
 stp xzr,xzr,[x2,#8*0]
 csel x15,x20,x7,lo
 stp xzr,xzr,[x2,#8*2]
 csel x16,x21,x8,lo
 csel x17,x22,x9,lo
 stp x14,x15,[x3,#8*0]
 stp x16,x17,[x3,#8*2]

 b Lsqr8x_done

.align 4
Lsqr8x8_post_condition:
 adc x28,xzr,xzr
 ldr x30,[x29,#8] // pull return address
 // x19-x26,x28 hold result, x6-x13 hold modulus
 subs x6,x19,x6
 ldr x1,[x29,#96] // pull rp
 sbcs x7,x20,x7
 stp xzr,xzr,[sp,#8*0]
 sbcs x8,x21,x8
 stp xzr,xzr,[sp,#8*2]
 sbcs x9,x22,x9
 stp xzr,xzr,[sp,#8*4]
 sbcs x10,x23,x10
 stp xzr,xzr,[sp,#8*6]
 sbcs x11,x24,x11
 stp xzr,xzr,[sp,#8*8]
 sbcs x12,x25,x12
 stp xzr,xzr,[sp,#8*10]
 sbcs x13,x26,x13
 stp xzr,xzr,[sp,#8*12]
 sbcs x28,x28,xzr // did it borrow?
 stp xzr,xzr,[sp,#8*14]

 // x6-x13 hold result-modulus
 csel x6,x19,x6,lo
 csel x7,x20,x7,lo
 csel x8,x21,x8,lo
 csel x9,x22,x9,lo
 stp x6,x7,[x1,#8*0]
 csel x10,x23,x10,lo
 csel x11,x24,x11,lo
 stp x8,x9,[x1,#8*2]
 csel x12,x25,x12,lo
 csel x13,x26,x13,lo
 stp x10,x11,[x1,#8*4]
 stp x12,x13,[x1,#8*6]

Lsqr8x_done:
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
 mov x0,#1
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
 ldr x29,[sp],#128
 // x30 is popped earlier
 AARCH64_VALIDATE_LINK_REGISTER
 ret


.align 5
__bn_mul4x_mont:
 // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
 // only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
 // return address.
 stp x29,x30,[sp,#-128]!
 add x29,sp,#0
 stp x19,x20,[sp,#16]
 stp x21,x22,[sp,#32]
 stp x23,x24,[sp,#48]
 stp x25,x26,[sp,#64]
 stp x27,x28,[sp,#80]

 sub x26,sp,x5,lsl#3
 lsl x5,x5,#3
 ldr x4,[x4] // *n0
 sub sp,x26,#8*4 // alloca

 add x10,x2,x5
 add x27,x1,x5
 stp x0,x10,[x29,#96] // offload rp and &b[num]

 ldr x24,[x2,#8*0] // b[0]
 ldp x6,x7,[x1,#8*0] // a[0..3]
 ldp x8,x9,[x1,#8*2]
 add x1,x1,#8*4
 mov x19,xzr
 mov x20,xzr
 mov x21,xzr
 mov x22,xzr
 ldp x14,x15,[x3,#8*0] // n[0..3]
 ldp x16,x17,[x3,#8*2]
 adds x3,x3,#8*4 // clear carry bit
 mov x0,xzr
 mov x28,#0
 mov x26,sp

Loop_mul4x_1st_reduction:
 mul x10,x6,x24 // lo(a[0..3]*b[0])
 adc x0,x0,xzr // modulo-scheduled
 mul x11,x7,x24
 add x28,x28,#8
 mul x12,x8,x24
 and x28,x28,#31
 mul x13,x9,x24
 adds x19,x19,x10
 umulh x10,x6,x24 // hi(a[0..3]*b[0])
 adcs x20,x20,x11
 mul x25,x19,x4 // t[0]*n0
 adcs x21,x21,x12
 umulh x11,x7,x24
 adcs x22,x22,x13
 umulh x12,x8,x24
 adc x23,xzr,xzr
 umulh x13,x9,x24
 ldr x24,[x2,x28] // next b[i] (or b[0])
 adds x20,x20,x10
 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
 adcs x21,x21,x11
 mul x11,x15,x25
 adcs x22,x22,x12
 mul x12,x16,x25
 adc x23,x23,x13 // can't overflow
 mul x13,x17,x25
 // (*) adds xzr,x19,x10
 subs xzr,x19,#1 // (*)
 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
 adcs x19,x20,x11
 umulh x11,x15,x25
 adcs x20,x21,x12
 umulh x12,x16,x25
 adcs x21,x22,x13
 umulh x13,x17,x25
 adcs x22,x23,x0
 adc x0,xzr,xzr
 adds x19,x19,x10
 sub x10,x27,x1
 adcs x20,x20,x11
 adcs x21,x21,x12
 adcs x22,x22,x13
 //adc x0,x0,xzr
 cbnz x28,Loop_mul4x_1st_reduction

 cbz x10,Lmul4x4_post_condition

 ldp x6,x7,[x1,#8*0] // a[4..7]
 ldp x8,x9,[x1,#8*2]
 add x1,x1,#8*4
 ldr x25,[sp] // a[0]*n0
 ldp x14,x15,[x3,#8*0] // n[4..7]
 ldp x16,x17,[x3,#8*2]
 add x3,x3,#8*4

Loop_mul4x_1st_tail:
 mul x10,x6,x24 // lo(a[4..7]*b[i])
 adc x0,x0,xzr // modulo-scheduled
 mul x11,x7,x24
 add x28,x28,#8
 mul x12,x8,x24
 and x28,x28,#31
 mul x13,x9,x24
 adds x19,x19,x10
 umulh x10,x6,x24 // hi(a[4..7]*b[i])
 adcs x20,x20,x11
 umulh x11,x7,x24
 adcs x21,x21,x12
 umulh x12,x8,x24
 adcs x22,x22,x13
 umulh x13,x9,x24
 adc x23,xzr,xzr
 ldr x24,[x2,x28] // next b[i] (or b[0])
 adds x20,x20,x10
 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
 adcs x21,x21,x11
 mul x11,x15,x25
 adcs x22,x22,x12
 mul x12,x16,x25
 adc x23,x23,x13 // can't overflow
 mul x13,x17,x25
 adds x19,x19,x10
 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
 adcs x20,x20,x11
 umulh x11,x15,x25
 adcs x21,x21,x12
 umulh x12,x16,x25
 adcs x22,x22,x13
 adcs x23,x23,x0
 umulh x13,x17,x25
 adc x0,xzr,xzr
 ldr x25,[sp,x28] // next t[0]*n0
 str x19,[x26],#8 // result!!!
 adds x19,x20,x10
 sub x10,x27,x1 // done yet?
 adcs x20,x21,x11
 adcs x21,x22,x12
 adcs x22,x23,x13
 //adc x0,x0,xzr
 cbnz x28,Loop_mul4x_1st_tail

 sub x11,x27,x5 // rewound x1
 cbz x10,Lmul4x_proceed

 ldp x6,x7,[x1,#8*0]
 ldp x8,x9,[x1,#8*2]
 add x1,x1,#8*4
 ldp x14,x15,[x3,#8*0]
 ldp x16,x17,[x3,#8*2]
 add x3,x3,#8*4
 b Loop_mul4x_1st_tail

.align 5
Lmul4x_proceed:
 ldr x24,[x2,#8*4]! // *++b
 adc x30,x0,xzr
 ldp x6,x7,[x11,#8*0] // a[0..3]
 sub x3,x3,x5 // rewind np
 ldp x8,x9,[x11,#8*2]
 add x1,x11,#8*4

 stp x19,x20,[x26,#8*0] // result!!!
 ldp x19,x20,[sp,#8*4] // t[0..3]
 stp x21,x22,[x26,#8*2] // result!!!
 ldp x21,x22,[sp,#8*6]

 ldp x14,x15,[x3,#8*0] // n[0..3]
 mov x26,sp
 ldp x16,x17,[x3,#8*2]
 adds x3,x3,#8*4 // clear carry bit
 mov x0,xzr

.align 4
Loop_mul4x_reduction:
 mul x10,x6,x24 // lo(a[0..3]*b[4])
 adc x0,x0,xzr // modulo-scheduled
 mul x11,x7,x24
 add x28,x28,#8
 mul x12,x8,x24
 and x28,x28,#31
 mul x13,x9,x24
 adds x19,x19,x10
 umulh x10,x6,x24 // hi(a[0..3]*b[4])
 adcs x20,x20,x11
 mul x25,x19,x4 // t[0]*n0
 adcs x21,x21,x12
 umulh x11,x7,x24
 adcs x22,x22,x13
 umulh x12,x8,x24
 adc x23,xzr,xzr
 umulh x13,x9,x24
 ldr x24,[x2,x28] // next b[i]
 adds x20,x20,x10
 // (*) mul x10,x14,x25
 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
 adcs x21,x21,x11
 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
 adcs x22,x22,x12
 mul x12,x16,x25
 adc x23,x23,x13 // can't overflow
 mul x13,x17,x25
 // (*) adds xzr,x19,x10
 subs xzr,x19,#1 // (*)
 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
 adcs x19,x20,x11
 umulh x11,x15,x25
 adcs x20,x21,x12
 umulh x12,x16,x25
 adcs x21,x22,x13
 umulh x13,x17,x25
 adcs x22,x23,x0
 adc x0,xzr,xzr
 adds x19,x19,x10
 adcs x20,x20,x11
 adcs x21,x21,x12
 adcs x22,x22,x13
 //adc x0,x0,xzr
 cbnz x28,Loop_mul4x_reduction

 adc x0,x0,xzr
 ldp x10,x11,[x26,#8*4] // t[4..7]
 ldp x12,x13,[x26,#8*6]
 ldp x6,x7,[x1,#8*0] // a[4..7]
 ldp x8,x9,[x1,#8*2]
 add x1,x1,#8*4
 adds x19,x19,x10
 adcs x20,x20,x11
 adcs x21,x21,x12
 adcs x22,x22,x13
 //adc x0,x0,xzr

 ldr x25,[sp] // t[0]*n0
 ldp x14,x15,[x3,#8*0] // n[4..7]
 ldp x16,x17,[x3,#8*2]
 add x3,x3,#8*4

.align 4
Loop_mul4x_tail:
 mul x10,x6,x24 // lo(a[4..7]*b[4])
 adc x0,x0,xzr // modulo-scheduled
 mul x11,x7,x24
 add x28,x28,#8
 mul x12,x8,x24
 and x28,x28,#31
 mul x13,x9,x24
 adds x19,x19,x10
 umulh x10,x6,x24 // hi(a[4..7]*b[4])
 adcs x20,x20,x11
 umulh x11,x7,x24
 adcs x21,x21,x12
 umulh x12,x8,x24
 adcs x22,x22,x13
 umulh x13,x9,x24
 adc x23,xzr,xzr
 ldr x24,[x2,x28] // next b[i]
 adds x20,x20,x10
 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
 adcs x21,x21,x11
 mul x11,x15,x25
 adcs x22,x22,x12
 mul x12,x16,x25
 adc x23,x23,x13 // can't overflow
 mul x13,x17,x25
 adds x19,x19,x10
 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
 adcs x20,x20,x11
 umulh x11,x15,x25
 adcs x21,x21,x12
 umulh x12,x16,x25
 adcs x22,x22,x13
 umulh x13,x17,x25
 adcs x23,x23,x0
 ldr x25,[sp,x28] // next a[0]*n0
 adc x0,xzr,xzr
 str x19,[x26],#8 // result!!!
 adds x19,x20,x10
 sub x10,x27,x1 // done yet?
 adcs x20,x21,x11
 adcs x21,x22,x12
 adcs x22,x23,x13
 //adc x0,x0,xzr
 cbnz x28,Loop_mul4x_tail

 sub x11,x3,x5 // rewound np?
 adc x0,x0,xzr
 cbz x10,Loop_mul4x_break

 ldp x10,x11,[x26,#8*4]
 ldp x12,x13,[x26,#8*6]
 ldp x6,x7,[x1,#8*0]
 ldp x8,x9,[x1,#8*2]
 add x1,x1,#8*4
 adds x19,x19,x10
 adcs x20,x20,x11
 adcs x21,x21,x12
 adcs x22,x22,x13
 //adc x0,x0,xzr
 ldp x14,x15,[x3,#8*0]
 ldp x16,x17,[x3,#8*2]
 add x3,x3,#8*4
 b Loop_mul4x_tail

.align 4
Loop_mul4x_break:
 ldp x12,x13,[x29,#96] // pull rp and &b[num]
 adds x19,x19,x30
 add x2,x2,#8*4 // bp++
 adcs x20,x20,xzr
 sub x1,x1,x5 // rewind ap
 adcs x21,x21,xzr
 stp x19,x20,[x26,#8*0] // result!!!
 adcs x22,x22,xzr
 ldp x19,x20,[sp,#8*4] // t[0..3]
 adc x30,x0,xzr
 stp x21,x22,[x26,#8*2] // result!!!
 cmp x2,x13 // done yet?
 ldp x21,x22,[sp,#8*6]
 ldp x14,x15,[x11,#8*0] // n[0..3]
 ldp x16,x17,[x11,#8*2]
 add x3,x11,#8*4
 b.eq Lmul4x_post

 ldr x24,[x2]
 ldp x6,x7,[x1,#8*0] // a[0..3]
 ldp x8,x9,[x1,#8*2]
 adds x1,x1,#8*4 // clear carry bit
 mov x0,xzr
 mov x26,sp
 b Loop_mul4x_reduction

.align 4
Lmul4x_post:
 // Final step. We check whether the result is larger than the
 // modulus, and if it is, subtract the modulus. But comparison
 // implies subtraction, so we subtract the modulus, check whether
 // the subtraction borrowed, and conditionally copy the original value.
 mov x0,x12
 mov x27,x12 // x0 copy
 subs x10,x19,x14
 add x26,sp,#8*8
 sbcs x11,x20,x15
 sub x28,x5,#8*4

Lmul4x_sub:
 sbcs x12,x21,x16
 ldp x14,x15,[x3,#8*0]
 sub x28,x28,#8*4
 ldp x19,x20,[x26,#8*0]
 sbcs x13,x22,x17
 ldp x16,x17,[x3,#8*2]
 add x3,x3,#8*4
 ldp x21,x22,[x26,#8*2]
 add x26,x26,#8*4
 stp x10,x11,[x0,#8*0]
 sbcs x10,x19,x14
 stp x12,x13,[x0,#8*2]
 add x0,x0,#8*4
 sbcs x11,x20,x15
 cbnz x28,Lmul4x_sub

 sbcs x12,x21,x16
 mov x26,sp
 add x1,sp,#8*4
 ldp x6,x7,[x27,#8*0]
 sbcs x13,x22,x17
 stp x10,x11,[x0,#8*0]
 ldp x8,x9,[x27,#8*2]
 stp x12,x13,[x0,#8*2]
 ldp x19,x20,[x1,#8*0]
 ldp x21,x22,[x1,#8*2]
 sbcs xzr,x30,xzr // did it borrow?
 ldr x30,[x29,#8] // pull return address

 sub x28,x5,#8*4
Lmul4x_cond_copy:
 sub x28,x28,#8*4
 csel x10,x19,x6,lo
 stp xzr,xzr,[x26,#8*0]
 csel x11,x20,x7,lo
 ldp x6,x7,[x27,#8*4]
 ldp x19,x20,[x1,#8*4]
 csel x12,x21,x8,lo
 stp xzr,xzr,[x26,#8*2]
 add x26,x26,#8*4
 csel x13,x22,x9,lo
 ldp x8,x9,[x27,#8*6]
 ldp x21,x22,[x1,#8*6]
 add x1,x1,#8*4
 stp x10,x11,[x27,#8*0]
 stp x12,x13,[x27,#8*2]
 add x27,x27,#8*4
 cbnz x28,Lmul4x_cond_copy

 csel x10,x19,x6,lo
 stp xzr,xzr,[x26,#8*0]
 csel x11,x20,x7,lo
 stp xzr,xzr,[x26,#8*2]
 csel x12,x21,x8,lo
 stp xzr,xzr,[x26,#8*3]
 csel x13,x22,x9,lo
 stp xzr,xzr,[x26,#8*4]
 stp x10,x11,[x27,#8*0]
 stp x12,x13,[x27,#8*2]

 b Lmul4x_done

.align 4
Lmul4x4_post_condition:
 adc x0,x0,xzr
 ldr x1,[x29,#96] // pull rp
 // x19-x22,x0 hold result, x14-x17 hold modulus
 subs x6,x19,x14
 ldr x30,[x29,#8] // pull return address
 sbcs x7,x20,x15
 stp xzr,xzr,[sp,#8*0]
 sbcs x8,x21,x16
 stp xzr,xzr,[sp,#8*2]
 sbcs x9,x22,x17
 stp xzr,xzr,[sp,#8*4]
 sbcs xzr,x0,xzr // did it borrow?
 stp xzr,xzr,[sp,#8*6]

 // x6-x9 hold result-modulus
 csel x6,x19,x6,lo
 csel x7,x20,x7,lo
 csel x8,x21,x8,lo
 csel x9,x22,x9,lo
 stp x6,x7,[x1,#8*0]
 stp x8,x9,[x1,#8*2]

Lmul4x_done:
 ldp x19,x20,[x29,#16]
 mov sp,x29
 ldp x21,x22,[x29,#32]
 mov x0,#1
 ldp x23,x24,[x29,#48]
 ldp x25,x26,[x29,#64]
 ldp x27,x28,[x29,#80]
 ldr x29,[sp],#128
 // x30 is popped earlier
 AARCH64_VALIDATE_LINK_REGISTER
 ret

.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif // !OPENSSL_NO_ASM