blob: e9a6a067a2258a23e90e8071ba183b699e7fab76 [file] [log] [blame]
Patrice Arruda748609c2020-06-25 12:12:21 -07001// Copyright 2019 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
Dan Willemsencc753b72021-08-31 13:25:42 -07005//go:build ppc64le
Patrice Arruda748609c2020-06-25 12:12:21 -07006
7package elliptic
8
9import (
10 "crypto/subtle"
11 "encoding/binary"
12 "math/big"
13)
14
15// This was ported from the s390x implementation for ppc64le.
16// Some hints are included here for changes that should be
17// in the big endian ppc64 implementation, however more
18// investigation and testing is needed for the ppc64 big
19// endian version to work.
20type p256CurveFast struct {
21 *CurveParams
22}
23
24type p256Point struct {
25 x [32]byte
26 y [32]byte
27 z [32]byte
28}
29
30var (
31 p256 Curve
32 p256PreFast *[37][64]p256Point
33)
34
35func initP256Arch() {
36 p256 = p256CurveFast{p256Params}
37 initTable()
38 return
39}
40
41func (curve p256CurveFast) Params() *CurveParams {
42 return curve.CurveParams
43}
44
45// Functions implemented in p256_asm_ppc64le.s
46// Montgomery multiplication modulo P256
47//
48//go:noescape
49func p256MulAsm(res, in1, in2 []byte)
50
51// Montgomery square modulo P256
52//
53func p256Sqr(res, in []byte) {
54 p256MulAsm(res, in, in)
55}
56
57// Montgomery multiplication by 1
58//
59//go:noescape
60func p256FromMont(res, in []byte)
61
62// iff cond == 1 val <- -val
63//
64//go:noescape
65func p256NegCond(val *p256Point, cond int)
66
67// if cond == 0 res <- b; else res <- a
68//
69//go:noescape
70func p256MovCond(res, a, b *p256Point, cond int)
71
72// Constant time table access
73//
74//go:noescape
75func p256Select(point *p256Point, table []p256Point, idx int)
76
77//
78//go:noescape
79func p256SelectBase(point *p256Point, table []p256Point, idx int)
80
81// Point add with P2 being affine point
82// If sign == 1 -> P2 = -P2
83// If sel == 0 -> P3 = P1
84// if zero == 0 -> P3 = P2
85//
86//go:noescape
87func p256PointAddAffineAsm(res, in1, in2 *p256Point, sign, sel, zero int)
88
89// Point add
90//
91//go:noescape
92func p256PointAddAsm(res, in1, in2 *p256Point) int
93
94//
95//go:noescape
96func p256PointDoubleAsm(res, in *p256Point)
97
98// The result should be a slice in LE order, but the slice
99// from big.Bytes is in BE order.
100// TODO: For big endian implementation, do not reverse bytes.
101//
102func fromBig(big *big.Int) []byte {
103 // This could be done a lot more efficiently...
104 res := big.Bytes()
105 t := make([]byte, 32)
106 if len(res) < 32 {
107 copy(t[32-len(res):], res)
108 } else if len(res) == 32 {
109 copy(t, res)
110 } else {
111 copy(t, res[len(res)-32:])
112 }
113 p256ReverseBytes(t, t)
114 return t
115}
116
117// p256GetMultiplier makes sure byte array will have 32 byte elements, If the scalar
118// is equal or greater than the order of the group, it's reduced modulo that order.
119func p256GetMultiplier(in []byte) []byte {
120 n := new(big.Int).SetBytes(in)
121
122 if n.Cmp(p256Params.N) >= 0 {
123 n.Mod(n, p256Params.N)
124 }
125 return fromBig(n)
126}
127
128// p256MulAsm operates in a Montgomery domain with R = 2^256 mod p, where p is the
129// underlying field of the curve. (See initP256 for the value.) Thus rr here is
130// R×R mod p. See comment in Inverse about how this is used.
131// TODO: For big endian implementation, the bytes in these slices should be in reverse order,
132// as found in the s390x implementation.
133var rr = []byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0, 0xff, 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00}
134
135// (This is one, in the Montgomery domain.)
136var one = []byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}
137
138func maybeReduceModP(in *big.Int) *big.Int {
139 if in.Cmp(p256Params.P) < 0 {
140 return in
141 }
142 return new(big.Int).Mod(in, p256Params.P)
143}
144
145// p256ReverseBytes copies the first 32 bytes from in to res in reverse order.
146func p256ReverseBytes(res, in []byte) {
147 // remove bounds check
148 in = in[:32]
149 res = res[:32]
150
151 // Load in reverse order
152 a := binary.BigEndian.Uint64(in[0:])
153 b := binary.BigEndian.Uint64(in[8:])
154 c := binary.BigEndian.Uint64(in[16:])
155 d := binary.BigEndian.Uint64(in[24:])
156
157 // Store in normal order
158 binary.LittleEndian.PutUint64(res[0:], d)
159 binary.LittleEndian.PutUint64(res[8:], c)
160 binary.LittleEndian.PutUint64(res[16:], b)
161 binary.LittleEndian.PutUint64(res[24:], a)
162}
163
164func (curve p256CurveFast) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
165 var r1, r2 p256Point
166
167 scalarReduced := p256GetMultiplier(baseScalar)
168 r1IsInfinity := scalarIsZero(scalarReduced)
169 r1.p256BaseMult(scalarReduced)
170
171 copy(r2.x[:], fromBig(maybeReduceModP(bigX)))
172 copy(r2.y[:], fromBig(maybeReduceModP(bigY)))
173 copy(r2.z[:], one)
174 p256MulAsm(r2.x[:], r2.x[:], rr[:])
175 p256MulAsm(r2.y[:], r2.y[:], rr[:])
176
177 scalarReduced = p256GetMultiplier(scalar)
178 r2IsInfinity := scalarIsZero(scalarReduced)
179 r2.p256ScalarMult(scalarReduced)
180
181 var sum, double p256Point
182 pointsEqual := p256PointAddAsm(&sum, &r1, &r2)
183 p256PointDoubleAsm(&double, &r1)
184 p256MovCond(&sum, &double, &sum, pointsEqual)
185 p256MovCond(&sum, &r1, &sum, r2IsInfinity)
186 p256MovCond(&sum, &r2, &sum, r1IsInfinity)
187 return sum.p256PointToAffine()
188}
189
190func (curve p256CurveFast) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
191 var r p256Point
192 reducedScalar := p256GetMultiplier(scalar)
193 r.p256BaseMult(reducedScalar)
194 return r.p256PointToAffine()
195}
196
197func (curve p256CurveFast) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
198 scalarReduced := p256GetMultiplier(scalar)
199 var r p256Point
200 copy(r.x[:], fromBig(maybeReduceModP(bigX)))
201 copy(r.y[:], fromBig(maybeReduceModP(bigY)))
202 copy(r.z[:], one)
203 p256MulAsm(r.x[:], r.x[:], rr[:])
204 p256MulAsm(r.y[:], r.y[:], rr[:])
205 r.p256ScalarMult(scalarReduced)
206 return r.p256PointToAffine()
207}
208
209func scalarIsZero(scalar []byte) int {
210 // If any byte is not zero, return 0.
211 // Check for -0.... since that appears to compare to 0.
212 b := byte(0)
213 for _, s := range scalar {
214 b |= s
215 }
216 return subtle.ConstantTimeByteEq(b, 0)
217}
218
219func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
220 zInv := make([]byte, 32)
221 zInvSq := make([]byte, 32)
222
223 p256Inverse(zInv, p.z[:])
224 p256Sqr(zInvSq, zInv)
225 p256MulAsm(zInv, zInv, zInvSq)
226
227 p256MulAsm(zInvSq, p.x[:], zInvSq)
228 p256MulAsm(zInv, p.y[:], zInv)
229
230 p256FromMont(zInvSq, zInvSq)
231 p256FromMont(zInv, zInv)
232
233 // SetBytes expects a slice in big endian order,
234 // since ppc64le is little endian, reverse the bytes.
235 // TODO: For big endian, bytes don't need to be reversed.
236 p256ReverseBytes(zInvSq, zInvSq)
237 p256ReverseBytes(zInv, zInv)
238 rx := new(big.Int).SetBytes(zInvSq)
239 ry := new(big.Int).SetBytes(zInv)
240 return rx, ry
241}
242
243// p256Inverse sets out to in^-1 mod p.
244func p256Inverse(out, in []byte) {
245 var stack [6 * 32]byte
246 p2 := stack[32*0 : 32*0+32]
247 p4 := stack[32*1 : 32*1+32]
248 p8 := stack[32*2 : 32*2+32]
249 p16 := stack[32*3 : 32*3+32]
250 p32 := stack[32*4 : 32*4+32]
251
252 p256Sqr(out, in)
253 p256MulAsm(p2, out, in) // 3*p
254
255 p256Sqr(out, p2)
256 p256Sqr(out, out)
257 p256MulAsm(p4, out, p2) // f*p
258
259 p256Sqr(out, p4)
260 p256Sqr(out, out)
261 p256Sqr(out, out)
262 p256Sqr(out, out)
263 p256MulAsm(p8, out, p4) // ff*p
264
265 p256Sqr(out, p8)
266
267 for i := 0; i < 7; i++ {
268 p256Sqr(out, out)
269 }
270 p256MulAsm(p16, out, p8) // ffff*p
271
272 p256Sqr(out, p16)
273 for i := 0; i < 15; i++ {
274 p256Sqr(out, out)
275 }
276 p256MulAsm(p32, out, p16) // ffffffff*p
277
278 p256Sqr(out, p32)
279
280 for i := 0; i < 31; i++ {
281 p256Sqr(out, out)
282 }
283 p256MulAsm(out, out, in)
284
285 for i := 0; i < 32*4; i++ {
286 p256Sqr(out, out)
287 }
288 p256MulAsm(out, out, p32)
289
290 for i := 0; i < 32; i++ {
291 p256Sqr(out, out)
292 }
293 p256MulAsm(out, out, p32)
294
295 for i := 0; i < 16; i++ {
296 p256Sqr(out, out)
297 }
298 p256MulAsm(out, out, p16)
299
300 for i := 0; i < 8; i++ {
301 p256Sqr(out, out)
302 }
303 p256MulAsm(out, out, p8)
304
305 p256Sqr(out, out)
306 p256Sqr(out, out)
307 p256Sqr(out, out)
308 p256Sqr(out, out)
309 p256MulAsm(out, out, p4)
310
311 p256Sqr(out, out)
312 p256Sqr(out, out)
313 p256MulAsm(out, out, p2)
314
315 p256Sqr(out, out)
316 p256Sqr(out, out)
317 p256MulAsm(out, out, in)
318}
319
320func boothW5(in uint) (int, int) {
321 var s uint = ^((in >> 5) - 1)
322 var d uint = (1 << 6) - in - 1
323 d = (d & s) | (in & (^s))
324 d = (d >> 1) + (d & 1)
325 return int(d), int(s & 1)
326}
327
328func boothW6(in uint) (int, int) {
329 var s uint = ^((in >> 6) - 1)
330 var d uint = (1 << 7) - in - 1
331 d = (d & s) | (in & (^s))
332 d = (d >> 1) + (d & 1)
333 return int(d), int(s & 1)
334}
335
336func boothW7(in uint) (int, int) {
337 var s uint = ^((in >> 7) - 1)
338 var d uint = (1 << 8) - in - 1
339 d = (d & s) | (in & (^s))
340 d = (d >> 1) + (d & 1)
341 return int(d), int(s & 1)
342}
343
344func initTable() {
345
346 p256PreFast = new([37][64]p256Point)
347
348 // TODO: For big endian, these slices should be in reverse byte order,
349 // as found in the s390x implementation.
350 basePoint := p256Point{
351 x: [32]byte{0x3c, 0x14, 0xa9, 0x18, 0xd4, 0x30, 0xe7, 0x79, 0x01, 0xb6, 0xed, 0x5f, 0xfc, 0x95, 0xba, 0x75,
352 0x10, 0x25, 0x62, 0x77, 0x2b, 0x73, 0xfb, 0x79, 0xc6, 0x55, 0x37, 0xa5, 0x76, 0x5f, 0x90, 0x18}, //(p256.x*2^256)%p
353 y: [32]byte{0x0a, 0x56, 0x95, 0xce, 0x57, 0x53, 0xf2, 0xdd, 0x5c, 0xe4, 0x19, 0xba, 0xe4, 0xb8, 0x4a, 0x8b,
354 0x25, 0xf3, 0x21, 0xdd, 0x88, 0x86, 0xe8, 0xd2, 0x85, 0x5d, 0x88, 0x25, 0x18, 0xff, 0x71, 0x85}, //(p256.y*2^256)%p
355 z: [32]byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
356 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, //(p256.z*2^256)%p
357
358 }
359
360 t1 := new(p256Point)
361 t2 := new(p256Point)
362 *t2 = basePoint
363
364 zInv := make([]byte, 32)
365 zInvSq := make([]byte, 32)
366 for j := 0; j < 64; j++ {
367 *t1 = *t2
368 for i := 0; i < 37; i++ {
369 // The window size is 7 so we need to double 7 times.
370 if i != 0 {
371 for k := 0; k < 7; k++ {
372 p256PointDoubleAsm(t1, t1)
373 }
374 }
375 // Convert the point to affine form. (Its values are
376 // still in Montgomery form however.)
377 p256Inverse(zInv, t1.z[:])
378 p256Sqr(zInvSq, zInv)
379 p256MulAsm(zInv, zInv, zInvSq)
380
381 p256MulAsm(t1.x[:], t1.x[:], zInvSq)
382 p256MulAsm(t1.y[:], t1.y[:], zInv)
383
384 copy(t1.z[:], basePoint.z[:])
385 // Update the table entry
386 copy(p256PreFast[i][j].x[:], t1.x[:])
387 copy(p256PreFast[i][j].y[:], t1.y[:])
388 }
389 if j == 0 {
390 p256PointDoubleAsm(t2, &basePoint)
391 } else {
392 p256PointAddAsm(t2, t2, &basePoint)
393 }
394 }
395}
396
397func (p *p256Point) p256BaseMult(scalar []byte) {
398 // TODO: For big endian, the index should be 31 not 0.
399 wvalue := (uint(scalar[0]) << 1) & 0xff
400 sel, sign := boothW7(uint(wvalue))
401 p256SelectBase(p, p256PreFast[0][:], sel)
402 p256NegCond(p, sign)
403
404 copy(p.z[:], one[:])
405 var t0 p256Point
406
407 copy(t0.z[:], one[:])
408
409 index := uint(6)
410 zero := sel
411 for i := 1; i < 37; i++ {
412 // TODO: For big endian, use the same index values as found
413 // in the s390x implementation.
414 if index < 247 {
415 wvalue = ((uint(scalar[index/8]) >> (index % 8)) + (uint(scalar[index/8+1]) << (8 - (index % 8)))) & 0xff
416 } else {
417 wvalue = (uint(scalar[index/8]) >> (index % 8)) & 0xff
418 }
419 index += 7
420 sel, sign = boothW7(uint(wvalue))
421 p256SelectBase(&t0, p256PreFast[i][:], sel)
422 p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
423 zero |= sel
424 }
425}
426
427func (p *p256Point) p256ScalarMult(scalar []byte) {
428 // precomp is a table of precomputed points that stores powers of p
429 // from p^1 to p^16.
430 var precomp [16]p256Point
431 var t0, t1, t2, t3 p256Point
432
433 *&precomp[0] = *p
434 p256PointDoubleAsm(&t0, p)
435 p256PointDoubleAsm(&t1, &t0)
436 p256PointDoubleAsm(&t2, &t1)
437 p256PointDoubleAsm(&t3, &t2)
438 *&precomp[1] = t0
439 *&precomp[3] = t1
440 *&precomp[7] = t2
441 *&precomp[15] = t3
442
443 p256PointAddAsm(&t0, &t0, p)
444 p256PointAddAsm(&t1, &t1, p)
445 p256PointAddAsm(&t2, &t2, p)
446
447 *&precomp[2] = t0
448 *&precomp[4] = t1
449 *&precomp[8] = t2
450
451 p256PointDoubleAsm(&t0, &t0)
452 p256PointDoubleAsm(&t1, &t1)
453 *&precomp[5] = t0
454 *&precomp[9] = t1
455
456 p256PointAddAsm(&t2, &t0, p)
457 p256PointAddAsm(&t1, &t1, p)
458 *&precomp[6] = t2
459 *&precomp[10] = t1
460
461 p256PointDoubleAsm(&t0, &t0)
462 p256PointDoubleAsm(&t2, &t2)
463 *&precomp[11] = t0
464 *&precomp[13] = t2
465
466 p256PointAddAsm(&t0, &t0, p)
467 p256PointAddAsm(&t2, &t2, p)
468 *&precomp[12] = t0
469 *&precomp[14] = t2
470
471 // Start scanning the window from top bit
472 index := uint(254)
473 var sel, sign int
474
475 // TODO: For big endian, use index found in s390x implementation.
476 wvalue := (uint(scalar[index/8]) >> (index % 8)) & 0x3f
477 sel, _ = boothW5(uint(wvalue))
478 p256Select(p, precomp[:], sel)
479 zero := sel
480
481 for index > 4 {
482 index -= 5
483 p256PointDoubleAsm(p, p)
484 p256PointDoubleAsm(p, p)
485 p256PointDoubleAsm(p, p)
486 p256PointDoubleAsm(p, p)
487 p256PointDoubleAsm(p, p)
488
489 // TODO: For big endian, use index values as found in s390x implementation.
490 if index < 247 {
491 wvalue = ((uint(scalar[index/8]) >> (index % 8)) + (uint(scalar[index/8+1]) << (8 - (index % 8)))) & 0x3f
492 } else {
493 wvalue = (uint(scalar[index/8]) >> (index % 8)) & 0x3f
494 }
495
496 sel, sign = boothW5(uint(wvalue))
497
498 p256Select(&t0, precomp[:], sel)
499 p256NegCond(&t0, sign)
500 p256PointAddAsm(&t1, p, &t0)
501 p256MovCond(&t1, &t1, p, sel)
502 p256MovCond(p, &t1, &t0, zero)
503 zero |= sel
504 }
505
506 p256PointDoubleAsm(p, p)
507 p256PointDoubleAsm(p, p)
508 p256PointDoubleAsm(p, p)
509 p256PointDoubleAsm(p, p)
510 p256PointDoubleAsm(p, p)
511
512 // TODO: Use index for big endian as found in s390x implementation.
513 wvalue = (uint(scalar[0]) << 1) & 0x3f
514 sel, sign = boothW5(uint(wvalue))
515
516 p256Select(&t0, precomp[:], sel)
517 p256NegCond(&t0, sign)
518 p256PointAddAsm(&t1, p, &t0)
519 p256MovCond(&t1, &t1, p, sel)
520 p256MovCond(p, &t1, &t0, zero)
521}