yggm/libs/f25519.cu

137 lines
4.3 KiB
Plaintext
Raw Normal View History

2025-03-14 22:52:59 +05:00
#include <f25519.cuh>
2025-03-15 05:02:21 +05:00
#include <cuda_runtime.h>
2025-03-14 22:52:59 +05:00
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
2025-03-15 05:02:21 +05:00
const uint4* src = reinterpret_cast<const uint4*>(a);
uint4* dst = reinterpret_cast<uint4*>(x);
dst[0] = src[0];
dst[1] = src[1];
2025-03-14 22:52:59 +05:00
}
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) {
2025-03-15 05:02:21 +05:00
unsigned mask = static_cast<unsigned>(-cond);
uint4* d = reinterpret_cast<uint4*>(dst);
const uint4* z = reinterpret_cast<const uint4*>(zero);
const uint4* o = reinterpret_cast<const uint4*>(one);
uint4 res0, res1;
res0.x = (z[0].x & ~mask) | (o[0].x & mask);
res0.y = (z[0].y & ~mask) | (o[0].y & mask);
res0.z = (z[0].z & ~mask) | (o[0].z & mask);
res0.w = (z[0].w & ~mask) | (o[0].w & mask);
res1.x = (z[1].x & ~mask) | (o[1].x & mask);
res1.y = (z[1].y & ~mask) | (o[1].y & mask);
res1.z = (z[1].z & ~mask) | (o[1].z & mask);
res1.w = (z[1].w & ~mask) | (o[1].w & mask);
d[0] = res0;
d[1] = res1;
2025-03-14 22:52:59 +05:00
}
2025-03-15 05:23:11 +05:00
2025-03-14 22:52:59 +05:00
__device__ void f25519_normalize(unsigned char* __restrict__ x) {
2025-03-15 05:02:21 +05:00
__align__(32) unsigned char minusp[32];
unsigned c = (x[31] >> 7) * 19;
2025-03-14 22:52:59 +05:00
x[31] &= 127;
2025-03-15 05:23:11 +05:00
#pragma unroll
2025-03-15 04:42:31 +05:00
for (int i = 0; i < 32; i++) {
2025-03-14 22:52:59 +05:00
c += x[i];
x[i] = (unsigned char)c;
c >>= 8;
}
c = 19;
#pragma unroll
2025-03-15 05:23:11 +05:00
for (int i = 0; i < 31; i++) {
2025-03-14 22:52:59 +05:00
c += x[i];
minusp[i] = (unsigned char)c;
c >>= 8;
}
2025-03-15 05:02:21 +05:00
c += x[31] - 128;
minusp[31] = (unsigned char)c;
2025-03-14 22:52:59 +05:00
f25519_select(x, minusp, x, (c >> 15) & 1);
}
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
2025-03-15 05:02:21 +05:00
unsigned c = 0;
2025-03-15 05:23:11 +05:00
#pragma unroll
2025-03-15 04:42:31 +05:00
for (int i = 0; i < 32; i++) {
2025-03-15 05:02:21 +05:00
c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]);
2025-03-14 22:52:59 +05:00
r[i] = (unsigned char)c;
}
2025-03-15 05:02:21 +05:00
r[31] &= 127;
2025-03-14 22:52:59 +05:00
c = (c >> 7) * 19;
2025-03-15 05:23:11 +05:00
#pragma unroll
2025-03-15 04:42:31 +05:00
for (int i = 0; i < 32; i++) {
2025-03-14 22:52:59 +05:00
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 218;
#pragma unroll
2025-03-15 05:23:11 +05:00
for (int i = 0; i < 31; i++) {
2025-03-14 22:52:59 +05:00
c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)c;
c >>= 8;
}
2025-03-15 05:23:11 +05:00
c += ((unsigned)a[31]) - ((unsigned)b[31]);
r[31] = (unsigned char)(c & 127);
2025-03-14 22:52:59 +05:00
c = (c >> 7) * 19;
2025-03-15 05:23:11 +05:00
#pragma unroll
for (int i = 0; i < 32; i++) {
2025-03-14 22:52:59 +05:00
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
unsigned c = 218;
#pragma unroll
2025-03-15 05:23:11 +05:00
for (int i = 0; i < 31; i++) {
2025-03-14 22:52:59 +05:00
c += 65280 - ((unsigned)a[i]);
r[i] = (unsigned char)c;
c >>= 8;
}
2025-03-15 05:23:11 +05:00
c -= ((unsigned)a[31]);
r[31] = (unsigned char)(c & 127);
2025-03-14 22:52:59 +05:00
c = (c >> 7) * 19;
2025-03-15 05:23:11 +05:00
#pragma unroll
for (int i = 0; i < 32; i++) {
2025-03-14 22:52:59 +05:00
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0;
2025-03-15 05:23:11 +05:00
#pragma unroll
2025-03-15 04:42:31 +05:00
for (int i = 0; i < 32; i++) {
2025-03-14 22:52:59 +05:00
c >>= 8;
2025-03-15 05:23:11 +05:00
for (int j = 0; j <= i; j++)
2025-03-14 22:52:59 +05:00
c += ((unsigned)a[j]) * ((unsigned)b[i - j]);
2025-03-15 05:23:11 +05:00
for (int j = i + 1; j < 32; j++)
2025-03-15 04:42:31 +05:00
c += ((unsigned)a[j]) * ((unsigned)b[32 + i - j]) * 38;
2025-03-14 22:52:59 +05:00
r[i] = (unsigned char)c;
}
2025-03-15 05:02:21 +05:00
r[31] &= 127;
2025-03-14 22:52:59 +05:00
c = (c >> 7) * 19;
2025-03-15 05:23:11 +05:00
#pragma unroll
2025-03-15 04:42:31 +05:00
for (int i = 0; i < 32; i++) {
2025-03-14 22:52:59 +05:00
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
2025-03-15 05:02:21 +05:00
__align__(32) unsigned char s[32];
2025-03-14 22:52:59 +05:00
f25519_mul__distinct(s, x, x);
f25519_mul__distinct(r, s, x);
2025-03-15 05:23:11 +05:00
#pragma unroll
2025-03-14 22:52:59 +05:00
for (int i = 0; i < 248; i++) {
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
}
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, s);
f25519_mul__distinct(s, r, x);
f25519_mul__distinct(r, s, s);
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
2025-03-15 05:23:11 +05:00
}