135 lines
4.7 KiB
Plaintext
135 lines
4.7 KiB
Plaintext
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
|
|
const uint4* src = reinterpret_cast<const uint4*>(a);
|
|
uint4* dst = reinterpret_cast<uint4*>(x);
|
|
dst[0] = src[0];
|
|
dst[1] = src[1];
|
|
}
|
|
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ z, const unsigned char* __restrict__ o, unsigned char cond) {
|
|
unsigned int mask = static_cast<unsigned int>(-cond);
|
|
const uint4* vZero = reinterpret_cast<const uint4*>(z);
|
|
const uint4* vOne = reinterpret_cast<const uint4*>(o);
|
|
uint4* vDst = reinterpret_cast<uint4*>(dst);
|
|
uint4 res0, res1;
|
|
res0.x = (vZero[0].x & ~mask) | (vOne[0].x & mask);
|
|
res0.y = (vZero[0].y & ~mask) | (vOne[0].y & mask);
|
|
res0.z = (vZero[0].z & ~mask) | (vOne[0].z & mask);
|
|
res0.w = (vZero[0].w & ~mask) | (vOne[0].w & mask);
|
|
res1.x = (vZero[1].x & ~mask) | (vOne[1].x & mask);
|
|
res1.y = (vZero[1].y & ~mask) | (vOne[1].y & mask);
|
|
res1.z = (vZero[1].z & ~mask) | (vOne[1].z & mask);
|
|
res1.w = (vZero[1].w & ~mask) | (vOne[1].w & mask);
|
|
vDst[0] = res0;
|
|
vDst[1] = res1;
|
|
}
|
|
__device__ void f25519_normalize(unsigned char* __restrict__ x) {
|
|
__align__(32) unsigned char minusp[32];
|
|
unsigned c = (x[31] >> 7) * 19;
|
|
x[31] &= 127;
|
|
#pragma unroll 32
|
|
for (int i = 0; i < 32; i++) {
|
|
c += x[i];
|
|
x[i] = static_cast<unsigned char>(c);
|
|
c >>= 8;
|
|
}
|
|
c = 19;
|
|
#pragma unroll 31
|
|
for (int i = 0; i < 31; i++) {
|
|
c += x[i];
|
|
minusp[i] = static_cast<unsigned char>(c);
|
|
c >>= 8;
|
|
}
|
|
c += x[31] - 128;
|
|
minusp[31] = static_cast<unsigned char>(c);
|
|
f25519_select(x, minusp, x, static_cast<unsigned char>((c >> 15) & 1));
|
|
}
|
|
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
|
unsigned c = 0;
|
|
#pragma unroll 32
|
|
for (int i = 0; i < 32; i++) {
|
|
c = (c >> 8) + static_cast<unsigned>(a[i]) + static_cast<unsigned>(b[i]);
|
|
r[i] = static_cast<unsigned char>(c);
|
|
}
|
|
r[31] &= 127;
|
|
c = (c >> 7) * 19;
|
|
#pragma unroll 32
|
|
for (int i = 0; i < 32; i++) {
|
|
c += r[i];
|
|
r[i] = static_cast<unsigned char>(c);
|
|
c >>= 8;
|
|
}
|
|
}
|
|
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
|
unsigned c = 218;
|
|
#pragma unroll 31
|
|
for (int i = 0; i < 31; i++) {
|
|
c += 65280 + static_cast<unsigned>(a[i]) - static_cast<unsigned>(b[i]);
|
|
r[i] = static_cast<unsigned char>(c);
|
|
c >>= 8;
|
|
}
|
|
c += static_cast<unsigned>(a[31]) - static_cast<unsigned>(b[31]);
|
|
r[31] = static_cast<unsigned char>(c & 127);
|
|
c = (c >> 7) * 19;
|
|
#pragma unroll 32
|
|
for (int i = 0; i < 32; i++) {
|
|
c += r[i];
|
|
r[i] = static_cast<unsigned char>(c);
|
|
c >>= 8;
|
|
}
|
|
}
|
|
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
|
|
unsigned c = 218;
|
|
#pragma unroll 31
|
|
for (int i = 0; i < 31; i++) {
|
|
c += 65280 - static_cast<unsigned>(a[i]);
|
|
r[i] = static_cast<unsigned char>(c);
|
|
c >>= 8;
|
|
}
|
|
c -= static_cast<unsigned>(a[31]);
|
|
r[31] = static_cast<unsigned char>(c & 127);
|
|
c = (c >> 7) * 19;
|
|
#pragma unroll 32
|
|
for (int i = 0; i < 32; i++) {
|
|
c += r[i];
|
|
r[i] = static_cast<unsigned char>(c);
|
|
c >>= 8;
|
|
}
|
|
}
|
|
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
|
unsigned c = 0;
|
|
#pragma unroll 32
|
|
for (int i = 0; i < 32; i++) {
|
|
c >>= 8;
|
|
for (int j = 0; j <= i; j++)
|
|
c += static_cast<unsigned>(a[j]) * static_cast<unsigned>(b[i - j]);
|
|
for (int j = i + 1; j < 32; j++)
|
|
c += static_cast<unsigned>(a[j]) * static_cast<unsigned>(b[32 + i - j]) * 38;
|
|
r[i] = static_cast<unsigned char>(c);
|
|
}
|
|
r[31] &= 127;
|
|
c = (c >> 7) * 19;
|
|
#pragma unroll 32
|
|
for (int i = 0; i < 32; i++) {
|
|
c += r[i];
|
|
r[i] = static_cast<unsigned char>(c);
|
|
c >>= 8;
|
|
}
|
|
}
|
|
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
|
|
__align__(32) unsigned char s[32];
|
|
f25519_mul__distinct(s, x, x);
|
|
f25519_mul__distinct(r, s, x);
|
|
#pragma unroll 248
|
|
for (int i = 0; i < 248; i++) {
|
|
f25519_mul__distinct(s, r, r);
|
|
f25519_mul__distinct(r, s, x);
|
|
}
|
|
f25519_mul__distinct(s, r, r);
|
|
f25519_mul__distinct(r, s, s);
|
|
f25519_mul__distinct(s, r, x);
|
|
f25519_mul__distinct(r, s, s);
|
|
f25519_mul__distinct(s, r, r);
|
|
f25519_mul__distinct(r, s, x);
|
|
f25519_mul__distinct(s, r, r);
|
|
f25519_mul__distinct(r, s, x);
|
|
}
|