__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) { const uint4* src = reinterpret_cast(a); uint4* dst = reinterpret_cast(x); dst[0] = src[0]; dst[1] = src[1]; } __device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ z, const unsigned char* __restrict__ o, unsigned char cond) { unsigned int mask = static_cast(-cond); const uint4* vZero = reinterpret_cast(z); const uint4* vOne = reinterpret_cast(o); uint4* vDst = reinterpret_cast(dst); uint4 res0, res1; res0.x = (vZero[0].x & ~mask) | (vOne[0].x & mask); res0.y = (vZero[0].y & ~mask) | (vOne[0].y & mask); res0.z = (vZero[0].z & ~mask) | (vOne[0].z & mask); res0.w = (vZero[0].w & ~mask) | (vOne[0].w & mask); res1.x = (vZero[1].x & ~mask) | (vOne[1].x & mask); res1.y = (vZero[1].y & ~mask) | (vOne[1].y & mask); res1.z = (vZero[1].z & ~mask) | (vOne[1].z & mask); res1.w = (vZero[1].w & ~mask) | (vOne[1].w & mask); vDst[0] = res0; vDst[1] = res1; } __device__ void f25519_normalize(unsigned char* __restrict__ x) { __align__(32) unsigned char minusp[32]; unsigned c = (x[31] >> 7) * 19; x[31] &= 127; #pragma unroll 32 for (int i = 0; i < 32; i++) { c += x[i]; x[i] = static_cast(c); c >>= 8; } c = 19; #pragma unroll 31 for (int i = 0; i < 31; i++) { c += x[i]; minusp[i] = static_cast(c); c >>= 8; } c += x[31] - 128; minusp[31] = static_cast(c); f25519_select(x, minusp, x, static_cast((c >> 15) & 1)); } __device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned c = 0; #pragma unroll 32 for (int i = 0; i < 32; i++) { c = (c >> 8) + static_cast(a[i]) + static_cast(b[i]); r[i] = static_cast(c); } r[31] &= 127; c = (c >> 7) * 19; #pragma unroll 32 for (int i = 0; i < 32; i++) { c += r[i]; r[i] = static_cast(c); c >>= 8; } } __device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned c = 218; #pragma unroll 31 for (int i = 0; i < 31; i++) { c += 65280 + static_cast(a[i]) - static_cast(b[i]); r[i] = static_cast(c); c >>= 8; } c += static_cast(a[31]) - static_cast(b[31]); r[31] = static_cast(c & 127); c = (c >> 7) * 19; #pragma unroll 32 for (int i = 0; i < 32; i++) { c += r[i]; r[i] = static_cast(c); c >>= 8; } } __device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) { unsigned c = 218; #pragma unroll 31 for (int i = 0; i < 31; i++) { c += 65280 - static_cast(a[i]); r[i] = static_cast(c); c >>= 8; } c -= static_cast(a[31]); r[31] = static_cast(c & 127); c = (c >> 7) * 19; #pragma unroll 32 for (int i = 0; i < 32; i++) { c += r[i]; r[i] = static_cast(c); c >>= 8; } } __device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned c = 0; #pragma unroll 32 for (int i = 0; i < 32; i++) { c >>= 8; for (int j = 0; j <= i; j++) c += static_cast(a[j]) * static_cast(b[i - j]); for (int j = i + 1; j < 32; j++) c += static_cast(a[j]) * static_cast(b[32 + i - j]) * 38; r[i] = static_cast(c); } r[31] &= 127; c = (c >> 7) * 19; #pragma unroll 32 for (int i = 0; i < 32; i++) { c += r[i]; r[i] = static_cast(c); c >>= 8; } } __device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) { __align__(32) unsigned char s[32]; f25519_mul__distinct(s, x, x); f25519_mul__distinct(r, s, x); #pragma unroll 248 for (int i = 0; i < 248; i++) { f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); } f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, s); f25519_mul__distinct(s, r, x); f25519_mul__distinct(r, s, s); f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); }