#pragma once #define F25519_SIZE 32 __device__ __constant__ unsigned char f25519_zero[F25519_SIZE] = { 0 }; __device__ __constant__ unsigned char f25519_one[F25519_SIZE] = { 1 }; __device__ __forceinline__ void f25519_load(unsigned char* __restrict__ x, unsigned int c) { #pragma unroll for (unsigned int i = 0; i < sizeof(c); i++) { x[i] = c & 0xFF; c >>= 8; } #pragma unroll for (unsigned int i = sizeof(c); i < F25519_SIZE; i++) { x[i] = 0; } } __device__ __forceinline__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) { #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { x[i] = a[i]; } } __device__ __forceinline__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) { const unsigned char mask = 0 - cond; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i])); } } __device__ __forceinline__ void f25519_normalize(unsigned char* __restrict__ x) { unsigned char minusp[F25519_SIZE]; unsigned short c = (x[31] >> 7) * 19; x[31] &= 127; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { c += x[i]; x[i] = (unsigned char)c; c >>= 8; } c = 19; #pragma unroll for (int i = 0; i + 1 < F25519_SIZE; i++) { c += x[i]; minusp[i] = (unsigned char)c; c >>= 8; } c += x[F25519_SIZE - 1] - 128; minusp[F25519_SIZE - 1] = (unsigned char)c; f25519_select(x, minusp, x, (c >> 15) & 1); } __device__ __forceinline__ unsigned char f25519_eq(const unsigned char* __restrict__ x, const unsigned char* __restrict__ y) { unsigned char s = 0; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) s |= x[i] ^ y[i]; s |= s >> 4; s |= s >> 2; s |= s >> 1; return (s ^ 1) & 1; } __device__ __forceinline__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned short c = 0; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { c = (c >> 8) + ((unsigned short)a[i]) + ((unsigned short)b[i]); r[i] = (unsigned char)c; } r[F25519_SIZE - 1] &= 127; c = (c >> 7) * 19; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } __device__ __forceinline__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { uint32_t c = 218; int i = 0; #pragma unroll for (i = 0; i + 1 < F25519_SIZE; i++) { c += 65280 + ((uint32_t)a[i]) - ((uint32_t)b[i]); r[i] = (unsigned char)c; c >>= 8; } c += ((uint32_t)a[i]) - ((uint32_t)b[i]); r[i] = (unsigned char)(c & 127); c = (c >> 7) * 19; #pragma unroll for (i = 0; i < F25519_SIZE; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } __device__ __forceinline__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) { uint32_t c = 218; int i = 0; #pragma unroll for (i = 0; i + 1 < F25519_SIZE; i++) { c += 65280 - ((uint32_t)a[i]); r[i] = (unsigned char)c; c >>= 8; } c -= ((uint32_t)a[i]); r[i] = (unsigned char)(c & 127); c = (c >> 7) * 19; #pragma unroll for (i = 0; i < F25519_SIZE; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } __device__ __forceinline__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { uint32_t c = 0; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { c >>= 8; for (int j = 0; j <= i; j++) { c += ((uint32_t)a[j]) * ((uint32_t)b[i - j]); } for (int j = i + 1; j < F25519_SIZE; j++) { c += ((uint32_t)a[j]) * ((uint32_t)b[F25519_SIZE + i - j]) * 38; } r[i] = (unsigned char)c; } r[F25519_SIZE - 1] &= 127; c = (c >> 7) * 19; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } /* __device__ __forceinline__ void f25519_mul_c(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, uint32_t b) { uint32_t c = 0; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { c = (c >> 8) + b * ((uint32_t)a[i]); r[i] = (unsigned char)c; } r[F25519_SIZE - 1] &= 127; c = (c >> 7) * 19; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } */ __device__ __forceinline__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) { unsigned char s[F25519_SIZE]; f25519_mul__distinct(s, x, x); f25519_mul__distinct(r, s, x); #pragma unroll for (int i = 0; i < 248; i++) { f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); } f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, s); f25519_mul__distinct(s, r, x); f25519_mul__distinct(r, s, s); f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); } /* __device__ __forceinline__ void exp2523(unsigned char* __restrict__ r, const unsigned char* __restrict__ x, unsigned char* __restrict__ s) { int i; f25519_mul__distinct(r, x, x); f25519_mul__distinct(s, r, x); #pragma unroll for (i = 0; i < 248; i++) { f25519_mul__distinct(r, s, s); f25519_mul__distinct(s, r, x); } f25519_mul__distinct(r, s, s); f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); } __device__ __forceinline__ void f25519_sqrt(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) { unsigned char v[F25519_SIZE], i_val[F25519_SIZE], x[F25519_SIZE], y[F25519_SIZE]; f25519_mul_c(x, a, 2); exp2523(v, x, y); f25519_mul__distinct(y, v, v); f25519_mul__distinct(i_val, x, y); unsigned char one[F25519_SIZE]; f25519_load(one, 1); f25519_sub(i_val, i_val, one); f25519_mul__distinct(x, v, a); f25519_mul__distinct(r, x, i_val); } */