optimized

This commit is contained in:
rcxpony 2025-03-16 22:03:05 +05:00
parent 9e6cff28fd
commit b7240b3d8d
6 changed files with 510 additions and 1077 deletions

1187
libs/fe.cu

File diff suppressed because it is too large Load Diff

View File

@ -1,23 +1,16 @@
#ifndef __FE_H
#define __FE_H
#include <fixedint.h>
typedef int32_t fe[10];
void __host__ __device__ fe_0(fe h);
using fe = int[10];
void __device__ __host__ fe_1(fe h);
void __device__ __host__ fe_frombytes(fe h, const unsigned char *s);
void __device__ __host__ fe_tobytes(unsigned char *s, const fe h);
void __host__ __device__ fe_copy(fe h, const fe f);
int __host__ __device__ fe_isnegative(const fe f);
int __device__ __host__ fe_isnonzero(const fe f);
void __host__ __device__ fe_cmov(fe f, const fe g, unsigned int b);
void fe_cswap(fe f, fe g, unsigned int b);
void __device__ __host__ fe_neg(fe h, const fe f);
void __device__ __host__ fe_add(fe h, const fe f, const fe g);
void __device__ __host__ fe_invert(fe out, const fe z);
void __device__ __host__ fe_sq(fe h, const fe f);
void __host__ __device__ fe_sq2(fe h, const fe f);
void __device__ __host__ fe_mul(fe h, const fe f, const fe g);
void fe_mul121666(fe h, fe f);
void __device__ __host__ fe_pow22523(fe out, const fe z);
void __device__ __host__ fe_sub(fe h, const fe f, const fe g);
void __device__ __host__ fe_tobytes(unsigned char *s, const fe& h);
void __host__ __device__ fe_copy(fe h, const fe& f);
int __host__ __device__ fe_isnegative(const fe& f);
void __host__ __device__ fe_cmov(fe f, const fe& g, unsigned int b);
void __device__ __host__ fe_neg(fe h, const fe& f);
void __device__ __host__ fe_add(fe h, const fe& f, const fe& g);
void __device__ __host__ fe_invert(fe out, const fe& z);
void __device__ __host__ fe_sq(fe h, const fe& f);
void __host__ __device__ fe_sq2(fe h, const fe& f);
void __device__ __host__ fe_mul(fe h, const fe& f, const fe& g);
void __device__ __host__ fe_sub(fe h, const fe& f, const fe& g);
#endif

View File

@ -1,51 +0,0 @@
#if ((defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L) || (defined(__WATCOMC__) && (defined(_STDINT_H_INCLUDED) || __WATCOMC__ >= 1250)) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_) || defined(__UINT_FAST64_TYPE__)) )) && !defined(FIXEDINT_H_INCLUDED)
#include <stdint.h>
#define FIXEDINT_H_INCLUDED
#if defined(__WATCOMC__) && __WATCOMC__ >= 1250 && !defined(UINT64_C)
#include <limits.h>
#define UINT64_C(x) (x + (UINT64_MAX - UINT64_MAX))
#endif
#endif
#ifndef FIXEDINT_H_INCLUDED
#define FIXEDINT_H_INCLUDED
#include <limits.h>
#ifndef uint32_t
#if (ULONG_MAX == 0xffffffffUL)
typedef unsigned long uint32_t;
#elif (UINT_MAX == 0xffffffffUL)
typedef unsigned int uint32_t;
#elif (USHRT_MAX == 0xffffffffUL)
typedef unsigned short uint32_t;
#endif
#endif
#ifndef int32_t
#if (LONG_MAX == 0x7fffffffL)
typedef signed long int32_t;
#elif (INT_MAX == 0x7fffffffL)
typedef signed int int32_t;
#elif (SHRT_MAX == 0x7fffffffL)
typedef signed short int32_t;
#endif
#endif
#if (defined(__STDC__) && defined(__STDC_VERSION__) && __STDC__ && __STDC_VERSION__ >= 199901L)
typedef long long int64_t;
typedef unsigned long long uint64_t;
#define UINT64_C(v) v ##ULL
#define INT64_C(v) v ##LL
#elif defined(__GNUC__)
__extension__ typedef long long int64_t;
__extension__ typedef unsigned long long uint64_t;
#define UINT64_C(v) v ##ULL
#define INT64_C(v) v ##LL
#elif defined(__MWERKS__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) || defined(__APPLE_CC__) || defined(_LONG_LONG) || defined(_CRAYC)
typedef long long int64_t;
typedef unsigned long long uint64_t;
#define UINT64_C(v) v ##ULL
#define INT64_C(v) v ##LL
#elif (defined(__WATCOMC__) && defined(__WATCOM_INT64__)) || (defined(_MSC_VER) && _INTEGRAL_MAX_BITS >= 64) || (defined(__BORLANDC__) && __BORLANDC__ > 0x460) || defined(__alpha) || defined(__DECC)
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
#define UINT64_C(v) v ##UI64
#define INT64_C(v) v ##I64
#endif
#endif

View File

@ -1,158 +1,6 @@
#include <ge.cuh>
#include <precomp_data.h>
void __host__ __device__ ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
fe t0;
fe_add(r->X, p->Y, p->X);
fe_sub(r->Y, p->Y, p->X);
fe_mul(r->Z, r->X, q->YplusX);
fe_mul(r->Y, r->Y, q->YminusX);
fe_mul(r->T, q->T2d, p->T);
fe_mul(r->X, p->Z, q->Z);
fe_add(t0, r->X, r->X);
fe_sub(r->X, r->Z, r->Y);
fe_add(r->Y, r->Z, r->Y);
fe_add(r->Z, t0, r->T);
fe_sub(r->T, t0, r->T);
}
static void __host__ __device__ slide(signed char *r, const unsigned char *a) {
int i;
int b;
int k;
#pragma unroll 256
for (i = 0; i < 256; ++i) {
r[i] = 1 & (a[i >> 3] >> (i & 7));
}
#pragma unroll 256
for (i = 0; i < 256; ++i)
if (r[i]) {
#pragma unroll
for (b = 1; b <= 6 && i + b < 256; ++b) {
if (r[i + b]) {
if (r[i] + (r[i + b] << b) <= 15) {
r[i] += r[i + b] << b;
r[i + b] = 0;
} else if (r[i] - (r[i + b] << b) >= -15) {
r[i] -= r[i + b] << b;
#pragma unroll
for (k = i + b; k < 256; ++k) {
if (!r[k]) {
r[k] = 1;
break;
}
r[k] = 0;
}
} else {
break;
}
}
}
}
}
void __host__ __device__ ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, const ge_p3 *A, const unsigned char *b) {
signed char aslide[256];
signed char bslide[256];
ge_cached Ai[8];
ge_p1p1 t;
ge_p3 u;
ge_p3 A2;
int i;
slide(aslide, a);
slide(bslide, b);
ge_p3_to_cached(&Ai[0], A);
ge_p3_dbl(&t, A);
ge_p1p1_to_p3(&A2, &t);
ge_add(&t, &A2, &Ai[0]);
ge_p1p1_to_p3(&u, &t);
ge_p3_to_cached(&Ai[1], &u);
ge_add(&t, &A2, &Ai[1]);
ge_p1p1_to_p3(&u, &t);
ge_p3_to_cached(&Ai[2], &u);
ge_add(&t, &A2, &Ai[2]);
ge_p1p1_to_p3(&u, &t);
ge_p3_to_cached(&Ai[3], &u);
ge_add(&t, &A2, &Ai[3]);
ge_p1p1_to_p3(&u, &t);
ge_p3_to_cached(&Ai[4], &u);
ge_add(&t, &A2, &Ai[4]);
ge_p1p1_to_p3(&u, &t);
ge_p3_to_cached(&Ai[5], &u);
ge_add(&t, &A2, &Ai[5]);
ge_p1p1_to_p3(&u, &t);
ge_p3_to_cached(&Ai[6], &u);
ge_add(&t, &A2, &Ai[6]);
ge_p1p1_to_p3(&u, &t);
ge_p3_to_cached(&Ai[7], &u);
ge_p2_0(r);
for (i = 255; i >= 0; --i) {
if (aslide[i] || bslide[i]) {
break;
}
}
for (; i >= 0; --i) {
ge_p2_dbl(&t, r);
if (aslide[i] > 0) {
ge_p1p1_to_p3(&u, &t);
ge_add(&t, &u, &Ai[aslide[i] / 2]);
} else if (aslide[i] < 0) {
ge_p1p1_to_p3(&u, &t);
ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
}
if (bslide[i] > 0) {
ge_p1p1_to_p3(&u, &t);
ge_madd(&t, &u, &Bi[bslide[i] / 2]);
} else if (bslide[i] < 0) {
ge_p1p1_to_p3(&u, &t);
ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
}
ge_p1p1_to_p2(r, &t);
}
}
static __constant__ __device__ fe d = {
-10913610, 13857413, -15372611, 6949391, 114729, -8787816, -6275908, -3247719, -18696448, -12055116
};
static __constant__ __device__ fe sqrtm1 = {
-32595792, -7943725, 9377950, 3500415, 12389472, -272473, -25146209, -2005654, 326686, 11406482
};
int __device__ __host__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s) {
fe u;
fe v;
fe v3;
fe vxx;
fe check;
fe_frombytes(h->Y, s);
fe_1(h->Z);
fe_sq(u, h->Y);
fe_mul(v, u, d);
fe_sub(u, u, h->Z);
fe_add(v, v, h->Z);
fe_sq(v3, v);
fe_mul(v3, v3, v);
fe_sq(h->X, v3);
fe_mul(h->X, h->X, v);
fe_mul(h->X, h->X, u);
fe_pow22523(h->X, h->X);
fe_mul(h->X, h->X, v3);
fe_mul(h->X, h->X, u);
fe_sq(vxx, h->X);
fe_mul(vxx, vxx, v);
fe_sub(check, vxx, u);
if (fe_isnonzero(check)) {
fe_add(check, vxx, u);
if (fe_isnonzero(check)) {
return -1;
}
fe_mul(h->X, h->X, sqrtm1);
}
if (fe_isnegative(h->X) == (s[31] >> 7)) {
fe_neg(h->X, h->X);
}
fe_mul(h->T, h->X, h->Y);
return 0;
}
void __host__ __device__ ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
void __host__ __device__ ge_madd(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) {
fe t0;
fe_add(r->X, p->Y, p->X);
fe_sub(r->Y, p->Y, p->X);
@ -165,36 +13,18 @@ void __host__ __device__ ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q
fe_add(r->Z, t0, r->T);
fe_sub(r->T, t0, r->T);
}
void __host__ __device__ ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
fe t0;
fe_add(r->X, p->Y, p->X);
fe_sub(r->Y, p->Y, p->X);
fe_mul(r->Z, r->X, q->yminusx);
fe_mul(r->Y, r->Y, q->yplusx);
fe_mul(r->T, q->xy2d, p->T);
fe_add(t0, p->Z, p->Z);
fe_sub(r->X, r->Z, r->Y);
fe_add(r->Y, r->Z, r->Y);
fe_sub(r->Z, t0, r->T);
fe_add(r->T, t0, r->T);
}
void __host__ __device__ ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
void __host__ __device__ ge_p1p1_to_p2(ge_p2* r, const ge_p1p1* p) {
fe_mul(r->X, p->X, p->T);
fe_mul(r->Y, p->Y, p->Z);
fe_mul(r->Z, p->Z, p->T);
}
void __host__ __device__ ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
void __host__ __device__ ge_p1p1_to_p3(ge_p3* r, const ge_p1p1* p) {
fe_mul(r->X, p->X, p->T);
fe_mul(r->Y, p->Y, p->Z);
fe_mul(r->Z, p->Z, p->T);
fe_mul(r->T, p->X, p->Y);
}
void __host__ __device__ ge_p2_0(ge_p2 *h) {
fe_0(h->X);
fe_1(h->Y);
fe_1(h->Z);
}
void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
void __host__ __device__ ge_p2_dbl(ge_p1p1* r, const ge_p2* p) {
fe t0;
fe_sq(r->X, p->X);
fe_sq(r->Z, p->Y);
@ -206,35 +36,15 @@ void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
fe_sub(r->X, t0, r->Y);
fe_sub(r->T, r->T, r->Z);
}
void __host__ __device__ ge_p3_0(ge_p3 *h) {
fe_0(h->X);
fe_1(h->Y);
fe_1(h->Z);
fe_0(h->T);
}
void __host__ __device__ ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
void __host__ __device__ ge_p3_dbl(ge_p1p1* r, const ge_p3* p) {
ge_p2 q;
ge_p3_to_p2(&q, p);
fe_copy(q.X, p->X);
fe_copy(q.Y, p->Y);
fe_copy(q.Z, p->Z);
ge_p2_dbl(r, &q);
}
static __constant__ __device__ fe d2 = {
-21827239, -5839606, -30745221, 13898782, 229458, 15978800, -12551817, -6495438, 29715968, 9444199
};
void __host__ __device__ ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
fe_add(r->YplusX, p->Y, p->X);
fe_sub(r->YminusX, p->Y, p->X);
fe_copy(r->Z, p->Z);
fe_mul(r->T2d, p->T, d2);
}
void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
fe_copy(r->X, p->X);
fe_copy(r->Y, p->Y);
fe_copy(r->Z, p->Z);
}
void ge_p3_tobytes(unsigned char *s, const ge_p3 *h) {
fe recip;
fe x;
fe y;
void ge_p3_tobytes(unsigned char* s, const ge_p3* h) {
fe recip, x, y;
fe_invert(recip, h->Z);
fe_mul(x, h->X, recip);
fe_mul(y, h->Y, recip);
@ -242,31 +52,26 @@ void ge_p3_tobytes(unsigned char *s, const ge_p3 *h) {
s[31] ^= fe_isnegative(x) << 7;
}
static unsigned char __host__ __device__ equal(signed char b, signed char c) {
unsigned char ub = b;
unsigned char uc = c;
unsigned char x = ub ^ uc;
uint64_t y = x;
y -= 1;
y >>= 63;
return (unsigned char) y;
}
static unsigned char __host__ __device__ negative(signed char b) {
uint64_t x = b;
unsigned long x = b ^ c;
x -= 1;
x >>= 63;
return (unsigned char) x;
return (unsigned char)x;
}
static void __host__ __device__ cmov(ge_precomp *t, const ge_precomp *u, unsigned char b) {
static void __host__ __device__ cmov(ge_precomp* t, const ge_precomp* u, unsigned char b) {
fe_cmov(t->yplusx, u->yplusx, b);
fe_cmov(t->yminusx, u->yminusx, b);
fe_cmov(t->xy2d, u->xy2d, b);
}
static void __host__ __device__ select(ge_precomp *t, int pos, signed char b) {
static void __host__ __device__ select(ge_precomp* t, int pos, signed char b) {
ge_precomp minust;
unsigned char bnegative = negative(b);
unsigned long x = b;
x >>= 63;
unsigned char bnegative = static_cast<unsigned char>(x);
unsigned char babs = b - (((-bnegative) & b) << 1);
fe_1(t->yplusx);
fe_1(t->yminusx);
fe_0(t->xy2d);
#pragma unroll 10
for (int i = 0; i < 10; i++) t->xy2d[i] = 0;
cmov(t, &base[pos][0], equal(babs, 1));
cmov(t, &base[pos][1], equal(babs, 2));
cmov(t, &base[pos][2], equal(babs, 3));
@ -298,8 +103,13 @@ void __device__ __host__ ge_scalarmult_base(ge_p3* h, const unsigned char* a) {
e[i] -= carry << 4;
}
e[63] += carry;
ge_p3_0(h);
#pragma unroll 32
#pragma unroll 10
for (int i = 0; i < 10; i++) h->X[i] = 0;
fe_1(h->Y);
fe_1(h->Z);
#pragma unroll 10
for (int i = 0; i < 10; i++) h->T[i] = 0;
#pragma unroll 64
for (i = 1; i < 64; i += 2) {
select(&t, i >> 1, e[i]);
ge_madd(&r, h, &t);
@ -310,34 +120,10 @@ void __device__ __host__ ge_scalarmult_base(ge_p3* h, const unsigned char* a) {
ge_p1p1_to_p2(&s, &r); ge_p2_dbl(&r, &s);
ge_p1p1_to_p2(&s, &r); ge_p2_dbl(&r, &s);
ge_p1p1_to_p3(h, &r);
#pragma unroll 32
#pragma unroll 64
for (i = 0; i < 64; i += 2) {
select(&t, i >> 1, e[i]);
ge_madd(&r, h, &t);
ge_p1p1_to_p3(h, &r);
}
}
void __host__ __device__ ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
fe t0;
fe_add(r->X, p->Y, p->X);
fe_sub(r->Y, p->Y, p->X);
fe_mul(r->Z, r->X, q->YminusX);
fe_mul(r->Y, r->Y, q->YplusX);
fe_mul(r->T, q->T2d, p->T);
fe_mul(r->X, p->Z, q->Z);
fe_add(t0, r->X, r->X);
fe_sub(r->X, r->Z, r->Y);
fe_add(r->Y, r->Z, r->Y);
fe_sub(r->Z, t0, r->T);
fe_add(r->T, t0, r->T);
}
void __host__ __device__ ge_tobytes(unsigned char *s, const ge_p2 *h) {
fe recip;
fe x;
fe y;
fe_invert(recip, h->Z);
fe_mul(x, h->X, recip);
fe_mul(y, h->Y, recip);
fe_tobytes(s, y);
s[31] ^= fe_isnegative(x) << 7;
}

View File

@ -23,27 +23,12 @@ typedef struct {
fe yminusx;
fe xy2d;
} ge_precomp;
typedef struct {
fe YplusX;
fe YminusX;
fe Z;
fe T2d;
} ge_cached;
void __host__ __device__ ge_p3_tobytes(unsigned char *s, const ge_p3 *h);
void __host__ __device__ ge_tobytes(unsigned char *s, const ge_p2 *h);
int __host__ __device__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s);
void __host__ __device__ ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
void __host__ __device__ ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
void __host__ __device__ ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, const ge_p3 *A, const unsigned char *b);
void __host__ __device__ ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
void __host__ __device__ ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
void __host__ __device__ ge_scalarmult_base(ge_p3 *h, const unsigned char *a);
void __host__ __device__ ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p);
void __host__ __device__ ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p);
void __host__ __device__ ge_p2_0(ge_p2 *h);
void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p);
void __host__ __device__ ge_p3_0(ge_p3 *h);
void __host__ __device__ ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p);
void __host__ __device__ ge_p3_to_cached(ge_cached *r, const ge_p3 *p);
void __host__ __device__ ge_p3_to_p2(ge_p2 *r, const ge_p3 *p);
#endif

View File

@ -6,6 +6,13 @@
#include <edsign.cuh>
#include <string.cuh>
#include <keymanip.cuh>
#ifndef DEBUG
#define THREADSPB 256
#define THDIVTHPB (tTh / THREADSPB)
#else
#define THREADSPB 1
#define THDIVTHPB 1
#endif
__device__ unsigned d_high = 0x10;
__device__ int parameters(const char* arg) noexcept {
if ((cstring_find(arg, "--altitude") == 0 && cstring_length(arg) == 10) ||
@ -114,9 +121,13 @@ __global__ void KeyGenKernel(curandState* randStates) {
printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data);
d_high = zeros;
}
#ifdef DEBUG
if ((x & 0xFF) == 0) {
printf("Iters: %d\n", x);
}
#endif
}
}
#define THREADS_P_B 256
int main(int argc, char* argv[]) {
int* d_result, mBpSM, h_high;
char** d_argv;
@ -135,16 +146,17 @@ int main(int argc, char* argv[]) {
cudaDeviceSynchronize();
cudaMemcpyFromSymbol(&h_high, d_high, sizeof(unsigned));
cudaGetDeviceProperties(&prop, 0);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADS_P_B, 0);
const int tTh = mBpSM * prop.multiProcessorCount * THREADS_P_B;
printf("High addrs: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocks: %d (Threads: %d)\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADS_P_B, THREADS_P_B);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADSPB, 0);
const int tTh = mBpSM * prop.multiProcessorCount * THREADSPB;
printf("High addrs: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocks: %d (Threads: %d)\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADSPB, THREADSPB);
cudaMalloc(&rst, tTh * sizeof(curandState));
unsigned int* d_seeds;
cudaMalloc(&d_seeds, tTh * sizeof(unsigned int));
initRand<<<tTh / THREADS_P_B, THREADS_P_B >>>(rst, d_seeds);
unsigned* d_seeds;
cudaMalloc(&d_seeds, tTh * sizeof(unsigned));
initRand<<<THDIVTHPB, THREADSPB>>>(rst, d_seeds);
cudaDeviceSynchronize();
unsigned int* h_seeds = (unsigned int*)malloc(tTh * sizeof(unsigned int));
cudaMemcpy(h_seeds, d_seeds, tTh * sizeof(unsigned int), cudaMemcpyDeviceToHost);
#ifndef DEBUG
unsigned* h_seeds = (unsigned*)malloc(tTh * sizeof(unsigned));
cudaMemcpy(h_seeds, d_seeds, tTh * sizeof(unsigned), cudaMemcpyDeviceToHost);
if (checkSeeds(h_seeds, tTh)) {
fprintf(stderr, "Duplicate seeds found!\n");
free(h_seeds);
@ -154,7 +166,8 @@ int main(int argc, char* argv[]) {
}
free(h_seeds);
cudaFree(d_seeds);
KeyGenKernel << <tTh / THREADS_P_B, THREADS_P_B >> > (rst);
#endif
KeyGenKernel<<<THDIVTHPB, THREADSPB>>>(rst);
cudaFree(rst);
return 0;
}