optimized
This commit is contained in:
parent
9e6cff28fd
commit
b7240b3d8d
1123
libs/fe.cu
1123
libs/fe.cu
File diff suppressed because it is too large
Load Diff
31
libs/fe.cuh
31
libs/fe.cuh
@ -1,23 +1,16 @@
|
||||
#ifndef __FE_H
|
||||
#define __FE_H
|
||||
#include <fixedint.h>
|
||||
typedef int32_t fe[10];
|
||||
void __host__ __device__ fe_0(fe h);
|
||||
using fe = int[10];
|
||||
void __device__ __host__ fe_1(fe h);
|
||||
void __device__ __host__ fe_frombytes(fe h, const unsigned char *s);
|
||||
void __device__ __host__ fe_tobytes(unsigned char *s, const fe h);
|
||||
void __host__ __device__ fe_copy(fe h, const fe f);
|
||||
int __host__ __device__ fe_isnegative(const fe f);
|
||||
int __device__ __host__ fe_isnonzero(const fe f);
|
||||
void __host__ __device__ fe_cmov(fe f, const fe g, unsigned int b);
|
||||
void fe_cswap(fe f, fe g, unsigned int b);
|
||||
void __device__ __host__ fe_neg(fe h, const fe f);
|
||||
void __device__ __host__ fe_add(fe h, const fe f, const fe g);
|
||||
void __device__ __host__ fe_invert(fe out, const fe z);
|
||||
void __device__ __host__ fe_sq(fe h, const fe f);
|
||||
void __host__ __device__ fe_sq2(fe h, const fe f);
|
||||
void __device__ __host__ fe_mul(fe h, const fe f, const fe g);
|
||||
void fe_mul121666(fe h, fe f);
|
||||
void __device__ __host__ fe_pow22523(fe out, const fe z);
|
||||
void __device__ __host__ fe_sub(fe h, const fe f, const fe g);
|
||||
void __device__ __host__ fe_tobytes(unsigned char *s, const fe& h);
|
||||
void __host__ __device__ fe_copy(fe h, const fe& f);
|
||||
int __host__ __device__ fe_isnegative(const fe& f);
|
||||
void __host__ __device__ fe_cmov(fe f, const fe& g, unsigned int b);
|
||||
void __device__ __host__ fe_neg(fe h, const fe& f);
|
||||
void __device__ __host__ fe_add(fe h, const fe& f, const fe& g);
|
||||
void __device__ __host__ fe_invert(fe out, const fe& z);
|
||||
void __device__ __host__ fe_sq(fe h, const fe& f);
|
||||
void __host__ __device__ fe_sq2(fe h, const fe& f);
|
||||
void __device__ __host__ fe_mul(fe h, const fe& f, const fe& g);
|
||||
void __device__ __host__ fe_sub(fe h, const fe& f, const fe& g);
|
||||
#endif
|
||||
|
@ -1,51 +0,0 @@
|
||||
#if ((defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L) || (defined(__WATCOMC__) && (defined(_STDINT_H_INCLUDED) || __WATCOMC__ >= 1250)) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_) || defined(__UINT_FAST64_TYPE__)) )) && !defined(FIXEDINT_H_INCLUDED)
|
||||
#include <stdint.h>
|
||||
#define FIXEDINT_H_INCLUDED
|
||||
#if defined(__WATCOMC__) && __WATCOMC__ >= 1250 && !defined(UINT64_C)
|
||||
#include <limits.h>
|
||||
#define UINT64_C(x) (x + (UINT64_MAX - UINT64_MAX))
|
||||
#endif
|
||||
#endif
|
||||
#ifndef FIXEDINT_H_INCLUDED
|
||||
#define FIXEDINT_H_INCLUDED
|
||||
#include <limits.h>
|
||||
#ifndef uint32_t
|
||||
#if (ULONG_MAX == 0xffffffffUL)
|
||||
typedef unsigned long uint32_t;
|
||||
#elif (UINT_MAX == 0xffffffffUL)
|
||||
typedef unsigned int uint32_t;
|
||||
#elif (USHRT_MAX == 0xffffffffUL)
|
||||
typedef unsigned short uint32_t;
|
||||
#endif
|
||||
#endif
|
||||
#ifndef int32_t
|
||||
#if (LONG_MAX == 0x7fffffffL)
|
||||
typedef signed long int32_t;
|
||||
#elif (INT_MAX == 0x7fffffffL)
|
||||
typedef signed int int32_t;
|
||||
#elif (SHRT_MAX == 0x7fffffffL)
|
||||
typedef signed short int32_t;
|
||||
#endif
|
||||
#endif
|
||||
#if (defined(__STDC__) && defined(__STDC_VERSION__) && __STDC__ && __STDC_VERSION__ >= 199901L)
|
||||
typedef long long int64_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
#define UINT64_C(v) v ##ULL
|
||||
#define INT64_C(v) v ##LL
|
||||
#elif defined(__GNUC__)
|
||||
__extension__ typedef long long int64_t;
|
||||
__extension__ typedef unsigned long long uint64_t;
|
||||
#define UINT64_C(v) v ##ULL
|
||||
#define INT64_C(v) v ##LL
|
||||
#elif defined(__MWERKS__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) || defined(__APPLE_CC__) || defined(_LONG_LONG) || defined(_CRAYC)
|
||||
typedef long long int64_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
#define UINT64_C(v) v ##ULL
|
||||
#define INT64_C(v) v ##LL
|
||||
#elif (defined(__WATCOMC__) && defined(__WATCOM_INT64__)) || (defined(_MSC_VER) && _INTEGRAL_MAX_BITS >= 64) || (defined(__BORLANDC__) && __BORLANDC__ > 0x460) || defined(__alpha) || defined(__DECC)
|
||||
typedef __int64 int64_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
#define UINT64_C(v) v ##UI64
|
||||
#define INT64_C(v) v ##I64
|
||||
#endif
|
||||
#endif
|
252
libs/ge.cu
252
libs/ge.cu
@ -1,157 +1,5 @@
|
||||
#include <ge.cuh>
|
||||
#include <precomp_data.h>
|
||||
void __host__ __device__ ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
|
||||
fe t0;
|
||||
fe_add(r->X, p->Y, p->X);
|
||||
fe_sub(r->Y, p->Y, p->X);
|
||||
fe_mul(r->Z, r->X, q->YplusX);
|
||||
fe_mul(r->Y, r->Y, q->YminusX);
|
||||
fe_mul(r->T, q->T2d, p->T);
|
||||
fe_mul(r->X, p->Z, q->Z);
|
||||
fe_add(t0, r->X, r->X);
|
||||
fe_sub(r->X, r->Z, r->Y);
|
||||
fe_add(r->Y, r->Z, r->Y);
|
||||
fe_add(r->Z, t0, r->T);
|
||||
fe_sub(r->T, t0, r->T);
|
||||
}
|
||||
static void __host__ __device__ slide(signed char *r, const unsigned char *a) {
|
||||
int i;
|
||||
int b;
|
||||
int k;
|
||||
#pragma unroll 256
|
||||
for (i = 0; i < 256; ++i) {
|
||||
r[i] = 1 & (a[i >> 3] >> (i & 7));
|
||||
}
|
||||
#pragma unroll 256
|
||||
for (i = 0; i < 256; ++i)
|
||||
if (r[i]) {
|
||||
#pragma unroll
|
||||
for (b = 1; b <= 6 && i + b < 256; ++b) {
|
||||
if (r[i + b]) {
|
||||
if (r[i] + (r[i + b] << b) <= 15) {
|
||||
r[i] += r[i + b] << b;
|
||||
r[i + b] = 0;
|
||||
} else if (r[i] - (r[i + b] << b) >= -15) {
|
||||
r[i] -= r[i + b] << b;
|
||||
#pragma unroll
|
||||
for (k = i + b; k < 256; ++k) {
|
||||
if (!r[k]) {
|
||||
r[k] = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
r[k] = 0;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
void __host__ __device__ ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, const ge_p3 *A, const unsigned char *b) {
|
||||
signed char aslide[256];
|
||||
signed char bslide[256];
|
||||
ge_cached Ai[8];
|
||||
ge_p1p1 t;
|
||||
ge_p3 u;
|
||||
ge_p3 A2;
|
||||
int i;
|
||||
slide(aslide, a);
|
||||
slide(bslide, b);
|
||||
ge_p3_to_cached(&Ai[0], A);
|
||||
ge_p3_dbl(&t, A);
|
||||
ge_p1p1_to_p3(&A2, &t);
|
||||
ge_add(&t, &A2, &Ai[0]);
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_p3_to_cached(&Ai[1], &u);
|
||||
ge_add(&t, &A2, &Ai[1]);
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_p3_to_cached(&Ai[2], &u);
|
||||
ge_add(&t, &A2, &Ai[2]);
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_p3_to_cached(&Ai[3], &u);
|
||||
ge_add(&t, &A2, &Ai[3]);
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_p3_to_cached(&Ai[4], &u);
|
||||
ge_add(&t, &A2, &Ai[4]);
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_p3_to_cached(&Ai[5], &u);
|
||||
ge_add(&t, &A2, &Ai[5]);
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_p3_to_cached(&Ai[6], &u);
|
||||
ge_add(&t, &A2, &Ai[6]);
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_p3_to_cached(&Ai[7], &u);
|
||||
ge_p2_0(r);
|
||||
for (i = 255; i >= 0; --i) {
|
||||
if (aslide[i] || bslide[i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (; i >= 0; --i) {
|
||||
ge_p2_dbl(&t, r);
|
||||
if (aslide[i] > 0) {
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_add(&t, &u, &Ai[aslide[i] / 2]);
|
||||
} else if (aslide[i] < 0) {
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
|
||||
}
|
||||
|
||||
if (bslide[i] > 0) {
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_madd(&t, &u, &Bi[bslide[i] / 2]);
|
||||
} else if (bslide[i] < 0) {
|
||||
ge_p1p1_to_p3(&u, &t);
|
||||
ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
|
||||
}
|
||||
|
||||
ge_p1p1_to_p2(r, &t);
|
||||
}
|
||||
}
|
||||
static __constant__ __device__ fe d = {
|
||||
-10913610, 13857413, -15372611, 6949391, 114729, -8787816, -6275908, -3247719, -18696448, -12055116
|
||||
};
|
||||
static __constant__ __device__ fe sqrtm1 = {
|
||||
-32595792, -7943725, 9377950, 3500415, 12389472, -272473, -25146209, -2005654, 326686, 11406482
|
||||
};
|
||||
int __device__ __host__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s) {
|
||||
fe u;
|
||||
fe v;
|
||||
fe v3;
|
||||
fe vxx;
|
||||
fe check;
|
||||
fe_frombytes(h->Y, s);
|
||||
fe_1(h->Z);
|
||||
fe_sq(u, h->Y);
|
||||
fe_mul(v, u, d);
|
||||
fe_sub(u, u, h->Z);
|
||||
fe_add(v, v, h->Z);
|
||||
fe_sq(v3, v);
|
||||
fe_mul(v3, v3, v);
|
||||
fe_sq(h->X, v3);
|
||||
fe_mul(h->X, h->X, v);
|
||||
fe_mul(h->X, h->X, u);
|
||||
fe_pow22523(h->X, h->X);
|
||||
fe_mul(h->X, h->X, v3);
|
||||
fe_mul(h->X, h->X, u);
|
||||
fe_sq(vxx, h->X);
|
||||
fe_mul(vxx, vxx, v);
|
||||
fe_sub(check, vxx, u);
|
||||
if (fe_isnonzero(check)) {
|
||||
fe_add(check, vxx, u);
|
||||
if (fe_isnonzero(check)) {
|
||||
return -1;
|
||||
}
|
||||
fe_mul(h->X, h->X, sqrtm1);
|
||||
}
|
||||
if (fe_isnegative(h->X) == (s[31] >> 7)) {
|
||||
fe_neg(h->X, h->X);
|
||||
}
|
||||
fe_mul(h->T, h->X, h->Y);
|
||||
return 0;
|
||||
}
|
||||
void __host__ __device__ ge_madd(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) {
|
||||
fe t0;
|
||||
fe_add(r->X, p->Y, p->X);
|
||||
@ -165,19 +13,6 @@ void __host__ __device__ ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q
|
||||
fe_add(r->Z, t0, r->T);
|
||||
fe_sub(r->T, t0, r->T);
|
||||
}
|
||||
void __host__ __device__ ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
|
||||
fe t0;
|
||||
fe_add(r->X, p->Y, p->X);
|
||||
fe_sub(r->Y, p->Y, p->X);
|
||||
fe_mul(r->Z, r->X, q->yminusx);
|
||||
fe_mul(r->Y, r->Y, q->yplusx);
|
||||
fe_mul(r->T, q->xy2d, p->T);
|
||||
fe_add(t0, p->Z, p->Z);
|
||||
fe_sub(r->X, r->Z, r->Y);
|
||||
fe_add(r->Y, r->Z, r->Y);
|
||||
fe_sub(r->Z, t0, r->T);
|
||||
fe_add(r->T, t0, r->T);
|
||||
}
|
||||
void __host__ __device__ ge_p1p1_to_p2(ge_p2* r, const ge_p1p1* p) {
|
||||
fe_mul(r->X, p->X, p->T);
|
||||
fe_mul(r->Y, p->Y, p->Z);
|
||||
@ -189,11 +24,6 @@ void __host__ __device__ ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
|
||||
fe_mul(r->Z, p->Z, p->T);
|
||||
fe_mul(r->T, p->X, p->Y);
|
||||
}
|
||||
void __host__ __device__ ge_p2_0(ge_p2 *h) {
|
||||
fe_0(h->X);
|
||||
fe_1(h->Y);
|
||||
fe_1(h->Z);
|
||||
}
|
||||
void __host__ __device__ ge_p2_dbl(ge_p1p1* r, const ge_p2* p) {
|
||||
fe t0;
|
||||
fe_sq(r->X, p->X);
|
||||
@ -206,35 +36,15 @@ void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
|
||||
fe_sub(r->X, t0, r->Y);
|
||||
fe_sub(r->T, r->T, r->Z);
|
||||
}
|
||||
void __host__ __device__ ge_p3_0(ge_p3 *h) {
|
||||
fe_0(h->X);
|
||||
fe_1(h->Y);
|
||||
fe_1(h->Z);
|
||||
fe_0(h->T);
|
||||
}
|
||||
void __host__ __device__ ge_p3_dbl(ge_p1p1* r, const ge_p3* p) {
|
||||
ge_p2 q;
|
||||
ge_p3_to_p2(&q, p);
|
||||
fe_copy(q.X, p->X);
|
||||
fe_copy(q.Y, p->Y);
|
||||
fe_copy(q.Z, p->Z);
|
||||
ge_p2_dbl(r, &q);
|
||||
}
|
||||
static __constant__ __device__ fe d2 = {
|
||||
-21827239, -5839606, -30745221, 13898782, 229458, 15978800, -12551817, -6495438, 29715968, 9444199
|
||||
};
|
||||
void __host__ __device__ ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
|
||||
fe_add(r->YplusX, p->Y, p->X);
|
||||
fe_sub(r->YminusX, p->Y, p->X);
|
||||
fe_copy(r->Z, p->Z);
|
||||
fe_mul(r->T2d, p->T, d2);
|
||||
}
|
||||
void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
|
||||
fe_copy(r->X, p->X);
|
||||
fe_copy(r->Y, p->Y);
|
||||
fe_copy(r->Z, p->Z);
|
||||
}
|
||||
void ge_p3_tobytes(unsigned char* s, const ge_p3* h) {
|
||||
fe recip;
|
||||
fe x;
|
||||
fe y;
|
||||
fe recip, x, y;
|
||||
fe_invert(recip, h->Z);
|
||||
fe_mul(x, h->X, recip);
|
||||
fe_mul(y, h->Y, recip);
|
||||
@ -242,16 +52,8 @@ void ge_p3_tobytes(unsigned char *s, const ge_p3 *h) {
|
||||
s[31] ^= fe_isnegative(x) << 7;
|
||||
}
|
||||
static unsigned char __host__ __device__ equal(signed char b, signed char c) {
|
||||
unsigned char ub = b;
|
||||
unsigned char uc = c;
|
||||
unsigned char x = ub ^ uc;
|
||||
uint64_t y = x;
|
||||
y -= 1;
|
||||
y >>= 63;
|
||||
return (unsigned char) y;
|
||||
}
|
||||
static unsigned char __host__ __device__ negative(signed char b) {
|
||||
uint64_t x = b;
|
||||
unsigned long x = b ^ c;
|
||||
x -= 1;
|
||||
x >>= 63;
|
||||
return (unsigned char)x;
|
||||
}
|
||||
@ -262,11 +64,14 @@ static void __host__ __device__ cmov(ge_precomp *t, const ge_precomp *u, unsigne
|
||||
}
|
||||
static void __host__ __device__ select(ge_precomp* t, int pos, signed char b) {
|
||||
ge_precomp minust;
|
||||
unsigned char bnegative = negative(b);
|
||||
unsigned long x = b;
|
||||
x >>= 63;
|
||||
unsigned char bnegative = static_cast<unsigned char>(x);
|
||||
unsigned char babs = b - (((-bnegative) & b) << 1);
|
||||
fe_1(t->yplusx);
|
||||
fe_1(t->yminusx);
|
||||
fe_0(t->xy2d);
|
||||
#pragma unroll 10
|
||||
for (int i = 0; i < 10; i++) t->xy2d[i] = 0;
|
||||
cmov(t, &base[pos][0], equal(babs, 1));
|
||||
cmov(t, &base[pos][1], equal(babs, 2));
|
||||
cmov(t, &base[pos][2], equal(babs, 3));
|
||||
@ -298,8 +103,13 @@ void __device__ __host__ ge_scalarmult_base(ge_p3* h, const unsigned char* a) {
|
||||
e[i] -= carry << 4;
|
||||
}
|
||||
e[63] += carry;
|
||||
ge_p3_0(h);
|
||||
#pragma unroll 32
|
||||
#pragma unroll 10
|
||||
for (int i = 0; i < 10; i++) h->X[i] = 0;
|
||||
fe_1(h->Y);
|
||||
fe_1(h->Z);
|
||||
#pragma unroll 10
|
||||
for (int i = 0; i < 10; i++) h->T[i] = 0;
|
||||
#pragma unroll 64
|
||||
for (i = 1; i < 64; i += 2) {
|
||||
select(&t, i >> 1, e[i]);
|
||||
ge_madd(&r, h, &t);
|
||||
@ -310,34 +120,10 @@ void __device__ __host__ ge_scalarmult_base(ge_p3* h, const unsigned char* a) {
|
||||
ge_p1p1_to_p2(&s, &r); ge_p2_dbl(&r, &s);
|
||||
ge_p1p1_to_p2(&s, &r); ge_p2_dbl(&r, &s);
|
||||
ge_p1p1_to_p3(h, &r);
|
||||
#pragma unroll 32
|
||||
#pragma unroll 64
|
||||
for (i = 0; i < 64; i += 2) {
|
||||
select(&t, i >> 1, e[i]);
|
||||
ge_madd(&r, h, &t);
|
||||
ge_p1p1_to_p3(h, &r);
|
||||
}
|
||||
}
|
||||
void __host__ __device__ ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
|
||||
fe t0;
|
||||
fe_add(r->X, p->Y, p->X);
|
||||
fe_sub(r->Y, p->Y, p->X);
|
||||
fe_mul(r->Z, r->X, q->YminusX);
|
||||
fe_mul(r->Y, r->Y, q->YplusX);
|
||||
fe_mul(r->T, q->T2d, p->T);
|
||||
fe_mul(r->X, p->Z, q->Z);
|
||||
fe_add(t0, r->X, r->X);
|
||||
fe_sub(r->X, r->Z, r->Y);
|
||||
fe_add(r->Y, r->Z, r->Y);
|
||||
fe_sub(r->Z, t0, r->T);
|
||||
fe_add(r->T, t0, r->T);
|
||||
}
|
||||
void __host__ __device__ ge_tobytes(unsigned char *s, const ge_p2 *h) {
|
||||
fe recip;
|
||||
fe x;
|
||||
fe y;
|
||||
fe_invert(recip, h->Z);
|
||||
fe_mul(x, h->X, recip);
|
||||
fe_mul(y, h->Y, recip);
|
||||
fe_tobytes(s, y);
|
||||
s[31] ^= fe_isnegative(x) << 7;
|
||||
}
|
15
libs/ge.cuh
15
libs/ge.cuh
@ -23,27 +23,12 @@ typedef struct {
|
||||
fe yminusx;
|
||||
fe xy2d;
|
||||
} ge_precomp;
|
||||
typedef struct {
|
||||
fe YplusX;
|
||||
fe YminusX;
|
||||
fe Z;
|
||||
fe T2d;
|
||||
} ge_cached;
|
||||
void __host__ __device__ ge_p3_tobytes(unsigned char *s, const ge_p3 *h);
|
||||
void __host__ __device__ ge_tobytes(unsigned char *s, const ge_p2 *h);
|
||||
int __host__ __device__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s);
|
||||
void __host__ __device__ ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
|
||||
void __host__ __device__ ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
|
||||
void __host__ __device__ ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, const ge_p3 *A, const unsigned char *b);
|
||||
void __host__ __device__ ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
|
||||
void __host__ __device__ ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
|
||||
void __host__ __device__ ge_scalarmult_base(ge_p3 *h, const unsigned char *a);
|
||||
void __host__ __device__ ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p);
|
||||
void __host__ __device__ ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p);
|
||||
void __host__ __device__ ge_p2_0(ge_p2 *h);
|
||||
void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p);
|
||||
void __host__ __device__ ge_p3_0(ge_p3 *h);
|
||||
void __host__ __device__ ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p);
|
||||
void __host__ __device__ ge_p3_to_cached(ge_cached *r, const ge_p3 *p);
|
||||
void __host__ __device__ ge_p3_to_p2(ge_p2 *r, const ge_p3 *p);
|
||||
#endif
|
@ -6,6 +6,13 @@
|
||||
#include <edsign.cuh>
|
||||
#include <string.cuh>
|
||||
#include <keymanip.cuh>
|
||||
#ifndef DEBUG
|
||||
#define THREADSPB 256
|
||||
#define THDIVTHPB (tTh / THREADSPB)
|
||||
#else
|
||||
#define THREADSPB 1
|
||||
#define THDIVTHPB 1
|
||||
#endif
|
||||
__device__ unsigned d_high = 0x10;
|
||||
__device__ int parameters(const char* arg) noexcept {
|
||||
if ((cstring_find(arg, "--altitude") == 0 && cstring_length(arg) == 10) ||
|
||||
@ -114,9 +121,13 @@ __global__ void KeyGenKernel(curandState* randStates) {
|
||||
printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data);
|
||||
d_high = zeros;
|
||||
}
|
||||
#ifdef DEBUG
|
||||
if ((x & 0xFF) == 0) {
|
||||
printf("Iters: %d\n", x);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#define THREADS_P_B 256
|
||||
int main(int argc, char* argv[]) {
|
||||
int* d_result, mBpSM, h_high;
|
||||
char** d_argv;
|
||||
@ -135,16 +146,17 @@ int main(int argc, char* argv[]) {
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemcpyFromSymbol(&h_high, d_high, sizeof(unsigned));
|
||||
cudaGetDeviceProperties(&prop, 0);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADS_P_B, 0);
|
||||
const int tTh = mBpSM * prop.multiProcessorCount * THREADS_P_B;
|
||||
printf("High addrs: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocks: %d (Threads: %d)\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADS_P_B, THREADS_P_B);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADSPB, 0);
|
||||
const int tTh = mBpSM * prop.multiProcessorCount * THREADSPB;
|
||||
printf("High addrs: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocks: %d (Threads: %d)\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADSPB, THREADSPB);
|
||||
cudaMalloc(&rst, tTh * sizeof(curandState));
|
||||
unsigned int* d_seeds;
|
||||
cudaMalloc(&d_seeds, tTh * sizeof(unsigned int));
|
||||
initRand<<<tTh / THREADS_P_B, THREADS_P_B >>>(rst, d_seeds);
|
||||
unsigned* d_seeds;
|
||||
cudaMalloc(&d_seeds, tTh * sizeof(unsigned));
|
||||
initRand<<<THDIVTHPB, THREADSPB>>>(rst, d_seeds);
|
||||
cudaDeviceSynchronize();
|
||||
unsigned int* h_seeds = (unsigned int*)malloc(tTh * sizeof(unsigned int));
|
||||
cudaMemcpy(h_seeds, d_seeds, tTh * sizeof(unsigned int), cudaMemcpyDeviceToHost);
|
||||
#ifndef DEBUG
|
||||
unsigned* h_seeds = (unsigned*)malloc(tTh * sizeof(unsigned));
|
||||
cudaMemcpy(h_seeds, d_seeds, tTh * sizeof(unsigned), cudaMemcpyDeviceToHost);
|
||||
if (checkSeeds(h_seeds, tTh)) {
|
||||
fprintf(stderr, "Duplicate seeds found!\n");
|
||||
free(h_seeds);
|
||||
@ -154,7 +166,8 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
free(h_seeds);
|
||||
cudaFree(d_seeds);
|
||||
KeyGenKernel << <tTh / THREADS_P_B, THREADS_P_B >> > (rst);
|
||||
#endif
|
||||
KeyGenKernel<<<THDIVTHPB, THREADSPB>>>(rst);
|
||||
cudaFree(rst);
|
||||
return 0;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user