Compare commits
44 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6fa8acd2e3 | |||
|
|
5bf02d95ab | ||
|
|
361964edf7 | ||
|
|
ab6061f5bf | ||
|
|
4b5a9967ed | ||
|
|
558b6b3a2c | ||
|
|
f0befd4925 | ||
|
|
2944a25784 | ||
|
|
f2b96920fb | ||
|
|
75c78b5a35 | ||
|
|
8d83b3198d | ||
|
|
279f949dff | ||
|
|
fa31fc69b0 | ||
|
|
e591951b69 | ||
|
|
8b33e5d73d | ||
|
|
6dd774c206 | ||
|
|
d3dd7b7e4d | ||
|
|
137db6f6b2 | ||
|
|
c7797dd663 | ||
|
|
37a2f9cf52 | ||
|
|
14fd274f36 | ||
|
|
d01662bd63 | ||
|
|
3cb19e527e | ||
|
|
98ddd7387a | ||
|
|
c37db7679b | ||
|
|
988534bc45 | ||
|
|
a9a693dc07 | ||
|
|
1499fc5d28 | ||
|
|
6d92f9ee43 | ||
|
|
b17df465d5 | ||
| b0c15fad82 | |||
| 5b543aa710 | |||
| 7ea0e62697 | |||
| bc814d988c | |||
| b7240b3d8d | |||
| 9e6cff28fd | |||
| 735cc0c467 | |||
| 4f5a8129ae | |||
| e6c4f9ceb5 | |||
| 17f6d7be58 | |||
| cfecea84ec | |||
| f333047c1f | |||
| 6198670dc4 | |||
| da1d9b12ca |
271
.clang-format
Normal file
271
.clang-format
Normal file
@@ -0,0 +1,271 @@
|
||||
BasedOnStyle: WebKit
|
||||
AccessModifierOffset: -4
|
||||
AlignAfterOpenBracket: DontAlign
|
||||
AlignArrayOfStructures: None
|
||||
AlignConsecutiveAssignments:
|
||||
Enabled: false
|
||||
AcrossEmptyLines: false
|
||||
AcrossComments: false
|
||||
AlignCompound: false
|
||||
AlignFunctionPointers: false
|
||||
PadOperators: true
|
||||
AlignConsecutiveBitFields:
|
||||
Enabled: false
|
||||
AcrossEmptyLines: false
|
||||
AcrossComments: false
|
||||
AlignCompound: false
|
||||
AlignFunctionPointers: false
|
||||
PadOperators: false
|
||||
AlignConsecutiveDeclarations:
|
||||
Enabled: false
|
||||
AcrossEmptyLines: false
|
||||
AcrossComments: false
|
||||
AlignCompound: false
|
||||
AlignFunctionPointers: false
|
||||
PadOperators: false
|
||||
AlignConsecutiveMacros:
|
||||
Enabled: false
|
||||
AcrossEmptyLines: false
|
||||
AcrossComments: false
|
||||
AlignCompound: false
|
||||
AlignFunctionPointers: false
|
||||
PadOperators: false
|
||||
AlignConsecutiveShortCaseStatements:
|
||||
Enabled: false
|
||||
AcrossEmptyLines: false
|
||||
AcrossComments: false
|
||||
AlignCaseArrows: false
|
||||
AlignCaseColons: false
|
||||
AlignConsecutiveTableGenBreakingDAGArgColons:
|
||||
Enabled: false
|
||||
AcrossEmptyLines: false
|
||||
AcrossComments: false
|
||||
AlignCompound: false
|
||||
AlignFunctionPointers: false
|
||||
PadOperators: false
|
||||
AlignConsecutiveTableGenCondOperatorColons:
|
||||
Enabled: false
|
||||
AcrossEmptyLines: false
|
||||
AcrossComments: false
|
||||
AlignCompound: false
|
||||
AlignFunctionPointers: false
|
||||
PadOperators: false
|
||||
AlignConsecutiveTableGenDefinitionColons:
|
||||
Enabled: false
|
||||
AcrossEmptyLines: false
|
||||
AcrossComments: false
|
||||
AlignCompound: false
|
||||
AlignFunctionPointers: false
|
||||
PadOperators: false
|
||||
AlignEscapedNewlines: Right
|
||||
AlignOperands: DontAlign
|
||||
AlignTrailingComments:
|
||||
Kind: Never
|
||||
OverEmptyLines: 0
|
||||
AllowAllArgumentsOnNextLine: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowBreakBeforeNoexceptSpecifier: Never
|
||||
AllowShortBlocksOnASingleLine: Empty
|
||||
AllowShortCaseExpressionOnASingleLine: true
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortCompoundRequirementOnASingleLine: true
|
||||
AllowShortEnumsOnASingleLine: true
|
||||
AllowShortFunctionsOnASingleLine: All
|
||||
AllowShortIfStatementsOnASingleLine: Never
|
||||
AllowShortLambdasOnASingleLine: All
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AttributeMacros:
|
||||
- __capability
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BitFieldColonSpacing: Both
|
||||
BraceWrapping:
|
||||
AfterCaseLabel: false
|
||||
AfterClass: false
|
||||
AfterControlStatement: Never
|
||||
AfterEnum: false
|
||||
AfterFunction: true
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
BeforeLambdaBody: false
|
||||
BeforeWhile: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakAdjacentStringLiterals: true
|
||||
BreakAfterAttributes: Leave
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakAfterReturnType: None
|
||||
BreakArrays: true
|
||||
BreakBeforeBinaryOperators: All
|
||||
BreakBeforeBraces: Attach
|
||||
BreakBeforeConceptDeclarations: Always
|
||||
BreakBeforeInlineASMColon: OnlyMultiline
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializers: BeforeComma
|
||||
BreakFunctionDefinitionParameters: false
|
||||
BreakInheritanceList: BeforeColon
|
||||
BreakStringLiterals: true
|
||||
BreakTemplateDeclarations: MultiLine
|
||||
ColumnLimit: 0
|
||||
CommentPragmas: "^ IWYU pragma:"
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: false
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
EmptyLineAfterAccessModifier: Never
|
||||
EmptyLineBeforeAccessModifier: LogicalBlock
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: false
|
||||
ForEachMacros:
|
||||
- foreach
|
||||
- Q_FOREACH
|
||||
- BOOST_FOREACH
|
||||
IfMacros:
|
||||
- KJ_IF_MAYBE
|
||||
IncludeBlocks: Preserve
|
||||
IncludeCategories:
|
||||
- Regex: ^"(llvm|llvm-c|clang|clang-c)/
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
CaseSensitive: false
|
||||
- Regex: ^(<|"(gtest|gmock|isl|json)/)
|
||||
Priority: 3
|
||||
SortPriority: 0
|
||||
CaseSensitive: false
|
||||
- Regex: .*
|
||||
Priority: 1
|
||||
SortPriority: 0
|
||||
CaseSensitive: false
|
||||
IncludeIsMainRegex: (Test)?$
|
||||
IncludeIsMainSourceRegex: ""
|
||||
IndentAccessModifiers: false
|
||||
IndentCaseBlocks: false
|
||||
IndentCaseLabels: false
|
||||
IndentExternBlock: AfterExternBlock
|
||||
IndentGotoLabels: true
|
||||
IndentPPDirectives: None
|
||||
IndentRequiresClause: true
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
InsertBraces: false
|
||||
InsertNewlineAtEOF: false
|
||||
InsertTrailingCommas: None
|
||||
IntegerLiteralSeparator:
|
||||
Binary: 0
|
||||
BinaryMinDigits: 0
|
||||
Decimal: 0
|
||||
DecimalMinDigits: 0
|
||||
Hex: 0
|
||||
HexMinDigits: 0
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLines:
|
||||
AtEndOfFile: false
|
||||
AtStartOfBlock: true
|
||||
AtStartOfFile: true
|
||||
LambdaBodyIndentation: Signature
|
||||
LineEnding: DeriveLF
|
||||
MacroBlockBegin: ""
|
||||
MacroBlockEnd: ""
|
||||
MainIncludeChar: Quote
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: Inner
|
||||
ObjCBinPackProtocolList: Auto
|
||||
ObjCBlockIndentWidth: 4
|
||||
ObjCBreakBeforeNestedBlockParam: true
|
||||
ObjCSpaceAfterProperty: true
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PPIndentWidth: -1
|
||||
PackConstructorInitializers: BinPack
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 19
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakOpenParenthesis: 0
|
||||
PenaltyBreakScopeResolution: 500
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakTemplateDeclaration: 10
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyIndentedWhitespace: 0
|
||||
PenaltyReturnTypeOnItsOwnLine: 60
|
||||
PointerAlignment: Left
|
||||
QualifierAlignment: Leave
|
||||
ReferenceAlignment: Pointer
|
||||
ReflowComments: true
|
||||
RemoveBracesLLVM: false
|
||||
RemoveParentheses: Leave
|
||||
RemoveSemicolon: false
|
||||
RequiresClausePosition: OwnLine
|
||||
RequiresExpressionIndentation: OuterScope
|
||||
SeparateDefinitionBlocks: Leave
|
||||
ShortNamespaceLines: 1
|
||||
SkipMacroDefinitionBody: false
|
||||
SortIncludes: CaseSensitive
|
||||
SortJavaStaticImport: Before
|
||||
SortUsingDeclarations: LexicographicNumeric
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterLogicalNot: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceAroundPointerQualifiers: Default
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCaseColon: false
|
||||
SpaceBeforeCpp11BracedList: true
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeJsonColon: false
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceBeforeParensOptions:
|
||||
AfterControlStatements: true
|
||||
AfterForeachMacros: true
|
||||
AfterFunctionDeclarationName: false
|
||||
AfterFunctionDefinitionName: false
|
||||
AfterIfMacros: true
|
||||
AfterOverloadedOperator: false
|
||||
AfterPlacementOperator: true
|
||||
AfterRequiresInClause: false
|
||||
AfterRequiresInExpression: false
|
||||
BeforeNonEmptyParentheses: false
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceBeforeSquareBrackets: false
|
||||
SpaceInEmptyBlock: true
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: Never
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInLineCommentPrefix:
|
||||
Minimum: 1
|
||||
Maximum: -1
|
||||
SpacesInParens: Never
|
||||
SpacesInParensOptions:
|
||||
ExceptDoubleParentheses: false
|
||||
InConditionalStatements: false
|
||||
InCStyleCasts: false
|
||||
InEmptyParentheses: false
|
||||
Other: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Latest
|
||||
StatementAttributeLikeMacros:
|
||||
- Q_EMIT
|
||||
StatementMacros:
|
||||
- Q_UNUSED
|
||||
- QT_REQUIRE_VERSION
|
||||
TabWidth: 8
|
||||
TableGenBreakInsideDAGArg: DontBreak
|
||||
UseTab: Never
|
||||
VerilogBreakBetweenInstancePorts: true
|
||||
WhitespaceSensitiveMacros:
|
||||
- BOOST_PP_STRINGIZE
|
||||
- CF_SWIFT_NAME
|
||||
- NS_SWIFT_NAME
|
||||
- PP_STRINGIZE
|
||||
- STRINGIZE
|
||||
2
.clangd
Normal file
2
.clangd
Normal file
@@ -0,0 +1,2 @@
|
||||
CompileFlags:
|
||||
Remove: ['-Xcompiler=-Wall,-Winvalid-pch','-rdc=true','-prec-div=false','-Xptxas','-ftz=true','-prec-sqrt=false','-gencode','--default-stream','--expt-relaxed-constexpr','-G']
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,3 +1,3 @@
|
||||
.vscode/
|
||||
build/*
|
||||
-Makefile
|
||||
.cache/
|
||||
build/
|
||||
@@ -1,11 +0,0 @@
|
||||
cmake_minimum_required(VERSION 3.31)
|
||||
project(yggm)
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
file(GLOB ${PROJECT_NAME}SOURCES *.cpp)
|
||||
file(GLOB ${PROJECT_NAME}HEADERS *.h)
|
||||
add_executable(${PROJECT_NAME} sources/main.cpp)
|
||||
set(CXX_ADDITIONAL_FLAGS "-mavx2 -fomit-frame-pointer -ftree-vectorize -ftree-slp-vectorize -fdelete-null-pointer-checks -fno-exceptions -fno-rtti")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -ffast-math -pipe -Wall -Wextra -Wpedantic -Wconversion -Wuninitialized -Wsign-conversion -flto=full")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CXX_ADDITIONAL_FLAGS}")
|
||||
target_link_libraries(${PROJECT_NAME} pthread sodium)
|
||||
23
README.md
23
README.md
@@ -1,19 +1,18 @@
|
||||
# yggm
|
||||
### Yggdrasil address miner
|
||||
# How to build for CPU
|
||||
# How to build for CPU/GPU
|
||||
### First, install gcc14 and clang. Then:
|
||||
```sh
|
||||
git clone https://rcxpony.name/rcxpony/yggm.git && cd yggm
|
||||
cmake -B build && cmake --build build -j$(nproc)
|
||||
build/yggm -t 10
|
||||
```
|
||||
# How to build for GPU
|
||||
```sh
|
||||
git clone https://rcxpony.name/rcxpony/yggm.git && cd yggm/build
|
||||
make -j$(nproc)
|
||||
./yggmcu -t 10
|
||||
meson setup build -Dbuildtype=release --native-file native.ini && cd build
|
||||
meson compile
|
||||
./yggmc -t 10 // for CPU
|
||||
./yggmcu -t 10 // for GPU
|
||||
```
|
||||
# ToDo
|
||||
- [x] Cuda support (not optimized)
|
||||
- [x] Support for avx2
|
||||
- [ ] Support for sse4
|
||||
- [x] CUDA support
|
||||
- [x] AVX2 optimizations
|
||||
- [ ] (CUDA) Optimize internal algorithms
|
||||
- [ ] Windows support
|
||||
- [ ] Tests
|
||||
#
|
||||
@@ -1,31 +0,0 @@
|
||||
NVCC := nvcc
|
||||
NVCC_FLAGS := -rdc=true -O3 -Xptxas -O3 \
|
||||
-use_fast_math -ftz=true -prec-div=false -prec-sqrt=false \
|
||||
-gencode arch=compute_75,code=sm_75 \
|
||||
--default-stream per-thread \
|
||||
-Wno-deprecated-gpu-targets \
|
||||
--expt-relaxed-constexpr \
|
||||
-I../libs/ \
|
||||
-std=c++17
|
||||
BUILD ?= RELEASE
|
||||
ifeq ($(BUILD),DEBUG)
|
||||
BUILD_DEFINES := -DDEBUG
|
||||
else
|
||||
BUILD_DEFINES := -DRELEASE
|
||||
endif
|
||||
MAIN_SOURCE := ../sources/main.cu
|
||||
LIBS_DIR := ../libs/
|
||||
BUILD_DIR := ../build
|
||||
LIBS_SOURCES := $(wildcard $(LIBS_DIR)*.cu)
|
||||
LIBS_OBJECTS := $(patsubst $(LIBS_DIR)%.cu,$(BUILD_DIR)/%.o,$(LIBS_SOURCES))
|
||||
TARGET := yggmcu
|
||||
.PHONY: all clean
|
||||
all: $(TARGET)
|
||||
$(BUILD_DIR):
|
||||
@mkdir -p $(BUILD_DIR)
|
||||
$(TARGET): $(MAIN_SOURCE) $(LIBS_OBJECTS)
|
||||
$(NVCC) $(NVCC_FLAGS) -o $@ $^
|
||||
$(BUILD_DIR)/%.o: $(LIBS_DIR)%.cu | $(BUILD_DIR)
|
||||
$(NVCC) $(NVCC_FLAGS) -c $< -o $@
|
||||
clean:
|
||||
@rm -f $(BUILD_DIR)/*.o $(TARGET)
|
||||
14
libs/defines.h
Normal file
14
libs/defines.h
Normal file
@@ -0,0 +1,14 @@
|
||||
#ifndef __DEFINES_H
|
||||
#define __DEFINES_H
|
||||
#ifdef DEBUG
|
||||
#define __BUILDTYPE__ "Debug"
|
||||
#define THREADSPB 1
|
||||
#define THDIVTHPB 1
|
||||
#define WHCOND x < 0xFFFFFFFF
|
||||
#else
|
||||
#define __BUILDTYPE__ "Release"
|
||||
#define THREADSPB 256
|
||||
#define THDIVTHPB (tTh / THREADSPB)
|
||||
#define WHCOND true
|
||||
#endif
|
||||
#endif
|
||||
244
libs/ed25519.cu
244
libs/ed25519.cu
@@ -1,106 +1,156 @@
|
||||
#include <ed25519.cuh>
|
||||
#include <f25519.cuh>
|
||||
__device__ __constant__ struct ed25519_pt ed25519_base = {
|
||||
{0x1a,0xd5,0x25,0x8f,0x60,0x2d,0x56,0xc9,0xb2,0xa7,0x25,0x95,0x60,0xc7,0x2c,0x69,
|
||||
0x5c,0xdc,0xd6,0xfd,0x31,0xe2,0xa4,0xc0,0xfe,0x53,0x6e,0xcd,0xd3,0x36,0x69,0x21},
|
||||
{0x58,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,
|
||||
0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66},
|
||||
{0xa3,0xdd,0xb7,0xa5,0xb3,0x8a,0xde,0x6d,0xf5,0x52,0x51,0x77,0x80,0x9f,0xf0,0x20,
|
||||
0x7d,0xe3,0xab,0x64,0x8e,0x4e,0xea,0x66,0x65,0x76,0x8b,0xd7,0x0f,0x5f,0x87,0x67},
|
||||
{1,0}
|
||||
};
|
||||
__device__ __constant__ struct ed25519_pt ed25519_neutral = {
|
||||
{0}, {1,0}, {0}, {1,0}
|
||||
};
|
||||
__device__ __constant__ unsigned char ed25519_d[32] = {
|
||||
0xa3,0x78,0x59,0x13,0xca,0x4d,0xeb,0x75,0xab,0xd8,0x41,0x41,0x4d,0x0a,0x70,0x00,
|
||||
0x98,0xe8,0x79,0x77,0x79,0x40,0xc7,0x8c,0x73,0xfe,0x6f,0x2b,0xee,0x6c,0x03,0x52
|
||||
};
|
||||
__device__ __constant__ unsigned char ed25519_k[32] = {
|
||||
0x59,0xf1,0xb2,0x26,0x94,0x9b,0xd6,0xeb,0x56,0xb1,0x83,0x82,0x9a,0x14,0xe0,0x00,
|
||||
0x30,0xd1,0xf3,0xee,0xf2,0x80,0x8e,0x19,0xe7,0xfc,0xdf,0x56,0xdc,0xd9,0x06,0x24
|
||||
};
|
||||
__device__ void ed25519_unproject(unsigned char* __restrict__ x, unsigned char* __restrict__ y, const struct ed25519_pt* __restrict__ p) {
|
||||
unsigned char z1[32];
|
||||
f25519_inv__distinct(z1, p->z);
|
||||
f25519_mul__distinct(x, p->x, z1);
|
||||
f25519_mul__distinct(y, p->y, z1);
|
||||
f25519_normalize(x);
|
||||
f25519_normalize(y);
|
||||
#include <precomp_data.h>
|
||||
void __host__ __device__ ge_madd(ge_p1p1& __restrict__ r, const ge_p3& __restrict__ p, const precomp_data& __restrict__ q) {
|
||||
fe t0;
|
||||
fe_add(r.X, p.Y, p.X);
|
||||
fe_sub(r.Y, p.Y, p.X);
|
||||
fe_mul(r.Z, r.X, q.yplusx);
|
||||
fe_mul(r.Y, r.Y, q.yminusx);
|
||||
fe_mul(r.T, q.xy2d, p.T);
|
||||
fe_add(t0, p.Z, p.Z);
|
||||
fe_sub(r.X, r.Z, r.Y);
|
||||
fe_add(r.Y, r.Z, r.Y);
|
||||
fe_add(r.Z, t0, r.T);
|
||||
fe_sub(r.T, t0, r.T);
|
||||
}
|
||||
__device__ void ed25519_pack(unsigned char* __restrict__ c, const unsigned char* __restrict__ x, const unsigned char* __restrict__ y) {
|
||||
unsigned char tmp[32];
|
||||
unsigned char parity;
|
||||
f25519_copy(tmp, x);
|
||||
f25519_normalize(tmp);
|
||||
parity = (tmp[0] & 1) << 7;
|
||||
f25519_copy(c, y);
|
||||
f25519_normalize(c);
|
||||
c[31] |= parity;
|
||||
// r.XYZ = p.XYZ * p.TZT
|
||||
void __host__ __device__ ge_p1p1_to_p2(ge_p2& __restrict__ r, const ge_p1p1& __restrict__ p) {
|
||||
fe_mul(r.X, p.X, p.T);
|
||||
fe_mul(r.Y, p.Y, p.Z);
|
||||
fe_mul(r.Z, p.Z, p.T);
|
||||
}
|
||||
__device__ __forceinline__ void ed25519_add(struct ed25519_pt* __restrict__ r, const struct ed25519_pt* __restrict__ p1, const struct ed25519_pt* __restrict__ p2) {
|
||||
unsigned char a[32], b[32], c[32], d[32], e[32], f[32], g[32], h[32];
|
||||
f25519_sub(c, p1->y, p1->x);
|
||||
f25519_sub(d, p2->y, p2->x);
|
||||
f25519_mul__distinct(a, c, d);
|
||||
f25519_add(c, p1->y, p1->x);
|
||||
f25519_add(d, p2->y, p2->x);
|
||||
f25519_mul__distinct(b, c, d);
|
||||
f25519_mul__distinct(d, p1->t, p2->t);
|
||||
f25519_mul__distinct(c, d, ed25519_k);
|
||||
f25519_mul__distinct(d, p1->z, p2->z);
|
||||
f25519_add(d, d, d);
|
||||
f25519_sub(e, b, a);
|
||||
f25519_sub(f, d, c);
|
||||
f25519_add(g, d, c);
|
||||
f25519_add(h, b, a);
|
||||
f25519_mul__distinct(r->x, e, f);
|
||||
f25519_mul__distinct(r->y, g, h);
|
||||
f25519_mul__distinct(r->t, e, h);
|
||||
f25519_mul__distinct(r->z, f, g);
|
||||
void inline __host__ __device__ ge_p1p1_to_p3(ge_p3& __restrict__ r, const ge_p1p1& __restrict__ p) {
|
||||
fe_mul(r.X, p.X, p.T);
|
||||
fe_mul(r.Y, p.Y, p.Z);
|
||||
fe_mul(r.Z, p.Z, p.T);
|
||||
fe_mul(r.T, p.X, p.Y);
|
||||
}
|
||||
__device__ __forceinline__ void ed25519_double(struct ed25519_pt* __restrict__ r, const struct ed25519_pt* __restrict__ p) {
|
||||
unsigned char a[32], b[32], c[32], e[32], f[32], g[32], h[32];
|
||||
f25519_mul__distinct(a, p->x, p->x);
|
||||
f25519_mul__distinct(b, p->y, p->y);
|
||||
f25519_mul__distinct(c, p->z, p->z);
|
||||
f25519_add(c, c, c);
|
||||
f25519_add(f, p->x, p->y);
|
||||
f25519_mul__distinct(e, f, f);
|
||||
f25519_sub(e, e, a);
|
||||
f25519_sub(e, e, b);
|
||||
f25519_sub(g, b, a);
|
||||
f25519_sub(f, g, c);
|
||||
f25519_neg(h, b);
|
||||
f25519_sub(h, h, a);
|
||||
f25519_mul__distinct(r->x, e, f);
|
||||
f25519_mul__distinct(r->y, g, h);
|
||||
f25519_mul__distinct(r->t, e, h);
|
||||
f25519_mul__distinct(r->z, f, g);
|
||||
void __host__ __device__ ge_p2_dbl(ge_p1p1& __restrict__ r, const ge_p2& __restrict__ p) {
|
||||
fe t0;
|
||||
fe_mul(r.X, p.X, p.X);
|
||||
fe_mul(r.Z, p.Y, p.Y);
|
||||
fe_mul2(r.T, p.Z);
|
||||
fe_add(r.Y, p.X, p.Y);
|
||||
fe_mul(t0, r.Y, r.Y);
|
||||
fe_add(r.Y, r.Z, r.X);
|
||||
fe_sub(r.Z, r.Z, r.X);
|
||||
fe_sub(r.X, t0, r.Y);
|
||||
fe_sub(r.T, r.T, r.Z);
|
||||
}
|
||||
__device__ __forceinline__ void ed25519_copy(struct ed25519_pt* __restrict__ dst, const struct ed25519_pt* __restrict__ src) {
|
||||
f25519_copy(dst->x, src->x);
|
||||
f25519_copy(dst->y, src->y);
|
||||
f25519_copy(dst->t, src->t);
|
||||
f25519_copy(dst->z, src->z);
|
||||
void __host__ __device__ ge_p3_dbl(ge_p1p1& __restrict__ r, const ge_p3& __restrict__ p) {
|
||||
ge_p2 q;
|
||||
fe_copy(q.X, p.X);
|
||||
fe_copy(q.Y, p.Y);
|
||||
fe_copy(q.Z, p.Z);
|
||||
ge_p2_dbl(r, q);
|
||||
}
|
||||
__device__ void ed25519_smult(ed25519_pt* r_out, const struct ed25519_pt* __restrict__ p, const unsigned char* __restrict__ e) {
|
||||
ed25519_pt r = ed25519_neutral;
|
||||
#pragma unroll 256
|
||||
for (int i = 255; i >= 0; i--) {
|
||||
struct ed25519_pt s;
|
||||
ed25519_double(&r, &r);
|
||||
ed25519_add(&s, &r, p);
|
||||
unsigned char bit = (e[i >> 3] >> (i & 7)) & 1;
|
||||
f25519_select(r.x, r.x, s.x, bit);
|
||||
f25519_select(r.y, r.y, s.y, bit);
|
||||
f25519_select(r.z, r.z, s.z, bit);
|
||||
f25519_select(r.t, r.t, s.t, bit);
|
||||
void __host__ __device__ ge_p3_tobytes(unsigned char* __restrict__ s, const ge_p3& __restrict__ h) {
|
||||
fe recip, x, y;
|
||||
fe_invert(recip, h.Z);
|
||||
fe_mul(x, h.X, recip);
|
||||
fe_mul(y, h.Y, recip);
|
||||
fe_tobytes(s, y);
|
||||
s[31] ^= fe_isnegative(x) << 7;
|
||||
}
|
||||
static constexpr unsigned char __host__ __device__ equal(const signed char b, const signed char c) {
|
||||
unsigned long x = (b ^ c) - 1;
|
||||
return static_cast<unsigned char>(x >>= 63);
|
||||
}
|
||||
static void __host__ __device__ cmov(precomp_data& __restrict__ t, const precomp_data& __restrict__ u, unsigned char b) {
|
||||
fe_cmov(t.yplusx, u.yplusx, b);
|
||||
fe_cmov(t.yminusx, u.yminusx, b);
|
||||
fe_cmov(t.xy2d, u.xy2d, b);
|
||||
}
|
||||
static void __host__ __device__ select(precomp_data& __restrict__ t, signed int pos, signed char b) {
|
||||
precomp_data minust;
|
||||
unsigned long x = b;
|
||||
x >>= 63;
|
||||
unsigned char bnegative = static_cast<unsigned char>(x);
|
||||
unsigned char babs = b - (((-bnegative) & b) << 1);
|
||||
t.yplusx[0] = 1;
|
||||
t.yminusx[0] = 1;
|
||||
t.yplusx[1] = 0;
|
||||
t.yminusx[1] = 0;
|
||||
t.yplusx[2] = 0;
|
||||
t.yminusx[2] = 0;
|
||||
t.yplusx[3] = 0;
|
||||
t.yminusx[3] = 0;
|
||||
t.yplusx[4] = 0;
|
||||
t.yminusx[4] = 0;
|
||||
t.yplusx[5] = 0;
|
||||
t.yminusx[5] = 0;
|
||||
t.yplusx[6] = 0;
|
||||
t.yminusx[6] = 0;
|
||||
t.yplusx[7] = 0;
|
||||
t.yminusx[7] = 0;
|
||||
t.yplusx[8] = 0;
|
||||
t.yminusx[8] = 0;
|
||||
t.yplusx[9] = 0;
|
||||
t.yminusx[9] = 0;
|
||||
t.xy2d[0] = 0;
|
||||
t.xy2d[1] = 0;
|
||||
t.xy2d[2] = 0;
|
||||
t.xy2d[3] = 0;
|
||||
t.xy2d[4] = 0;
|
||||
t.xy2d[5] = 0;
|
||||
t.xy2d[6] = 0;
|
||||
t.xy2d[7] = 0;
|
||||
t.xy2d[8] = 0;
|
||||
t.xy2d[9] = 0;
|
||||
cmov(t, base[pos][0], equal(babs, 1));
|
||||
cmov(t, base[pos][1], equal(babs, 2));
|
||||
cmov(t, base[pos][2], equal(babs, 3));
|
||||
cmov(t, base[pos][3], equal(babs, 4));
|
||||
cmov(t, base[pos][4], equal(babs, 5));
|
||||
cmov(t, base[pos][5], equal(babs, 6));
|
||||
cmov(t, base[pos][6], equal(babs, 7));
|
||||
cmov(t, base[pos][7], equal(babs, 8));
|
||||
fe_copy(minust.yplusx, t.yminusx);
|
||||
fe_copy(minust.yminusx, t.yplusx);
|
||||
fe_neg(minust.xy2d, t.xy2d);
|
||||
cmov(t, minust, bnegative);
|
||||
}
|
||||
void __device__ __host__ ge_scalarmult_base(ge_p3& __restrict__ h, const unsigned char* __restrict__ a) {
|
||||
signed char e[64], carry;
|
||||
signed int x;
|
||||
ge_p1p1 r;
|
||||
ge_p2 s;
|
||||
precomp_data t;
|
||||
#pragma unroll 32
|
||||
for (signed int i = 0; i < 32; i++) {
|
||||
e[2 * i] = a[i] & 15;
|
||||
e[2 * i + 1] = a[i] >> 4;
|
||||
}
|
||||
#pragma unroll 63
|
||||
for (x = 0, carry = 0; x < 63; x++) {
|
||||
e[x] += carry;
|
||||
carry = (e[x] + 8) >> 4;
|
||||
e[x] -= carry << 4;
|
||||
}
|
||||
e[63] += carry;
|
||||
fe_0(h.X);
|
||||
fe_1(h.Y);
|
||||
fe_1(h.Z);
|
||||
fe_0(h.T);
|
||||
#pragma unroll 32
|
||||
for (int i = 1; i < 64; i += 2) {
|
||||
select(t, i >> 1, e[i]);
|
||||
ge_madd(r, h, t);
|
||||
ge_p1p1_to_p3(h, r);
|
||||
}
|
||||
ge_p3_dbl(r, h);
|
||||
ge_p1p1_to_p2(s, r);
|
||||
ge_p2_dbl(r, s);
|
||||
ge_p1p1_to_p2(s, r);
|
||||
ge_p2_dbl(r, s);
|
||||
ge_p1p1_to_p2(s, r);
|
||||
ge_p2_dbl(r, s);
|
||||
ge_p1p1_to_p3(h, r);
|
||||
#pragma unroll 32
|
||||
for (int i = 0; i < 64; i += 2) {
|
||||
select(t, i >> 1, e[i]);
|
||||
ge_madd(r, h, t);
|
||||
ge_p1p1_to_p3(h, r);
|
||||
}
|
||||
ed25519_copy(r_out, &r);
|
||||
}
|
||||
__device__ void ed25519_prepare(unsigned char* __restrict__ e) {
|
||||
e[0] &= 0xf8;
|
||||
e[31] &= 0x7f;
|
||||
e[31] |= 0x40;
|
||||
}
|
||||
@@ -1,15 +1,22 @@
|
||||
#ifndef __ED25519_CUH
|
||||
#define __ED25519_CUH
|
||||
struct ed25519_pt { unsigned char x[32], y[32], t[32], z[32]; };
|
||||
extern __device__ __constant__ struct ed25519_pt ed25519_base;
|
||||
extern __device__ __constant__ struct ed25519_pt ed25519_neutral;
|
||||
extern __device__ __constant__ unsigned char ed25519_d[32];
|
||||
extern __device__ __constant__ unsigned char ed25519_k[32];
|
||||
__device__ void ed25519_unproject(unsigned char* x, unsigned char* y, const struct ed25519_pt* p);
|
||||
__device__ void ed25519_pack(unsigned char* c, const unsigned char* x, const unsigned char* y);
|
||||
__device__ __forceinline__ void ed25519_add(struct ed25519_pt* r, const struct ed25519_pt* p1, const struct ed25519_pt* p2);
|
||||
__device__ __forceinline__ void ed25519_double(struct ed25519_pt* r, const struct ed25519_pt* p);
|
||||
__device__ __forceinline__ void ed25519_copy(struct ed25519_pt* dst, const struct ed25519_pt* src);
|
||||
__device__ void ed25519_smult(struct ed25519_pt* r_out, const struct ed25519_pt* p, const unsigned char* e);
|
||||
__device__ void ed25519_prepare(unsigned char* e);
|
||||
typedef struct {
|
||||
signed long X[10], Y[10], Z[10];
|
||||
} ge_p2;
|
||||
typedef struct {
|
||||
signed long X[10], Y[10], Z[10], T[10];
|
||||
} ge_p3;
|
||||
typedef struct {
|
||||
signed long X[10], Y[10], Z[10], T[10];
|
||||
} ge_p1p1;
|
||||
typedef struct {
|
||||
signed long yplusx[10], yminusx[10], xy2d[10];
|
||||
} precomp_data;
|
||||
void __host__ __device__ ge_p3_tobytes(unsigned char* __restrict__ s, const ge_p3& __restrict__ h);
|
||||
void __host__ __device__ ge_madd(ge_p1p1* __restrict__ r, const ge_p3& __restrict__ p, const precomp_data& __restrict__ q);
|
||||
void __host__ __device__ ge_scalarmult_base(ge_p3& __restrict__ h, const unsigned char* __restrict__ a);
|
||||
void __host__ __device__ ge_p1p1_to_p2(ge_p2& __restrict__ r, const ge_p1p1& __restrict__ p);
|
||||
void __host__ __device__ ge_p1p1_to_p3(ge_p3& __restrict__ r, const ge_p1p1& __restrict__ p);
|
||||
void __host__ __device__ ge_p2_dbl(ge_p1p1& __restrict__ r, const ge_p2& __restrict__ p);
|
||||
void __host__ __device__ ge_p3_dbl(ge_p1p1& __restrict__ r, const ge_p3& __restrict__ p);
|
||||
#endif
|
||||
@@ -1,46 +1,20 @@
|
||||
#include <edsign.cuh>
|
||||
#include <ed25519.cuh>
|
||||
#include <edsign.cuh>
|
||||
#include <sha512.cuh>
|
||||
__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret) {
|
||||
__device__ __forceinline__ void expand_key(unsigned char* __restrict__ expanded, const unsigned char* __restrict__ secret) {
|
||||
struct sha512_state s;
|
||||
memcpy(&s, &sha512_initial_state, sizeof(s));
|
||||
sha512_final(&s, secret);
|
||||
sha512_get(&s, expanded);
|
||||
ed25519_prepare(expanded);
|
||||
expanded[0] &= 0xf8;
|
||||
expanded[31] = (expanded[31] & 0x7F) | 0x40;
|
||||
}
|
||||
__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p) {
|
||||
unsigned char x[32], y[32];
|
||||
ed25519_unproject(x, y, p);
|
||||
ed25519_pack(packed, x, y);
|
||||
}
|
||||
__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k) {
|
||||
struct ed25519_pt p;
|
||||
ed25519_smult(&p, &ed25519_base, k);
|
||||
pp(r, &p);
|
||||
}
|
||||
__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) {
|
||||
__device__ void ed25519_create_keypair(unsigned char* __restrict__ private_key, unsigned char* __restrict__ public_key, unsigned char* __restrict__ seed) {
|
||||
unsigned char expanded[64];
|
||||
expand_key(expanded, secret);
|
||||
sm_pack(pub, expanded);
|
||||
}
|
||||
__device__ __forceinline__ void compact_wipe(void* __restrict__ data) {
|
||||
unsigned char* p = (unsigned char*)data;
|
||||
unsigned long i = 0;
|
||||
#pragma unroll
|
||||
for (; i + 3 < 32; i += 4) {
|
||||
p[i] = 0;
|
||||
p[i + 1] = 0;
|
||||
p[i + 2] = 0;
|
||||
p[i + 3] = 0;
|
||||
}
|
||||
#pragma unroll
|
||||
for (; i < 32; i++) {
|
||||
p[i] = 0;
|
||||
}
|
||||
}
|
||||
__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]) {
|
||||
edsign_sec_to_pub(public_key, random_seed);
|
||||
memcpy(private_key, random_seed, 32);
|
||||
expand_key(expanded, seed);
|
||||
ge_p3 A;
|
||||
ge_scalarmult_base(A, expanded);
|
||||
ge_p3_tobytes(public_key, A);
|
||||
memcpy(private_key, seed, 32);
|
||||
memcpy(private_key + 32, public_key, 32);
|
||||
compact_wipe(random_seed);
|
||||
}
|
||||
@@ -1,9 +1,5 @@
|
||||
#ifndef __EDSIGN_CUH
|
||||
#define __EDSIGN_CUH
|
||||
__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret);
|
||||
__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p);
|
||||
__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k);
|
||||
__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret);
|
||||
__device__ __forceinline__ void compact_wipe(void* __restrict__ data);
|
||||
__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]);
|
||||
__device__ __forceinline__ void expand_key(unsigned char* __restrict__ expanded, const unsigned char* __restrict__ secret);
|
||||
__device__ void ed25519_create_keypair(unsigned char* __restrict__ private_key, unsigned char* __restrict__ public_key, unsigned char* __restrict__ seed);
|
||||
#endif
|
||||
524
libs/f25519.cu
524
libs/f25519.cu
@@ -1,137 +1,409 @@
|
||||
#include <f25519.cuh>
|
||||
#include <cuda_runtime.h>
|
||||
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
|
||||
const uint4* src = reinterpret_cast<const uint4*>(a);
|
||||
uint4* dst = reinterpret_cast<uint4*>(x);
|
||||
dst[0] = src[0];
|
||||
dst[1] = src[1];
|
||||
// h = {0};
|
||||
void __device__ __host__ fe_0(fe& __restrict__ h) {
|
||||
h[0] = 0;
|
||||
h[1] = 0;
|
||||
h[2] = 0;
|
||||
h[3] = 0;
|
||||
h[4] = 0;
|
||||
h[5] = 0;
|
||||
h[6] = 0;
|
||||
h[7] = 0;
|
||||
h[8] = 0;
|
||||
h[9] = 0;
|
||||
}
|
||||
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) {
|
||||
unsigned mask = static_cast<unsigned>(-cond);
|
||||
uint4* d = reinterpret_cast<uint4*>(dst);
|
||||
const uint4* z = reinterpret_cast<const uint4*>(zero);
|
||||
const uint4* o = reinterpret_cast<const uint4*>(one);
|
||||
uint4 res0, res1;
|
||||
res0.x = (z[0].x & ~mask) | (o[0].x & mask);
|
||||
res0.y = (z[0].y & ~mask) | (o[0].y & mask);
|
||||
res0.z = (z[0].z & ~mask) | (o[0].z & mask);
|
||||
res0.w = (z[0].w & ~mask) | (o[0].w & mask);
|
||||
res1.x = (z[1].x & ~mask) | (o[1].x & mask);
|
||||
res1.y = (z[1].y & ~mask) | (o[1].y & mask);
|
||||
res1.z = (z[1].z & ~mask) | (o[1].z & mask);
|
||||
res1.w = (z[1].w & ~mask) | (o[1].w & mask);
|
||||
d[0] = res0;
|
||||
d[1] = res1;
|
||||
// h = {1,0,0,0,0,0,0,0,0,0,0};
|
||||
void __device__ __host__ fe_1(fe& __restrict__ h) {
|
||||
h[0] = 1;
|
||||
h[1] = 0;
|
||||
h[2] = 0;
|
||||
h[3] = 0;
|
||||
h[4] = 0;
|
||||
h[5] = 0;
|
||||
h[6] = 0;
|
||||
h[7] = 0;
|
||||
h[8] = 0;
|
||||
h[9] = 0;
|
||||
}
|
||||
|
||||
__device__ void f25519_normalize(unsigned char* __restrict__ x) {
|
||||
__align__(32) unsigned char minusp[32];
|
||||
unsigned c = (x[31] >> 7) * 19;
|
||||
x[31] &= 127;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c += x[i];
|
||||
x[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
c = 19;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 31; i++) {
|
||||
c += x[i];
|
||||
minusp[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
c += x[31] - 128;
|
||||
minusp[31] = (unsigned char)c;
|
||||
f25519_select(x, minusp, x, (c >> 15) & 1);
|
||||
// h = f + g
|
||||
void __device__ __host__ fe_add(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
|
||||
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
|
||||
long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
|
||||
long h0 = f0 + g0, h1 = f1 + g1, h2 = f2 + g2, h3 = f3 + g3, h4 = f4 + g4, h5 = f5 + g5, h6 = f6 + g6, h7 = f7 + g7, h8 = f8 + g8, h9 = f9 + g9;
|
||||
h[0] = h0;
|
||||
h[1] = h1;
|
||||
h[2] = h2;
|
||||
h[3] = h3;
|
||||
h[4] = h4;
|
||||
h[5] = h5;
|
||||
h[6] = h6;
|
||||
h[7] = h7;
|
||||
h[8] = h8;
|
||||
h[9] = h9;
|
||||
}
|
||||
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
||||
unsigned c = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]);
|
||||
r[i] = (unsigned char)c;
|
||||
}
|
||||
r[31] &= 127;
|
||||
c = (c >> 7) * 19;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c += r[i];
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
void __device__ __host__ fe_cmov(fe& __restrict__ f, const fe& __restrict__ g, const unsigned b) {
|
||||
int x = -b;
|
||||
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
|
||||
long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
|
||||
long x0 = (f0 ^ g0) & x, x1 = (f1 ^ g1) & x, x2 = (f2 ^ g2) & x, x3 = (f3 ^ g3) & x, x4 = (f4 ^ g4) & x, x5 = (f5 ^ g5) & x, x6 = (f6 ^ g6) & x, x7 = (f7 ^ g7) & x, x8 = (f8 ^ g8) & x, x9 = (f9 ^ g9) & x;
|
||||
f[0] = f0 ^ x0;
|
||||
f[1] = f1 ^ x1;
|
||||
f[2] = f2 ^ x2;
|
||||
f[3] = f3 ^ x3;
|
||||
f[4] = f4 ^ x4;
|
||||
f[5] = f5 ^ x5;
|
||||
f[6] = f6 ^ x6;
|
||||
f[7] = f7 ^ x7;
|
||||
f[8] = f8 ^ x8;
|
||||
f[9] = f9 ^ x9;
|
||||
}
|
||||
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
||||
unsigned c = 218;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 31; i++) {
|
||||
c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]);
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
c += ((unsigned)a[31]) - ((unsigned)b[31]);
|
||||
r[31] = (unsigned char)(c & 127);
|
||||
c = (c >> 7) * 19;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c += r[i];
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
// h[0..9] = f[0..9]
|
||||
void __device__ __host__ fe_copy(fe& __restrict__ h, const fe& __restrict__ f) {
|
||||
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
|
||||
h[0] = f0;
|
||||
h[1] = f1;
|
||||
h[2] = f2;
|
||||
h[3] = f3;
|
||||
h[4] = f4;
|
||||
h[5] = f5;
|
||||
h[6] = f6;
|
||||
h[7] = f7;
|
||||
h[8] = f8;
|
||||
h[9] = f9;
|
||||
}
|
||||
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
|
||||
unsigned c = 218;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 31; i++) {
|
||||
c += 65280 - ((unsigned)a[i]);
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
void __device__ __host__ fe_invert(fe& __restrict__ out, const fe& __restrict__ z) {
|
||||
fe t0, t1, t2, t3;
|
||||
fe_mul(t0, z, z);
|
||||
fe_mul(t1, t0, t0);
|
||||
fe_mul(t1, t1, t1);
|
||||
fe_mul(t1, z, t1);
|
||||
fe_mul(t0, t0, t1);
|
||||
fe_mul(t2, t0, t0);
|
||||
fe_mul(t1, t1, t2);
|
||||
fe_mul(t2, t1, t1);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t1, t2, t1);
|
||||
fe_mul(t2, t1, t1);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t1);
|
||||
fe_mul(t3, t2, t2);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t3, t3, t3);
|
||||
fe_mul(t2, t3, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
fe_mul(t1, t2, t1);
|
||||
fe_mul(t2, t1, t1);
|
||||
#pragma unroll 49
|
||||
for (int i = 0; i < 49; ++i) {
|
||||
fe_mul(t2, t2, t2);
|
||||
}
|
||||
c -= ((unsigned)a[31]);
|
||||
r[31] = (unsigned char)(c & 127);
|
||||
c = (c >> 7) * 19;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c += r[i];
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
fe_mul(t2, t2, t1);
|
||||
fe_mul(t3, t2, t2);
|
||||
#pragma unroll 99
|
||||
for (int i = 0; i < 99; ++i) {
|
||||
fe_mul(t3, t3, t3);
|
||||
}
|
||||
fe_mul(t2, t3, t2);
|
||||
fe_mul(t2, t2, t2);
|
||||
#pragma unroll 49
|
||||
for (int i = 0; i < 49; ++i) {
|
||||
fe_mul(t2, t2, t2);
|
||||
}
|
||||
fe_mul(t1, t2, t1);
|
||||
fe_mul(t1, t1, t1);
|
||||
fe_mul(t1, t1, t1);
|
||||
fe_mul(t1, t1, t1);
|
||||
fe_mul(t1, t1, t1);
|
||||
fe_mul(t1, t1, t1);
|
||||
fe_mul(out, t1, t0);
|
||||
}
|
||||
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
||||
unsigned c = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c >>= 8;
|
||||
for (int j = 0; j <= i; j++)
|
||||
c += ((unsigned)a[j]) * ((unsigned)b[i - j]);
|
||||
for (int j = i + 1; j < 32; j++)
|
||||
c += ((unsigned)a[j]) * ((unsigned)b[32 + i - j]) * 38;
|
||||
r[i] = (unsigned char)c;
|
||||
}
|
||||
r[31] &= 127;
|
||||
c = (c >> 7) * 19;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c += r[i];
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
int __device__ __host__ fe_isnegative(const fe& __restrict__ f) {
|
||||
unsigned char s[32];
|
||||
fe_tobytes(s, f);
|
||||
return s[0] & 1;
|
||||
}
|
||||
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
|
||||
__align__(32) unsigned char s[32];
|
||||
f25519_mul__distinct(s, x, x);
|
||||
f25519_mul__distinct(r, s, x);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 248; i++) {
|
||||
f25519_mul__distinct(s, r, r);
|
||||
f25519_mul__distinct(r, s, x);
|
||||
}
|
||||
f25519_mul__distinct(s, r, r);
|
||||
f25519_mul__distinct(r, s, s);
|
||||
f25519_mul__distinct(s, r, x);
|
||||
f25519_mul__distinct(r, s, s);
|
||||
f25519_mul__distinct(s, r, r);
|
||||
f25519_mul__distinct(r, s, x);
|
||||
f25519_mul__distinct(s, r, r);
|
||||
f25519_mul__distinct(r, s, x);
|
||||
// h = f * g
|
||||
void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
|
||||
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
|
||||
long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
|
||||
long f0g0 = f0 * g0, f0g1 = f0 * g1, f0g2 = f0 * g2, f0g3 = f0 * g3, f0g4 = f0 * g4, f0g5 = f0 * g5, f0g6 = f0 * g6, f0g7 = f0 * g7, f0g8 = f0 * g8, f0g9 = f0 * g9;
|
||||
long f1g0 = f1 * g0, f1g1_2 = f1 * g1 << 1L, f1g2 = f1 * g2, f1g3_2 = f1 * g3 << 1L, f1g4 = f1 * g4, f1g5_2 = f1 * g5 << 1L, f1g6 = f1 * g6, f1g7_2 = f1 * g7 << 1L, f1g8 = f1 * g8, f1g9_38 = f1 * g9 * 38L;
|
||||
long f2g0 = f2 * g0, f2g1 = f2 * g1, f2g2 = f2 * g2, f2g3 = f2 * g3, f2g4 = f2 * g4, f2g5 = f2 * g5, f2g6 = f2 * g6, f2g7 = f2 * g7, f2g8_19 = f2 * g8 * 19L, f2g9_19 = f2 * g9 * 38L >> 1L;
|
||||
long f3g0 = f3 * g0, f3g1_2 = f3 * g1 << 1L, f3g2 = f3 * g2, f3g3_2 = f3 * g3 << 1L, f3g4 = f3 * g4, f3g5_2 = f3 * g5 << 1L, f3g6 = f3 * g6, f3g7_38 = f3 * g7 * 38L, f3g8_19 = f3 * g8 * 19L, f3g9_38 = f3 * g9 * 38L;
|
||||
long f4g0 = f4 * g0, f4g1 = f4 * g1, f4g2 = f4 * g2, f4g3 = f4 * g3, f4g4 = f4 * g4, f4g5 = f4 * g5, f4g6_19 = f4 * g6 * 19L, f4g7_19 = f4 * g7 * 38L >> 1L, f4g8_19 = f4 * g8 * 19L, f4g9_19 = f4 * g9 * 38L >> 1L;
|
||||
long f5g0 = f5 * g0, f5g1_2 = f5 * g1 << 1L, f5g2 = f5 * g2, f5g3_2 = f5 * g3 << 1L, f5g4 = f5 * g4, f5g5_38 = f5 * g5 * 38L, f5g6_19 = f5 * g6 * 19L, f5g7_38 = f5 * g7 * 38L, f5g8_19 = f5 * g8 * 19L, f5g9_38 = f5 * g9 * 38L;
|
||||
long f6g0 = f6 * g0, f6g1 = f6 * g1, f6g2 = f6 * g2, f6g3 = f6 * g3, f6g4_19 = f6 * g4 * 19L, f6g5_19 = f6 * g5 * 38L >> 1L, f6g6_19 = f6 * g6 * 19L, f6g7_19 = f6 * g7 * 38L >> 1L, f6g8_19 = f6 * g8 * 19L, f6g9_19 = f6 * g9 * 38L >> 1L;
|
||||
long f7g0 = f7 * g0, f7g1_2 = f7 * g1 << 1L, f7g2 = f7 * g2, f7g3_38 = f7 * g3 * 38L, f7g4_19 = f7 * g4 * 19L, f7g5_38 = f7 * g5 * 38L, f7g6_19 = f7 * g6 * 19L, f7g7_38 = f7 * g7 * 38L, f7g8_19 = f7 * g8 * 19L, f7g9_38 = f7 * g9 * 38L;
|
||||
long f8g0 = f8 * g0, f8g1 = f8 * g1, f8g2_19 = f8 * g2 * 19L, f8g3_19 = f8 * g3 * 38L >> 1L, f8g4_19 = f8 * g4 * 19L, f8g5_19 = f8 * g5 * 38L >> 1L, f8g6_19 = f8 * g6 * 19L, f8g7_19 = f8 * g7 * 38L >> 1L, f8g8_19 = f8 * g8 * 19L, f8g9_19 = f8 * g9 * 38L >> 1L;
|
||||
long f9g0 = f9 * g0, f9g1_38 = f9 * g1 * 38L, f9g2_19 = f9 * g2 * 19L, f9g3_38 = f9 * g3 * 38L, f9g4_19 = f9 * g4 * 19L, f9g5_38 = f9 * g5 * 38L, f9g6_19 = f9 * g6 * 19L, f9g7_38 = f9 * g7 * 38L, f9g8_19 = f9 * g8 * 19L, f9g9_38 = f9 * g9 * 38L;
|
||||
long h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
|
||||
long h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
|
||||
long h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
|
||||
long h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
|
||||
long h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
|
||||
long h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
|
||||
long h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
|
||||
long h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19;
|
||||
long h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38;
|
||||
long h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0;
|
||||
long carry = (h0 + 33554432L) >> 26L;
|
||||
h1 += carry;
|
||||
h0 -= carry << 26L;
|
||||
carry = (h4 + 33554432L) >> 26L;
|
||||
h5 += carry;
|
||||
h4 -= carry << 26L;
|
||||
carry = (h1 + 16777216L) >> 25L;
|
||||
h2 += carry;
|
||||
h1 -= carry << 25L;
|
||||
carry = (h5 + 16777216L) >> 25L;
|
||||
h6 += carry;
|
||||
h5 -= carry << 25L;
|
||||
carry = (h2 + 33554432L) >> 26L;
|
||||
h3 += carry;
|
||||
h2 -= carry << 26L;
|
||||
carry = (h6 + 33554432L) >> 26L;
|
||||
h7 += carry;
|
||||
h6 -= carry << 26L;
|
||||
carry = (h3 + 16777216L) >> 25L;
|
||||
h4 += carry;
|
||||
h3 -= carry << 25L;
|
||||
carry = (h7 + 16777216L) >> 25L;
|
||||
h8 += carry;
|
||||
h7 -= carry << 25L;
|
||||
carry = (h4 + 33554432L) >> 26L;
|
||||
h5 += carry;
|
||||
h4 -= carry << 26L;
|
||||
carry = (h8 + 33554432L) >> 26L;
|
||||
h9 += carry;
|
||||
h8 -= carry << 26L;
|
||||
carry = (h9 + 16777216L) >> 25L;
|
||||
h0 += carry * 19L;
|
||||
h9 -= carry << 25L;
|
||||
carry = (h0 + 33554432L) >> 26L;
|
||||
h1 += carry;
|
||||
h0 -= carry << 26L;
|
||||
h[0] = static_cast<int>(h0);
|
||||
h[1] = static_cast<int>(h1);
|
||||
h[2] = static_cast<int>(h2);
|
||||
h[3] = static_cast<int>(h3);
|
||||
h[4] = static_cast<int>(h4);
|
||||
h[5] = static_cast<int>(h5);
|
||||
h[6] = static_cast<int>(h6);
|
||||
h[7] = static_cast<int>(h7);
|
||||
h[8] = static_cast<int>(h8);
|
||||
h[9] = static_cast<int>(h9);
|
||||
}
|
||||
// h = -f
|
||||
void __host__ __device__ fe_neg(fe& __restrict__ h, const fe& __restrict__ f) {
|
||||
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
|
||||
long h0 = -f0, h1 = -f1, h2 = -f2, h3 = -f3, h4 = -f4, h5 = -f5, h6 = -f6, h7 = -f7, h8 = -f8, h9 = -f9;
|
||||
h[0] = h0;
|
||||
h[1] = h1;
|
||||
h[2] = h2;
|
||||
h[3] = h3;
|
||||
h[4] = h4;
|
||||
h[5] = h5;
|
||||
h[6] = h6;
|
||||
h[7] = h7;
|
||||
h[8] = h8;
|
||||
h[9] = h9;
|
||||
}
|
||||
void __host__ __device__ fe_mul2(fe& __restrict__ h, const fe& __restrict__ f) {
|
||||
const long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
|
||||
long f0_2 = f0 << 1, f1_2 = f1 << 1, f2_2 = f2 << 1, f3_2 = f3 << 1, f4_2 = f4 << 1, f5_2 = f5 << 1, f6_2 = f6 << 1, f7_2 = f7 << 1;
|
||||
long f5_38 = 38 * f5, f6_19 = 19 * f6, f7_38 = 38 * f7, f8_19 = 19 * f8, f9_38 = 38 * f9;
|
||||
long f0f0 = f0 * f0, f0f1_2 = f0_2 * f1, f0f2_2 = f0_2 * f2, f0f3_2 = f0_2 * f3, f0f4_2 = f0_2 * f4, f0f5_2 = f0_2 * f5, f0f6_2 = f0_2 * f6, f0f7_2 = f0_2 * f7, f0f8_2 = f0_2 * f8, f0f9_2 = f0_2 * f9;
|
||||
long f1f1_2 = f1_2 * f1, f1f2_2 = f1_2 * f2, f1f3_4 = f1_2 * f3_2, f1f4_2 = f1_2 * f4, f1f5_4 = f1_2 * f5_2, f1f6_2 = f1_2 * f6, f1f7_4 = f1_2 * f7_2, f1f8_2 = f1_2 * f8, f1f9_76 = f1_2 * f9_38;
|
||||
long f2f2 = f2 * f2, f2f3_2 = f2_2 * f3, f2f4_2 = f2_2 * f4, f2f5_2 = f2_2 * f5, f2f6_2 = f2_2 * f6, f2f7_2 = f2_2 * f7, f2f8_38 = f2_2 * f8_19, f2f9_38 = f2 * f9_38;
|
||||
long f3f3_2 = f3_2 * f3, f3f4_2 = f3_2 * f4, f3f5_4 = f3_2 * f5_2, f3f6_2 = f3_2 * f6, f3f7_76 = f3_2 * f7_38, f3f8_38 = f3_2 * f8_19, f3f9_76 = f3_2 * f9_38;
|
||||
long f4f4 = f4 * f4, f4f5_2 = f4_2 * f5, f4f6_38 = f4_2 * f6_19, f4f7_38 = f4 * f7_38, f4f8_38 = f4_2 * f8_19, f4f9_38 = f4 * f9_38;
|
||||
long f5f5_38 = f5 * f5_38, f5f6_38 = f5_2 * f6_19, f5f7_76 = f5_2 * f7_38, f5f8_38 = f5_2 * f8_19, f5f9_76 = f5_2 * f9_38;
|
||||
long f6f6_19 = f6 * f6_19, f6f7_38 = f6 * f7_38, f6f8_38 = f6_2 * f8_19, f6f9_38 = f6 * f9_38;
|
||||
long f7f7_38 = f7 * f7_38, f7f8_38 = f7_2 * f8_19, f7f9_76 = f7_2 * f9_38;
|
||||
long f8f8_19 = f8 * f8_19, f8f9_38 = f8 * f9_38;
|
||||
long f9f9_38 = f9 * f9_38;
|
||||
long h0 = (f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38) << 1;
|
||||
long h1 = (f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38) << 1;
|
||||
long h2 = (f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19) << 1;
|
||||
long h3 = (f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38) << 1;
|
||||
long h4 = (f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38) << 1;
|
||||
long h5 = (f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38) << 1;
|
||||
long h6 = (f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19) << 1;
|
||||
long h7 = (f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38) << 1;
|
||||
long h8 = (f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38) << 1;
|
||||
long h9 = (f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2) << 1;
|
||||
long carry = (h0 + 33554432L) >> 26L;
|
||||
h1 += carry;
|
||||
h0 -= carry << 26L;
|
||||
carry = (h4 + 33554432L) >> 26L;
|
||||
h5 += carry;
|
||||
h4 -= carry << 26L;
|
||||
carry = (h1 + 16777216L) >> 25L;
|
||||
h2 += carry;
|
||||
h1 -= carry << 25L;
|
||||
carry = (h5 + 16777216L) >> 25L;
|
||||
h6 += carry;
|
||||
h5 -= carry << 25L;
|
||||
carry = (h2 + 33554432L) >> 26L;
|
||||
h3 += carry;
|
||||
h2 -= carry << 26L;
|
||||
carry = (h6 + 33554432L) >> 26L;
|
||||
h7 += carry;
|
||||
h6 -= carry << 26L;
|
||||
carry = (h3 + 16777216L) >> 25L;
|
||||
h4 += carry;
|
||||
h3 -= carry << 25L;
|
||||
carry = (h7 + 16777216L) >> 25L;
|
||||
h8 += carry;
|
||||
h7 -= carry << 25L;
|
||||
carry = (h4 + 33554432L) >> 26L;
|
||||
h5 += carry;
|
||||
h4 -= carry << 26L;
|
||||
carry = (h8 + 33554432L) >> 26L;
|
||||
h9 += carry;
|
||||
h8 -= carry << 26L;
|
||||
carry = (h9 + 16777216L) >> 25L;
|
||||
h0 += carry * 19L;
|
||||
h9 -= carry << 25L;
|
||||
carry = (h0 + 33554432L) >> 26L;
|
||||
h1 += carry;
|
||||
h0 -= carry << 26L;
|
||||
h[0] = static_cast<int>(h0);
|
||||
h[1] = static_cast<int>(h1);
|
||||
h[2] = static_cast<int>(h2);
|
||||
h[3] = static_cast<int>(h3);
|
||||
h[4] = static_cast<int>(h4);
|
||||
h[5] = static_cast<int>(h5);
|
||||
h[6] = static_cast<int>(h6);
|
||||
h[7] = static_cast<int>(h7);
|
||||
h[8] = static_cast<int>(h8);
|
||||
h[9] = static_cast<int>(h9);
|
||||
}
|
||||
void __device__ __host__ fe_sub(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
|
||||
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
|
||||
long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
|
||||
long h0 = f0 - g0, h1 = f1 - g1, h2 = f2 - g2, h3 = f3 - g3, h4 = f4 - g4, h5 = f5 - g5, h6 = f6 - g6, h7 = f7 - g7, h8 = f8 - g8, h9 = f9 - g9;
|
||||
h[0] = h0;
|
||||
h[1] = h1;
|
||||
h[2] = h2;
|
||||
h[3] = h3;
|
||||
h[4] = h4;
|
||||
h[5] = h5;
|
||||
h[6] = h6;
|
||||
h[7] = h7;
|
||||
h[8] = h8;
|
||||
h[9] = h9;
|
||||
}
|
||||
void __device__ __host__ fe_tobytes(unsigned char* __restrict__ s, const fe& __restrict__ h) {
|
||||
long h0 = h[0], h1 = h[1], h2 = h[2], h3 = h[3], h4 = h[4], h5 = h[5], h6 = h[6], h7 = h[7], h8 = h[8], h9 = h[9], carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9, q;
|
||||
q = (19 * h9 + (1 << 24)) >> 25;
|
||||
q = (h0 + q) >> 26;
|
||||
q = (h1 + q) >> 25;
|
||||
q = (h2 + q) >> 26;
|
||||
q = (h3 + q) >> 25;
|
||||
q = (h4 + q) >> 26;
|
||||
q = (h5 + q) >> 25;
|
||||
q = (h6 + q) >> 26;
|
||||
q = (h7 + q) >> 25;
|
||||
q = (h8 + q) >> 26;
|
||||
q = (h9 + q) >> 25;
|
||||
h0 += 19 * q;
|
||||
carry0 = h0 >> 26;
|
||||
h1 += carry0;
|
||||
h0 -= carry0 << 26;
|
||||
carry1 = h1 >> 25;
|
||||
h2 += carry1;
|
||||
h1 -= carry1 << 25;
|
||||
carry2 = h2 >> 26;
|
||||
h3 += carry2;
|
||||
h2 -= carry2 << 26;
|
||||
carry3 = h3 >> 25;
|
||||
h4 += carry3;
|
||||
h3 -= carry3 << 25;
|
||||
carry4 = h4 >> 26;
|
||||
h5 += carry4;
|
||||
h4 -= carry4 << 26;
|
||||
carry5 = h5 >> 25;
|
||||
h6 += carry5;
|
||||
h5 -= carry5 << 25;
|
||||
carry6 = h6 >> 26;
|
||||
h7 += carry6;
|
||||
h6 -= carry6 << 26;
|
||||
carry7 = h7 >> 25;
|
||||
h8 += carry7;
|
||||
h7 -= carry7 << 25;
|
||||
carry8 = h8 >> 26;
|
||||
h9 += carry8;
|
||||
h8 -= carry8 << 26;
|
||||
carry9 = h9 >> 25;
|
||||
h9 -= carry9 << 25;
|
||||
s[0] = static_cast<unsigned char>(h0);
|
||||
s[1] = static_cast<unsigned char>(h0 >> 8);
|
||||
s[2] = static_cast<unsigned char>(h0 >> 16);
|
||||
s[3] = static_cast<unsigned char>((h0 >> 24) | (h1 << 2));
|
||||
s[4] = static_cast<unsigned char>(h1 >> 6);
|
||||
s[5] = static_cast<unsigned char>(h1 >> 14);
|
||||
s[6] = static_cast<unsigned char>((h1 >> 22) | (h2 << 3));
|
||||
s[7] = static_cast<unsigned char>(h2 >> 5);
|
||||
s[8] = static_cast<unsigned char>(h2 >> 13);
|
||||
s[9] = static_cast<unsigned char>((h2 >> 21) | (h3 << 5));
|
||||
s[10] = static_cast<unsigned char>(h3 >> 3);
|
||||
s[11] = static_cast<unsigned char>(h3 >> 11);
|
||||
s[12] = static_cast<unsigned char>((h3 >> 19) | (h4 << 6));
|
||||
s[13] = static_cast<unsigned char>(h4 >> 2);
|
||||
s[14] = static_cast<unsigned char>(h4 >> 10);
|
||||
s[15] = static_cast<unsigned char>(h4 >> 18);
|
||||
s[16] = static_cast<unsigned char>(h5);
|
||||
s[17] = static_cast<unsigned char>(h5 >> 8);
|
||||
s[18] = static_cast<unsigned char>(h5 >> 16);
|
||||
s[19] = static_cast<unsigned char>((h5 >> 24) | (h6 << 1));
|
||||
s[20] = static_cast<unsigned char>(h6 >> 7);
|
||||
s[21] = static_cast<unsigned char>(h6 >> 15);
|
||||
s[22] = static_cast<unsigned char>((h6 >> 23) | (h7 << 3));
|
||||
s[23] = static_cast<unsigned char>(h7 >> 5);
|
||||
s[24] = static_cast<unsigned char>(h7 >> 13);
|
||||
s[25] = static_cast<unsigned char>((h7 >> 21) | (h8 << 4));
|
||||
s[26] = static_cast<unsigned char>(h8 >> 4);
|
||||
s[27] = static_cast<unsigned char>(h8 >> 12);
|
||||
s[28] = static_cast<unsigned char>((h8 >> 20) | (h9 << 6));
|
||||
s[29] = static_cast<unsigned char>(h9 >> 2);
|
||||
s[30] = static_cast<unsigned char>(h9 >> 10);
|
||||
s[31] = static_cast<unsigned char>(h9 >> 18);
|
||||
}
|
||||
@@ -1,11 +1,16 @@
|
||||
#ifndef __F25519_CUH
|
||||
#define __F25519_CUH
|
||||
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a);
|
||||
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond);
|
||||
__device__ void f25519_normalize(unsigned char* __restrict__ x);
|
||||
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b);
|
||||
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b);
|
||||
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a);
|
||||
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b);
|
||||
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x);
|
||||
#endif
|
||||
using fe = signed long[10];
|
||||
void __device__ __host__ fe_0(fe& __restrict__ h);
|
||||
void __device__ __host__ fe_1(fe& __restrict__ h);
|
||||
void __device__ __host__ fe_tobytes(unsigned char* __restrict__ s, const fe& __restrict__ h);
|
||||
void __device__ __host__ fe_copy(fe& __restrict__ h, const fe& __restrict__ f);
|
||||
int __device__ __host__ fe_isnegative(const fe& __restrict__ f);
|
||||
void __device__ __host__ fe_cmov(fe& __restrict__ f, const fe& __restrict__ g, const unsigned b);
|
||||
void __device__ __host__ fe_neg(fe& __restrict__ h, const fe& __restrict__ f);
|
||||
void __device__ __host__ fe_add(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g);
|
||||
void __device__ __host__ fe_invert(fe& __restrict__ out, const fe& __restrict__ z);
|
||||
void __device__ __host__ fe_mul2(fe& __restrict__ h, const fe& __restrict__ f);
|
||||
void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g);
|
||||
void __device__ __host__ fe_sub(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g);
|
||||
#endif
|
||||
|
||||
@@ -1,27 +1,28 @@
|
||||
#include <keymanip.cuh>
|
||||
static __constant__ const char* hexDigits = "0123456789abcdef";
|
||||
__device__ ds64 ktos(const unsigned char* key) noexcept {
|
||||
ds64 str;
|
||||
const char* hexDigits = "0123456789abcdef";
|
||||
#pragma unroll 32
|
||||
for (unsigned char i = 0; i < 32; i++) {
|
||||
for (signed int i = 0; i < 32; i++) {
|
||||
str.data[2 * i] = hexDigits[key[i] >> 4];
|
||||
str.data[2 * i + 1] = hexDigits[key[i] & 0x0F];
|
||||
}
|
||||
str.data[65] = '\0';
|
||||
return str;
|
||||
}
|
||||
__device__ ds46 getAddr(const unsigned char rawAddr[16]) noexcept {
|
||||
__device__ ds46 getAddr(const Addr16 rawAddr) noexcept {
|
||||
ds46 addrStr;
|
||||
const char* hexDigits = "0123456789abcdef";
|
||||
unsigned pos = 0;
|
||||
#pragma unroll 8
|
||||
for (unsigned char group = 0; group < 8; group++) {
|
||||
int idx = group * 2;
|
||||
for (signed int group = 0; group < 8; group++) {
|
||||
int idx = group << 1;
|
||||
addrStr.data[pos++] = hexDigits[rawAddr[idx] >> 4];
|
||||
addrStr.data[pos++] = hexDigits[rawAddr[idx] & 0x0F];
|
||||
addrStr.data[pos++] = hexDigits[rawAddr[idx + 1] >> 4];
|
||||
addrStr.data[pos++] = hexDigits[rawAddr[idx + 1] & 0x0F];
|
||||
if (group < 7) { addrStr.data[pos++] = ':'; }
|
||||
if (group < 7) {
|
||||
addrStr.data[pos++] = ':';
|
||||
}
|
||||
}
|
||||
addrStr.data[pos] = '\0';
|
||||
return addrStr;
|
||||
@@ -31,7 +32,7 @@ __device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawA
|
||||
const int bitsToShift = lErase & 7;
|
||||
const int start = lErase >> 3;
|
||||
if (bitsToShift) {
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int i = start; i < start + 15; i++) {
|
||||
InvertedPublicKey[i] = static_cast<unsigned char>((InvertedPublicKey[i] << bitsToShift) | (InvertedPublicKey[i + 1] >> (8 - bitsToShift)));
|
||||
}
|
||||
@@ -41,6 +42,9 @@ __device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawA
|
||||
memcpy(&rawAddr[2], &InvertedPublicKey[start], 14);
|
||||
}
|
||||
__device__ void invertKey(const unsigned char* key, unsigned char* inverted) {
|
||||
#pragma unroll 32
|
||||
for (unsigned char i = 0; i < 32; i++) inverted[i] = key[i] ^ 0xFF;
|
||||
#pragma unroll 8
|
||||
for (int i = 0; i < 32; i += 4) {
|
||||
uchar4 k = *(reinterpret_cast<const uchar4*>(&key[i]));
|
||||
*(reinterpret_cast<uchar4*>(&inverted[i])) = make_uchar4(k.x ^ 0xFF, k.y ^ 0xFF, k.z ^ 0xFF, k.w ^ 0xFF);
|
||||
}
|
||||
}
|
||||
@@ -9,11 +9,10 @@ struct ds46 {
|
||||
using Addr16 = unsigned char[16];
|
||||
using Key32 = unsigned char[32];
|
||||
struct KeysBox32 {
|
||||
Key32 PublicKey;
|
||||
Key32 PrivateKey;
|
||||
Key32 PublicKey, PrivateKey;
|
||||
};
|
||||
__device__ ds64 ktos(const unsigned char* key) noexcept;
|
||||
__device__ ds46 getAddr(const unsigned char rawAddr[16]) noexcept;
|
||||
__device__ ds46 getAddr(const Addr16 rawAddr) noexcept;
|
||||
__device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawAddr) noexcept;
|
||||
__device__ void invertKey(const unsigned char* key, unsigned char* inverted);
|
||||
#endif
|
||||
1389
libs/precomp_data.h
Normal file
1389
libs/precomp_data.h
Normal file
File diff suppressed because it is too large
Load Diff
150
libs/sha512.cu
150
libs/sha512.cu
@@ -1,51 +1,109 @@
|
||||
#include <sha512.cuh>
|
||||
__device__ __constant__ sha512_state sha512_initial_state = { {
|
||||
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
|
||||
0x6a09e667f3bcc908ULL,
|
||||
0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL,
|
||||
0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL,
|
||||
0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL,
|
||||
0x5be0cd19137e2179ULL,
|
||||
} };
|
||||
__device__ __constant__ unsigned long round_k[80] = {
|
||||
0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
|
||||
0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
|
||||
0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
|
||||
0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
|
||||
0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
|
||||
0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
|
||||
0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
|
||||
0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
|
||||
0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
|
||||
0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
|
||||
0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
|
||||
0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
|
||||
0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
|
||||
0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
|
||||
0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
|
||||
0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
|
||||
0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
|
||||
0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
|
||||
0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
|
||||
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
|
||||
0x428a2f98d728ae22ULL,
|
||||
0x7137449123ef65cdULL,
|
||||
0xb5c0fbcfec4d3b2fULL,
|
||||
0xe9b5dba58189dbbcULL,
|
||||
0x3956c25bf348b538ULL,
|
||||
0x59f111f1b605d019ULL,
|
||||
0x923f82a4af194f9bULL,
|
||||
0xab1c5ed5da6d8118ULL,
|
||||
0xd807aa98a3030242ULL,
|
||||
0x12835b0145706fbeULL,
|
||||
0x243185be4ee4b28cULL,
|
||||
0x550c7dc3d5ffb4e2ULL,
|
||||
0x72be5d74f27b896fULL,
|
||||
0x80deb1fe3b1696b1ULL,
|
||||
0x9bdc06a725c71235ULL,
|
||||
0xc19bf174cf692694ULL,
|
||||
0xe49b69c19ef14ad2ULL,
|
||||
0xefbe4786384f25e3ULL,
|
||||
0x0fc19dc68b8cd5b5ULL,
|
||||
0x240ca1cc77ac9c65ULL,
|
||||
0x2de92c6f592b0275ULL,
|
||||
0x4a7484aa6ea6e483ULL,
|
||||
0x5cb0a9dcbd41fbd4ULL,
|
||||
0x76f988da831153b5ULL,
|
||||
0x983e5152ee66dfabULL,
|
||||
0xa831c66d2db43210ULL,
|
||||
0xb00327c898fb213fULL,
|
||||
0xbf597fc7beef0ee4ULL,
|
||||
0xc6e00bf33da88fc2ULL,
|
||||
0xd5a79147930aa725ULL,
|
||||
0x06ca6351e003826fULL,
|
||||
0x142929670a0e6e70ULL,
|
||||
0x27b70a8546d22ffcULL,
|
||||
0x2e1b21385c26c926ULL,
|
||||
0x4d2c6dfc5ac42aedULL,
|
||||
0x53380d139d95b3dfULL,
|
||||
0x650a73548baf63deULL,
|
||||
0x766a0abb3c77b2a8ULL,
|
||||
0x81c2c92e47edaee6ULL,
|
||||
0x92722c851482353bULL,
|
||||
0xa2bfe8a14cf10364ULL,
|
||||
0xa81a664bbc423001ULL,
|
||||
0xc24b8b70d0f89791ULL,
|
||||
0xc76c51a30654be30ULL,
|
||||
0xd192e819d6ef5218ULL,
|
||||
0xd69906245565a910ULL,
|
||||
0xf40e35855771202aULL,
|
||||
0x106aa07032bbd1b8ULL,
|
||||
0x19a4c116b8d2d0c8ULL,
|
||||
0x1e376c085141ab53ULL,
|
||||
0x2748774cdf8eeb99ULL,
|
||||
0x34b0bcb5e19b48a8ULL,
|
||||
0x391c0cb3c5c95a63ULL,
|
||||
0x4ed8aa4ae3418acbULL,
|
||||
0x5b9cca4f7763e373ULL,
|
||||
0x682e6ff3d6b2b8a3ULL,
|
||||
0x748f82ee5defb2fcULL,
|
||||
0x78a5636f43172f60ULL,
|
||||
0x84c87814a1f0ab72ULL,
|
||||
0x8cc702081a6439ecULL,
|
||||
0x90befffa23631e28ULL,
|
||||
0xa4506cebde82bde9ULL,
|
||||
0xbef9a3f7b2c67915ULL,
|
||||
0xc67178f2e372532bULL,
|
||||
0xca273eceea26619cULL,
|
||||
0xd186b8c721c0c207ULL,
|
||||
0xeada7dd6cde0eb1eULL,
|
||||
0xf57d4f7fee6ed178ULL,
|
||||
0x06f067aa72176fbaULL,
|
||||
0x0a637dc5a2c898a6ULL,
|
||||
0x113f9804bef90daeULL,
|
||||
0x1b710b35131c471bULL,
|
||||
0x28db77f523047d84ULL,
|
||||
0x32caab7b40c72493ULL,
|
||||
0x3c9ebe0a15c9bebcULL,
|
||||
0x431d67c49c100d4cULL,
|
||||
0x4cc5d4becb3e42b6ULL,
|
||||
0x597f299cfc657e2aULL,
|
||||
0x5fcb6fab3ad6faecULL,
|
||||
0x6c44198c4a475817ULL,
|
||||
};
|
||||
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) {
|
||||
return (static_cast<unsigned long>(x[0]) << 56) | (static_cast<unsigned long>(x[1]) << 48) |
|
||||
(static_cast<unsigned long>(x[2]) << 40) | (static_cast<unsigned long>(x[3]) << 32) |
|
||||
(static_cast<unsigned long>(x[4]) << 24) | (static_cast<unsigned long>(x[5]) << 16) |
|
||||
(static_cast<unsigned long>(x[6]) << 8) | (static_cast<unsigned long>(x[7]));
|
||||
return (static_cast<unsigned long>(x[0]) << 56) | (static_cast<unsigned long>(x[1]) << 48) | (static_cast<unsigned long>(x[2]) << 40) | (static_cast<unsigned long>(x[3]) << 32) | (static_cast<unsigned long>(x[4]) << 24) | (static_cast<unsigned long>(x[5]) << 16) | (static_cast<unsigned long>(x[6]) << 8) | (static_cast<unsigned long>(x[7]));
|
||||
}
|
||||
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) {
|
||||
__device__ inline void store64(unsigned char* __restrict__ x, unsigned long v) {
|
||||
#pragma unroll 8
|
||||
for (int i = 0; i < 8; i++) {
|
||||
x[i] = (unsigned char)(v >> (56 - i * 8));
|
||||
}
|
||||
for (unsigned char i = 0; i < 8; i++)
|
||||
x[i] = static_cast<unsigned char>(v >> (56 - i * 8));
|
||||
}
|
||||
__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) {
|
||||
return (x >> bits) | (x << (64 - bits));
|
||||
}
|
||||
__device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk) {
|
||||
#define rot64(x, bits) ((x >> bits) | (x << (64 - bits)))
|
||||
__device__ void sha512_block(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk) {
|
||||
unsigned long w[16];
|
||||
#pragma unroll 16
|
||||
for (int i = 0; i < 16; i++) {
|
||||
for (unsigned char i = 0; i < 16; i++) {
|
||||
w[i] = load64(blk + i * 8);
|
||||
}
|
||||
unsigned long a = s->h[0];
|
||||
@@ -57,7 +115,7 @@ __device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const
|
||||
unsigned long g = s->h[6];
|
||||
unsigned long h = s->h[7];
|
||||
#pragma unroll 80
|
||||
for (int i = 0; i < 80; i++) {
|
||||
for (unsigned char i = 0; i < 80; i++) {
|
||||
const int idx = i & 15;
|
||||
const int idx1 = (i + 1) & 15;
|
||||
const int idx7 = (i + 9) & 15;
|
||||
@@ -87,7 +145,7 @@ __device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const
|
||||
s->h[6] += g;
|
||||
s->h[7] += h;
|
||||
}
|
||||
__device__ void sha512_final(sha512_state* s, const unsigned char* blk) {
|
||||
__device__ void sha512_final(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk) {
|
||||
unsigned char temp[128];
|
||||
unsigned long last_size = 32 & (128 - 1);
|
||||
memcpy(temp, blk, last_size);
|
||||
@@ -99,23 +157,23 @@ __device__ void sha512_final(sha512_state* s, const unsigned char* blk) {
|
||||
store64(temp + 128 - 8, 256);
|
||||
sha512_block(s, temp);
|
||||
}
|
||||
__device__ void sha512_get(const sha512_state* s, unsigned char* hash) {
|
||||
__device__ void sha512_get(const sha512_state* __restrict__ s, unsigned char* __restrict__ hash) {
|
||||
unsigned len = 64;
|
||||
if (len > 128) len = 128;
|
||||
if (len > 128)
|
||||
len = 128;
|
||||
unsigned i = 0, c = (len < 8) ? len : 8;
|
||||
store64(hash, s->h[i]);
|
||||
hash += c;
|
||||
len -= c;
|
||||
i++;
|
||||
#pragma unroll
|
||||
while (len >= 8) {
|
||||
store64(hash, s->h[i]);
|
||||
hash += 8;
|
||||
len -= 8;
|
||||
i++;
|
||||
}
|
||||
if (len > 0) {
|
||||
unsigned char tmp[8];
|
||||
store64(tmp, s->h[i]);
|
||||
memcpy(hash, tmp, len);
|
||||
}
|
||||
unsigned char tmp[8];
|
||||
store64(tmp, s->h[i]);
|
||||
memcpy(hash, tmp, len);
|
||||
}
|
||||
@@ -5,10 +5,9 @@ struct sha512_state {
|
||||
};
|
||||
extern __device__ __constant__ sha512_state sha512_initial_state;
|
||||
extern __device__ __constant__ unsigned long round_k[80];
|
||||
__device__ __forceinline__ unsigned long load64(const unsigned char* x);
|
||||
__device__ __forceinline__ void store64(unsigned char* x, unsigned long v);
|
||||
__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits);
|
||||
__device__ __forceinline__ void sha512_block(sha512_state* s, const unsigned char* blk);
|
||||
__device__ void sha512_final(sha512_state* s, const unsigned char* blk);
|
||||
__device__ void sha512_get(const sha512_state* s, unsigned char* hash);
|
||||
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x);
|
||||
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v);
|
||||
__device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk);
|
||||
__device__ void sha512_final(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk);
|
||||
__device__ void sha512_get(const sha512_state* __restrict__ s, unsigned char* __restrict__ hash);
|
||||
#endif
|
||||
@@ -1,18 +1,22 @@
|
||||
__device__ int cstring_length(const char* s) {
|
||||
int len = 0;
|
||||
while (s[len]) len++;
|
||||
while (s[len])
|
||||
len++;
|
||||
return len;
|
||||
}
|
||||
__device__ int cstring_find(const char* s, const char* sub) {
|
||||
int i, j;
|
||||
int n = cstring_length(s);
|
||||
int m = cstring_length(sub);
|
||||
if (m == 0) return 0;
|
||||
if (m == 0)
|
||||
return 0;
|
||||
for (i = 0; i <= n - m; i++) {
|
||||
for (j = 0; j < m; j++) {
|
||||
if (s[i + j] != sub[j]) break;
|
||||
if (s[i + j] != sub[j])
|
||||
break;
|
||||
}
|
||||
if (j == m) return i;
|
||||
if (j == m)
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
@@ -22,7 +26,8 @@ __device__ int cstring_to_ull(const char* s, unsigned* val) {
|
||||
if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
|
||||
i = 2;
|
||||
}
|
||||
if (s[i] == '\0') return 1;
|
||||
if (s[i] == '\0')
|
||||
return 1;
|
||||
for (; s[i]; i++) {
|
||||
char c = s[i];
|
||||
int digit;
|
||||
@@ -60,7 +65,8 @@ __device__ void concat(const char* s1, const char* s2, char* out, int outSize) {
|
||||
}
|
||||
while (s2[j] && i < outSize - 1) {
|
||||
out[i] = s2[j];
|
||||
i++; j++;
|
||||
i++;
|
||||
j++;
|
||||
}
|
||||
out[i] = '\0';
|
||||
}
|
||||
|
||||
59
meson.build
Normal file
59
meson.build
Normal file
@@ -0,0 +1,59 @@
|
||||
project(
|
||||
'yggm',
|
||||
['cpp', 'cuda'],
|
||||
version: '20.08.2025',
|
||||
default_options: ['cpp_std=c++20'],
|
||||
)
|
||||
|
||||
cpp = meson.get_compiler('cpp')
|
||||
cuda = meson.get_compiler('cuda')
|
||||
|
||||
common_cpp_flags = [
|
||||
'-march=native',
|
||||
'-ffast-math',
|
||||
'-pipe',
|
||||
'-funroll-loops',
|
||||
'-Wpedantic',
|
||||
'-Wconversion',
|
||||
'-Wuninitialized',
|
||||
'-Wsign-conversion',
|
||||
'-masm=intel',
|
||||
'-I../libs/',
|
||||
]
|
||||
|
||||
common_cuda_flags = [
|
||||
'-rdc=true',
|
||||
'-Xptxas', '-O3',
|
||||
'-use_fast_math',
|
||||
'-ftz=true',
|
||||
'-prec-div=false',
|
||||
'-prec-sqrt=false',
|
||||
'-gencode', 'arch=compute_75,code=sm_75',
|
||||
'-Wno-deprecated-gpu-targets',
|
||||
'--expt-relaxed-constexpr',
|
||||
'-I../libs/',
|
||||
]
|
||||
|
||||
add_project_arguments(common_cpp_flags, language: 'cpp')
|
||||
add_project_arguments(common_cuda_flags, language: 'cuda')
|
||||
|
||||
if get_option('buildtype') == 'release'
|
||||
add_project_arguments(['-DRELEASE'], language: ['cpp', 'cuda'])
|
||||
elif get_option('buildtype') == 'debug'
|
||||
add_project_arguments(['-DDEBUG'], language: ['cpp', 'cuda'])
|
||||
endif
|
||||
|
||||
cpp_sources = ['sources/main.cpp'] + files(run_command('find', 'libs', '-name','*.h').stdout().split())
|
||||
cuda_sources = ['sources/main.cu'] + files(run_command('find', 'libs', '-name', '*.cu').stdout().split())
|
||||
|
||||
executable(
|
||||
'yggmcu',
|
||||
sources: cuda_sources,
|
||||
link_args: ['-lcudart', '-L/usr/local/cuda/lib64'],
|
||||
)
|
||||
|
||||
executable(
|
||||
'yggmc',
|
||||
sources: cpp_sources,
|
||||
dependencies: dependency('libsodium'),
|
||||
)
|
||||
4
native.ini
Normal file
4
native.ini
Normal file
@@ -0,0 +1,4 @@
|
||||
[binaries]
|
||||
cpp = 'clang++'
|
||||
cuda = '/opt/cuda/bin/nvcc'
|
||||
cuda_host_compiler = '/opt/cuda/bin/nvcc'
|
||||
121
sources/main.cu
121
sources/main.cu
@@ -1,24 +1,27 @@
|
||||
#include <stdio.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <curand_kernel.h>
|
||||
#include <sha512.cuh>
|
||||
#include <defines.h>
|
||||
#include <ed25519.cuh>
|
||||
#include <edsign.cuh>
|
||||
#include <string.cuh>
|
||||
#include <keymanip.cuh>
|
||||
__device__ unsigned d_high = 0x10;
|
||||
#include <sha512.cuh>
|
||||
#include <stdio.h>
|
||||
#include <string.cuh>
|
||||
__device__ unsigned d_high = 0x14;
|
||||
__device__ int parameters(const char* arg) noexcept {
|
||||
if ((cstring_find(arg, "--altitude") == 0 && cstring_length(arg) == 10) || (cstring_find(arg, "-a") == 0 && cstring_length(arg) == 2)) {
|
||||
return 777;
|
||||
}
|
||||
int space_index = cstring_find(arg, " ");
|
||||
if (space_index == -1) return 0;
|
||||
if (space_index == -1)
|
||||
return 0;
|
||||
const int substr_start = space_index + 1;
|
||||
char sub_arg[256];
|
||||
extract_substring(arg, substr_start, sub_arg, 256);
|
||||
if (cstring_find(arg, "--altitude") != -1 || cstring_find(arg, "-a") != -1) {
|
||||
unsigned tmp_high;
|
||||
if (cstring_to_ull(sub_arg, &tmp_high) != 0) return 1;
|
||||
if (cstring_to_ull(sub_arg, &tmp_high) != 0)
|
||||
return 1;
|
||||
d_high = tmp_high;
|
||||
}
|
||||
return 0;
|
||||
@@ -45,59 +48,93 @@ __global__ void args(char** argv, int argc, int* result) {
|
||||
__device__ __forceinline__ unsigned char zeroCounter(unsigned int x) noexcept {
|
||||
return x ? static_cast<unsigned char>(__clz(x)) : 32;
|
||||
}
|
||||
__device__ __forceinline__ unsigned char getZeros(const unsigned char* v) noexcept {
|
||||
__device__ __forceinline__ unsigned char getZeros(const unsigned char* __restrict__ v) noexcept {
|
||||
unsigned char leadZeros = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 32; i += 4) {
|
||||
unsigned word = (static_cast<unsigned>(v[i]) << 24) | (static_cast<unsigned>(v[i + 1]) << 16) | (static_cast<unsigned>(v[i + 2]) << 8) | (static_cast<unsigned>(v[i + 3]));
|
||||
if (word == 0)
|
||||
#pragma unroll 8
|
||||
for (unsigned char i = 0; i < 32; i += 4) {
|
||||
unsigned w = (static_cast<unsigned>(v[i]) << 24) | (static_cast<unsigned>(v[i + 1]) << 16) | (static_cast<unsigned>(v[i + 2]) << 8) | (static_cast<unsigned>(v[i + 3]));
|
||||
if (w == 0)
|
||||
leadZeros += 32;
|
||||
else {
|
||||
leadZeros += zeroCounter(word);
|
||||
leadZeros += zeroCounter(w);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return leadZeros;
|
||||
}
|
||||
__global__ void initRand(curandState* rs) {
|
||||
__global__ void initRandSeed(curandState* __restrict__ states, const unsigned long seed) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curand_init(seed, idx, 0, &states[idx]);
|
||||
}
|
||||
__global__ void initRand(curandState* rs, unsigned int* d_seeds) {
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curand_init(clock64() + id * 7919ULL, id, 0, &rs[id]);
|
||||
#pragma unroll 10
|
||||
for (int i = 0; i < 10; i++) {
|
||||
curand(&rs[id]);
|
||||
}
|
||||
unsigned seed = curand(&rs[id]);
|
||||
d_seeds[id] = seed;
|
||||
}
|
||||
__device__ __forceinline__ void rmbytes(unsigned char* buf, curandState* state) {
|
||||
#pragma unroll 32
|
||||
for (unsigned long i = 0; i < 32; i++) {
|
||||
buf[i] = curand(state) & 0xFF;
|
||||
int checkSeeds(unsigned* __restrict__ seeds, int count) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
for (int j = i + 1; j < count; j++) {
|
||||
if (seeds[i] == seeds[j]) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
__device__ __forceinline__ void rmbytes(unsigned char* __restrict__ buf, curandState& __restrict__ state) {
|
||||
#pragma unroll 8
|
||||
for (int i = 0; i < 8; i++) {
|
||||
unsigned r = curand(&state);
|
||||
buf[i * 4] = static_cast<unsigned char>(r & 0xFF);
|
||||
buf[i * 4 + 1] = static_cast<unsigned char>((r >> 8) & 0xFF);
|
||||
buf[i * 4 + 2] = static_cast<unsigned char>((r >> 16) & 0xFF);
|
||||
buf[i * 4 + 3] = static_cast<unsigned char>((r >> 24) & 0xFF);
|
||||
}
|
||||
}
|
||||
__global__ void KeyGen(curandState* randStates) {
|
||||
curandState localState = randStates[blockIdx.x * blockDim.x + threadIdx.x];
|
||||
while (true) {
|
||||
KeysBox32 keys;
|
||||
__global__ void KeyGenKernel(curandState* __restrict__ randStates) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curandState localState = randStates[idx];
|
||||
#ifdef DEBUG
|
||||
unsigned x = 0;
|
||||
#endif
|
||||
while (WHCOND) {
|
||||
Key32 seed;
|
||||
rmbytes(seed, &localState);
|
||||
ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed);
|
||||
if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax((unsigned*)&d_high, zeros)) {
|
||||
KeysBox32 keys;
|
||||
rmbytes(seed, localState);
|
||||
ed25519_create_keypair(keys.PrivateKey, keys.PublicKey, seed);
|
||||
if (unsigned char zeros = getZeros(keys.PublicKey); zeros > atomicMax(&d_high, zeros)) {
|
||||
Addr16 raw;
|
||||
Key32 inv;
|
||||
invertKey(keys.PublicKey, inv);
|
||||
getRawAddress(zeros, inv, raw);
|
||||
#ifdef RELEASE
|
||||
printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data);
|
||||
#else
|
||||
printf("\nIPv6:\t%s\nFK:\t%s%s\n", getAddr(raw).data, ktos(keys.PrivateKey).data, ktos(keys.PublicKey).data);
|
||||
#endif
|
||||
d_high = zeros;
|
||||
}
|
||||
#ifdef DEBUG
|
||||
if ((++x & 0xFF) == 0) {
|
||||
printf("\rIters: %d", x);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
int main(int argc, char* argv[]) {
|
||||
const int thPerBlock = 256;
|
||||
int* d_result, mBpSM, h_high;
|
||||
printf("BuildType: %s\n", __BUILDTYPE__);
|
||||
int *d_result, mBpSM, h_high;
|
||||
char** d_argv;
|
||||
cudaDeviceProp prop;
|
||||
curandState* rst;
|
||||
cudaMalloc((void**)&d_result, sizeof(int));
|
||||
cudaMalloc((void**)&d_argv, argc * sizeof(char*));
|
||||
for (int i = 0; i < argc; i++) {
|
||||
for (unsigned char i = 0; i < argc; i++) {
|
||||
unsigned long len = strlen(argv[i]) + 1;
|
||||
char* d_str;
|
||||
cudaMalloc((void**)&d_str, len);
|
||||
@@ -107,15 +144,29 @@ int main(int argc, char* argv[]) {
|
||||
args<<<1, 1>>>(d_argv, argc, d_result);
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemcpyFromSymbol(&h_high, d_high, sizeof(unsigned));
|
||||
cudaGetDeviceProperties(&prop, 0);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGen, thPerBlock, 0);
|
||||
const int totalTh = mBpSM * prop.multiProcessorCount * thPerBlock;
|
||||
printf("High addrs: 2%02x+\nSMs: %d\nMaxBlocksPerSM: %d\nTotalTh: %d\nBlocksThreads: %d:%d\n", h_high, prop.multiProcessorCount, mBpSM, totalTh, totalTh / thPerBlock, thPerBlock);
|
||||
cudaMalloc(&rst, totalTh * sizeof(curandState));
|
||||
initRand<<<totalTh / thPerBlock, thPerBlock>>>(rst);
|
||||
cudaDeviceSynchronize();
|
||||
KeyGen<<<totalTh / thPerBlock, thPerBlock>>>(rst);
|
||||
cudaGetDeviceProperties_v2(&prop, 0);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADSPB, 0);
|
||||
const unsigned tTh = mBpSM * prop.multiProcessorCount * THREADSPB;
|
||||
printf("High Addresses: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocks: %d (Threads: %d)\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADSPB, THREADSPB);
|
||||
cudaMalloc(&rst, tTh * sizeof(curandState));
|
||||
unsigned* d_seeds;
|
||||
cudaMalloc(&d_seeds, tTh * sizeof(unsigned));
|
||||
initRand<<<THDIVTHPB, THREADSPB>>>(rst, d_seeds);
|
||||
cudaDeviceSynchronize();
|
||||
#ifndef DEBUG
|
||||
unsigned* h_seeds = static_cast<unsigned*>(malloc(tTh * sizeof(unsigned)));
|
||||
cudaMemcpy(h_seeds, d_seeds, tTh * sizeof(unsigned), cudaMemcpyDeviceToHost);
|
||||
if (checkSeeds(h_seeds, tTh)) {
|
||||
fprintf(stderr, "Error: DUPLICATE SEEDS FOUND!\n");
|
||||
free(h_seeds);
|
||||
cudaFree(d_seeds);
|
||||
cudaFree(rst);
|
||||
return 1;
|
||||
}
|
||||
free(h_seeds);
|
||||
cudaFree(d_seeds);
|
||||
#endif
|
||||
KeyGenKernel<<<THDIVTHPB, THREADSPB>>>(rst);
|
||||
cudaFree(rst);
|
||||
return 0;
|
||||
}
|
||||
@@ -1,14 +1,14 @@
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <random>
|
||||
#include <memory.h>
|
||||
#include <thread>
|
||||
#include <sodium.h>
|
||||
#include "defines.h"
|
||||
#include <arpa/inet.h>
|
||||
#include <atomic>
|
||||
#include <immintrin.h>
|
||||
#include <iostream>
|
||||
#include <memory.h>
|
||||
#include <sodium.h>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
struct option {
|
||||
unsigned proc = 0;
|
||||
std::atomic<unsigned> high = 0x14;
|
||||
@@ -20,21 +20,18 @@ int parameters(std::string arg) {
|
||||
std::istringstream ss(arg.substr(arg.find(" ") + 1));
|
||||
if (arg.find("--threads") != npos || arg.find("-t") != npos) {
|
||||
ss >> conf.proc;
|
||||
if (ss.fail()) return 1;
|
||||
if (ss.fail())
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
if (arg.find("--altitude") != npos || arg.find("-a") != npos) {
|
||||
unsigned tmp_high;
|
||||
ss >> std::hex >> tmp_high;
|
||||
if (ss.fail()) return 1;
|
||||
if (ss.fail())
|
||||
return 1;
|
||||
conf.high = tmp_high;
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
if (arg.find("--invert") != npos || arg.find("-i") != npos) {
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
}
|
||||
if (arg == "--threads" || arg == "-t" || arg == "--altitude" || arg == "-a") {
|
||||
return 777;
|
||||
@@ -45,22 +42,20 @@ void displayConfig() {
|
||||
unsigned processor_count = std::thread::hardware_concurrency();
|
||||
if (conf.proc == 0 || conf.proc > static_cast<unsigned>(processor_count)) {
|
||||
conf.proc = static_cast<unsigned>(processor_count);
|
||||
|
||||
}
|
||||
printf("Threads: %u, high addresses (2%02x+)\n", conf.proc, conf.high.load());
|
||||
}
|
||||
using Address = unsigned char[16];
|
||||
using Key = unsigned char[32];
|
||||
inline std::string getAddress(const Address& rawAddr) noexcept {
|
||||
inline std::string getAddress(const Address& rawAddr) {
|
||||
char ipStrBuf[46];
|
||||
inet_ntop(AF_INET6, rawAddr, ipStrBuf, 46);
|
||||
return std::string(ipStrBuf);
|
||||
}
|
||||
inline std::string KeyToString(const unsigned char* key) noexcept {
|
||||
inline std::string KeyToString(const unsigned char* key) {
|
||||
char result[65];
|
||||
const char* hexDigits = "0123456789abcdef";
|
||||
#pragma unroll
|
||||
for (unsigned char i = 0; i < 32; i++) {
|
||||
for (int i = 0; i < 32; i++) {
|
||||
result[2 * i] = hexDigits[key[i] >> 4];
|
||||
result[2 * i + 1] = hexDigits[key[i] & 0x0F];
|
||||
}
|
||||
@@ -71,10 +66,10 @@ typedef struct alignas(32) {
|
||||
Key PublicKey;
|
||||
Key PrivateKey;
|
||||
} KeysBox;
|
||||
void getRawAddress(int lErase, Key& InvertedPublicKey, Address& rawAddr) noexcept {
|
||||
void getRawAddress(unsigned lErase, Key& InvertedPublicKey, Address& rawAddr) {
|
||||
lErase++;
|
||||
const int bitsToShift = lErase % 8;
|
||||
const int start = lErase / 8;
|
||||
const int start = static_cast<int>(lErase / 8U);
|
||||
if (bitsToShift != 0) {
|
||||
for (int i = start; i < start + 15; i++) {
|
||||
InvertedPublicKey[i] = static_cast<unsigned char>((InvertedPublicKey[i] << bitsToShift) | (InvertedPublicKey[i + 1] >> (8 - bitsToShift)));
|
||||
@@ -84,79 +79,59 @@ void getRawAddress(int lErase, Key& InvertedPublicKey, Address& rawAddr) noexcep
|
||||
rawAddr[1] = static_cast<unsigned char>(lErase - 1);
|
||||
memcpy(&rawAddr[2], &InvertedPublicKey[start], 14);
|
||||
}
|
||||
inline void invertKey(const unsigned char* __restrict key, Key& inverted) noexcept {
|
||||
inline void invertKey(const unsigned char* __restrict key, Key& inverted) {
|
||||
#ifdef __AVX2__
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(inverted), _mm256_xor_si256(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(key)), _mm256_set1_epi8(0xFF)));
|
||||
#else
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
inverted[i] = static_cast<unsigned char>(key[i] ^ 0xFF);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
[[nodiscard]] inline unsigned char zeroCounter(unsigned int x) noexcept {
|
||||
return x == 0 ? 32 : static_cast<unsigned char>(__builtin_clz(x));
|
||||
}
|
||||
[[nodiscard]] inline unsigned char getZeros(const Key& v) noexcept {
|
||||
unsigned char leadZeros = 0;
|
||||
for (unsigned char i = 0; i < 32; i += 4) {
|
||||
unsigned word = (static_cast<unsigned>(v[i]) << 24) | (static_cast<unsigned>(v[i + 1]) << 16) | (static_cast<unsigned>(v[i + 2]) << 8) | (static_cast<unsigned>(v[i + 3]));
|
||||
inline unsigned getZeros(const Key& v) {
|
||||
unsigned leadZeros = 0;
|
||||
for (int i = 0; i < 32; i += 8) {
|
||||
unsigned long long word = (static_cast<unsigned long long>(v[i]) << 56) | (static_cast<unsigned long long>(v[i + 1]) << 48) | (static_cast<unsigned long long>(v[i + 2]) << 40) | (static_cast<unsigned long long>(v[i + 3]) << 32) | (static_cast<unsigned long long>(v[i + 4]) << 24) | (static_cast<unsigned long long>(v[i + 5]) << 16) | (static_cast<unsigned long long>(v[i + 6]) << 8) | static_cast<unsigned long long>(v[i + 7]);
|
||||
if (word == 0) {
|
||||
leadZeros += 32;
|
||||
leadZeros += 64;
|
||||
} else {
|
||||
leadZeros += zeroCounter(word);
|
||||
leadZeros += static_cast<unsigned>(__builtin_clzll(word));
|
||||
break;
|
||||
}
|
||||
}
|
||||
return leadZeros;
|
||||
}
|
||||
[[nodiscard]] inline long long xorshift64(unsigned long& state) noexcept {
|
||||
state ^= state << 21;
|
||||
state ^= state >> 35;
|
||||
state ^= state << 4;
|
||||
return static_cast<long long>(state * 2685821657736338717);
|
||||
}
|
||||
inline void rmbytes(unsigned char* __restrict buf, unsigned long& state) noexcept {
|
||||
for (unsigned char x = 0; x < 1; x++) {
|
||||
_mm256_storeu_si256((__m256i*) & buf[x * 32], _mm256_set_epi64x(xorshift64(state), xorshift64(state), xorshift64(state), xorshift64(state)));
|
||||
}
|
||||
}
|
||||
inline void sign_keypair(unsigned char* __restrict pk, unsigned char* __restrict sk, const unsigned char* __restrict seed) noexcept {
|
||||
alignas(32) unsigned char h[64];
|
||||
crypto_hash_sha512(h, seed, 32);
|
||||
h[31] = (h[31] & 0xF8) | (0x40 | (h[31] & 0x7F));
|
||||
crypto_scalarmult_ed25519_base(pk, h);
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(sk), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(seed)));
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(sk + 32), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(pk)));
|
||||
}
|
||||
|
||||
void miner_thread() noexcept {
|
||||
alignas(32) Key inv;
|
||||
alignas(32) Key seed;
|
||||
KeysBox keys;
|
||||
Address rawAddr;
|
||||
std::random_device rd;
|
||||
unsigned long state = static_cast<unsigned long>(rd());
|
||||
printf("Using seed: %lu\n", state);
|
||||
void miner_thread() {
|
||||
alignas(32) thread_local Key inv;
|
||||
thread_local KeysBox keys;
|
||||
thread_local Address rawAddr;
|
||||
while (true) {
|
||||
rmbytes(seed, state);
|
||||
//sign_keypair(keys.PublicKey, keys.PrivateKey, seed);
|
||||
crypto_sign_ed25519_seed_keypair(keys.PublicKey, keys.PrivateKey, seed);
|
||||
if (unsigned char ones = getZeros(keys.PublicKey); ones > conf.high.load()) {
|
||||
conf.high.store(ones);
|
||||
crypto_sign_ed25519_keypair(keys.PublicKey, keys.PrivateKey);
|
||||
unsigned ones = getZeros(keys.PublicKey), high = conf.high.load(std::memory_order_relaxed);
|
||||
while (ones > high && !conf.high.compare_exchange_strong(high, ones, std::memory_order_relaxed)) {
|
||||
high = conf.high.load(std::memory_order_relaxed);
|
||||
}
|
||||
if (ones > high) {
|
||||
invertKey(keys.PublicKey, inv);
|
||||
getRawAddress(ones, inv, rawAddr);
|
||||
printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddress(rawAddr).c_str(), KeyToString(keys.PublicKey).c_str(), KeyToString(keys.PrivateKey).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void startThreads() noexcept {
|
||||
void startThreads() {
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(conf.proc);
|
||||
for (unsigned char x = 0; x < conf.proc; x++) {
|
||||
for (unsigned x = 0; x < conf.proc; x++) {
|
||||
threads.emplace_back(miner_thread);
|
||||
}
|
||||
for (auto& thread : threads) {
|
||||
thread.join();
|
||||
}
|
||||
}
|
||||
int main(int argc, char* argv[]) noexcept {
|
||||
if (argc < 2) return 0;
|
||||
for (int x = 1; x < argc; x++) {
|
||||
int main(int argc, char* argv[]) {
|
||||
std::cout << "BuildType: " << __BUILDTYPE__ << std::endl << "AVX2: " << __AVX2__ << std::endl;
|
||||
if (argc < 1)
|
||||
return 0;
|
||||
for (int x = 0; x < argc; x++) {
|
||||
if (int res = parameters(argv[x]); res == 777) {
|
||||
if (++x >= argc) {
|
||||
std::cerr << "Empty value for parameter \"" << argv[x - 1] << "\"" << std::endl;
|
||||
Reference in New Issue
Block a user