vibecoded cuda patch for https://github.com/DavidBuchanan314/plcmine. use at your own risk, it worked for me on a 2070S but tweaks might be needed for other hardware
cuda.patch edited
621 lines 26 kB view raw
1diff --git a/native/Makefile b/native/Makefile 2index 2d63753..1d4c402 100644 3--- a/native/Makefile 4+++ b/native/Makefile 5@@ -2,6 +2,9 @@ all: mine 6 7 override CFLAGS += -march=native -mtune=native -O3 -flto -Wall -Wextra -Wpedantic -Wno-deprecated-declarations -Wno-format 8 9+NVCC ?= nvcc 10+NVCCFLAGS ?= -O3 -arch=sm_75 --ptxas-options=-v -DUSE_BITMAP_FILTER 11+ 12 # make mine_nogmp CFLAGS=-DBENCHMARK 13 14 mine: mine.c util.h 15@@ -9,3 +12,6 @@ mine: mine.c util.h 16 17 mine_nogmp: mine_nogmp.c util.h bigint.h 18 $(CC) mine_nogmp.c -o mine_nogmp $(CFLAGS) -lcrypto 19+ 20+mine_cuda: mine_cuda.cu util.h bigint.h 21+ $(NVCC) mine_cuda.cu -o mine_cuda $(NVCCFLAGS) 22diff --git a/native/mine_cuda b/native/mine_cuda 23new file mode 100755 24index 0000000..99850c0 25Binary files /dev/null and b/native/mine_cuda differ 26diff --git a/native/mine_cuda.cu b/native/mine_cuda.cu 27new file mode 100644 28index 0000000..fc22816 29--- /dev/null 30+++ b/native/mine_cuda.cu 31@@ -0,0 +1,590 @@ 32+// mine_cuda.cu - CUDA port of mine_nogmp.c 33+// Based on the OpenCL port (ocl_mine.cl / ocl_mine.py) 34+// 35+// USAGE: ./mine_cuda <ignored_threads> precomputed.bin did:key:<pubkey> <prefix> [prefix2 ...] 36+// (thread count arg is accepted for CLI compatibility but ignored - GPU config is compile-time) 37+ 38+#include <stdint.h> 39+#include <stdio.h> 40+#include <stdlib.h> 41+#include <string.h> 42+#include <assert.h> 43+#include <time.h> 44+#include <locale.h> 45+#include <cuda_runtime.h> 46+ 47+// ---- tuning knobs ---- 48+#ifndef STEPS_PER_TASK 49+#define STEPS_PER_TASK 512 50+#endif 51+#ifndef MAX_RESULTS 52+#define MAX_RESULTS 64 53+#endif 54+#ifndef WORK_SIZE 55+#define WORK_SIZE (1 << 20) // handles per kernel call 56+#endif 57+#ifndef BLOCK_SIZE 58+#define BLOCK_SIZE 256 59+#endif 60+ 61+// ---- fixed offsets (matching mine_nogmp.c and ocl_mine.py) ---- 62+#define PRESIGNED_LEN 155 63+#define PRESIGNED_HANDLE_OFF 55 64+#define PRESIGNED_PUBKEY_OFF 77 65+ 66+#define SIGNED_LEN 247 67+#define SIGNED_SIG_OFF 7 68+#define SIGNED_HANDLE_OFF 147 69+#define SIGNED_PUBKEY_OFF 169 70+ 71+#define RESULT_STRIDE 68 // handle(6)+pad(2)+row(4)+k_inv(32)+did_b32(24) 72+ 73+#define MAX_PREFIXES 16 74+#define MAX_PREFIX_DATA 128 75+#define MASK26 0x03FFFFFFu 76+ 77+// ---- constant memory ---- 78+__constant__ uint8_t d_presigned_tpl[PRESIGNED_LEN]; 79+__constant__ uint8_t d_signed_tpl[SIGNED_LEN]; 80+__constant__ uint32_t d_firstbyte_bitmap[8]; // 256-bit bitmap for first-byte filter 81+__constant__ uint8_t d_firstbytes[MAX_PREFIXES]; 82+__constant__ uint8_t d_prefix_data[MAX_PREFIX_DATA]; 83+__constant__ uint32_t d_prefix_lens[MAX_PREFIXES]; 84+__constant__ uint32_t d_prefix_offsets[MAX_PREFIXES]; 85+__constant__ uint32_t d_num_prefixes; 86+ 87+// ---- SHA-256 ---- 88+__constant__ uint32_t K_SHA[64] = { 89+ 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 90+ 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 91+ 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 92+ 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 93+ 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 94+ 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 95+ 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 96+ 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 97+}; 98+ 99+static __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) { 100+ return (x >> n) | (x << (32 - n)); 101+} 102+#define CH(x,y,z) ((z)^((x)&((y)^(z)))) 103+#define MAJ(x,y,z) (((x)&(y))|((z)&((x)|(y)))) 104+#define EP0(x) (rotr32(x,2)^rotr32(x,13)^rotr32(x,22)) 105+#define EP1(x) (rotr32(x,6)^rotr32(x,11)^rotr32(x,25)) 106+#define SIG0(x) (rotr32(x,7)^rotr32(x,18)^((x)>>3)) 107+#define SIG1(x) (rotr32(x,17)^rotr32(x,19)^((x)>>10)) 108+ 109+static __device__ void sha256_compress(uint32_t state[8], uint32_t blk[16]) { 110+ uint32_t a=state[0],b=state[1],c=state[2],d=state[3]; 111+ uint32_t e=state[4],f=state[5],g=state[6],h=state[7]; 112+ #pragma unroll 113+ for (int i=0;i<16;i++){ 114+ uint32_t t1=h+EP1(e)+CH(e,f,g)+K_SHA[i]+blk[i]; 115+ uint32_t t2=EP0(a)+MAJ(a,b,c); 116+ h=g;g=f;f=e;e=d+t1;d=c;c=b;b=a;a=t1+t2; 117+ } 118+ #pragma unroll 119+ for (int i=16;i<64;i++){ 120+ blk[i&15]=SIG1(blk[(i-2)&15])+blk[(i-7)&15]+SIG0(blk[(i-15)&15])+blk[(i-16)&15]; 121+ uint32_t t1=h+EP1(e)+CH(e,f,g)+K_SHA[i]+blk[i&15]; 122+ uint32_t t2=EP0(a)+MAJ(a,b,c); 123+ h=g;g=f;f=e;e=d+t1;d=c;c=b;b=a;a=t1+t2; 124+ } 125+ state[0]+=a;state[1]+=b;state[2]+=c;state[3]+=d; 126+ state[4]+=e;state[5]+=f;state[6]+=g;state[7]+=h; 127+} 128+ 129+static __device__ void sha256_buf(const uint8_t *data, uint32_t len, uint32_t out[8]) { 130+ const uint32_t INIT[8] = { 131+ 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a, 132+ 0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19 133+ }; 134+ uint32_t state[8]; 135+ #pragma unroll 136+ for (int i=0;i<8;i++) state[i]=INIT[i]; 137+ 138+ uint32_t blk[16]; 139+ uint32_t pos=0; 140+ 141+ while (pos+64<=len) { 142+ #pragma unroll 143+ for (int w=0;w<16;w++){ 144+ uint32_t b=pos+w*4; 145+ blk[w]=((uint32_t)data[b]<<24)|((uint32_t)data[b+1]<<16) 146+ |((uint32_t)data[b+2]<<8 )|((uint32_t)data[b+3]); 147+ } 148+ sha256_compress(state,blk); 149+ pos+=64; 150+ } 151+ 152+ uint32_t rem=len-pos; 153+ #pragma unroll 154+ for (int w=0;w<16;w++) blk[w]=0; 155+ for (uint32_t b=0;b<rem;b++){ 156+ uint32_t w=b>>2, sh=24-((b&3)<<3); 157+ blk[w]|=((uint32_t)data[pos+b])<<sh; 158+ } 159+ {uint32_t b=rem,w=b>>2,sh=24-((b&3)<<3);blk[w]|=(uint32_t)0x80u<<sh;} 160+ 161+ if (rem<56){ 162+ blk[15]=len*8; 163+ sha256_compress(state,blk); 164+ } else { 165+ sha256_compress(state,blk); 166+ #pragma unroll 167+ for (int w=0;w<16;w++) blk[w]=0; 168+ blk[15]=len*8; 169+ sha256_compress(state,blk); 170+ } 171+ #pragma unroll 172+ for (int i=0;i<8;i++) out[i]=state[i]; 173+} 174+ 175+// ---- 256-bit modular FMA (10x26-bit limbs, same as bigint.h) ---- 176+__constant__ uint32_t N_LIMBS[10] = {3555649,9937716,33799165,60472610,45788892,67108863,67108863,67108863,67108863,4194303}; 177+__constant__ uint32_t C_LIMBS[5] = {63553215,57171147,33309698,6636253,21319971}; 178+ 179+static __device__ void mod_fma(uint32_t res[10], const uint32_t a[10], const uint32_t b[10], const uint32_t c[10]) { 180+ uint64_t t[20]={0}; 181+ uint32_t hi[10]={0}; 182+ 183+ #pragma unroll 184+ for (int i=0;i<10;i++) 185+ #pragma unroll 186+ for (int j=0;j<10;j++) 187+ t[i+j]+=(uint64_t)a[i]*(uint64_t)b[j]; 188+ 189+ t[10]+=t[9]>>26; t[9]&=MASK26; 190+ #pragma unroll 191+ for (int i=10;i<19;i++){t[i+1]+=t[i]>>26;hi[i-10]=(uint32_t)((t[i]&MASK26)<<4);t[i]=0;} 192+ hi[9]=(uint32_t)(t[19]<<4); 193+ for (int i=0;i<10;i++) 194+ #pragma unroll 195+ for (int j=0;j<5;j++) 196+ t[i+j]+=(uint64_t)hi[i]*(uint64_t)C_LIMBS[j]; 197+ 198+ t[10]+=t[9]>>26; t[9]&=MASK26; 199+ #pragma unroll 200+ for (int i=10;i<14;i++){t[i+1]+=t[i]>>26;hi[i-10]=(uint32_t)((t[i]&MASK26)<<4);t[i]=0;} 201+ hi[4]=(uint32_t)(t[14]<<4); 202+ for (int i=0;i<5;i++) 203+ #pragma unroll 204+ for (int j=0;j<5;j++) 205+ t[i+j]+=(uint64_t)hi[i]*(uint64_t)C_LIMBS[j]; 206+ 207+ #pragma unroll 208+ for (int i=0;i<10;i++) t[i]+=c[i]; 209+ #pragma unroll 210+ for (int i=0;i<10;i++){t[i+1]+=t[i]>>26;t[i]&=MASK26;} 211+ 212+ uint32_t ov=(uint32_t)((t[9]>>22)+(t[10]<<4)); 213+ #pragma unroll 214+ for (int j=0;j<5;j++) t[j]+=(uint64_t)ov*(uint64_t)C_LIMBS[j]; 215+ #pragma unroll 216+ for (int i=0;i<10;i++){t[i+1]+=t[i]>>26;t[i]&=MASK26;} 217+ 218+ #pragma unroll 219+ for (int i=0;i<10;i++) res[i]=(uint32_t)t[i]; 220+ 221+ if (res[9]&(1u<<21)){ 222+ #pragma unroll 223+ for (int i=0;i<10;i++) res[i]=N_LIMBS[i]-res[i]; 224+ #pragma unroll 225+ for (int i=0;i<9;i++){res[i+1]-=res[i]>>31;res[i]&=MASK26;} 226+ } 227+} 228+ 229+static __host__ __device__ void bigint_unpack(uint32_t res[10], const uint8_t buf[32]) { 230+ res[0]=((uint32_t)buf[31])|((uint32_t)buf[30]<<8)|((uint32_t)buf[29]<<16)|((uint32_t)(buf[28]&0x03)<<24);res[0]&=MASK26; 231+ res[1]=((uint32_t)(buf[28]>>2))|((uint32_t)buf[27]<<6)|((uint32_t)buf[26]<<14)|((uint32_t)(buf[25]&0x0F)<<22);res[1]&=MASK26; 232+ res[2]=((uint32_t)(buf[25]>>4))|((uint32_t)buf[24]<<4)|((uint32_t)buf[23]<<12)|((uint32_t)(buf[22]&0x3F)<<20);res[2]&=MASK26; 233+ res[3]=((uint32_t)(buf[22]>>6))|((uint32_t)buf[21]<<2)|((uint32_t)buf[20]<<10)|((uint32_t)buf[19]<<18);res[3]&=MASK26; 234+ res[4]=((uint32_t)buf[18])|((uint32_t)buf[17]<<8)|((uint32_t)buf[16]<<16)|((uint32_t)(buf[15]&0x03)<<24);res[4]&=MASK26; 235+ res[5]=((uint32_t)(buf[15]>>2))|((uint32_t)buf[14]<<6)|((uint32_t)buf[13]<<14)|((uint32_t)(buf[12]&0x0F)<<22);res[5]&=MASK26; 236+ res[6]=((uint32_t)(buf[12]>>4))|((uint32_t)buf[11]<<4)|((uint32_t)buf[10]<<12)|((uint32_t)(buf[9]&0x3F)<<20);res[6]&=MASK26; 237+ res[7]=((uint32_t)(buf[9]>>6))|((uint32_t)buf[8]<<2)|((uint32_t)buf[7]<<10)|((uint32_t)buf[6]<<18);res[7]&=MASK26; 238+ res[8]=((uint32_t)buf[5])|((uint32_t)buf[4]<<8)|((uint32_t)buf[3]<<16)|((uint32_t)(buf[2]&0x03)<<24);res[8]&=MASK26; 239+ res[9]=((uint32_t)(buf[2]>>2))|((uint32_t)buf[1]<<6)|((uint32_t)buf[0]<<14);res[9]&=0x003FFFFFu; 240+} 241+ 242+static __device__ void bigint_pack(uint8_t buf[32], const uint32_t b[10]) { 243+ buf[31]=(b[0]>>0)&0xFF;buf[30]=(b[0]>>8)&0xFF;buf[29]=(b[0]>>16)&0xFF;buf[28]=(b[0]>>24)&0x03; 244+ buf[28]|=(b[1]&0x3F)<<2;buf[27]=(b[1]>>6)&0xFF;buf[26]=(b[1]>>14)&0xFF;buf[25]=(b[1]>>22)&0x0F; 245+ buf[25]|=(b[2]&0x0F)<<4;buf[24]=(b[2]>>4)&0xFF;buf[23]=(b[2]>>12)&0xFF;buf[22]=(b[2]>>20)&0x3F; 246+ buf[22]|=(b[3]&0x03)<<6;buf[21]=(b[3]>>2)&0xFF;buf[20]=(b[3]>>10)&0xFF;buf[19]=(b[3]>>18)&0xFF; 247+ buf[18]=(b[4]>>0)&0xFF;buf[17]=(b[4]>>8)&0xFF;buf[16]=(b[4]>>16)&0xFF;buf[15]=(b[4]>>24)&0x03; 248+ buf[15]|=(b[5]&0x3F)<<2;buf[14]=(b[5]>>6)&0xFF;buf[13]=(b[5]>>14)&0xFF;buf[12]=(b[5]>>22)&0x0F; 249+ buf[12]|=(b[6]&0x0F)<<4;buf[11]=(b[6]>>4)&0xFF;buf[10]=(b[6]>>12)&0xFF;buf[9]=(b[6]>>20)&0x3F; 250+ buf[9]|=(b[7]&0x03)<<6;buf[8]=(b[7]>>2)&0xFF;buf[7]=(b[7]>>10)&0xFF;buf[6]=(b[7]>>18)&0xFF; 251+ buf[5]=(b[8]>>0)&0xFF;buf[4]=(b[8]>>8)&0xFF;buf[3]=(b[8]>>16)&0xFF;buf[2]=(b[8]>>24)&0x03; 252+ buf[2]|=(b[9]&0x3F)<<2;buf[1]=(b[9]>>6)&0xFF;buf[0]=(b[9]>>14)&0xFF; 253+} 254+ 255+// ---- base64url no-pad ---- 256+__constant__ uint8_t B64[64] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_'}; 257+ 258+// Encode r_bytes[30..31] || s_bytes[0..31] (34 bytes) -> 46 chars 259+static __device__ void b64_raw_sig(uint8_t *out, uint8_t r0, uint8_t r1, const uint8_t *s) { 260+ out[0]=B64[(r0>>2)&0x3f]; 261+ out[1]=B64[((r0<<4)|(r1>>4))&0x3f]; 262+ out[2]=B64[((r1<<2)|(s[0]>>6))&0x3f]; 263+ out[3]=B64[s[0]&0x3f]; 264+ #pragma unroll 265+ for (int i=0;i<10;i++){ 266+ uint8_t a=s[1+i*3],b=s[2+i*3],c=s[3+i*3]; 267+ out[4+i*4]=B64[(a>>2)&0x3f]; 268+ out[5+i*4]=B64[((a<<4)|(b>>4))&0x3f]; 269+ out[6+i*4]=B64[((b<<2)|(c>>6))&0x3f]; 270+ out[7+i*4]=B64[c&0x3f]; 271+ } 272+ out[44]=B64[(s[31]>>2)&0x3f]; 273+ out[45]=B64[(s[31]<<4)&0x3f]; 274+} 275+ 276+// ---- base32 multibase ---- 277+__constant__ uint8_t B32[32] = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','2','3','4','5','6','7'}; 278+ 279+static __device__ void b32_encode(uint8_t *out, const uint8_t *data, uint32_t len) { 280+ uint32_t i=0; 281+ while (i+4<len){ 282+ uint8_t a=data[i],b=data[i+1],c=data[i+2],d=data[i+3],e=data[i+4];i+=5; 283+ *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f]; 284+ *out++=B32[(b>>1)&0x1f];*out++=B32[((b<<4)|(c>>4))&0x1f]; 285+ *out++=B32[((c<<1)|(d>>7))&0x1f];*out++=B32[(d>>2)&0x1f]; 286+ *out++=B32[((d<<3)|(e>>5))&0x1f];*out++=B32[e&0x1f]; 287+ } 288+ uint32_t rem=len-i; 289+ if(rem==4){uint8_t a=data[i],b=data[i+1],c=data[i+2],d=data[i+3]; 290+ *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f]; 291+ *out++=B32[(b>>1)&0x1f];*out++=B32[((b<<4)|(c>>4))&0x1f]; 292+ *out++=B32[((c<<1)|(d>>7))&0x1f];*out++=B32[(d>>2)&0x1f];*out++=B32[(d<<3)&0x1f];} 293+ else if(rem==3){uint8_t a=data[i],b=data[i+1],c=data[i+2]; 294+ *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f]; 295+ *out++=B32[(b>>1)&0x1f];*out++=B32[((b<<4)|(c>>4))&0x1f];*out++=B32[(c<<1)&0x1f];} 296+ else if(rem==2){uint8_t a=data[i],b=data[i+1]; 297+ *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f]; 298+ *out++=B32[(b>>1)&0x1f];*out++=B32[(b<<4)&0x1f];} 299+ else if(rem==1){uint8_t a=data[i]; 300+ *out++=B32[(a>>3)&0x1f];*out++=B32[(a<<2)&0x1f];} 301+} 302+ 303+// ---- main kernel ---- 304+__global__ __launch_bounds__(BLOCK_SIZE) 305+void mine_plc( 306+ const uint32_t * __restrict__ limb_table, 307+ const uint8_t * __restrict__ r_b64_tbl, 308+ const uint8_t * __restrict__ r_tail, 309+ uint8_t * __restrict__ results, 310+ uint32_t * __restrict__ result_count, 311+ uint32_t handle_base, 312+ uint32_t row_base, 313+ uint32_t num_rows 314+) { 315+ uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x; 316+ uint32_t handle_idx = handle_base + gid; 317+ 318+ uint8_t handle[6]; 319+ {uint32_t idx=handle_idx;for(int j=0;j<6;j++){handle[5-j]=B64[idx&0x3f];idx>>=6;}} 320+ 321+ uint8_t presigned[PRESIGNED_LEN]; 322+ #pragma unroll 323+ for (int i=0;i<PRESIGNED_LEN;i++) presigned[i]=d_presigned_tpl[i]; 324+ for (int j=0;j<6;j++) presigned[PRESIGNED_HANDLE_OFF+j]=handle[j]; 325+ 326+ uint32_t z_state[8]; 327+ sha256_buf(presigned, PRESIGNED_LEN, z_state); 328+ 329+ uint8_t z_bytes[32]; 330+ #pragma unroll 331+ for (int w=0;w<8;w++){ 332+ z_bytes[w*4+0]=(z_state[w]>>24)&0xFF;z_bytes[w*4+1]=(z_state[w]>>16)&0xFF; 333+ z_bytes[w*4+2]=(z_state[w]>> 8)&0xFF;z_bytes[w*4+3]=(z_state[w] )&0xFF; 334+ } 335+ uint32_t z[10]; 336+ bigint_unpack(z, z_bytes); 337+ 338+ uint8_t signed_op[SIGNED_LEN]; 339+ #pragma unroll 340+ for (int i=0;i<SIGNED_LEN;i++) signed_op[i]=d_signed_tpl[i]; 341+ for (int j=0;j<6;j++) signed_op[SIGNED_HANDLE_OFF+j]=handle[j]; 342+ 343+ uint32_t row_end=row_base+STEPS_PER_TASK; 344+ if (row_end>num_rows) row_end=num_rows; 345+ 346+ for (uint32_t row=row_base; row<row_end; row++) { 347+ uint32_t loff=row*20; 348+ uint32_t k_inv_rDa[10], k_inv_l[10]; 349+ #pragma unroll 350+ for (int i=0;i<10;i++) k_inv_rDa[i]=limb_table[loff+i]; 351+ #pragma unroll 352+ for (int i=0;i<10;i++) k_inv_l[i]=limb_table[loff+10+i]; 353+ 354+ uint32_t s[10]; 355+ mod_fma(s, z, k_inv_l, k_inv_rDa); 356+ 357+ uint8_t s_bytes[32]; 358+ bigint_pack(s_bytes, s); 359+ 360+ uint32_t rb64=row*40; 361+ #pragma unroll 362+ for (int b=0;b<40;b++) signed_op[SIGNED_SIG_OFF+b]=r_b64_tbl[rb64+b]; 363+ b64_raw_sig(&signed_op[SIGNED_SIG_OFF+40], r_tail[row*2], r_tail[row*2+1], s_bytes); 364+ 365+ uint32_t did_state[8]; 366+ sha256_buf(signed_op, SIGNED_LEN, did_state); 367+ 368+ uint8_t b0=(did_state[0]>>24)&0xFF; 369+#ifdef USE_BITMAP_FILTER 370+ if (!(d_firstbyte_bitmap[b0>>5]&(1u<<(b0&31)))) continue; 371+#else 372+ {int hit=0;for(uint32_t p=0;p<d_num_prefixes;p++)if(b0==d_firstbytes[p]){hit=1;break;}if(!hit)continue;} 373+#endif 374+ 375+ uint8_t did_hash[15]; 376+ #pragma unroll 377+ for (int b=0;b<15;b++){int w=b/4,sh=24-(b&3)*8;did_hash[b]=(did_state[w]>>sh)&0xFF;} 378+ uint8_t did_b32[24]; 379+ b32_encode(did_b32, did_hash, 15); 380+ 381+ for (uint32_t p=0;p<d_num_prefixes;p++){ 382+ if (b0!=d_firstbytes[p]) continue; 383+ uint32_t plen=d_prefix_lens[p], poff=d_prefix_offsets[p]; 384+ int ok=1; 385+ for (uint32_t ci=0;ci<plen;ci++) if(did_b32[ci]!=d_prefix_data[poff+ci]){ok=0;break;} 386+ if (!ok) continue; 387+ 388+ uint32_t slot=atomicAdd(result_count, 1u); 389+ if (slot<MAX_RESULTS){ 390+ uint32_t roff=slot*RESULT_STRIDE; 391+ for (int b=0;b<6;b++) results[roff+b]=handle[b]; 392+ results[roff+6]=0;results[roff+7]=0; 393+ results[roff+8]=(row>>24)&0xFF;results[roff+9]=(row>>16)&0xFF; 394+ results[roff+10]=(row>>8)&0xFF;results[roff+11]=row&0xFF; 395+ uint8_t k_inv_bytes[32]; 396+ bigint_pack(k_inv_bytes, k_inv_l); 397+ for (int b=0;b<32;b++) results[roff+12+b]=k_inv_bytes[b]; 398+ for (int b=0;b<24;b++) results[roff+44+b]=did_b32[b]; 399+ } 400+ } 401+ } 402+} 403+ 404+// ---- host utilities ---- 405+ 406+static void cuda_check(cudaError_t err, const char *where) { 407+ if (err!=cudaSuccess){ 408+ fprintf(stderr,"CUDA error at %s: %s\n",where,cudaGetErrorString(err)); 409+ exit(1); 410+ } 411+} 412+#define CUDA_CHECK(x) cuda_check((x), #x) 413+ 414+static double timestamp(void) { 415+ struct timespec ts; 416+ clock_gettime(CLOCK_REALTIME,&ts); 417+ return ts.tv_sec+ts.tv_nsec*1e-9; 418+} 419+ 420+// load precomputed.bin -> device buffers 421+// each row = [r(32), k_inv_rDa(32), k_inv(32)] = 96 bytes 422+static void load_precomputed(const char *path, 423+ uint32_t **d_limb_table, uint8_t **d_r_b64, uint8_t **d_r_tail, 424+ size_t *num_rows_out) 425+{ 426+ static const char b64c[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; 427+ 428+ FILE *f=fopen(path,"rb"); 429+ if (!f){fprintf(stderr,"cannot open %s\n",path);exit(1);} 430+ fseek(f,0,SEEK_END); 431+ long sz=ftell(f);rewind(f); 432+ assert(sz%96==0); 433+ size_t num_rows=sz/96; 434+ uint8_t *raw=(uint8_t*)malloc(sz); 435+ assert(fread(raw,sz,1,f)==1); 436+ fclose(f); 437+ 438+ uint32_t *limb_table=(uint32_t*)malloc(num_rows*20*sizeof(uint32_t)); 439+ uint8_t *r_b64 =(uint8_t *)malloc(num_rows*40); 440+ uint8_t *r_tail =(uint8_t *)malloc(num_rows*2); 441+ 442+ for (size_t i=0;i<num_rows;i++){ 443+ uint8_t *row=raw+i*96; 444+ bigint_unpack(limb_table+i*20, row+32); // k_inv_rDa 445+ bigint_unpack(limb_table+i*20+10, row+64); // k_inv 446+ 447+ // base64url-encode r[0..29] (exactly 30 bytes = 10 groups of 3 -> 40 chars, no remainder) 448+ for (int g=0;g<10;g++){ 449+ uint8_t a=row[g*3],b=row[g*3+1],c=row[g*3+2]; 450+ r_b64[i*40+g*4+0]=b64c[(a>>2)&0x3f]; 451+ r_b64[i*40+g*4+1]=b64c[((a<<4)|(b>>4))&0x3f]; 452+ r_b64[i*40+g*4+2]=b64c[((b<<2)|(c>>6))&0x3f]; 453+ r_b64[i*40+g*4+3]=b64c[c&0x3f]; 454+ } 455+ r_tail[i*2 ]=row[30]; 456+ r_tail[i*2+1]=row[31]; 457+ } 458+ free(raw); 459+ 460+ CUDA_CHECK(cudaMalloc(d_limb_table, num_rows*20*sizeof(uint32_t))); 461+ CUDA_CHECK(cudaMalloc(d_r_b64, num_rows*40)); 462+ CUDA_CHECK(cudaMalloc(d_r_tail, num_rows*2)); 463+ CUDA_CHECK(cudaMemcpy(*d_limb_table, limb_table, num_rows*20*sizeof(uint32_t), cudaMemcpyHostToDevice)); 464+ CUDA_CHECK(cudaMemcpy(*d_r_b64, r_b64, num_rows*40, cudaMemcpyHostToDevice)); 465+ CUDA_CHECK(cudaMemcpy(*d_r_tail, r_tail, num_rows*2, cudaMemcpyHostToDevice)); 466+ free(limb_table);free(r_b64);free(r_tail); 467+ *num_rows_out=num_rows; 468+} 469+ 470+static uint8_t prefix_firstbyte(const char *p) { 471+ static const char *b32c="abcdefghijklmnopqrstuvwxyz234567"; 472+ int c0=(int)(strchr(b32c,p[0])-b32c); 473+ int c1=(int)(strchr(b32c,p[1])-b32c); 474+ return (uint8_t)((c0<<3)|(c1>>2)); 475+} 476+ 477+int main(int argc, char *argv[]) { 478+ if (argc<5){ 479+ fprintf(stderr,"USAGE: %s <threads_ignored> precomputed.bin did:key:<pubkey> <prefix> [prefix2 ...]\n",argv[0]); 480+ return 1; 481+ } 482+ const char *precomputed_path=argv[2]; 483+ const char *pubkey=argv[3]; 484+ int num_prefixes=argc-4; 485+ char **prefixes=argv+4; 486+ 487+ if (strlen(pubkey)!=57){fprintf(stderr,"invalid pubkey length\n");return 1;} 488+ for (int i=0;i<num_prefixes;i++){ 489+ if (strlen(prefixes[i])<2){fprintf(stderr,"prefix too short\n");return 1;} 490+ if (strlen(prefixes[i])>8){fprintf(stderr,"prefix too long (>8)\n");return 1;} 491+ } 492+ if (num_prefixes>MAX_PREFIXES){fprintf(stderr,"too many prefixes (max %d)\n",MAX_PREFIXES);return 1;} 493+ 494+ // ---- build templates ---- 495+ uint8_t presigned_tpl[PRESIGNED_LEN]; 496+ { 497+ char buf[256]; 498+ int n=snprintf(buf,sizeof(buf), 499+ "\xa6""dprev\xf6""dtypemplc_operationhservices\xa0""kalsoKnownAs\x81""kat://AAAAAAlrotationKeys\x81""x9%ssverificationMethods\xa0", 500+ pubkey); 501+ assert(n==PRESIGNED_LEN); 502+ memcpy(presigned_tpl,buf,PRESIGNED_LEN); 503+ } 504+ 505+ uint8_t signed_tpl[SIGNED_LEN]; 506+ { 507+ const uint8_t header[] = "\xa7""csigxV"; // 7 bytes 508+ const uint8_t mid[] = "dprev\xf6""dtypemplc_operationhservices\xa0""kalsoKnownAs\x81""kat://AAAAAAlrotationKeys\x81""x9"; 509+ const uint8_t tail[] = "sverificationMethods\xa0"; // 21 bytes 510+ uint8_t *p=signed_tpl; 511+ memcpy(p,header,7); p+=7; 512+ memset(p,'A',86); p+=86; // sig placeholder 513+ memcpy(p,mid,sizeof(mid)-1); p+=sizeof(mid)-1; 514+ assert((p-signed_tpl)==SIGNED_PUBKEY_OFF); 515+ memcpy(p,pubkey,57); p+=57; 516+ memcpy(p,tail,21); p+=21; 517+ assert((p-signed_tpl)==SIGNED_LEN); 518+ } 519+ 520+ // ---- upload templates to constant memory ---- 521+ CUDA_CHECK(cudaMemcpyToSymbol(d_presigned_tpl, presigned_tpl, PRESIGNED_LEN)); 522+ CUDA_CHECK(cudaMemcpyToSymbol(d_signed_tpl, signed_tpl, SIGNED_LEN)); 523+ 524+ // ---- upload prefix data ---- 525+ { 526+ uint8_t fb[MAX_PREFIXES]={0}; 527+ uint8_t pdata[MAX_PREFIX_DATA]={0}; 528+ uint32_t plens[MAX_PREFIXES]={0}; 529+ uint32_t poffs[MAX_PREFIXES]={0}; 530+ uint32_t off=0; 531+ for (int i=0;i<num_prefixes;i++){ 532+ fb[i]=prefix_firstbyte(prefixes[i]); 533+ plens[i]=(uint32_t)strlen(prefixes[i]); 534+ poffs[i]=off; 535+ memcpy(pdata+off,prefixes[i],plens[i]); 536+ off+=plens[i]; 537+ } 538+ uint32_t bitmap[8]={0}; 539+ for (int i=0;i<num_prefixes;i++) bitmap[fb[i]>>5]|=(1u<<(fb[i]&31)); 540+ uint32_t np=(uint32_t)num_prefixes; 541+ CUDA_CHECK(cudaMemcpyToSymbol(d_firstbyte_bitmap, bitmap, sizeof(bitmap))); 542+ CUDA_CHECK(cudaMemcpyToSymbol(d_firstbytes, fb, sizeof(uint8_t)*num_prefixes)); 543+ CUDA_CHECK(cudaMemcpyToSymbol(d_prefix_data, pdata, off)); 544+ CUDA_CHECK(cudaMemcpyToSymbol(d_prefix_lens, plens, sizeof(uint32_t)*num_prefixes)); 545+ CUDA_CHECK(cudaMemcpyToSymbol(d_prefix_offsets,poffs, sizeof(uint32_t)*num_prefixes)); 546+ CUDA_CHECK(cudaMemcpyToSymbol(d_num_prefixes, &np, sizeof(uint32_t))); 547+ } 548+ 549+ // ---- load precomputed table ---- 550+ uint32_t *d_limb_table; uint8_t *d_r_b64, *d_r_tail; 551+ size_t num_rows; 552+ load_precomputed(precomputed_path, &d_limb_table, &d_r_b64, &d_r_tail, &num_rows); 553+ fprintf(stderr,"imported %zu rows, WORK_SIZE=%d STEPS_PER_TASK=%d\n",num_rows,WORK_SIZE,STEPS_PER_TASK); 554+ 555+ // ---- allocate result buffers ---- 556+ uint8_t *d_results; 557+ uint32_t *d_result_count; 558+ uint8_t h_results[MAX_RESULTS*RESULT_STRIDE]; 559+ uint32_t h_result_count; 560+ CUDA_CHECK(cudaMalloc(&d_results, MAX_RESULTS*RESULT_STRIDE)); 561+ CUDA_CHECK(cudaMalloc(&d_result_count, sizeof(uint32_t))); 562+ 563+ // ---- mining loop ---- 564+ CUDA_CHECK(cudaFuncSetAttribute(mine_plc, cudaFuncAttributePreferredSharedMemoryCarveout, 0)); 565+ 566+ setlocale(LC_NUMERIC,""); 567+ uint64_t total_plcs=0, total_found=0; 568+ double start=timestamp(); 569+ uint64_t handle_base=0; 570+ int blocks=WORK_SIZE/BLOCK_SIZE; 571+ 572+ while (1) { 573+ for (uint32_t row_base=0; row_base<(uint32_t)num_rows; row_base+=STEPS_PER_TASK) { 574+ 575+ h_result_count=0; 576+ CUDA_CHECK(cudaMemcpy(d_result_count,&h_result_count,sizeof(uint32_t),cudaMemcpyHostToDevice)); 577+ 578+ mine_plc<<<blocks,BLOCK_SIZE>>>( 579+ d_limb_table, d_r_b64, d_r_tail, 580+ d_results, d_result_count, 581+ (uint32_t)handle_base, row_base, (uint32_t)num_rows 582+ ); 583+ CUDA_CHECK(cudaGetLastError()); 584+ CUDA_CHECK(cudaDeviceSynchronize()); 585+ 586+ CUDA_CHECK(cudaMemcpy(&h_result_count,d_result_count,sizeof(uint32_t),cudaMemcpyDeviceToHost)); 587+ 588+ uint32_t n=h_result_count<MAX_RESULTS?h_result_count:MAX_RESULTS; 589+ if (n>0){ 590+ CUDA_CHECK(cudaMemcpy(h_results,d_results,n*RESULT_STRIDE,cudaMemcpyDeviceToHost)); 591+ for (uint32_t i=0;i<n;i++){ 592+ uint8_t *r=h_results+i*RESULT_STRIDE; 593+ char handle[7]; memcpy(handle,r,6); handle[6]=0; 594+ uint32_t row=((uint32_t)r[8]<<24)|((uint32_t)r[9]<<16)|((uint32_t)r[10]<<8)|r[11]; 595+ char did_b32[25]; memcpy(did_b32,r+44,24); did_b32[24]=0; 596+ printf("%s %s 0x",did_b32,handle); 597+ for (int b=0;b<32;b++) printf("%02x",r[12+b]); 598+ printf("\n"); 599+ fflush(stdout); 600+ total_found++; 601+ (void)row; 602+ } 603+ } 604+ 605+ uint32_t rows_this_call=(row_base+STEPS_PER_TASK<(uint32_t)num_rows) 606+ ?STEPS_PER_TASK:(uint32_t)num_rows-row_base; 607+ total_plcs+=(uint64_t)WORK_SIZE*rows_this_call; 608+ 609+ double dur=timestamp()-start; 610+ double rate=total_plcs/1e6/dur; 611+ fprintf(stderr,"\tStats: %'llu PLCs in %.3fs (%.1fM/s avg) Found: %llu\r", 612+ (unsigned long long)total_plcs, dur, rate, (unsigned long long)total_found); 613+ } 614+ handle_base+=WORK_SIZE; 615+ if (handle_base >= (1ULL<<32)) { 616+ fprintf(stderr,"\nhandle space exhausted (2^32 handles checked)\n"); 617+ goto done; 618+ } 619+ } 620+ done:; 621+}