kaktoos.cu 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  // ---- Launch geometry ------------------------------------------------------
  // GRID_SIZE is the number of seeds handled by one kernel launch: run() starts
  // CHUNK_SIZE blocks of BLOCK_SIZE threads and then advances its offset by
  // GRID_SIZE.
  1. #define GRID_SIZE (1LL << 24)
  // Threads per block; also sizes the per-block shared tables in crack().
  2. #define BLOCK_SIZE 512
  // Number of blocks per launch.
  3. #define CHUNK_SIZE (GRID_SIZE / BLOCK_SIZE)
  // ---- 48-bit linear congruential generator ---------------------------------
  // state' = (state * RNG_MUL + RNG_ADD) & RNG_MASK. These are the same
  // multiplier, increment and 48-bit mask used by java.util.Random.
  4. #define RNG_MUL 25214903917ULL
  5. #define RNG_ADD 11ULL
  6. #define RNG_MASK ((1ULL << 48) - 1)
  // Minimum height above FLOOR_LEVEL that the tallest column must reach for
  // crack() to report a seed. Can be overridden on the compiler command line
  // because the definition is guarded by #ifndef.
  7. #ifndef CACTUS_HEIGHT
  8. #define CACTUS_HEIGHT 7
  9. #endif
  #include <chrono>
  #include <cstdint>
  #include <cstdio>
  #include <cstdlib>
  #include <cstring>
  #include <ctime>
  #include <mutex>
  #include <thread>
  #include <cuda.h>
  #ifdef BOINC
  #include "boinc_api.h"
  #if defined _WIN32 || defined _WIN64
  #include "boinc_win.h"
  #endif
  #endif
  // Device-side copies of the LCG jump tables; run() uploads them with
  // cudaMemcpyToSymbol from the host arrays of the same name (without _gpu).
  // Entry [i] of block_mul/block_add is the multiplier/addend that advances a
  // state by i LCG steps (main() builds them by repeated composition), so
  // entry [BLOCK_SIZE] advances by one whole block.
  21. __device__ unsigned long long block_add_gpu[BLOCK_SIZE + 1];
  22. __device__ unsigned long long block_mul_gpu[BLOCK_SIZE + 1];
  // Same idea at block granularity: entry [i] advances a state by
  // i * BLOCK_SIZE steps; crack() indexes these with blockIdx.x.
  23. __device__ unsigned long long chunk_add_gpu[CHUNK_SIZE + 1];
  24. __device__ unsigned long long chunk_mul_gpu[CHUNK_SIZE + 1];
  // Base height added to every heightmap nibble by extract(); set from the
  // host variable floor_level_host in run().
  25. __device__ int32_t FLOOR_LEVEL;
  /**
   * Reads one word from the precomputed random stream and advances the cursor.
   *
   * @param random  table of 32-bit generator outputs
   * @param index   in/out cursor into `random`; incremented by one
   * @param bits    how many of the most significant bits of the word to keep
   * @return the top `bits` bits of the consumed word
   */
  __device__ inline int32_t next(uint32_t *random, uint32_t *index, int bits)
  {
      const uint32_t slot = *index;
      *index = slot + 1;
      const uint32_t word = random[slot];
      return word >> (32 - bits);
  }
  /**
   * Draws a uniformly distributed value in [0, bound) from the random stream,
   * using the same rejection scheme as java.util.Random.nextInt(int) for
   * bounds that are not a power of two.
   *
   * @param random  table of 32-bit generator outputs
   * @param index   in/out cursor into `random`; advanced once per attempt
   * @param bound   exclusive upper bound; must be positive
   * @return a value in [0, bound)
   */
  __device__ inline int32_t next_int(uint32_t *random, uint32_t *index, int32_t bound)
  {
      int32_t bits, value;
      uint32_t wrapped;
      do {
          bits = next(random, index, 31);
          value = bits % bound;
          // The reference algorithm rejects a draw when
          // `bits - value + (bound - 1)` overflows a 32-bit signed int. Signed
          // overflow is undefined behaviour in C++, so the compiler may assume
          // it never happens and drop the retry. Do the sum in unsigned
          // arithmetic and test the sign bit instead: bits - value is in
          // [0, 2^31) and bound - 1 is in [0, 2^31), so the unsigned sum cannot
          // wrap and its top bit is set exactly when the signed sum would have
          // overflowed.
          wrapped = (uint32_t) bits - (uint32_t) value + (uint32_t) (bound - 1);
      } while (wrapped & 0x80000000u);
      return value;
  }
  /**
   * Draws a value in [0, bound) for a bound that is not known at compile time.
   *
   * When `bound` equals its lowest set bit (the power-of-two test below), the
   * result is taken from the high bits of a single 31-bit draw; every other
   * bound is delegated to next_int().
   *
   * @param random  table of 32-bit generator outputs
   * @param index   in/out cursor into `random`
   * @param bound   exclusive upper bound
   */
  __device__ inline int32_t next_int_unknown(uint32_t *random, uint32_t *index, int32_t bound)
  {
      const bool single_bit = (bound & -bound) == bound;
      if (!single_bit) {
          return next_int(random, index, bound);
      }
      const unsigned long long draw = (unsigned long long) next(random, index, 31);
      return (int32_t) ((bound * draw) >> 31);
  }
  /**
   * Reads the height stored for one column of the packed heightmap.
   *
   * Each 32-bit word holds eight 4-bit entries; `pos >> 3` selects the word
   * and `pos & 7` the nibble inside it. The stored nibble is relative to
   * FLOOR_LEVEL, which is added back before returning.
   *
   * @param heightmap  packed 4-bit heights
   * @param pos        column index
   * @return stored nibble plus FLOOR_LEVEL, truncated to 8 bits
   */
  __device__ inline uint8_t extract(const uint32_t *heightmap, uint16_t pos)
  {
      const uint32_t word = heightmap[pos >> 3];
      const int shift = (pos & 7) << 2;
      const uint32_t nibble = (word >> shift) & 15;
      return nibble + FLOOR_LEVEL;
  }
  /**
   * Adds `addend` to the 4-bit entry of column `pos` in the packed heightmap.
   *
   * The addition is done on the whole 32-bit word, so a carry out of the
   * nibble is not masked off.
   *
   * @param heightmap  packed 4-bit heights (eight per word)
   * @param pos        column index
   * @param addend     amount to add to that column
   */
  __device__ inline void increase(uint32_t *heightmap, uint16_t pos, uint8_t addend)
  {
      const int shift = (pos & 7) << 2;
      uint32_t &word = heightmap[pos >> 3];
      word += addend << shift;
  }
  // Kernel: every thread simulates one candidate RNG seed and reports it when
  // the tallest simulated column ends at least CACTUS_HEIGHT above FLOOR_LEVEL.
  // (The names suggest this models Minecraft cactus generation - that is an
  // inference from identifiers, not something this file states.)
  //
  // Launch layout (see run()): CHUNK_SIZE blocks of BLOCK_SIZE threads, 1-D.
  // The shared tables below are sized from BLOCK_SIZE and indexed with
  // threadIdx.x, so the block size must match BLOCK_SIZE.
  //
  // seed   base LCG state for the whole launch; each thread jumps it forward
  //        using the chunk table (blockIdx.x) and block table (threadIdx.x).
  // out    buffer that receives matching seeds.
  // out_n  counter of entries in `out`, bumped with atomicAdd.
  55. __global__ void crack(unsigned long long seed, unsigned long long *out, unsigned long long *out_n)
  56. {
  // Per-block stream of 32-bit generator outputs shared by all threads.
  57. __shared__ uint32_t random[BLOCK_SIZE + 1024];
  // Lookup tables indexed by stream position. A stored value is the stream
  // position to continue from; in skip_first/skip_always the value 0 means
  // "no shortcut available".
  58. __shared__ uint32_t skip_index[BLOCK_SIZE + 1024 - 100];
  59. __shared__ uint32_t skip_first[BLOCK_SIZE + 1024 - 102];
  60. __shared__ uint32_t skip_always[BLOCK_SIZE + 1024 - 102];
  61. __shared__ uint32_t floor_skip[BLOCK_SIZE + 1024 - 102];
  62. __shared__ uint8_t floor_terrain[BLOCK_SIZE + 1024 - 102];
  63. __shared__ uint32_t offset_skip[BLOCK_SIZE + 1024 - 4];
  64. __shared__ uint8_t offset_height[BLOCK_SIZE + 1024 - 4];
  // Per-thread packed heightmap: 128 words x 8 nibbles = 1024 columns,
  // addressed as x * 32 + z through extract()/increase().
  65. uint32_t heightmap[128];
  66. uint32_t random_index;
  // Jump the launch seed to this block, then to this thread.
  67. seed = (seed * chunk_mul_gpu[blockIdx.x] + chunk_add_gpu[blockIdx.x]) & RNG_MASK;
  68. seed = (seed * block_mul_gpu[threadIdx.x] + block_add_gpu[threadIdx.x]) & RNG_MASK;
  69. unsigned long long seed2 = seed;
  // `seed` is what gets reported on success. This undoes the "+ 11" and
  // multiplies by a constant that is presumably the inverse of RNG_MUL modulo
  // 2^48, i.e. one LCG step backwards - TODO confirm the constant.
  70. seed = ((seed - 11ULL) * 246154705703781ULL) & RNG_MASK;
  // Fill the shared stream: slot t gets bits 16..47 of this thread's state,
  // and each further slot t + k * BLOCK_SIZE is produced by jumping the state
  // BLOCK_SIZE steps, so consecutive slots are consecutive generator outputs.
  71. random[threadIdx.x + BLOCK_SIZE * 0] = (uint32_t) (seed2 >> 16);
  72. for (int i = threadIdx.x + BLOCK_SIZE; i < BLOCK_SIZE + 1024; i += BLOCK_SIZE) {
  73. seed2 = (seed2 * block_mul_gpu[BLOCK_SIZE] + block_add_gpu[BLOCK_SIZE]) & RNG_MASK;
  74. random[i] = (uint32_t) (seed2 >> 16);
  75. }
  76. for (int i = 0; i < 128; i++) {
  77. heightmap[i] = 0;
  78. }
  // All of random[] must be written before any thread reads it.
  79. __syncthreads();
  // Table 1: for every stream position i, precompute the height increment
  // (1..3) obtained from the nested next_int draws starting at i, and the
  // stream position right after those draws.
  80. for (int i = threadIdx.x; i < BLOCK_SIZE + 1024 - 4; i += BLOCK_SIZE) {
  81. random_index = i;
  82. uint8_t offset = next_int_unknown(random, &random_index, next_int(random, &random_index, 3) + 1) + 1;
  83. offset_height[i] = offset;
  84. offset_skip[i] = random_index;
  85. }
  86. __syncthreads();
  // Table 2: position reached from i after ten rounds of "skip six draws,
  // then skip the height draws" - i.e. the cost of one whole ten-attempt
  // inner loop when every attempt consumes the height draws.
  87. for (int i = threadIdx.x; i < BLOCK_SIZE + 1024 - 100; i += BLOCK_SIZE) {
  88. random_index = i;
  89. for (int j = 0; j < 10; j++) {
  90. random_index += 6;
  91. random_index = offset_skip[random_index];
  92. }
  93. skip_index[i] = random_index;
  94. }
  95. __syncthreads();
  // Table 3: for an outer iteration starting at i (two position draws at i and
  // i + 1, then the terrain draw at i + 2) on a column that is still at floor
  // height, precompute the terrain value, the position after drawing it, and
  // whether the whole iteration can be skipped.
  96. for (int i = threadIdx.x; i < BLOCK_SIZE + 1024 - 102; i += BLOCK_SIZE) {
  97. random_index = i + 2;
  98. int16_t terrain = next_int_unknown(random, &random_index, (FLOOR_LEVEL + 1) * 2);
  99. floor_terrain[i] = terrain;
  100. floor_skip[i] = random_index;
  // Start height is out of reach even for a column of full target height:
  // the iteration can always be skipped.
  101. if (terrain - 3 > FLOOR_LEVEL + CACTUS_HEIGHT + 1) {
  102. skip_first[i] = skip_index[random_index];
  103. skip_always[i] = skip_index[random_index];
  // Out of reach only while nothing has been placed yet.
  104. } else if (terrain - 3 > FLOOR_LEVEL + 1) {
  105. skip_first[i] = skip_index[random_index];
  106. skip_always[i] = 0;
  // Every reachable y lies within [0, FLOOR_LEVEL]: each of the ten attempts
  // consumes exactly six draws and never the height draws.
  107. } else if (terrain + 3 <= FLOOR_LEVEL && terrain - 3 >= 0) {
  108. skip_first[i] = random_index + 60;
  109. skip_always[i] = random_index + 60;
  110. } else {
  111. skip_first[i] = 0;
  112. skip_always[i] = 0;
  113. }
  114. }
  115. __syncthreads();
  // ---- Per-thread simulation: ten outer iterations starting at slot t ------
  116. random_index = threadIdx.x;
  // Column index of the tallest column so far (0 = untouched column).
  117. uint16_t best = 0;
  // Becomes true once any column has been raised.
  118. bool changed = false;
  119. int i = 0;
  // Fast-forward through leading iterations that the table says do nothing.
  120. for (; i < 10 && skip_first[random_index]; i++) {
  121. random_index = skip_first[random_index];
  122. }
  123. for (; i < 10; i++) {
  // While the heightmap is still empty the "first" shortcut remains valid.
  124. if (!changed && skip_first[random_index]) {
  125. random_index = skip_first[random_index];
  126. continue;
  127. }
  // Starting column: two 4-bit draws shifted into 8..23.
  128. uint16_t bx = next(random, &random_index, 4) + 8;
  129. uint16_t bz = next(random, &random_index, 4) + 8;
  130. uint16_t initial = bx * 32 + bz;
  131. int16_t terrain;
  // Column still at floor height: reuse the precomputed terrain draw. The
  // tables are keyed by the iteration's first slot, hence random_index - 2.
  132. if (extract(heightmap, initial) == FLOOR_LEVEL) {
  133. if (skip_always[random_index - 2]) {
  134. random_index = skip_always[random_index - 2];
  135. continue;
  136. }
  137. terrain = floor_terrain[random_index - 2];
  138. random_index = floor_skip[random_index - 2];
  139. } else {
  // Raised column: the bound depends on its current height, so draw live.
  140. terrain = next_int_unknown(random, &random_index, (extract(heightmap, initial) + 1) * 2);
  141. if (terrain + 3 <= FLOOR_LEVEL && terrain - 3 >= 0) {
  142. random_index += 60;
  143. continue;
  144. }
  145. }
  // Start height is above anything the current tallest column could accept.
  146. if (terrain - 3 > extract(heightmap, best) + 1) {
  147. random_index = skip_index[random_index];
  148. continue;
  149. }
  // Ten placement attempts around the starting column.
  150. for (int j = 0; j < 10; j++) {
  // Six draws give signed offsets. These declarations shadow the outer
  // unsigned bx/bz for the rest of this loop body.
  151. int16_t bx = next(random, &random_index, 3) - next(random, &random_index, 3);
  152. int16_t by = next(random, &random_index, 2) - next(random, &random_index, 2);
  153. int16_t bz = next(random, &random_index, 3) - next(random, &random_index, 3);
  154. uint16_t xz = initial + bx * 32 + bz;
  155. int16_t y = (int16_t) terrain + by;
  // Target cell is inside the existing column: attempt ends without
  // consuming the height draws.
  156. if (y <= extract(heightmap, xz) && y >= 0) continue;
  // Height draws are consumed from here on; use the precomputed result.
  157. uint8_t offset = offset_height[random_index];
  158. random_index = offset_skip[random_index];
  // Only grow when y is exactly one above the column's current top.
  159. if (y != extract(heightmap, xz) + 1) continue;
  // Starting a new column directly above the floor: refuse if any of the four
  // neighbouring columns has already been raised.
  // NOTE(review): bx/bz here are the signed offsets declared above, not
  // absolute coordinates, so the 0x00 / 0x1F comparisons test the offsets -
  // confirm this is intended.
  160. if (y == FLOOR_LEVEL + 1) {
  161. uint8_t mask = 0;
  162. if (bz != 0x00) mask |= extract(heightmap, xz - 1) - FLOOR_LEVEL;
  163. if (bz != 0x1F) mask |= extract(heightmap, xz + 1) - FLOOR_LEVEL;
  164. if (bx != 0x00) mask |= extract(heightmap, xz - 32) - FLOOR_LEVEL;
  165. if (bx != 0x1F) mask |= extract(heightmap, xz + 32) - FLOOR_LEVEL;
  166. if (mask) continue;
  167. }
  168. increase(heightmap, xz, offset);
  169. changed = true;
  170. if (extract(heightmap, xz) > extract(heightmap, best)) best = xz;
  171. }
  172. }
  // Report the (stepped-back) seed when the tallest column is high enough;
  // atomicAdd hands out a unique slot in `out`.
  173. if (extract(heightmap, best) - FLOOR_LEVEL >= CACTUS_HEIGHT) {
  174. out[atomicAdd((unsigned long long*) out_n, 1ULL)] = seed;
  175. }
  176. }
  // Host-side LCG jump tables, filled in main() and uploaded to the *_gpu
  // device arrays in run().
  177. unsigned long long block_add[BLOCK_SIZE + 1];
  178. unsigned long long block_mul[BLOCK_SIZE + 1];
  179. unsigned long long chunk_add[CHUNK_SIZE + 1];
  180. unsigned long long chunk_mul[CHUNK_SIZE + 1];
  // Number of seeds dispatched so far; run() adds GRID_SIZE per launch and
  // main() reads it for progress reporting and checkpointing.
  // NOTE(review): written by the worker thread and read by main() without any
  // synchronisation in the visible code.
  181. unsigned long long offset = 0;
  // LCG state that corresponds to `offset`.
  182. unsigned long long seed = 0;
  // Running total of seeds reported by the kernel.
  183. unsigned long long total_seeds = 0;
  // Seconds of run time restored from a checkpoint (0 on a fresh start).
  184. time_t elapsed_chkpoint = 0;
  // Not referenced anywhere in the visible code.
  185. std::mutex mutexcuda;
  // Worker thread that executes run().
  186. std::thread threads[1];
  // Seed range [BEGIN, END) from the command line; BEGIN is replaced by the
  // checkpointed offset on resume, BEGINOrig keeps the command-line value.
  187. unsigned long long BEGIN;
  188. unsigned long long BEGINOrig;
  189. unsigned long long END;
  // Counts main-loop iterations (about one per second) since the last
  // checkpoint.
  190. int checkpoint_now;
  // Record written to / read from "kaktpoint.txt" as raw bytes with
  // fwrite/fread.
  191. struct checkpoint_vars {
  192. unsigned long long offset;
  193. time_t elapsed_chkpoint;
  194. };
  // Host value for the device variable FLOOR_LEVEL (set by -h / --height).
  195. int32_t floor_level_host;
  /**
   * Aborts the process with a diagnostic when a CUDA runtime call has failed.
   *
   * Aborting (rather than returning) is deliberate: main() polls `offset`
   * until it reaches END, so a worker that quietly stopped would leave the
   * process waiting forever, and a worker that kept going would report an
   * empty result as if the search had succeeded.
   */
  static void cuda_check(cudaError_t status, const char *what)
  {
      if (status != cudaSuccess) {
          fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
          abort();
      }
  }

  /**
   * Worker: searches the seed range [offset, END) on one GPU and writes every
   * seed reported by the kernel to "kaktseeds.txt".
   *
   * Reads the globals `offset`, `seed`, `END`, the host jump tables and
   * `floor_level_host`; advances `offset`/`seed` by one grid per launch and
   * accumulates `total_seeds`.
   *
   * @param gpu_device  CUDA device ordinal to run on
   */
  void run(int gpu_device)
  {
      // When main() restored a checkpoint (BEGIN moved away from the
      // command-line value) the file already holds results from the earlier
      // part of the range; truncating it would lose them, so append instead.
      // A batch that was flushed after the last checkpoint may then appear
      // twice, which is harmless compared with losing seeds.
      const char *mode = (BEGIN != BEGINOrig) ? "a+" : "w+";
      FILE* kaktseeds = fopen("kaktseeds.txt", mode);
      if (!kaktseeds) {
          perror("kaktseeds.txt");
          abort();
      }

      unsigned long long *out;
      unsigned long long *out_n;
      cuda_check(cudaSetDevice(gpu_device), "cudaSetDevice");
      // Managed memory: the host reads the results directly after the sync.
      cuda_check(cudaMallocManaged(&out, GRID_SIZE * sizeof(*out)), "cudaMallocManaged(out)");
      cuda_check(cudaMallocManaged(&out_n, sizeof(*out_n)), "cudaMallocManaged(out_n)");
      cuda_check(cudaMemcpyToSymbol(block_add_gpu, block_add, (BLOCK_SIZE + 1) * sizeof(*block_add)), "upload block_add");
      cuda_check(cudaMemcpyToSymbol(block_mul_gpu, block_mul, (BLOCK_SIZE + 1) * sizeof(*block_mul)), "upload block_mul");
      cuda_check(cudaMemcpyToSymbol(chunk_add_gpu, chunk_add, (CHUNK_SIZE + 1) * sizeof(*chunk_add)), "upload chunk_add");
      cuda_check(cudaMemcpyToSymbol(chunk_mul_gpu, chunk_mul, (CHUNK_SIZE + 1) * sizeof(*chunk_mul)), "upload chunk_mul");
      cuda_check(cudaMemcpyToSymbol(FLOOR_LEVEL, &floor_level_host, sizeof(int32_t)), "upload FLOOR_LEVEL");

      while (true) {
          *out_n = 0;
          if (offset >= END) break;

          // The kernel receives the state one step after `seed`.
          unsigned long long seed_gpu = (seed * RNG_MUL + RNG_ADD) & RNG_MASK;
          crack<<<CHUNK_SIZE, BLOCK_SIZE>>>(seed_gpu, out, out_n);
          // Launch-configuration errors are only visible through
          // cudaGetLastError(); faults during execution surface at the sync.
          cuda_check(cudaGetLastError(), "crack launch");
          offset += GRID_SIZE;
          seed = (seed * chunk_mul[CHUNK_SIZE] + chunk_add[CHUNK_SIZE]) & RNG_MASK;

          cuda_check(cudaDeviceSynchronize(), "crack execution");

          total_seeds += *out_n;
          for (unsigned long long i = 0; i < *out_n; i++){
              fprintf(kaktseeds,"s: %llu,\n", out[i]);
          }
          // Flush per batch so results survive an abrupt termination.
          fflush(kaktseeds);
      }

      fclose(kaktseeds);
      cuda_check(cudaFree(out_n), "cudaFree(out_n)");
      cuda_check(cudaFree(out), "cudaFree(out)");
  }
  /**
   * Entry point: builds the LCG jump tables, parses the command line, restores
   * a checkpoint if one exists, starts the GPU worker thread and then reports
   * progress / writes checkpoints about once per second until the range is
   * done.
   *
   * Options (each takes one value):
   *   -d / --device  CUDA device ordinal
   *   -s / --start   first seed offset
   *   -e / --end     end seed offset (exclusive)
   *   -h / --height  floor level
   *
   * All BOINC calls are compiled only when BOINC is defined; otherwise plain
   * stdio equivalents are used so the program also builds stand-alone.
   */
  int main(int argc, char *argv[])
  {
  #ifdef BOINC
      BOINC_OPTIONS options;
      boinc_options_defaults(options);
      options.normal_thread_priority = true;
      boinc_init_options(&options);
  #endif

      // block_*[i] advances a state by i steps; chunk_*[i] by i * BLOCK_SIZE.
      block_add[0] = 0;
      block_mul[0] = 1;
      for (unsigned long long i = 0; i < BLOCK_SIZE; i++) {
          block_add[i + 1] = (block_add[i] * RNG_MUL + RNG_ADD) & RNG_MASK;
          block_mul[i + 1] = (block_mul[i] * RNG_MUL) & RNG_MASK;
      }
      chunk_add[0] = 0;
      chunk_mul[0] = 1;
      for (unsigned long long i = 0; i < CHUNK_SIZE; i++) {
          chunk_add[i + 1] = (chunk_add[i] * block_mul[BLOCK_SIZE] + block_add[BLOCK_SIZE]) & RNG_MASK;
          chunk_mul[i + 1] = (chunk_mul[i] * block_mul[BLOCK_SIZE]) & RNG_MASK;
      }

      int gpu_device = 0;
      for (int i = 1; i < argc; i += 2) {
          const char *param = argv[i];
          const bool is_device = strcmp(param, "-d") == 0 || strcmp(param, "--device") == 0;
          const bool is_start  = strcmp(param, "-s") == 0 || strcmp(param, "--start") == 0;
          const bool is_end    = strcmp(param, "-e") == 0 || strcmp(param, "--end") == 0;
          const bool is_height = strcmp(param, "-h") == 0 || strcmp(param, "--height") == 0;
          if (!is_device && !is_start && !is_end && !is_height) {
              fprintf(stderr,"Unknown parameter: %s\n", param);
              continue;
          }
          // argv[argc] is a null pointer; never hand it to atoi/sscanf.
          if (i + 1 >= argc) {
              fprintf(stderr,"Missing value for parameter: %s\n", param);
              break;
          }
          const char *value = argv[i + 1];
          if (is_device) {
              gpu_device = atoi(value);
          } else if (is_start) {
              sscanf(value, "%llu", &BEGIN);
          } else if (is_end) {
              sscanf(value, "%llu", &END);
          } else {
              // floor_level_host is 32 bits wide; scanning it with "%llu"
              // would write 8 bytes and clobber the neighbouring global.
              int parsed = 0;
              if (sscanf(value, "%d", &parsed) == 1) {
                  floor_level_host = (int32_t) parsed;
              }
          }
      }
      BEGINOrig = BEGIN;

      // ---- Restore checkpoint, if any ---------------------------------------
  #ifdef BOINC
      FILE *checkpoint_in = boinc_fopen("kaktpoint.txt", "rb");
  #else
      FILE *checkpoint_in = fopen("kaktpoint.txt", "rb");
  #endif
      if (!checkpoint_in) {
          fprintf(stderr,"No checkpoint to load\n");
      } else {
  #ifdef BOINC
          boinc_begin_critical_section();
  #endif
          struct checkpoint_vars data_store;
          // Only trust the record when it was read completely.
          if (fread(&data_store, sizeof(data_store), 1, checkpoint_in) == 1) {
              BEGIN = data_store.offset;
              elapsed_chkpoint = data_store.elapsed_chkpoint;
              // time_t is not guaranteed to be int-sized; print via long long.
              fprintf(stderr,"Checkpoint loaded, task time %lld s, seed pos: %llu\n", (long long) elapsed_chkpoint, BEGIN);
          } else {
              fprintf(stderr,"Checkpoint unreadable, ignoring it\n");
          }
          fclose(checkpoint_in);
  #ifdef BOINC
          boinc_end_critical_section();
  #endif
      }

      // Fast-forward the generator to BEGIN: whole grids first, then single steps.
      for (; offset + GRID_SIZE <= BEGIN; offset += GRID_SIZE)
          seed = (seed * chunk_mul[CHUNK_SIZE] + chunk_add[CHUNK_SIZE]) & RNG_MASK;
      for (; offset + 1 <= BEGIN; offset += 1)
          seed = (seed * RNG_MUL + RNG_ADD) & RNG_MASK;

  #ifdef BOINC
      APP_INIT_DATA aid;
      boinc_get_init_data(aid);
      if (aid.gpu_device_num >= 0) {
          gpu_device = aid.gpu_device_num;
          fprintf(stderr,"boinc gpu %i gpuindex: %i \n", aid.gpu_device_num, gpu_device);
      } else {
          fprintf(stderr,"stndalone gpuindex %i \n", gpu_device);
      }
  #endif

      threads[0] = std::thread(run, gpu_device);
      checkpoint_now = 0;
      time_t start_time = time(NULL);

      // ---- Progress / checkpoint loop (about one iteration per second) ------
      while (offset < END) {
          using namespace std::chrono_literals;
          std::this_thread::sleep_for(1s);
          time_t elapsed = time(NULL) - start_time;
  #ifdef BOINC
          unsigned long long count = offset - BEGIN;
          double frac = (double) count / (double) (END - BEGIN);
          boinc_fraction_done(frac);
  #endif
          checkpoint_now++;
          // Checkpoint every 30 iterations, or earlier when BOINC asks for it.
  #ifdef BOINC
          const bool checkpoint_due = checkpoint_now >= 30 || boinc_time_to_checkpoint();
  #else
          const bool checkpoint_due = checkpoint_now >= 30;
  #endif
          if (checkpoint_due) {
  #ifdef BOINC
              boinc_begin_critical_section(); // Boinc should not interrupt this
              boinc_delete_file("kaktpoint.txt");
              FILE *checkpoint_out = boinc_fopen("kaktpoint.txt", "wb");
  #else
              remove("kaktpoint.txt");
              FILE *checkpoint_out = fopen("kaktpoint.txt", "wb");
  #endif
              if (checkpoint_out) {
                  struct checkpoint_vars data_store;
                  data_store.offset = offset;
                  data_store.elapsed_chkpoint = elapsed_chkpoint + elapsed;
                  fwrite(&data_store, sizeof(data_store), 1, checkpoint_out);
                  fclose(checkpoint_out);
              } else {
                  fprintf(stderr,"Could not write checkpoint\n");
              }
              checkpoint_now = 0;
  #ifdef BOINC
              boinc_end_critical_section();
              boinc_checkpoint_completed(); // Checkpointing completed
  #endif
          }
      }

  #ifdef BOINC
      boinc_begin_critical_section();
  #endif
      // The worker may still be finishing its last batch.
      for (std::thread& thread : threads)
          thread.join();

      time_t elapsed = time(NULL) - start_time;
      unsigned long long count = offset - BEGIN;
      double done = (double) count / 1000000.0;
      double speed = done / (double) elapsed;
      fprintf(stderr, "\nSpeed: %.2lfm/s\n", speed );
      fprintf(stderr, "Done\n");
      fprintf(stderr, "Processed: %llu seeds in %.2lfs seconds\n", END - BEGINOrig, (double) elapsed_chkpoint + (double) elapsed );
  #ifdef BOINC
      boinc_end_critical_section();
      boinc_finish(0);
  #endif
      return 0;
  }