memcpy_shootout.c 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. #include <stdlib.h>
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include <time.h>
  5. #include <stdarg.h>
  6. #include <unistd.h>
  7. void* memcpy_1(void* restrict dest, const void* restrict src, size_t n);
  8. void* memcpy_2(void* restrict dest, const void* restrict src, size_t n);
  9. void* memcpy_2b(void* restrict dest, const void* restrict src, size_t n);
  10. void* memcpy_3(void* restrict dest, const void* restrict src, size_t n);
  11. void* memcpy_4(void* restrict dest, const void* restrict src, size_t n);
  12. void* memcpy_5(void* restrict dest, const void* restrict src, size_t n);
  13. void* memcpy_6(void* restrict dest, const void* restrict src, size_t n);
  14. void* memcpy_6_slower(void* restrict dest, const void* restrict src, size_t n);
  15. #define FN_LIST \
  16. X(memcpy, glibc) \
  17. X(memcpy_1, naive) \
  18. X(memcpy_6, xaligned_avx) \
  19. // X(memcpy_2b, unaligned_avx)
  20. // X(memcpy_2, unaligned_avx_a)
  21. // X(memcpy_5, aligned_avx)
  22. // X(memcpy_4, avx_uncached)
  23. enum {
  24. #define X(a,...) ORD_##a,
  25. FN_LIST
  26. #undef X
  27. DEPTH
  28. };
  29. char* fn_names[] = {
  30. #define X(a,b,...) #b,
  31. FN_LIST
  32. #undef X
  33. };
  34. long cacheSize;
  35. long halfCacheSize;
  36. char* dest; // to help keep gcc from optimizing it away
  37. char* cachebuster;
  38. double getCurrentTime() { // in seconds
  39. double now;
  40. struct timespec ts;
  41. static double offset = 0;
  42. // CLOCK_MONOTONIC_RAW is linux-specific.
  43. clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
  44. now = (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
  45. if(offset == 0) offset = now;
  46. return now - offset;
  47. }
  48. // touch a large, contiguous section of memory that will overlap with the entire cache (hopefully, assuming there's no vma magic here)
  49. void reset_cache() {
  50. for(size_t i = 0; i < cacheSize; i++) {
  51. cachebuster[i]++;
  52. }
  53. }
  54. int main(int argc, char* argv[]) {
  55. srand(101);
  56. cacheSize = sysconf(_SC_LEVEL3_CACHE_SIZE);
  57. long l4 = sysconf(_SC_LEVEL4_CACHE_SIZE);
  58. if(l4 > cacheSize) cacheSize = l4;
  59. halfCacheSize = cacheSize / 2;
  60. cachebuster = calloc(1, cacheSize);
  61. fprintf(stderr, "cache size: %ld\n", cacheSize);
  62. int nsizes = 8;
  63. // create some test memory chunks
  64. int* sizes = malloc(sizeof(*sizes) * nsizes);
  65. char** datas = malloc(sizeof(*datas) * nsizes);
  66. long maxSize = 1024 * 1024 * 16;
  67. for(int i = 0; i < nsizes; i++) {
  68. // sizes[i] = 8 + ((double)rand() / (double)RAND_MAX) * 1024*1024;
  69. sizes[i] = ((double)(i + 1) / (double)nsizes) * maxSize;
  70. datas[i] = aligned_alloc(32, sizes[i]);
  71. }
  72. // fill in some dummy data
  73. for(int i = 0; i < nsizes; i++) {
  74. for(int j = 0; j < sizes[i]; j++) {
  75. datas[i][j] = rand();
  76. }
  77. fprintf(stderr, "\rinitializing: %d / %d", i + 1, nsizes);
  78. fflush(stderr);
  79. }
  80. fprintf(stderr, "\n ");
  81. // somewhere big enough to hold any of the chunks
  82. dest = aligned_alloc(32, maxSize+8+ 32);
  83. //
  84. // for(int i = 0; i < nsizes; i++) {
  85. // for(int j = 0; j < 32; j++) {
  86. // memcpy_5(dest + j, datas[i] + j, sizes[i]);
  87. // }
  88. // }
  89. //
  90. //
  91. int reps = argc < 100 ? 1000 : argc;
  92. double* finals = calloc(1, sizeof(*finals) * DEPTH * nsizes);
  93. for(int i = 0; i < nsizes; i++) {
  94. for(int al = 0; al < 1; al++) {
  95. fprintf(stderr, "\rtesting: %d / %d", al + 1 + (i * 32), nsizes * 32);
  96. fflush(stderr);
  97. #define X(name,...) \
  98. reset_cache(); \
  99. double start_##name = getCurrentTime(); \
  100. \
  101. for(int k = 0; k < reps; k++) { \
  102. name(dest, datas[i], sizes[i]); \
  103. } \
  104. \
  105. double end_##name = getCurrentTime(); \
  106. double final_##name = end_##name - start_##name; \
  107. finals[ORD_##name + DEPTH * i] += final_##name; \
  108. //printf(#name ": %d bytes, %.15fs\n", sizes[i], final_##name);
  109. FN_LIST
  110. #undef X
  111. }
  112. }
  113. fprintf(stderr, "\n ");
  114. double totals[DEPTH];
  115. for(int i = 0; i < DEPTH; i++) {
  116. totals[i] = 0;
  117. }
  118. fprintf(stderr, "sizes: ");
  119. for(int i = 0; i < nsizes; i++) {
  120. fprintf(stderr, "%d, ", sizes[i]);
  121. for(int j = 0; j < DEPTH; j++) {
  122. totals[j] += finals[j + DEPTH * i];
  123. }
  124. }
  125. fprintf(stderr, "\n\n");
  126. int maxname = 0;
  127. for(int i = 0; i < DEPTH; i++) {
  128. if(strlen(fn_names[i]) > maxname) maxname = strlen(fn_names[i]);
  129. }
  130. for(int i = 0; i < DEPTH; i++) {
  131. fprintf(stderr, "%s: %.*s%.15f\n", fn_names[i], (int)(maxname - strlen(fn_names[i])), " ", totals[i]);
  132. }
  133. // sizes header, first col empty
  134. printf(",");
  135. for(int i = 0; i < nsizes; i++) {
  136. printf("%d%s", sizes[i], i == nsizes - 1 ? "" : ",");
  137. }
  138. printf("\n");
  139. // time data
  140. for(int j = 0; j < DEPTH; j++) {
  141. printf("\"%s\",", fn_names[j]);
  142. for(int i = 0; i < nsizes; i++) {
  143. printf("%.15f%s", finals[j + DEPTH * i], i == nsizes - 1 ? "" : ",");
  144. }
  145. printf("\n");
  146. }
  147. // #define X(name, desc,...) printf("%s: %.15f\n", #desc, totals[ORD_##name]);
  148. // FN_LIST
  149. // #undef X
  150. return 0;
  151. }