// memcpy.c
#include <stdio.h>
#include <stddef.h>
// keep GCC from pattern-matching the copy loops below into calls to libc
// memcpy (-ftree-loop-distribute-patterns), which would recurse if one of
// these functions were installed as memcpy itself
#define NO_GCC_LIBC_CALL __attribute__ ((__optimize__ ("-fno-tree-loop-distribute-patterns")))
#include <immintrin.h>
#include <emmintrin.h>
typedef unsigned long u64;
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
extern long cacheSize;
extern long halfCacheSize;
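// Illustrative sketch only (an assumption, not from this file): the two
// globals above are defined in another translation unit; on glibc/Linux they
// could be initialized along these lines:
//
//   #include <unistd.h>
//   long cacheSize;
//   long halfCacheSize;
//   __attribute__((constructor)) static void init_cache_size(void) {
//       long l3 = sysconf(_SC_LEVEL3_CACHE_SIZE); // glibc extension
//       cacheSize = l3 > 0 ? l3 : 8 * 1024 * 1024; // fallback: assume 8 MiB
//       halfCacheSize = cacheSize / 2;
//   }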
// naive C
NO_GCC_LIBC_CALL void* memcpy_1(void* restrict dest, const void* restrict src, size_t n) {
    char* restrict d = (char*)dest;
    const char* restrict s = (const char*)src;
    while(n--) *d++ = *s++;
    return dest;
}
// unaligned avx
NO_GCC_LIBC_CALL void* memcpy_2(void* restrict dest, const void* restrict src, size_t n) {
    char* restrict d = (char*)dest;
    const char* restrict s = (const char*)src;
    size_t n32s = n / 32;
    for(size_t j = n32s; j; j--) {
        __m256i tmp = _mm256_loadu_si256((const __m256i*)s);
        _mm256_storeu_si256((__m256i*)d, tmp);
        s += 32;
        d += 32;
    }
    n -= n32s * 32;
    size_t n8s = n / 8;
    n -= n8s * 8;
    switch(n8s) { // deliberate fallthrough
    case 3: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 2: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 1: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 0: break;
    }
    switch(n) { // deliberate fallthrough
    case 7: *d++ = *s++;
    case 6: *d++ = *s++;
    case 5: *d++ = *s++;
    case 4: *d++ = *s++;
    case 3: *d++ = *s++;
    case 2: *d++ = *s++;
    case 1: *d++ = *s++;
    case 0: break;
    }
    return dest;
}
// unaligned avx, alternate close
NO_GCC_LIBC_CALL void* memcpy_2b(void* restrict dest, const void* restrict src, size_t n) {
    char* restrict d = (char*)dest;
    const char* restrict s = (const char*)src;
    if(n < 32) { // the overlapping close below would read before the buffers
        while(n--) *d++ = *s++;
        return dest;
    }
    size_t n32s = n / 32;
    for(size_t j = n32s; j; j--) {
        __m256i tmp = _mm256_loadu_si256((const __m256i*)s);
        _mm256_storeu_si256((__m256i*)d, tmp);
        s += 32;
        d += 32;
    }
    n -= n32s * 32;
    if(n == 0) return dest;
    // close with one unaligned 32-byte copy that overlaps the block above
    s = s - 32 + n;
    d = d - 32 + n;
    __m256i tmp2 = _mm256_loadu_si256((const __m256i*)s);
    _mm256_storeu_si256((__m256i*)d, tmp2);
    return dest;
}
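// Worked example of the overlapping close (illustrative numbers): for n = 100,
// the loop above copies 3 * 32 = 96 bytes (offsets 0..95), leaving n = 4. The
// close rewinds to offset 96 - 32 + 4 = 68 and copies the 32 bytes at offsets
// 68..99, so the final 4 bytes land correctly and bytes 68..95 are merely
// rewritten with the same data.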
// aligned avx, requires 32-byte aligned src and dest
NO_GCC_LIBC_CALL void* memcpy_3(void* restrict dest, const void* restrict src, size_t n) {
    char* restrict d = (char*)dest;
    const char* restrict s = (const char*)src;
    size_t n32s = n / 32;
    for(size_t j = n32s; j; j--) {
        __m256i tmp = _mm256_load_si256((const __m256i*)s);
        _mm256_store_si256((__m256i*)d, tmp);
        s += 32;
        d += 32;
    }
    n -= n32s * 32;
    size_t n8s = n / 8;
    n -= n8s * 8;
    switch(n8s) { // deliberate fallthrough
    case 3: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 2: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 1: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 0: break;
    }
    switch(n) { // deliberate fallthrough
    case 7: *d++ = *s++;
    case 6: *d++ = *s++;
    case 5: *d++ = *s++;
    case 4: *d++ = *s++;
    case 3: *d++ = *s++;
    case 2: *d++ = *s++;
    case 1: *d++ = *s++;
    case 0: break;
    }
    return dest;
}
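// Usage sketch for the aligned variant (illustrative, not from this file):
// both buffers must be 32-byte aligned, e.g. from C11 aligned_alloc, whose
// size argument must be a multiple of the alignment:
//
//   #include <stdlib.h>
//   char* a = aligned_alloc(32, 4096);
//   char* b = aligned_alloc(32, 4096);
//   if(a && b) memcpy_3(b, a, 4096);
//   free(a); free(b);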
// uncached aligned avx; the non-temporal stores below also require dest to
// share src's 32-byte (mis)alignment, or _mm256_stream_si256 will fault
NO_GCC_LIBC_CALL void* memcpy_4(void* restrict dest, const void* restrict src, size_t n) {
    char* restrict d = (char*)dest;
    const char* restrict s = (const char*)src;
    u64 low = 31ul;
    size_t begin = (0ul - (u64)s) & low; // bytes to copy before s is 32-byte aligned
    if(begin > n) begin = n;
    size_t b8s = begin / 8;
    switch(b8s) { // deliberate fallthrough
    case 3: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 2: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 1: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 0: break;
    }
    switch(begin - b8s * 8) { // deliberate fallthrough
    case 7: *d++ = *s++;
    case 6: *d++ = *s++;
    case 5: *d++ = *s++;
    case 4: *d++ = *s++;
    case 3: *d++ = *s++;
    case 2: *d++ = *s++;
    case 1: *d++ = *s++;
    case 0: break;
    }
    n -= begin;
    size_t n32s = n / 32;
    for(size_t j = n32s; j; j--) {
        __m256i tmp = _mm256_stream_load_si256((__m256i*)s);
        _mm256_stream_si256((__m256i*)d, tmp);
        s += 32;
        d += 32;
    }
    _mm_sfence(); // non-temporal stores are weakly ordered; fence before returning
    n -= n32s * 32;
    size_t n8s = n / 8;
    n -= n8s * 8;
    switch(n8s) { // deliberate fallthrough
    case 3: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 2: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 1: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 0: break;
    }
    switch(n) { // deliberate fallthrough
    case 7: *d++ = *s++;
    case 6: *d++ = *s++;
    case 5: *d++ = *s++;
    case 4: *d++ = *s++;
    case 3: *d++ = *s++;
    case 2: *d++ = *s++;
    case 1: *d++ = *s++;
    case 0: break;
    }
    return dest;
}
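// nb: _mm256_stream_load_si256 only actually bypasses the cache on
// write-combining (WC) memory such as mapped device memory; on ordinary
// write-back memory it behaves like a plain aligned load, so memcpy_4's
// cache-skipping effect comes mainly from the _mm256_stream_si256 stores.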
// auto-aligning avx
NO_GCC_LIBC_CALL void* memcpy_5(void* restrict dest, const void* restrict src, size_t n) {
    char* restrict d = (char*)dest;
    const char* restrict s = (const char*)src;
    u64 low = 31ul;
    size_t begin = (0ul - (u64)s) & low; // bytes to copy before s is 32-byte aligned
    if(begin > n) begin = n;
    size_t b8s = begin / 8;
    switch(b8s) { // deliberate fallthrough
    case 3: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 2: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 1: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 0: break;
    }
    switch(begin - b8s * 8) { // deliberate fallthrough
    case 7: *d++ = *s++;
    case 6: *d++ = *s++;
    case 5: *d++ = *s++;
    case 4: *d++ = *s++;
    case 3: *d++ = *s++;
    case 2: *d++ = *s++;
    case 1: *d++ = *s++;
    case 0: break;
    }
    n -= begin;
    size_t n32s = n / 32;
    for(size_t j = n32s; j; j--) {
        __m256i tmp = _mm256_load_si256((const __m256i*)s);
        _mm256_storeu_si256((__m256i*)d, tmp);
        s += 32;
        d += 32;
    }
    n -= n32s * 32;
    size_t n8s = n / 8;
    n -= n8s * 8;
    switch(n8s) { // deliberate fallthrough
    case 3: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 2: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 1: *((u64*)d) = *((u64*)s); d += 8; s += 8;
    case 0: break;
    }
    switch(n) { // deliberate fallthrough
    case 7: *d++ = *s++;
    case 6: *d++ = *s++;
    case 5: *d++ = *s++;
    case 4: *d++ = *s++;
    case 3: *d++ = *s++;
    case 2: *d++ = *s++;
    case 1: *d++ = *s++;
    case 0: break;
    }
    return dest;
}
// auto-aligning avx, unaligned beginning; switch-based close for small sizes
NO_GCC_LIBC_CALL void* memcpy_6_slower(void* restrict dest, const void* restrict src, size_t n) {
    char* restrict d = (char*)dest;
    const char* restrict s = (const char*)src;
    switch(n) { // each case chain copies exactly n bytes, largest chunk first
#define do16 _mm_storeu_si128((__m128i*)d, _mm_loadu_si128((const __m128i*)s)); d += 16; s += 16;
#define do8 *((u64*)d) = *((u64*)s); d += 8; s += 8;
#define do4 *((u32*)d) = *((u32*)s); d += 4; s += 4;
#define do2 *((u16*)d) = *((u16*)s); d += 2; s += 2;
#define do1 *((u8*)d) = *((u8*)s);
    case 31: do16
    case 15: do8
    case 7: do4
    case 3: do2
    case 1: do1 break;
    case 30: do16
    case 14: do8
    case 6: do4
    case 2: do2 break;
    case 29: do16
    case 13: do8
    case 5: do4 do1 break;
    case 28: do16
    case 12: do8
    case 4: do4 break;
    case 27: do16
    case 11: do8 do2 do1 break;
    case 26: do16
    case 10: do2
    case 8: do8 break;
    case 25: do16
    case 9: do8 do1 break;
    case 24: do16 do8 break;
    case 23: do16 do4 do2 do1 break;
    case 22: do16 do4 do2 break;
    case 21: do16 do4 do1 break;
    case 20: do16 do4 break;
    case 19: do16 do2 do1 break;
    case 18: do16 do2 break;
    case 17: do16 do1 break;
    case 16: do16 break;
    case 0: break;
    default: goto BIGGER; // n >= 32 from here on
    }
    return dest;
BIGGER:;
    u64 low = 31ul;
    size_t begin = (0ul - (u64)s) & low; // bytes until s is 32-byte aligned
    // one unaligned 32-byte copy covers the begin bytes; the overlap with the
    // aligned loop below just rewrites the same data
    __m256i tmp = _mm256_loadu_si256((const __m256i*)s);
    _mm256_storeu_si256((__m256i*)d, tmp);
    s += begin;
    d += begin;
    n -= begin;
    size_t n32s = n / 32;
    if(n >= (size_t)halfCacheSize) { // skip the cache
        for(size_t j = n32s; j; j--) {
            __m256i tmp = _mm256_stream_load_si256((__m256i*)s);
            _mm256_storeu_si256((__m256i*)d, tmp);
            s += 32;
            d += 32;
        }
    }
    else { // don't skip the cache
        for(size_t j = n32s; j; j--) {
            __m256i tmp = _mm256_load_si256((const __m256i*)s);
            _mm256_storeu_si256((__m256i*)d, tmp);
            s += 32;
            d += 32;
        }
    }
    n -= n32s * 32;
    if(n == 0) return dest;
    // overlapping unaligned 32-byte close (safe: at least 32 bytes were copied)
    s = s - 32 + n;
    d = d - 32 + n;
    __m256i tmp2 = _mm256_loadu_si256((const __m256i*)s);
    _mm256_storeu_si256((__m256i*)d, tmp2);
    return dest;
}
// auto-aligning avx, unaligned beginning
NO_GCC_LIBC_CALL void* memcpy_6(void* restrict dest, const void* restrict src, size_t n) {
    char* restrict d = (char*)dest;
    const char* restrict s = (const char*)src;
    if(n < 32) { // faster than while(n--) *d++ = *s++;
        if(n >= 16) { _mm_storeu_si128((__m128i*)d, _mm_loadu_si128((const __m128i*)s)); d += 16; s += 16; n -= 16; }
        if(n >= 8) { *((u64*)d) = *((u64*)s); d += 8; s += 8; n -= 8; }
        if(n >= 4) { *((u32*)d) = *((u32*)s); d += 4; s += 4; n -= 4; }
        if(n >= 2) { *((u16*)d) = *((u16*)s); d += 2; s += 2; n -= 2; }
        if(n >= 1) { *((u8*)d) = *((u8*)s); }
        return dest;
    }
    u64 low = 31ul;
    size_t n32s;
    int salign = (0ul - (u64)s) & low; // bytes until s is 32-byte aligned
    // TODO: check the size where it's faster
    if(n >= (size_t)halfCacheSize) { // skip the cache
        int dalign = (0ul - (u64)d) & low;
        if(salign == dalign) { // dest can be aligned too: use streaming stores
            size_t begin = salign;
            // TODO: check if it's faster to switch on the size of begin for smaller operations
            _mm256_storeu_si256((__m256i*)d, _mm256_loadu_si256((const __m256i*)s));
            s += begin;
            d += begin;
            n -= begin;
            n32s = n / 32;
            for(size_t j = n32s; j; j--) {
                __m256i tmp = _mm256_stream_load_si256((__m256i*)s);
                _mm256_stream_si256((__m256i*)d, tmp);
                s += 32;
                d += 32;
            }
            _mm_sfence(); // non-temporal stores are weakly ordered; fence before returning
        }
        else {
            size_t begin = salign; // TODO: check whether it's more important to optimize for loads or stores
            // TODO: check whether it even matters at all -- UPDATE: looks like it does, and GCC doesn't handle it well
            // TODO: verify that these functions all actually work
            _mm256_storeu_si256((__m256i*)d, _mm256_loadu_si256((const __m256i*)s));
            s += begin;
            d += begin;
            n -= begin;
            n32s = n / 32;
            for(size_t j = n32s; j; j--) {
                __m256i tmp = _mm256_stream_load_si256((__m256i*)s);
                _mm256_storeu_si256((__m256i*)d, tmp);
                s += 32;
                d += 32;
            }
        }
    }
    else { // don't skip the cache
        size_t begin = salign;
        _mm256_storeu_si256((__m256i*)d, _mm256_loadu_si256((const __m256i*)s));
        s += begin;
        d += begin;
        n -= begin;
        n32s = n / 32;
        for(size_t j = n32s; j; j--) {
            __m256i tmp = _mm256_load_si256((const __m256i*)s);
            _mm256_storeu_si256((__m256i*)d, tmp);
            s += 32;
            d += 32;
        }
    }
    n -= n32s * 32;
    if(n == 0) return dest;
    // overlapping unaligned 32-byte close (safe: n >= 32 on this path)
    s = s - 32 + n;
    d = d - 32 + n;
    __m256i tmp2 = _mm256_loadu_si256((const __m256i*)s);
    _mm256_storeu_si256((__m256i*)d, tmp2);
    return dest;
}
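// Hedged self-test sketch (an addition, not part of the original file):
// building with -DMEMCPY_SELFTEST -mavx2 yields a main() that checks each
// variant against libc memcpy across sizes and misalignments, one way to
// tackle the "verify that these functions all actually work" TODO above.
// memcpy_3 is only exercised with 32-byte aligned pointers and memcpy_4 only
// with matching misalignments, per the preconditions noted above.
#ifdef MEMCPY_SELFTEST
#include <string.h>
#include <stdlib.h>
long cacheSize = 128 * 1024;    // assumed values for the test build,
long halfCacheSize = 64 * 1024; // set low so the streaming paths get exercised
typedef void* (*copy_fn)(void* restrict, const void* restrict, size_t);
static int check(const char* name, copy_fn f, size_t n, size_t soff, size_t doff) {
    size_t cap = (n + 64 + 31) & ~(size_t)31; // aligned_alloc wants a multiple of 32
    char* src = aligned_alloc(32, cap);
    char* dst = aligned_alloc(32, cap);
    char* ref = malloc(cap);
    if(!src || !dst || !ref) { free(src); free(dst); free(ref); return 0; }
    for(size_t i = 0; i < cap; i++) { src[i] = (char)(i * 131 + 7); dst[i] = ref[i] = 0; }
    memcpy(ref + doff, src + soff, n);
    f(dst + doff, src + soff, n);
    int ok = memcmp(dst, ref, cap) == 0; // also catches writes outside [doff, doff+n)
    if(!ok) printf("FAIL %s n=%zu soff=%zu doff=%zu\n", name, n, soff, doff);
    free(src); free(dst); free(ref);
    return ok;
}
int main(void) {
    size_t sizes[] = {0, 1, 7, 31, 32, 33, 100, 4096, 100000};
    for(size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
        size_t n = sizes[i];
        for(size_t off = 0; off < 32; off += 5) {
            check("memcpy_1", memcpy_1, n, off, off);
            check("memcpy_2", memcpy_2, n, off, 3);
            check("memcpy_2b", memcpy_2b, n, off, 3);
            check("memcpy_4", memcpy_4, n, off, off); // matching misalignment only
            check("memcpy_5", memcpy_5, n, off, 3);
            check("memcpy_6_slower", memcpy_6_slower, n, off, 3);
            check("memcpy_6", memcpy_6, n, off, 3);
        }
        check("memcpy_3", memcpy_3, n, 0, 0); // 32-byte aligned inputs only
    }
    puts("done");
    return 0;
}
#endif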