use-optimized-memcpy-memset.patch

  1. From 1299e4c3ef43ba7509a104fc8cdeea90c61dc7cc Mon Sep 17 00:00:00 2001
  2. From: 12101111 <w12101111@gmail.com>
  3. Date: Wed, 9 Mar 2022 23:32:02 +0800
  4. Subject: [PATCH] use optimized memcpy & memset
  5. ---
  6. src/string/aarch64/memcpy.S | 287 ++++++------
  7. src/string/aarch64/memmove.S | 1 +
  8. src/string/x86_64/memcpy.S | 487 +++++++++++++++++++++
  9. src/string/x86_64/memcpy.s | 25 --
  10. src/string/x86_64/{memmove.s => memmove.S} | 5 +
  11. src/string/x86_64/memset.S | 316 +++++++++++++
  12. src/string/x86_64/memset.s | 72 ---
  13. 7 files changed, 962 insertions(+), 231 deletions(-)
  14. create mode 100644 src/string/aarch64/memmove.S
  15. create mode 100644 src/string/x86_64/memcpy.S
  16. delete mode 100644 src/string/x86_64/memcpy.s
  17. rename src/string/x86_64/{memmove.s => memmove.S} (80%)
  18. create mode 100644 src/string/x86_64/memset.S
  19. delete mode 100644 src/string/x86_64/memset.s
  20. diff --git a/src/string/aarch64/memcpy.S b/src/string/aarch64/memcpy.S
  21. index 48bb8a8d..272a727e 100644
  22. --- a/src/string/aarch64/memcpy.S
  23. +++ b/src/string/aarch64/memcpy.S
  24. @@ -7,38 +7,38 @@
  25. /* Assumptions:
  26. *
  27. - * ARMv8-a, AArch64, unaligned accesses.
  28. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  29. *
  30. */
  31. -#define dstin x0
  32. -#define src x1
  33. -#define count x2
  34. -#define dst x3
  35. -#define srcend x4
  36. -#define dstend x5
  37. -#define A_l x6
  38. -#define A_lw w6
  39. -#define A_h x7
  40. -#define B_l x8
  41. -#define B_lw w8
  42. -#define B_h x9
  43. -#define C_l x10
  44. -#define C_lw w10
  45. -#define C_h x11
  46. -#define D_l x12
  47. -#define D_h x13
  48. -#define E_l x14
  49. -#define E_h x15
  50. -#define F_l x16
  51. -#define F_h x17
  52. -#define G_l count
  53. -#define G_h dst
  54. -#define H_l src
  55. -#define H_h srcend
  56. -#define tmp1 x14
  57. -
  58. -/* This implementation of memcpy uses unaligned accesses and branchless
  59. +#define dstin x0
  60. +#define src x1
  61. +#define count x2
  62. +#define dst x3
  63. +#define srcend x4
  64. +#define dstend x5
  65. +#define A_l x6
  66. +#define A_lw w6
  67. +#define A_h x7
  68. +#define B_l x8
  69. +#define B_lw w8
  70. +#define B_h x9
  71. +#define C_lw w10
  72. +#define tmp1 x14
  73. +
  74. +#define A_q q0
  75. +#define B_q q1
  76. +#define C_q q2
  77. +#define D_q q3
  78. +#define E_q q4
  79. +#define F_q q5
  80. +#define G_q q6
  81. +#define H_q q7
  82. +
  83. +#define L(l) .L ## l
  84. +
  85. +/* This implementation handles overlaps and supports both memcpy and memmove
  86. + from a single entry point. It uses unaligned accesses and branchless
  87. sequences to keep the code small, simple and improve performance.
  88. Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  89. @@ -46,141 +46,160 @@
  90. check is negligible since it is only required for large copies.
  91. Large copies use a software pipelined loop processing 64 bytes per iteration.
  92. - The destination pointer is 16-byte aligned to minimize unaligned accesses.
  93. + The source pointer is 16-byte aligned to minimize unaligned accesses.
  94. The loop tail is handled by always copying 64 bytes from the end.
  95. */
  96. .global memcpy
  97. .type memcpy,%function
  98. -memcpy:
  99. - add srcend, src, count
  100. - add dstend, dstin, count
  101. - cmp count, 128
  102. - b.hi .Lcopy_long
  103. - cmp count, 32
  104. - b.hi .Lcopy32_128
  105. +.global memmove
  106. +.type memmove,%function
  107. +memmove:
  108. +memcpy: add srcend, src, count
  109. + add dstend, dstin, count
  110. + cmp count, 128
  111. + b.hi L(copy_long)
  112. + cmp count, 32
  113. + b.hi L(copy32_128)
  114. /* Small copies: 0..32 bytes. */
  115. - cmp count, 16
  116. - b.lo .Lcopy16
  117. - ldp A_l, A_h, [src]
  118. - ldp D_l, D_h, [srcend, -16]
  119. - stp A_l, A_h, [dstin]
  120. - stp D_l, D_h, [dstend, -16]
  121. + cmp count, 16
  122. + b.lo L(copy16)
  123. + ldr A_q, [src]
  124. + ldr B_q, [srcend, -16]
  125. + str A_q, [dstin]
  126. + str B_q, [dstend, -16]
  127. ret
  128. /* Copy 8-15 bytes. */
  129. -.Lcopy16:
  130. - tbz count, 3, .Lcopy8
  131. - ldr A_l, [src]
  132. - ldr A_h, [srcend, -8]
  133. - str A_l, [dstin]
  134. - str A_h, [dstend, -8]
  135. +L(copy16):
  136. + tbz count, 3, L(copy8)
  137. + ldr A_l, [src]
  138. + ldr A_h, [srcend, -8]
  139. + str A_l, [dstin]
  140. + str A_h, [dstend, -8]
  141. ret
  142. .p2align 3
  143. /* Copy 4-7 bytes. */
  144. -.Lcopy8:
  145. - tbz count, 2, .Lcopy4
  146. - ldr A_lw, [src]
  147. - ldr B_lw, [srcend, -4]
  148. - str A_lw, [dstin]
  149. - str B_lw, [dstend, -4]
  150. +L(copy8):
  151. + tbz count, 2, L(copy4)
  152. + ldr A_lw, [src]
  153. + ldr B_lw, [srcend, -4]
  154. + str A_lw, [dstin]
  155. + str B_lw, [dstend, -4]
  156. ret
  157. /* Copy 0..3 bytes using a branchless sequence. */
  158. -.Lcopy4:
  159. - cbz count, .Lcopy0
  160. - lsr tmp1, count, 1
  161. - ldrb A_lw, [src]
  162. - ldrb C_lw, [srcend, -1]
  163. - ldrb B_lw, [src, tmp1]
  164. - strb A_lw, [dstin]
  165. - strb B_lw, [dstin, tmp1]
  166. - strb C_lw, [dstend, -1]
  167. -.Lcopy0:
  168. +L(copy4):
  169. + cbz count, L(copy0)
  170. + lsr tmp1, count, 1
  171. + ldrb A_lw, [src]
  172. + ldrb C_lw, [srcend, -1]
  173. + ldrb B_lw, [src, tmp1]
  174. + strb A_lw, [dstin]
  175. + strb B_lw, [dstin, tmp1]
  176. + strb C_lw, [dstend, -1]
  177. +L(copy0):
  178. ret
  179. .p2align 4
  180. /* Medium copies: 33..128 bytes. */
  181. -.Lcopy32_128:
  182. - ldp A_l, A_h, [src]
  183. - ldp B_l, B_h, [src, 16]
  184. - ldp C_l, C_h, [srcend, -32]
  185. - ldp D_l, D_h, [srcend, -16]
  186. - cmp count, 64
  187. - b.hi .Lcopy128
  188. - stp A_l, A_h, [dstin]
  189. - stp B_l, B_h, [dstin, 16]
  190. - stp C_l, C_h, [dstend, -32]
  191. - stp D_l, D_h, [dstend, -16]
  192. +L(copy32_128):
  193. + ldp A_q, B_q, [src]
  194. + ldp C_q, D_q, [srcend, -32]
  195. + cmp count, 64
  196. + b.hi L(copy128)
  197. + stp A_q, B_q, [dstin]
  198. + stp C_q, D_q, [dstend, -32]
  199. ret
  200. .p2align 4
  201. /* Copy 65..128 bytes. */
  202. -.Lcopy128:
  203. - ldp E_l, E_h, [src, 32]
  204. - ldp F_l, F_h, [src, 48]
  205. - cmp count, 96
  206. - b.ls .Lcopy96
  207. - ldp G_l, G_h, [srcend, -64]
  208. - ldp H_l, H_h, [srcend, -48]
  209. - stp G_l, G_h, [dstend, -64]
  210. - stp H_l, H_h, [dstend, -48]
  211. -.Lcopy96:
  212. - stp A_l, A_h, [dstin]
  213. - stp B_l, B_h, [dstin, 16]
  214. - stp E_l, E_h, [dstin, 32]
  215. - stp F_l, F_h, [dstin, 48]
  216. - stp C_l, C_h, [dstend, -32]
  217. - stp D_l, D_h, [dstend, -16]
  218. +L(copy128):
  219. + ldp E_q, F_q, [src, 32]
  220. + cmp count, 96
  221. + b.ls L(copy96)
  222. + ldp G_q, H_q, [srcend, -64]
  223. + stp G_q, H_q, [dstend, -64]
  224. +L(copy96):
  225. + stp A_q, B_q, [dstin]
  226. + stp E_q, F_q, [dstin, 32]
  227. + stp C_q, D_q, [dstend, -32]
  228. ret
  229. - .p2align 4
  230. /* Copy more than 128 bytes. */
  231. -.Lcopy_long:
  232. -
  233. - /* Copy 16 bytes and then align dst to 16-byte alignment. */
  234. -
  235. - ldp D_l, D_h, [src]
  236. - and tmp1, dstin, 15
  237. - bic dst, dstin, 15
  238. - sub src, src, tmp1
  239. - add count, count, tmp1 /* Count is now 16 too large. */
  240. - ldp A_l, A_h, [src, 16]
  241. - stp D_l, D_h, [dstin]
  242. - ldp B_l, B_h, [src, 32]
  243. - ldp C_l, C_h, [src, 48]
  244. - ldp D_l, D_h, [src, 64]!
  245. - subs count, count, 128 + 16 /* Test and readjust count. */
  246. - b.ls .Lcopy64_from_end
  247. -
  248. -.Lloop64:
  249. - stp A_l, A_h, [dst, 16]
  250. - ldp A_l, A_h, [src, 16]
  251. - stp B_l, B_h, [dst, 32]
  252. - ldp B_l, B_h, [src, 32]
  253. - stp C_l, C_h, [dst, 48]
  254. - ldp C_l, C_h, [src, 48]
  255. - stp D_l, D_h, [dst, 64]!
  256. - ldp D_l, D_h, [src, 64]!
  257. - subs count, count, 64
  258. - b.hi .Lloop64
  259. +L(copy_long):
  260. + /* Use backwards copy if there is an overlap. */
  261. + sub tmp1, dstin, src
  262. + cmp tmp1, count
  263. + b.lo L(copy_long_backwards)
  264. +
  265. + /* Copy 16 bytes and then align src to 16-byte alignment. */
  266. + ldr D_q, [src]
  267. + and tmp1, src, 15
  268. + bic src, src, 15
  269. + sub dst, dstin, tmp1
  270. + add count, count, tmp1 /* Count is now 16 too large. */
  271. + ldp A_q, B_q, [src, 16]
  272. + str D_q, [dstin]
  273. + ldp C_q, D_q, [src, 48]
  274. + subs count, count, 128 + 16 /* Test and readjust count. */
  275. + b.ls L(copy64_from_end)
  276. +L(loop64):
  277. + stp A_q, B_q, [dst, 16]
  278. + ldp A_q, B_q, [src, 80]
  279. + stp C_q, D_q, [dst, 48]
  280. + ldp C_q, D_q, [src, 112]
  281. + add src, src, 64
  282. + add dst, dst, 64
  283. + subs count, count, 64
  284. + b.hi L(loop64)
  285. /* Write the last iteration and copy 64 bytes from the end. */
  286. -.Lcopy64_from_end:
  287. - ldp E_l, E_h, [srcend, -64]
  288. - stp A_l, A_h, [dst, 16]
  289. - ldp A_l, A_h, [srcend, -48]
  290. - stp B_l, B_h, [dst, 32]
  291. - ldp B_l, B_h, [srcend, -32]
  292. - stp C_l, C_h, [dst, 48]
  293. - ldp C_l, C_h, [srcend, -16]
  294. - stp D_l, D_h, [dst, 64]
  295. - stp E_l, E_h, [dstend, -64]
  296. - stp A_l, A_h, [dstend, -48]
  297. - stp B_l, B_h, [dstend, -32]
  298. - stp C_l, C_h, [dstend, -16]
  299. +L(copy64_from_end):
  300. + ldp E_q, F_q, [srcend, -64]
  301. + stp A_q, B_q, [dst, 16]
  302. + ldp A_q, B_q, [srcend, -32]
  303. + stp C_q, D_q, [dst, 48]
  304. + stp E_q, F_q, [dstend, -64]
  305. + stp A_q, B_q, [dstend, -32]
  306. + ret
  307. +
  308. + /* Large backwards copy for overlapping copies.
  309. + Copy 16 bytes and then align srcend to 16-byte alignment. */
  310. +L(copy_long_backwards):
  311. + cbz tmp1, L(copy0)
  312. + ldr D_q, [srcend, -16]
  313. + and tmp1, srcend, 15
  314. + bic srcend, srcend, 15
  315. + sub count, count, tmp1
  316. + ldp A_q, B_q, [srcend, -32]
  317. + str D_q, [dstend, -16]
  318. + ldp C_q, D_q, [srcend, -64]
  319. + sub dstend, dstend, tmp1
  320. + subs count, count, 128
  321. + b.ls L(copy64_from_start)
  322. +
  323. +L(loop64_backwards):
  324. + str B_q, [dstend, -16]
  325. + str A_q, [dstend, -32]
  326. + ldp A_q, B_q, [srcend, -96]
  327. + str D_q, [dstend, -48]
  328. + str C_q, [dstend, -64]!
  329. + ldp C_q, D_q, [srcend, -128]
  330. + sub srcend, srcend, 64
  331. + subs count, count, 64
  332. + b.hi L(loop64_backwards)
  333. +
  334. + /* Write the last iteration and copy 64 bytes from the start. */
  335. +L(copy64_from_start):
  336. + ldp E_q, F_q, [src, 32]
  337. + stp A_q, B_q, [dstend, -32]
  338. + ldp A_q, B_q, [src]
  339. + stp C_q, D_q, [dstend, -64]
  340. + stp E_q, F_q, [dstin, 32]
  341. + stp A_q, B_q, [dstin]
  342. ret
  343. .size memcpy,.-memcpy
  344. diff --git a/src/string/aarch64/memmove.S b/src/string/aarch64/memmove.S
  345. new file mode 100644
  346. index 00000000..90fd94a7
  347. --- /dev/null
  348. +++ b/src/string/aarch64/memmove.S
  349. @@ -0,0 +1 @@
  350. +// implemented as memcpy
  351. diff --git a/src/string/x86_64/memcpy.S b/src/string/x86_64/memcpy.S
  352. new file mode 100644
  353. index 00000000..c972b677
  354. --- /dev/null
  355. +++ b/src/string/x86_64/memcpy.S
  356. @@ -0,0 +1,487 @@
  357. +/*
  358. + * Copyright (c) Meta Platforms, Inc. and affiliates.
  359. + *
  360. + * Licensed under the Apache License, Version 2.0 (the "License");
  361. + * you may not use this file except in compliance with the License.
  362. + * You may obtain a copy of the License at
  363. + *
  364. + * http://www.apache.org/licenses/LICENSE-2.0
  365. + *
  366. + * Unless required by applicable law or agreed to in writing, software
  367. + * distributed under the License is distributed on an "AS IS" BASIS,
  368. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  369. + * See the License for the specific language governing permissions and
  370. + * limitations under the License.
  371. + */
  372. +
  373. +/*
  374. + * __folly_memcpy: An optimized memcpy implementation that uses prefetch and
  375. + * AVX2 instructions.
  376. + *
  377. + * This implementation of memcpy acts as a memmove: while overlapping copies
  378. + * are undefined in memcpy, in some implementations they're the same function and
  379. + * legacy programs rely on this behavior.
  380. + *
  381. + * This implementation uses prefetch to avoid dtlb misses. This can
  382. + * substantially reduce dtlb store misses in cases where the destination
  383. + * location is absent from L1 cache and where the copy size is small enough
  384. + * that the hardware prefetcher doesn't have a large impact.
  385. + *
  386. + * The number of branches is limited by the use of overlapping loads & stores.
  387. + * This helps with copies where the source and destination cache lines are already
  388. + * present in L1 because there are fewer instructions to execute and fewer
  389. + * branches to potentially mispredict.
  390. + * e.g. to copy the last 4 <= n <= 7 bytes: copy the first & last 4 bytes (overlapped):
  391. + * movl (%rsi), %r8d
  392. + * movl -4(%rsi,%rdx), %r9d
  393. + * movl %r8d, (%rdi)
  394. + * movl %r9d, -4(%rdi,%rdx)
  395. + *
  396. + *
  397. + * For sizes up to 256 all source data is first read into registers and then written:
  398. + * - n <= 16: overlapping movs
  399. + * - n <= 32: overlapping unaligned 16-byte SSE XMM load/stores
  400. + * - n <= 256: overlapping unaligned 32-byte AVX YMM load/stores
  401. + *
  402. + * Large copies (> 256 bytes) use unaligned loads + aligned stores.
  403. + * This is observed to always be faster than rep movsb, so the rep movsb
  404. + * instruction is not used.
  405. + * - The head & tail may be unaligned => they're always written using unaligned stores.
  406. + *
  407. + * If the copy size is humongous (> 32 KiB) and the source and destination are both
  408. + * aligned, this memcpy will use non-temporal operations (AVX2). This can have
  409. + * a substantial speedup for copies where data is absent from L1, but it
  410. + * is significantly slower if the source and destination data were already
  411. + * in L1. The use of non-temporal operations also has the effect that after
  412. + * the copy is complete, the data will be moved out of L1, even if the data was
  413. + * present before the copy started.
  414. + *
  415. + * For n > 256 and overlapping src & dst buffers (memmove):
  416. + * - use unaligned loads + aligned stores, but not non-temporal stores
  417. + * - for dst < src forward copy in 128 byte batches:
  418. + * - unaligned load the first 32 bytes & last 4 x 32 bytes
  419. + * - forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
  420. + * - unaligned store the first 32 bytes & last 4 x 32 bytes
  421. + * - for dst > src backward copy in 128 byte batches:
  422. + * - unaligned load the first 4 x 32 bytes & last 32 bytes
  423. + * - backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
  424. + * - unaligned store the first 4 x 32 bytes & last 32 bytes
  425. + *
  426. + * @author Logan Evans <lpe@fb.com>
  427. + */
  428. +
  429. +#if defined(__AVX2__)
  430. +
  431. +#if defined(PREFETCH)
  432. +#undef PREFETCH
  433. +#endif
  434. +#if __PRFCHW__ // Broadwell+
  435. +#define PREFETCH prefetchw
  436. +#else
  437. +#define PREFETCH prefetcht0
  438. +#endif
  439. +
  440. +// This threshold is half of L1 cache on a Skylake machine, which means that
  441. +// potentially all of L1 will be populated by this copy once it is executed
  442. +// (dst and src are cached for temporal copies).
  443. +#define NON_TEMPORAL_STORE_THRESHOLD $32768
  444. +
  445. + .file "memcpy.S"
  446. + .section .text,"ax"
  447. +
  448. + .type __folly_memcpy_short, @function
  449. +__folly_memcpy_short:
  450. + .cfi_startproc
  451. +
  452. +.L_GE1_LE7:
  453. + cmp $1, %rdx
  454. + je .L_EQ1
  455. +
  456. + cmp $4, %rdx
  457. + jae .L_GE4_LE7
  458. +
  459. +.L_GE2_LE3:
  460. + movw (%rsi), %r8w
  461. + movw -2(%rsi,%rdx), %r9w
  462. + movw %r8w, (%rdi)
  463. + movw %r9w, -2(%rdi,%rdx)
  464. + ret
  465. +
  466. + .align 2
  467. +.L_EQ1:
  468. + movb (%rsi), %r8b
  469. + movb %r8b, (%rdi)
  470. + ret
  471. +
  472. + // Aligning the target of a jump to an even address has a measurable
  473. + // speedup in microbenchmarks.
  474. + .align 2
  475. +.L_GE4_LE7:
  476. + movl (%rsi), %r8d
  477. + movl -4(%rsi,%rdx), %r9d
  478. + movl %r8d, (%rdi)
  479. + movl %r9d, -4(%rdi,%rdx)
  480. + ret
  481. +
  482. + .cfi_endproc
  483. + .size __folly_memcpy_short, .-__folly_memcpy_short
  484. +
  485. +// memcpy is an alternative entrypoint into the function named __folly_memcpy.
  486. +// The compiler is able to call memcpy since the name is global while
  487. +// stacktraces will show __folly_memcpy since that is the name of the function.
  488. +// This is intended to aid in debugging by making it obvious which version of
  489. +// memcpy is being used.
  490. + .align 64
  491. + .hidden __folly_memcpy
  492. + .type __folly_memcpy, @function
  493. +
  494. +__folly_memcpy:
  495. + .cfi_startproc
  496. +
  497. + mov %rdi, %rax # return: $rdi
  498. +
  499. + test %rdx, %rdx
  500. + je .L_EQ0
  501. +
  502. + PREFETCH (%rdi)
  503. + PREFETCH -1(%rdi,%rdx)
  504. +
  505. + cmp $8, %rdx
  506. + jb .L_GE1_LE7
  507. +
  508. +.L_GE8:
  509. + cmp $32, %rdx
  510. + ja .L_GE33
  511. +
  512. +.L_GE8_LE32:
  513. + cmp $16, %rdx
  514. + ja .L_GE17_LE32
  515. +
  516. +.L_GE8_LE16:
  517. + mov (%rsi), %r8
  518. + mov -8(%rsi,%rdx), %r9
  519. + mov %r8, (%rdi)
  520. + mov %r9, -8(%rdi,%rdx)
  521. +.L_EQ0:
  522. + ret
  523. +
  524. + .align 2
  525. +.L_GE17_LE32:
  526. + movdqu (%rsi), %xmm0
  527. + movdqu -16(%rsi,%rdx), %xmm1
  528. + movdqu %xmm0, (%rdi)
  529. + movdqu %xmm1, -16(%rdi,%rdx)
  530. + ret
  531. +
  532. + .align 2
  533. +.L_GE193_LE256:
  534. + vmovdqu %ymm3, 96(%rdi)
  535. + vmovdqu %ymm4, -128(%rdi,%rdx)
  536. +
  537. +.L_GE129_LE192:
  538. + vmovdqu %ymm2, 64(%rdi)
  539. + vmovdqu %ymm5, -96(%rdi,%rdx)
  540. +
  541. +.L_GE65_LE128:
  542. + vmovdqu %ymm1, 32(%rdi)
  543. + vmovdqu %ymm6, -64(%rdi,%rdx)
  544. +
  545. +.L_GE33_LE64:
  546. + vmovdqu %ymm0, (%rdi)
  547. + vmovdqu %ymm7, -32(%rdi,%rdx)
  548. +
  549. + vzeroupper
  550. + ret
  551. +
  552. + .align 2
  553. +.L_GE33:
  554. + vmovdqu (%rsi), %ymm0
  555. + vmovdqu -32(%rsi,%rdx), %ymm7
  556. +
  557. + cmp $64, %rdx
  558. + jbe .L_GE33_LE64
  559. +
  560. + PREFETCH 64(%rdi)
  561. +
  562. + vmovdqu 32(%rsi), %ymm1
  563. + vmovdqu -64(%rsi,%rdx), %ymm6
  564. +
  565. + cmp $128, %rdx
  566. + jbe .L_GE65_LE128
  567. +
  568. + PREFETCH 128(%rdi)
  569. +
  570. + vmovdqu 64(%rsi), %ymm2
  571. + vmovdqu -96(%rsi,%rdx), %ymm5
  572. +
  573. + cmp $192, %rdx
  574. + jbe .L_GE129_LE192
  575. +
  576. + PREFETCH 192(%rdi)
  577. +
  578. + vmovdqu 96(%rsi), %ymm3
  579. + vmovdqu -128(%rsi,%rdx), %ymm4
  580. +
  581. + cmp $256, %rdx
  582. + jbe .L_GE193_LE256
  583. +
  584. +.L_GE257:
  585. + PREFETCH 256(%rdi)
  586. +
  587. + // Check if there is an overlap. If there is an overlap then the caller
  588. + // has a bug since this is undefined behavior. However, for legacy
  589. + // reasons this behavior is expected by some callers.
  590. + //
  591. + // All copies through 256 bytes will operate as a memmove since for
  592. + // those sizes all reads are performed before any writes.
  593. + //
  594. + // This check uses the idea that there is an overlap if
  595. + // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)),
  596. + // or equivalently, there is no overlap if
  597. + // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi).
  598. + //
  599. + // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many
  600. + // bytes remain to be copied.
  601. +
  602. + // (%rsi + %rdx <= %rdi) => no overlap
  603. + lea (%rsi,%rdx), %r9
  604. + cmp %rdi, %r9
  605. + jbe .L_NO_OVERLAP
  606. +
  607. + // (%rdi + %rdx <= %rsi) => no overlap
  608. + lea (%rdi,%rdx), %r8
  609. + cmp %rsi, %r8
  610. + // If no info is available in branch predictor's cache, Intel CPUs assume
  611. + // forward jumps are not taken. Use a forward jump as overlapping buffers
  612. + // are unlikely.
  613. + ja .L_OVERLAP
  614. +
  615. + .align 2
  616. +.L_NO_OVERLAP:
  617. + vmovdqu %ymm0, (%rdi)
  618. + vmovdqu %ymm1, 32(%rdi)
  619. + vmovdqu %ymm2, 64(%rdi)
  620. + vmovdqu %ymm3, 96(%rdi)
  621. +
  622. + // Align %rdi to a 32 byte boundary.
  623. + // %rcx = 128 - 31 & %rdi
  624. + mov $128, %rcx
  625. + and $31, %rdi
  626. + sub %rdi, %rcx
  627. +
  628. + lea (%rsi,%rcx), %rsi
  629. + lea (%rax,%rcx), %rdi
  630. + sub %rcx, %rdx
  631. +
  632. + // %r8 is the end condition for the loop.
  633. + lea -128(%rsi,%rdx), %r8
  634. +
  635. + cmp NON_TEMPORAL_STORE_THRESHOLD, %rdx
  636. + jae .L_NON_TEMPORAL_LOOP
  637. +
  638. + .align 2
  639. +.L_ALIGNED_DST_LOOP:
  640. + PREFETCH 128(%rdi)
  641. + PREFETCH 192(%rdi)
  642. +
  643. + vmovdqu (%rsi), %ymm0
  644. + vmovdqu 32(%rsi), %ymm1
  645. + vmovdqu 64(%rsi), %ymm2
  646. + vmovdqu 96(%rsi), %ymm3
  647. + add $128, %rsi
  648. +
  649. + vmovdqa %ymm0, (%rdi)
  650. + vmovdqa %ymm1, 32(%rdi)
  651. + vmovdqa %ymm2, 64(%rdi)
  652. + vmovdqa %ymm3, 96(%rdi)
  653. + add $128, %rdi
  654. +
  655. + cmp %r8, %rsi
  656. + jb .L_ALIGNED_DST_LOOP
  657. +
  658. +.L_ALIGNED_DST_LOOP_END:
  659. + sub %rsi, %r9
  660. + mov %r9, %rdx
  661. +
  662. + vmovdqu %ymm4, -128(%rdi,%rdx)
  663. + vmovdqu %ymm5, -96(%rdi,%rdx)
  664. + vmovdqu %ymm6, -64(%rdi,%rdx)
  665. + vmovdqu %ymm7, -32(%rdi,%rdx)
  666. +
  667. + vzeroupper
  668. + ret
  669. +
  670. + .align 2
  671. +.L_NON_TEMPORAL_LOOP:
  672. + testb $31, %sil
  673. + jne .L_ALIGNED_DST_LOOP
  674. + // This is prefetching the source data unlike ALIGNED_DST_LOOP which
  675. + // prefetches the destination data. This choice is again informed by
  676. + // benchmarks. With a non-temporal store the entirety of the cache line
  677. + // is being written so the previous data can be discarded without being
  678. + // fetched.
  679. + prefetchnta 128(%rsi)
  680. + prefetchnta 196(%rsi)
  681. +
  682. + vmovntdqa (%rsi), %ymm0
  683. + vmovntdqa 32(%rsi), %ymm1
  684. + vmovntdqa 64(%rsi), %ymm2
  685. + vmovntdqa 96(%rsi), %ymm3
  686. + add $128, %rsi
  687. +
  688. + vmovntdq %ymm0, (%rdi)
  689. + vmovntdq %ymm1, 32(%rdi)
  690. + vmovntdq %ymm2, 64(%rdi)
  691. + vmovntdq %ymm3, 96(%rdi)
  692. + add $128, %rdi
  693. +
  694. + cmp %r8, %rsi
  695. + jb .L_NON_TEMPORAL_LOOP
  696. +
  697. + sfence
  698. + jmp .L_ALIGNED_DST_LOOP_END
  699. +
  700. +
  701. +.L_OVERLAP:
  702. + .align 2
  703. + cmp %rdi, %rsi
  704. + jb .L_OVERLAP_BWD // %rsi < %rdi => backward-copy
  705. + je .L_RET // %rsi == %rdi => return, nothing to copy
  706. +
  707. + // Source & destination buffers overlap. Forward copy.
  708. +
  709. + vmovdqu (%rsi), %ymm8
  710. +
  711. + // Align %rdi to a 32 byte boundary.
  712. + // %rcx = 32 - 31 & %rdi
  713. + mov $32, %rcx
  714. + and $31, %rdi
  715. + sub %rdi, %rcx
  716. +
  717. + lea (%rsi,%rcx), %rsi
  718. + lea (%rax,%rcx), %rdi
  719. + sub %rcx, %rdx
  720. +
  721. + // %r8 is the end condition for the loop.
  722. + lea -128(%rsi,%rdx), %r8
  723. +
  724. +
  725. +.L_OVERLAP_FWD_ALIGNED_DST_LOOP:
  726. + PREFETCH 128(%rdi)
  727. + PREFETCH 192(%rdi)
  728. +
  729. + vmovdqu (%rsi), %ymm0
  730. + vmovdqu 32(%rsi), %ymm1
  731. + vmovdqu 64(%rsi), %ymm2
  732. + vmovdqu 96(%rsi), %ymm3
  733. + add $128, %rsi
  734. +
  735. + vmovdqa %ymm0, (%rdi)
  736. + vmovdqa %ymm1, 32(%rdi)
  737. + vmovdqa %ymm2, 64(%rdi)
  738. + vmovdqa %ymm3, 96(%rdi)
  739. + add $128, %rdi
  740. +
  741. + cmp %r8, %rsi
  742. + jb .L_OVERLAP_FWD_ALIGNED_DST_LOOP
  743. +
  744. + sub %rsi, %r9
  745. + mov %r9, %rdx
  746. +
  747. + vmovdqu %ymm4, -128(%rdi,%rdx)
  748. + vmovdqu %ymm5, -96(%rdi,%rdx)
  749. + vmovdqu %ymm6, -64(%rdi,%rdx)
  750. + vmovdqu %ymm7, -32(%rdi,%rdx)
  751. + vmovdqu %ymm8, (%rax) // %rax == the original (unaligned) %rdi
  752. +
  753. + vzeroupper
  754. +
  755. +.L_RET:
  756. + ret
  757. +
  758. +.L_OVERLAP_BWD:
  759. + # Save last 32 bytes.
  760. + vmovdqu -32(%rsi, %rdx), %ymm8
  761. + lea -32(%rdi, %rdx), %r9
  762. +
  763. +
  764. + // %r8 is the end condition for the loop.
  765. + lea 128(%rsi), %r8
  766. +
  767. + // Align %rdi+%rdx (destination end) to a 32 byte boundary.
  768. + // %rcx = (%rdi + %rdx - 32) & 31
  769. + mov %r9, %rcx
  770. + and $31, %rcx
  771. + // Set %rsi & %rdi to the end of the 32 byte aligned range.
  772. + sub %rcx, %rdx
  773. + add %rdx, %rsi
  774. + add %rdx, %rdi
  775. +
  776. +
  777. +.L_OVERLAP_BWD_ALIGNED_DST_LOOP:
  778. + PREFETCH -128(%rdi)
  779. + PREFETCH -192(%rdi)
  780. +
  781. + vmovdqu -32(%rsi), %ymm4
  782. + vmovdqu -64(%rsi), %ymm5
  783. + vmovdqu -96(%rsi), %ymm6
  784. + vmovdqu -128(%rsi), %ymm7
  785. + sub $128, %rsi
  786. +
  787. + vmovdqa %ymm4, -32(%rdi)
  788. + vmovdqa %ymm5, -64(%rdi)
  789. + vmovdqa %ymm6, -96(%rdi)
  790. + vmovdqa %ymm7, -128(%rdi)
  791. + sub $128, %rdi
  792. +
  793. + cmp %r8, %rsi
  794. + ja .L_OVERLAP_BWD_ALIGNED_DST_LOOP
  795. +
  796. + vmovdqu %ymm0, (%rax) // %rax == the original unaligned %rdi
  797. + vmovdqu %ymm1, 32(%rax)
  798. + vmovdqu %ymm2, 64(%rax)
  799. + vmovdqu %ymm3, 96(%rax)
  800. + vmovdqu %ymm8, (%r9)
  801. +
  802. + vzeroupper
  803. + ret
  804. +
  805. + .cfi_endproc
  806. + .size __folly_memcpy, .-__folly_memcpy
  807. +
  808. + .global memcpy
  809. + memcpy = __folly_memcpy
  810. +
  811. + .global memmove
  812. + memmove = __folly_memcpy
  813. +
  814. +#else
  815. +// original musl implementation
  816. +
  817. +.global memcpy
  818. +.global __memcpy_fwd
  819. +.hidden __memcpy_fwd
  820. +.type memcpy,@function
  821. +memcpy:
  822. +__memcpy_fwd:
  823. + mov %rdi,%rax
  824. + cmp $8,%rdx
  825. + jc 1f
  826. + test $7,%edi
  827. + jz 1f
  828. +2: movsb
  829. + dec %rdx
  830. + test $7,%edi
  831. + jnz 2b
  832. +1: mov %rdx,%rcx
  833. + shr $3,%rcx
  834. + rep
  835. + movsq
  836. + and $7,%edx
  837. + jz 1f
  838. +2: movsb
  839. + dec %edx
  840. + jnz 2b
  841. +1: ret
  842. +
  843. +#endif
  844. \ No newline at end of file
  845. diff --git a/src/string/x86_64/memcpy.s b/src/string/x86_64/memcpy.s
  846. deleted file mode 100644
  847. index 3d960efa..00000000
  848. --- a/src/string/x86_64/memcpy.s
  849. +++ /dev/null
  850. @@ -1,25 +0,0 @@
  851. -.global memcpy
  852. -.global __memcpy_fwd
  853. -.hidden __memcpy_fwd
  854. -.type memcpy,@function
  855. -memcpy:
  856. -__memcpy_fwd:
  857. - mov %rdi,%rax
  858. - cmp $8,%rdx
  859. - jc 1f
  860. - test $7,%edi
  861. - jz 1f
  862. -2: movsb
  863. - dec %rdx
  864. - test $7,%edi
  865. - jnz 2b
  866. -1: mov %rdx,%rcx
  867. - shr $3,%rcx
  868. - rep
  869. - movsq
  870. - and $7,%edx
  871. - jz 1f
  872. -2: movsb
  873. - dec %edx
  874. - jnz 2b
  875. -1: ret
  876. diff --git a/src/string/x86_64/memmove.s b/src/string/x86_64/memmove.S
  877. similarity index 80%
  878. rename from src/string/x86_64/memmove.s
  879. rename to src/string/x86_64/memmove.S
  880. index 172c0252..be31d75f 100644
  881. --- a/src/string/x86_64/memmove.s
  882. +++ b/src/string/x86_64/memmove.S
  883. @@ -1,3 +1,7 @@
  884. +
  885. +#if defined(__AVX2__)
  886. +// implemented as memcpy
  887. +#else
  888. .global memmove
  889. .type memmove,@function
  890. memmove:
  891. @@ -14,3 +18,4 @@ memmove:
  892. cld
  893. lea 1(%rdi),%rax
  894. ret
  895. +#endif
  896. \ No newline at end of file
  897. diff --git a/src/string/x86_64/memset.S b/src/string/x86_64/memset.S
  898. new file mode 100644
  899. index 00000000..a42ac3fd
  900. --- /dev/null
  901. +++ b/src/string/x86_64/memset.S
  902. @@ -0,0 +1,316 @@
  903. +/*
  904. + * Copyright (c) Facebook, Inc. and its affiliates.
  905. + *
  906. + * Licensed under the Apache License, Version 2.0 (the "License");
  907. + * you may not use this file except in compliance with the License.
  908. + * You may obtain a copy of the License at
  909. + *
  910. + * http://www.apache.org/licenses/LICENSE-2.0
  911. + *
  912. + * Unless required by applicable law or agreed to in writing, software
  913. + * distributed under the License is distributed on an "AS IS" BASIS,
  914. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  915. + * See the License for the specific language governing permissions and
  916. + * limitations under the License.
  917. + */
  918. +
  919. +#if defined(__AVX2__)
  920. +
  921. +#define LABEL(x) .L##x
  922. +
  923. +.text
  924. +.p2align 5, 0x90
  925. +.hidden __folly_memset
  926. +.type __folly_memset, @function
  927. +__folly_memset:
  928. + .cfi_startproc
  929. +
  930. +// RDI is the buffer
  931. +// RSI is the value
  932. +// RDX is length
  933. + vmovd %esi, %xmm0
  934. + vpbroadcastb %xmm0, %ymm0
  935. + mov %rdi, %rax
  936. + cmp $0x40, %rdx
  937. + jae LABEL(above_64)
  938. +
  939. +LABEL(below_64):
  940. + cmp $0x20, %rdx
  941. + jb LABEL(below_32)
  942. + vmovdqu %ymm0, (%rdi)
  943. + vmovdqu %ymm0, -0x20(%rdi,%rdx)
  944. + vzeroupper
  945. + retq
  946. +
  947. +.align 32
  948. +LABEL(below_32):
  949. + cmp $0x10, %rdx
  950. + jae LABEL(in_16_to_32)
  951. +
  952. +LABEL(below_16):
  953. + cmp $0x4, %rdx
  954. + jbe LABEL(below_4)
  955. +
  956. +LABEL(in_4_to_16):
  957. + // Scalar stores from this point.
  958. + vmovq %xmm0, %rsi
  959. + cmp $0x7, %rdx
  960. + jbe LABEL(in_4_to_8)
  961. + // Two 8-wide stores, up to 16 bytes.
  962. + mov %rsi, -0x8(%rdi, %rdx)
  963. + mov %rsi, (%rdi)
  964. + vzeroupper
  965. + retq
  966. +
  967. +.align 32
  968. +LABEL(below_4):
  969. + vmovq %xmm0, %rsi
  970. + vzeroupper
  971. + cmp $0x1, %rdx
  972. + jbe LABEL(none_or_one)
  973. + mov %si, (%rdi)
  974. + mov %si, -0x2(%rdi, %rdx)
  975. +
  976. +LABEL(exit):
  977. + retq
  978. +
  979. +.align 16
  980. +LABEL(in_4_to_8):
  981. + // Two 4-wide stores, up to 8 bytes.
  982. + mov %esi, -0x4(%rdi,%rdx)
  983. + mov %esi, (%rdi)
  984. + vzeroupper
  985. + retq
  986. +
  987. +.align 32
  988. +LABEL(in_16_to_32):
  989. + vmovups %xmm0, (%rdi)
  990. + vmovups %xmm0, -0x10(%rdi,%rdx)
  991. + vzeroupper
  992. + retq
  993. +
  994. +LABEL(above_64):
  995. + cmp $0xb0, %rdx
  996. + ja LABEL(above_192)
  997. + cmp $0x80, %rdx
  998. + jbe LABEL(in_64_to_128)
  999. + // Do some work with unaligned 32-byte stores.
  1000. + // last_word -> rsi
  1001. + lea -0x20(%rdi,%rdx), %rsi
  1002. + // rcx -> fill pointer.
  1003. + // We have at least 128 bytes to store.
  1004. + vmovdqu %ymm0, (%rdi)
  1005. + vmovdqu %ymm0, 0x20(%rdi)
  1006. + vmovdqu %ymm0, 0x40(%rdi)
  1007. + add $0x60, %rdi
  1008. +
  1009. +.align 32
  1010. +LABEL(fill_32):
  1011. + vmovdqu %ymm0, (%rdi)
  1012. + add $0x20, %rdi
  1013. + cmp %rdi, %rsi
  1014. + ja LABEL(fill_32)
  1015. + // Stamp the last unaligned store.
  1016. + vmovdqu %ymm0, (%rsi)
  1017. + vzeroupper
  1018. + retq
  1019. +
  1020. +.align 32
  1021. +LABEL(in_64_to_128):
  1022. + // Last_word -> rsi
  1023. + vmovdqu %ymm0, (%rdi)
  1024. + vmovdqu %ymm0, 0x20(%rdi)
  1025. + vmovdqu %ymm0, -0x40(%rdi,%rdx)
  1026. + vmovdqu %ymm0, -0x20(%rdi,%rdx)
  1027. + vzeroupper
  1028. + retq
  1029. +
  1030. +.align 32
  1031. +LABEL(above_192):
  1032. +// rdi is the buffer address
  1033. +// rsi is the value
  1034. +// rdx is length
  1035. + cmp $0x1000, %rdx
  1036. + jae LABEL(large_stosq)
  1037. + // Store the first unaligned 32 bytes.
  1038. + vmovdqu %ymm0, (%rdi)
  1039. + // The first aligned word is stored in %rsi.
  1040. + mov %rdi, %rsi
  1041. + mov %rdi, %rax
  1042. + and $0xffffffffffffffe0, %rsi
  1043. + lea 0x20(%rsi), %rsi
  1044. + // Compute the address of the last unaligned word into rdi.
  1045. + lea -0x20(%rdx), %rdx
  1046. + add %rdx, %rdi
  1047. + // Check if we can do a full 5x32B stamp.
  1048. + lea 0xa0(%rsi), %rcx
  1049. + cmp %rcx, %rdi
  1050. + jb LABEL(stamp_4)
  1051. +
  1052. +LABEL(fill_192):
  1053. + vmovdqa %ymm0, (%rsi)
  1054. + vmovdqa %ymm0, 0x20(%rsi)
  1055. + vmovdqa %ymm0, 0x40(%rsi)
  1056. + vmovdqa %ymm0, 0x60(%rsi)
  1057. + vmovdqa %ymm0, 0x80(%rsi)
  1058. + add $0xa0, %rsi
  1059. + lea 0xa0(%rsi), %rcx
  1060. + cmp %rcx, %rdi
  1061. + ja LABEL(fill_192)
  1062. +
  1063. +LABEL(fill_192_tail):
  1064. + cmp %rsi, %rdi
  1065. + jb LABEL(fill_192_done)
  1066. + vmovdqa %ymm0, (%rsi)
  1067. +
  1068. + lea 0x20(%rsi), %rcx
  1069. + cmp %rcx, %rdi
  1070. + jb LABEL(fill_192_done)
  1071. + vmovdqa %ymm0, 0x20(%rsi)
  1072. +
  1073. + lea 0x40(%rsi), %rcx
  1074. + cmp %rcx, %rdi
  1075. + jb LABEL(fill_192_done)
  1076. + vmovdqa %ymm0, 0x40(%rsi)
  1077. +
  1078. + lea 0x60(%rsi), %rcx
  1079. + cmp %rcx, %rdi
  1080. + jb LABEL(fill_192_done)
  1081. + vmovdqa %ymm0, 0x60(%rsi)
  1082. +
  1083. +LABEL(last_wide_store):
  1084. + lea 0x80(%rsi), %rcx
  1085. + cmp %rcx, %rdi
  1086. + jb LABEL(fill_192_done)
  1087. + vmovdqa %ymm0, 0x80(%rsi)
  1088. +
  1089. +.align 16
  1090. +LABEL(fill_192_done):
  1091. + // Stamp the last word.
  1092. + vmovdqu %ymm0, (%rdi)
  1093. + vzeroupper
  1094. + // FIXME return buffer address
  1095. + ret
  1096. +
  1097. +LABEL(stamp_4):
  1098. + vmovdqa %ymm0, (%rsi)
  1099. + vmovdqa %ymm0, 0x20(%rsi)
  1100. + vmovdqa %ymm0, 0x40(%rsi)
  1101. + vmovdqa %ymm0, 0x60(%rsi)
  1102. + jmp LABEL(last_wide_store)
  1103. +
  1104. +LABEL(large_stosq):
  1105. +// rdi is the buffer address
  1106. +// rsi is the value
  1107. +// rdx is length
  1108. + vmovd %xmm0, %rax
  1109. + mov %rax, (%rdi)
  1110. + mov %rdi, %rsi
  1111. + // Align rdi to 8B
  1112. + and $0xfffffffffffffff8, %rdi
  1113. + lea 0x8(%rdi), %rdi
  1114. + // Fill buffer using stosq
  1115. + mov %rdx, %rcx
  1116. + sub $0x8, %rcx
  1117. + shrq $0x3, %rcx
  1118. + // rcx - number of QWORD elements
  1119. + // rax - value
  1120. + // rdi - buffer pointer
  1121. + rep stosq
  1122. + // Fill last 16 bytes
  1123. + vmovdqu %xmm0, -0x10(%rsi, %rdx)
  1124. + vzeroupper
  1125. + mov %rsi, %rax
  1126. + ret
  1127. +
  1128. +.align 16
  1129. +LABEL(none_or_one):
  1130. + test %rdx, %rdx
  1131. + je LABEL(exit)
  1132. + // Store one and exit
  1133. + mov %sil, (%rdi)
  1134. + ret
  1135. +
  1136. + .cfi_endproc
  1137. + .size __folly_memset, .-__folly_memset
  1138. +
  1139. + .global memset
  1140. + memset = __folly_memset
  1141. +
  1142. +#else
  1143. +// original musl implementation
  1144. +
  1145. +.global memset
  1146. +.type memset,@function
  1147. +memset:
  1148. + movzbq %sil,%rax
  1149. + mov $0x101010101010101,%r8
  1150. + imul %r8,%rax
  1151. +
  1152. + cmp $126,%rdx
  1153. + ja 2f
  1154. +
  1155. + test %edx,%edx
  1156. + jz 1f
  1157. +
  1158. + mov %sil,(%rdi)
  1159. + mov %sil,-1(%rdi,%rdx)
  1160. + cmp $2,%edx
  1161. + jbe 1f
  1162. +
  1163. + mov %ax,1(%rdi)
  1164. + mov %ax,(-1-2)(%rdi,%rdx)
  1165. + cmp $6,%edx
  1166. + jbe 1f
  1167. +
  1168. + mov %eax,(1+2)(%rdi)
  1169. + mov %eax,(-1-2-4)(%rdi,%rdx)
  1170. + cmp $14,%edx
  1171. + jbe 1f
  1172. +
  1173. + mov %rax,(1+2+4)(%rdi)
  1174. + mov %rax,(-1-2-4-8)(%rdi,%rdx)
  1175. + cmp $30,%edx
  1176. + jbe 1f
  1177. +
  1178. + mov %rax,(1+2+4+8)(%rdi)
  1179. + mov %rax,(1+2+4+8+8)(%rdi)
  1180. + mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
  1181. + mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
  1182. + cmp $62,%edx
  1183. + jbe 1f
  1184. +
  1185. + mov %rax,(1+2+4+8+16)(%rdi)
  1186. + mov %rax,(1+2+4+8+16+8)(%rdi)
  1187. + mov %rax,(1+2+4+8+16+16)(%rdi)
  1188. + mov %rax,(1+2+4+8+16+24)(%rdi)
  1189. + mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
  1190. + mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
  1191. + mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
  1192. + mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
  1193. +
  1194. +1: mov %rdi,%rax
  1195. + ret
  1196. +
  1197. +2: test $15,%edi
  1198. + mov %rdi,%r8
  1199. + mov %rax,-8(%rdi,%rdx)
  1200. + mov %rdx,%rcx
  1201. + jnz 2f
  1202. +
  1203. +1: shr $3,%rcx
  1204. + rep
  1205. + stosq
  1206. + mov %r8,%rax
  1207. + ret
  1208. +
  1209. +2: xor %edx,%edx
  1210. + sub %edi,%edx
  1211. + and $15,%edx
  1212. + mov %rax,(%rdi)
  1213. + mov %rax,8(%rdi)
  1214. + sub %rdx,%rcx
  1215. + add %rdx,%rdi
  1216. + jmp 1b
  1217. +
  1218. +#endif // __AVX2__
  1219. \ No newline at end of file
  1220. diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
  1221. deleted file mode 100644
  1222. index 2d3f5e52..00000000
  1223. --- a/src/string/x86_64/memset.s
  1224. +++ /dev/null
  1225. @@ -1,72 +0,0 @@
  1226. -.global memset
  1227. -.type memset,@function
  1228. -memset:
  1229. - movzbq %sil,%rax
  1230. - mov $0x101010101010101,%r8
  1231. - imul %r8,%rax
  1232. -
  1233. - cmp $126,%rdx
  1234. - ja 2f
  1235. -
  1236. - test %edx,%edx
  1237. - jz 1f
  1238. -
  1239. - mov %sil,(%rdi)
  1240. - mov %sil,-1(%rdi,%rdx)
  1241. - cmp $2,%edx
  1242. - jbe 1f
  1243. -
  1244. - mov %ax,1(%rdi)
  1245. - mov %ax,(-1-2)(%rdi,%rdx)
  1246. - cmp $6,%edx
  1247. - jbe 1f
  1248. -
  1249. - mov %eax,(1+2)(%rdi)
  1250. - mov %eax,(-1-2-4)(%rdi,%rdx)
  1251. - cmp $14,%edx
  1252. - jbe 1f
  1253. -
  1254. - mov %rax,(1+2+4)(%rdi)
  1255. - mov %rax,(-1-2-4-8)(%rdi,%rdx)
  1256. - cmp $30,%edx
  1257. - jbe 1f
  1258. -
  1259. - mov %rax,(1+2+4+8)(%rdi)
  1260. - mov %rax,(1+2+4+8+8)(%rdi)
  1261. - mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
  1262. - mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
  1263. - cmp $62,%edx
  1264. - jbe 1f
  1265. -
  1266. - mov %rax,(1+2+4+8+16)(%rdi)
  1267. - mov %rax,(1+2+4+8+16+8)(%rdi)
  1268. - mov %rax,(1+2+4+8+16+16)(%rdi)
  1269. - mov %rax,(1+2+4+8+16+24)(%rdi)
  1270. - mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
  1271. - mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
  1272. - mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
  1273. - mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
  1274. -
  1275. -1: mov %rdi,%rax
  1276. - ret
  1277. -
  1278. -2: test $15,%edi
  1279. - mov %rdi,%r8
  1280. - mov %rax,-8(%rdi,%rdx)
  1281. - mov %rdx,%rcx
  1282. - jnz 2f
  1283. -
  1284. -1: shr $3,%rcx
  1285. - rep
  1286. - stosq
  1287. - mov %r8,%rax
  1288. - ret
  1289. -
  1290. -2: xor %edx,%edx
  1291. - sub %edi,%edx
  1292. - and $15,%edx
  1293. - mov %rax,(%rdi)
  1294. - mov %rax,8(%rdi)
  1295. - sub %rdx,%rcx
  1296. - add %rdx,%rdi
  1297. - jmp 1b
  1298. --
  1299. 2.35.2
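
Appendix (not part of the patch): a minimal C sanity harness, sketched under the assumption of a hosted C environment, that exercises the size classes (the 16/32/64/128/256-byte boundaries) and the memmove-style overlap behaviour described in the comments above. It only compares results against a byte-wise reference, so it can be built and run unchanged before and after applying the patch; the buffer size, the fill() helper and the chosen test sizes are illustrative, not taken from the patch.

/* test_mem.c: sanity checks for memcpy, memmove (overlapping) and memset. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define BUF 4096

static unsigned char src[BUF], dst[BUF], ref[BUF];

/* Fill a buffer with a deterministic, seed-dependent byte pattern. */
static void fill(unsigned char *p, size_t n, unsigned seed)
{
    for (size_t i = 0; i < n; i++)
        p[i] = (unsigned char)(seed + 31 * i);
}

int main(void)
{
    /* Sizes straddling the boundaries called out in the patch comments. */
    static const size_t sizes[] = { 0, 1, 3, 7, 8, 15, 16, 31, 32, 33,
                                    64, 96, 128, 129, 192, 256, 257, 1024 };

    for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
        size_t n = sizes[i];

        /* Plain memcpy between distinct buffers; return value is dst. */
        fill(src, BUF, 1);
        fill(dst, BUF, 2);
        assert(memcpy(dst, src, n) == dst);
        assert(memcmp(dst, src, n) == 0);

        /* Overlapping memmove, forward direction (dst > src). */
        fill(src, BUF, 3);
        memcpy(ref, src, BUF);
        memmove(src + 8, src, n);
        assert(memcmp(src + 8, ref, n) == 0);

        /* Overlapping memmove, backward direction (dst < src). */
        fill(src, BUF, 4);
        memcpy(ref, src, BUF);
        memmove(src, src + 8, n);
        assert(memcmp(src, ref + 8, n) == 0);

        /* memset: fill pattern and return value. */
        fill(dst, BUF, 5);
        assert(memset(dst, 0xA5, n) == dst);
        for (size_t j = 0; j < n; j++)
            assert(dst[j] == 0xA5);
    }

    puts("memcpy/memmove/memset sanity checks passed");
    return 0;
}

Build with something like "cc -O2 test_mem.c -o test_mem" and run it against the target libc; all checks should pass both with the original musl routines and with the optimized ones added by this patch.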