use-optimized-memcpy-memset.patch

  1. From 1299e4c3ef43ba7509a104fc8cdeea90c61dc7cc Mon Sep 17 00:00:00 2001
  2. From: 12101111 <w12101111@gmail.com>
  3. Date: Wed, 9 Mar 2022 23:32:02 +0800
  4. Subject: [PATCH] use optimized memcpy & memset
  5. ---
  6. src/string/aarch64/memcpy.S | 287 ++++++------
  7. src/string/aarch64/memmove.S | 1 +
  8. src/string/x86_64/memcpy.S | 487 +++++++++++++++++++++
  9. src/string/x86_64/memcpy.s | 25 --
  10. src/string/x86_64/{memmove.s => memmove.S} | 5 +
  11. src/string/x86_64/memset.S | 316 +++++++++++++
  12. src/string/x86_64/memset.s | 72 ---
  13. 7 files changed, 962 insertions(+), 231 deletions(-)
  14. create mode 100644 src/string/aarch64/memmove.S
  15. create mode 100644 src/string/x86_64/memcpy.S
  16. delete mode 100644 src/string/x86_64/memcpy.s
  17. rename src/string/x86_64/{memmove.s => memmove.S} (80%)
  18. create mode 100644 src/string/x86_64/memset.S
  19. delete mode 100644 src/string/x86_64/memset.s
  20. diff --git a/src/string/aarch64/memcpy.S b/src/string/aarch64/memcpy.S
  21. index 48bb8a8d..272a727e 100644
  22. --- a/src/string/aarch64/memcpy.S
  23. +++ b/src/string/aarch64/memcpy.S
  24. @@ -7,38 +7,38 @@
  25. /* Assumptions:
  26. *
  27. - * ARMv8-a, AArch64, unaligned accesses.
  28. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  29. *
  30. */
  31. -#define dstin x0
  32. -#define src x1
  33. -#define count x2
  34. -#define dst x3
  35. -#define srcend x4
  36. -#define dstend x5
  37. -#define A_l x6
  38. -#define A_lw w6
  39. -#define A_h x7
  40. -#define B_l x8
  41. -#define B_lw w8
  42. -#define B_h x9
  43. -#define C_l x10
  44. -#define C_lw w10
  45. -#define C_h x11
  46. -#define D_l x12
  47. -#define D_h x13
  48. -#define E_l x14
  49. -#define E_h x15
  50. -#define F_l x16
  51. -#define F_h x17
  52. -#define G_l count
  53. -#define G_h dst
  54. -#define H_l src
  55. -#define H_h srcend
  56. -#define tmp1 x14
  57. -
  58. -/* This implementation of memcpy uses unaligned accesses and branchless
  59. +#define dstin x0
  60. +#define src x1
  61. +#define count x2
  62. +#define dst x3
  63. +#define srcend x4
  64. +#define dstend x5
  65. +#define A_l x6
  66. +#define A_lw w6
  67. +#define A_h x7
  68. +#define B_l x8
  69. +#define B_lw w8
  70. +#define B_h x9
  71. +#define C_lw w10
  72. +#define tmp1 x14
  73. +
  74. +#define A_q q0
  75. +#define B_q q1
  76. +#define C_q q2
  77. +#define D_q q3
  78. +#define E_q q4
  79. +#define F_q q5
  80. +#define G_q q6
  81. +#define H_q q7
  82. +
  83. +#define L(l) .L ## l
  84. +
  85. +/* This implementation handles overlaps and supports both memcpy and memmove
  86. + from a single entry point. It uses unaligned accesses and branchless
  87. sequences to keep the code small, simple and improve performance.
  88. Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  89. @@ -46,141 +46,160 @@
  90. check is negligible since it is only required for large copies.
  91. Large copies use a software pipelined loop processing 64 bytes per iteration.
  92. - The destination pointer is 16-byte aligned to minimize unaligned accesses.
  93. + The source pointer is 16-byte aligned to minimize unaligned accesses.
  94. The loop tail is handled by always copying 64 bytes from the end.
  95. */
  96. .global memcpy
  97. .type memcpy,%function
  98. -memcpy:
  99. - add srcend, src, count
  100. - add dstend, dstin, count
  101. - cmp count, 128
  102. - b.hi .Lcopy_long
  103. - cmp count, 32
  104. - b.hi .Lcopy32_128
  105. +.global memmove
  106. +.type memmove,%function
  107. +memmove:
  108. +memcpy: add srcend, src, count
  109. + add dstend, dstin, count
  110. + cmp count, 128
  111. + b.hi L(copy_long)
  112. + cmp count, 32
  113. + b.hi L(copy32_128)
  114. /* Small copies: 0..32 bytes. */
  115. - cmp count, 16
  116. - b.lo .Lcopy16
  117. - ldp A_l, A_h, [src]
  118. - ldp D_l, D_h, [srcend, -16]
  119. - stp A_l, A_h, [dstin]
  120. - stp D_l, D_h, [dstend, -16]
  121. + cmp count, 16
  122. + b.lo L(copy16)
  123. + ldr A_q, [src]
  124. + ldr B_q, [srcend, -16]
  125. + str A_q, [dstin]
  126. + str B_q, [dstend, -16]
  127. ret
  128. /* Copy 8-15 bytes. */
  129. -.Lcopy16:
  130. - tbz count, 3, .Lcopy8
  131. - ldr A_l, [src]
  132. - ldr A_h, [srcend, -8]
  133. - str A_l, [dstin]
  134. - str A_h, [dstend, -8]
  135. +L(copy16):
  136. + tbz count, 3, L(copy8)
  137. + ldr A_l, [src]
  138. + ldr A_h, [srcend, -8]
  139. + str A_l, [dstin]
  140. + str A_h, [dstend, -8]
  141. ret
  142. .p2align 3
  143. /* Copy 4-7 bytes. */
  144. -.Lcopy8:
  145. - tbz count, 2, .Lcopy4
  146. - ldr A_lw, [src]
  147. - ldr B_lw, [srcend, -4]
  148. - str A_lw, [dstin]
  149. - str B_lw, [dstend, -4]
  150. +L(copy8):
  151. + tbz count, 2, L(copy4)
  152. + ldr A_lw, [src]
  153. + ldr B_lw, [srcend, -4]
  154. + str A_lw, [dstin]
  155. + str B_lw, [dstend, -4]
  156. ret
  157. /* Copy 0..3 bytes using a branchless sequence. */
  158. -.Lcopy4:
  159. - cbz count, .Lcopy0
  160. - lsr tmp1, count, 1
  161. - ldrb A_lw, [src]
  162. - ldrb C_lw, [srcend, -1]
  163. - ldrb B_lw, [src, tmp1]
  164. - strb A_lw, [dstin]
  165. - strb B_lw, [dstin, tmp1]
  166. - strb C_lw, [dstend, -1]
  167. -.Lcopy0:
  168. +L(copy4):
  169. + cbz count, L(copy0)
  170. + lsr tmp1, count, 1
  171. + ldrb A_lw, [src]
  172. + ldrb C_lw, [srcend, -1]
  173. + ldrb B_lw, [src, tmp1]
  174. + strb A_lw, [dstin]
  175. + strb B_lw, [dstin, tmp1]
  176. + strb C_lw, [dstend, -1]
  177. +L(copy0):
  178. ret
  179. .p2align 4
  180. /* Medium copies: 33..128 bytes. */
  181. -.Lcopy32_128:
  182. - ldp A_l, A_h, [src]
  183. - ldp B_l, B_h, [src, 16]
  184. - ldp C_l, C_h, [srcend, -32]
  185. - ldp D_l, D_h, [srcend, -16]
  186. - cmp count, 64
  187. - b.hi .Lcopy128
  188. - stp A_l, A_h, [dstin]
  189. - stp B_l, B_h, [dstin, 16]
  190. - stp C_l, C_h, [dstend, -32]
  191. - stp D_l, D_h, [dstend, -16]
  192. +L(copy32_128):
  193. + ldp A_q, B_q, [src]
  194. + ldp C_q, D_q, [srcend, -32]
  195. + cmp count, 64
  196. + b.hi L(copy128)
  197. + stp A_q, B_q, [dstin]
  198. + stp C_q, D_q, [dstend, -32]
  199. ret
  200. .p2align 4
  201. /* Copy 65..128 bytes. */
  202. -.Lcopy128:
  203. - ldp E_l, E_h, [src, 32]
  204. - ldp F_l, F_h, [src, 48]
  205. - cmp count, 96
  206. - b.ls .Lcopy96
  207. - ldp G_l, G_h, [srcend, -64]
  208. - ldp H_l, H_h, [srcend, -48]
  209. - stp G_l, G_h, [dstend, -64]
  210. - stp H_l, H_h, [dstend, -48]
  211. -.Lcopy96:
  212. - stp A_l, A_h, [dstin]
  213. - stp B_l, B_h, [dstin, 16]
  214. - stp E_l, E_h, [dstin, 32]
  215. - stp F_l, F_h, [dstin, 48]
  216. - stp C_l, C_h, [dstend, -32]
  217. - stp D_l, D_h, [dstend, -16]
  218. +L(copy128):
  219. + ldp E_q, F_q, [src, 32]
  220. + cmp count, 96
  221. + b.ls L(copy96)
  222. + ldp G_q, H_q, [srcend, -64]
  223. + stp G_q, H_q, [dstend, -64]
  224. +L(copy96):
  225. + stp A_q, B_q, [dstin]
  226. + stp E_q, F_q, [dstin, 32]
  227. + stp C_q, D_q, [dstend, -32]
  228. ret
  229. - .p2align 4
  230. /* Copy more than 128 bytes. */
  231. -.Lcopy_long:
  232. -
  233. - /* Copy 16 bytes and then align dst to 16-byte alignment. */
  234. -
  235. - ldp D_l, D_h, [src]
  236. - and tmp1, dstin, 15
  237. - bic dst, dstin, 15
  238. - sub src, src, tmp1
  239. - add count, count, tmp1 /* Count is now 16 too large. */
  240. - ldp A_l, A_h, [src, 16]
  241. - stp D_l, D_h, [dstin]
  242. - ldp B_l, B_h, [src, 32]
  243. - ldp C_l, C_h, [src, 48]
  244. - ldp D_l, D_h, [src, 64]!
  245. - subs count, count, 128 + 16 /* Test and readjust count. */
  246. - b.ls .Lcopy64_from_end
  247. -
  248. -.Lloop64:
  249. - stp A_l, A_h, [dst, 16]
  250. - ldp A_l, A_h, [src, 16]
  251. - stp B_l, B_h, [dst, 32]
  252. - ldp B_l, B_h, [src, 32]
  253. - stp C_l, C_h, [dst, 48]
  254. - ldp C_l, C_h, [src, 48]
  255. - stp D_l, D_h, [dst, 64]!
  256. - ldp D_l, D_h, [src, 64]!
  257. - subs count, count, 64
  258. - b.hi .Lloop64
  259. +L(copy_long):
  260. + /* Use backwards copy if there is an overlap. */
  261. + sub tmp1, dstin, src
  262. + cmp tmp1, count
  263. + b.lo L(copy_long_backwards)
  264. +
  265. + /* Copy 16 bytes and then align src to 16-byte alignment. */
  266. + ldr D_q, [src]
  267. + and tmp1, src, 15
  268. + bic src, src, 15
  269. + sub dst, dstin, tmp1
  270. + add count, count, tmp1 /* Count is now 16 too large. */
  271. + ldp A_q, B_q, [src, 16]
  272. + str D_q, [dstin]
  273. + ldp C_q, D_q, [src, 48]
  274. + subs count, count, 128 + 16 /* Test and readjust count. */
  275. + b.ls L(copy64_from_end)
  276. +L(loop64):
  277. + stp A_q, B_q, [dst, 16]
  278. + ldp A_q, B_q, [src, 80]
  279. + stp C_q, D_q, [dst, 48]
  280. + ldp C_q, D_q, [src, 112]
  281. + add src, src, 64
  282. + add dst, dst, 64
  283. + subs count, count, 64
  284. + b.hi L(loop64)
  285. /* Write the last iteration and copy 64 bytes from the end. */
  286. -.Lcopy64_from_end:
  287. - ldp E_l, E_h, [srcend, -64]
  288. - stp A_l, A_h, [dst, 16]
  289. - ldp A_l, A_h, [srcend, -48]
  290. - stp B_l, B_h, [dst, 32]
  291. - ldp B_l, B_h, [srcend, -32]
  292. - stp C_l, C_h, [dst, 48]
  293. - ldp C_l, C_h, [srcend, -16]
  294. - stp D_l, D_h, [dst, 64]
  295. - stp E_l, E_h, [dstend, -64]
  296. - stp A_l, A_h, [dstend, -48]
  297. - stp B_l, B_h, [dstend, -32]
  298. - stp C_l, C_h, [dstend, -16]
  299. +L(copy64_from_end):
  300. + ldp E_q, F_q, [srcend, -64]
  301. + stp A_q, B_q, [dst, 16]
  302. + ldp A_q, B_q, [srcend, -32]
  303. + stp C_q, D_q, [dst, 48]
  304. + stp E_q, F_q, [dstend, -64]
  305. + stp A_q, B_q, [dstend, -32]
  306. + ret
  307. +
  308. + /* Large backwards copy for overlapping copies.
  309. + Copy 16 bytes and then align srcend to 16-byte alignment. */
  310. +L(copy_long_backwards):
  311. + cbz tmp1, L(copy0)
  312. + ldr D_q, [srcend, -16]
  313. + and tmp1, srcend, 15
  314. + bic srcend, srcend, 15
  315. + sub count, count, tmp1
  316. + ldp A_q, B_q, [srcend, -32]
  317. + str D_q, [dstend, -16]
  318. + ldp C_q, D_q, [srcend, -64]
  319. + sub dstend, dstend, tmp1
  320. + subs count, count, 128
  321. + b.ls L(copy64_from_start)
  322. +
  323. +L(loop64_backwards):
  324. + str B_q, [dstend, -16]
  325. + str A_q, [dstend, -32]
  326. + ldp A_q, B_q, [srcend, -96]
  327. + str D_q, [dstend, -48]
  328. + str C_q, [dstend, -64]!
  329. + ldp C_q, D_q, [srcend, -128]
  330. + sub srcend, srcend, 64
  331. + subs count, count, 64
  332. + b.hi L(loop64_backwards)
  333. +
  334. + /* Write the last iteration and copy 64 bytes from the start. */
  335. +L(copy64_from_start):
  336. + ldp E_q, F_q, [src, 32]
  337. + stp A_q, B_q, [dstend, -32]
  338. + ldp A_q, B_q, [src]
  339. + stp C_q, D_q, [dstend, -64]
  340. + stp E_q, F_q, [dstin, 32]
  341. + stp A_q, B_q, [dstin]
  342. ret
  343. .size memcpy,.-memcpy
  344. diff --git a/src/string/aarch64/memmove.S b/src/string/aarch64/memmove.S
  345. new file mode 100644
  346. index 00000000..90fd94a7
  347. --- /dev/null
  348. +++ b/src/string/aarch64/memmove.S
  349. @@ -0,0 +1 @@
  350. +// implemented as memcpy
  351. diff --git a/src/string/x86_64/memcpy.S b/src/string/x86_64/memcpy.S
  352. new file mode 100644
  353. index 00000000..c972b677
  354. --- /dev/null
  355. +++ b/src/string/x86_64/memcpy.S
  356. @@ -0,0 +1,487 @@
  357. +/*
  358. + * Copyright (c) Meta Platforms, Inc. and affiliates.
  359. + *
  360. + * Licensed under the Apache License, Version 2.0 (the "License");
  361. + * you may not use this file except in compliance with the License.
  362. + * You may obtain a copy of the License at
  363. + *
  364. + * http://www.apache.org/licenses/LICENSE-2.0
  365. + *
  366. + * Unless required by applicable law or agreed to in writing, software
  367. + * distributed under the License is distributed on an "AS IS" BASIS,
  368. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  369. + * See the License for the specific language governing permissions and
  370. + * limitations under the License.
  371. + */
  372. +
  373. +/*
  374. + * __folly_memcpy: An optimized memcpy implementation that uses prefetch and
  375. + * AVX2 instructions.
  376. + *
  377. + * This implementation of memcpy acts as a memmove: while overlapping copies
  378. + * are undefined in memcpy, in some implementations they're the same function and
  379. + * legacy programs rely on this behavior.
  380. + *
  381. + * This implementation uses prefetch to avoid dtlb misses. This can
  382. + * substantially reduce dtlb store misses in cases where the destination
  383. + * location is absent from L1 cache and where the copy size is small enough
  384. + * that the hardware prefetcher doesn't have a large impact.
  385. + *
  386. + * The number of branches is limited by the use of overlapping loads & stores.
  387. + * This helps with copies where the source and destination cache lines are already
  388. + * present in L1 because there are fewer instructions to execute and fewer
  389. + * branches to potentially mispredict.
  390. + * e.g. to copy the last 4 <= n <= 7 bytes: copy the first & last 4 bytes (overlapped):
  391. + * movl (%rsi), %r8d
  392. + * movl -4(%rsi,%rdx), %r9d
  393. + * movl %r8d, (%rdi)
  394. + * movl %r9d, -4(%rdi,%rdx)
  395. + *
  396. + *
  397. + * For sizes up to 256 all source data is first read into registers and then written:
  398. + * - n <= 16: overlapping movs
  399. + * - n <= 32: overlapping unaligned 16-byte SSE XMM load/stores
  400. + * - n <= 256: overlapping unaligned 32-byte AVX YMM load/stores
  401. + *
  402. + * Large copies (> 256 bytes) use unaligned loads + aligned stores.
  403. + * This is observed to always be faster than rep movsb, so the rep movsb
  404. + * instruction is not used.
  405. + * - The head & tail may be unaligned => they're always written using unaligned stores.
  406. + *
  407. + * If the copy size is humongous (> 32 KiB) and the source and destination are both
  408. + * aligned, this memcpy will use non-temporal operations (AVX2). This can have
  409. + * a substantial speedup for copies where data is absent from L1, but it
  410. + * is significantly slower if the source and destination data were already
  411. + * in L1. The use of non-temporal operations also has the effect that after
  412. + * the copy is complete, the data will be moved out of L1, even if the data was
  413. + * present before the copy started.
  414. + *
  415. + * For n > 256 and overlapping src & dst buffers (memmove):
  416. + * - use unaligned loads + aligned stores, but not non-temporal stores
  417. + * - for dst < src forward copy in 128 byte batches:
  418. + * - unaligned load the first 32 bytes & last 4 x 32 bytes
  419. + * - forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
  420. + * - unaligned store the first 32 bytes & last 4 x 32 bytes
  421. + * - for dst > src backward copy in 128 byte batches:
  422. + * - unaligned load the first 4 x 32 bytes & last 32 bytes
  423. + * - backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time
  424. + * - unaligned store the first 4 x 32 bytes & last 32 bytes
  425. + *
  426. + * @author Logan Evans <lpe@fb.com>
  427. + */
  428. +
  429. +#if defined(__AVX2__)
  430. +
  431. +#if defined(PREFETCH)
  432. +#undef PREFETCH
  433. +#endif
  434. +#if __PRFCHW__ // Broadwell+
  435. +#define PREFETCH prefetchw
  436. +#else
  437. +#define PREFETCH prefetcht0
  438. +#endif
  439. +
  440. +// This threshold is half of L1 cache on a Skylake machine, which means that
  441. +// potentially all of L1 will be populated by this copy once it is executed
  442. +// (dst and src are cached for temporal copies).
  443. +#define NON_TEMPORAL_STORE_THRESHOLD $32768
  444. +
  445. + .file "memcpy.S"
  446. + .section .text,"ax"
  447. +
  448. + .type __folly_memcpy_short, @function
  449. +__folly_memcpy_short:
  450. + .cfi_startproc
  451. +
  452. +.L_GE1_LE7:
  453. + cmp $1, %rdx
  454. + je .L_EQ1
  455. +
  456. + cmp $4, %rdx
  457. + jae .L_GE4_LE7
  458. +
  459. +.L_GE2_LE3:
  460. + movw (%rsi), %r8w
  461. + movw -2(%rsi,%rdx), %r9w
  462. + movw %r8w, (%rdi)
  463. + movw %r9w, -2(%rdi,%rdx)
  464. + ret
  465. +
  466. + .align 2
  467. +.L_EQ1:
  468. + movb (%rsi), %r8b
  469. + movb %r8b, (%rdi)
  470. + ret
  471. +
  472. + // Aligning the target of a jump to an even address has a measurable
  473. + // speedup in microbenchmarks.
  474. + .align 2
  475. +.L_GE4_LE7:
  476. + movl (%rsi), %r8d
  477. + movl -4(%rsi,%rdx), %r9d
  478. + movl %r8d, (%rdi)
  479. + movl %r9d, -4(%rdi,%rdx)
  480. + ret
  481. +
  482. + .cfi_endproc
  483. + .size __folly_memcpy_short, .-__folly_memcpy_short
  484. +
  485. +// memcpy is an alternative entrypoint into the function named __folly_memcpy.
  486. +// The compiler is able to call memcpy since the name is global while
  487. +// stacktraces will show __folly_memcpy since that is the name of the function.
  488. +// This is intended to aid in debugging by making it obvious which version of
  489. +// memcpy is being used.
  490. + .align 64
  491. + .hidden __folly_memcpy
  492. + .type __folly_memcpy, @function
  493. +
  494. +__folly_memcpy:
  495. + .cfi_startproc
  496. +
  497. + mov %rdi, %rax # return: $rdi
  498. +
  499. + test %rdx, %rdx
  500. + je .L_EQ0
  501. +
  502. + PREFETCH (%rdi)
  503. + PREFETCH -1(%rdi,%rdx)
  504. +
  505. + cmp $8, %rdx
  506. + jb .L_GE1_LE7
  507. +
  508. +.L_GE8:
  509. + cmp $32, %rdx
  510. + ja .L_GE33
  511. +
  512. +.L_GE8_LE32:
  513. + cmp $16, %rdx
  514. + ja .L_GE17_LE32
  515. +
  516. +.L_GE8_LE16:
  517. + mov (%rsi), %r8
  518. + mov -8(%rsi,%rdx), %r9
  519. + mov %r8, (%rdi)
  520. + mov %r9, -8(%rdi,%rdx)
  521. +.L_EQ0:
  522. + ret
  523. +
  524. + .align 2
  525. +.L_GE17_LE32:
  526. + movdqu (%rsi), %xmm0
  527. + movdqu -16(%rsi,%rdx), %xmm1
  528. + movdqu %xmm0, (%rdi)
  529. + movdqu %xmm1, -16(%rdi,%rdx)
  530. + ret
  531. +
  532. + .align 2
  533. +.L_GE193_LE256:
  534. + vmovdqu %ymm3, 96(%rdi)
  535. + vmovdqu %ymm4, -128(%rdi,%rdx)
  536. +
  537. +.L_GE129_LE192:
  538. + vmovdqu %ymm2, 64(%rdi)
  539. + vmovdqu %ymm5, -96(%rdi,%rdx)
  540. +
  541. +.L_GE65_LE128:
  542. + vmovdqu %ymm1, 32(%rdi)
  543. + vmovdqu %ymm6, -64(%rdi,%rdx)
  544. +
  545. +.L_GE33_LE64:
  546. + vmovdqu %ymm0, (%rdi)
  547. + vmovdqu %ymm7, -32(%rdi,%rdx)
  548. +
  549. + vzeroupper
  550. + ret
  551. +
  552. + .align 2
  553. +.L_GE33:
  554. + vmovdqu (%rsi), %ymm0
  555. + vmovdqu -32(%rsi,%rdx), %ymm7
  556. +
  557. + cmp $64, %rdx
  558. + jbe .L_GE33_LE64
  559. +
  560. + PREFETCH 64(%rdi)
  561. +
  562. + vmovdqu 32(%rsi), %ymm1
  563. + vmovdqu -64(%rsi,%rdx), %ymm6
  564. +
  565. + cmp $128, %rdx
  566. + jbe .L_GE65_LE128
  567. +
  568. + PREFETCH 128(%rdi)
  569. +
  570. + vmovdqu 64(%rsi), %ymm2
  571. + vmovdqu -96(%rsi,%rdx), %ymm5
  572. +
  573. + cmp $192, %rdx
  574. + jbe .L_GE129_LE192
  575. +
  576. + PREFETCH 192(%rdi)
  577. +
  578. + vmovdqu 96(%rsi), %ymm3
  579. + vmovdqu -128(%rsi,%rdx), %ymm4
  580. +
  581. + cmp $256, %rdx
  582. + jbe .L_GE193_LE256
  583. +
  584. +.L_GE257:
  585. + PREFETCH 256(%rdi)
  586. +
  587. + // Check if there is an overlap. If there is an overlap then the caller
  588. + // has a bug since this is undefined behavior. However, for legacy
  589. + // reasons this behavior is expected by some callers.
  590. + //
  591. + // All copies through 256 bytes will operate as a memmove since for
  592. + // those sizes all reads are performed before any writes.
  593. + //
  594. + // This check uses the idea that there is an overlap if
  595. + // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)),
  596. + // or equivalently, there is no overlap if
  597. + // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi).
  598. + //
  599. + // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many
  600. + // bytes remain to be copied.
  601. +
  602. + // (%rsi + %rdx <= %rdi) => no overlap
  603. + lea (%rsi,%rdx), %r9
  604. + cmp %rdi, %r9
  605. + jbe .L_NO_OVERLAP
  606. +
  607. + // (%rdi + %rdx <= %rsi) => no overlap
  608. + lea (%rdi,%rdx), %r8
  609. + cmp %rsi, %r8
  610. + // If no info is available in branch predictor's cache, Intel CPUs assume
  611. + // forward jumps are not taken. Use a forward jump as overlapping buffers
  612. + // are unlikely.
  613. + ja .L_OVERLAP
  614. +
  615. + .align 2
  616. +.L_NO_OVERLAP:
  617. + vmovdqu %ymm0, (%rdi)
  618. + vmovdqu %ymm1, 32(%rdi)
  619. + vmovdqu %ymm2, 64(%rdi)
  620. + vmovdqu %ymm3, 96(%rdi)
  621. +
  622. + // Align %rdi to a 32 byte boundary.
  623. + // %rcx = 128 - 31 & %rdi
  624. + mov $128, %rcx
  625. + and $31, %rdi
  626. + sub %rdi, %rcx
  627. +
  628. + lea (%rsi,%rcx), %rsi
  629. + lea (%rax,%rcx), %rdi
  630. + sub %rcx, %rdx
  631. +
  632. + // %r8 is the end condition for the loop.
  633. + lea -128(%rsi,%rdx), %r8
  634. +
  635. + cmp NON_TEMPORAL_STORE_THRESHOLD, %rdx
  636. + jae .L_NON_TEMPORAL_LOOP
  637. +
  638. + .align 2
  639. +.L_ALIGNED_DST_LOOP:
  640. + PREFETCH 128(%rdi)
  641. + PREFETCH 192(%rdi)
  642. +
  643. + vmovdqu (%rsi), %ymm0
  644. + vmovdqu 32(%rsi), %ymm1
  645. + vmovdqu 64(%rsi), %ymm2
  646. + vmovdqu 96(%rsi), %ymm3
  647. + add $128, %rsi
  648. +
  649. + vmovdqa %ymm0, (%rdi)
  650. + vmovdqa %ymm1, 32(%rdi)
  651. + vmovdqa %ymm2, 64(%rdi)
  652. + vmovdqa %ymm3, 96(%rdi)
  653. + add $128, %rdi
  654. +
  655. + cmp %r8, %rsi
  656. + jb .L_ALIGNED_DST_LOOP
  657. +
  658. +.L_ALIGNED_DST_LOOP_END:
  659. + sub %rsi, %r9
  660. + mov %r9, %rdx
  661. +
  662. + vmovdqu %ymm4, -128(%rdi,%rdx)
  663. + vmovdqu %ymm5, -96(%rdi,%rdx)
  664. + vmovdqu %ymm6, -64(%rdi,%rdx)
  665. + vmovdqu %ymm7, -32(%rdi,%rdx)
  666. +
  667. + vzeroupper
  668. + ret
  669. +
  670. + .align 2
  671. +.L_NON_TEMPORAL_LOOP:
  672. + testb $31, %sil
  673. + jne .L_ALIGNED_DST_LOOP
  674. + // This is prefetching the source data unlike ALIGNED_DST_LOOP which
  675. + // prefetches the destination data. This choice is again informed by
  676. + // benchmarks. With a non-temporal store the entirety of the cache line
  677. + // is being written so the previous data can be discarded without being
  678. + // fetched.
  679. + prefetchnta 128(%rsi)
  680. + prefetchnta 196(%rsi)
  681. +
  682. + vmovntdqa (%rsi), %ymm0
  683. + vmovntdqa 32(%rsi), %ymm1
  684. + vmovntdqa 64(%rsi), %ymm2
  685. + vmovntdqa 96(%rsi), %ymm3
  686. + add $128, %rsi
  687. +
  688. + vmovntdq %ymm0, (%rdi)
  689. + vmovntdq %ymm1, 32(%rdi)
  690. + vmovntdq %ymm2, 64(%rdi)
  691. + vmovntdq %ymm3, 96(%rdi)
  692. + add $128, %rdi
  693. +
  694. + cmp %r8, %rsi
  695. + jb .L_NON_TEMPORAL_LOOP
  696. +
  697. + sfence
  698. + jmp .L_ALIGNED_DST_LOOP_END
  699. +
  700. +
  701. +.L_OVERLAP:
  702. + .align 2
  703. + cmp %rdi, %rsi
  704. + jb .L_OVERLAP_BWD // %rsi < %rdi => backward-copy
  705. + je .L_RET // %rsi == %rdi => return, nothing to copy
  706. +
  707. + // Source & destination buffers overlap. Forward copy.
  708. +
  709. + vmovdqu (%rsi), %ymm8
  710. +
  711. + // Align %rdi to a 32 byte boundary.
  712. + // %rcx = 32 - 31 & %rdi
  713. + mov $32, %rcx
  714. + and $31, %rdi
  715. + sub %rdi, %rcx
  716. +
  717. + lea (%rsi,%rcx), %rsi
  718. + lea (%rax,%rcx), %rdi
  719. + sub %rcx, %rdx
  720. +
  721. + // %r8 is the end condition for the loop.
  722. + lea -128(%rsi,%rdx), %r8
  723. +
  724. +
  725. +.L_OVERLAP_FWD_ALIGNED_DST_LOOP:
  726. + PREFETCH 128(%rdi)
  727. + PREFETCH 192(%rdi)
  728. +
  729. + vmovdqu (%rsi), %ymm0
  730. + vmovdqu 32(%rsi), %ymm1
  731. + vmovdqu 64(%rsi), %ymm2
  732. + vmovdqu 96(%rsi), %ymm3
  733. + add $128, %rsi
  734. +
  735. + vmovdqa %ymm0, (%rdi)
  736. + vmovdqa %ymm1, 32(%rdi)
  737. + vmovdqa %ymm2, 64(%rdi)
  738. + vmovdqa %ymm3, 96(%rdi)
  739. + add $128, %rdi
  740. +
  741. + cmp %r8, %rsi
  742. + jb .L_OVERLAP_FWD_ALIGNED_DST_LOOP
  743. +
  744. + sub %rsi, %r9
  745. + mov %r9, %rdx
  746. +
  747. + vmovdqu %ymm4, -128(%rdi,%rdx)
  748. + vmovdqu %ymm5, -96(%rdi,%rdx)
  749. + vmovdqu %ymm6, -64(%rdi,%rdx)
  750. + vmovdqu %ymm7, -32(%rdi,%rdx)
  751. + vmovdqu %ymm8, (%rax) // %rax == the original (unaligned) %rdi
  752. +
  753. + vzeroupper
  754. +
  755. +.L_RET:
  756. + ret
  757. +
  758. +.L_OVERLAP_BWD:
  759. + # Save last 32 bytes.
  760. + vmovdqu -32(%rsi, %rdx), %ymm8
  761. + lea -32(%rdi, %rdx), %r9
  762. +
  763. +
  764. + // %r8 is the end condition for the loop.
  765. + lea 128(%rsi), %r8
  766. +
  767. + // Align %rdi+%rdx (destination end) to a 32 byte boundary.
  768. + // %rcx = (%rdi + %rdx - 32) & 31
  769. + mov %r9, %rcx
  770. + and $31, %rcx
  771. + // Set %rsi & %rdi to the end of the 32 byte aligned range.
  772. + sub %rcx, %rdx
  773. + add %rdx, %rsi
  774. + add %rdx, %rdi
  775. +
  776. +
  777. +.L_OVERLAP_BWD_ALIGNED_DST_LOOP:
  778. + PREFETCH -128(%rdi)
  779. + PREFETCH -192(%rdi)
  780. +
  781. + vmovdqu -32(%rsi), %ymm4
  782. + vmovdqu -64(%rsi), %ymm5
  783. + vmovdqu -96(%rsi), %ymm6
  784. + vmovdqu -128(%rsi), %ymm7
  785. + sub $128, %rsi
  786. +
  787. + vmovdqa %ymm4, -32(%rdi)
  788. + vmovdqa %ymm5, -64(%rdi)
  789. + vmovdqa %ymm6, -96(%rdi)
  790. + vmovdqa %ymm7, -128(%rdi)
  791. + sub $128, %rdi
  792. +
  793. + cmp %r8, %rsi
  794. + ja .L_OVERLAP_BWD_ALIGNED_DST_LOOP
  795. +
  796. + vmovdqu %ymm0, (%rax) // %rax == the original unaligned %rdi
  797. + vmovdqu %ymm1, 32(%rax)
  798. + vmovdqu %ymm2, 64(%rax)
  799. + vmovdqu %ymm3, 96(%rax)
  800. + vmovdqu %ymm8, (%r9)
  801. +
  802. + vzeroupper
  803. + ret
  804. +
  805. + .cfi_endproc
  806. + .size __folly_memcpy, .-__folly_memcpy
  807. +
  808. + .global memcpy
  809. + memcpy = __folly_memcpy
  810. +
  811. + .global memmove
  812. + memmove = __folly_memcpy
  813. +
  814. +#else
  815. +// original musl implementation
  816. +
  817. +.global memcpy
  818. +.global __memcpy_fwd
  819. +.hidden __memcpy_fwd
  820. +.type memcpy,@function
  821. +memcpy:
  822. +__memcpy_fwd:
  823. + mov %rdi,%rax
  824. + cmp $8,%rdx
  825. + jc 1f
  826. + test $7,%edi
  827. + jz 1f
  828. +2: movsb
  829. + dec %rdx
  830. + test $7,%edi
  831. + jnz 2b
  832. +1: mov %rdx,%rcx
  833. + shr $3,%rcx
  834. + rep
  835. + movsq
  836. + and $7,%edx
  837. + jz 1f
  838. +2: movsb
  839. + dec %edx
  840. + jnz 2b
  841. +1: ret
  842. +
  843. +#endif
  844. \ No newline at end of file
  845. diff --git a/src/string/x86_64/memcpy.s b/src/string/x86_64/memcpy.s
  846. deleted file mode 100644
  847. index 3d960efa..00000000
  848. --- a/src/string/x86_64/memcpy.s
  849. +++ /dev/null
  850. @@ -1,25 +0,0 @@
  851. -.global memcpy
  852. -.global __memcpy_fwd
  853. -.hidden __memcpy_fwd
  854. -.type memcpy,@function
  855. -memcpy:
  856. -__memcpy_fwd:
  857. - mov %rdi,%rax
  858. - cmp $8,%rdx
  859. - jc 1f
  860. - test $7,%edi
  861. - jz 1f
  862. -2: movsb
  863. - dec %rdx
  864. - test $7,%edi
  865. - jnz 2b
  866. -1: mov %rdx,%rcx
  867. - shr $3,%rcx
  868. - rep
  869. - movsq
  870. - and $7,%edx
  871. - jz 1f
  872. -2: movsb
  873. - dec %edx
  874. - jnz 2b
  875. -1: ret
  876. diff --git a/src/string/x86_64/memmove.s b/src/string/x86_64/memmove.S
  877. similarity index 80%
  878. rename from src/string/x86_64/memmove.s
  879. rename to src/string/x86_64/memmove.S
  880. index 172c0252..be31d75f 100644
  881. --- a/src/string/x86_64/memmove.s
  882. +++ b/src/string/x86_64/memmove.S
  883. @@ -1,3 +1,7 @@
  884. +
  885. +#if defined(__AVX2__)
  886. +// implemented as memcpy
  887. +#else
  888. .global memmove
  889. .type memmove,@function
  890. memmove:
  891. @@ -14,3 +18,4 @@ memmove:
  892. cld
  893. lea 1(%rdi),%rax
  894. ret
  895. +#endif
  896. \ No newline at end of file
  897. diff --git a/src/string/x86_64/memset.S b/src/string/x86_64/memset.S
  898. new file mode 100644
  899. index 00000000..a42ac3fd
  900. --- /dev/null
  901. +++ b/src/string/x86_64/memset.S
  902. @@ -0,0 +1,316 @@
  903. +/*
  904. + * Copyright (c) Facebook, Inc. and its affiliates.
  905. + *
  906. + * Licensed under the Apache License, Version 2.0 (the "License");
  907. + * you may not use this file except in compliance with the License.
  908. + * You may obtain a copy of the License at
  909. + *
  910. + * http://www.apache.org/licenses/LICENSE-2.0
  911. + *
  912. + * Unless required by applicable law or agreed to in writing, software
  913. + * distributed under the License is distributed on an "AS IS" BASIS,
  914. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  915. + * See the License for the specific language governing permissions and
  916. + * limitations under the License.
  917. + */
  918. +
  919. +#if defined(__AVX2__)
  920. +
  921. +#define LABEL(x) .L##x
  922. +
  923. +.text
  924. +.p2align 5, 0x90
  925. +.hidden __folly_memset
  926. +.type __folly_memset, @function
  927. +__folly_memset:
  928. + .cfi_startproc
  929. +
  930. +// RDI is the buffer
  931. +// RSI is the value
  932. +// RDX is length
  933. + vmovd %esi, %xmm0
  934. + vpbroadcastb %xmm0, %ymm0
  935. + mov %rdi, %rax
  936. + cmp $0x40, %rdx
  937. + jae LABEL(above_64)
  938. +
  939. +LABEL(below_64):
  940. + cmp $0x20, %rdx
  941. + jb LABEL(below_32)
  942. + vmovdqu %ymm0, (%rdi)
  943. + vmovdqu %ymm0, -0x20(%rdi,%rdx)
  944. + vzeroupper
  945. + retq
  946. +
  947. +.align 32
  948. +LABEL(below_32):
  949. + cmp $0x10, %rdx
  950. + jae LABEL(in_16_to_32)
  951. +
  952. +LABEL(below_16):
  953. + cmp $0x4, %rdx
  954. + jbe LABEL(below_4)
  955. +
  956. +LABEL(in_4_to_16):
  957. + // Scalar stores from this point.
  958. + vmovq %xmm0, %rsi
  959. + cmp $0x7, %rdx
  960. + jbe LABEL(in_4_to_8)
  961. + // Two 8-wide stores, up to 16 bytes.
  962. + mov %rsi, -0x8(%rdi, %rdx)
  963. + mov %rsi, (%rdi)
  964. + vzeroupper
  965. + retq
  966. +
  967. +.align 32
  968. +LABEL(below_4):
  969. + vmovq %xmm0, %rsi
  970. + vzeroupper
  971. + cmp $0x1, %rdx
  972. + jbe LABEL(none_or_one)
  973. + mov %si, (%rdi)
  974. + mov %si, -0x2(%rdi, %rdx)
  975. +
  976. +LABEL(exit):
  977. + retq
  978. +
  979. +.align 16
  980. +LABEL(in_4_to_8):
  981. + // Two 4-wide stores, up to 8 bytes.
  982. + mov %esi, -0x4(%rdi,%rdx)
  983. + mov %esi, (%rdi)
  984. + vzeroupper
  985. + retq
  986. +
  987. +.align 32
  988. +LABEL(in_16_to_32):
  989. + vmovups %xmm0, (%rdi)
  990. + vmovups %xmm0, -0x10(%rdi,%rdx)
  991. + vzeroupper
  992. + retq
  993. +
  994. +LABEL(above_64):
  995. + cmp $0xb0, %rdx
  996. + ja LABEL(above_192)
  997. + cmp $0x80, %rdx
  998. + jbe LABEL(in_64_to_128)
  999. + // Do some work with unaligned 32-byte stores.
  1000. + // last_word -> rsi
  1001. + lea -0x20(%rdi,%rdx), %rsi
  1002. + // rcx -> fill pointer.
  1003. + // We have at least 128 bytes to store.
  1004. + vmovdqu %ymm0, (%rdi)
  1005. + vmovdqu %ymm0, 0x20(%rdi)
  1006. + vmovdqu %ymm0, 0x40(%rdi)
  1007. + add $0x60, %rdi
  1008. +
  1009. +.align 32
  1010. +LABEL(fill_32):
  1011. + vmovdqu %ymm0, (%rdi)
  1012. + add $0x20, %rdi
  1013. + cmp %rdi, %rsi
  1014. + ja LABEL(fill_32)
  1015. + // Stamp the last unaligned store.
  1016. + vmovdqu %ymm0, (%rsi)
  1017. + vzeroupper
  1018. + retq
  1019. +
  1020. +.align 32
  1021. +LABEL(in_64_to_128):
  1022. + // Last_word -> rsi
  1023. + vmovdqu %ymm0, (%rdi)
  1024. + vmovdqu %ymm0, 0x20(%rdi)
  1025. + vmovdqu %ymm0, -0x40(%rdi,%rdx)
  1026. + vmovdqu %ymm0, -0x20(%rdi,%rdx)
  1027. + vzeroupper
  1028. + retq
  1029. +
  1030. +.align 32
  1031. +LABEL(above_192):
  1032. +// rdi is the buffer address
  1033. +// rsi is the value
  1034. +// rdx is length
  1035. + cmp $0x1000, %rdx
  1036. + jae LABEL(large_stosq)
  1037. + // Store the first unaligned 32 bytes.
  1038. + vmovdqu %ymm0, (%rdi)
  1039. + // The first aligned word is stored in %rsi.
  1040. + mov %rdi, %rsi
  1041. + mov %rdi, %rax
  1042. + and $0xffffffffffffffe0, %rsi
  1043. + lea 0x20(%rsi), %rsi
  1044. + // Compute the address of the last unaligned word into rdi.
  1045. + lea -0x20(%rdx), %rdx
  1046. + add %rdx, %rdi
  1047. + // Check if we can do a full 5x32B stamp.
  1048. + lea 0xa0(%rsi), %rcx
  1049. + cmp %rcx, %rdi
  1050. + jb LABEL(stamp_4)
  1051. +
  1052. +LABEL(fill_192):
  1053. + vmovdqa %ymm0, (%rsi)
  1054. + vmovdqa %ymm0, 0x20(%rsi)
  1055. + vmovdqa %ymm0, 0x40(%rsi)
  1056. + vmovdqa %ymm0, 0x60(%rsi)
  1057. + vmovdqa %ymm0, 0x80(%rsi)
  1058. + add $0xa0, %rsi
  1059. + lea 0xa0(%rsi), %rcx
  1060. + cmp %rcx, %rdi
  1061. + ja LABEL(fill_192)
  1062. +
  1063. +LABEL(fill_192_tail):
  1064. + cmp %rsi, %rdi
  1065. + jb LABEL(fill_192_done)
  1066. + vmovdqa %ymm0, (%rsi)
  1067. +
  1068. + lea 0x20(%rsi), %rcx
  1069. + cmp %rcx, %rdi
  1070. + jb LABEL(fill_192_done)
  1071. + vmovdqa %ymm0, 0x20(%rsi)
  1072. +
  1073. + lea 0x40(%rsi), %rcx
  1074. + cmp %rcx, %rdi
  1075. + jb LABEL(fill_192_done)
  1076. + vmovdqa %ymm0, 0x40(%rsi)
  1077. +
  1078. + lea 0x60(%rsi), %rcx
  1079. + cmp %rcx, %rdi
  1080. + jb LABEL(fill_192_done)
  1081. + vmovdqa %ymm0, 0x60(%rsi)
  1082. +
  1083. +LABEL(last_wide_store):
  1084. + lea 0x80(%rsi), %rcx
  1085. + cmp %rcx, %rdi
  1086. + jb LABEL(fill_192_done)
  1087. + vmovdqa %ymm0, 0x80(%rsi)
  1088. +
  1089. +.align 16
  1090. +LABEL(fill_192_done):
  1091. + // Stamp the last word.
  1092. + vmovdqu %ymm0, (%rdi)
  1093. + vzeroupper
  1094. + // FIXME return buffer address
  1095. + ret
  1096. +
  1097. +LABEL(stamp_4):
  1098. + vmovdqa %ymm0, (%rsi)
  1099. + vmovdqa %ymm0, 0x20(%rsi)
  1100. + vmovdqa %ymm0, 0x40(%rsi)
  1101. + vmovdqa %ymm0, 0x60(%rsi)
  1102. + jmp LABEL(last_wide_store)
  1103. +
  1104. +LABEL(large_stosq):
  1105. +// rdi is the buffer address
  1106. +// rsi is the value
  1107. +// rdx is length
  1108. + vmovd %xmm0, %rax
  1109. + mov %rax, (%rdi)
  1110. + mov %rdi, %rsi
  1111. + // Align rdi to 8B
  1112. + and $0xfffffffffffffff8, %rdi
  1113. + lea 0x8(%rdi), %rdi
  1114. + // Fill buffer using stosq
  1115. + mov %rdx, %rcx
  1116. + sub $0x8, %rcx
  1117. + shrq $0x3, %rcx
  1118. + // rcx - number of QWORD elements
  1119. + // rax - value
  1120. + // rdi - buffer pointer
  1121. + rep stosq
  1122. + // Fill last 16 bytes
  1123. + vmovdqu %xmm0, -0x10(%rsi, %rdx)
  1124. + vzeroupper
  1125. + mov %rsi, %rax
  1126. + ret
  1127. +
  1128. +.align 16
  1129. +LABEL(none_or_one):
  1130. + test %rdx, %rdx
  1131. + je LABEL(exit)
  1132. + // Store one and exit
  1133. + mov %sil, (%rdi)
  1134. + ret
  1135. +
  1136. + .cfi_endproc
  1137. + .size __folly_memset, .-__folly_memset
  1138. +
  1139. + .global memset
  1140. + memset = __folly_memset
  1141. +
  1142. +#else
  1143. +// original musl implementation
  1144. +
  1145. +.global memset
  1146. +.type memset,@function
  1147. +memset:
  1148. + movzbq %sil,%rax
  1149. + mov $0x101010101010101,%r8
  1150. + imul %r8,%rax
  1151. +
  1152. + cmp $126,%rdx
  1153. + ja 2f
  1154. +
  1155. + test %edx,%edx
  1156. + jz 1f
  1157. +
  1158. + mov %sil,(%rdi)
  1159. + mov %sil,-1(%rdi,%rdx)
  1160. + cmp $2,%edx
  1161. + jbe 1f
  1162. +
  1163. + mov %ax,1(%rdi)
  1164. + mov %ax,(-1-2)(%rdi,%rdx)
  1165. + cmp $6,%edx
  1166. + jbe 1f
  1167. +
  1168. + mov %eax,(1+2)(%rdi)
  1169. + mov %eax,(-1-2-4)(%rdi,%rdx)
  1170. + cmp $14,%edx
  1171. + jbe 1f
  1172. +
  1173. + mov %rax,(1+2+4)(%rdi)
  1174. + mov %rax,(-1-2-4-8)(%rdi,%rdx)
  1175. + cmp $30,%edx
  1176. + jbe 1f
  1177. +
  1178. + mov %rax,(1+2+4+8)(%rdi)
  1179. + mov %rax,(1+2+4+8+8)(%rdi)
  1180. + mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
  1181. + mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
  1182. + cmp $62,%edx
  1183. + jbe 1f
  1184. +
  1185. + mov %rax,(1+2+4+8+16)(%rdi)
  1186. + mov %rax,(1+2+4+8+16+8)(%rdi)
  1187. + mov %rax,(1+2+4+8+16+16)(%rdi)
  1188. + mov %rax,(1+2+4+8+16+24)(%rdi)
  1189. + mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
  1190. + mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
  1191. + mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
  1192. + mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
  1193. +
  1194. +1: mov %rdi,%rax
  1195. + ret
  1196. +
  1197. +2: test $15,%edi
  1198. + mov %rdi,%r8
  1199. + mov %rax,-8(%rdi,%rdx)
  1200. + mov %rdx,%rcx
  1201. + jnz 2f
  1202. +
  1203. +1: shr $3,%rcx
  1204. + rep
  1205. + stosq
  1206. + mov %r8,%rax
  1207. + ret
  1208. +
  1209. +2: xor %edx,%edx
  1210. + sub %edi,%edx
  1211. + and $15,%edx
  1212. + mov %rax,(%rdi)
  1213. + mov %rax,8(%rdi)
  1214. + sub %rdx,%rcx
  1215. + add %rdx,%rdi
  1216. + jmp 1b
  1217. +
  1218. +#endif // __AVX2__
  1219. \ No newline at end of file
  1220. diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s
  1221. deleted file mode 100644
  1222. index 2d3f5e52..00000000
  1223. --- a/src/string/x86_64/memset.s
  1224. +++ /dev/null
  1225. @@ -1,72 +0,0 @@
  1226. -.global memset
  1227. -.type memset,@function
  1228. -memset:
  1229. - movzbq %sil,%rax
  1230. - mov $0x101010101010101,%r8
  1231. - imul %r8,%rax
  1232. -
  1233. - cmp $126,%rdx
  1234. - ja 2f
  1235. -
  1236. - test %edx,%edx
  1237. - jz 1f
  1238. -
  1239. - mov %sil,(%rdi)
  1240. - mov %sil,-1(%rdi,%rdx)
  1241. - cmp $2,%edx
  1242. - jbe 1f
  1243. -
  1244. - mov %ax,1(%rdi)
  1245. - mov %ax,(-1-2)(%rdi,%rdx)
  1246. - cmp $6,%edx
  1247. - jbe 1f
  1248. -
  1249. - mov %eax,(1+2)(%rdi)
  1250. - mov %eax,(-1-2-4)(%rdi,%rdx)
  1251. - cmp $14,%edx
  1252. - jbe 1f
  1253. -
  1254. - mov %rax,(1+2+4)(%rdi)
  1255. - mov %rax,(-1-2-4-8)(%rdi,%rdx)
  1256. - cmp $30,%edx
  1257. - jbe 1f
  1258. -
  1259. - mov %rax,(1+2+4+8)(%rdi)
  1260. - mov %rax,(1+2+4+8+8)(%rdi)
  1261. - mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
  1262. - mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
  1263. - cmp $62,%edx
  1264. - jbe 1f
  1265. -
  1266. - mov %rax,(1+2+4+8+16)(%rdi)
  1267. - mov %rax,(1+2+4+8+16+8)(%rdi)
  1268. - mov %rax,(1+2+4+8+16+16)(%rdi)
  1269. - mov %rax,(1+2+4+8+16+24)(%rdi)
  1270. - mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
  1271. - mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
  1272. - mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
  1273. - mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
  1274. -
  1275. -1: mov %rdi,%rax
  1276. - ret
  1277. -
  1278. -2: test $15,%edi
  1279. - mov %rdi,%r8
  1280. - mov %rax,-8(%rdi,%rdx)
  1281. - mov %rdx,%rcx
  1282. - jnz 2f
  1283. -
  1284. -1: shr $3,%rcx
  1285. - rep
  1286. - stosq
  1287. - mov %r8,%rax
  1288. - ret
  1289. -
  1290. -2: xor %edx,%edx
  1291. - sub %edi,%edx
  1292. - and $15,%edx
  1293. - mov %rax,(%rdi)
  1294. - mov %rax,8(%rdi)
  1295. - sub %rdx,%rcx
  1296. - add %rdx,%rdi
  1297. - jmp 1b
  1298. --
  1299. 2.35.2
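
Appendix (not part of the patch): a minimal C sanity harness, sketched under the assumption of a hosted C environment, that exercises the size classes (the 16/32/64/128/256-byte boundaries) and the memmove-style overlap behaviour described in the comments above. It only compares results against a byte-wise reference, so it can be built and run unchanged before and after applying the patch; the buffer size, the fill() helper and the chosen test sizes are illustrative, not taken from the patch.

/* test_mem.c: sanity checks for memcpy, memmove (overlapping) and memset. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define BUF 4096

static unsigned char src[BUF], dst[BUF], ref[BUF];

/* Fill a buffer with a deterministic, seed-dependent byte pattern. */
static void fill(unsigned char *p, size_t n, unsigned seed)
{
    for (size_t i = 0; i < n; i++)
        p[i] = (unsigned char)(seed + 31 * i);
}

int main(void)
{
    /* Sizes straddling the boundaries called out in the patch comments. */
    static const size_t sizes[] = { 0, 1, 3, 7, 8, 15, 16, 31, 32, 33,
                                    64, 96, 128, 129, 192, 256, 257, 1024 };

    for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
        size_t n = sizes[i];

        /* Plain memcpy between distinct buffers; return value is dst. */
        fill(src, BUF, 1);
        fill(dst, BUF, 2);
        assert(memcpy(dst, src, n) == dst);
        assert(memcmp(dst, src, n) == 0);

        /* Overlapping memmove, forward direction (dst > src). */
        fill(src, BUF, 3);
        memcpy(ref, src, BUF);
        memmove(src + 8, src, n);
        assert(memcmp(src + 8, ref, n) == 0);

        /* Overlapping memmove, backward direction (dst < src). */
        fill(src, BUF, 4);
        memcpy(ref, src, BUF);
        memmove(src, src + 8, n);
        assert(memcmp(src, ref + 8, n) == 0);

        /* memset: fill pattern and return value. */
        fill(dst, BUF, 5);
        assert(memset(dst, 0xA5, n) == dst);
        for (size_t j = 0; j < n; j++)
            assert(dst[j] == 0xA5);
    }

    puts("memcpy/memmove/memset sanity checks passed");
    return 0;
}

Build with something like "cc -O2 test_mem.c -o test_mem" and run it against the target libc; all checks should pass both with the original musl routines and with the optimized ones added by this patch.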