- /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
- *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * and re-licensed under GPLv2 for the Linux kernel. The original code can
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
- #include <linux/linkage.h>
- #include <asm/assembler.h>
- #include <asm/cache.h>
- /*
- * Move a buffer from src to dest (alignment handled by the hardware).
- * If dest <= src, call memcpy, otherwise copy in reverse order.
- *
- * Parameters:
- * x0 - dest
- * x1 - src
- * x2 - n
- * Returns:
- * x0 - dest
- */
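
As a hedged illustration of the dispatch described in the comment above (plain C with hypothetical names, not the kernel implementation): a forward copy is safe whenever dest starts below src or outside the source range entirely, and only the remaining overlapping case needs the reverse-order copy implemented by this file.

#include <stddef.h>

/* Hypothetical C sketch of the dispatch above, not the kernel code. */
static void *memmove_sketch(void *dest, const void *src, size_t n)
{
    unsigned char *d = dest;
    const unsigned char *s = src;

    if (d < s || d >= s + n) {
        /* dest below src, or no overlap: a forward copy is safe */
        for (size_t i = 0; i < n; i++)
            d[i] = s[i];
    } else {
        /* dest inside [src, src + n): copy in reverse order */
        for (size_t i = n; i > 0; i--)
            d[i - 1] = s[i - 1];
    }
    return dest;
}
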
- dstin .req x0
- src .req x1
- count .req x2
- tmp1 .req x3
- tmp1w .req w3
- tmp2 .req x4
- tmp2w .req w4
- tmp3 .req x5
- tmp3w .req w5
- dst .req x6
- A_l .req x7
- A_h .req x8
- B_l .req x9
- B_h .req x10
- C_l .req x11
- C_h .req x12
- D_l .req x13
- D_h .req x14
- .weak memmove
- ENTRY(__memmove)
- ENTRY(memmove)
- cmp dstin, src
- b.lo __memcpy
- add tmp1, src, count
- cmp dstin, tmp1
- b.hs __memcpy /* No overlap. */
- add dst, dstin, count
- add src, src, count
- cmp count, #16
- b.lo .Ltail15 /* probably unaligned accesses. */
- ands tmp2, src, #15 /* Bytes to reach alignment. */
- b.eq .LSrcAligned
- sub count, count, tmp2
- /*
- * Process the unaligned offset first so that src becomes aligned.
- * The cost of these extra instructions is acceptable, and it ensures
- * that the subsequent accesses use aligned addresses.
- */
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src, #-1]!
- strb tmp1w, [dst, #-1]!
- 1:
- tbz tmp2, #1, 2f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
- 2:
- tbz tmp2, #2, 3f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
- 3:
- tbz tmp2, #3, .LSrcAligned
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
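
Read as C, the pre-step above peels 1, 2, 4 and 8 bytes off the top of the buffers, one bit of the misalignment at a time, until src sits on a 16-byte boundary. A hedged sketch under that assumption (the names are illustrative, and the fixed-size copies are delegated to libc memmove purely so that small overlaps stay well defined):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical sketch of the alignment pre-step: dst and src point one past
 * the end of the region, and the low four bits of src (tmp2 in the assembly)
 * give the number of bytes to peel off so that src becomes 16-byte aligned.
 * Bit 0 selects a 1-byte copy, bit 1 a 2-byte copy, and so on. */
static void align_src_down(unsigned char **dst, const unsigned char **src,
                           size_t *count)
{
    size_t mis = (uintptr_t)*src & 15;

    if (mis == 0)
        return;
    *count -= mis;
    for (size_t size = 1; size <= 8; size <<= 1) {
        if (mis & size) {
            *dst -= size;
            *src -= size;
            memmove(*dst, *src, size);
        }
    }
}
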
- .LSrcAligned:
- cmp count, #64
- b.ge .Lcpy_over64
- /*
- * Deal with small copies quickly by dropping straight into the
- * exit block.
- */
- .Ltail63:
- /*
- * Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate.
- */
- ands tmp1, count, #0x30
- b.eq .Ltail15
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-16]!
- stp A_l, A_h, [dst, #-16]!
- 1:
- ldp A_l, A_h, [src, #-16]!
- stp A_l, A_h, [dst, #-16]!
- 2:
- ldp A_l, A_h, [src, #-16]!
- stp A_l, A_h, [dst, #-16]!
- .Ltail15:
- tbz count, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
- 1:
- tbz count, #2, 2f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
- 2:
- tbz count, #1, 3f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
- 3:
- tbz count, #0, .Lexitfunc
- ldrb tmp1w, [src, #-1]
- strb tmp1w, [dst, #-1]
- .Lexitfunc:
- ret
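
The tail code above only looks at the low six bits of count: bits 5:4 select up to three 16-byte chunks, and bits 3:0 select at most one 8-, 4-, 2- and 1-byte copy, all working downwards. A hedged C sketch of that structure (illustrative names; libc memmove stands in for the individual chunk copies):

#include <stddef.h>
#include <string.h>

/* Hypothetical sketch of the tail handling: dst and src point one past the
 * end of the remaining region and fewer than 64 bytes are left. */
static void copy_tail63_down(unsigned char *dst, const unsigned char *src,
                             size_t count)
{
    size_t chunks = (count & 0x30) >> 4;    /* 0..3 16-byte chunks */

    while (chunks--) {
        dst -= 16;
        src -= 16;
        memmove(dst, src, 16);
    }
    for (size_t size = 8; size; size >>= 1) {
        if (count & size) {                 /* at most one 8/4/2/1-byte copy */
            dst -= size;
            src -= size;
            memmove(dst, src, size);
        }
    }
}
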
- .Lcpy_over64:
- subs count, count, #128
- b.ge .Lcpy_body_large
- /*
- * Less than 128 bytes to copy, so handle 64 bytes here and then jump
- * to the tail.
- */
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp D_l, D_h, [src, #-64]!
- stp D_l, D_h, [dst, #-64]!
- tst count, #0x3f
- b.ne .Ltail63
- ret
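
In C terms, the 64..127 byte path above copies a single 64-byte block from the top of the region and then falls into the tail code for the remaining count & 0x3f bytes. A hedged sketch (illustrative names; a local bounce buffer keeps the block copy well defined for overlaps smaller than 64 bytes, and libc memmove stands in for the tail):

#include <stddef.h>
#include <string.h>

/* Hypothetical sketch of the 64..127 byte path: dst and src point one past
 * the end of the region and 64 <= count < 128. */
static void copy_64_to_127_down(unsigned char *dst, const unsigned char *src,
                                size_t count)
{
    unsigned char block[64];    /* stands in for registers A..D */

    dst -= 64;
    src -= 64;
    memcpy(block, src, 64);     /* the four LDP pairs in the assembly */
    memcpy(dst, block, 64);     /* the four STP pairs in the assembly */

    count &= 0x3f;              /* whatever is left for the tail code */
    if (count)
        memmove(dst - count, src - count, count);
}
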
- /*
- * Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line, this ensures the entire loop fits in one line.
- */
- .p2align L1_CACHE_SHIFT
- .Lcpy_body_large:
- /* pre-load 64 bytes of data. */
- ldp A_l, A_h, [src, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- ldp D_l, D_h, [src, #-64]!
- 1:
- /*
- * Interleave the load of the next 64-byte block with the store of the
- * previously loaded 64 bytes of data.
- */
- stp A_l, A_h, [dst, #-16]
- ldp A_l, A_h, [src, #-16]
- stp B_l, B_h, [dst, #-32]
- ldp B_l, B_h, [src, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp C_l, C_h, [src, #-48]
- stp D_l, D_h, [dst, #-64]!
- ldp D_l, D_h, [src, #-64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #-16]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- stp D_l, D_h, [dst, #-64]!
- tst count, #0x3f
- b.ne .Ltail63
- ret
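
The loop above is software-pipelined: one 64-byte block is kept in registers while the store of the previous block is interleaved with the load of the next one, and the loop exits with one block still pending. A hedged C sketch of the same structure for count >= 128 (illustrative names; a local buffer stands in for registers A..D, and libc memmove handles the leftover tail):

#include <stddef.h>
#include <string.h>

/* Hypothetical sketch of the large-copy loop: dst and src point one past the
 * end of the region, count >= 128, and the copy runs downwards. */
static void copy_large_down(unsigned char *dst, const unsigned char *src,
                            size_t count)
{
    unsigned char block[64];                /* stands in for registers A..D */
    ptrdiff_t remaining = (ptrdiff_t)count - 128;   /* "subs count, count, #128" */

    src -= 64;
    memcpy(block, src, 64);                 /* pre-load 64 bytes of data */

    do {
        dst -= 64;
        memcpy(dst, block, 64);             /* store the previous block ... */
        src -= 64;
        memcpy(block, src, 64);             /* ... while loading the next one */
        remaining -= 64;
    } while (remaining >= 0);

    dst -= 64;
    memcpy(dst, block, 64);                 /* store the final block */

    count &= 0x3f;                          /* leftover bytes for the tail code */
    if (count)
        memmove(dst - count, src - count, count);
}
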
- ENDPIPROC(memmove)
- ENDPROC(__memmove)