123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186 |
- ! Copyright (C) 2008-2012 Imagination Technologies Ltd.
- .text
- .global _memcpy
- .type _memcpy,function
- ! D1Ar1 dst (in): destination pointer
- ! D0Ar2 src (in): source pointer
- ! D1Ar3 cnt (in): number of bytes to copy
- ! D0Re0 dst (out): the original destination pointer is returned
- _memcpy: ! copy cnt bytes from src to dst; D1Ar1/D0Ar2 are reused as scratch in the unaligned loops, so the pointers are moved to A0.2/A1.2/A0.3 first
- CMP D1Ar3, #16 ! compare cnt with 16; the result is consumed by the BGE below
- MOV A1.2, D0Ar2 ! A1.2 = running source pointer
- MOV A0.2, D1Ar1 ! A0.2 = running destination pointer
- MOV A0.3, D1Ar1 ! A0.3 = original dst, kept for the return value
- ! If there are fewer than 16 bytes to copy, fall through to the byte copy loop
- BGE $Llong_copy ! NOTE(review): GE looks like a signed test, so a cnt with the top bit set would take the byte path - confirm
- $Lbyte_copy:
- ! Copy the remaining D1Ar3 bytes one at a time (also used for the tails of the block loops below)
- SUBS TXRPT, D1Ar3, #1 ! TXRPT = cnt - 1, the repeat count used by the BR below; also sets the flags
- BLT $Lend ! cnt - 1 < 0, i.e. nothing left to copy
- $Lloop_byte:
- GETB D1Re0, [A1.2++] ! load one byte, post-increment the source pointer
- SETB [A0.2++], D1Re0 ! store it, post-increment the destination pointer
- BR $Lloop_byte ! repeat while the TXRPT count lasts
- $Lend:
- ! Finally set return value and return
- MOV D0Re0, A0.3 ! D0Re0 = original dst
- MOV PC, D1RtP ! return by loading PC from D1RtP
- $Llong_copy:
- ANDS D1Ar5, D1Ar1, #7 ! D1Ar5 = dst & 7 (test destination alignment)
- BZ $Laligned_dst ! destination is already 8 byte aligned
- ! The destination address is not 8 byte aligned. We will copy bytes from
- ! the source to the destination until the remaining data has an 8 byte
- ! destination address alignment (i.e. we should never copy more than 7
- ! bytes here).
- $Lalign_dst:
- GETB D0Re0, [A1.2++] ! load one byte, post-increment the source pointer
- ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8
- SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes
- SETB [A0.2++], D0Re0 ! store the byte, post-increment the destination pointer
- CMP D1Ar5, #8
- BNE $Lalign_dst ! keep going until (dst & 7) has been counted up to 8
- ! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
- ! blocks, then jump to the unaligned copy loop or fall through to the aligned
- ! copy loop as appropriate.
- $Laligned_dst:
- MOV D0Ar4, A1.2 ! D0Ar4 = current source pointer
- LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks (used by the unaligned loops)
- ANDS D0Ar4, D0Ar4, #7 ! D0Ar4 = src & 7 (test source alignment)
- BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop
- ! Both source and destination are 8 byte aligned - the easy case.
- $Laligned_copy:
- LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks; also sets the flags
- BZ $Lbyte_copy ! fewer than 32 bytes left: the byte loop copies all of them
- SUB TXRPT, D1Ar5, #1 ! repeat count for the 32 byte loop = blocks - 1
- $Laligned_32:
- GETL D0Re0, D1Re0, [A1.2++] ! each GETL/SETL moves 8 bytes through a D0/D1 register pair
- GETL D0Ar6, D1Ar5, [A1.2++] ! second 8 bytes, loaded before the first store is issued
- SETL [A0.2++], D0Re0, D1Re0 ! bytes 0-7 of this 32 byte block
- SETL [A0.2++], D0Ar6, D1Ar5 ! bytes 8-15
- GETL D0Re0, D1Re0, [A1.2++]
- GETL D0Ar6, D1Ar5, [A1.2++]
- SETL [A0.2++], D0Re0, D1Re0 ! bytes 16-23
- SETL [A0.2++], D0Ar6, D1Ar5 ! bytes 24-31
- BR $Laligned_32 ! repeat while the TXRPT count lasts
- ! If there are any remaining bytes use the byte copy loop, otherwise we are done
- ANDS D1Ar3, D1Ar3, #0x1f ! cnt = bytes left over after the 32 byte blocks
- BNZ $Lbyte_copy
- B $Lend
- ! The destination is 8 byte aligned but the source is not, and there are 8
- ! or more bytes to be copied.
- $Lunaligned_copy:
- ! Adjust the source pointer (A1.2) to the 8 byte boundary before its
- ! current value
- MOV D0Ar4, A1.2 ! D0Ar4 = source pointer, about to be rounded down
- MOV D0Ar6, A1.2 ! D0Ar6 = unmodified copy, used to compute the mis-alignment
- ANDMB D0Ar4, D0Ar4, #0xfff8 ! presumably ANDs with 0xfffffff8, i.e. clears the low 3 address bits - confirm ANDMB semantics
- MOV A1.2, D0Ar4 ! A1.2 = source rounded down to an 8 byte boundary
- ! Save the number of bytes of mis-alignment in D0Ar4 for use later
- SUBS D0Ar6, D0Ar6, D0Ar4 ! D0Ar6 = src - aligned src = mis-alignment in bytes; sets the flags for the BZ below
- MOV D0Ar4, D0Ar6 ! D0Ar4 = mis-alignment, needed again at $Lunaligned_end
- ! if there is no mis-alignment after all, use the aligned copy loop
- BZ $Laligned_copy ! NOTE(review): the only visible entry here is the BNZ taken when (src & 7) != 0, so this looks purely defensive
- ! prefetch the aligned 8 bytes that contain the first wanted source bytes
- GETL D0Re0, D1Re0, [A1.2] ! no increment here: the loops below pre-increment A1.2
- SUB TXRPT, D1Ar5, #1 ! repeat count = (number of 8 byte blocks from $Laligned_dst) - 1
- ! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly
- ! 4 bytes, and more than 4 bytes.
- CMP D0Ar6, #4
- BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop
- BZ $Lunaligned_4 ! use 4 byte mis-alignment loop
- ! The mis-alignment is more than 4 bytes
- $Lunaligned_5_6_7:
- SUB D0Ar6, D0Ar6, #4 ! D0Ar6 = mis-alignment within the second word (1-3 bytes)
- ! Calculate the bit offsets required for the shift operations necessary
- ! to align the data.
- ! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
- MULW D0Ar6, D0Ar6, #8 ! bytes -> bits
- MOV D1Ar5, #32
- SUB D1Ar5, D1Ar5, D0Ar6 ! D1Ar5 = 32 - bit offset
- ! Move data 4 bytes before we enter the main loop
- MOV D0Re0, D1Re0 ! only the second prefetched word holds wanted bytes; carry it in D0Re0
- $Lloop_5_6_7:
- GETL D0Ar2, D1Ar1, [++A1.2] ! pre-increment, then load the next aligned 8 bytes (argument registers reused as scratch)
- ! form 64-bit data in D0Re0, D1Re0
- LSR D0Re0, D0Re0, D0Ar6 ! carried word >> offset
- MOV D1Re0, D0Ar2
- LSL D1Re0, D1Re0, D1Ar5 ! new first word << (32 - offset)
- ADD D0Re0, D0Re0, D1Re0 ! first output word; the two parts share no bits, so ADD merges them
- LSR D0Ar2, D0Ar2, D0Ar6 ! new first word >> offset
- LSL D1Re0, D1Ar1, D1Ar5 ! new second word << (32 - offset)
- ADD D1Re0, D1Re0, D0Ar2 ! second output word
- SETL [A0.2++], D0Re0, D1Re0 ! store 8 realigned bytes
- MOV D0Re0, D1Ar1 ! carry the new second word into the next iteration
- BR $Lloop_5_6_7 ! repeat while the TXRPT count lasts
- B $Lunaligned_end
- $Lunaligned_1_2_3:
- ! Calculate the bit offsets required for the shift operations necessary
- ! to align the data.
- ! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
- MULW D0Ar6, D0Ar6, #8 ! bytes -> bits
- MOV D1Ar5, #32
- SUB D1Ar5, D1Ar5, D0Ar6 ! D1Ar5 = 32 - bit offset
- $Lloop_1_2_3:
- ! form 64-bit data in D0Re0,D1Re0 from the carried pair and the next 8 bytes
- LSR D0Re0, D0Re0, D0Ar6 ! carried first word >> offset
- LSL D1Ar1, D1Re0, D1Ar5 ! carried second word << (32 - offset)
- ADD D0Re0, D0Re0, D1Ar1 ! first output word
- MOV D0Ar2, D1Re0
- LSR D0FrT, D0Ar2, D0Ar6 ! D0FrT (scratch) = carried second word >> offset
- GETL D0Ar2, D1Ar1, [++A1.2] ! pre-increment, then load the next aligned 8 bytes
- MOV D1Re0, D0Ar2
- LSL D1Re0, D1Re0, D1Ar5 ! new first word << (32 - offset)
- ADD D1Re0, D1Re0, D0FrT ! second output word
- SETL [A0.2++], D0Re0, D1Re0 ! store 8 realigned bytes
- MOV D0Re0, D0Ar2 ! the newly loaded pair becomes the carried pair
- MOV D1Re0, D1Ar1
- BR $Lloop_1_2_3 ! repeat while the TXRPT count lasts
- B $Lunaligned_end
- ! The 4 byte mis-alignment case - this does not require any shifting, just a
- ! shuffling of registers.
- $Lunaligned_4:
- MOV D0Re0, D1Re0 ! the second prefetched word becomes the first output word
- $Lloop_4:
- GETL D0Ar2, D1Ar1, [++A1.2] ! pre-increment, then load the next aligned 8 bytes
- MOV D1Re0, D0Ar2 ! new first word becomes the second output word
- SETL [A0.2++], D0Re0, D1Re0 ! store 8 realigned bytes
- MOV D0Re0, D1Ar1 ! carry the new second word into the next iteration
- BR $Lloop_4 ! repeat while the TXRPT count lasts, then fall through
- $Lunaligned_end:
- ! If there are no remaining bytes to copy, we are done.
- ANDS D1Ar3, D1Ar3, #7 ! cnt = bytes left over after the 8 byte blocks
- BZ $Lend
- ! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
- ! address of the remaining bytes, and fall through to the byte copy loop.
- MOV D0Ar6, A1.2 ! A1.2 still points at the last aligned block that was loaded
- ADD D1Ar5, D0Ar4, D0Ar6 ! add back the mis-alignment saved in D0Ar4
- MOV A1.2, D1Ar5
- B $Lbyte_copy
- .size _memcpy,.-_memcpy
|