/*
 * arch/alpha/lib/ev6-memcpy.S
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Reasonably optimized memcpy() routine for the Alpha 21264
 *
 *      - memory accessed as aligned quadwords only
 *      - uses cmpbge to compare 8 bytes in parallel
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *      Compiler Writer's Guide for the Alpha 21264
 *      abbreviated as 'CWG' in other comments here
 *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *      E       - either cluster
 *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * Temp usage notes:
 *      $1,$2,          - scratch
 */
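
/*
 * Overview, as a rough C-like sketch of the code below (dst = $16,
 * src = $17, count = $18; label names match those used below):
 *
 *      if ((dst ^ src) & 7)                    goto misaligned;
 *      while (count > 0 && (dst & 7) != 0)     copy 1 byte;   // $head_align
 *      if (count > 127) {
 *              while (dst & 63)                copy 8 bytes;  // $single_head_quad
 *              if (count > 127)
 *                      do { wh64 hint; copy 64 bytes; } while (count >= 64);
 *      }
 *      while (count >= 8)                      copy 8 bytes;  // $move_a_quad
 *      while (count > 0)                       copy 1 byte;   // $tail_bytes
 *
 * misaligned: byte-copy until dst is 0mod8, then merge unaligned source
 * quadwords with ldq_u/extql/extqh, then byte-copy the tail.
 */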
#include <asm/export.h>
        .set noreorder
        .set noat

        .align 4
        .globl memcpy
        .ent memcpy
memcpy:
        .frame $30,0,$26,0
        .prologue 0

        mov     $16, $0                 # E : copy dest to return
        ble     $18, $nomoredata        # U : done with the copy?
        xor     $16, $17, $1            # E : are source and dest alignments the same?
        and     $1, 7, $1               # E : are they the same mod 8?

        bne     $1, $misaligned         # U : Nope - gotta do this the slow way
        /* source and dest are same mod 8 address */
        and     $16, 7, $1              # E : Are both 0mod8?
        beq     $1, $both_0mod8         # U : Yes
        nop                             # E :
        /*
         * source and dest are same misalignment.  move a byte at a time
         * until a 0mod8 alignment for both is reached.
         * At least one byte more to move
         */
$head_align:
        ldbu    $1, 0($17)              # L : grab a byte
        subq    $18, 1, $18             # E : count--
        addq    $17, 1, $17             # E : src++
        stb     $1, 0($16)              # L :
        addq    $16, 1, $16             # E : dest++
        and     $16, 7, $1              # E : Are we at 0mod8 yet?
        ble     $18, $nomoredata        # U : done with the copy?
        bne     $1, $head_align         # U :

$both_0mod8:
        cmple   $18, 127, $1            # E : Can we unroll the loop?
        bne     $1, $no_unroll          # U :
        and     $16, 63, $1             # E : get mod64 alignment
        beq     $1, $do_unroll          # U : no single quads to fiddle
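
        /*
         * Single quads below step dest up to a 64-byte boundary, presumably
         * so that each wh64 hint in the unrolled loop covers exactly the
         * aligned 64-byte block that is about to be overwritten.
         */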
$single_head_quad:
        ldq     $1, 0($17)              # L : get 8 bytes
        subq    $18, 8, $18             # E : count -= 8
        addq    $17, 8, $17             # E : src += 8
        nop                             # E :

        stq     $1, 0($16)              # L : store
        addq    $16, 8, $16             # E : dest += 8
        and     $16, 63, $1             # E : get mod64 alignment
        bne     $1, $single_head_quad   # U : still not fully aligned

$do_unroll:
        addq    $16, 64, $7             # E : Initial (+1 trip) wh64 address
        cmple   $18, 127, $1            # E : Can we go through the unrolled loop?
        bne     $1, $tail_quads         # U : Nope
        nop                             # E :
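
        /*
         * The unrolled loop moves 64 bytes per trip in two 32-byte halves.
         * wh64 tells the memory system that an aligned 64-byte block is
         * about to be completely overwritten, so its old contents need not
         * be fetched.  $7, the address hinted at the top of each trip,
         * normally runs one block ahead of the block being written; the
         * subq/cmovlt pair checks whether at least two more full trips
         * remain, and if not, $7 falls back to the next block instead of
         * advancing, so wh64 is never issued for a block the loop will not
         * completely overwrite.  Roughly, as a C-like sketch (instruction
         * names used as functions for illustration):
         *
         *      wh64(hint);                     // block the loop will write
         *      hint += 64;                     // advance one block
         *      if ((long)(count - 192) < 0)    // fewer than two more trips?
         *              hint = blk + 64;        // don't run ahead of the data
         */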
$unroll_body:
        wh64    ($7)                    # L1 : memory subsystem hint: 64 bytes at
                                        # ($7) are about to be over-written
        ldq     $6, 0($17)              # L0 : bytes 0..7
        nop                             # E :
        nop                             # E :

        ldq     $4, 8($17)              # L : bytes 8..15
        ldq     $5, 16($17)             # L : bytes 16..23
        addq    $7, 64, $7              # E : Update next wh64 address
        nop                             # E :

        ldq     $3, 24($17)             # L : bytes 24..31
        addq    $16, 64, $1             # E : fallback value for wh64
        nop                             # E :
        nop                             # E :

        addq    $17, 32, $17            # E : src += 32 bytes
        stq     $6, 0($16)              # L : bytes 0..7
        nop                             # E :
        nop                             # E :

        stq     $4, 8($16)              # L : bytes 8..15
        stq     $5, 16($16)             # L : bytes 16..23
        subq    $18, 192, $2            # E : At least two more trips to go?
        nop                             # E :

        stq     $3, 24($16)             # L : bytes 24..31
        addq    $16, 32, $16            # E : dest += 32 bytes
        nop                             # E :
        nop                             # E :

        ldq     $6, 0($17)              # L : bytes 0..7
        ldq     $4, 8($17)              # L : bytes 8..15
        cmovlt  $2, $1, $7              # E : Latency 2, extra map slot - Use
                                        # fallback wh64 address if < 2 more trips
        nop                             # E :

        ldq     $5, 16($17)             # L : bytes 16..23
        ldq     $3, 24($17)             # L : bytes 24..31
        addq    $16, 32, $16            # E : dest += 32
        subq    $18, 64, $18            # E : count -= 64

        addq    $17, 32, $17            # E : src += 32
        stq     $6, -32($16)            # L : bytes 0..7
        stq     $4, -24($16)            # L : bytes 8..15
        cmple   $18, 63, $1             # E : At least one more trip?

        stq     $5, -16($16)            # L : bytes 16..23
        stq     $3, -8($16)             # L : bytes 24..31
        nop                             # E :
        beq     $1, $unroll_body
$tail_quads:
$no_unroll:
        .align 4
        subq    $18, 8, $18             # E : At least a quad left?
        blt     $18, $less_than_8       # U : Nope
        nop                             # E :
        nop                             # E :

$move_a_quad:
        ldq     $1, 0($17)              # L : fetch 8
        subq    $18, 8, $18             # E : count -= 8
        addq    $17, 8, $17             # E : src += 8
        nop                             # E :

        stq     $1, 0($16)              # L : store 8
        addq    $16, 8, $16             # E : dest += 8
        bge     $18, $move_a_quad       # U :
        nop                             # E :

$less_than_8:
        .align 4
        addq    $18, 8, $18             # E : add back for trailing bytes
        ble     $18, $nomoredata        # U : All-done
        nop                             # E :
        nop                             # E :

        /* Trailing bytes */
$tail_bytes:
        subq    $18, 1, $18             # E : count--
        ldbu    $1, 0($17)              # L : fetch a byte
        addq    $17, 1, $17             # E : src++
        nop                             # E :

        stb     $1, 0($16)              # L : store a byte
        addq    $16, 1, $16             # E : dest++
        bgt     $18, $tail_bytes        # U : more to be done?
        nop                             # E :

        /* branching to exit takes 3 extra cycles, so replicate exit here */
        ret     $31, ($26), 1           # L0 :
        nop                             # E :
        nop                             # E :
        nop                             # E :
$misaligned:
        mov     $0, $4                  # E : dest temp
        and     $0, 7, $1               # E : dest alignment mod8
        beq     $1, $dest_0mod8         # U : life doesn't totally suck
        nop
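
        /* Copy single bytes until dest reaches 0mod8; src stays arbitrary. */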
$aligndest:
        ble     $18, $nomoredata        # U :
        ldbu    $1, 0($17)              # L : fetch a byte
        subq    $18, 1, $18             # E : count--
        addq    $17, 1, $17             # E : src++

        stb     $1, 0($4)               # L : store it
        addq    $4, 1, $4               # E : dest++
        and     $4, 7, $1               # E : dest 0mod8 yet?
        bne     $1, $aligndest          # U : go until we are aligned.

/* Source has unknown alignment, but dest is known to be 0mod8 */
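/*
 * Each trip of $mis_quad builds one aligned destination quadword from two
 * overlapping unaligned source quadwords: ldq_u fetches the aligned
 * quadword containing the address, extql/extqh shift the low and high
 * pieces into place according to the source's byte offset, and bis merges
 * them.  The previous iteration's high quadword is reused ("rotated") as
 * the next iteration's low quadword, so only one new load is needed per
 * trip.  Roughly, as a C-like sketch (instruction names used as functions
 * for illustration):
 *
 *      lo = ldq_u(src);                       // seed, done once
 *      while (count >= 8) {
 *              hi   = ldq_u(src + 8);
 *              word = extql(lo, src) | extqh(hi, src);
 *              stq(word, dst);
 *              lo = hi;  src += 8;  dst += 8;  count -= 8;
 *      }
 */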
$dest_0mod8:
        subq    $18, 8, $18             # E : At least a quad left?
        blt     $18, $misalign_tail     # U : Nope
        ldq_u   $3, 0($17)              # L : seed (rotating load) of 8 bytes
        nop                             # E :

$mis_quad:
        ldq_u   $16, 8($17)             # L : Fetch next 8
        extql   $3, $17, $3             # U : masking
        extqh   $16, $17, $1            # U : masking
        bis     $3, $1, $1              # E : merged bytes to store

        subq    $18, 8, $18             # E : count -= 8
        addq    $17, 8, $17             # E : src += 8
        stq     $1, 0($4)               # L : store 8 (aligned)
        mov     $16, $3                 # E : "rotate" source data

        addq    $4, 8, $4               # E : dest += 8
        bge     $18, $mis_quad          # U : More quads to move
        nop
        nop

$misalign_tail:
        addq    $18, 8, $18             # E : account for tail stuff
        ble     $18, $nomoredata        # U :
        nop
        nop

$misalign_byte:
        ldbu    $1, 0($17)              # L : fetch 1
        subq    $18, 1, $18             # E : count--
        addq    $17, 1, $17             # E : src++
        nop                             # E :

        stb     $1, 0($4)               # L : store
        addq    $4, 1, $4               # E : dest++
        bgt     $18, $misalign_byte     # U : more to go?
        nop

$nomoredata:
        ret     $31, ($26), 1           # L0 :
        nop                             # E :
        nop                             # E :
        nop                             # E :

        .end memcpy
        EXPORT_SYMBOL(memcpy)

/* For backwards module compatibility. */
__memcpy = memcpy
        .globl __memcpy