/* (removed: web-scrape artifacts — file-size banner and a run of concatenated line numbers) */
  1. /* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
  2. *
  3. * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
  4. * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
  5. */
  6. #ifdef __KERNEL__
  7. #include <asm/visasm.h>
  8. #include <asm/asi.h>
  9. #define GLOBAL_SPARE g7
  10. #else
  11. #define GLOBAL_SPARE g5
  12. #define ASI_BLK_P 0xf0
  13. #define FPRS_FEF 0x04
  14. #ifdef MEMCPY_DEBUG
  15. #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
  16. clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
  17. #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  18. #else
  19. #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
  20. #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
  21. #endif
  22. #endif
  23. #ifndef EX_LD
  24. #define EX_LD(x) x
  25. #endif
  26. #ifndef EX_ST
  27. #define EX_ST(x) x
  28. #endif
  29. #ifndef EX_RETVAL
  30. #define EX_RETVAL(x) x
  31. #endif
  32. #ifndef LOAD
  33. #define LOAD(type,addr,dest) type [addr], dest
  34. #endif
  35. #ifndef LOAD_BLK
  36. #define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest
  37. #endif
  38. #ifndef STORE
  39. #define STORE(type,src,addr) type src, [addr]
  40. #endif
  41. #ifndef STORE_BLK
  42. #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P
  43. #endif
  44. #ifndef FUNC_NAME
  45. #define FUNC_NAME memcpy
  46. #endif
  47. #ifndef PREAMBLE
  48. #define PREAMBLE
  49. #endif
  50. #ifndef XCC
  51. #define XCC xcc
  52. #endif
  53. #define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9) \
  54. faligndata %f1, %f2, %f48; \
  55. faligndata %f2, %f3, %f50; \
  56. faligndata %f3, %f4, %f52; \
  57. faligndata %f4, %f5, %f54; \
  58. faligndata %f5, %f6, %f56; \
  59. faligndata %f6, %f7, %f58; \
  60. faligndata %f7, %f8, %f60; \
  61. faligndata %f8, %f9, %f62;
  62. #define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt) \
  63. EX_LD(LOAD_BLK(%src, %fdest)); \
  64. EX_ST(STORE_BLK(%fsrc, %dest)); \
  65. add %src, 0x40, %src; \
  66. subcc %len, 0x40, %len; \
  67. be,pn %xcc, jmptgt; \
  68. add %dest, 0x40, %dest; \
  69. #define LOOP_CHUNK1(src, dest, len, branch_dest) \
  70. MAIN_LOOP_CHUNK(src, dest, f0, f48, len, branch_dest)
  71. #define LOOP_CHUNK2(src, dest, len, branch_dest) \
  72. MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest)
  73. #define LOOP_CHUNK3(src, dest, len, branch_dest) \
  74. MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
  75. #define DO_SYNC membar #Sync;
  76. #define STORE_SYNC(dest, fsrc) \
  77. EX_ST(STORE_BLK(%fsrc, %dest)); \
  78. add %dest, 0x40, %dest; \
  79. DO_SYNC
  80. #define STORE_JUMP(dest, fsrc, target) \
  81. EX_ST(STORE_BLK(%fsrc, %dest)); \
  82. add %dest, 0x40, %dest; \
  83. ba,pt %xcc, target; \
  84. nop;
  85. #define FINISH_VISCHUNK(dest, f0, f1, left) \
  86. subcc %left, 8, %left;\
  87. bl,pn %xcc, 95f; \
  88. faligndata %f0, %f1, %f48; \
  89. EX_ST(STORE(std, %f48, %dest)); \
  90. add %dest, 8, %dest;
  91. #define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \
  92. subcc %left, 8, %left; \
  93. bl,pn %xcc, 95f; \
  94. fsrc1 %f0, %f1;
  95. #define UNEVEN_VISCHUNK(dest, f0, f1, left) \
  96. UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \
  97. ba,a,pt %xcc, 93f;
  98. .register %g2,#scratch
  99. .register %g3,#scratch
  100. .text
  101. .align 64
  102. .globl FUNC_NAME
  103. .type FUNC_NAME,#function
  104. FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
  105. srlx %o2, 31, %g2
  106. cmp %g2, 0
  107. tne %xcc, 5
  108. PREAMBLE
  109. mov %o0, %o4
  110. cmp %o2, 0
  111. be,pn %XCC, 85f
  112. or %o0, %o1, %o3
  113. cmp %o2, 16
  114. blu,a,pn %XCC, 80f
  115. or %o3, %o2, %o3
  116. cmp %o2, (5 * 64)
  117. blu,pt %XCC, 70f
  118. andcc %o3, 0x7, %g0
  119. /* Clobbers o5/g1/g2/g3/g7/icc/xcc. */
  120. VISEntry
  121. /* Is 'dst' already aligned on an 64-byte boundary? */
  122. andcc %o0, 0x3f, %g2
  123. be,pt %XCC, 2f
  124. /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number
  125. * of bytes to copy to make 'dst' 64-byte aligned. We pre-
  126. * subtract this from 'len'.
  127. */
  128. sub %o0, %o1, %GLOBAL_SPARE
  129. sub %g2, 0x40, %g2
  130. sub %g0, %g2, %g2
  131. sub %o2, %g2, %o2
  132. andcc %g2, 0x7, %g1
  133. be,pt %icc, 2f
  134. and %g2, 0x38, %g2
  135. 1: subcc %g1, 0x1, %g1
  136. EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
  137. EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE))
  138. bgu,pt %XCC, 1b
  139. add %o1, 0x1, %o1
  140. add %o1, %GLOBAL_SPARE, %o0
  141. 2: cmp %g2, 0x0
  142. and %o1, 0x7, %g1
  143. be,pt %icc, 3f
  144. alignaddr %o1, %g0, %o1
  145. EX_LD(LOAD(ldd, %o1, %f4))
  146. 1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
  147. add %o1, 0x8, %o1
  148. subcc %g2, 0x8, %g2
  149. faligndata %f4, %f6, %f0
  150. EX_ST(STORE(std, %f0, %o0))
  151. be,pn %icc, 3f
  152. add %o0, 0x8, %o0
  153. EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
  154. add %o1, 0x8, %o1
  155. subcc %g2, 0x8, %g2
  156. faligndata %f6, %f4, %f0
  157. EX_ST(STORE(std, %f0, %o0))
  158. bne,pt %icc, 1b
  159. add %o0, 0x8, %o0
  160. /* Destination is 64-byte aligned. */
  161. 3:
  162. membar #LoadStore | #StoreStore | #StoreLoad
  163. subcc %o2, 0x40, %GLOBAL_SPARE
  164. add %o1, %g1, %g1
  165. andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
  166. srl %g1, 3, %g2
  167. sub %o2, %GLOBAL_SPARE, %g3
  168. andn %o1, (0x40 - 1), %o1
  169. and %g2, 7, %g2
  170. andncc %g3, 0x7, %g3
  171. fmovd %f0, %f2
  172. sub %g3, 0x8, %g3
  173. sub %o2, %GLOBAL_SPARE, %o2
  174. add %g1, %GLOBAL_SPARE, %g1
  175. subcc %o2, %g3, %o2
  176. EX_LD(LOAD_BLK(%o1, %f0))
  177. add %o1, 0x40, %o1
  178. add %g1, %g3, %g1
  179. EX_LD(LOAD_BLK(%o1, %f16))
  180. add %o1, 0x40, %o1
  181. sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
  182. EX_LD(LOAD_BLK(%o1, %f32))
  183. add %o1, 0x40, %o1
  184. /* There are 8 instances of the unrolled loop,
  185. * one for each possible alignment of the
  186. * source buffer. Each loop instance is 452
  187. * bytes.
  188. */
  189. sll %g2, 3, %o3
  190. sub %o3, %g2, %o3
  191. sllx %o3, 4, %o3
  192. add %o3, %g2, %o3
  193. sllx %o3, 2, %g2
  194. 1: rd %pc, %o3
  195. add %o3, %lo(1f - 1b), %o3
  196. jmpl %o3 + %g2, %g0
  197. nop
  198. .align 64
  199. 1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
  200. LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
  201. FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
  202. LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
  203. FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
  204. LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
  205. ba,pt %xcc, 1b+4
  206. faligndata %f0, %f2, %f48
  207. 1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
  208. STORE_SYNC(o0, f48)
  209. FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
  210. STORE_JUMP(o0, f48, 40f)
  211. 2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
  212. STORE_SYNC(o0, f48)
  213. FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
  214. STORE_JUMP(o0, f48, 48f)
  215. 3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
  216. STORE_SYNC(o0, f48)
  217. FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
  218. STORE_JUMP(o0, f48, 56f)
  219. 1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
  220. LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
  221. FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
  222. LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
  223. FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
  224. LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
  225. ba,pt %xcc, 1b+4
  226. faligndata %f2, %f4, %f48
  227. 1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
  228. STORE_SYNC(o0, f48)
  229. FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
  230. STORE_JUMP(o0, f48, 41f)
  231. 2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
  232. STORE_SYNC(o0, f48)
  233. FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
  234. STORE_JUMP(o0, f48, 49f)
  235. 3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
  236. STORE_SYNC(o0, f48)
  237. FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
  238. STORE_JUMP(o0, f48, 57f)
  239. 1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
  240. LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
  241. FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
  242. LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
  243. FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
  244. LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
  245. ba,pt %xcc, 1b+4
  246. faligndata %f4, %f6, %f48
  247. 1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
  248. STORE_SYNC(o0, f48)
  249. FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
  250. STORE_JUMP(o0, f48, 42f)
  251. 2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
  252. STORE_SYNC(o0, f48)
  253. FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
  254. STORE_JUMP(o0, f48, 50f)
  255. 3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
  256. STORE_SYNC(o0, f48)
  257. FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
  258. STORE_JUMP(o0, f48, 58f)
  259. 1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
  260. LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
  261. FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
  262. LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
  263. FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
  264. LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
  265. ba,pt %xcc, 1b+4
  266. faligndata %f6, %f8, %f48
  267. 1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
  268. STORE_SYNC(o0, f48)
  269. FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
  270. STORE_JUMP(o0, f48, 43f)
  271. 2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
  272. STORE_SYNC(o0, f48)
  273. FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
  274. STORE_JUMP(o0, f48, 51f)
  275. 3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
  276. STORE_SYNC(o0, f48)
  277. FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
  278. STORE_JUMP(o0, f48, 59f)
  279. 1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
  280. LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
  281. FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
  282. LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
  283. FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
  284. LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
  285. ba,pt %xcc, 1b+4
  286. faligndata %f8, %f10, %f48
  287. 1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
  288. STORE_SYNC(o0, f48)
  289. FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
  290. STORE_JUMP(o0, f48, 44f)
  291. 2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
  292. STORE_SYNC(o0, f48)
  293. FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
  294. STORE_JUMP(o0, f48, 52f)
  295. 3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
  296. STORE_SYNC(o0, f48)
  297. FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
  298. STORE_JUMP(o0, f48, 60f)
  299. 1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
  300. LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
  301. FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
  302. LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
  303. FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
  304. LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
  305. ba,pt %xcc, 1b+4
  306. faligndata %f10, %f12, %f48
  307. 1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
  308. STORE_SYNC(o0, f48)
  309. FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
  310. STORE_JUMP(o0, f48, 45f)
  311. 2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
  312. STORE_SYNC(o0, f48)
  313. FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
  314. STORE_JUMP(o0, f48, 53f)
  315. 3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
  316. STORE_SYNC(o0, f48)
  317. FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
  318. STORE_JUMP(o0, f48, 61f)
  319. 1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
  320. LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
  321. FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
  322. LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
  323. FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
  324. LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
  325. ba,pt %xcc, 1b+4
  326. faligndata %f12, %f14, %f48
  327. 1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
  328. STORE_SYNC(o0, f48)
  329. FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
  330. STORE_JUMP(o0, f48, 46f)
  331. 2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
  332. STORE_SYNC(o0, f48)
  333. FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
  334. STORE_JUMP(o0, f48, 54f)
  335. 3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
  336. STORE_SYNC(o0, f48)
  337. FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
  338. STORE_JUMP(o0, f48, 62f)
  339. 1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
  340. LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
  341. FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
  342. LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
  343. FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
  344. LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
  345. ba,pt %xcc, 1b+4
  346. faligndata %f14, %f16, %f48
  347. 1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
  348. STORE_SYNC(o0, f48)
  349. FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
  350. STORE_JUMP(o0, f48, 47f)
  351. 2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
  352. STORE_SYNC(o0, f48)
  353. FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
  354. STORE_JUMP(o0, f48, 55f)
  355. 3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
  356. STORE_SYNC(o0, f48)
  357. FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
  358. STORE_JUMP(o0, f48, 63f)
  359. 40: FINISH_VISCHUNK(o0, f0, f2, g3)
  360. 41: FINISH_VISCHUNK(o0, f2, f4, g3)
  361. 42: FINISH_VISCHUNK(o0, f4, f6, g3)
  362. 43: FINISH_VISCHUNK(o0, f6, f8, g3)
  363. 44: FINISH_VISCHUNK(o0, f8, f10, g3)
  364. 45: FINISH_VISCHUNK(o0, f10, f12, g3)
  365. 46: FINISH_VISCHUNK(o0, f12, f14, g3)
  366. 47: UNEVEN_VISCHUNK(o0, f14, f0, g3)
  367. 48: FINISH_VISCHUNK(o0, f16, f18, g3)
  368. 49: FINISH_VISCHUNK(o0, f18, f20, g3)
  369. 50: FINISH_VISCHUNK(o0, f20, f22, g3)
  370. 51: FINISH_VISCHUNK(o0, f22, f24, g3)
  371. 52: FINISH_VISCHUNK(o0, f24, f26, g3)
  372. 53: FINISH_VISCHUNK(o0, f26, f28, g3)
  373. 54: FINISH_VISCHUNK(o0, f28, f30, g3)
  374. 55: UNEVEN_VISCHUNK(o0, f30, f0, g3)
  375. 56: FINISH_VISCHUNK(o0, f32, f34, g3)
  376. 57: FINISH_VISCHUNK(o0, f34, f36, g3)
  377. 58: FINISH_VISCHUNK(o0, f36, f38, g3)
  378. 59: FINISH_VISCHUNK(o0, f38, f40, g3)
  379. 60: FINISH_VISCHUNK(o0, f40, f42, g3)
  380. 61: FINISH_VISCHUNK(o0, f42, f44, g3)
  381. 62: FINISH_VISCHUNK(o0, f44, f46, g3)
  382. 63: UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3)
  383. 93: EX_LD(LOAD(ldd, %o1, %f2))
  384. add %o1, 8, %o1
  385. subcc %g3, 8, %g3
  386. faligndata %f0, %f2, %f8
  387. EX_ST(STORE(std, %f8, %o0))
  388. bl,pn %xcc, 95f
  389. add %o0, 8, %o0
  390. EX_LD(LOAD(ldd, %o1, %f0))
  391. add %o1, 8, %o1
  392. subcc %g3, 8, %g3
  393. faligndata %f2, %f0, %f8
  394. EX_ST(STORE(std, %f8, %o0))
  395. bge,pt %xcc, 93b
  396. add %o0, 8, %o0
  397. 95: brz,pt %o2, 2f
  398. mov %g1, %o1
  399. 1: EX_LD(LOAD(ldub, %o1, %o3))
  400. add %o1, 1, %o1
  401. subcc %o2, 1, %o2
  402. EX_ST(STORE(stb, %o3, %o0))
  403. bne,pt %xcc, 1b
  404. add %o0, 1, %o0
  405. 2: membar #StoreLoad | #StoreStore
  406. VISExit
  407. retl
  408. mov EX_RETVAL(%o4), %o0
  409. .align 64
  410. 70: /* 16 < len <= (5 * 64) */
  411. bne,pn %XCC, 75f
  412. sub %o0, %o1, %o3
  413. 72: andn %o2, 0xf, %GLOBAL_SPARE
  414. and %o2, 0xf, %o2
  415. 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
  416. EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
  417. subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
  418. EX_ST(STORE(stx, %o5, %o1 + %o3))
  419. add %o1, 0x8, %o1
  420. EX_ST(STORE(stx, %g1, %o1 + %o3))
  421. bgu,pt %XCC, 1b
  422. add %o1, 0x8, %o1
  423. 73: andcc %o2, 0x8, %g0
  424. be,pt %XCC, 1f
  425. nop
  426. EX_LD(LOAD(ldx, %o1, %o5))
  427. sub %o2, 0x8, %o2
  428. EX_ST(STORE(stx, %o5, %o1 + %o3))
  429. add %o1, 0x8, %o1
  430. 1: andcc %o2, 0x4, %g0
  431. be,pt %XCC, 1f
  432. nop
  433. EX_LD(LOAD(lduw, %o1, %o5))
  434. sub %o2, 0x4, %o2
  435. EX_ST(STORE(stw, %o5, %o1 + %o3))
  436. add %o1, 0x4, %o1
  437. 1: cmp %o2, 0
  438. be,pt %XCC, 85f
  439. nop
  440. ba,pt %xcc, 90f
  441. nop
  442. 75: andcc %o0, 0x7, %g1
  443. sub %g1, 0x8, %g1
  444. be,pn %icc, 2f
  445. sub %g0, %g1, %g1
  446. sub %o2, %g1, %o2
  447. 1: EX_LD(LOAD(ldub, %o1, %o5))
  448. subcc %g1, 1, %g1
  449. EX_ST(STORE(stb, %o5, %o1 + %o3))
  450. bgu,pt %icc, 1b
  451. add %o1, 1, %o1
  452. 2: add %o1, %o3, %o0
  453. andcc %o1, 0x7, %g1
  454. bne,pt %icc, 8f
  455. sll %g1, 3, %g1
  456. cmp %o2, 16
  457. bgeu,pt %icc, 72b
  458. nop
  459. ba,a,pt %xcc, 73b
  460. 8: mov 64, %o3
  461. andn %o1, 0x7, %o1
  462. EX_LD(LOAD(ldx, %o1, %g2))
  463. sub %o3, %g1, %o3
  464. andn %o2, 0x7, %GLOBAL_SPARE
  465. sllx %g2, %g1, %g2
  466. 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
  467. subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
  468. add %o1, 0x8, %o1
  469. srlx %g3, %o3, %o5
  470. or %o5, %g2, %o5
  471. EX_ST(STORE(stx, %o5, %o0))
  472. add %o0, 0x8, %o0
  473. bgu,pt %icc, 1b
  474. sllx %g3, %g1, %g2
  475. srl %g1, 3, %g1
  476. andcc %o2, 0x7, %o2
  477. be,pn %icc, 85f
  478. add %o1, %g1, %o1
  479. ba,pt %xcc, 90f
  480. sub %o0, %o1, %o3
  481. .align 64
  482. 80: /* 0 < len <= 16 */
  483. andcc %o3, 0x3, %g0
  484. bne,pn %XCC, 90f
  485. sub %o0, %o1, %o3
  486. 1: EX_LD(LOAD(lduw, %o1, %g1))
  487. subcc %o2, 4, %o2
  488. EX_ST(STORE(stw, %g1, %o1 + %o3))
  489. bgu,pt %XCC, 1b
  490. add %o1, 4, %o1
  491. 85: retl
  492. mov EX_RETVAL(%o4), %o0
  493. .align 32
  494. 90: EX_LD(LOAD(ldub, %o1, %g1))
  495. subcc %o2, 1, %o2
  496. EX_ST(STORE(stb, %g1, %o1 + %o3))
  497. bgu,pt %XCC, 90b
  498. add %o1, 1, %o1
  499. retl
  500. mov EX_RETVAL(%o4), %o0
  501. .size FUNC_NAME, .-FUNC_NAME