/*
 * Itanium 2-optimized version of memcpy and copy_user function
 *
 * Inputs:
 *	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 *	for memcpy:    return dest
 *	for copy_user: return 0 if success,
 *		       or number of bytes NOT copied if error occurred.
 *
 * Copyright (C) 2002 Intel Corp.
 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
 */
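/*
 * Rough C-level view of the two entry points below (an illustrative
 * sketch only, not part of the build):
 *
 *	void *memcpy(void *dest, const void *src, size_t n);
 *		// always returns dest
 *	unsigned long __copy_user(void *dest, const void *src,
 *				  unsigned long n);
 *		// returns 0 on success, or the number of bytes
 *		// NOT copied if a fault occurred
 */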
#include <asm/asmmacro.h>
#include <asm/page.h>

#define EK(y...) EX(y)

/* McKinley specific optimization */

#define retval		r8
#define saved_pfs	r31
#define saved_lc	r10
#define saved_pr	r11
#define saved_in0	r14
#define saved_in1	r15
#define saved_in2	r16

#define src0		r2
#define src1		r3
#define dst0		r17
#define dst1		r18
#define cnt		r9

/* r19-r30 are temp for each code section */
#define PREFETCH_DIST	8
#define src_pre_mem	r19
#define dst_pre_mem	r20
#define src_pre_l2	r21
#define dst_pre_l2	r22
#define t1		r23
#define t2		r24
#define t3		r25
#define t4		r26
#define t5		t1	// alias!
#define t6		t2	// alias!
#define t7		t3	// alias!
#define n8		r27
#define t9		t5	// alias!
#define t10		t4	// alias!
#define t11		t7	// alias!
#define t12		t6	// alias!
#define t14		t10	// alias!
#define t13		r28
#define t15		r29
#define tmp		r30

/* defines for long_copy block */
#define A	0
#define B	(PREFETCH_DIST)
#define C	(B + PREFETCH_DIST)
#define D	(C + 1)
#define N	(D + 1)
#define Nrot	((N + 7) & ~7)
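/*
 * The A/B/C/D/N values above number the stages of the software-pipelined
 * .line_copy loop further down: an instruction predicated on p[X] runs X
 * iterations behind the head of the pipeline. A minimal C model of that
 * staggering (illustrative sketch only; prefetch_src, prefetch_dst,
 * prefetch_l2, and copy_line are hypothetical stand-ins for the bundles
 * in the real loop):
 *
 *	for (long i = 0; i < iters + N - 1; i++) {
 *		if (i - A >= 0 && i - A < iters) prefetch_src(i - A);
 *		if (i - B >= 0 && i - B < iters) prefetch_dst(i - B);
 *		if (i - C >= 0 && i - C < iters) prefetch_l2(i - C);
 *		if (i - D >= 0 && i - D < iters) copy_line(i - D);
 *	}
 */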
/* alias */
#define in0		r32
#define in1		r33
#define in2		r34
GLOBAL_ENTRY(memcpy)
	and	r28=0x7,in0
	and	r29=0x7,in1
	mov	f6=f0
	mov	retval=in0
	br.cond.sptk .common_code
	;;
END(memcpy)

GLOBAL_ENTRY(__copy_user)
	.prologue
// check dest alignment
	and	r28=0x7,in0
	and	r29=0x7,in1
	mov	f6=f1
	mov	saved_in0=in0	// save dest pointer
	mov	saved_in1=in1	// save src pointer
	mov	retval=r0	// initialize return value
	;;
.common_code:
	cmp.gt	p15,p0=8,in2	// check for small size
	cmp.ne	p13,p0=0,r28	// check dest alignment
	cmp.ne	p14,p0=0,r29	// check src alignment
	add	src0=0,in1
	sub	r30=8,r28	// for .align_dest
	mov	saved_in2=in2	// save len
	;;
	add	dst0=0,in0
	add	dst1=1,in0	// dest odd index
	cmp.le	p6,p0 = 1,r30	// for .align_dest
(p15)	br.cond.dpnt .memcpy_short
(p13)	br.cond.dpnt .align_dest
(p14)	br.cond.dpnt .unaligned_src
	;;
// both dest and src are aligned on 8-byte boundary
.aligned_src:
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
	.save pr, saved_pr
	mov	saved_pr=pr
	shr.u	cnt=in2,7	// number of cache lines
	;;
	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt
	cmp.lt	p7,p8=1,cnt
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.body
	add	cnt=-1,cnt
	add	src_pre_mem=0,in1	// prefetch src pointer
	add	dst_pre_mem=0,in0	// prefetch dest pointer
	;;
(p7)	mov	ar.lc=cnt	// prefetch count
(p8)	mov	ar.lc=r0
(p6)	br.cond.dpnt .long_copy
	;;

.prefetch:
	lfetch.fault	  [src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few .prefetch
	;;
.medium_copy:
	and	tmp=31,in2	// copy length after iteration
	shr.u	r29=in2,5	// number of 32-byte iterations
	add	dst1=8,dst0	// 2nd dest pointer
	;;
	add	cnt=-1,r29	// ctop iteration adjustment
	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
	add	src1=8,src0	// 2nd src pointer
	cmp.le	p6,p0=8,tmp
	;;
	cmp.le	p7,p0=16,tmp
	mov	ar.lc=cnt	// loop setup
	cmp.eq	p16,p17 = r0,r0
	mov	ar.ec=2
(p10)	br.dpnt.few .aligned_src_tail
	;;
	TEXT_ALIGN(32)
1:
EX(.ex_handler, (p16)	ld8	r34=[src0],16)
EK(.ex_handler, (p16)	ld8	r38=[src1],16)
EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
	;;
EX(.ex_handler, (p16)	ld8	r32=[src0],16)
EK(.ex_handler, (p16)	ld8	r36=[src1],16)
EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
	br.ctop.dptk.few 1b
	;;
.aligned_src_tail:
EX(.ex_handler, (p6)	ld8	t1=[src0])
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
	cmp.le	p8,p0=24,tmp
	and	r21=-8,tmp
	;;
EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store 1st 8-byte word
	and	in2=7,tmp	// remaining length
EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store 2nd 8-byte word
	add	src0=src0,r21	// setting up src pointer
	add	dst0=dst0,r21	// setting up dest pointer
	;;
EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store 3rd 8-byte word
	mov	pr=saved_pr,-1
	br.dptk.many .memcpy_short
	;;
/* code taken from copy_page_mck */
.long_copy:
	.rotr v[2*PREFETCH_DIST]
	.rotp p[N]

	mov	src_pre_mem = src0
	mov	pr.rot = 0x10000
	mov	ar.ec = 1		// special unrolled loop

	mov	dst_pre_mem = dst0

	add	src_pre_l2 = 8*8, src0
	add	dst_pre_l2 = 8*8, dst0
	;;
	add	src0 = 8, src_pre_mem		// first t1 src
	mov	ar.lc = 2*PREFETCH_DIST - 1
	shr.u	cnt=in2,7			// number of lines
	add	src1 = 3*8, src_pre_mem		// first t3 src
	add	dst0 = 8, dst_pre_mem		// first t1 dst
	add	dst1 = 3*8, dst_pre_mem		// first t3 dst
	;;
	and	tmp=127,in2			// remaining bytes after this block
	add	cnt = -(2*PREFETCH_DIST) - 1, cnt
	// same as .line_copy loop, but with all predicated-off instructions removed:
.prefetch_loop:
EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)	// M0
EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)	// M2
	br.ctop.sptk .prefetch_loop
	;;
	cmp.eq	p16, p0 = r0, r0	// reset p16 to 1
	mov	ar.lc = cnt
	mov	ar.ec = N		// # of stages in pipeline
	;;
.line_copy:
EX(.ex_handler,	(p[D])	ld8	t2 = [src0], 3*8)		// M0
EK(.ex_handler,	(p[D])	ld8	t4 = [src1], 3*8)		// M1
EX(.ex_handler_lcpy, (p[B])	st8 [dst_pre_mem] = v[B], 128)	// M2 prefetch dst from memory
EK(.ex_handler_lcpy, (p[D])	st8 [dst_pre_l2] = n8, 128)	// M3 prefetch dst from L2
	;;
EX(.ex_handler_lcpy, (p[A])	ld8 v[A] = [src_pre_mem], 128)	// M0 prefetch src from memory
EK(.ex_handler_lcpy, (p[C])	ld8 n8 = [src_pre_l2], 128)	// M1 prefetch src from L2
EX(.ex_handler,	(p[D])	st8	[dst0] = t1, 8)			// M2
EK(.ex_handler,	(p[D])	st8	[dst1] = t3, 8)			// M3
	;;
EX(.ex_handler,	(p[D])	ld8	t5 = [src0], 8)
EK(.ex_handler,	(p[D])	ld8	t7 = [src1], 3*8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t2, 3*8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t4, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8	t6 = [src0], 3*8)
EK(.ex_handler,	(p[D])	ld8	t10 = [src1], 8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t5, 8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t7, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8	t9 = [src0], 3*8)
EK(.ex_handler,	(p[D])	ld8	t11 = [src1], 3*8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t6, 3*8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t10, 8)
	;;
EX(.ex_handler,	(p[D])	ld8	t12 = [src0], 8)
EK(.ex_handler,	(p[D])	ld8	t14 = [src1], 8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t9, 3*8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t11, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8	t13 = [src0], 4*8)
EK(.ex_handler,	(p[D])	ld8	t15 = [src1], 4*8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t12, 8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t14, 8)
	;;
EX(.ex_handler,	(p[C])	ld8	t1 = [src0], 8)
EK(.ex_handler,	(p[C])	ld8	t3 = [src1], 8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t13, 4*8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t15, 4*8)
	br.ctop.sptk .line_copy
	;;
	add	dst0=-8,dst0
	add	src0=-8,src0
	mov	in2=tmp
	.restore sp
	br.sptk.many .medium_copy
	;;
#define BLOCK_SIZE	128*32
#define blocksize	r23
#define curlen		r24

// dest is on 8-byte boundary, src is not. We need to do
// ld8-ld8, shrp, then st8. Max 8 bytes copied per cycle.
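// What the ld8-ld8/shrp/st8 sequence computes, as a C sketch (illustrative
// only; 'align' is the source misalignment, 1..7 here, and IA-64 is
// little-endian, so the merged destination word is assembled from two
// adjacent aligned source words):
//
//	uint64_t *s  = (uint64_t *)(src & ~7UL);	// align src down
//	uint64_t lo  = s[0], hi = s[1];
//	uint64_t out = (lo >> (align * 8)) | (hi << (64 - align * 8));
//	*(uint64_t *)dst = out;		// at most one 8-byte store per cycle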
.unaligned_src:
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,5,0,8
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.save pr, saved_pr
	mov	saved_pr=pr
	.body
.4k_block:
	mov	saved_in0=dst0	// need to save all input arguments
	mov	saved_in2=in2
	mov	blocksize=BLOCK_SIZE
	;;
	cmp.lt	p6,p7=blocksize,in2
	mov	saved_in1=src0
	;;
(p6)	mov	in2=blocksize
	;;
	shr.u	r21=in2,7	// number of cache lines
	shr.u	r22=in2,4	// number of 16-byte iterations
	and	curlen=15,in2	// copy length after iteration
	and	r30=7,src0	// source alignment
	;;
	cmp.lt	p7,p8=1,r21
	add	cnt=-1,r21
	;;
	add	src_pre_mem=0,src0	// prefetch src pointer
	add	dst_pre_mem=0,dst0	// prefetch dest pointer
	and	src0=-8,src0		// 1st src pointer
(p7)	mov	ar.lc = cnt
(p8)	mov	ar.lc = r0
	;;
	TEXT_ALIGN(32)
1:	lfetch.fault	  [src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few 1b
	;;

	shladd	dst1=r22,3,dst0	// 2nd dest pointer
	shladd	src1=r22,3,src0	// 2nd src pointer
	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
	cmp.le	p6,p7=8,curlen;	// have at least 8 bytes remaining?
	add	cnt=-1,r22	// ctop iteration adjustment
	;;
EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
EK(.ex_handler, (p9)	ld8	r37=[src1],8)
(p8)	br.dpnt.few .noloop
	;;
// The jump address is calculated based on src alignment. The COPYU
// macro below needs to confine its size to a power of two, so an entry
// address can be calculated using shl instead of an expensive multiply.
// The size is then hard coded by the following #define to match the
// actual size. This makes it somewhat tedious: when the COPYU macro
// gets changed, this needs to be adjusted to match.
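// C sketch of the computation done below (illustrative only): each
// COPYU() expansion is 1 << LOOP_SIZE = 64 bytes, and the base address
// is biased back by one entry so that src alignment 1 lands on COPYU(8):
//
//	char *base  = (char *)jump_table - (1 << LOOP_SIZE);
//	char *entry = base + (align << LOOP_SIZE);	// align is 1..7
//	goto *entry;		// mov b6=...; br.cond.sptk.few b6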
#define LOOP_SIZE 6
1:
	mov	r29=ip		// jmp_table thread
	mov	ar.lc=cnt
	;;
	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
	shl	r28=r30, LOOP_SIZE	// jmp_table thread
	mov	ar.ec=2		// loop setup
	;;
	add	r29=r29,r28	// jmp_table thread
	cmp.eq	p16,p17=r0,r0
	;;
	mov	b6=r29		// jmp_table thread
	;;
	br.cond.sptk.few b6
// for the 8-15 byte case
// We will skip the loop, but need to replicate the side effect
// that the loop produces.
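// That side effect, as a C sketch (illustrative only; the loop normally
// leaves r21 holding the next 8 merged bytes with src1 advanced past
// them, and 'align' is 1..7 here):
//
//	uint64_t lo  = *(uint64_t *)src1;		// ld8 r37=[src1],8
//	uint64_t hi  = *((uint64_t *)src1 + 1);		// ld8 r27=[src1]
//	uint64_t r21 = (lo >> (align * 8)) | (hi << (64 - align * 8));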
.noloop:
EX(.ex_handler, (p6)	ld8	r37=[src1],8)
	add	src0=8,src0
(p6)	shl	r25=r30,3
	;;
EX(.ex_handler, (p6)	ld8	r27=[src1])
(p6)	shr.u	r28=r37,r25
(p6)	sub	r26=64,r25
	;;
(p6)	shl	r27=r27,r26
	;;
(p6)	or	r21=r28,r27
.unaligned_src_tail:
/* check if we have more than blocksize to copy, if so go back */
	cmp.gt	p8,p0=saved_in2,blocksize
	;;
(p8)	add	dst0=saved_in0,blocksize
(p8)	add	src0=saved_in1,blocksize
(p8)	sub	in2=saved_in2,blocksize
(p8)	br.dpnt .4k_block
	;;

/* we have up to 15 bytes to copy in the tail.
 * part of the work is already done in the jump table code;
 * we are in the following state.
 * src side:
 *
 *   xxxxxx xx                   <----- r21 has xxxxxxxx already
 * -------- -------- --------
 * 0        8        16
 *          ^
 *          |
 *          src1
 *
 * dst
 * -------- -------- --------
 * ^
 * |
 * dst1
 */
EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 bytes to copy
(p6)	add	curlen=-8,curlen	// update length
	mov	ar.pfs=saved_pfs
	;;
	mov	ar.lc=saved_lc
	mov	pr=saved_pr,-1
	mov	in2=curlen	// remaining length
	mov	dst0=dst1	// dest pointer
	add	src0=src1,r30	// forward by src alignment
	;;
// 7 bytes or fewer.
.memcpy_short:
	cmp.le	p8,p9   = 1,in2
	cmp.le	p10,p11 = 2,in2
	cmp.le	p12,p13 = 3,in2
	cmp.le	p14,p15 = 4,in2
	add	src1=1,src0	// second src pointer
	add	dst1=1,dst0	// second dest pointer
	;;
EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
(p9)	br.ret.dpnt rp		// 0 byte copy
	;;
EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
(p11)	br.ret.dpnt rp		// 1 byte copy

EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
(p13)	br.ret.dpnt rp		// 2 byte copy
	;;
	cmp.le	p6,p7   = 5,in2
	cmp.le	p8,p9   = 6,in2
	cmp.le	p10,p11 = 7,in2

EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
(p15)	br.ret.dpnt rp		// 3 byte copy
	;;
EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
(p7)	br.ret.dpnt rp		// 4 byte copy
	;;
EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
(p9)	br.ret.dptk rp		// 5 byte copy

EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
(p11)	br.ret.dptk rp		// 6 byte copy
	;;
EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
	br.ret.dptk rp		// done all cases
/* Align dest to nearest 8-byte boundary. We know we have at
 * least 7 bytes to copy, enough to crawl to an 8-byte boundary.
 * The actual number of bytes to crawl depends on the dest alignment.
 * 7 bytes or fewer are taken care of at .memcpy_short.
 * src0 - source even index
 * src1 - source odd index
 * dst0 - dest even index
 * dst1 - dest odd index
 * r30  - distance to 8-byte boundary
 */
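/* In C terms, the crawl below amounts to (illustrative sketch only):
 *
 *	unsigned char *d = dest;
 *	const unsigned char *s = src;
 *	size_t crawl = 8 - ((uintptr_t)d & 7);	// this is r30, 1..7 here
 *	while (crawl--)
 *		*d++ = *s++;
 *	// then re-dispatch to .aligned_src or .unaligned_src
 */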
.align_dest:
	add	src1=1,in1	// source odd index
	cmp.le	p7,p0 = 2,r30	// for .align_dest
	cmp.le	p8,p0 = 3,r30	// for .align_dest
EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
	cmp.le	p9,p0 = 4,r30	// for .align_dest
	cmp.le	p10,p0 = 5,r30
	;;
EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
	cmp.le	p11,p0 = 6,r30
EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
	cmp.le	p12,p0 = 7,r30
	;;
EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
	;;
EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
	cmp.eq	p6,p7=r28,r29
EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
	sub	in2=in2,r30
	;;
EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
	add	dst0=in0,r30	// setup arguments
	add	src0=in1,r30
(p6)	br.cond.dptk .aligned_src
(p7)	br.cond.dpnt .unaligned_src
	;;
/* main loop body in jump table format */
#define COPYU(shift)								\
1:										\
EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */			\
EK(.ex_handler,  (p16)	ld8	r36=[src1],8);					\
		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */			\
EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */ \
		 nop.m	0;							\
		 (p16)	shrp	r38=r36,r37,shift;				\
EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */			\
EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);					\
		 br.ctop.dptk.few 1b;;						\
		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */	\
		 shrp	r21=r22,r38,shift;	/* speculative work */		\
		 br.sptk.few .unaligned_src_tail /* branch out of jump table */ \
		 ;;

	TEXT_ALIGN(32)
.jump_table:
	COPYU(8)	// unaligned cases
.jmp1:
	COPYU(16)
	COPYU(24)
	COPYU(32)
	COPYU(40)
	COPYU(48)
	COPYU(56)

#undef A
#undef B
#undef C
#undef D
/*
 * Due to lack of local tag support in the gcc 2.x assembler, it is not clear
 * which instruction failed in the bundle. The exception algorithm is that
 * we first figure out the faulting address, then detect whether any
 * progress was made on the copy; if so, redo the copy from the last known
 * copied location up to the faulting address (exclusive). In the
 * copy_from_user case, the remaining bytes in the kernel buffer will be
 * zeroed.
 *
 * Take copy_from_user as an example: in the code there are multiple loads
 * in a bundle, and those loads could span two pages, so the faulting
 * address is calculated as page_round_down(max(src0, src1)). This is
 * based on the knowledge that if we can access one byte in a page, we
 * can access any byte in that page.
 *
 * predicates used in the exception handler:
 * p6-p7:   direction
 * p10-p11: src faulting addr calculation
 * p12-p13: dst faulting addr calculation
 */
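/*
 * C sketch of that faulting-address calculation (illustrative only;
 * PAGE_SIZE and PAGE_SHIFT come from <asm/page.h>):
 *
 *	uintptr_t high  = (src0 > src1) ? src0 : src1;
 *	uintptr_t fault = high & ~((uintptr_t)PAGE_SIZE - 1);
 *	// if one byte of a page was accessible, the whole page was,
 *	// so everything strictly below 'fault' is known-good
 */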
#define A	r19
#define B	r20
#define C	r21
#define D	r22
#define F	r28

#define memset_arg0	r32
#define memset_arg2	r33

#define saved_retval	loc0
#define saved_rtlink	loc1
#define saved_pfs_stack	loc2

.ex_hndlr_s:
	add	src0=8,src0
	br.sptk .ex_handler
	;;

.ex_hndlr_d:
	add	dst0=8,dst0
	br.sptk .ex_handler
	;;

.ex_hndlr_lcpy_1:
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem
	cmp.gtu	p10,p11=src_pre_mem,saved_in1
	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
	;;
(p10)	add	src0=8,saved_in1
(p11)	mov	src0=saved_in1
(p12)	add	dst0=8,saved_in0
(p13)	mov	dst0=saved_in0
	br.sptk .ex_handler
.ex_handler_lcpy:
	// in the line_copy block, the preload addresses should always be
	// ahead of the other two src/dst pointers. Furthermore, src1/dst1
	// should always be ahead of src0/dst0.
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem
.ex_handler:
	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
	;;
.ex_handler_short: // fault occurred in sections that didn't change pr, lc, pfs
	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
	cmp.ltu	p10,p11=src0,src1
	cmp.ltu	p12,p13=dst0,dst1
	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
	mov	tmp = dst0
	;;
(p11)	mov	src1 = src0		// pick the larger of the two
(p13)	mov	dst0 = dst1		// make dst0 the smaller one
(p13)	mov	dst1 = tmp		// and dst1 the larger one
	;;
(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // user dst rounded down to page boundary
(p7)	dep	F = r0,src1,0,PAGE_SHIFT // user src rounded down to page boundary
	;;
(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
	mov	retval=saved_in2
(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
(p8)	st1	[dst1]=r0		// force an oops for memcpy call
(p14)	br.ret.sptk.many rp
/*
 * The number of remaining bytes to copy is calculated as:
 *
 * A =	(faulting_addr - orig_src)	-> len to faulting ld address
 *	or
 *	(faulting_addr - orig_dst)	-> len to faulting st address
 * B =	(cur_dst - orig_dst)		-> len copied so far
 * C =	A - B				-> len need to be copied
 * D =	orig_len - A			-> len need to be zeroed
 */
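/*
 * Worked example with hypothetical numbers: orig_len = 100, the faulting
 * address rounds down to 64 bytes past the original pointer, and 48 bytes
 * had already been stored:
 *
 *	A = 64			// known-accessible length
 *	B = 48			// copied so far
 *	C = A - B = 16		// redone by the recursive __copy_user call
 *	D = orig_len - A = 36	// zeroed by memset in the copy_from_user case
 */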
(p6)	sub	A = F, saved_in0
(p7)	sub	A = F, saved_in1
	clrrrb
	;;
	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
	cmp.lt	p8,p0=A,r0
	sub	B = dst0, saved_in0	// how many bytes copied so far
	;;
(p8)	mov	A = 0;			// A shouldn't be negative, cap it
	;;
	sub	C = A, B
	sub	D = saved_in2, A
	;;
	cmp.gt	p8,p0=C,r0		// any bytes left to copy?
	add	memset_arg0=saved_in0, A
(p6)	mov	memset_arg2=0		// copy_to_user should not call memset
(p7)	mov	memset_arg2=D		// copy_from_user needs the kbuf zeroed
	mov	r8=0
	mov	saved_retval = D
	mov	saved_rtlink = b0

	add	out0=saved_in0, B
	add	out1=saved_in1, B
	mov	out2=C
(p8)	br.call.sptk.few b0=__copy_user	// recursive call
	;;

	add	saved_retval=saved_retval,r8	// above might return non-zero value
	cmp.gt	p8,p0=memset_arg2,r0		// any bytes left to zero?
	mov	out0=memset_arg0		// *s
	mov	out1=r0				// c
	mov	out2=memset_arg2		// n
(p8)	br.call.sptk.few b0=memset
	;;

	mov	retval=saved_retval
	mov	ar.pfs=saved_pfs_stack
	mov	b0=saved_rtlink
	br.ret.sptk.many rp

/* end of McKinley specific optimization */
END(__copy_user)