copy_user.S 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611
  1. /*
  2. *
  3. * Optimized version of the copy_user() routine.
  4. * It is used to copy data across the kernel/user boundary.
  5. *
  6. * The source and destination are always on opposite sides of
  7. * the boundary. When reading from user space we must catch
  8. * faults on loads. When writing to user space we must catch
  9. * errors on stores. Note that because of the nature of the copy
  10. * we don't need to worry about overlapping regions.
  11. *
  12. *
  13. * Inputs:
  14. * in0 address of destination buffer
  15. * in1 address of source buffer
  16. * in2 number of bytes to copy
  17. *
  18. * Outputs:
  19. * ret0 0 in case of success. The number of bytes NOT copied in
  20. * case of error.
  21. *
  22. * Copyright (C) 2000-2001 Hewlett-Packard Co
  23. * Stephane Eranian <eranian@hpl.hp.com>
  24. *
  25. * Fixme:
  26. * - handle the case where we have more than 16 bytes and the alignments
  27. * are different.
  28. * - more benchmarking
  29. * - fix extraneous stop bit introduced by the EX() macro.
  30. */
  31. #include <asm/asmmacro.h>
  32. //
  33. // Tuneable parameters
  34. //
  35. #define COPY_BREAK 16 // lengths <= COPY_BREAK use the byte-by-byte loop (must be >=16)
  36. #define PIPE_DEPTH 21 // software pipeline depth: written to ar.ec, sizes val1[]/val2[]/p[]
  37. #define EPI p[PIPE_DEPTH-1] // predicate of the last pipeline stage; guards the stores
  38. //
  39. // arguments
  40. //
  41. #define dst in0 // destination address
  42. #define src in1 // source address
  43. #define len in2 // number of bytes to copy
  44. //
  45. // local registers
  46. //
  47. #define t1 r2 // rshift in bytes
  48. #define t2 r3 // lshift in bytes
  49. #define rshift r14 // right shift in bits
  50. #define lshift r15 // left shift in bits
  51. #define word1 r16 // byte count of the head copy (different-alignment path)
  52. #define word2 r17 // not referenced in the code shown here
  53. #define cnt r18 // loop count scratch, written to ar.lc
  54. #define len2 r19 // len-1: trip count of the short byte loop
  55. #define saved_lc r20 // ar.lc at entry, restored on every exit path
  56. #define saved_pr r21 // predicates at entry, p16-p63 restored on every exit path
  57. #define tmp r22 // scratch: alignment test, loop count, shrp result
  58. #define val r23 // not referenced in the code shown here
  59. #define src1 r24 // running source pointer
  60. #define dst1 r25 // running destination pointer
  61. #define src2 r26 // src offset within 8 bytes, or second source stream (src1+8)
  62. #define dst2 r27 // dst offset within 8 bytes, or second destination stream (dst1+8)
  63. #define len1 r28 // working copy of the remaining length
  64. #define enddst r29 // dst+len
  65. #define endsrc r30 // src+len
  66. #define saved_pfs r31 // ar.pfs at entry (written by alloc)
  67. GLOBAL_ENTRY(__copy_user) // copy len bytes from src to dst; ret0 = 0 on success, else bytes NOT copied
  68. .prologue
  69. .save ar.pfs, saved_pfs
  70. alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7) // 3 inputs, no outputs; locals = 2*PIPE_DEPTH rounded up to a multiple of 8, all rotating
  71. .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
  72. .rotp p[PIPE_DEPTH]
  73. adds len2=-1,len // br.ctop is repeat/until
  74. mov ret0=r0 // assume success
  75. ;; // RAW of cfm when len=0
  76. cmp.eq p8,p0=r0,len // check for zero length
  77. .save ar.lc, saved_lc
  78. mov saved_lc=ar.lc // preserve ar.lc (slow)
  79. (p8) br.ret.spnt.many rp // empty memcpy()
  80. ;;
  81. add enddst=dst,len // first byte after end of destination
  82. add endsrc=src,len // first byte after end of source
  83. .save pr, saved_pr
  84. mov saved_pr=pr // preserve predicates
  85. .body
  86. mov dst1=dst // copy because of rotation
  87. mov ar.ec=PIPE_DEPTH
  88. mov pr.rot=1<<16 // p16=true all others are false
  89. mov src1=src // copy because of rotation
  90. mov ar.lc=len2 // initialize lc for small count
  91. cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
  92. xor tmp=src,dst // same alignment test prepare
  93. (p10) br.cond.dptk .long_copy_user
  94. ;; // RAW pr.rot/p16 ?
  95. //
  96. // Now we do the byte by byte loop with software pipeline
  97. //
  98. // NOTE(review): this comment used to say "p7 is necessarily false by now", but the cmp.lt above writes p7 as the complement of p10, so p7 looks true on this fall-through path - confirm
  99. 1:
  100. EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) // load stage; a fault here continues at .failure_in_pipe1
  101. EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) // store stage; a fault here continues at .failure_out
  102. br.ctop.dptk.few 1b
  103. ;;
  104. mov ar.lc=saved_lc
  105. mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63 only
  106. mov ar.pfs=saved_pfs // restore ar.ec
  107. br.ret.sptk.many rp // end of short memcpy
  108. //
  109. // Not 8-byte aligned: src and dst have different offsets within an 8-byte word
  110. //
  111. .diff_align_copy_user:
  112. // At this point we know we have more than 16 bytes to copy
  113. // and also that src and dest do _not_ have the same alignment.
  114. and src2=0x7,src1 // src offset
  115. and dst2=0x7,dst1 // dst offset
  116. ;;
  117. // The basic idea is that we copy byte-by-byte at the head so
  118. // that we can reach 8-byte alignment for both src1 and dst1.
  119. // Then copy the body using software pipelined 8-byte copy,
  120. // shifting the two back-to-back words right and left, then copy
  121. // the tail by copying byte-by-byte.
  122. //
  123. // Fault handling. If the byte-by-byte at the head fails on the
  124. // load, then restart and finish the pipeline by copying zeros
  125. // to the dst1. Then copy zeros for the rest of dst1.
  126. // If 8-byte software pipeline fails on the load, do the same as
  127. // failure_in3 does. If the byte-by-byte at the tail fails, it is
  128. // handled simply by failure_in_pipe1.
  129. //
  130. // The case p14 represents the source has more bytes in the
  131. // first word (by the shifted part), whereas the p15 needs to
  132. // copy some bytes from the 2nd word of the source that has the
  133. // tail of the 1st of the destination.
  134. //
  135. //
  136. // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
  137. // to copy the head to dst1, to start 8-byte copy software pipeline.
  138. // We know src1 is not 8-byte aligned in this case.
  139. //
  140. cmp.eq p14,p15=r0,dst2 // p14: dst1 already 8-byte aligned, p15: it is not
  141. (p15) br.cond.spnt 1f // unaligned dst1: go copy a head first
  142. ;;
  143. sub t1=8,src2 // bytes from src1 up to the next 8-byte boundary
  144. mov t2=src2 // .word_copy_user rewinds src1 by t2 when p14 is set
  145. ;;
  146. shl rshift=t2,3 // src offset in bits
  147. sub len1=len,t1 // set len1
  148. ;;
  149. sub lshift=64,rshift
  150. ;;
  151. br.cond.spnt .word_copy_user
  152. ;;
  153. 1:
  154. cmp.leu p14,p15=src2,dst2 // p14: src offset <= dst offset, p15: src offset > dst offset
  155. sub t1=dst2,src2 // dst offset - src offset
  156. ;;
  157. .pred.rel "mutex", p14, p15
  158. (p14) sub word1=8,src2 // (8 - src offset)
  159. (p15) sub t1=r0,t1 // absolute value
  160. (p15) sub word1=8,dst2 // (8 - dst offset)
  161. ;;
  162. // For the case p14, we don't need to copy the shifted part to
  163. // the 1st word of destination.
  164. sub t2=8,t1
  165. (p14) sub word1=word1,t1
  166. ;;
  167. sub len1=len,word1 // resulting len
  168. (p15) shl rshift=t1,3 // in bits
  169. (p14) shl rshift=t2,3
  170. ;;
  171. (p14) sub len1=len1,t1
  172. adds cnt=-1,word1 // head byte count - 1: br.ctop is repeat/until
  173. ;;
  174. sub lshift=64,rshift
  175. mov ar.ec=PIPE_DEPTH
  176. mov pr.rot=1<<16 // p16=true all others are false
  177. mov ar.lc=cnt
  178. ;;
  179. 2:
  180. EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1) // head load; a fault here continues at .failure_in_pipe2
  181. EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  182. br.ctop.dptk.few 2b
  183. ;;
  184. clrrrb // clear the register rename bases before the next pipelined loop
  185. ;;
  186. .word_copy_user:
  187. cmp.gtu p9,p0=16,len1
  188. (p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
  189. ;;
  190. shr.u cnt=len1,3 // number of 64-bit words
  191. ;;
  192. adds cnt=-1,cnt // br.ctop is repeat/until
  193. ;;
  194. .pred.rel "mutex", p14, p15
  195. (p14) sub src1=src1,t2
  196. (p15) sub src1=src1,t1
  197. //
  198. // Now both src1 and dst1 point to an 8-byte aligned address. And
  199. // we have more than 8 bytes to copy.
  200. //
  201. mov ar.lc=cnt
  202. mov ar.ec=PIPE_DEPTH
  203. mov pr.rot=1<<16 // p16=true all others are false
  204. ;;
  205. 3:
  206. //
  207. // The pipeline consists of 3 stages:
  208. // 1 (p16): Load a word from src1
  209. // 2 (EPI_1): Shift right pair, saving to tmp
  210. // 3 (EPI): Store tmp to dst1
  211. //
  212. // To make it simple, use at least 2 (p16) loops to set up val1[n]
  213. // because we need 2 back-to-back val1[] to get tmp.
  214. // Note that this implies EPI_2 must be p18 or greater. NOTE(review): EPI_2 is not defined in the code shown; EPI_1 is presumably meant - confirm
  215. //
  216. #define EPI_1 p[PIPE_DEPTH-2]
  217. #define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
  218. #define CASE(pred, shift) \
  219. (pred) br.cond.spnt .copy_user_bit##shift
  220. #define BODY(rshift) \
  221. .copy_user_bit##rshift: \
  222. 1: \
  223. EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
  224. (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
  225. EX(3f,(p16) ld8 val1[1]=[src1],8); \
  226. (p16) mov val1[0]=r0; \
  227. br.ctop.dptk 1b; \
  228. ;; \
  229. br.cond.sptk.many .diff_align_do_tail; \
  230. 2: \
  231. (EPI) st8 [dst1]=tmp,8; \
  232. (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
  233. 3: \
  234. (p16) mov val1[1]=r0; \
  235. (p16) mov val1[0]=r0; \
  236. br.ctop.dptk 2b; \
  237. ;; \
  238. br.cond.sptk.many .failure_in2
  239. //
  240. // Since the instruction 'shrp' takes its shift count as an immediate
  241. // (it cannot come from a register), we need to provide one loop body
  242. // per possible shift: the 7 cases below.
  243. //
  244. SWITCH(p6, 8)
  245. SWITCH(p7, 16)
  246. SWITCH(p8, 24)
  247. SWITCH(p9, 32)
  248. SWITCH(p10, 40)
  249. SWITCH(p11, 48)
  250. SWITCH(p12, 56)
  251. ;;
  252. CASE(p6, 8)
  253. CASE(p7, 16)
  254. CASE(p8, 24)
  255. CASE(p9, 32)
  256. CASE(p10, 40)
  257. CASE(p11, 48)
  258. CASE(p12, 56)
  259. ;;
  260. BODY(8)
  261. BODY(16)
  262. BODY(24)
  263. BODY(32)
  264. BODY(40)
  265. BODY(48)
  266. BODY(56)
  267. ;;
  268. .diff_align_do_tail:
  269. .pred.rel "mutex", p14, p15
  270. (p14) sub src1=src1,t1
  271. (p14) adds dst1=-8,dst1
  272. (p15) sub dst1=dst1,t1
  273. ;;
  274. 4:
  275. // Tail correction.
  276. //
  277. // The problem with this pipelined loop is that the last word is not
  278. // loaded and thus part of the last word written is not correct.
  279. // To fix that, we simply copy the tail byte by byte.
  280. sub len1=endsrc,src1,1 // (bytes left in the source) - 1: br.ctop is repeat/until
  281. clrrrb // clear the register rename bases before the byte loop
  282. ;;
  283. mov ar.ec=PIPE_DEPTH
  284. mov pr.rot=1<<16 // p16=true all others are false
  285. mov ar.lc=len1
  286. ;;
  287. 5:
  288. EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
  289. EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  290. br.ctop.dptk.few 5b
  291. ;;
  292. mov ar.lc=saved_lc
  293. mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63 only
  294. mov ar.pfs=saved_pfs
  295. br.ret.sptk.many rp
  296. //
  297. // Beginning of long memcpy (i.e. > 16 bytes)
  298. //
  299. .long_copy_user:
  300. tbit.nz p6,p7=src1,0 // odd alignment
  301. and tmp=7,tmp // low 3 bits of src^dst: zero when both share the same 8-byte offset
  302. ;;
  303. cmp.eq p10,p8=r0,tmp // p8: the alignments differ
  304. mov len1=len // copy because of rotation
  305. (p8) br.cond.dpnt .diff_align_copy_user
  306. ;;
  307. // At this point we know we have more than 16 bytes to copy
  308. // and also that both src and dest have the same alignment
  309. // which may not be the one we want. So for now we must move
  310. // forward slowly until we reach 16byte alignment: no need to
  311. // worry about reaching the end of buffer.
  312. //
  313. EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
  314. (p6) adds len1=-1,len1;;
  315. tbit.nz p7,p0=src1,1
  316. ;;
  317. EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
  318. (p7) adds len1=-2,len1;;
  319. tbit.nz p8,p0=src1,2
  320. ;;
  321. //
  322. // Stop bit not required after ld4 because if we fail on ld4
  323. // we have never executed the ld1, therefore st1 is not executed.
  324. //
  325. EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
  326. ;;
  327. EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
  328. tbit.nz p9,p0=src1,3
  329. ;;
  330. //
  331. // Stop bit not required after ld8 because if we fail on ld8
  332. // we have never executed the ld2, therefore st2 is not executed.
  333. //
  334. EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
  335. EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
  336. (p8) adds len1=-4,len1
  337. ;;
  338. EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
  339. (p9) adds len1=-8,len1;;
  340. shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
  341. ;;
  342. EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
  343. tbit.nz p6,p0=len1,3 // p6: .dotail has an 8-byte chunk to copy
  344. cmp.eq p7,p0=r0,cnt
  345. adds tmp=-1,cnt // br.ctop is repeat/until
  346. (p7) br.cond.dpnt .dotail // we have less than 16 bytes left
  347. ;;
  348. adds src2=8,src1 // second load stream, 8 bytes ahead of src1
  349. adds dst2=8,dst1 // second store stream, 8 bytes ahead of dst1
  350. mov ar.lc=tmp
  351. ;;
  352. //
  353. // 16bytes/iteration
  354. //
  355. 2:
  356. EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
  357. (p16) ld8 val2[0]=[src2],16
  358. EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
  359. (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
  360. br.ctop.dptk 2b
  361. ;; // RAW on src1 when fall through from loop
  362. //
  363. // Tail correction based on len only
  364. //
  365. // No matter where we come from (loop or test) the src1 pointer
  366. // is 16 byte aligned AND we have less than 16 bytes to copy.
  367. //
  368. .dotail:
  369. EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
  370. tbit.nz p7,p0=len1,2
  371. ;;
  372. EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
  373. tbit.nz p8,p0=len1,1
  374. ;;
  375. EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
  376. tbit.nz p9,p0=len1,0
  377. ;;
  378. EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
  379. ;;
  380. EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
  381. mov ar.lc=saved_lc // restores are interleaved with the remaining stores
  382. ;;
  383. EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
  384. mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63 only, so p8/p9 below stay valid
  385. ;;
  386. EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
  387. mov ar.pfs=saved_pfs
  388. ;;
  389. EX(.failure_out, (p9) st1 [dst1]=val2[1])
  390. br.ret.sptk.many rp
  391. //
  392. // Here we handle the case where the byte by byte copy fails
  393. // on the load.
  394. // Several factors make the zeroing of the rest of the buffer kind of
  395. // tricky:
  396. // - the pipeline: loads/stores are not in sync (pipeline)
  397. //
  398. // In the same loop iteration, the dst1 pointer does not directly
  399. // reflect where the faulty load was.
  400. //
  401. // - pipeline effect
  402. // When you get a fault on load, you may have valid data from
  403. // previous loads still in transit, not yet stored. Such data must be
  404. // stored normally before moving on to zeroing the rest.
  405. //
  406. // - single/multi dispersal independence.
  407. //
  408. // solution:
  409. // - we don't disrupt the pipeline, i.e. data in transit in
  410. // the software pipeline will eventually be moved to memory.
  411. // We simply replace the load with a simple mov and keep the
  412. // pipeline going. We can't really do this inline because
  413. // p16 is always reset to 1 when lc > 0.
  414. //
  415. .failure_in_pipe1: // load fault in a pipelined byte loop: keep the loop running with zeros instead of loads
  416. sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
  417. 1:
  418. (p16) mov val1[0]=r0 // replaces the ld1: feed a zero into the pipeline
  419. (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 // still stores the data loaded before the fault, then the zeros
  420. br.ctop.dptk 1b
  421. ;;
  422. mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63 only
  423. mov ar.lc=saved_lc
  424. mov ar.pfs=saved_pfs
  425. br.ret.sptk.many rp
  426. //
  427. // This is the case where the byte by byte copy fails on the load
  428. // when we copy the head. We need to finish the pipeline and copy
  429. // zeros for the rest of the destination. Since this happens
  430. // at the top we still need to fill the body and tail.
  431. .failure_in_pipe2:
  432. sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
  433. 2:
  434. (p16) mov val1[0]=r0 // replaces the ld1: feed a zero into the pipeline
  435. (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
  436. br.ctop.dptk 2b
  437. ;;
  438. sub len=enddst,dst1,1 // precompute len: (bytes left in dst) - 1 for the cloop in .failure_in1bis
  439. br.cond.dptk.many .failure_in1bis
  440. ;;
  441. //
  442. // Here we handle the head & tail part when we check for alignment.
  443. // The following code handles only the load failures. The
  444. // main difficulty comes from the fact that loads/stores are
  445. // scheduled. So when you fail on a load, the stores corresponding
  446. // to previous successful loads must be executed.
  447. //
  448. // However some simplifications are possible given the way
  449. // things work.
  450. //
  451. // 1) HEAD
  452. // Theory of operation:
  453. //
  454. // Page A | Page B
  455. // ---------|-----
  456. // 1|8 x
  457. // 1 2|8 x
  458. // 4|8 x
  459. // 1 4|8 x
  460. // 2 4|8 x
  461. // 1 2 4|8 x
  462. // |1
  463. // |2 x
  464. // |4 x
  465. //
  466. // page_size >= 4k (2^12). (x means 4, 2, 1)
  467. // Here we suppose Page A exists and Page B does not.
  468. //
  469. // As we move towards eight byte alignment we may encounter faults.
  470. // The numbers on each page show the size of the load (current alignment).
  471. //
  472. // Key point:
  473. // - if you fail on 1, 2, 4 then you have never executed any smaller
  474. // size loads, e.g. failing ld4 means no ld1 nor ld2 executed
  475. // before.
  476. //
  477. // This allows us to simplify the cleanup code, because basically you
  478. // only have to worry about "pending" stores in the case of a failing
  479. // ld8(). Given the way the code is written today, this means only
  480. // worry about st2, st4. There we can use the information encapsulated
  481. // into the predicates.
  482. //
  483. // Other key point:
  484. // - if you fail on the ld8 in the head, it means you went straight
  485. // to it, i.e. 8byte alignment within an unexisting page.
  486. // Again this comes from the fact that if you crossed just for the ld8 then
  487. // you are 8byte aligned but also 16byte align, therefore you would
  488. // either go for the 16byte copy loop OR the ld8 in the tail part.
  489. // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
  490. // because it would mean you had 15bytes to copy in which case you
  491. // would have defaulted to the byte by byte copy.
  492. //
  493. //
  494. // 2) TAIL
  495. // Here we know we have less than 16 bytes AND we are either 8 or 16 byte
  496. // aligned.
  497. //
  498. // Key point:
  499. // This means that we either:
  500. // - are right on a page boundary
  501. // OR
  502. // - are at more than 16 bytes from a page boundary with
  503. // at most 15 bytes to copy: no chance of crossing.
  504. //
  505. // This allows us to assume that if we fail on a load we haven't possibly
  506. // executed any of the previous (tail) ones, so we don't need to do
  507. // any stores. For instance, if we fail on ld2, this means we had
  508. // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
  509. //
  510. // This means that we are in a situation similar to a fault in the
  511. // head part. That's nice!
  512. //
  513. .failure_in1:
  514. sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
  515. sub len=endsrc,src1,1 // same count minus 1, for the cloop below
  516. //
  517. // we know that ret0 can never be zero at this point
  518. // because we failed while trying to do a load, i.e. there is still
  519. // some work to do.
  520. // The failure_in1bis and length problem is taken care of at the
  521. // calling side.
  522. //
  523. ;;
  524. .failure_in1bis: // also entered from .failure_in_pipe2, .failure_in2 and .failure_in3, with len already set
  525. mov ar.lc=len // Continue with a stupid byte store.
  526. ;;
  527. 5:
  528. st1 [dst1]=r0,1 // zero one byte; no EX() here, presumably because dst is kernel memory when a load faulted - confirm
  529. br.cloop.dptk 5b
  530. ;;
  531. mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63 only
  532. mov ar.lc=saved_lc
  533. mov ar.pfs=saved_pfs
  534. br.ret.sptk.many rp
  535. //
  536. // Here we simply restart the loop but instead
  537. // of doing loads we fill the pipeline with zeroes.
  538. // We can't simply store r0 because we may have valid
  539. // data in transit in the pipeline.
  540. // ar.lc and ar.ec are set up correctly at this point.
  541. //
  542. // we MUST use src1/endsrc here and not dst1/enddst because
  543. // of the pipeline effect.
  544. //
  545. .failure_in3:
  546. sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
  547. ;;
  548. 2:
  549. (p16) mov val1[0]=r0 // replaces the first ld8 of the 16-byte loop
  550. (p16) mov val2[0]=r0 // replaces the second ld8 of the 16-byte loop
  551. (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
  552. (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
  553. br.ctop.dptk 2b
  554. ;;
  555. cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
  556. sub len=enddst,dst1,1 // precompute len
  557. (p6) br.cond.dptk .failure_in1bis
  558. ;;
  559. mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63 only
  560. mov ar.lc=saved_lc
  561. mov ar.pfs=saved_pfs
  562. br.ret.sptk.many rp
  563. .failure_in2: // reached from the BODY() loops once their zero-fill loop has finished
  564. sub ret0=endsrc,src1 // bytes between src1 and the end of the source
  565. cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
  566. sub len=enddst,dst1,1 // precompute len
  567. (p6) br.cond.dptk .failure_in1bis
  568. ;;
  569. mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63 only
  570. mov ar.lc=saved_lc
  571. mov ar.pfs=saved_pfs
  572. br.ret.sptk.many rp
  573. //
  574. // handling of failures on stores: that's the easy part
  575. //
  576. .failure_out:
  577. sub ret0=enddst,dst1 // bytes between dst1 and the end of the destination
  578. mov pr=saved_pr,0xffffffffffff0000 // mask selects p16-p63 only
  579. mov ar.lc=saved_lc
  580. mov ar.pfs=saved_pfs
  581. br.ret.sptk.many rp
  582. END(__copy_user)