copy_user.S 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613
  1. /*
  2. *
  3. * Optimized version of the copy_user() routine.
  4. * It is used to copy data across the kernel/user boundary.
  5. *
  6. * The source and destination are always on opposite side of
  7. * the boundary. When reading from user space we must catch
  8. * faults on loads. When writing to user space we must catch
  9. * errors on stores. Note that because of the nature of the copy
  10. * we don't need to worry about overlapping regions.
  11. *
  12. *
  13. * Inputs:
  14. * in0 address of destination buffer
  15. * in1 address of source buffer
  16. * in2 number of bytes to copy
  17. *
  18. * Outputs:
  19. * ret0 0 in case of success. The number of bytes NOT copied in
  20. * case of error.
  21. *
  22. * Copyright (C) 2000-2001 Hewlett-Packard Co
  23. * Stephane Eranian <eranian@hpl.hp.com>
  24. *
  25. * Fixme:
  26. * - handle the case where we have more than 16 bytes and the alignment
  27. * are different.
  28. * - more benchmarking
  29. * - fix extraneous stop bit introduced by the EX() macro.
  30. */
  31. #include <asm/asmmacro.h>
  32. #include <asm/export.h>
  33. //
  34. // Tuneable parameters
  35. //
  36. #define COPY_BREAK 16 // lengths <= COPY_BREAK use the byte-by-byte loop (must be >=16)
  37. #define PIPE_DEPTH 21 // pipe depth (value loaded into ar.ec; size of the rotating arrays)
  38. #define EPI p[PIPE_DEPTH-1] // predicate of the last pipeline stage
  39. //
  40. // arguments
  41. //
  42. #define dst in0 // destination address
  43. #define src in1 // source address
  44. #define len in2 // number of bytes to copy
  45. //
  46. // local registers
  47. //
  48. #define t1 r2 // rshift in bytes
  49. #define t2 r3 // lshift in bytes
  50. #define rshift r14 // right shift in bits
  51. #define lshift r15 // left shift in bits
  52. #define word1 r16 // number of head bytes copied one at a time in the unaligned path
  53. #define word2 r17
  54. #define cnt r18 // iteration count loaded into ar.lc
  55. #define len2 r19 // len-1, the initial ar.lc for the short copy
  56. #define saved_lc r20 // ar.lc on entry
  57. #define saved_pr r21 // predicates on entry
  58. #define tmp r22 // scratch
  59. #define val r23
  60. #define src1 r24 // working copy of src
  61. #define dst1 r25 // working copy of dst
  62. #define src2 r26 // src offset within 8 bytes, or second load pointer in the 16-byte loop
  63. #define dst2 r27 // dst offset within 8 bytes, or second store pointer in the 16-byte loop
  64. #define len1 r28 // remaining length
  65. #define enddst r29 // dst+len
  66. #define endsrc r30 // src+len
  67. #define saved_pfs r31 // ar.pfs on entry
  68. GLOBAL_ENTRY(__copy_user) // dst=in0, src=in1, len=in2; ret0 = 0 or the number of bytes not copied
  69. .prologue
  70. .save ar.pfs, saved_pfs
  71. alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7) // 3 inputs, 0 outputs; local and rotating counts are 2*PIPE_DEPTH rounded up to a multiple of 8
  72. .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH] // two rotating register arrays of PIPE_DEPTH entries each
  73. .rotp p[PIPE_DEPTH] // rotating stage predicates
  74. adds len2=-1,len // br.ctop is repeat/until
  75. mov ret0=r0 // success unless a failure path overwrites ret0
  76. ;; // RAW of cfm when len=0
  77. cmp.eq p8,p0=r0,len // check for zero length
  78. .save ar.lc, saved_lc
  79. mov saved_lc=ar.lc // preserve ar.lc (slow)
  80. (p8) br.ret.spnt.many rp // empty memcpy()
  81. ;;
  82. add enddst=dst,len // first byte after end of destination
  83. add endsrc=src,len // first byte after end of source
  84. .save pr, saved_pr
  85. mov saved_pr=pr // preserve predicates
  86. .body
  87. mov dst1=dst // copy because of rotation
  88. mov ar.ec=PIPE_DEPTH
  89. mov pr.rot=1<<16 // p16=true all others are false
  90. mov src1=src // copy because of rotation
  91. mov ar.lc=len2 // initialize lc for small count
  92. cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
  93. xor tmp=src,dst // same alignment test prepare
  94. (p10) br.cond.dptk .long_copy_user
  95. ;; // RAW pr.rot/p16 ?
  96. //
  97. // Now we do the byte by byte loop with software pipeline
  98. //
  99. // NOTE(review): this said "p7 is necessarily false by now", but the cmp.lt above writes p7 as the complement of p10, so p7 looks true on this fall-through path -- confirm
  100. 1:
  101. EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
  102. EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  103. br.ctop.dptk.few 1b
  104. ;;
  105. mov ar.lc=saved_lc
  106. mov pr=saved_pr,0xffffffffffff0000 // mask has only bits 16-63 set
  107. mov ar.pfs=saved_pfs // restore ar.ec
  108. br.ret.sptk.many rp // end of short memcpy
  109. //
  110. // Not 8-byte aligned
  111. //
  112. .diff_align_copy_user:
  113. // At this point we know we have more than 16 bytes to copy
  114. // and also that src and dest do _not_ have the same alignment.
  115. and src2=0x7,src1 // src offset
  116. and dst2=0x7,dst1 // dst offset
  117. ;;
  118. // The basic idea is that we copy byte-by-byte at the head so
  119. // that we can reach 8-byte alignment for both src1 and dst1.
  120. // Then copy the body using software pipelined 8-byte copy,
  121. // shifting the two back-to-back words right and left, then copy
  122. // the tail by copying byte-by-byte.
  123. //
  124. // Fault handling. If the byte-by-byte at the head fails on the
  125. // load, then restart and finish the pipeline by copying zeros
  126. // to the dst1. Then copy zeros for the rest of dst1.
  127. // If 8-byte software pipeline fails on the load, do the same as
  128. // failure_in3 does. If the byte-by-byte at the tail fails, it is
  129. // handled simply by failure_in_pipe1.
  130. //
  131. // The case p14 represents the source has more bytes in the
  132. // first word (by the shifted part), whereas the p15 needs to
  133. // copy some bytes from the 2nd word of the source that has the
  134. // tail of the 1st of the destination.
  135. //
  136. //
  137. // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
  138. // to copy the head to dst1, to start 8-byte copy software pipeline.
  139. // We know src1 is not 8-byte aligned in this case.
  140. //
  141. cmp.eq p14,p15=r0,dst2 // p14: dst offset is 0, i.e. dst1 is already 8-byte aligned
  142. (p15) br.cond.spnt 1f // otherwise take the head-copy path at 1:
  143. ;;
  144. sub t1=8,src2 // t1 = 8 - src offset
  145. mov t2=src2
  146. ;;
  147. shl rshift=t2,3 // bytes -> bits
  148. sub len1=len,t1 // set len1
  149. ;;
  150. sub lshift=64,rshift
  151. ;;
  152. br.cond.spnt .word_copy_user
  153. ;;
  154. 1:
  155. cmp.leu p14,p15=src2,dst2 // p14: src offset <= dst offset, p15: src offset > dst offset
  156. sub t1=dst2,src2
  157. ;;
  158. .pred.rel "mutex", p14, p15
  159. (p14) sub word1=8,src2 // (8 - src offset)
  160. (p15) sub t1=r0,t1 // absolute value
  161. (p15) sub word1=8,dst2 // (8 - dst offset)
  162. ;;
  163. // For the case p14, we don't need to copy the shifted part to
  164. // the 1st word of destination.
  165. sub t2=8,t1
  166. (p14) sub word1=word1,t1
  167. ;;
  168. sub len1=len,word1 // resulting len
  169. (p15) shl rshift=t1,3 // in bits
  170. (p14) shl rshift=t2,3
  171. ;;
  172. (p14) sub len1=len1,t1
  173. adds cnt=-1,word1 // word1-1 because br.ctop is repeat/until
  174. ;;
  175. sub lshift=64,rshift
  176. mov ar.ec=PIPE_DEPTH
  177. mov pr.rot=1<<16 // p16=true all others are false
  178. mov ar.lc=cnt
  179. ;;
  180. 2:
  181. EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
  182. EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  183. br.ctop.dptk.few 2b
  184. ;;
  185. clrrrb // clear the register rename bases before the next pipelined loop
  186. ;;
  187. .word_copy_user:
  188. cmp.gtu p9,p0=16,len1
  189. (p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
  190. ;;
  191. shr.u cnt=len1,3 // number of 64-bit words
  192. ;;
  193. adds cnt=-1,cnt // br.ctop is repeat/until
  194. ;;
  195. .pred.rel "mutex", p14, p15
  196. (p14) sub src1=src1,t2
  197. (p15) sub src1=src1,t1
  198. //
  199. // Now both src1 and dst1 point to an 8-byte aligned address. And
  200. // we have more than 8 bytes to copy.
  201. //
  202. mov ar.lc=cnt
  203. mov ar.ec=PIPE_DEPTH
  204. mov pr.rot=1<<16 // p16=true all others are false
  205. ;;
  206. 3:
  207. //
  208. // The pipeline consists of 3 stages:
  209. // 1 (p16): Load a word from src1
  210. // 2 (EPI_1): Shift right pair, saving to tmp
  211. // 3 (EPI): Store tmp to dst1
  212. //
  213. // To make it simple, use at least 2 (p16) loops to set up val1[n]
  214. // because we need 2 back-to-back val1[] to get tmp.
  215. // Note that this implies EPI_2 must be p18 or greater.
  216. //
  217. #define EPI_1 p[PIPE_DEPTH-2]
  218. #define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
  219. #define CASE(pred, shift) \
  220. (pred) br.cond.spnt .copy_user_bit##shift
  221. #define BODY(rshift) \
  222. .copy_user_bit##rshift: \
  223. 1: \
  224. EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
  225. (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
  226. EX(3f,(p16) ld8 val1[1]=[src1],8); \
  227. (p16) mov val1[0]=r0; \
  228. br.ctop.dptk 1b; \
  229. ;; \
  230. br.cond.sptk.many .diff_align_do_tail; \
  231. 2: \
  232. (EPI) st8 [dst1]=tmp,8; \
  233. (EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
  234. 3: \
  235. (p16) mov val1[1]=r0; \
  236. (p16) mov val1[0]=r0; \
  237. br.ctop.dptk 2b; \
  238. ;; \
  239. br.cond.sptk.many .failure_in2
  240. //
  241. // Since the instruction 'shrp' requires a fixed 128-bit value
  242. // specifying the bits to shift, we need to provide 7 cases
  243. // below.
  244. //
  245. SWITCH(p6, 8) // compare rshift against each possible shift count (8..56 bits)
  246. SWITCH(p7, 16)
  247. SWITCH(p8, 24)
  248. SWITCH(p9, 32)
  249. SWITCH(p10, 40)
  250. SWITCH(p11, 48)
  251. SWITCH(p12, 56)
  252. ;;
  253. CASE(p6, 8) // branch to the loop body specialised for the matching shift count
  254. CASE(p7, 16)
  255. CASE(p8, 24)
  256. CASE(p9, 32)
  257. CASE(p10, 40)
  258. CASE(p11, 48)
  259. CASE(p12, 56)
  260. ;;
  261. BODY(8) // one copy of the pipelined loop per shift count
  262. BODY(16)
  263. BODY(24)
  264. BODY(32)
  265. BODY(40)
  266. BODY(48)
  267. BODY(56)
  268. ;;
  269. .diff_align_do_tail:
  270. .pred.rel "mutex", p14, p15
  271. (p14) sub src1=src1,t1
  272. (p14) adds dst1=-8,dst1
  273. (p15) sub dst1=dst1,t1
  274. ;;
  275. 4:
  276. // Tail correction.
  277. //
  278. // The problem with this pipelined loop is that the last word is not
  279. // loaded and thus part of the last word written is not correct.
  280. // To fix that, we simply copy the tail byte by byte.
  281. sub len1=endsrc,src1,1 // remaining bytes - 1 (br.ctop is repeat/until)
  282. clrrrb
  283. ;;
  284. mov ar.ec=PIPE_DEPTH
  285. mov pr.rot=1<<16 // p16=true all others are false
  286. mov ar.lc=len1
  287. ;;
  288. 5:
  289. EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
  290. EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
  291. br.ctop.dptk.few 5b
  292. ;;
  293. mov ar.lc=saved_lc
  294. mov pr=saved_pr,0xffffffffffff0000 // mask has only bits 16-63 set
  295. mov ar.pfs=saved_pfs
  296. br.ret.sptk.many rp
  297. //
  298. // Beginning of long memcpy (i.e. > 16 bytes)
  299. //
  300. .long_copy_user:
  301. tbit.nz p6,p7=src1,0 // odd alignment
  302. and tmp=7,tmp // tmp = (src ^ dst) & 7, from the xor done before branching here
  303. ;;
  304. cmp.eq p10,p8=r0,tmp // p8: src and dst differ in their low 3 address bits
  305. mov len1=len // copy because of rotation
  306. (p8) br.cond.dpnt .diff_align_copy_user
  307. ;;
  308. // At this point we know we have more than 16 bytes to copy
  309. // and also that both src and dest have the same alignment
  310. // which may not be the one we want. So for now we must move
  311. // forward slowly until we reach 16byte alignment: no need to
  312. // worry about reaching the end of buffer.
  313. //
  314. EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
  315. (p6) adds len1=-1,len1;;
  316. tbit.nz p7,p0=src1,1
  317. ;;
  318. EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
  319. (p7) adds len1=-2,len1;;
  320. tbit.nz p8,p0=src1,2
  321. ;;
  322. //
  323. // Stop bit not required after ld4 because if we fail on ld4
  324. // we have never executed the ld1, therefore st1 is not executed.
  325. //
  326. EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
  327. ;;
  328. EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
  329. tbit.nz p9,p0=src1,3
  330. ;;
  331. //
  332. // Stop bit not required after ld8 because if we fail on ld8
  333. // we have never executed the ld2, therefore st2 is not executed.
  334. //
  335. EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
  336. EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
  337. (p8) adds len1=-4,len1
  338. ;;
  339. EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
  340. (p9) adds len1=-8,len1;;
  341. shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
  342. ;;
  343. EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
  344. tbit.nz p6,p0=len1,3 // p6: bit 3 of len1 is set, consumed by the ld8/st8 at .dotail
  345. cmp.eq p7,p0=r0,cnt
  346. adds tmp=-1,cnt // br.ctop is repeat/until
  347. (p7) br.cond.dpnt .dotail // we have less than 16 bytes left
  348. ;;
  349. adds src2=8,src1 // second stream runs 8 bytes ahead of the first
  350. adds dst2=8,dst1
  351. mov ar.lc=tmp
  352. ;;
  353. //
  354. // 16bytes/iteration
  355. //
  356. 2:
  357. EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
  358. (p16) ld8 val2[0]=[src2],16
  359. EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
  360. (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
  361. br.ctop.dptk 2b
  362. ;; // RAW on src1 when fall through from loop
  363. //
  364. // Tail correction based on len only
  365. //
  366. // No matter where we come from (loop or test) the src1 pointer
  367. // is 16 byte aligned AND we have less than 16 bytes to copy.
  368. //
  369. .dotail:
  370. EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
  371. tbit.nz p7,p0=len1,2
  372. ;;
  373. EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
  374. tbit.nz p8,p0=len1,1
  375. ;;
  376. EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
  377. tbit.nz p9,p0=len1,0
  378. ;;
  379. EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
  380. ;;
  381. EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
  382. mov ar.lc=saved_lc // the ar.lc/pr/ar.pfs restores are interleaved with the remaining stores
  383. ;;
  384. EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
  385. mov pr=saved_pr,0xffffffffffff0000 // mask has only bits 16-63 set
  386. ;;
  387. EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
  388. mov ar.pfs=saved_pfs
  389. ;;
  390. EX(.failure_out, (p9) st1 [dst1]=val2[1])
  391. br.ret.sptk.many rp
  392. //
  393. // Here we handle the case where the byte by byte copy fails
  394. // on the load.
  395. // Several factors make the zeroing of the rest of the buffer kind of
  396. // tricky:
  397. // - the pipeline: loads/stores are not in sync (pipeline)
  398. //
  399. // In the same loop iteration, the dst1 pointer does not directly
  400. // reflect where the faulty load was.
  401. //
  402. // - pipeline effect
  403. // When you get a fault on load, you may have valid data from
  404. // previous loads still in transit and not yet stored. Such data must be
  405. // stored normally before moving on to zeroing the rest.
  406. //
  407. // - single/multi dispersal independence.
  408. //
  409. // solution:
  410. // - we don't disrupt the pipeline, i.e. data in transit in
  411. // the software pipeline will eventually be moved to memory.
  412. // We simply replace the load with a simple mov and keep the
  413. // pipeline going. We can't really do this inline because
  414. // p16 is always reset to 1 when lc > 0.
  415. //
  416. .failure_in_pipe1:
  417. sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
  418. 1:
  419. (p16) mov val1[0]=r0 // feed zeros into the pipeline in place of the load
  420. (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
  421. br.ctop.dptk 1b
  422. ;;
  423. mov pr=saved_pr,0xffffffffffff0000 // mask has only bits 16-63 set
  424. mov ar.lc=saved_lc
  425. mov ar.pfs=saved_pfs
  426. br.ret.sptk.many rp
  427. //
  428. // This is the case where the byte by byte copy fails on the load
  429. // when we copy the head. We need to finish the pipeline and copy
  430. // zeros for the rest of the destination. Since this happens
  431. // at the top we still need to fill the body and tail.
  432. .failure_in_pipe2:
  433. sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
  434. 2:
  435. (p16) mov val1[0]=r0 // feed zeros into the pipeline in place of the load
  436. (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
  437. br.ctop.dptk 2b
  438. ;;
  439. sub len=enddst,dst1,1 // precompute len (enddst-dst1-1, loaded into ar.lc at .failure_in1bis)
  440. br.cond.dptk.many .failure_in1bis
  441. ;;
  442. //
  443. // Here we handle the head & tail part when we check for alignment.
  444. // The following code handles only the load failures. The
  445. // main difficulty comes from the fact that loads/stores are
  446. // scheduled. So when you fail on a load, the stores corresponding
  447. // to previous successful loads must be executed.
  448. //
  449. // However some simplifications are possible given the way
  450. // things work.
  451. //
  452. // 1) HEAD
  453. // Theory of operation:
  454. //
  455. // Page A | Page B
  456. // ---------|-----
  457. // 1|8 x
  458. // 1 2|8 x
  459. // 4|8 x
  460. // 1 4|8 x
  461. // 2 4|8 x
  462. // 1 2 4|8 x
  463. // |1
  464. // |2 x
  465. // |4 x
  466. //
  467. // page_size >= 4k (2^12). (x means 4, 2, 1)
  468. // Here we suppose Page A exists and Page B does not.
  469. //
  470. // As we move towards eight byte alignment we may encounter faults.
  471. // The numbers on each page show the size of the load (current alignment).
  472. //
  473. // Key point:
  474. // - if you fail on 1, 2, 4 then you have never executed any smaller
  475. // size loads, e.g. failing ld4 means no ld1 nor ld2 executed
  476. // before.
  477. //
  478. // This allows us to simplify the cleanup code, because basically you
  479. // only have to worry about "pending" stores in the case of a failing
  480. // ld8(). Given the way the code is written today, this means only
  481. // worry about st2, st4. There we can use the information encapsulated
  482. // into the predicates.
  483. //
  484. // Other key point:
  485. // - if you fail on the ld8 in the head, it means you went straight
  486. // to it, i.e. 8byte alignment within a nonexistent page.
  487. // Again this comes from the fact that if you crossed just for the ld8 then
  488. // you are 8byte aligned but also 16byte aligned, therefore you would
  489. // either go for the 16byte copy loop OR the ld8 in the tail part.
  490. // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
  491. // because it would mean you had 15bytes to copy in which case you
  492. // would have defaulted to the byte by byte copy.
  493. //
  494. //
  495. // 2) TAIL
  496. // Here we know we have less than 16 bytes AND we are either 8 or 16 byte
  497. // aligned.
  498. //
  499. // Key point:
  500. // This means that we either:
  501. // - are right on a page boundary
  502. // OR
  503. // - are at more than 16 bytes from a page boundary with
  504. // at most 15 bytes to copy: no chance of crossing.
  505. //
  506. // This allows us to assume that if we fail on a load we haven't possibly
  507. // executed any of the previous (tail) ones, so we don't need to do
  508. // any stores. For instance, if we fail on ld2, this means we had
  509. // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
  510. //
  511. // This means that we are in a situation similar to a fault in the
  512. // head part. That's nice!
  513. //
  514. .failure_in1:
  515. sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
  516. sub len=endsrc,src1,1 // same count minus 1, loaded into ar.lc at .failure_in1bis
  517. //
  518. // we know that ret0 can never be zero at this point
  519. // because we failed while trying to do a load, i.e. there is still
  520. // some work to do.
  521. // The failure_in1bis and length problem is taken care of at the
  522. // calling side.
  523. //
  524. ;;
  525. .failure_in1bis: // from (.failure_in3)
  526. mov ar.lc=len // Continue with a stupid byte store.
  527. ;;
  528. 5:
  529. st1 [dst1]=r0,1 // zero one destination byte and advance dst1
  530. br.cloop.dptk 5b
  531. ;;
  532. mov pr=saved_pr,0xffffffffffff0000 // mask has only bits 16-63 set
  533. mov ar.lc=saved_lc
  534. mov ar.pfs=saved_pfs
  535. br.ret.sptk.many rp
  536. //
  537. // Here we simply restart the loop but instead
  538. // of doing loads we fill the pipeline with zeroes.
  539. // We can't simply store r0 because we may have valid
  540. // data in transit in the pipeline.
  541. // ar.lc and ar.ec are set up correctly at this point
  542. //
  543. // we MUST use src1/endsrc here and not dst1/enddst because
  544. // of the pipeline effect.
  545. //
  546. .failure_in3:
  547. sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
  548. ;;
  549. 2:
  550. (p16) mov val1[0]=r0 // zeros replace both 8-byte loads of the 16-byte loop
  551. (p16) mov val2[0]=r0
  552. (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
  553. (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
  554. br.ctop.dptk 2b
  555. ;;
  556. cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
  557. sub len=enddst,dst1,1 // precompute len
  558. (p6) br.cond.dptk .failure_in1bis
  559. ;;
  560. mov pr=saved_pr,0xffffffffffff0000 // mask has only bits 16-63 set
  561. mov ar.lc=saved_lc
  562. mov ar.pfs=saved_pfs
  563. br.ret.sptk.many rp
  564. .failure_in2: // reached from the BODY() loops once their zero-fill loop has drained
  565. sub ret0=endsrc,src1 // source bytes between src1 and the end of the buffer
  566. cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
  567. sub len=enddst,dst1,1 // precompute len
  568. (p6) br.cond.dptk .failure_in1bis
  569. ;;
  570. mov pr=saved_pr,0xffffffffffff0000 // mask has only bits 16-63 set
  571. mov ar.lc=saved_lc
  572. mov ar.pfs=saved_pfs
  573. br.ret.sptk.many rp
  574. //
  575. // handling of failures on stores: that's the easy part
  576. //
  577. .failure_out:
  578. sub ret0=enddst,dst1 // destination bytes between dst1 and the end of the buffer
  579. mov pr=saved_pr,0xffffffffffff0000 // mask has only bits 16-63 set
  580. mov ar.lc=saved_lc
  581. mov ar.pfs=saved_pfs
  582. br.ret.sptk.many rp
  583. END(__copy_user)
  584. EXPORT_SYMBOL(__copy_user)