octeon-memcpy.S

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2
/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
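
/*
 * Each use of EXC() emits the (possibly faulting) access under the local
 * label "9:" and records a __ex_table entry pairing that address with the
 * named fixup handler, so a fault during the access is redirected to the
 * handler instead of being treated as a fatal kernel fault.
 */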

/*
 * Only on a 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we share a code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register names from the
 * n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
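
/*
 * FIRST(n)/REST(n) give the offsets of the lowest and highest byte of the
 * n-th word of a block; the LDFIRST/LDREST pairs below use them to assemble
 * an unaligned word.  UNIT(n) is the plain word offset, and ADDRMASK masks
 * the sub-word alignment bits of an address.
 */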

        .text
        .set    noreorder
        .set    noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
        move    v0, dst                         /* return value */
__memcpy:
FEXPORT(__copy_user)
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
        #
        # Octeon doesn't care if the destination is unaligned.  The hardware
        # can fix it faster than we can special case the assembly.
        #
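        #
        # Length/alignment dispatch.  .set noreorder is in effect, so each
        # instruction following a bnez below sits in that branch's delay
        # slot and computes the condition tested by the next branch.
        #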
        pref    0, 0(src)
        sltu    t0, len, NBYTES         # Check if < 1 word
        bnez    t0, copy_bytes_checklen
        and     t0, src, ADDRMASK       # Check if src unaligned
        bnez    t0, src_unaligned
        sltu    t0, len, 4*NBYTES       # Check if < 4 words
        bnez    t0, less_than_4units
        sltu    t0, len, 8*NBYTES       # Check if < 8 words
        bnez    t0, less_than_8units
        sltu    t0, len, 16*NBYTES      # Check if < 16 words
        bnez    t0, cleanup_both_aligned
        sltu    t0, len, 128+1          # Check if len < 129
        bnez    t0, 1f                  # Skip prefetch if len is too short
        sltu    t0, len, 256+1          # Check if len < 257
        bnez    t0, 1f                  # Skip prefetch if len is too short
        pref    0, 128(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if there is more than 128 bytes left
2:      pref    0, 256(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if we can't prefetch anymore
1:
EXC(    LOAD    t0, UNIT(0)(src), l_exc)
EXC(    LOAD    t1, UNIT(1)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src), l_exc_copy)
        SUB     len, len, 16*NBYTES
EXC(    STORE   t0, UNIT(0)(dst), s_exc_p16u)
EXC(    STORE   t1, UNIT(1)(dst), s_exc_p15u)
EXC(    STORE   t2, UNIT(2)(dst), s_exc_p14u)
EXC(    STORE   t3, UNIT(3)(dst), s_exc_p13u)
EXC(    LOAD    t0, UNIT(4)(src), l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src), l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst), s_exc_p12u)
EXC(    STORE   t1, UNIT(5)(dst), s_exc_p11u)
EXC(    STORE   t2, UNIT(6)(dst), s_exc_p10u)
        ADD     src, src, 16*NBYTES
EXC(    STORE   t3, UNIT(7)(dst), s_exc_p9u)
        ADD     dst, dst, 16*NBYTES
EXC(    LOAD    t0, UNIT(-8)(src), l_exc_copy)
EXC(    LOAD    t1, UNIT(-7)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(-6)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(-5)(src), l_exc_copy)
EXC(    STORE   t0, UNIT(-8)(dst), s_exc_p8u)
EXC(    STORE   t1, UNIT(-7)(dst), s_exc_p7u)
EXC(    STORE   t2, UNIT(-6)(dst), s_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst), s_exc_p5u)
EXC(    LOAD    t0, UNIT(-4)(src), l_exc_copy)
EXC(    LOAD    t1, UNIT(-3)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(-2)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(-1)(src), l_exc_copy)
EXC(    STORE   t0, UNIT(-4)(dst), s_exc_p4u)
EXC(    STORE   t1, UNIT(-3)(dst), s_exc_p3u)
EXC(    STORE   t2, UNIT(-2)(dst), s_exc_p2u)
EXC(    STORE   t3, UNIT(-1)(dst), s_exc_p1u)
        sltu    t0, len, 256+1          # See if we can prefetch more
        beqz    t0, 2b
        sltu    t0, len, 128            # See if we can loop once more
        beqz    t0, 1b
        nop
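        #
        # The loop above moves 16*NBYTES (128 bytes on a 64-bit kernel) per
        # iteration, staying 256 bytes ahead with prefetches while len
        # allows it, and falls back to label 1 once it no longer can.
        #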
        #
        # Jump here if there are less than 16*NBYTES left.
        #
cleanup_both_aligned:
        beqz    len, done
        sltu    t0, len, 8*NBYTES
        bnez    t0, less_than_8units
        nop
EXC(    LOAD    t0, UNIT(0)(src), l_exc)
EXC(    LOAD    t1, UNIT(1)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src), l_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    STORE   t0, UNIT(0)(dst), s_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst), s_exc_p7u)
EXC(    STORE   t2, UNIT(2)(dst), s_exc_p6u)
EXC(    STORE   t3, UNIT(3)(dst), s_exc_p5u)
EXC(    LOAD    t0, UNIT(4)(src), l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src), l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst), s_exc_p4u)
EXC(    STORE   t1, UNIT(5)(dst), s_exc_p3u)
EXC(    STORE   t2, UNIT(6)(dst), s_exc_p2u)
EXC(    STORE   t3, UNIT(7)(dst), s_exc_p1u)
        ADD     src, src, 8*NBYTES
        beqz    len, done
        ADD     dst, dst, 8*NBYTES
        #
        # Jump here if there are less than 8*NBYTES left.
        #
less_than_8units:
        sltu    t0, len, 4*NBYTES
        bnez    t0, less_than_4units
        nop
EXC(    LOAD    t0, UNIT(0)(src), l_exc)
EXC(    LOAD    t1, UNIT(1)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src), l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst), s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst), s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst), s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst), s_exc_p1u)
        ADD     src, src, 4*NBYTES
        beqz    len, done
        ADD     dst, dst, 4*NBYTES
        #
        # Jump here if there are less than 4*NBYTES left.  This means
        # we may need to copy up to 3 NBYTES words.
        #
less_than_4units:
        sltu    t0, len, 1*NBYTES
        bnez    t0, copy_bytes_checklen
        nop
        #
        # 1) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src), l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst), s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
        ADD     dst, dst, NBYTES
        #
        # 2) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src), l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst), s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
        ADD     dst, dst, NBYTES
        #
        # 3) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src), l_exc)
        SUB     len, len, NBYTES
        ADD     src, src, NBYTES
        ADD     dst, dst, NBYTES
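        # dst was already advanced above, so the store in the branch delay
        # slot below uses a -8 offset (-NBYTES in the 64-bit configuration)
        # to write back the word that was just loaded.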
        b       copy_bytes_checklen
EXC(    STORE   t0, -8(dst), s_exc_p1u)

src_unaligned:
#define rem t8
        SRL     t0, len, LOG_NBYTES+2   # +2 for 4 units/iter
        beqz    t0, cleanup_src_unaligned
        and     rem, len, (4*NBYTES-1)  # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src), l_exc)
EXC(    LDFIRST t1, FIRST(1)(src), l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src), l_exc_copy)
EXC(    LDREST  t1, REST(1)(src), l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src), l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src), l_exc_copy)
EXC(    LDREST  t2, REST(2)(src), l_exc_copy)
EXC(    LDREST  t3, REST(3)(src), l_exc_copy)
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst), s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst), s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst), s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst), s_exc_p1u)
        bne     len, rem, 1b
        ADD     dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
        and     rem, len, NBYTES-1      # rem = len % NBYTES
        beq     rem, len, copy_bytes
        nop
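        # Copy whole (unaligned) words one at a time with an LDFIRST/LDREST
        # pair until only the sub-word tail of rem bytes is left.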
1:
EXC(    LDFIRST t0, FIRST(0)(src), l_exc)
EXC(    LDREST  t0, REST(0)(src), l_exc_copy)
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst), s_exc_p1u)
        ADD     src, src, NBYTES
        bne     len, rem, 1b
        ADD     dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
        nop
copy_bytes:
        /* 0 < len < NBYTES */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), l_exc);     \
        SUB     len, len, 1;            \
        beqz    len, done;              \
EXC(    sb      t0, N(dst), s_exc_p1)

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
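        # Each COPY_BYTE(N) above returns through "done" as soon as len hits
        # zero, so reaching this point means exactly one byte is left: with
        # 0 < len < NBYTES and NBYTES-2 bytes already copied, the last byte
        # sits at offset NBYTES-2.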
EXC(    lb      t0, NBYTES-2(src), l_exc)
        SUB     len, len, 1
        jr      ra
EXC(    sb      t0, NBYTES-2(dst), s_exc_p1)
done:
        jr      ra
        nop
        END(memcpy)

l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
        nop
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src), l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)              # can't fault -- we're copy_from_user
        bne     src, t0, 1b
        ADD     dst, dst, 1
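        # Fall through into l_exc to compute the uncopied count and zero
        # out the rest of the destination buffer.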
l_exc:
        LOAD    t0, TI_TASK($28)
        nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
        nop
        SUB     len, AT, t0             # len = number of uncopied bytes
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         * see (3) above.
         * dst += (fault addr - src) to put dst at the first byte to clear.
         */
        ADD     dst, t0                 # compute start address in dst
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
        beqz    len, done
        SUB     src, len, 1
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        bnez    src, 1b
        SUB     src, src, 1
        jr      ra
        nop

#define SEXC(n)                         \
s_exc_p ## n ## u:                      \
        jr      ra;                     \
        ADD     len, len, n*NBYTES
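
/*
 * SEXC(n) emits the handler taken when a store faults with n words of the
 * current block still unwritten: len has already been decremented by the
 * whole block, so add the n*NBYTES that never reached dst back onto len
 * before returning.
 */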
SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr      ra
        ADD     len, len, 1
s_exc:
        jr      ra
        nop

        .align  5
LEAF(memmove)
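        # If [dst, dst+len) and [src, src+len) don't overlap, fall back to
        # the forward __memcpy; otherwise fall through to __rmemcpy, which
        # copies byte by byte, going backwards when dst lies above src.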
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0              # dst + len <= src -> memcpy
        sltu    t1, a0, t1              # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, __memcpy
        move    v0, a0                  /* return value */
        beqz    a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
        sltu    t0, a1, a0
        beqz    t0, r_end_bytes_up              # src >= dst
        nop
        ADD     a0, a2                          # dst = dst + len
        ADD     a1, a2                          # src = src + len

r_end_bytes:
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        bnez    a2, r_end_bytes
        SUB     a0, a0, 0x1

r_out:
        jr      ra
        move    a2, zero

r_end_bytes_up:
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        bnez    a2, r_end_bytes_up
        ADD     a0, a0, 0x1

        jr      ra
        move    a2, zero
        END(__rmemcpy)