ev6-memset.S 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598
  1. /*
  2. * arch/alpha/lib/ev6-memset.S
  3. *
  4. * This is an efficient (and relatively small) implementation of the C library
  5. * "memset()" function for the 21264 implementation of Alpha.
  6. *
  7. * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
  8. *
  9. * Much of the information about 21264 scheduling/coding comes from:
  10. * Compiler Writer's Guide for the Alpha 21264
  11. * abbreviated as 'CWG' in other comments here
  12. * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  13. * Scheduling notation:
  14. * E - either cluster
  15. * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  16. * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  17. * The algorithm for the leading and trailing quadwords remains the same,
  18. * however the loop has been unrolled to enable better memory throughput,
  19. * and the code has been replicated for each of the entry points: __memset
  20. * and __memsetw to permit better scheduling to eliminate the stalling
  21. * encountered during the mask replication.
  22. * A future enhancement might be to put in a byte store loop for really
  23. * small (say < 32 bytes) memset()s. Whether or not that change would be
  24. * a win in the kernel would depend upon the contextual usage.
  25. * WARNING: Maintaining this is going to be more work than the above version,
  26. * as fixes will need to be made in multiple places. The performance gain
  27. * is worth it.
  28. */
  29. .set noat # the assembler must not use $at as a scratch register on its own
  30. .set noreorder # request that instruction order be left exactly as written (code is hand-scheduled)
  31. .text # everything below is code
  32. .globl __memset # byte-fill entry point
  33. .globl __memsetw # 16-bit-pattern fill entry point
  34. .globl __constant_c_memset # entry point that stores the 64-bit value in $17 as-is
  35. .globl memset # exported name; set equal to __memset at the end of this file
  36. .ent __memset
  37. .align 5 # 2^5 = 32-byte alignment for the entry point
  38. __memset: # in: $16 = dest, $17 = fill byte (only the low 8 bits are used), $18 = byte count; out: $0 = dest
  39. .frame $30,0,$26,0 # no stack frame is used; return address is in $26
  40. .prologue 0
  41. /*
  42. * Serious stalling happens. The only way to mitigate this is to
  43. * undertake a major re-write to interleave the constant materialization
  44. * with other parts of the fall-through code. This is important, even
  45. * though it makes maintenance tougher.
  46. * Do this later.
  47. */
  48. and $17,255,$1 # E : 00000000000000ch
  49. insbl $17,1,$2 # U : 000000000000ch00
  50. bis $16,$16,$0 # E : return value
  51. ble $18,end_b # U : zero length requested? (signed test: any count <= 0 returns at once)
  52. addq $18,$16,$6 # E : $6 = dest + count, i.e. the first address past the region
  53. bis $1,$2,$17 # E : 000000000000chch
  54. insbl $1,2,$3 # U : 0000000000ch0000
  55. insbl $1,3,$4 # U : 00000000ch000000
  56. or $3,$4,$3 # E : 00000000chch0000
  57. inswl $17,4,$5 # U : 0000chch00000000
  58. xor $16,$6,$1 # E : will complete write be within one quadword?
  59. inswl $17,6,$2 # U : chch000000000000
  60. or $17,$3,$17 # E : 00000000chchchch
  61. or $2,$5,$2 # E : chchchch00000000
  62. bic $1,7,$1 # E : fit within a single quadword? ($1 == 0 iff dest and dest+count agree above the low 3 bits)
  63. and $16,7,$3 # E : Target addr misalignment
  64. or $17,$2,$17 # E : chchchchchchchch
  65. beq $1,within_quad_b # U : start and end fall in the same quadword: single read-modify-write
  66. nop # E :
  67. beq $3,aligned_b # U : target is 0mod8
  68. /*
  69. * Target address is misaligned, and won't fit within a quadword
  70. */
  71. ldq_u $4,0($16) # L : Fetch first partial
  72. bis $16,$16,$5 # E : Save the address
  73. insql $17,$16,$2 # U : Insert new bytes (pattern placed from the dest byte offset upward)
  74. subq $3,8,$3 # E : Invert (for addressing uses): $3 = misalignment - 8 = -(bytes up to the next quad boundary)
  75. addq $18,$3,$18 # E : $18 is new count ($3 is negative)
  76. mskql $4,$16,$4 # U : clear relevant parts of the quad (keeps only the old bytes below dest)
  77. subq $16,$3,$16 # E : $16 is new aligned destination
  78. bis $2,$4,$1 # E : Final bytes
  79. nop
  80. stq_u $1,0($5) # L : Store result
  81. nop
  82. nop
  83. .align 4
  84. aligned_b:
  85. /*
  86. * We are now guaranteed to be quad aligned. What remains is zero or
  87. * more whole quads followed by 0..7 trailing bytes (possibly nothing).
  88. */
  89. sra $18,3,$3 # U : Number of remaining quads to write
  90. and $18,7,$18 # E : Number of trailing bytes to write
  91. bis $16,$16,$5 # E : Save dest address
  92. beq $3,no_quad_b # U : tail stuff only
  93. /*
  94. * it's worth the effort to unroll this and use wh64 if possible
  95. * Lifted a bunch of code from clear_user.S
  96. * At this point, entry values are:
  97. * $16 Current destination address
  98. * $5 A copy of $16
  99. * $6 End address (original dest + count); its low 3 bits drive the tail masks
  100. * $18 Number trailer bytes
  101. * $3 Number quads to write
  102. */
  103. and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop): $2 = dest mod 64
  104. subq $3, 16, $4 # E : Only try to unroll if >= 128 bytes (16 quads)
  105. subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64): $1 = (dest mod 64) - 64
  106. blt $4, loop_b # U : fewer than 16 quads: use the simple loop
  107. /*
  108. * We know we've got at least 16 quads, minimum of one trip
  109. * through unrolled loop. Do a quad at a time to get us 0mod64
  110. * aligned.
  111. */
  112. nop # E :
  113. nop # E :
  114. nop # E :
  115. beq $1, $bigalign_b # U : NOTE(review): $1 is in [-64,-1] here, so this branch looks never taken - confirm
  116. $alignmod64_b:
  117. stq $17, 0($5) # L : store one quad while walking up to a 64-byte boundary
  118. subq $3, 1, $3 # E : For consistency later
  119. addq $1, 8, $1 # E : Increment towards zero for alignment
  120. addq $5, 8, $4 # E : Initial wh64 address (filler instruction): on loop exit $4 == the aligned $5
  121. nop
  122. nop
  123. addq $5, 8, $5 # E : Inc address
  124. blt $1, $alignmod64_b # U : repeat until $1 reaches zero
  125. $bigalign_b:
  126. /*
  127. * $3 - number quads left to go
  128. * $5 - target address (aligned 0mod64)
  129. * $17 - mask of stuff to store
  130. * Scratch registers available: $7, $2, $4, $1
  131. * we know that we'll be taking a minimum of one trip through the loop
  132. * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
  133. * Assumes the wh64 needs to be for 2 trips through the loop in the future
  134. * The wh64 address is computed for the starting destination address of trip +2
  135. * through the loop, and if there are less than two trips left, the target
  136. * address will be for the trip that issues it.
  137. */
  138. $do_wh64_b:
  139. wh64 ($4) # L1 : memory subsystem write hint
  140. subq $3, 24, $2 # E : For determining future wh64 addresses (negative if < 24 quads remain)
  141. stq $17, 0($5) # L :
  142. nop # E :
  143. addq $5, 128, $4 # E : speculative target of next wh64
  144. stq $17, 8($5) # L :
  145. stq $17, 16($5) # L :
  146. addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
  147. stq $17, 24($5) # L :
  148. stq $17, 32($5) # L :
  149. cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle; use the fallback when < 24 quads remain
  150. nop
  151. stq $17, 40($5) # L :
  152. stq $17, 48($5) # L :
  153. subq $3, 16, $2 # E : Repeat the loop at least once more?
  154. nop
  155. stq $17, 56($5) # L :
  156. addq $5, 64, $5 # E : advance past the 64 bytes just written
  157. subq $3, 8, $3 # E : eight quads done
  158. bge $2, $do_wh64_b # U : loop while at least 8 quads are still left
  159. nop
  160. nop
  161. nop
  162. beq $3, no_quad_b # U : Might have finished already
  163. .align 4
  164. /*
  165. * Simple loop for trailing quadwords, or for small amounts
  166. * of data (where we can't use an unrolled loop and wh64)
  167. */
  168. loop_b:
  169. stq $17,0($5) # L : store one quad of pattern
  170. subq $3,1,$3 # E : Decrement number quads left
  171. addq $5,8,$5 # E : Inc address
  172. bne $3,loop_b # U : more?
  173. no_quad_b:
  174. /*
  175. * Write 0..7 trailing bytes.
  176. */
  177. nop # E :
  178. beq $18,end_b # U : All done?
  179. ldq $7,0($5) # L : fetch the quad that holds the trailing bytes
  180. mskqh $7,$6,$2 # U : Mask final quad (keep the old bytes at and above the end offset)
  181. insqh $17,$6,$4 # U : New bits (pattern placed in the bytes below the end offset)
  182. bis $2,$4,$1 # E : Put it all together
  183. stq $1,0($5) # L : And back to memory
  184. ret $31,($26),1 # L0 :
  185. within_quad_b:
  186. ldq_u $1,0($16) # L : fetch the single quad containing the whole region
  187. insql $17,$16,$2 # U : New bits
  188. mskql $1,$16,$4 # U : Clear old (keep only the old bytes below the start offset)
  189. bis $2,$4,$2 # E : New result
  190. mskql $2,$6,$4 # U : keep the merged bytes below the end offset
  191. mskqh $1,$6,$2 # U : keep the original bytes at and above the end offset
  192. bis $2,$4,$1 # E : combine both halves
  193. stq_u $1,0($16) # L : write the quad back
  194. end_b:
  195. nop
  196. nop
  197. nop
  198. ret $31,($26),1 # L0 :
  199. .end __memset
  200. /*
  201. * This is the original body of code, prior to replication and
  202. * rescheduling. Leave it here, as there may be calls to this
  203. * entry point.
  204. */
  205. .align 4
  206. .ent __constant_c_memset
  207. __constant_c_memset: # in: $16 = dest, $17 = 64-bit value stored as-is (not replicated here), $18 = byte count; out: $0 = dest
  208. .frame $30,0,$26,0 # no stack frame is used; return address is in $26
  209. .prologue 0
  210. addq $18,$16,$6 # E : $6 = dest + count, i.e. the first address past the region
  211. bis $16,$16,$0 # E : return value
  212. xor $16,$6,$1 # E : will complete write be within one quadword?
  213. ble $18,end # U : zero length requested? (signed test: any count <= 0 returns at once)
  214. bic $1,7,$1 # E : fit within a single quadword ($1 == 0 iff dest and dest+count agree above the low 3 bits)
  215. beq $1,within_one_quad # U : start and end fall in the same quadword: single read-modify-write
  216. and $16,7,$3 # E : Target addr misalignment
  217. beq $3,aligned # U : target is 0mod8
  218. /*
  219. * Target address is misaligned, and won't fit within a quadword
  220. */
  221. ldq_u $4,0($16) # L : Fetch first partial
  222. bis $16,$16,$5 # E : Save the address
  223. insql $17,$16,$2 # U : Insert new bytes (pattern placed from the dest byte offset upward)
  224. subq $3,8,$3 # E : Invert (for addressing uses): $3 = misalignment - 8 = -(bytes up to the next quad boundary)
  225. addq $18,$3,$18 # E : $18 is new count ($3 is negative)
  226. mskql $4,$16,$4 # U : clear relevant parts of the quad (keeps only the old bytes below dest)
  227. subq $16,$3,$16 # E : $16 is new aligned destination
  228. bis $2,$4,$1 # E : Final bytes
  229. nop
  230. stq_u $1,0($5) # L : Store result
  231. nop
  232. nop
  233. .align 4
  234. aligned:
  235. /*
  236. * We are now guaranteed to be quad aligned. What remains is zero or
  237. * more whole quads followed by 0..7 trailing bytes (possibly nothing).
  238. */
  239. sra $18,3,$3 # U : Number of remaining quads to write
  240. and $18,7,$18 # E : Number of trailing bytes to write
  241. bis $16,$16,$5 # E : Save dest address
  242. beq $3,no_quad # U : tail stuff only
  243. /*
  244. * it's worth the effort to unroll this and use wh64 if possible
  245. * Lifted a bunch of code from clear_user.S
  246. * At this point, entry values are:
  247. * $16 Current destination address
  248. * $5 A copy of $16
  249. * $6 End address (original dest + count); its low 3 bits drive the tail masks
  250. * $18 Number trailer bytes
  251. * $3 Number quads to write
  252. */
  253. and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop): $2 = dest mod 64
  254. subq $3, 16, $4 # E : Only try to unroll if >= 128 bytes (16 quads)
  255. subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64): $1 = (dest mod 64) - 64
  256. blt $4, loop # U : fewer than 16 quads: use the simple loop
  257. /*
  258. * We know we've got at least 16 quads, minimum of one trip
  259. * through unrolled loop. Do a quad at a time to get us 0mod64
  260. * aligned.
  261. */
  262. nop # E :
  263. nop # E :
  264. nop # E :
  265. beq $1, $bigalign # U : NOTE(review): $1 is in [-64,-1] here, so this branch looks never taken - confirm
  266. $alignmod64:
  267. stq $17, 0($5) # L : store one quad while walking up to a 64-byte boundary
  268. subq $3, 1, $3 # E : For consistency later
  269. addq $1, 8, $1 # E : Increment towards zero for alignment
  270. addq $5, 8, $4 # E : Initial wh64 address (filler instruction): on loop exit $4 == the aligned $5
  271. nop
  272. nop
  273. addq $5, 8, $5 # E : Inc address
  274. blt $1, $alignmod64 # U : repeat until $1 reaches zero
  275. $bigalign:
  276. /*
  277. * $3 - number quads left to go
  278. * $5 - target address (aligned 0mod64)
  279. * $17 - mask of stuff to store
  280. * Scratch registers available: $7, $2, $4, $1
  281. * we know that we'll be taking a minimum of one trip through the loop
  282. * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
  283. * Assumes the wh64 needs to be for 2 trips through the loop in the future
  284. * The wh64 address is computed for the starting destination address of trip +2
  285. * through the loop, and if there are less than two trips left, the target
  286. * address will be for the trip that issues it.
  287. */
  288. $do_wh64:
  289. wh64 ($4) # L1 : memory subsystem write hint
  290. subq $3, 24, $2 # E : For determining future wh64 addresses (negative if < 24 quads remain)
  291. stq $17, 0($5) # L :
  292. nop # E :
  293. addq $5, 128, $4 # E : speculative target of next wh64
  294. stq $17, 8($5) # L :
  295. stq $17, 16($5) # L :
  296. addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
  297. stq $17, 24($5) # L :
  298. stq $17, 32($5) # L :
  299. cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle; use the fallback when < 24 quads remain
  300. nop
  301. stq $17, 40($5) # L :
  302. stq $17, 48($5) # L :
  303. subq $3, 16, $2 # E : Repeat the loop at least once more?
  304. nop
  305. stq $17, 56($5) # L :
  306. addq $5, 64, $5 # E : advance past the 64 bytes just written
  307. subq $3, 8, $3 # E : eight quads done
  308. bge $2, $do_wh64 # U : loop while at least 8 quads are still left
  309. nop
  310. nop
  311. nop
  312. beq $3, no_quad # U : Might have finished already
  313. .align 4
  314. /*
  315. * Simple loop for trailing quadwords, or for small amounts
  316. * of data (where we can't use an unrolled loop and wh64)
  317. */
  318. loop:
  319. stq $17,0($5) # L : store one quad of pattern
  320. subq $3,1,$3 # E : Decrement number quads left
  321. addq $5,8,$5 # E : Inc address
  322. bne $3,loop # U : more?
  323. no_quad:
  324. /*
  325. * Write 0..7 trailing bytes.
  326. */
  327. nop # E :
  328. beq $18,end # U : All done?
  329. ldq $7,0($5) # L : fetch the quad that holds the trailing bytes
  330. mskqh $7,$6,$2 # U : Mask final quad (keep the old bytes at and above the end offset)
  331. insqh $17,$6,$4 # U : New bits (pattern placed in the bytes below the end offset)
  332. bis $2,$4,$1 # E : Put it all together
  333. stq $1,0($5) # L : And back to memory
  334. ret $31,($26),1 # L0 :
  335. within_one_quad:
  336. ldq_u $1,0($16) # L : fetch the single quad containing the whole region
  337. insql $17,$16,$2 # U : New bits
  338. mskql $1,$16,$4 # U : Clear old (keep only the old bytes below the start offset)
  339. bis $2,$4,$2 # E : New result
  340. mskql $2,$6,$4 # U : keep the merged bytes below the end offset
  341. mskqh $1,$6,$2 # U : keep the original bytes at and above the end offset
  342. bis $2,$4,$1 # E : combine both halves
  343. stq_u $1,0($16) # L : write the quad back
  344. end:
  345. nop
  346. nop
  347. nop
  348. ret $31,($26),1 # L0 :
  349. .end __constant_c_memset
  350. /*
  351. * This is a replica of the __constant_c_memset code, rescheduled
  352. * to mask stalls. Note that the internal label names also had to change (_w suffix)
  353. */
  354. .align 5 # 2^5 = 32-byte alignment for the entry point
  355. .ent __memsetw
  356. __memsetw: # in: $16 = dest, $17 = 16-bit fill value (low word used), $18 = byte count; out: $0 = dest
  357. .frame $30,0,$26,0 # no stack frame is used; return address is in $26
  358. .prologue 0
  359. inswl $17,0,$5 # U : 000000000000c1c2
  360. inswl $17,2,$2 # U : 00000000c1c20000
  361. bis $16,$16,$0 # E : return value
  362. addq $18,$16,$6 # E : $6 = dest + count, i.e. the first address past the region
  363. ble $18, end_w # U : zero length requested? (signed test: any count <= 0 returns at once)
  364. inswl $17,4,$3 # U : 0000c1c200000000
  365. inswl $17,6,$4 # U : c1c2000000000000
  366. xor $16,$6,$1 # E : will complete write be within one quadword?
  367. or $2,$5,$2 # E : 00000000c1c2c1c2
  368. or $3,$4,$17 # E : c1c2c1c200000000
  369. bic $1,7,$1 # E : fit within a single quadword ($1 == 0 iff dest and dest+count agree above the low 3 bits)
  370. and $16,7,$3 # E : Target addr misalignment
  371. or $17,$2,$17 # E : c1c2c1c2c1c2c1c2
  372. beq $1,within_quad_w # U : start and end fall in the same quadword: single read-modify-write
  373. nop
  374. beq $3,aligned_w # U : target is 0mod8
  375. /*
  376. * Target address is misaligned, and won't fit within a quadword
  377. */
  378. ldq_u $4,0($16) # L : Fetch first partial
  379. bis $16,$16,$5 # E : Save the address
  380. insql $17,$16,$2 # U : Insert new bytes (pattern placed from the dest byte offset upward)
  381. subq $3,8,$3 # E : Invert (for addressing uses): $3 = misalignment - 8 = -(bytes up to the next quad boundary)
  382. addq $18,$3,$18 # E : $18 is new count ($3 is negative)
  383. mskql $4,$16,$4 # U : clear relevant parts of the quad (keeps only the old bytes below dest)
  384. subq $16,$3,$16 # E : $16 is new aligned destination
  385. bis $2,$4,$1 # E : Final bytes
  386. nop
  387. stq_u $1,0($5) # L : Store result
  388. nop
  389. nop
  390. .align 4
  391. aligned_w:
  392. /*
  393. * We are now guaranteed to be quad aligned. What remains is zero or
  394. * more whole quads followed by 0..7 trailing bytes (possibly nothing).
  395. */
  396. sra $18,3,$3 # U : Number of remaining quads to write
  397. and $18,7,$18 # E : Number of trailing bytes to write
  398. bis $16,$16,$5 # E : Save dest address
  399. beq $3,no_quad_w # U : tail stuff only
  400. /*
  401. * it's worth the effort to unroll this and use wh64 if possible
  402. * Lifted a bunch of code from clear_user.S
  403. * At this point, entry values are:
  404. * $16 Current destination address
  405. * $5 A copy of $16
  406. * $6 End address (original dest + count); its low 3 bits drive the tail masks
  407. * $18 Number trailer bytes
  408. * $3 Number quads to write
  409. */
  410. and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop): $2 = dest mod 64
  411. subq $3, 16, $4 # E : Only try to unroll if >= 128 bytes (16 quads)
  412. subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64): $1 = (dest mod 64) - 64
  413. blt $4, loop_w # U : fewer than 16 quads: use the simple loop
  414. /*
  415. * We know we've got at least 16 quads, minimum of one trip
  416. * through unrolled loop. Do a quad at a time to get us 0mod64
  417. * aligned.
  418. */
  419. nop # E :
  420. nop # E :
  421. nop # E :
  422. beq $1, $bigalign_w # U : NOTE(review): $1 is in [-64,-1] here, so this branch looks never taken - confirm
  423. $alignmod64_w:
  424. stq $17, 0($5) # L : store one quad while walking up to a 64-byte boundary
  425. subq $3, 1, $3 # E : For consistency later
  426. addq $1, 8, $1 # E : Increment towards zero for alignment
  427. addq $5, 8, $4 # E : Initial wh64 address (filler instruction): on loop exit $4 == the aligned $5
  428. nop
  429. nop
  430. addq $5, 8, $5 # E : Inc address
  431. blt $1, $alignmod64_w # U : repeat until $1 reaches zero
  432. $bigalign_w:
  433. /*
  434. * $3 - number quads left to go
  435. * $5 - target address (aligned 0mod64)
  436. * $17 - mask of stuff to store
  437. * Scratch registers available: $7, $2, $4, $1
  438. * we know that we'll be taking a minimum of one trip through the loop
  439. * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
  440. * Assumes the wh64 needs to be for 2 trips through the loop in the future
  441. * The wh64 address is computed for the starting destination address of trip +2
  442. * through the loop, and if there are less than two trips left, the target
  443. * address will be for the trip that issues it.
  444. */
  445. $do_wh64_w:
  446. wh64 ($4) # L1 : memory subsystem write hint
  447. subq $3, 24, $2 # E : For determining future wh64 addresses (negative if < 24 quads remain)
  448. stq $17, 0($5) # L :
  449. nop # E :
  450. addq $5, 128, $4 # E : speculative target of next wh64
  451. stq $17, 8($5) # L :
  452. stq $17, 16($5) # L :
  453. addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
  454. stq $17, 24($5) # L :
  455. stq $17, 32($5) # L :
  456. cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle; use the fallback when < 24 quads remain
  457. nop
  458. stq $17, 40($5) # L :
  459. stq $17, 48($5) # L :
  460. subq $3, 16, $2 # E : Repeat the loop at least once more?
  461. nop
  462. stq $17, 56($5) # L :
  463. addq $5, 64, $5 # E : advance past the 64 bytes just written
  464. subq $3, 8, $3 # E : eight quads done
  465. bge $2, $do_wh64_w # U : loop while at least 8 quads are still left
  466. nop
  467. nop
  468. nop
  469. beq $3, no_quad_w # U : Might have finished already
  470. .align 4
  471. /*
  472. * Simple loop for trailing quadwords, or for small amounts
  473. * of data (where we can't use an unrolled loop and wh64)
  474. */
  475. loop_w:
  476. stq $17,0($5) # L : store one quad of pattern
  477. subq $3,1,$3 # E : Decrement number quads left
  478. addq $5,8,$5 # E : Inc address
  479. bne $3,loop_w # U : more?
  480. no_quad_w:
  481. /*
  482. * Write 0..7 trailing bytes.
  483. */
  484. nop # E :
  485. beq $18,end_w # U : All done?
  486. ldq $7,0($5) # L : fetch the quad that holds the trailing bytes
  487. mskqh $7,$6,$2 # U : Mask final quad (keep the old bytes at and above the end offset)
  488. insqh $17,$6,$4 # U : New bits (pattern placed in the bytes below the end offset)
  489. bis $2,$4,$1 # E : Put it all together
  490. stq $1,0($5) # L : And back to memory
  491. ret $31,($26),1 # L0 :
  492. within_quad_w:
  493. ldq_u $1,0($16) # L : fetch the single quad containing the whole region
  494. insql $17,$16,$2 # U : New bits
  495. mskql $1,$16,$4 # U : Clear old (keep only the old bytes below the start offset)
  496. bis $2,$4,$2 # E : New result
  497. mskql $2,$6,$4 # U : keep the merged bytes below the end offset
  498. mskqh $1,$6,$2 # U : keep the original bytes at and above the end offset
  499. bis $2,$4,$1 # E : combine both halves
  500. stq_u $1,0($16) # L : write the quad back
  501. end_w:
  502. nop
  503. nop
  504. nop
  505. ret $31,($26),1 # L0 :
  506. .end __memsetw
  507. memset = __memset # the exported symbol memset is set to the address of __memset