ev6-divide.S 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. /*
  2. * arch/alpha/lib/ev6-divide.S
  3. *
  4. * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
  5. *
  6. * Alpha division..
  7. */
  8. /*
  9. * The alpha chip doesn't provide hardware division, so we have to do it
  10. * by hand. The compiler expects the functions
  11. *
  12. * __divqu: 64-bit unsigned long divide
  13. * __remqu: 64-bit unsigned long remainder
  14. * __divq/__remq: signed 64-bit
  15. * __divlu/__remlu: unsigned 32-bit
  16. * __divl/__reml: signed 32-bit
  17. *
  18. * These are not normal C functions: instead of the normal
  19. * calling sequence, these expect their arguments in registers
  20. * $24 and $25, and return the result in $27. Register $28 may
  21. * be clobbered (assembly temporary), anything else must be saved.
  22. *
  23. * In short: painful.
  24. *
  25. * This is a rather simple bit-at-a-time algorithm: it's very good
  26. * at dividing random 64-bit numbers, but the more usual case where
  27. * the divisor is small is handled better by the DEC algorithm
  28. * using lookup tables. This uses much less memory, though, and is
  29. * nicer on the cache.. Besides, I don't know the copyright status
  30. * of the DEC code.
  31. */
  32. /*
  33. * My temporaries:
  34. * $0 - current bit
  35. * $1 - shifted divisor
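  36. * $2 - modulus (when DIV is defined) / quotient (otherwise)
      * $3 - tmp1: candidate modulus (modulus - divisor)
      * $4 - tmp2: candidate quotient (quotient + mask), DIV builds only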
  37. *
  38. * $23 - return address
  39. * $24 - dividend
  40. * $25 - divisor
  41. *
  42. * $27 - result: quotient (when DIV is defined) / modulus (otherwise)
  43. * $28 - compare status
  44. *
  45. * Much of the information about 21264 scheduling/coding comes from:
  46. * Compiler Writer's Guide for the Alpha 21264
  47. * abbreviated as 'CWG' in other comments here
  48. * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  49. * Scheduling notation:
  50. * E - either cluster
  51. * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  52. * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  53. * Try not to change the actual algorithm if possible for consistency.
  54. */
  55. #include <asm/export.h>
  /* Emits the longword 0; not referenced by any code visible in this section. */
  56. #define halt .long 0
  57. /*
  58. * Select function type and registers
  59. */
  /*
   * Working registers shared by both builds:
   *   mask    - single set bit marking the quotient bit currently tried
   *   divisor - working copy of the divisor, shifted left and then right
   *   compare - result of the cmpult/cmpule tests
   *   tmp1    - modulus - divisor, committed to modulus with cmovne
   *   tmp2    - quotient + mask, committed to quotient with cmovne
   *             (only used when DIV is defined)
   */
  60. #define mask $0
  61. #define divisor $1
  62. #define compare $28
  63. #define tmp1 $3
  64. #define tmp2 $4
  /*
   * Division build: the quotient is accumulated directly in the return
   * register $27 and the running modulus lives in $2. DIV_ONLY() emits
   * its arguments and MOD_ONLY() discards them. GETSIGN(x) writes
   * $24 xor $25 into x, so x is negative exactly when the operand signs
   * differ. The unsigned routine stores five quadwords (offsets 0..32)
   * in a STACK-byte frame.
   */
  65. #ifdef DIV
  66. #define DIV_ONLY(x,y...) x,##y
  67. #define MOD_ONLY(x,y...)
  68. #define func(x) __div##x
  69. #define modulus $2
  70. #define quotient $27
  71. #define GETSIGN(x) xor $24,$25,x
  72. #define STACK 48
  /*
   * Remainder build: the roles of $2 and $27 are swapped so that the
   * modulus ends up in the return register. DIV_ONLY() discards its
   * arguments and MOD_ONLY() emits them. GETSIGN(x) copies $24 into x,
   * so the sign of the result follows the dividend. The unsigned routine
   * stores four quadwords (offsets 0..24) in a STACK-byte frame.
   */
  73. #else
  74. #define DIV_ONLY(x,y...)
  75. #define MOD_ONLY(x,y...) x,##y
  76. #define func(x) __rem##x
  77. #define modulus $27
  78. #define quotient $2
  79. #define GETSIGN(x) bis $24,$24,x
  80. #define STACK 32
  81. #endif
  82. /*
  83. * For 32-bit operations, we need to extend to 64-bit
  84. */
  /*
   * INTSIZE defined: build the 32-bit entry points. Through func() the
   * names become __divlu/__remlu (unsigned) and __divl/__reml (signed).
   *   LONGIFY(x)  - zapnot with byte mask 15 keeps the low four bytes of
   *                 x and clears the upper four (zero-extension).
   *   SLONGIFY(x) - addl x,0,x is a 32-bit add, so x is replaced by its
   *                 low 32 bits sign-extended to 64 bits.
   */
  85. #ifdef INTSIZE
  86. #define ufunction func(lu)
  87. #define sfunction func(l)
  88. #define LONGIFY(x) zapnot x,15,x
  89. #define SLONGIFY(x) addl x,0,x
  /*
   * INTSIZE not defined: build the 64-bit entry points __divqu/__remqu
   * (unsigned) and __divq/__remq (signed). Both extension macros expand
   * to nothing because the operands already fill the register.
   */
  90. #else
  91. #define ufunction func(qu)
  92. #define sfunction func(q)
  93. #define LONGIFY(x)
  94. #define SLONGIFY(x)
  95. #endif
  /*
   * ufunction: unsigned divide (DIV defined) or unsigned remainder
   * (DIV not defined), 32-bit when INTSIZE is defined, else 64-bit.
   *
   * In:   $24 = dividend, $25 = divisor, $23 = return address
   * Out:  $27 = quotient (DIV) or remainder (otherwise)
   * Uses: $28 as scratch without saving it; $0, $1, $2, tmp1 and (DIV
   *       only) tmp2 are saved in a STACK-byte frame and restored
   *       before returning. $24 and $25 are only read.
   *
   * A zero divisor does not trap: the routine returns with $27 = 0 for
   * a divide and $27 = the (LONGIFY'd) dividend for a remainder.
   *
   * The nop/unop instructions are scheduling padding; see the E/U/L
   * notes on each line.
   */
  96. .set noat
  97. .align 4
  98. .globl ufunction
  99. .ent ufunction
  100. ufunction:
  101. subq $30,STACK,$30 # E :
  102. .frame $30,STACK,$23
  103. .prologue 0
  /*
   * Label 7 is the common body. sfunction branches here (7b) when
   * neither operand is negative, having already allocated a frame of
   * the same STACK size itself.
   *
   * Save the registers about to be overwritten and set up the working
   * state: divisor = $25, modulus = $24, quotient = 0, mask = 1.
   */
  104. 7: stq $1, 0($30) # L :
  105. bis $25,$25,divisor # E :
  106. stq $2, 8($30) # L : L U L U
  107. bis $24,$24,modulus # E :
  108. stq $0,16($30) # L :
  109. bis $31,$31,quotient # E :
  110. LONGIFY(divisor) # E : U L L U
  111. stq tmp1,24($30) # L :
  112. LONGIFY(modulus) # E :
  113. bis $31,1,mask # E :
  114. DIV_ONLY(stq tmp2,32($30)) # L : L U U L
  /*
   * Zero divisor: skip both loops. quotient is still 0 and modulus
   * still holds the dividend, and whichever of them is $27 is returned.
   */
  115. beq divisor, 9f /* div by zero */
  116. /*
  117. * In spite of the DIV_ONLY being either a non-instruction
  118. * or an actual stq, the addition of the .align directive
  119. * below ensures that label 1 is going to be nicely aligned
  120. */
  121. .align 4
  122. #ifdef INTSIZE
  123. /*
  124. * shift divisor left, using 3-bit shifts for
  125. * 32-bit divides as we can't overflow. Three-bit
  126. * shifts will result in looping three times less
  127. * here, but can result in two loops more later.
  128. * Thus using a large shift isn't worth it (and
  129. * s8add pairs better than a sll..)
  130. */
  /*
   * compare is taken from the values before the shift, and the shift
   * (s8addq x,$31,x, i.e. x * 8) is applied on every pass, including
   * the final one that falls through.
   */
  131. 1: cmpult divisor,modulus,compare # E :
  132. s8addq divisor,$31,divisor # E :
  133. s8addq mask,$31,mask # E :
  134. bne compare,1b # U : U L U L
  135. #else
  /*
   * 64-bit case: shift divisor and mask left one bit at a time while
   * divisor < modulus. The blt leaves the loop before shifting once
   * the divisor's top bit is set, so that bit is never shifted out.
   */
  136. 1: cmpult divisor,modulus,compare # E :
  137. nop # E :
  138. nop # E :
  139. blt divisor, 2f # U : U L U L
  140. addq divisor,divisor,divisor # E :
  141. addq mask,mask,mask # E :
  142. unop # E :
  143. bne compare,1b # U : U L U L
  144. #endif
  145. /* ok, start to go right again.. */
  /*
   * Loop 2, one iteration per mask position:
   *   tmp2 = quotient + mask          (DIV only, uses the unshifted mask)
   *   tmp1 = modulus - divisor
   *   if (divisor <= modulus) { quotient = tmp2 (DIV only); modulus = tmp1; }
   *   mask >>= 1; divisor >>= 1;
   * The loop ends when mask has been shifted down to zero.
   */
  146. 2:
  147. /*
  148. * Keep things nicely bundled... use a nop instead of not
  149. * having an instruction for DIV_ONLY
  150. */
  151. #ifdef DIV
  152. DIV_ONLY(addq quotient,mask,tmp2) # E :
  153. #else
  154. nop # E :
  155. #endif
  156. srl mask,1,mask # U :
  157. cmpule divisor,modulus,compare # E :
  158. subq modulus,divisor,tmp1 # E :
  159. #ifdef DIV
  160. DIV_ONLY(cmovne compare,tmp2,quotient) # E : Latency 2, extra map slot
  161. nop # E : as part of the cmovne
  162. srl divisor,1,divisor # U :
  163. nop # E : L U L U
  164. nop # E :
  165. cmovne compare,tmp1,modulus # E : Latency 2, extra map slot
  166. nop # E : as part of the cmovne
  167. bne mask,2b # U : U L U L
  168. #else
  169. srl divisor,1,divisor # U :
  170. cmovne compare,tmp1,modulus # E : Latency 2, extra map slot
  171. nop # E : as part of the cmovne
  172. bne mask,2b # U : U L L U
  173. #endif
  /*
   * Common exit (also the divide-by-zero target): reload the saved
   * registers, release the frame and return through $23. $27 is left
   * holding the result.
   */
  174. 9: ldq $1, 0($30) # L :
  175. ldq $2, 8($30) # L :
  176. nop # E :
  177. nop # E : U U L L
  178. ldq $0,16($30) # L :
  179. ldq tmp1,24($30) # L :
  180. nop # E :
  181. nop # E :
  182. #ifdef DIV
  183. DIV_ONLY(ldq tmp2,32($30)) # L :
  184. #else
  185. nop # E :
  186. #endif
  187. addq $30,STACK,$30 # E :
  188. ret $31,($23),1 # L0 : L U U L
  189. .end ufunction
  190. EXPORT_SYMBOL(ufunction)
  191. /*
  192. * Uhh.. Ugly signed division. I'd rather not have it at all, but
  193. * it's needed in some circumstances. There are different ways to
  194. * handle this, really. This does:
  195. * -a / b = a / -b = -(a / b)
  196. * -a % b = -(a % b)
  197. * a % -b = a % b
  198. * which is probably not the best solution, but at least should
  199. * have the property that (x/y)*y + (x%y) = x.
  200. */
  /*
   * sfunction: signed divide (DIV defined) or signed remainder
   * (otherwise), 32-bit when INTSIZE is defined, else 64-bit.
   *
   * In:   $24 = dividend, $25 = divisor, $23 = return address
   * Out:  $27 = result
   * Uses: $28 as scratch without saving it. $24, $25, $23 and tmp1 are
   *       saved in this routine's frame and restored before returning.
   *
   * Implemented as a wrapper around ufunction: divide the absolute
   * values, then negate the result if the sign chosen by GETSIGN is
   * negative.
   */
  201. .align 4
  202. .globl sfunction
  203. .ent sfunction
  204. sfunction:
  205. subq $30,STACK,$30 # E :
  206. .frame $30,STACK,$23
  207. .prologue 0
  /*
   * Fast path: OR the operands; if the result is non-negative (after
   * 32-bit sign extension in INTSIZE builds) neither operand is
   * negative, so continue in ufunction's body at label 7, reusing the
   * frame allocated above. That path returns directly to the caller.
   */
  208. bis $24,$25,$28 # E :
  209. SLONGIFY($28) # E :
  210. bge $28,7b # U :
  /*
   * Slow path: keep the original operands, the return address (bsr
   * overwrites $23) and tmp1 in the frame, and replace $24 and $25 by
   * their negations when they are negative.
   */
  211. stq $24,0($30) # L :
  212. subq $31,$24,$28 # E :
  213. stq $25,8($30) # L :
  214. nop # E : U L U L
  215. cmovlt $24,$28,$24 /* abs($24) */ # E : Latency 2, extra map slot
  216. nop # E : as part of the cmov
  217. stq $23,16($30) # L :
  218. subq $31,$25,$28 # E : U L U L
  219. stq tmp1,24($30) # L :
  220. cmovlt $25,$28,$25 /* abs($25) */ # E : Latency 2, extra map slot
  221. nop # E :
  /* Full call: ufunction allocates and releases its own frame. */
  222. bsr $23,ufunction # L0: L U L U
  /*
   * Restore the caller's $24 and $25, then derive the result sign from
   * them: GETSIGN gives $24 xor $25 for a divide and $24 for a
   * remainder. tmp1 = -$27 replaces $27 when that sign value is
   * negative.
   */
  223. ldq $24,0($30) # L :
  224. ldq $25,8($30) # L :
  225. GETSIGN($28) # E :
  226. subq $31,$27,tmp1 # E : U U L L
  227. SLONGIFY($28) # E :
  228. ldq $23,16($30) # L :
  229. cmovlt $28,tmp1,$27 # E : Latency 2, extra map slot
  230. nop # E : U L L U : as part of the cmov
  /* Restore tmp1, release the frame and return through the reloaded $23. */
  231. ldq tmp1,24($30) # L :
  232. nop # E : as part of the cmov
  233. addq $30,STACK,$30 # E :
  234. ret $31,($23),1 # L0 : L U U L
  235. .end sfunction
  236. EXPORT_SYMBOL(sfunction)