stan.S 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. |
  2. | stan.sa 3.3 7/29/91
  3. |
  4. | The entry point stan computes the tangent of
  5. | an input argument;
  6. | stand does the same except for denormalized input.
  7. |
  8. | Input: Double-extended number X in location pointed to
  9. | by address register a0.
  10. |
  11. | Output: The value tan(X) returned in floating-point register Fp0.
  12. |
  13. | Accuracy and Monotonicity: The returned result is within 3 ulp in
  14. | 64 significant bit, i.e. within 0.5001 ulp to 53 bits if the
  15. | result is subsequently rounded to double precision. The
  16. | result is provably monotonic in double precision.
  17. |
  18. | Speed: The program sTAN takes approximately 170 cycles for
  19. | input argument X such that |X| < 15Pi, which is the usual
  20. | situation.
  21. |
  22. | Algorithm:
  23. |
  24. | 1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
  25. |
  26. | 2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
  27. | k = N mod 2, so in particular, k = 0 or 1.
  28. |
  29. | 3. If k is odd, go to 5.
  30. |
  31. | 4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
  32. | rational function U/V where
  33. | U = r + r*s*(P1 + s*(P2 + s*P3)), and
  34. | V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r.
  35. | Exit.
  36. |
  37. | 4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
  38. | rational function U/V where
  39. | U = r + r*s*(P1 + s*(P2 + s*P3)), and
  40. | V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
  41. | -Cot(r) = -V/U. Exit.
  42. |
  43. | 6. If |X| > 1, go to 8.
  44. |
  45. | 7. (|X|<2**(-40)) Tan(X) = X. Exit.
  46. |
  47. | 8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
  48. |
  49. | Copyright (C) Motorola, Inc. 1990
  50. | All Rights Reserved
  51. |
  52. | For details on the license for this file, please see the
  53. | file, README, in this same directory.
  54. |STAN idnt 2,1 | Motorola 040 Floating Point Software Package
  55. |section 8
  56. #include "fpsp.h"
  57. BOUNDS1: .long 0x3FD78000,0x4004BC7E
  58. TWOBYPI: .long 0x3FE45F30,0x6DC9C883
  59. TANQ4: .long 0x3EA0B759,0xF50F8688
  60. TANP3: .long 0xBEF2BAA5,0xA8924F04
  61. TANQ3: .long 0xBF346F59,0xB39BA65F,0x00000000,0x00000000
  62. TANP2: .long 0x3FF60000,0xE073D3FC,0x199C4A00,0x00000000
  63. TANQ2: .long 0x3FF90000,0xD23CD684,0x15D95FA1,0x00000000
  64. TANP1: .long 0xBFFC0000,0x8895A6C5,0xFB423BCA,0x00000000
  65. TANQ1: .long 0xBFFD0000,0xEEF57E0D,0xA84BC8CE,0x00000000
  66. INVTWOPI: .long 0x3FFC0000,0xA2F9836E,0x4E44152A,0x00000000
  67. TWOPI1: .long 0x40010000,0xC90FDAA2,0x00000000,0x00000000
  68. TWOPI2: .long 0x3FDF0000,0x85A308D4,0x00000000,0x00000000
  69. |--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
  70. |--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
  71. |--MOST 69 BITS LONG.
  72. .global PITBL
  73. PITBL:
  74. .long 0xC0040000,0xC90FDAA2,0x2168C235,0x21800000
  75. .long 0xC0040000,0xC2C75BCD,0x105D7C23,0xA0D00000
  76. .long 0xC0040000,0xBC7EDCF7,0xFF523611,0xA1E80000
  77. .long 0xC0040000,0xB6365E22,0xEE46F000,0x21480000
  78. .long 0xC0040000,0xAFEDDF4D,0xDD3BA9EE,0xA1200000
  79. .long 0xC0040000,0xA9A56078,0xCC3063DD,0x21FC0000
  80. .long 0xC0040000,0xA35CE1A3,0xBB251DCB,0x21100000
  81. .long 0xC0040000,0x9D1462CE,0xAA19D7B9,0xA1580000
  82. .long 0xC0040000,0x96CBE3F9,0x990E91A8,0x21E00000
  83. .long 0xC0040000,0x90836524,0x88034B96,0x20B00000
  84. .long 0xC0040000,0x8A3AE64F,0x76F80584,0xA1880000
  85. .long 0xC0040000,0x83F2677A,0x65ECBF73,0x21C40000
  86. .long 0xC0030000,0xFB53D14A,0xA9C2F2C2,0x20000000
  87. .long 0xC0030000,0xEEC2D3A0,0x87AC669F,0x21380000
  88. .long 0xC0030000,0xE231D5F6,0x6595DA7B,0xA1300000
  89. .long 0xC0030000,0xD5A0D84C,0x437F4E58,0x9FC00000
  90. .long 0xC0030000,0xC90FDAA2,0x2168C235,0x21000000
  91. .long 0xC0030000,0xBC7EDCF7,0xFF523611,0xA1680000
  92. .long 0xC0030000,0xAFEDDF4D,0xDD3BA9EE,0xA0A00000
  93. .long 0xC0030000,0xA35CE1A3,0xBB251DCB,0x20900000
  94. .long 0xC0030000,0x96CBE3F9,0x990E91A8,0x21600000
  95. .long 0xC0030000,0x8A3AE64F,0x76F80584,0xA1080000
  96. .long 0xC0020000,0xFB53D14A,0xA9C2F2C2,0x1F800000
  97. .long 0xC0020000,0xE231D5F6,0x6595DA7B,0xA0B00000
  98. .long 0xC0020000,0xC90FDAA2,0x2168C235,0x20800000
  99. .long 0xC0020000,0xAFEDDF4D,0xDD3BA9EE,0xA0200000
  100. .long 0xC0020000,0x96CBE3F9,0x990E91A8,0x20E00000
  101. .long 0xC0010000,0xFB53D14A,0xA9C2F2C2,0x1F000000
  102. .long 0xC0010000,0xC90FDAA2,0x2168C235,0x20000000
  103. .long 0xC0010000,0x96CBE3F9,0x990E91A8,0x20600000
  104. .long 0xC0000000,0xC90FDAA2,0x2168C235,0x1F800000
  105. .long 0xBFFF0000,0xC90FDAA2,0x2168C235,0x1F000000
  106. .long 0x00000000,0x00000000,0x00000000,0x00000000
  107. .long 0x3FFF0000,0xC90FDAA2,0x2168C235,0x9F000000
  108. .long 0x40000000,0xC90FDAA2,0x2168C235,0x9F800000
  109. .long 0x40010000,0x96CBE3F9,0x990E91A8,0xA0600000
  110. .long 0x40010000,0xC90FDAA2,0x2168C235,0xA0000000
  111. .long 0x40010000,0xFB53D14A,0xA9C2F2C2,0x9F000000
  112. .long 0x40020000,0x96CBE3F9,0x990E91A8,0xA0E00000
  113. .long 0x40020000,0xAFEDDF4D,0xDD3BA9EE,0x20200000
  114. .long 0x40020000,0xC90FDAA2,0x2168C235,0xA0800000
  115. .long 0x40020000,0xE231D5F6,0x6595DA7B,0x20B00000
  116. .long 0x40020000,0xFB53D14A,0xA9C2F2C2,0x9F800000
  117. .long 0x40030000,0x8A3AE64F,0x76F80584,0x21080000
  118. .long 0x40030000,0x96CBE3F9,0x990E91A8,0xA1600000
  119. .long 0x40030000,0xA35CE1A3,0xBB251DCB,0xA0900000
  120. .long 0x40030000,0xAFEDDF4D,0xDD3BA9EE,0x20A00000
  121. .long 0x40030000,0xBC7EDCF7,0xFF523611,0x21680000
  122. .long 0x40030000,0xC90FDAA2,0x2168C235,0xA1000000
  123. .long 0x40030000,0xD5A0D84C,0x437F4E58,0x1FC00000
  124. .long 0x40030000,0xE231D5F6,0x6595DA7B,0x21300000
  125. .long 0x40030000,0xEEC2D3A0,0x87AC669F,0xA1380000
  126. .long 0x40030000,0xFB53D14A,0xA9C2F2C2,0xA0000000
  127. .long 0x40040000,0x83F2677A,0x65ECBF73,0xA1C40000
  128. .long 0x40040000,0x8A3AE64F,0x76F80584,0x21880000
  129. .long 0x40040000,0x90836524,0x88034B96,0xA0B00000
  130. .long 0x40040000,0x96CBE3F9,0x990E91A8,0xA1E00000
  131. .long 0x40040000,0x9D1462CE,0xAA19D7B9,0x21580000
  132. .long 0x40040000,0xA35CE1A3,0xBB251DCB,0xA1100000
  133. .long 0x40040000,0xA9A56078,0xCC3063DD,0xA1FC0000
  134. .long 0x40040000,0xAFEDDF4D,0xDD3BA9EE,0x21200000
  135. .long 0x40040000,0xB6365E22,0xEE46F000,0xA1480000
  136. .long 0x40040000,0xBC7EDCF7,0xFF523611,0x21E80000
  137. .long 0x40040000,0xC2C75BCD,0x105D7C23,0x20D00000
  138. .long 0x40040000,0xC90FDAA2,0x2168C235,0xA1800000
  139. .set INARG,FP_SCR4
  140. .set TWOTO63,L_SCR1
  141. .set ENDFLAG,L_SCR2
  142. .set N,L_SCR3
  143. | xref t_frcinx
  144. |xref t_extdnrm
  145. .global stand
  146. stand:
  147. |--TAN(X) = X FOR DENORMALIZED X
  148. bra t_extdnrm
  149. .global stan
  150. stan:
  151. fmovex (%a0),%fp0 | ...LOAD INPUT
  152. movel (%a0),%d0
  153. movew 4(%a0),%d0
  154. andil #0x7FFFFFFF,%d0
  155. cmpil #0x3FD78000,%d0 | ...|X| >= 2**(-40)?
  156. bges TANOK1
  157. bra TANSM
  158. TANOK1:
  159. cmpil #0x4004BC7E,%d0 | ...|X| < 15 PI?
  160. blts TANMAIN
  161. bra REDUCEX
  162. TANMAIN:
  163. |--THIS IS THE USUAL CASE, |X| <= 15 PI.
  164. |--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
  165. fmovex %fp0,%fp1
  166. fmuld TWOBYPI,%fp1 | ...X*2/PI
  167. |--HIDE THE NEXT TWO INSTRUCTIONS
  168. leal PITBL+0x200,%a1 | ...TABLE OF N*PI/2, N = -32,...,32
  169. |--FP1 IS NOW READY
  170. fmovel %fp1,%d0 | ...CONVERT TO INTEGER
  171. asll #4,%d0
  172. addal %d0,%a1 | ...ADDRESS N*PIBY2 IN Y1, Y2
  173. fsubx (%a1)+,%fp0 | ...X-Y1
  174. |--HIDE THE NEXT ONE
  175. fsubs (%a1),%fp0 | ...FP0 IS R = (X-Y1)-Y2
  176. rorl #5,%d0
  177. andil #0x80000000,%d0 | ...D0 WAS ODD IFF D0 < 0
  178. TANCONT:
  179. cmpil #0,%d0
  180. blt NODD
  181. fmovex %fp0,%fp1
  182. fmulx %fp1,%fp1 | ...S = R*R
  183. fmoved TANQ4,%fp3
  184. fmoved TANP3,%fp2
  185. fmulx %fp1,%fp3 | ...SQ4
  186. fmulx %fp1,%fp2 | ...SP3
  187. faddd TANQ3,%fp3 | ...Q3+SQ4
  188. faddx TANP2,%fp2 | ...P2+SP3
  189. fmulx %fp1,%fp3 | ...S(Q3+SQ4)
  190. fmulx %fp1,%fp2 | ...S(P2+SP3)
  191. faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4)
  192. faddx TANP1,%fp2 | ...P1+S(P2+SP3)
  193. fmulx %fp1,%fp3 | ...S(Q2+S(Q3+SQ4))
  194. fmulx %fp1,%fp2 | ...S(P1+S(P2+SP3))
  195. faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4))
  196. fmulx %fp0,%fp2 | ...RS(P1+S(P2+SP3))
  197. fmulx %fp3,%fp1 | ...S(Q1+S(Q2+S(Q3+SQ4)))
  198. faddx %fp2,%fp0 | ...R+RS(P1+S(P2+SP3))
  199. fadds #0x3F800000,%fp1 | ...1+S(Q1+...)
  200. fmovel %d1,%fpcr |restore users exceptions
  201. fdivx %fp1,%fp0 |last inst - possible exception set
  202. bra t_frcinx
  203. NODD:
  204. fmovex %fp0,%fp1
  205. fmulx %fp0,%fp0 | ...S = R*R
  206. fmoved TANQ4,%fp3
  207. fmoved TANP3,%fp2
  208. fmulx %fp0,%fp3 | ...SQ4
  209. fmulx %fp0,%fp2 | ...SP3
  210. faddd TANQ3,%fp3 | ...Q3+SQ4
  211. faddx TANP2,%fp2 | ...P2+SP3
  212. fmulx %fp0,%fp3 | ...S(Q3+SQ4)
  213. fmulx %fp0,%fp2 | ...S(P2+SP3)
  214. faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4)
  215. faddx TANP1,%fp2 | ...P1+S(P2+SP3)
  216. fmulx %fp0,%fp3 | ...S(Q2+S(Q3+SQ4))
  217. fmulx %fp0,%fp2 | ...S(P1+S(P2+SP3))
  218. faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4))
  219. fmulx %fp1,%fp2 | ...RS(P1+S(P2+SP3))
  220. fmulx %fp3,%fp0 | ...S(Q1+S(Q2+S(Q3+SQ4)))
  221. faddx %fp2,%fp1 | ...R+RS(P1+S(P2+SP3))
  222. fadds #0x3F800000,%fp0 | ...1+S(Q1+...)
  223. fmovex %fp1,-(%sp)
  224. eoril #0x80000000,(%sp)
  225. fmovel %d1,%fpcr |restore users exceptions
  226. fdivx (%sp)+,%fp0 |last inst - possible exception set
  227. bra t_frcinx
  228. TANBORS:
  229. |--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
  230. |--IF |X| < 2**(-40), RETURN X OR 1.
  231. cmpil #0x3FFF8000,%d0
  232. bgts REDUCEX
  233. TANSM:
  234. fmovex %fp0,-(%sp)
  235. fmovel %d1,%fpcr |restore users exceptions
  236. fmovex (%sp)+,%fp0 |last inst - possible exception set
  237. bra t_frcinx
  238. REDUCEX:
  239. |--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
  240. |--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
  241. |--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
  242. fmovemx %fp2-%fp5,-(%a7) | ...save FP2 through FP5
  243. movel %d2,-(%a7)
  244. fmoves #0x00000000,%fp1
  245. |--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
  246. |--there is a danger of unwanted overflow in first LOOP iteration. In this
  247. |--case, reduce argument by one remainder step to make subsequent reduction
  248. |--safe.
  249. cmpil #0x7ffeffff,%d0 |is argument dangerously large?
  250. bnes LOOP
  251. movel #0x7ffe0000,FP_SCR2(%a6) |yes
  252. | ;create 2**16383*PI/2
  253. movel #0xc90fdaa2,FP_SCR2+4(%a6)
  254. clrl FP_SCR2+8(%a6)
  255. ftstx %fp0 |test sign of argument
  256. movel #0x7fdc0000,FP_SCR3(%a6) |create low half of 2**16383*
  257. | ;PI/2 at FP_SCR3
  258. movel #0x85a308d3,FP_SCR3+4(%a6)
  259. clrl FP_SCR3+8(%a6)
  260. fblt red_neg
  261. orw #0x8000,FP_SCR2(%a6) |positive arg
  262. orw #0x8000,FP_SCR3(%a6)
  263. red_neg:
  264. faddx FP_SCR2(%a6),%fp0 |high part of reduction is exact
  265. fmovex %fp0,%fp1 |save high result in fp1
  266. faddx FP_SCR3(%a6),%fp0 |low part of reduction
  267. fsubx %fp0,%fp1 |determine low component of result
  268. faddx FP_SCR3(%a6),%fp1 |fp0/fp1 are reduced argument.
  269. |--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
  270. |--integer quotient will be stored in N
  271. |--Intermediate remainder is 66-bit long; (R,r) in (FP0,FP1)
  272. LOOP:
  273. fmovex %fp0,INARG(%a6) | ...+-2**K * F, 1 <= F < 2
  274. movew INARG(%a6),%d0
  275. movel %d0,%a1 | ...save a copy of D0
  276. andil #0x00007FFF,%d0
  277. subil #0x00003FFF,%d0 | ...D0 IS K
  278. cmpil #28,%d0
  279. bles LASTLOOP
  280. CONTLOOP:
  281. subil #27,%d0 | ...D0 IS L := K-27
  282. movel #0,ENDFLAG(%a6)
  283. bras WORK
  284. LASTLOOP:
  285. clrl %d0 | ...D0 IS L := 0
  286. movel #1,ENDFLAG(%a6)
  287. WORK:
  288. |--FIND THE REMAINDER OF (R,r) W.R.T. 2**L * (PI/2). L IS SO CHOSEN
  289. |--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
  290. |--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
  291. |--2**L * (PIby2_1), 2**L * (PIby2_2)
  292. movel #0x00003FFE,%d2 | ...BIASED EXPO OF 2/PI
  293. subl %d0,%d2 | ...BIASED EXPO OF 2**(-L)*(2/PI)
  294. movel #0xA2F9836E,FP_SCR1+4(%a6)
  295. movel #0x4E44152A,FP_SCR1+8(%a6)
  296. movew %d2,FP_SCR1(%a6) | ...FP_SCR1 is 2**(-L)*(2/PI)
  297. fmovex %fp0,%fp2
  298. fmulx FP_SCR1(%a6),%fp2
  299. |--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
  300. |--FLOATING POINT FORMAT, THE TWO FMOVE'S FMOVE.L FP <--> N
  301. |--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
  302. |--(SIGN(INARG)*2**63 + FP2) - SIGN(INARG)*2**63 WILL GIVE
  303. |--US THE DESIRED VALUE IN FLOATING POINT.
  304. |--HIDE SIX CYCLES OF INSTRUCTION
  305. movel %a1,%d2
  306. swap %d2
  307. andil #0x80000000,%d2
  308. oril #0x5F000000,%d2 | ...D2 IS SIGN(INARG)*2**63 IN SGL
  309. movel %d2,TWOTO63(%a6)
  310. movel %d0,%d2
  311. addil #0x00003FFF,%d2 | ...BIASED EXPO OF 2**L * (PI/2)
  312. |--FP2 IS READY
  313. fadds TWOTO63(%a6),%fp2 | ...THE FRACTIONAL PART OF FP1 IS ROUNDED
  314. |--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1 and 2**(L)*Piby2_2
  315. movew %d2,FP_SCR2(%a6)
  316. clrw FP_SCR2+2(%a6)
  317. movel #0xC90FDAA2,FP_SCR2+4(%a6)
  318. clrl FP_SCR2+8(%a6) | ...FP_SCR2 is 2**(L) * Piby2_1
  319. |--FP2 IS READY
  320. fsubs TWOTO63(%a6),%fp2 | ...FP2 is N
  321. addil #0x00003FDD,%d0
  322. movew %d0,FP_SCR3(%a6)
  323. clrw FP_SCR3+2(%a6)
  324. movel #0x85A308D3,FP_SCR3+4(%a6)
  325. clrl FP_SCR3+8(%a6) | ...FP_SCR3 is 2**(L) * Piby2_2
  326. movel ENDFLAG(%a6),%d0
  327. |--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
  328. |--P2 = 2**(L) * Piby2_2
  329. fmovex %fp2,%fp4
  330. fmulx FP_SCR2(%a6),%fp4 | ...W = N*P1
  331. fmovex %fp2,%fp5
  332. fmulx FP_SCR3(%a6),%fp5 | ...w = N*P2
  333. fmovex %fp4,%fp3
  334. |--we want P+p = W+w but |p| <= half ulp of P
  335. |--Then, we need to compute A := R-P and a := r-p
  336. faddx %fp5,%fp3 | ...FP3 is P
  337. fsubx %fp3,%fp4 | ...W-P
  338. fsubx %fp3,%fp0 | ...FP0 is A := R - P
  339. faddx %fp5,%fp4 | ...FP4 is p = (W-P)+w
  340. fmovex %fp0,%fp3 | ...FP3 A
  341. fsubx %fp4,%fp1 | ...FP1 is a := r - p
  342. |--Now we need to normalize (A,a) to "new (R,r)" where R+r = A+a but
  343. |--|r| <= half ulp of R.
  344. faddx %fp1,%fp0 | ...FP0 is R := A+a
  345. |--No need to calculate r if this is the last loop
  346. cmpil #0,%d0
  347. bgt RESTORE
  348. |--Need to calculate r
  349. fsubx %fp0,%fp3 | ...A-R
  350. faddx %fp3,%fp1 | ...FP1 is r := (A-R)+a
  351. bra LOOP
  352. RESTORE:
  353. fmovel %fp2,N(%a6)
  354. movel (%a7)+,%d2
  355. fmovemx (%a7)+,%fp2-%fp5
  356. movel N(%a6),%d0
  357. rorl #1,%d0
  358. bra TANCONT
  359. |end