/* checksum.S: Sparc V9 optimized checksum code.
 *
 *  Copyright(C) 1995 Linus Torvalds
 *  Copyright(C) 1995 Miguel de Icaza
 *  Copyright(C) 1996, 2000 David S. Miller
 *  Copyright(C) 1997 Jakub Jelinek
 *
 * derived from:
 *	Linux/Alpha checksum c-code
 *	Linux/ix86 inline checksum assembly
 *	RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
 *	David Mosberger-Tang for optimized reference c-code
 *	BSD4.4 portable checksum routine
 */
  .text

/*
 * csum_partial_fix_alignment: out-of-line alignment prologue of csum_partial.
 *
 * Reached only from csum_partial, when buff (%o0) is not 4-byte aligned and
 * len (%o1) is non-zero.  The integer condition codes on entry are those of
 * the "andcc %o0, 0x1, %g7" that sits in the delay slot of the branch that
 * got us here: Z set means the address is even.
 *
 * In:   %o0 = buff, %o1 = len (> 0), %o4 = 0 (running sum), %icc as above
 * Out:  %o0 / %o1 stepped past the bytes consumed, %o4 = sum of those bytes.
 *       Continues at csum_partial_post_align once %o0 is 4-byte aligned, or
 *       at csum_partial_end_cruft when a halfword is needed for alignment
 *       but fewer than two bytes remain.
 * Uses: %o5, condition codes
 *
 * Lines indented by one extra space are branch delay slots; they execute
 * whether or not the preceding branch is taken.
 */
csum_partial_fix_alignment:
        /* We checked for zero length already, so there must be
         * at least one byte.
         */
        be,pt           %icc, 1f                        /* even address: no odd leading byte */
         nop
        ldub            [%o0 + 0x00], %o4               /* odd leading byte seeds the sum */
        add             %o0, 1, %o0
        sub             %o1, 1, %o1
1:      andcc           %o0, 0x2, %g0                   /* still only 2-byte aligned? */
        be,pn           %icc, csum_partial_post_align
         cmp            %o1, 2                          /* flags for the blu below */
        blu,pn          %icc, csum_partial_end_cruft    /* < 2 bytes left: finish bytewise */
         nop
        lduh            [%o0 + 0x00], %o5               /* one halfword reaches 4-byte alignment */
        add             %o0, 2, %o0
        sub             %o1, 2, %o1
        ba,pt           %xcc, csum_partial_post_align
         add            %o5, %o4, %o4                   /* fold the halfword into the sum */

  .align          32
  .globl          csum_partial
  .type           csum_partial, #function

/*
 * csum_partial: add the 16-bit ones'-complement sum of a buffer to a
 * running 32-bit sum.
 *
 * In:   %o0 = buff, %o1 = len, %o2 = sum
 * Out:  %o0 = (sum + folded checksum of buff[0..len)) with end-around
 *             carry, zero-extended from 32 bits.  When len == 0 this is
 *             just sum zero-extended from 32 bits.
 *
 * Register roles inside the routine:
 *   %o0  current read pointer
 *   %o1  bytes not yet assigned to a loop
 *   %o3  byte counter of the loop currently running
 *   %o4  accumulator (64-bit while summing words, folded to 16 bits later)
 *   %o5, %g1, %g2, %g3  scratch for loads and folding
 *   %g7  (buff & 1): non-zero when the buffer started on an odd address
 * Clobbers: %o1-%o5, %g1-%g3, %g7, condition codes.
 *
 * Lines indented by one extra space are branch delay slots; they execute
 * whether or not the preceding branch is taken.
 */
csum_partial:           /* %o0=buff, %o1=len, %o2=sum */
        prefetch        [%o0 + 0x000], #n_reads
        clr             %o4
        prefetch        [%o0 + 0x040], #n_reads
        brz,pn          %o1, csum_partial_finish        /* len == 0: return sum as is */
         andcc          %o0, 0x3, %g0                   /* 4-byte aligned? (tested by bne below) */

        /* We "remember" whether the lowest bit in the address
         * was set in %g7.  Because if it is, we have to swap
         * upper and lower 8 bit fields of the sum we calculate.
         */
        bne,pn          %icc, csum_partial_fix_alignment
         andcc          %o0, 0x1, %g7                   /* also sets %icc for fix_alignment */

csum_partial_post_align:
        /* %o0 is 4-byte aligned here.  Split len into whole 64-byte
         * blocks (%o3) and the remainder (%o1 = len & 0x3f).
         */
        prefetch        [%o0 + 0x080], #n_reads
        andncc          %o1, 0x3f, %o3
        prefetch        [%o0 + 0x0c0], #n_reads
        sub             %o1, %o3, %o1
        brz,pn          %o3, 2f                         /* no full 64-byte block */
         prefetch       [%o0 + 0x100], #n_reads

        /* So that we don't need to use the non-pairing
         * add-with-carry instructions we accumulate 32-bit
         * values into a 64-bit register.  At the end of the
         * loop we fold it down to 32-bits and so on.
         */
        prefetch        [%o0 + 0x140], #n_reads

        /* Main loop: sixteen 32-bit words (64 bytes) per iteration,
         * loads interleaved with the adds of earlier loads.
         */
1:      lduw            [%o0 + 0x00], %o5
        lduw            [%o0 + 0x04], %g1
        lduw            [%o0 + 0x08], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x0c], %g3
        add             %o4, %g1, %o4
        lduw            [%o0 + 0x10], %o5
        add             %o4, %g2, %o4
        lduw            [%o0 + 0x14], %g1
        add             %o4, %g3, %o4
        lduw            [%o0 + 0x18], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x1c], %g3
        add             %o4, %g1, %o4
        lduw            [%o0 + 0x20], %o5
        add             %o4, %g2, %o4
        lduw            [%o0 + 0x24], %g1
        add             %o4, %g3, %o4
        lduw            [%o0 + 0x28], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x2c], %g3
        add             %o4, %g1, %o4
        lduw            [%o0 + 0x30], %o5
        add             %o4, %g2, %o4
        lduw            [%o0 + 0x34], %g1
        add             %o4, %g3, %o4
        lduw            [%o0 + 0x38], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x3c], %g3
        add             %o4, %g1, %o4
        prefetch        [%o0 + 0x180], #n_reads
        add             %o4, %g2, %o4
        subcc           %o3, 0x40, %o3
        add             %o0, 0x40, %o0
        bne,pt          %icc, 1b
         add            %o4, %g3, %o4

        /* Remaining whole words: %o3 = bytes in words (0..60),
         * %o1 is left with the final 0..3 bytes.
         */
2:      and             %o1, 0x3c, %o3
        brz,pn          %o3, 2f
         sub            %o1, %o3, %o1
1:      lduw            [%o0 + 0x00], %o5
        subcc           %o3, 0x4, %o3
        add             %o0, 0x4, %o0
        bne,pt          %icc, 1b
         add            %o4, %o5, %o4

2:
        /* fold 64-->32: add the upper word to the lower word, twice so
         * that a carry out of the first add is absorbed as well.
         */
        srlx            %o4, 32, %o5
        srl             %o4, 0, %o4                     /* keep low 32 bits only */
        add             %o4, %o5, %o4
        srlx            %o4, 32, %o5
        srl             %o4, 0, %o4
        add             %o4, %o5, %o4

        /* fold 32-->16: same scheme with 16-bit halves */
        sethi           %hi(0xffff0000), %g1
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2                   /* low 16 bits */
        add             %o5, %g2, %o4
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4

csum_partial_end_cruft:
        /* %o4 has the 16-bit sum we have calculated so-far. */
        cmp             %o1, 2
        blu,pt          %icc, 1f                        /* no trailing halfword */
         nop
        lduh            [%o0 + 0x00], %o5
        sub             %o1, 2, %o1
        add             %o0, 2, %o0
        add             %o4, %o5, %o4
1:      brz,pt          %o1, 1f                         /* no trailing byte */
         nop
        ldub            [%o0 + 0x00], %o5
        sub             %o1, 1, %o1
        add             %o0, 1, %o0
        sllx            %o5, 8, %o5                     /* last byte is the high byte of a zero-padded halfword */
        add             %o4, %o5, %o4
1:
        /* fold 32-->16 */
        sethi           %hi(0xffff0000), %g1
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4

1:      brz,pt          %g7, 1f                         /* even start address: no swap */
         nop

        /* We started with an odd byte, byte-swap the result. */
        srl             %o4, 8, %o5
        and             %o4, 0xff, %g1
        sll             %g1, 8, %g1
        or              %o5, %g1, %o4

        /* 32-bit add of the caller's sum, then add the carry back in */
1:      addcc           %o2, %o4, %o2
        addc            %g0, %o2, %o2

csum_partial_finish:
        retl
         srl            %o2, 0, %o0                     /* return value zero-extended from 32 bits */
  .size           csum_partial, .-csum_partial