checksum.S
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
 *		    Optimized by Joe Taylor
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <asm/errno.h>
#include <linux/linkage.h>
#include <variant/core.h>

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *			     unsigned int sum);
 *	a2 = buf
 *	a3 = len
 *	a4 = sum
 *
 * This function assumes 2- or 4-byte alignment.  Other alignments will fail!
 */

/* ONES_ADD converts twos-complement math to ones-complement. */
#define ONES_ADD(sum, val)	  \
	add	sum, sum, val	; \
	bgeu	sum, val, 99f	; \
	addi	sum, sum, 1	; \
99:				;
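
/*
 * For reference, a minimal C sketch of what ONES_ADD does (illustrative
 * only, not part of this file; the name 'ones_add' is hypothetical):
 *
 *	static unsigned int ones_add(unsigned int sum, unsigned int val)
 *	{
 *		sum += val;
 *		if (sum < val)
 *			sum++;
 *		return sum;
 *	}
 *
 * The 'if (sum < val)' test detects the carry out of bit 31 (unsigned
 * overflow), mirroring the bgeu/addi pair above; folding that carry back
 * into the total turns the 32-bit twos-complement add into a
 * ones-complement add.  csum_partial below is then just this operation
 * applied to the buffer as 32-bit words, plus a 16-bit and an 8-bit tail.
 */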

	.text
ENTRY(csum_partial)

	/*
	 * Experiments with Ethernet and SLIP connections show that buf
	 * is aligned on either a 2-byte or 4-byte boundary.
	 */
	entry	sp, 32
	extui	a5, a2, 0, 2
	bnez	a5, 8f		/* branch if 2-byte aligned */

	/* Fall-through on common case, 4-byte alignment */
1:
	srli	a5, a3, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 5
	add	a5, a5, a2	/* a5 = end of last 32-byte chunk */
.Loop1:
#endif
	l32i	a6, a2, 0
	l32i	a7, a2, 4
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 8
	l32i	a7, a2, 12
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 16
	l32i	a7, a2, 20
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	l32i	a6, a2, 24
	l32i	a7, a2, 28
	ONES_ADD(a4, a6)
	ONES_ADD(a4, a7)
	addi	a2, a2, 4*8
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop1
#endif
2:
	extui	a5, a3, 2, 3	/* remaining 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 3f
#else
	beqz	a5, 3f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop2:
#endif
	l32i	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop2
#endif
3:
	_bbci.l	a3, 1, 5f	/* remaining 2-byte chunk */
	l16ui	a6, a2, 0
	ONES_ADD(a4, a6)
	addi	a2, a2, 2
5:
	_bbci.l	a3, 0, 7f	/* remaining 1-byte chunk */
6:	l8ui	a6, a2, 0
#ifdef __XTENSA_EB__
	slli	a6, a6, 8	/* load byte into bits 8..15 */
#endif
	ONES_ADD(a4, a6)
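	/*
	 * An odd trailing byte is summed as if the buffer were padded
	 * with one zero byte.  On a big-endian core the data byte is the
	 * high-order half of that final 16-bit value, hence the shift
	 * into bits 8..15; on little-endian it already sits in bits 0..7.
	 */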
7:
	mov	a2, a4
	retw

	/* uncommon case, buf is 2-byte aligned */
8:
	beqz	a3, 7b		/* branch if len == 0 */
	beqi	a3, 1, 6b	/* branch if len == 1 */

	extui	a5, a2, 0, 1
	bnez	a5, 8f		/* branch if 1-byte aligned */

	l16ui	a6, a2, 0	/* common case, len >= 2 */
	ONES_ADD(a4, a6)
	addi	a2, a2, 2	/* adjust buf */
	addi	a3, a3, -2	/* adjust len */
	j	1b		/* now buf is 4-byte aligned */

	/* case: odd-byte aligned, len > 1
	 * This case is dog slow, so don't give us an odd address.
	 * (I don't think this ever happens, but just in case.)
	 */
8:
	srli	a5, a3, 2	/* 4-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a5, 2f
#else
	beqz	a5, 2f
	slli	a5, a5, 2
	add	a5, a5, a2	/* a5 = end of last 4-byte chunk */
.Loop3:
#endif
	l8ui	a6, a2, 0	/* bits 24..31 */
	l16ui	a7, a2, 1	/* bits  8..23 */
	l8ui	a8, a2, 3	/* bits  0.. 7 */
#ifdef __XTENSA_EB__
	slli	a6, a6, 24
#else
	slli	a8, a8, 24
#endif
	slli	a7, a7, 8
	or	a7, a7, a6
	or	a7, a7, a8
	ONES_ADD(a4, a7)
	addi	a2, a2, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a5, .Loop3
#endif
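
/*
 * The loop above reassembles each misaligned 32-bit word from an 8-bit,
 * a 16-bit, and an 8-bit access (at p, p+1, and p+3; p+1 is 2-byte
 * aligned because p is odd).  A C sketch of the little-endian variant
 * (illustrative only; the name is hypothetical):
 *
 *	unsigned int load32_le_unaligned(const unsigned char *p)
 *	{
 *		unsigned int lo  = p[0];
 *		unsigned int mid = p[1] | ((unsigned int)p[2] << 8);
 *		unsigned int hi  = p[3];
 *
 *		return lo | (mid << 8) | (hi << 24);
 *	}
 *
 * The big-endian build instead shifts the first byte to bits 24..31,
 * producing the value an aligned l32i would have loaded.
 */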
2:
	_bbci.l	a3, 1, 3f	/* remaining 2-byte chunk, still odd addr */
	l8ui	a6, a2, 0
	l8ui	a7, a2, 1
#ifdef __XTENSA_EB__
	slli	a6, a6, 8
#else
	slli	a7, a7, 8
#endif
	or	a7, a7, a6
	ONES_ADD(a4, a7)
	addi	a2, a2, 2
3:
	j	5b		/* branch to handle the remaining byte */

ENDPROC(csum_partial)

/*
 * Copy from src to dst while checksumming, otherwise like csum_partial
 *
 * The macros SRC and DST specify the type of access for the instruction.
 * Thus we can call a custom exception handler for each access type.
 */

#define SRC(y...)			\
9999:	y;				\
	.section __ex_table, "a";	\
	.long 9999b, 6001f;		\
	.previous

#define DST(y...)			\
9999:	y;				\
	.section __ex_table, "a";	\
	.long 9999b, 6002f;		\
	.previous
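
/*
 * Each __ex_table entry is a pair of addresses: the location of an
 * access that may fault (the 9999 label) and the fixup code to resume
 * at if it does (6001 for loads from src, 6002 for stores to dst, both
 * in the .fixup section at the end of this file).  On a fault, the
 * kernel's exception handler searches this table and transfers control
 * to the fixup address instead of oopsing.
 */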

/*
 * unsigned int csum_partial_copy_generic(const char *src, char *dst,
 *					  int len, int sum,
 *					  int *src_err_ptr, int *dst_err_ptr)
 *	a2  = src
 *	a3  = dst
 *	a4  = len
 *	a5  = sum
 *	a6  = src_err_ptr
 *	a7  = dst_err_ptr
 *	a8  = temp
 *	a9  = temp
 *	a10 = temp
 *	a11 = original len for exception handling
 *	a12 = original dst for exception handling
 *
 * This function is optimized for 4-byte aligned addresses.  Other
 * alignments work, but not nearly as efficiently.
 */
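
/*
 * Behaviorally, a simplified C model (illustrative only; the name is
 * hypothetical and the real routine copies in 32-byte chunks):
 *
 *	unsigned int csum_partial_copy_model(const char *src, char *dst,
 *					     int len, unsigned int sum)
 *	{
 *		int i;
 *
 *		for (i = 0; i < len; i++)
 *			dst[i] = src[i];
 *		return csum_partial((const unsigned char *)dst, len, sum);
 *	}
 *
 * plus fault handling: an access that faults stores -EFAULT through
 * src_err_ptr or dst_err_ptr (see the .fixup code below) rather than
 * crashing.
 */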

ENTRY(csum_partial_copy_generic)

	entry	sp, 32
	mov	a12, a3
	mov	a11, a4
	or	a10, a2, a3

	/* We optimize the following alignment tests for the 4-byte
	   aligned case.  Two bbsi.l instructions might seem more optimal
	   (commented out below).  However, both labels 5: and 3: are out
	   of the imm8 range, so the assembler relaxes them into
	   equivalent bbci.l, j combinations, which is actually
	   slower. */

	extui	a9, a10, 0, 2
	beqz	a9, 1f		/* branch if both are 4-byte aligned */
	bbsi.l	a10, 0, 5f	/* branch if one address is odd */
	j	3f		/* one address is 2-byte aligned */

/*	_bbsi.l	a10, 0, 5f */	/* branch if odd address */
/*	_bbsi.l	a10, 1, 3f */	/* branch if 2-byte-aligned address */

1:
	/* src and dst are both 4-byte aligned */
	srli	a10, a4, 5	/* 32-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 2f
#else
	beqz	a10, 2f
	slli	a10, a10, 5
	add	a10, a10, a2	/* a10 = end of last 32-byte src chunk */
.Loop5:
#endif
SRC(	l32i	a9, a2, 0	)
SRC(	l32i	a8, a2, 4	)
DST(	s32i	a9, a3, 0	)
DST(	s32i	a8, a3, 4	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 8	)
SRC(	l32i	a8, a2, 12	)
DST(	s32i	a9, a3, 8	)
DST(	s32i	a8, a3, 12	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 16	)
SRC(	l32i	a8, a2, 20	)
DST(	s32i	a9, a3, 16	)
DST(	s32i	a8, a3, 20	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
SRC(	l32i	a9, a2, 24	)
SRC(	l32i	a8, a2, 28	)
DST(	s32i	a9, a3, 24	)
DST(	s32i	a8, a3, 28	)
	ONES_ADD(a5, a9)
	ONES_ADD(a5, a8)
	addi	a2, a2, 32
	addi	a3, a3, 32
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop5
#endif
2:
	extui	a10, a4, 2, 3	/* remaining 4-byte chunks */
	extui	a4, a4, 0, 2	/* reset len for general-case, 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 3f
#else
	beqz	a10, 3f
	slli	a10, a10, 2
	add	a10, a10, a2	/* a10 = end of last 4-byte src chunk */
.Loop6:
#endif
SRC(	l32i	a9, a2, 0	)
DST(	s32i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 4
	addi	a3, a3, 4
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop6
#endif
3:
	/*
	 * Control comes to here in two cases: (1) It may fall through
	 * to here from the 4-byte alignment case to process, at most,
	 * one 2-byte chunk.  (2) It branches to here from above if
	 * either src or dst is 2-byte aligned, and we process all bytes
	 * here, except for perhaps a trailing odd byte.  It's
	 * inefficient, so align your addresses to 4-byte boundaries.
	 *
	 *	a2 = src
	 *	a3 = dst
	 *	a4 = len
	 *	a5 = sum
	 */
	srli	a10, a4, 1	/* 2-byte chunks */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 4f
#else
	beqz	a10, 4f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last 2-byte src chunk */
.Loop7:
#endif
SRC(	l16ui	a9, a2, 0	)
DST(	s16i	a9, a3, 0	)
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop7
#endif
4:
	/* This section processes a possible trailing odd byte. */
	_bbci.l	a4, 0, 8f	/* 1-byte chunk */
SRC(	l8ui	a9, a2, 0	)
DST(	s8i	a9, a3, 0	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* shift byte to bits 8..15 */
#endif
	ONES_ADD(a5, a9)
8:
	mov	a2, a5
	retw

5:
	/* Control branches to here when either src or dst is odd.  We
	   process all bytes using 8-bit accesses.  Grossly inefficient,
	   so don't feed us an odd address. */
	srli	a10, a4, 1	/* handle in pairs for 16-bit csum */
#if XCHAL_HAVE_LOOPS
	loopgtz	a10, 6f
#else
	beqz	a10, 6f
	slli	a10, a10, 1
	add	a10, a10, a2	/* a10 = end of last odd-aligned, 2-byte src chunk */
.Loop8:
#endif
SRC(	l8ui	a9, a2, 0	)
SRC(	l8ui	a8, a2, 1	)
DST(	s8i	a9, a3, 0	)
DST(	s8i	a8, a3, 1	)
#ifdef __XTENSA_EB__
	slli	a9, a9, 8	/* combine into a single 16-bit value */
#else				/* for checksum computation */
	slli	a8, a8, 8
#endif
	or	a9, a9, a8
	ONES_ADD(a5, a9)
	addi	a2, a2, 2
	addi	a3, a3, 2
#if !XCHAL_HAVE_LOOPS
	blt	a2, a10, .Loop8
#endif
6:
	j	4b		/* process the possible trailing odd byte */

ENDPROC(csum_partial_copy_generic)

# Exception handler:
.section .fixup, "ax"

/*
	a6  = src_err_ptr
	a7  = dst_err_ptr
	a11 = original len for exception handling
	a12 = original dst for exception handling
*/

6001:
	movi	a2, -EFAULT
	s32i	a2, a6, 0	/* src_err_ptr */

	# clear the complete destination - computing the rest
	# is too much work
	movi	a2, 0
#if XCHAL_HAVE_LOOPS
	loopgtz	a11, 2f
#else
	beqz	a11, 2f
	add	a11, a11, a12	/* a11 = ending address */
.Leloop:
#endif
	s8i	a2, a12, 0
	addi	a12, a12, 1
#if !XCHAL_HAVE_LOOPS
	blt	a12, a11, .Leloop
#endif
2:
	retw

6002:
	movi	a2, -EFAULT
	s32i	a2, a7, 0	/* dst_err_ptr */
	movi	a2, 0
	retw

.previous