ip_fast_csum.S 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. /*
  2. * Optimized version of the ip_fast_csum() function
  3. * Used for calculating IP header checksum
  4. *
  5. * Return: 16bit checksum, complemented
  6. *
  7. * Inputs:
  8. * in0: address of buffer to checksum (char *)
  9. * in1: length of the buffer in 32-bit words (int); 5 means a 20-byte header
  10. *
  11. * Copyright (C) 2002, 2006 Intel Corp.
  12. * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
  13. */
  14. #include <asm/asmmacro.h>
  15. #include <asm/export.h>
  16. /*
  17. * Since we know that most likely this function is called with buf aligned
  18. * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
  19. * versus calling the generic version of do_csum, which has lots of overhead in
  20. * handling various alignments and sizes. However, due to the lack of constraints
  21. * put on the function input arguments, cases with alignment not on 4 bytes or
  22. * size not equal to 20 bytes will be handled by the generic do_csum function.
  23. */
  24. #define in0 r32 /* 1st input: buffer address in both routines below */
  25. #define in1 r33 /* 2nd input: word count (ip_fast_csum) / second buffer (csum_ipv6_magic) */
  26. #define in2 r34 /* 3rd input, read only by csum_ipv6_magic */
  27. #define in3 r35 /* 4th input, read only by csum_ipv6_magic */
  28. #define in4 r36 /* 5th input, read only by csum_ipv6_magic */
  29. #define ret0 r8 /* register in which both routines leave their result */
  30. GLOBAL_ENTRY(ip_fast_csum) // in0 = buffer address, in1 = length in 32-bit words; returns complemented 16-bit sum in ret0
  31. .prologue // unwind info: fast path has no prologue instructions
  32. .body
  33. cmp.ne p6,p7=5,in1 // p6 = (in1 != 5), p7 = (in1 == 5): size other than 20 bytes?
  34. and r14=3,in0 // r14 = low two address bits; non-zero means not 4-byte aligned
  35. add r15=4,in0 // second source pointer, one word ahead of in0
  36. ;;
  37. cmp.ne.or.andcm p6,p7=r14,r0 // if misaligned (r14 != 0): force p6 on and p7 off
  38. ;;
  39. (p7) ld4 r20=[in0],8 // fast path only: word 0, then advance in0 by 8
  40. (p7) ld4 r21=[r15],8 // fast path only: word 1, then advance r15 by 8
  41. (p6) br.spnt .generic // wrong size or alignment: let do_csum handle it
  42. ;;
  43. ld4 r22=[in0],8 // word 2
  44. ld4 r23=[r15],8 // word 3
  45. ;;
  46. ld4 r24=[in0] // word 4 (in0 has advanced 16 bytes by now)
  47. add r20=r20,r21 // word0 + word1
  48. add r22=r22,r23 // word2 + word3
  49. ;;
  50. add r20=r20,r22 // sum of words 0..3
  51. ;;
  52. add r20=r20,r24 // sum of all five words, held in a 64-bit register
  53. ;;
  54. shr.u ret0=r20,16 // first fold: ret0 = bits above 15 (the carries)
  55. zxt2 r20=r20 // r20 = low 16 bits
  56. ;;
  57. add r20=ret0,r20 // add the carries back into the low half
  58. ;;
  59. shr.u ret0=r20,16 // second fold: add carry again
  60. zxt2 r20=r20
  61. ;;
  62. add r20=ret0,r20
  63. ;;
  64. shr.u ret0=r20,16 // third fold, same shape as the two above
  65. zxt2 r20=r20
  66. ;;
  67. add r20=ret0,r20
  68. mov r9=0xffff // 16-bit mask for the final complement
  69. ;;
  70. andcm ret0=r9,r20 // ret0 = 0xffff & ~r20: the complemented 16-bit checksum
  71. .restore sp // reset frame state
  72. br.ret.sptk.many b0 // fast-path return
  73. ;;
  74. .generic: // slow path: wrap a call to do_csum
  75. .prologue
  76. .save ar.pfs, r35 // unwind info: ar.pfs is kept in r35
  77. alloc r35=ar.pfs,2,2,2,0 // frame with 2 inputs, 2 locals (r34, r35), 2 outputs; old ar.pfs -> r35
  78. .save rp, r34 // unwind info: return pointer is kept in r34
  79. mov r34=b0 // preserve the return address across the call
  80. .body
  81. dep.z out1=in1,2,30 // out1 = in1 << 2 (low 30 bits of in1): word count -> byte count
  82. mov out0=in0 // pass the buffer address through unchanged
  83. ;;
  84. br.call.sptk.many b0=do_csum // do_csum is defined outside this file
  85. ;;
  86. andcm ret0=-1,ret0 // ret0 = ~ret0: complement the value do_csum returned
  87. mov ar.pfs=r35 // restore the caller's ar.pfs
  88. mov b0=r34 // restore the return address
  89. br.ret.sptk.many b0 // slow-path return
  90. END(ip_fast_csum)
  91. EXPORT_SYMBOL(ip_fast_csum)
  92. GLOBAL_ENTRY(csum_ipv6_magic) // sums two 16-byte buffers (in0, in1) plus in2..in4; returns complemented 16-bit sum in r8
  93. ld4 r20=[in0],4 // word 0 of the in0 buffer (NOTE(review): presumably the IPv6 source address -- confirm)
  94. ld4 r21=[in1],4 // word 0 of the in1 buffer (NOTE(review): presumably the IPv6 destination address -- confirm)
  95. zxt4 in2=in2 // keep only the low 32 bits of in2 (NOTE(review): presumably the length -- confirm against the C prototype)
  96. ;;
  97. ld4 r22=[in0],4 // word 1 of in0 buffer
  98. ld4 r23=[in1],4 // word 1 of in1 buffer
  99. dep r15=in3,in2,32,16 // r15 = in2 with the low 16 bits of in3 placed in bits 32..47
  100. ;;
  101. ld4 r24=[in0],4 // word 2 of in0 buffer
  102. ld4 r25=[in1],4 // word 2 of in1 buffer
  103. mux1 r15=r15,@rev // reverse the byte order of the 8 bytes in r15
  104. add r16=r20,r21 // partial sum: word 0 pair
  105. add r17=r22,r23 // partial sum: word 1 pair
  106. zxt4 in4=in4 // keep only the low 32 bits of in4 (NOTE(review): presumably an incoming partial checksum -- confirm)
  107. ;;
  108. ld4 r26=[in0],4 // word 3 of in0 buffer
  109. ld4 r27=[in1],4 // word 3 of in1 buffer
  110. shr.u r15=r15,16 // drop the two low bytes, which were the zero top bytes before the reversal
  111. add r18=r24,r25 // partial sum: word 2 pair
  112. add r8=r16,r17 // r8 accumulates the running total from here on
  113. ;;
  114. add r19=r26,r27 // partial sum: word 3 pair
  115. add r8=r8,r18
  116. ;;
  117. add r8=r8,r19 // r8 = sum of all eight buffer words
  118. add r15=r15,in4 // combine the byte-reversed in2/in3 value with in4
  119. ;;
  120. add r8=r8,r15 // grand total in 64 bits
  121. ;;
  122. shr.u r10=r8,32 // now fold sum into short: r10 = upper 32 bits
  123. zxt4 r11=r8 // r11 = lower 32 bits
  124. ;;
  125. add r8=r10,r11 // 64-bit total folded to 32 bits plus carry
  126. ;;
  127. shr.u r10=r8,16 // yeah, keep it rolling: r10 = bits above 15
  128. zxt2 r11=r8 // r11 = low 16 bits
  129. ;;
  130. add r8=r10,r11
  131. ;;
  132. shr.u r10=r8,16 // three times lucky: fold the remaining carry
  133. zxt2 r11=r8
  134. ;;
  135. add r8=r10,r11
  136. mov r9=0xffff // 16-bit mask for the final complement
  137. ;;
  138. andcm r8=r9,r8 // r8 = 0xffff & ~r8: the complemented 16-bit checksum
  139. br.ret.sptk.many b0 // return to caller
  140. END(csum_ipv6_magic)
  141. EXPORT_SYMBOL(csum_ipv6_magic)