ip_fast_csum.S 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. /*
  2. * Optimized version of the ip_fast_csum() function
  3. * Used for calculating IP header checksum
  4. *
  5. * Return: 16bit checksum, complemented
  6. *
  7. * Inputs:
  8. * in0: address of buffer to checksum (char *)
  9. * in1: length of the buffer in 32-bit words (int)
  10. *
  11. * Copyright (C) 2002, 2006 Intel Corp.
  12. * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
  13. */
  14. #include <asm/asmmacro.h>
  15. /*
  16. * Since we know that most likely this function is called with buf aligned
  17. * on 4-byte boundary and 20 bytes in length, we can execute rather quickly
  18. * versus calling generic version of do_csum, which has lots of overhead in
  19. * handling various alignments and sizes. However, due to lack of constraints
  20. * put on the function input argument, cases with alignment not on 4-byte or
  21. * size not equal to 20 bytes will be handled by the generic do_csum function.
  22. */
  23. #define in0 r32 /* first input: buffer address in ip_fast_csum, first pointer in csum_ipv6_magic */
  24. #define in1 r33 /* second input: length in ip_fast_csum, second pointer in csum_ipv6_magic */
  25. #define in2 r34 /* third input, read by csum_ipv6_magic (r34 is also used by name as a local in the .generic path of ip_fast_csum) */
  26. #define in3 r35 /* fourth input, read by csum_ipv6_magic (r35 is also used by name as a local in the .generic path of ip_fast_csum) */
  27. #define in4 r36 /* fifth input, read by csum_ipv6_magic */
  28. #define ret0 r8 /* register that holds the result when ip_fast_csum returns */
  29. GLOBAL_ENTRY(ip_fast_csum) // in0 = buffer address, in1 = length in 32-bit words; returns the complemented checksum in ret0
  30. .prologue // empty prologue region: .body follows immediately
  31. .body
  32. cmp.ne p6,p7=5,in1 // p6 = (in1 != 5), p7 = (in1 == 5): is the size other than 5 words (20 bytes)?
  33. and r14=3,in0 // r14 = low two address bits; non-zero means not aligned on 4 bytes
  34. add r15=4,in0 // second source pointer, one word ahead of in0
  35. ;;
  36. cmp.ne.or.andcm p6,p7=r14,r0 // if misaligned (r14 != 0): force p6 = 1 and p7 = 0, otherwise keep both as set above
  37. ;;
  38. (p7) ld4 r20=[in0],8 // fast path only: word 0, then in0 += 8
  39. (p7) ld4 r21=[r15],8 // fast path only: word 1, then r15 += 8
  40. (p6) br.spnt .generic // wrong size or alignment: go call do_csum (in0 is still unmodified, the loads above are predicated on p7)
  41. ;;
  42. ld4 r22=[in0],8 // word 2
  43. ld4 r23=[r15],8 // word 3
  44. ;;
  45. ld4 r24=[in0] // word 4, the last of the 20 bytes
  46. add r20=r20,r21 // word 0 + word 1
  47. add r22=r22,r23 // word 2 + word 3
  48. ;;
  49. add r20=r20,r22 // sum of words 0..3
  50. ;;
  51. add r20=r20,r24 // r20 = sum of all five words, accumulated in a 64-bit register
  52. ;;
  53. shr.u ret0=r20,16 // now need to add the carry: ret0 = everything above the low 16 bits
  54. zxt2 r20=r20 // r20 = low 16 bits
  55. ;;
  56. add r20=ret0,r20 // first fold
  57. ;;
  58. shr.u ret0=r20,16 // add carry again
  59. zxt2 r20=r20
  60. ;;
  61. add r20=ret0,r20 // second fold
  62. ;;
  63. shr.u ret0=r20,16 // and once more, so that the sum of five 32-bit words ends up within 16 bits
  64. zxt2 r20=r20
  65. ;;
  66. add r20=ret0,r20 // third fold
  67. mov r9=0xffff // 16-bit mask for the final complement
  68. ;;
  69. andcm ret0=r9,r20 // ret0 = 0xffff & ~r20: complemented sum, limited to 16 bits
  70. .restore sp // reset frame state
  71. br.ret.sptk.many b0
  72. ;;
  73. .generic: // slow path: hand the buffer to do_csum and complement its result
  74. .prologue
  75. .save ar.pfs, r35 // unwind info: ar.pfs is kept in r35
  76. alloc r35=ar.pfs,2,2,2,0 // frame with 2 inputs, 2 locals (r34, r35) and 2 outputs; r35 = previous ar.pfs
  77. .save rp, r34 // unwind info: return pointer is kept in r34
  78. mov r34=b0 // keep the return address across the call below
  79. .body
  80. dep.z out1=in1,2,30 // out1 = (low 30 bits of in1) << 2: length converted from words to bytes
  81. mov out0=in0 // out0 = buffer address
  82. ;;
  83. br.call.sptk.many b0=do_csum // do_csum is defined outside this file
  84. ;;
  85. andcm ret0=-1,ret0 // ret0 = ~ret0 over all 64 bits. NOTE(review): not masked to 16 bits like the fast path; presumably callers use only the low 16 bits - confirm
  86. mov ar.pfs=r35 // restore ar.pfs saved by the alloc above
  87. mov b0=r34 // restore the return address
  88. br.ret.sptk.many b0
  89. END(ip_fast_csum)
  90. GLOBAL_ENTRY(csum_ipv6_magic) // sums 4 words at [in0], 4 words at [in1], in4 and a value packed from in2/in3; returns the complemented 16-bit fold in r8
  91. ld4 r20=[in0],4 // word 0 at in0. NOTE(review): in0/in1 presumably point at the IPv6 source/destination addresses - confirm against the C prototype
  92. ld4 r21=[in1],4 // word 0 at in1
  93. zxt4 in2=in2 // keep only the low 32 bits of in2 (presumably the payload length - confirm)
  94. ;;
  95. ld4 r22=[in0],4 // word 1 at in0
  96. ld4 r23=[in1],4 // word 1 at in1
  97. dep r15=in3,in2,32,16 // r15 = in2 with the low 16 bits of in3 placed in bits 32..47 (in3 presumably the protocol value - confirm)
  98. ;;
  99. ld4 r24=[in0],4 // word 2 at in0
  100. ld4 r25=[in1],4 // word 2 at in1
  101. mux1 r15=r15,@rev // reverse the 8 bytes of r15
  102. add r16=r20,r21 // word 0 pair
  103. add r17=r22,r23 // word 1 pair
  104. zxt4 in4=in4 // keep only the low 32 bits of in4 (presumably the caller-supplied partial sum - confirm)
  105. ;;
  106. ld4 r26=[in0],4 // word 3 at in0
  107. ld4 r27=[in1],4 // word 3 at in1
  108. shr.u r15=r15,16 // drop the two low bytes (zero after the reversal): bits 0..15 = byte-swapped in3, bits 16..47 = byte-swapped in2
  109. add r18=r24,r25 // word 2 pair
  110. add r8=r16,r17 // start the running total in r8
  111. ;;
  112. add r19=r26,r27 // word 3 pair
  113. add r8=r8,r18
  114. ;;
  115. add r8=r8,r19 // r8 = sum of all eight loaded words
  116. add r15=r15,in4 // packed in2/in3 term + in4
  117. ;;
  118. add r8=r8,r15 // r8 = complete 64-bit sum; it can exceed 2^48 because the packed term occupies bits 0..47
  119. ;;
  120. shr.u r10=r8,32 // now fold sum into short: r10 = upper 32 bits
  121. zxt4 r11=r8 // r11 = lower 32 bits
  122. ;;
  123. add r8=r10,r11 // first fold, 64 -> 33 bits
  124. ;;
  125. shr.u r10=r8,16 // yeah, keep it rolling
  126. zxt2 r11=r8
  127. ;;
  128. add r8=r10,r11 // second fold, result is at most 0x1ffff
  129. ;;
  130. shr.u r10=r8,16 // third fold
  131. zxt2 r11=r8
  132. ;;
  133. add r8=r10,r11 // result is at most 0x10000 (reached when the second fold gave 0x1ffff)
  134. ;;
  135. shr.u r10=r8,16 // fourth fold: without it a value of 0x10000 would be masked to 0xffff below instead of yielding 0xfffe
  136. zxt2 r11=r8
  137. ;;
  138. add r8=r10,r11 // r8 now fits in 16 bits
  139. mov r9=0xffff // 16-bit mask for the final complement
  140. ;;
  141. andcm r8=r9,r8 // r8 = 0xffff & ~r8: complemented sum, limited to 16 bits
  142. br.ret.sptk.many b0
  143. END(csum_ipv6_magic)