strncmp.S 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. /*
  2. * Copyright (C) 2013 ARM Ltd.
  3. * Copyright (C) 2013 Linaro.
  4. *
  5. * This code is based on glibc cortex strings work originally authored by Linaro
  6. * and re-licensed under GPLv2 for the Linux kernel. The original code can
  7. * be found @
  8. *
  9. * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
  10. * files/head:/src/aarch64/
  11. *
  12. * This program is free software; you can redistribute it and/or modify
  13. * it under the terms of the GNU General Public License version 2 as
  14. * published by the Free Software Foundation.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  23. */
  24. #include <linux/linkage.h>
  25. #include <asm/assembler.h>
  26. /*
  27. * compare two strings
  28. *
  29. * Parameters:
  30. * x0 - const string 1 pointer
  31. * x1 - const string 2 pointer
  32. * x2 - the maximal length to be compared
  33. * Returns:
  34. * x0 - an integer less than, equal to, or greater than zero if s1 is found,
  35. * respectively, to be less than, to match, or be greater than s2.
  36. */
  37. #define REP8_01 0x0101010101010101
  38. #define REP8_7f 0x7f7f7f7f7f7f7f7f
  39. #define REP8_80 0x8080808080808080
  40. /* Parameters and result. */
  41. src1 .req x0
  42. src2 .req x1
  43. limit .req x2
  44. result .req x0
  45. /* Internal variables. */
  46. data1 .req x3
  47. data1w .req w3
  48. data2 .req x4
  49. data2w .req w4
  50. has_nul .req x5
  51. diff .req x6
  52. syndrome .req x7
  53. tmp1 .req x8
  54. tmp2 .req x9
  55. tmp3 .req x10
  56. zeroones .req x11
  57. pos .req x12
  58. limit_wd .req x13
  59. mask .req x14
  60. endloop .req x15
  61. ENTRY(strncmp)
  62. cbz limit, .Lret0
  63. eor tmp1, src1, src2
  64. mov zeroones, #REP8_01
  65. tst tmp1, #7
  66. b.ne .Lmisaligned8
  67. ands tmp1, src1, #7
  68. b.ne .Lmutual_align
  69. /* Calculate the number of full and partial words -1. */
  70. /*
  71. * when limit is mulitply of 8, if not sub 1,
  72. * the judgement of last dword will wrong.
  73. */
  74. sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
  75. lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
  76. /*
  77. * NUL detection works on the principle that (X - 1) & (~X) & 0x80
  78. * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  79. * can be done in parallel across the entire word.
  80. */
  81. .Lloop_aligned:
  82. ldr data1, [src1], #8
  83. ldr data2, [src2], #8
  84. .Lstart_realigned:
  85. subs limit_wd, limit_wd, #1
  86. sub tmp1, data1, zeroones
  87. orr tmp2, data1, #REP8_7f
  88. eor diff, data1, data2 /* Non-zero if differences found. */
  89. csinv endloop, diff, xzr, pl /* Last Dword or differences.*/
  90. bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
  91. ccmp endloop, #0, #0, eq
  92. b.eq .Lloop_aligned
  93. /*Not reached the limit, must have found the end or a diff. */
  94. tbz limit_wd, #63, .Lnot_limit
  95. /* Limit % 8 == 0 => all bytes significant. */
  96. ands limit, limit, #7
  97. b.eq .Lnot_limit
  98. lsl limit, limit, #3 /* Bits -> bytes. */
  99. mov mask, #~0
  100. CPU_BE( lsr mask, mask, limit )
  101. CPU_LE( lsl mask, mask, limit )
  102. bic data1, data1, mask
  103. bic data2, data2, mask
  104. /* Make sure that the NUL byte is marked in the syndrome. */
  105. orr has_nul, has_nul, mask
  106. .Lnot_limit:
  107. orr syndrome, diff, has_nul
  108. b .Lcal_cmpresult
  109. .Lmutual_align:
  110. /*
  111. * Sources are mutually aligned, but are not currently at an
  112. * alignment boundary. Round down the addresses and then mask off
  113. * the bytes that precede the start point.
  114. * We also need to adjust the limit calculations, but without
  115. * overflowing if the limit is near ULONG_MAX.
  116. */
  117. bic src1, src1, #7
  118. bic src2, src2, #7
  119. ldr data1, [src1], #8
  120. neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */
  121. ldr data2, [src2], #8
  122. mov tmp2, #~0
  123. sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
  124. /* Big-endian. Early bytes are at MSB. */
  125. CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
  126. /* Little-endian. Early bytes are at LSB. */
  127. CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */
  128. and tmp3, limit_wd, #7
  129. lsr limit_wd, limit_wd, #3
  130. /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/
  131. add limit, limit, tmp1
  132. add tmp3, tmp3, tmp1
  133. orr data1, data1, tmp2
  134. orr data2, data2, tmp2
  135. add limit_wd, limit_wd, tmp3, lsr #3
  136. b .Lstart_realigned
  137. /*when src1 offset is not equal to src2 offset...*/
  138. .Lmisaligned8:
  139. cmp limit, #8
  140. b.lo .Ltiny8proc /*limit < 8... */
  141. /*
  142. * Get the align offset length to compare per byte first.
  143. * After this process, one string's address will be aligned.*/
  144. and tmp1, src1, #7
  145. neg tmp1, tmp1
  146. add tmp1, tmp1, #8
  147. and tmp2, src2, #7
  148. neg tmp2, tmp2
  149. add tmp2, tmp2, #8
  150. subs tmp3, tmp1, tmp2
  151. csel pos, tmp1, tmp2, hi /*Choose the maximum. */
  152. /*
  153. * Here, limit is not less than 8, so directly run .Ltinycmp
  154. * without checking the limit.*/
  155. sub limit, limit, pos
  156. .Ltinycmp:
  157. ldrb data1w, [src1], #1
  158. ldrb data2w, [src2], #1
  159. subs pos, pos, #1
  160. ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
  161. ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
  162. b.eq .Ltinycmp
  163. cbnz pos, 1f /*find the null or unequal...*/
  164. cmp data1w, #1
  165. ccmp data1w, data2w, #0, cs
  166. b.eq .Lstart_align /*the last bytes are equal....*/
  167. 1:
  168. sub result, data1, data2
  169. ret
  170. .Lstart_align:
  171. lsr limit_wd, limit, #3
  172. cbz limit_wd, .Lremain8
  173. /*process more leading bytes to make str1 aligned...*/
  174. ands xzr, src1, #7
  175. b.eq .Lrecal_offset
  176. add src1, src1, tmp3 /*tmp3 is positive in this branch.*/
  177. add src2, src2, tmp3
  178. ldr data1, [src1], #8
  179. ldr data2, [src2], #8
  180. sub limit, limit, tmp3
  181. lsr limit_wd, limit, #3
  182. subs limit_wd, limit_wd, #1
  183. sub tmp1, data1, zeroones
  184. orr tmp2, data1, #REP8_7f
  185. eor diff, data1, data2 /* Non-zero if differences found. */
  186. csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
  187. bics has_nul, tmp1, tmp2
  188. ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/
  189. b.ne .Lunequal_proc
  190. /*How far is the current str2 from the alignment boundary...*/
  191. and tmp3, tmp3, #7
  192. .Lrecal_offset:
  193. neg pos, tmp3
  194. .Lloopcmp_proc:
  195. /*
  196. * Divide the eight bytes into two parts. First,backwards the src2
  197. * to an alignment boundary,load eight bytes from the SRC2 alignment
  198. * boundary,then compare with the relative bytes from SRC1.
  199. * If all 8 bytes are equal,then start the second part's comparison.
  200. * Otherwise finish the comparison.
  201. * This special handle can garantee all the accesses are in the
  202. * thread/task space in avoid to overrange access.
  203. */
  204. ldr data1, [src1,pos]
  205. ldr data2, [src2,pos]
  206. sub tmp1, data1, zeroones
  207. orr tmp2, data1, #REP8_7f
  208. bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
  209. eor diff, data1, data2 /* Non-zero if differences found. */
  210. csinv endloop, diff, xzr, eq
  211. cbnz endloop, .Lunequal_proc
  212. /*The second part process*/
  213. ldr data1, [src1], #8
  214. ldr data2, [src2], #8
  215. subs limit_wd, limit_wd, #1
  216. sub tmp1, data1, zeroones
  217. orr tmp2, data1, #REP8_7f
  218. eor diff, data1, data2 /* Non-zero if differences found. */
  219. csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
  220. bics has_nul, tmp1, tmp2
  221. ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/
  222. b.eq .Lloopcmp_proc
  223. .Lunequal_proc:
  224. orr syndrome, diff, has_nul
  225. cbz syndrome, .Lremain8
  226. .Lcal_cmpresult:
  227. /*
  228. * reversed the byte-order as big-endian,then CLZ can find the most
  229. * significant zero bits.
  230. */
  231. CPU_LE( rev syndrome, syndrome )
  232. CPU_LE( rev data1, data1 )
  233. CPU_LE( rev data2, data2 )
  234. /*
  235. * For big-endian we cannot use the trick with the syndrome value
  236. * as carry-propagation can corrupt the upper bits if the trailing
  237. * bytes in the string contain 0x01.
  238. * However, if there is no NUL byte in the dword, we can generate
  239. * the result directly. We can't just subtract the bytes as the
  240. * MSB might be significant.
  241. */
  242. CPU_BE( cbnz has_nul, 1f )
  243. CPU_BE( cmp data1, data2 )
  244. CPU_BE( cset result, ne )
  245. CPU_BE( cneg result, result, lo )
  246. CPU_BE( ret )
  247. CPU_BE( 1: )
  248. /* Re-compute the NUL-byte detection, using a byte-reversed value.*/
  249. CPU_BE( rev tmp3, data1 )
  250. CPU_BE( sub tmp1, tmp3, zeroones )
  251. CPU_BE( orr tmp2, tmp3, #REP8_7f )
  252. CPU_BE( bic has_nul, tmp1, tmp2 )
  253. CPU_BE( rev has_nul, has_nul )
  254. CPU_BE( orr syndrome, diff, has_nul )
  255. /*
  256. * The MS-non-zero bit of the syndrome marks either the first bit
  257. * that is different, or the top bit of the first zero byte.
  258. * Shifting left now will bring the critical information into the
  259. * top bits.
  260. */
  261. clz pos, syndrome
  262. lsl data1, data1, pos
  263. lsl data2, data2, pos
  264. /*
  265. * But we need to zero-extend (char is unsigned) the value and then
  266. * perform a signed 32-bit subtraction.
  267. */
  268. lsr data1, data1, #56
  269. sub result, data1, data2, lsr #56
  270. ret
  271. .Lremain8:
  272. /* Limit % 8 == 0 => all bytes significant. */
  273. ands limit, limit, #7
  274. b.eq .Lret0
  275. .Ltiny8proc:
  276. ldrb data1w, [src1], #1
  277. ldrb data2w, [src2], #1
  278. subs limit, limit, #1
  279. ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */
  280. ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
  281. b.eq .Ltiny8proc
  282. sub result, data1, data2
  283. ret
  284. .Lret0:
  285. mov result, #0
  286. ret
  287. ENDPIPROC(strncmp)