atomic_asm_32.S 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. /*
  2. * Copyright 2010 Tilera Corporation. All Rights Reserved.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public License
  6. * as published by the Free Software Foundation, version 2.
  7. *
  8. * This program is distributed in the hope that it will be useful, but
  9. * WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  11. * NON INFRINGEMENT. See the GNU General Public License for
  12. * more details.
  13. *
  14. * Support routines for atomic operations. Each function takes:
  15. *
  16. * r0: address to manipulate
  17. * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
  18. * r2: new value to write, or for cmpxchg/add_unless, value to compare against
  19. * r3: (cmpxchg/xchg_add_unless) new value to write or add;
  20. * (atomic64 ops) high word of the 64-bit value whose low word is in r2
  21. * r4/r5: (cmpxchg64/add_unless64) low/high words of new value to write or add
  22. *
  23. * The 32-bit routines return a "struct __get_user" so that the futex code
  24. * has an opportunity to return -EFAULT to the user if needed.
  25. * The 64-bit routines just return a "long long" with the value,
  26. * since they are only used from kernel space and don't expect to fault.
  27. * Support for 16-bit ops is included in the framework but we don't provide
  28. * any (x86_64 has an atomic_inc_short(), so we might want to some day).
  29. *
  30. * Note that the caller is advised to issue a suitable L1 or L2
  31. * prefetch on the address being manipulated to avoid extra stalls.
  32. * In addition, the hot path is on two icache lines, and we start with
  33. * a jump to the second line to make sure they are both in cache so
  34. * that we never stall waiting on icache fill while holding the lock.
  35. * (This doesn't work out with most 64-bit ops, since they consume
  36. * too many bundles, so may take an extra i-cache stall.)
  37. *
  38. * These routines set the INTERRUPT_CRITICAL_SECTION bit, just
  39. * like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt
  40. * the code, just page faults.
  41. *
  42. * If the load or store faults in a way that can be directly fixed in
  43. * the do_page_fault_ics() handler (e.g. a vmalloc reference) we fix it
  44. * directly, return to the instruction that faulted, and retry it.
  45. *
  46. * If the load or store faults in a way that potentially requires us
  47. * to release the atomic lock, then retry (e.g. a migrating PTE), we
  48. * reset the PC in do_page_fault_ics() to the "tns" instruction so
  49. * that on return we will reacquire the lock and restart the op. We
  50. * are somewhat overloading the exception_table_entry notion by doing
  51. * this, since those entries are not normally used for migrating PTEs.
  52. *
  53. * If the main page fault handler discovers a bad address, it will see
  54. * the PC pointing to the "tns" instruction (due to the earlier
  55. * exception_table_entry processing in do_page_fault_ics), and
  56. * re-reset the PC to the fault handler, atomic_bad_address(), which
  57. * effectively takes over from the atomic op and can either return a
  58. * bad "struct __get_user" (for user addresses) or can just panic (for
  59. * bad kernel addresses).
  60. *
  61. * Note that if the value we would store is the same as what we
  62. * loaded, we bypass the store. Other platforms with true atomics can
  63. * make the guarantee that a non-atomic __clear_bit(), for example,
  64. * can safely race with an atomic test_and_set_bit(); this example is
  65. * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do
  66. * that on Tile since the "atomic" op is really just a
  67. * read/modify/write, and can race with the non-atomic
  68. * read/modify/write. However, if we can short-circuit the write when
  69. * it is not needed, in the atomic case, we avoid the race.
  70. */
  71. #include <linux/linkage.h>
  72. #include <asm/atomic_32.h>
  73. #include <asm/page.h>
  74. #include <asm/processor.h>
  /*
   * Everything from here to __end_atomic_asm_code (last line of this
   * file) is emitted into the .text.atomic section.  The start/end
   * symbols bracket the atomic routines; presumably the fault path
   * uses them to recognise a PC inside this code -- confirm against
   * do_page_fault_ics().
   */
  75. .section .text.atomic,"ax"
  76. ENTRY(__start_atomic_asm_code)
  /*
   * atomic_op: emit one lock-protected read/modify/write routine named
   * __atomic\name into .text.atomic.
   *
   * Macro arguments:
   *   name      suffix appended to "__atomic" to form the symbol name
   *   bitwidth  16, 32 or 64; selects lh/sh versus lw/sw, and for 64
   *             adds the load/store of the second word at r0 + 4
   *   body      instruction(s) expanded between the load and the
   *             compare; must leave the value to store in r24 (and r25
   *             for 64-bit) and may branch to 3f to skip the store
   *
   * Register use inside the expansion:
   *   r0       address operated on; holds the old value on return
   *   r1       lock pointer on entry; on return the old high word
   *            (64-bit) or zero (other widths)
   *   r21      result of the tns on the lock word
   *   r22/r23  old value read from memory (low/high word)
   *   r24/r25  value to store (low/high word); r24 is also the 1
   *            written to INTERRUPT_CRITICAL_SECTION
   *   r23/r25  maximum/current backoff in cycles while spinning (SMP)
   *   r26/r27  comparison scratch
   *   r28      r0 + 4
   *
   * Local labels: 1 = load, 2 = store, 3 = unlock and return,
   * 4 = lock acquisition, 5/6 = backoff loop.
   *
   * For bitwidth 32 only, __ex_table entries map a fault at the load
   * (1) or the store (2) back to the routine entry, and map the routine
   * entry to __atomic_bad_address.  Because of that restart, \body can
   * run more than once for a single call.
   */
  77. .macro atomic_op, name, bitwidth, body
  78. .align 64
  79. STD_ENTRY_SECTION(__atomic\name, .text.atomic)
  80. {
  81. movei r24, 1 /* value written to INTERRUPT_CRITICAL_SECTION at 4: */
  82. j 4f /* branch to second cache line */
  83. }
  84. 1: { /* reached once the lock is held (or always, if !CONFIG_SMP) */
  85. .ifc \bitwidth,16
  86. lh r22, r0 /* r22 = old 16-bit value */
  87. .else
  88. lw r22, r0 /* r22 = old value (low word for 64-bit) */
  89. addi r28, r0, 4 /* r28 = address of the following word */
  90. .endif
  91. }
  92. .ifc \bitwidth,64
  93. lw r23, r28 /* r23 = old high word */
  94. .endif
  95. \body /* set r24, and r25 if 64-bit */
  96. {
  97. seq r26, r22, r24 /* r26 = 1 if the low word is unchanged */
  98. seq r27, r23, r25 /* r27 = 1 if the high word is unchanged (only tested for 64-bit) */
  99. }
  100. .ifc \bitwidth,64
  101. bbnst r27, 2f /* high word differs: must store */
  102. .endif
  103. bbs r26, 3f /* skip write-back if it's the same value */
  104. 2: {
  105. .ifc \bitwidth,16
  106. sh r0, r24 /* store new 16-bit value */
  107. .else
  108. sw r0, r24 /* store new value (low word for 64-bit) */
  109. .endif
  110. }
  111. .ifc \bitwidth,64
  112. sw r28, r25 /* store new high word */
  113. .endif
  114. mf /* fence the store(s) before the lock word is cleared below */
  115. 3: {
  116. move r0, r22 /* return the old value (low word) */
  117. .ifc \bitwidth,64
  118. move r1, r23 /* return the old high word */
  119. .else
  120. move r1, zero /* second return word is zero for 16/32-bit ops */
  121. .endif
  122. sw ATOMIC_LOCK_REG_NAME, zero /* release the lock */
  123. }
  124. mtspr INTERRUPT_CRITICAL_SECTION, zero /* leave the critical section */
  125. jrp lr
  126. 4: {
  127. move ATOMIC_LOCK_REG_NAME, r1 /* lock pointer passed in r1 */
  128. mtspr INTERRUPT_CRITICAL_SECTION, r24 /* enter the critical section (r24 == 1) */
  129. }
  130. #ifndef CONFIG_SMP
  131. j 1b /* no atomic locks */
  132. #else
  133. {
  134. tns r21, ATOMIC_LOCK_REG_NAME /* first attempt to take the lock */
  135. moveli r23, 2048 /* maximum backoff time in cycles */
  136. }
  137. {
  138. bzt r21, 1b /* branch if lock acquired */
  139. moveli r25, 32 /* starting backoff time in cycles */
  140. }
  141. 5: mtspr INTERRUPT_CRITICAL_SECTION, zero /* drop the critical section while waiting */
  142. mfspr r26, CYCLE_LOW /* get start point for this backoff */
  143. 6: mfspr r22, CYCLE_LOW /* test to see if we've backed off enough */
  144. sub r22, r22, r26 /* cycles elapsed since the start point */
  145. slt r22, r22, r25 /* still less than the current backoff? */
  146. bbst r22, 6b /* yes: keep spinning */
  147. {
  148. mtspr INTERRUPT_CRITICAL_SECTION, r24 /* re-enter the critical section before retrying */
  149. shli r25, r25, 1 /* double the backoff; retry the tns */
  150. }
  151. {
  152. tns r21, ATOMIC_LOCK_REG_NAME
  153. slt r26, r23, r25 /* is the proposed backoff too big? */
  154. }
  155. {
  156. bzt r21, 1b /* branch if lock acquired */
  157. mvnz r25, r26, r23 /* if too big, fall back to the maximum in r23 */
  158. }
  159. j 5b /* lock still busy: back off again */
  160. #endif
  161. STD_ENDPROC(__atomic\name)
  162. .ifc \bitwidth,32
  163. .pushsection __ex_table,"a"
  164. .align 4
  165. .word 1b, __atomic\name
  166. .word 2b, __atomic\name
  167. .word __atomic\name, __atomic_bad_address
  168. .popsection
  169. .endif
  170. .endm
  171. /*
  172. * Use __atomic32 prefix to avoid collisions with GCC builtin __atomic functions.
  173. */
  /*
   * 32-bit operations.  The old value is in r22.  r2 is the operand,
   * or for cmpxchg and xchg_add_unless the value compared against r22;
   * for those two, r3 is the new value / addend.  Each body leaves the
   * value to store in r24; a branch to 3f skips the store.
   *
   * A body can run more than once for a single call: the __ex_table
   * entries emitted by atomic_op send a faulting load or store back to
   * the start of the routine.  Bodies must therefore leave their input
   * registers (r2/r3) untouched.  fetch_andn builds the inverted mask
   * in the scratch register r26 instead of inverting r2 in place, so a
   * restart after a faulting store still computes "old & ~mask".
   */
  174. atomic_op 32_cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }"
  175. atomic_op 32_xchg, 32, "move r24, r2"
  176. atomic_op 32_xchg_add, 32, "add r24, r22, r2"
  177. atomic_op 32_xchg_add_unless, 32, \
  178. "sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }"
  179. atomic_op 32_fetch_or, 32, "or r24, r22, r2"
  180. atomic_op 32_fetch_and, 32, "and r24, r22, r2"
  181. atomic_op 32_fetch_andn, 32, "nor r26, r2, zero; and r24, r22, r26"
  182. atomic_op 32_fetch_xor, 32, "xor r24, r22, r2"
  /*
   * 64-bit operations.  The old value is r22 (low) / r23 (high); the
   * value to store goes in r24 (low) / r25 (high).  r2/r3 hold the
   * operand, or for cmpxchg and xchg_add_unless the value to compare
   * against, in which case r4/r5 hold the new value / addend.
   *
   * cmpxchg stores only when BOTH words match, so it may bail out to
   * 3f as soon as either word differs.  xchg_add_unless is the
   * opposite: it must skip the add only when BOTH words match, so the
   * two "sne" results are OR-ed together before a single branch to 3f.
   * Branching on each half separately would skip the add whenever just
   * one half happened to equal the compare value.
   *
   * The add variants propagate the carry out of the low word with
   * slt_u (new low < old low means the low add wrapped).
   */
  183. atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \
  184. { bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }"
  185. atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }"
  186. atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \
  187. slt_u r26, r24, r22; add r25, r25, r26"
  188. atomic_op 64_xchg_add_unless, 64, \
  189. "{ sne r26, r22, r2; sne r27, r23, r3 }; \
  190. or r26, r26, r27; \
  191. { bbns r26, 3f; add r24, r22, r4 }; \
  192. { add r25, r23, r5; slt_u r26, r24, r22 }; add r25, r25, r26"
  193. atomic_op 64_fetch_or, 64, "{ or r24, r22, r2; or r25, r23, r3 }"
  194. atomic_op 64_fetch_and, 64, "{ and r24, r22, r2; and r25, r23, r3 }"
  195. atomic_op 64_fetch_xor, 64, "{ xor r24, r22, r2; xor r25, r23, r3 }"
  /*
   * Trailing return placed after the last generated routine, followed
   * by the end marker that pairs with __start_atomic_asm_code.  Every
   * routine above ends in its own "jrp lr" or "j", so this instruction
   * is not on any of their paths; per the original note it exists for
   * the backtracer's benefit (exact reason not visible here).
   */
  196. jrp lr /* happy backtracer */
  197. ENTRY(__end_atomic_asm_code)