memset_64.S
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * fast string operations ("rep stos") to get better performance than the
 * original function. The code is simpler and shorter than the original
 * function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
        .section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
        movq %rdi,%r9
        movq %rdx,%rcx
        andl $7,%edx
        shrq $3,%rcx
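        /* %rcx = count / 8 qwords for "rep stosq"; %edx keeps the
           count % 8 tail bytes for the "rep stosb" below */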
        /* expand byte value */
        movzbl %sil,%esi
        movabs $0x0101010101010101,%rax
        imulq %rsi,%rax
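        /* multiplying by 0x0101010101010101 replicates the low byte into
           all eight lanes, e.g. 0xab * 0x0101010101010101 = 0xabababababababab */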
        rep stosq
        movl %edx,%ecx
        rep stosb
        movq %r9,%rax
        ret
.Lmemset_e:
        .previous
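/* .Lmemset_c..Lmemset_e bounds the replacement body above; its length is
   what the altinstruction_entry at the bottom of this file records */
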
/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
        .section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
        movq %rdi,%r9
        movb %sil,%al
        movq %rdx,%rcx
        rep stosb
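        /* with ERMS, a single "rep stosb" is fast for any size and
           alignment, so no qword splitting or byte expansion is needed;
           "stosb" reads only %al */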
        movq %r9,%rax
        ret
.Lmemset_e_e:
        .previous

ENTRY(memset)
ENTRY(__memset)
        CFI_STARTPROC
        movq %rdi,%r10

        /* expand byte value */
        movzbl %sil,%ecx
        movabs $0x0101010101010101,%rax
        imulq %rcx,%rax
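        /* same byte-replication trick as in .Lmemset_c above */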

        /* align dst */
        movl %edi,%r9d
        andl $7,%r9d
        jnz .Lbad_alignment
        CFI_REMEMBER_STATE
.Lafter_bad_alignment:
        movq %rdx,%rcx
        shrq $6,%rcx
        jz .Lhandle_tail
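        /* %rcx = count / 64: each iteration of the unrolled loop below
           stores one 64-byte block with eight 8-byte moves */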
        .p2align 4
.Lloop_64:
        decq %rcx
        movq %rax,(%rdi)
        movq %rax,8(%rdi)
        movq %rax,16(%rdi)
        movq %rax,24(%rdi)
        movq %rax,32(%rdi)
        movq %rax,40(%rdi)
        movq %rax,48(%rdi)
        movq %rax,56(%rdi)
        leaq 64(%rdi),%rdi
        jnz .Lloop_64
        /* Handle tail in loops. The loops should be faster than
           hard-to-predict jump tables. */
        .p2align 4
.Lhandle_tail:
        movl %edx,%ecx
        andl $63&(~7),%ecx
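        /* $63&(~7) == 0x38 masks the bytes left within the last 64-byte
           block, rounded down to whole qwords */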
        jz .Lhandle_7
        shrl $3,%ecx
        .p2align 4
.Lloop_8:
        decl %ecx
        movq %rax,(%rdi)
        leaq 8(%rdi),%rdi
        jnz .Lloop_8
.Lhandle_7:
        andl $7,%edx
        jz .Lende
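        /* store the final 1..7 bytes one at a time */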
        .p2align 4
.Lloop_1:
        decl %edx
        movb %al,(%rdi)
        leaq 1(%rdi),%rdi
        jnz .Lloop_1
.Lende:
        movq %r10,%rax
        ret

        CFI_RESTORE_STATE
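        /*
         * Destination is not 8-byte aligned. Counts of at most 7 bytes go
         * straight to the byte loop. Otherwise store one unaligned qword
         * covering the misaligned head, advance dst to the next 8-byte
         * boundary, shrink the count accordingly, and rejoin the aligned
         * path; the overlapping bytes are simply written again.
         */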
.Lbad_alignment:
        cmpq $7,%rdx
        jbe .Lhandle_7
        movq %rax,(%rdi)        /* unaligned store */
        movq $8,%r8
        subq %r9,%r8
        addq %r8,%rdi
        subq %r8,%rdx
        jmp .Lafter_bad_alignment
.Lfinal:
        CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

/* Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature.
 * It is recommended to use it when possible.
 *
 * If ERMS is available, patch in the enhanced "rep stosb" variant; if only
 * fast string operations (REP_GOOD) are available, patch in that variant;
 * otherwise keep the original memset function above.
 *
 * In the .altinstructions section, the ERMS entry is placed after the
 * REP_GOOD entry to implement the right patch order.
 */
        .section .altinstructions,"a"
        altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
                             .Lfinal-memset,.Lmemset_e-.Lmemset_c
        altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
                             .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
        .previous
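/* Each entry records the original length (.Lfinal-memset) and the
   replacement length. At boot, apply_alternatives() copies a replacement
   over memset when the CPU has the feature bit set, padding any leftover
   bytes with nops; on an ERMS CPU both entries match, and the ERMS body,
   applied second, wins. */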