/* sha1_mb_mgr_flush_avx2.S */
/*
 * Flush routine for SHA1 multibuffer
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	James Guilford <james.guilford@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
  54. #include <linux/linkage.h>
  55. #include <asm/frame.h>
  56. #include "sha1_mb_mgr_datastruct.S"
  57. .extern sha1_x8_avx2
  58. # LINUX register definitions
  59. #define arg1 %rdi
  60. #define arg2 %rsi
  61. # Common definitions
  62. #define state arg1
  63. #define job arg2
  64. #define len2 arg2
  65. # idx must be a register not clobbered by sha1_x8_avx2
  66. #define idx %r8
  67. #define DWORD_idx %r8d
  68. #define unused_lanes %rbx
  69. #define lane_data %rbx
  70. #define tmp2 %rbx
  71. #define tmp2_w %ebx
  72. #define job_rax %rax
  73. #define tmp1 %rax
  74. #define size_offset %rax
  75. #define tmp %rax
  76. #define start_offset %rax
  77. #define tmp3 %arg1
  78. #define extra_blocks %arg2
  79. #define p %arg2
  80. .macro LABEL prefix n
  81. \prefix\n\():
  82. .endm
  83. .macro JNE_SKIP i
  84. jne skip_\i
  85. .endm
  86. .altmacro
  87. .macro SET_OFFSET _offset
  88. offset = \_offset
  89. .endm
  90. .noaltmacro
  91. # JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
  92. # arg 1 : rcx : state
  93. ENTRY(sha1_mb_mgr_flush_avx2)
  94. FRAME_BEGIN
  95. push %rbx
  96. # If bit (32+3) is set, then all lanes are empty
  97. mov _unused_lanes(state), unused_lanes
  98. bt $32+3, unused_lanes
  99. jc return_null
  100. # find a lane with a non-null job
  101. xor idx, idx
  102. offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
  103. cmpq $0, offset(state)
  104. cmovne one(%rip), idx
  105. offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
  106. cmpq $0, offset(state)
  107. cmovne two(%rip), idx
  108. offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
  109. cmpq $0, offset(state)
  110. cmovne three(%rip), idx
  111. offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
  112. cmpq $0, offset(state)
  113. cmovne four(%rip), idx
  114. offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
  115. cmpq $0, offset(state)
  116. cmovne five(%rip), idx
  117. offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
  118. cmpq $0, offset(state)
  119. cmovne six(%rip), idx
  120. offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
  121. cmpq $0, offset(state)
  122. cmovne seven(%rip), idx
  123. # copy idx to empty lanes
  124. copy_lane_data:
  125. offset = (_args + _data_ptr)
  126. mov offset(state,idx,8), tmp
  127. I = 0
  128. .rep 8
  129. offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
  130. cmpq $0, offset(state)
  131. .altmacro
  132. JNE_SKIP %I
  133. offset = (_args + _data_ptr + 8*I)
  134. mov tmp, offset(state)
  135. offset = (_lens + 4*I)
  136. movl $0xFFFFFFFF, offset(state)
  137. LABEL skip_ %I
  138. I = (I+1)
  139. .noaltmacro
  140. .endr
  141. # Find min length
  142. vmovdqu _lens+0*16(state), %xmm0
  143. vmovdqu _lens+1*16(state), %xmm1
  144. vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
  145. vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
  146. vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
  147. vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
  148. vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
  149. vmovd %xmm2, DWORD_idx
  150. mov idx, len2
  151. and $0xF, idx
  152. shr $4, len2
  153. jz len_is_0
  154. vpand clear_low_nibble(%rip), %xmm2, %xmm2
  155. vpshufd $0, %xmm2, %xmm2
  156. vpsubd %xmm2, %xmm0, %xmm0
  157. vpsubd %xmm2, %xmm1, %xmm1
  158. vmovdqu %xmm0, _lens+0*16(state)
  159. vmovdqu %xmm1, _lens+1*16(state)
  160. # "state" and "args" are the same address, arg1
  161. # len is arg2
  162. call sha1_x8_avx2
  163. # state and idx are intact
  164. len_is_0:
  165. # process completed job "idx"
  166. imul $_LANE_DATA_size, idx, lane_data
  167. lea _ldata(state, lane_data), lane_data
  168. mov _job_in_lane(lane_data), job_rax
  169. movq $0, _job_in_lane(lane_data)
  170. movl $STS_COMPLETED, _status(job_rax)
  171. mov _unused_lanes(state), unused_lanes
  172. shl $4, unused_lanes
  173. or idx, unused_lanes
  174. mov unused_lanes, _unused_lanes(state)
  175. movl $0xFFFFFFFF, _lens(state, idx, 4)
  176. vmovd _args_digest(state , idx, 4) , %xmm0
  177. vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
  178. vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
  179. vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
  180. movl _args_digest+4*32(state, idx, 4), tmp2_w
  181. vmovdqu %xmm0, _result_digest(job_rax)
  182. offset = (_result_digest + 1*16)
  183. mov tmp2_w, offset(job_rax)
  184. return:
  185. pop %rbx
  186. FRAME_END
  187. ret
  188. return_null:
  189. xor job_rax, job_rax
  190. jmp return
  191. ENDPROC(sha1_mb_mgr_flush_avx2)
  192. #################################################################
  193. .align 16
  194. ENTRY(sha1_mb_mgr_get_comp_job_avx2)
  195. push %rbx
  196. ## if bit 32+3 is set, then all lanes are empty
  197. mov _unused_lanes(state), unused_lanes
  198. bt $(32+3), unused_lanes
  199. jc .return_null
  200. # Find min length
  201. vmovdqu _lens(state), %xmm0
  202. vmovdqu _lens+1*16(state), %xmm1
  203. vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
  204. vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
  205. vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
  206. vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
  207. vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
  208. vmovd %xmm2, DWORD_idx
  209. test $~0xF, idx
  210. jnz .return_null
  211. # process completed job "idx"
  212. imul $_LANE_DATA_size, idx, lane_data
  213. lea _ldata(state, lane_data), lane_data
  214. mov _job_in_lane(lane_data), job_rax
  215. movq $0, _job_in_lane(lane_data)
  216. movl $STS_COMPLETED, _status(job_rax)
  217. mov _unused_lanes(state), unused_lanes
  218. shl $4, unused_lanes
  219. or idx, unused_lanes
  220. mov unused_lanes, _unused_lanes(state)
  221. movl $0xFFFFFFFF, _lens(state, idx, 4)
  222. vmovd _args_digest(state, idx, 4), %xmm0
  223. vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
  224. vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
  225. vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
  226. movl _args_digest+4*32(state, idx, 4), tmp2_w
  227. vmovdqu %xmm0, _result_digest(job_rax)
  228. movl tmp2_w, _result_digest+1*16(job_rax)
  229. pop %rbx
  230. ret
  231. .return_null:
  232. xor job_rax, job_rax
  233. pop %rbx
  234. ret
  235. ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
  236. .data
  237. .align 16
  238. clear_low_nibble:
  239. .octa 0x000000000000000000000000FFFFFFF0
  240. one:
  241. .quad 1
  242. two:
  243. .quad 2
  244. three:
  245. .quad 3
  246. four:
  247. .quad 4
  248. five:
  249. .quad 5
  250. six:
  251. .quad 6
  252. seven:
  253. .quad 7