/* sha256_sse2.c */
  1. #include "cpusupport.h"
  2. #ifdef CPUSUPPORT_X86_SSE2
  3. /**
  4. * CPUSUPPORT CFLAGS: X86_SSE2
  5. */
  6. #include <assert.h>
  7. #include <stdint.h>
  8. #include <string.h>
  9. #include <emmintrin.h>
  10. #include "sha256_sse2.h"
  /**
   * mm_bswap_epi32(a):
   * Byte-swap each 32-bit word of ${a}: every one of the four 32-bit lanes has
   * its four bytes reversed, and the lanes themselves stay in place.
   */
  static inline __m128i
  mm_bswap_epi32(__m128i a)
  {
      __m128i swapped;

      /* Exchange the two bytes inside every 16-bit word. */
      swapped = _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8));

      /*
       * Exchange the two 16-bit words inside every 32-bit word.  The low
       * and the high 64 bits each need their own 16-bit shuffle.
       */
      swapped = _mm_shufflelo_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1));
      swapped = _mm_shufflehi_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1));

      return (swapped);
  }
  /*
   * SHA256 round constants: Krnd[t] is added to the message schedule word W[t]
   * in round t (these are the K constants of FIPS 180-4, section 4.2.2).
   */
  static const uint32_t Krnd[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  };
  /*
   * Elementary functions used by SHA256.  All of them operate on 32-bit
   * unsigned values (uint32_t):
   *   Ch(x, y, z)  - bitwise choice: take the bit of y where x is 1, else z.
   *   Maj(x, y, z) - bitwise majority of x, y and z.
   *   ROTR(x, n)   - rotate x right by n bits; n must satisfy 0 < n < 32.
   *   S0(x), S1(x) - the two rotate-and-XOR mixing functions used by RND.
   * Every argument is parenthesized so that expressions such as
   * ROTR(a ^ b, 4 + 4) expand correctly; arguments are evaluated more than
   * once and therefore must not have side effects.
   */
  #define Ch(x, y, z)     (((x) & ((y) ^ (z))) ^ (z))
  #define Maj(x, y, z)    (((x) & ((y) | (z))) | ((y) & (z)))
  #define ROTR(x, n)      (((x) >> (n)) | ((x) << (32 - (n))))
  #define S0(x)           (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
  #define S1(x)           (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
  /*
   * SHA256 round function.  ${a} .. ${h} must be lvalues naming the eight
   * 32-bit working variables and ${k} is the per-round addend (schedule word
   * plus round constant).  Only ${d} and ${h} are modified; the usual shift of
   * the working variables is left to the caller (see RNDr).  The body is wrapped
   * in do { } while (0) so that the macro behaves as a single statement.
   */
  #define RND(a, b, c, d, e, f, g, h, k) \
      do { \
          (h) += S1(e) + Ch(e, f, g) + (k); \
          (d) += (h); \
          (h) += S0(a) + Maj(a, b, c); \
      } while (0)

  /*
   * Adjusted round function for rotating state.  Instead of moving the eight
   * working variables after every round, the indices into ${S} are rotated:
   * in round ${i} the role of "a" is played by S[(64 - i) % 8], "b" by
   * S[(65 - i) % 8], and so on.  ${i} is the round number within a group (this
   * file uses the literals 0 .. 15) and ${ii} is the offset of the group, so
   * W[i + ii] and Krnd[i + ii] are the inputs for that round.
   */
  #define RNDr(S, W, i, ii) \
      RND(S[(64 - (i)) % 8], S[(65 - (i)) % 8], \
          S[(66 - (i)) % 8], S[(67 - (i)) % 8], \
          S[(68 - (i)) % 8], S[(69 - (i)) % 8], \
          S[(70 - (i)) % 8], S[(71 - (i)) % 8], \
          W[(i) + (ii)] + Krnd[(i) + (ii)])
  /*
   * Message schedule computation: helpers operating on four 32-bit lanes.
   *   SHR32(x, n)  - logical right shift of every lane by n bits.
   *   ROTR32(x, n) - rotate every lane right by n bits (0 < n < 32).
   *   s0_128(x)    - ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3) applied to every lane.
   * ${x} is evaluated more than once, so it must not have side effects.
   */
  #define SHR32(x, n)     (_mm_srli_epi32((x), (n)))
  #define ROTR32(x, n) \
      (_mm_or_si128(SHR32((x), (n)), _mm_slli_epi32((x), (32 - (n)))))
  #define s0_128(x) \
      (_mm_xor_si128( \
          _mm_xor_si128(ROTR32((x), 7), ROTR32((x), 18)), \
          SHR32((x), 3)))
  /**
   * s1_128_high(a):
   * Apply s1(x) = ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10) to the two lowest
   * 32-bit lanes of ${a} and return the results in the two highest lanes:
   *     dst[31:0]   := 0
   *     dst[63:32]  := 0
   *     dst[95:64]  := s1(a[31:0])
   *     dst[127:96] := s1(a[63:32])
   * Lanes 2 and 3 of ${a} are ignored.
   */
  static inline __m128i
  s1_128_high(__m128i a)
  {
      __m128i dup;
      __m128i mix;

      /*
       * Duplicate each input word so that lanes are {a0, a0, a1, a1}.  A
       * 64-bit right shift of a duplicated word leaves the 32-bit rotation
       * in the even lane (0 and 2); the odd lanes (1 and 3) hold junk.
       */
      dup = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0));
      mix = _mm_xor_si128(_mm_srli_epi64(dup, 17), _mm_srli_epi64(dup, 19));

      /* XOR in the plain shift; lanes 1 and 3 are still junk. */
      mix = _mm_xor_si128(mix, _mm_srli_epi32(dup, 10));

      /* Gather the good lanes (0 and 2), then move them to the top half. */
      mix = _mm_shuffle_epi32(mix, _MM_SHUFFLE(2, 0, 2, 0));
      mix = _mm_slli_si128(mix, 8);

      return (mix);
  }
  /**
   * s1_128_low(a):
   * Apply s1(x) = ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10) to the two highest
   * 32-bit lanes of ${a} and return the results in the two lowest lanes:
   *     dst[31:0]   := s1(a[95:64])
   *     dst[63:32]  := s1(a[127:96])
   *     dst[95:64]  := 0
   *     dst[127:96] := 0
   * Lanes 0 and 1 of ${a} are ignored.
   */
  static inline __m128i
  s1_128_low(__m128i a)
  {
      __m128i dup;
      __m128i mix;

      /*
       * Duplicate each input word so that lanes are {a2, a2, a3, a3}.  A
       * 64-bit right shift of a duplicated word leaves the 32-bit rotation
       * in the even lane (0 and 2); the odd lanes (1 and 3) hold junk.
       */
      dup = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));
      mix = _mm_xor_si128(_mm_srli_epi64(dup, 17), _mm_srli_epi64(dup, 19));

      /* XOR in the plain shift; lanes 1 and 3 are still junk. */
      mix = _mm_xor_si128(mix, _mm_srli_epi32(dup, 10));

      /* Gather the good lanes (0 and 2), then keep them in the bottom half. */
      mix = _mm_shuffle_epi32(mix, _MM_SHUFFLE(2, 0, 2, 0));
      mix = _mm_srli_si128(mix, 8);

      return (mix);
  }
  /**
   * SPAN_ONE_THREE(a, b):
   * Combine the upper three words of ${a} with the lowest word of ${b}.  This
   * can also be thought of as returning bits [159:32] of the 256-bit value
   * (b[127:0] a[127:0]).  In other words, set:
   *     dst[31:0]   := a[63:32]
   *     dst[63:32]  := a[95:64]
   *     dst[95:64]  := a[127:96]
   *     dst[127:96] := b[31:0]
   * The move_ss step replaces the lowest word of ${a} with the lowest word of
   * ${b}; the shuffle then rotates the four words down by one position.
   */
  #define SPAN_ONE_THREE(a, b) \
      (_mm_shuffle_epi32( \
          _mm_castps_si128( \
              _mm_move_ss(_mm_castsi128_ps((a)), _mm_castsi128_ps((b)))), \
          _MM_SHUFFLE(0, 3, 2, 1)))
  110. /**
  111. * MSG4(X0, X1, X2, X3):
  112. * Calculate the next four values of the message schedule. If we define
  113. * ${W[j]} as the first unknown value in the message schedule, then the input
  114. * arguments are:
  115. * X0 = W[j - 16] : W[j - 13]
  116. * X1 = W[j - 12] : W[j - 9]
  117. * X2 = W[j - 8] : W[j - 5]
  118. * X3 = W[j - 4] : W[j - 1]
  119. * This function therefore calculates:
  120. * X4 = W[j + 0] : W[j + 3]
  121. */
  122. static inline __m128i
  123. MSG4(__m128i X0, __m128i X1, __m128i X2, __m128i X3)
  124. {
  125. __m128i X4;
  126. __m128i Xj_minus_seven, Xj_minus_fifteen;
  127. /* Set up variables which span X values. */
  128. Xj_minus_seven = SPAN_ONE_THREE(X2, X3);
  129. Xj_minus_fifteen = SPAN_ONE_THREE(X0, X1);
  130. /* Begin computing X4. */
  131. X4 = _mm_add_epi32(X0, Xj_minus_seven);
  132. X4 = _mm_add_epi32(X4, s0_128(Xj_minus_fifteen));
  133. /* First half of s1. */
  134. X4 = _mm_add_epi32(X4, s1_128_low(X3));
  135. /* Second half of s1; this depends on the above value of X4. */
  136. X4 = _mm_add_epi32(X4, s1_128_high(X4));
  137. return (X4);
  138. }
  139. /**
  140. * SHA256_Transform_sse2(state, block):
  141. * Compute the SHA256 block compression function, transforming ${state} using
  142. * the data in ${block}. This implementation uses x86 SSE2 instructions, and
  143. * should only be used if _SSE2 is defined and cpusupport_x86_sse2() returns
  144. * nonzero. The arrays W and S may be filled with sensitive data, and should
  145. * be cleared by the callee.
  146. */
  147. #ifdef POSIXFAIL_ABSTRACT_DECLARATOR
  148. void
  149. SHA256_Transform_sse2(uint32_t state[8], const uint8_t block[64],
  150. uint32_t W[64], uint32_t S[8])
  151. #else
  152. void
  153. SHA256_Transform_sse2(uint32_t state[static restrict 8],
  154. const uint8_t block[static restrict 64], uint32_t W[static restrict 64],
  155. uint32_t S[static restrict 8])
  156. #endif
  157. {
  158. __m128i Y[4];
  159. int i;
  160. /* 1. Prepare the first part of the message schedule W. */
  161. Y[0] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[0]));
  162. _mm_storeu_si128((__m128i *)&W[0], Y[0]);
  163. Y[1] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[16]));
  164. _mm_storeu_si128((__m128i *)&W[4], Y[1]);
  165. Y[2] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[32]));
  166. _mm_storeu_si128((__m128i *)&W[8], Y[2]);
  167. Y[3] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[48]));
  168. _mm_storeu_si128((__m128i *)&W[12], Y[3]);
  169. /* 2. Initialize working variables. */
  170. memcpy(S, state, 32);
  171. /* 3. Mix. */
  172. for (i = 0; i < 64; i += 16) {
  173. RNDr(S, W, 0, i);
  174. RNDr(S, W, 1, i);
  175. RNDr(S, W, 2, i);
  176. RNDr(S, W, 3, i);
  177. RNDr(S, W, 4, i);
  178. RNDr(S, W, 5, i);
  179. RNDr(S, W, 6, i);
  180. RNDr(S, W, 7, i);
  181. RNDr(S, W, 8, i);
  182. RNDr(S, W, 9, i);
  183. RNDr(S, W, 10, i);
  184. RNDr(S, W, 11, i);
  185. RNDr(S, W, 12, i);
  186. RNDr(S, W, 13, i);
  187. RNDr(S, W, 14, i);
  188. RNDr(S, W, 15, i);
  189. if (i == 48)
  190. break;
  191. Y[0] = MSG4(Y[0], Y[1], Y[2], Y[3]);
  192. _mm_storeu_si128((__m128i *)&W[16 + i + 0], Y[0]);
  193. Y[1] = MSG4(Y[1], Y[2], Y[3], Y[0]);
  194. _mm_storeu_si128((__m128i *)&W[16 + i + 4], Y[1]);
  195. Y[2] = MSG4(Y[2], Y[3], Y[0], Y[1]);
  196. _mm_storeu_si128((__m128i *)&W[16 + i + 8], Y[2]);
  197. Y[3] = MSG4(Y[3], Y[0], Y[1], Y[2]);
  198. _mm_storeu_si128((__m128i *)&W[16 + i + 12], Y[3]);
  199. }
  200. /* 4. Mix local working variables into global state. */
  201. for (i = 0; i < 8; i++)
  202. state[i] += S[i];
  203. }
  204. #endif /* CPUSUPPORT_X86_SSE2 */