sha256_shani.c 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. #include "cpusupport.h"
  2. #if defined(CPUSUPPORT_X86_SHANI) && defined(CPUSUPPORT_X86_SSSE3)
  3. /**
  4. * CPUSUPPORT CFLAGS: X86_SHANI X86_SSSE3
  5. */
  6. #include <immintrin.h>
  7. #include <stdint.h>
  8. #include "sha256_shani.h"
  9. /**
  10. * This code uses intrinsics from the following feature sets:
  11. * SHANI: _mm_sha256msg1_epu32, _mm_sha256msg2_epu32, _mm_sha256rnds2_epu32
  12. * SSSE3: _mm_shuffle_epi8, _mm_alignr_epi8
  13. * SSE2: Everything else
  14. *
  15. * The SSSE3 intrinsics could be avoided at a slight cost by using a few SSE2
  16. * instructions in their place; we have not done this since to our knowledge
  17. * there are presently no CPUs which support the SHANI instruction set but do
  18. * not support SSSE3.
  19. */
  /*
   * Read 16 bytes from ${src} (no alignment is required) and return them as
   * four 32-bit lanes, each lane decoded as a big-endian word.
   */
  static __m128i
  be32dec_128(const uint8_t * src)
  {
      /* Shuffle control: output byte n is taken from input byte (n ^ 3). */
      const __m128i bswap32 = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4,
          11, 10, 9, 8, 15, 14, 13, 12);

      /* Unaligned load, then reverse the bytes within each 32-bit word. */
      return (_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)src),
          bswap32));
  }
  /*
   * Yield the int32_t which has the same 32-bit pattern as the unsigned
   * constant ${a}.  Values with the top bit set are mapped to the matching
   * negative number by subtracting from 0xffffffff first, so the cast to
   * int32_t is only ever applied to a value in [0, 0x7fffffff] on the branch
   * which is selected.  ${a} must be an unsuffixed integer constant, as
   * required by UINT32_C() and INT32_C().
   */
  #define I32(a) ((UINT32_C(a) < UINT32_C(0x80000000)) ? \
      (int32_t)INT32_C(a) : \
      -(int32_t)(UINT32_C(0xffffffff) - UINT32_C(a)) - 1)
  /*
   * Build a vector from four unsigned 32-bit constants.  The arguments are
   * listed from the highest 32-bit lane down to the lowest, which is the
   * argument order of _mm_set_epi32(); each one is converted with I32() since
   * that intrinsic takes signed values.
   */
  #define IMM4(w3, w2, w1, w0) \
      _mm_set_epi32(I32(w3), I32(w2), I32(w1), I32(w0))
  /*
   * RND4(S, W, K0, K1, K2, K3):
   * Run four rounds of SHA256 on the working state ${S} (an array of two
   * vectors), using the four message schedule words in ${W} and the round
   * constants ${K0} .. ${K3}.  The constants are passed on to IMM4() and so
   * must be unsuffixed integer constants.
   */
  #define RND4(S, W, K0, K1, K2, K3) do { \
      /* Schedule words plus round constants; K0 goes in the lowest lane. */ \
      const __m128i WK = _mm_add_epi32(W, IMM4(K3, K2, K1, K0)); \
      \
      /* First two rounds: these use the low two words of WK. */ \
      S[1] = _mm_sha256rnds2_epu32(S[1], S[0], WK); \
      \
      /* Next two rounds: move the high two words of WK down 8 bytes. */ \
      S[0] = _mm_sha256rnds2_epu32(S[0], S[1], _mm_srli_si128(WK, 8)); \
  } while (0)
  /*
   * MSG4(W, i):
   * Compute the ith set of four words of message schedule.  ${W} is treated
   * as a ring of four vectors: slot (i % 4) is both an input and the
   * destination, and slots ((i + 1) % 4), ((i + 2) % 4) and ((i + 3) % 4) are
   * read only.
   *
   * Both parameters are parenthesized in the expansion so that an expression
   * argument (e.g. `1 << 2`) selects the same slots as its value would.
   */
  #define MSG4(W, i) do { \
      /* Start from the destination slot and the slot which follows it. */ \
      (W)[((i) + 0) % 4] = _mm_sha256msg1_epu32((W)[((i) + 0) % 4], \
          (W)[((i) + 1) % 4]); \
      \
      /* \
       * Add the upper three words of slot i + 2 followed by the lowest \
       * word of slot i + 3 (a 4-byte right shift of the two slots \
       * taken together). \
       */ \
      (W)[((i) + 0) % 4] = _mm_add_epi32((W)[((i) + 0) % 4], \
          _mm_alignr_epi8((W)[((i) + 3) % 4], (W)[((i) + 2) % 4], 4)); \
      \
      /* Finish the computation using slot i + 3. */ \
      (W)[((i) + 0) % 4] = _mm_sha256msg2_epu32((W)[((i) + 0) % 4], \
          (W)[((i) + 3) % 4]); \
  } while (0)
  /*
   * RNDMSG(S, W, i, K0, K1, K2, K3):
   * Perform the ith group of 4 rounds of SHA256 on the working state ${S},
   * using slot (i % 4) of the message schedule ring ${W} and the round
   * constants ${K0} .. ${K3}; then, if more message schedule is still needed
   * (i < 12), generate set i + 4 into the slot which was just consumed.
   *
   * ${i} and ${W} are parenthesized in the expansion so that an expression
   * argument (e.g. `2 + 3`) behaves the same as its value.  The constants are
   * passed through untouched because RND4() requires bare integer constants.
   */
  #define RNDMSG(S, W, i, K0, K1, K2, K3) do { \
      RND4(S, (W)[(i) % 4], K0, K1, K2, K3); \
      if ((i) < 12) { \
          MSG4(W, (i) + 4); \
      } \
  } while (0)
  64. /**
  65. * SHA256_Transform_shani(state, block):
  66. * Compute the SHA256 block compression function, transforming ${state} using
  67. * the data in ${block}. This implementation uses x86 SHANI and SSSE3
  68. * instructions, and should only be used if CPUSUPPORT_X86_SHANI and _SSSE3
  69. * are defined and cpusupport_x86_shani() and _ssse3() return nonzero.
  70. */
  71. void
  72. SHA256_Transform_shani(uint32_t state[static restrict 8],
  73. const uint8_t block[static restrict 64])
  74. {
  75. __m128i S3210, S7654;
  76. __m128i S0123, S4567;
  77. __m128i S0145, S2367;
  78. __m128i W[4];
  79. __m128i S[2];
  80. /* Load state. */
  81. S3210 = _mm_loadu_si128((const __m128i *)&state[0]);
  82. S7654 = _mm_loadu_si128((const __m128i *)&state[4]);
  83. /* Shuffle the 8 32-bit values into the order we need them. */
  84. S0123 = _mm_shuffle_epi32(S3210, 0x1B);
  85. S4567 = _mm_shuffle_epi32(S7654, 0x1B);
  86. S0145 = _mm_unpackhi_epi64(S4567, S0123);
  87. S2367 = _mm_unpacklo_epi64(S4567, S0123);
  88. /* Load input block; this is the start of the message schedule. */
  89. W[0] = be32dec_128(&block[0]);
  90. W[1] = be32dec_128(&block[16]);
  91. W[2] = be32dec_128(&block[32]);
  92. W[3] = be32dec_128(&block[48]);
  93. /* Initialize working variables. */
  94. S[0] = S0145;
  95. S[1] = S2367;
  96. /* Perform 64 rounds, 4 at a time. */
  97. RNDMSG(S, W, 0, 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5);
  98. RNDMSG(S, W, 1, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5);
  99. RNDMSG(S, W, 2, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3);
  100. RNDMSG(S, W, 3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174);
  101. RNDMSG(S, W, 4, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc);
  102. RNDMSG(S, W, 5, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da);
  103. RNDMSG(S, W, 6, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7);
  104. RNDMSG(S, W, 7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967);
  105. RNDMSG(S, W, 8, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13);
  106. RNDMSG(S, W, 9, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85);
  107. RNDMSG(S, W, 10, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3);
  108. RNDMSG(S, W, 11, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070);
  109. RNDMSG(S, W, 12, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5);
  110. RNDMSG(S, W, 13, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3);
  111. RNDMSG(S, W, 14, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208);
  112. RNDMSG(S, W, 15, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2);
  113. /* Mix local working variables into global state. */
  114. S0145 = _mm_add_epi32(S0145, S[0]);
  115. S2367 = _mm_add_epi32(S2367, S[1]);
  116. /* Shuffle state back to the original word order and store. */
  117. S0123 = _mm_unpackhi_epi64(S2367, S0145);
  118. S4567 = _mm_unpacklo_epi64(S2367, S0145);
  119. S3210 = _mm_shuffle_epi32(S0123, 0x1B);
  120. S7654 = _mm_shuffle_epi32(S4567, 0x1B);
  121. _mm_storeu_si128((__m128i *)&state[0], S3210);
  122. _mm_storeu_si128((__m128i *)&state[4], S7654);
  123. }
  124. #endif /* CPUSUPPORT_X86_SHANI && CPUSUPPORT_X86_SSSE3 */