|
| 1 | +/*- |
| 2 | + * Copyright 2021 Tarsnap Backup Inc. |
| 3 | + * All rights reserved. |
| 4 | + * |
| 5 | + * Redistribution and use in source and binary forms, with or without |
| 6 | + * modification, are permitted provided that the following conditions |
| 7 | + * are met: |
| 8 | + * 1. Redistributions of source code must retain the above copyright |
| 9 | + * notice, this list of conditions and the following disclaimer. |
| 10 | + * 2. Redistributions in binary form must reproduce the above copyright |
| 11 | + * notice, this list of conditions and the following disclaimer in the |
| 12 | + * documentation and/or other materials provided with the distribution. |
| 13 | + * |
| 14 | + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
| 15 | + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 16 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 17 | + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
| 18 | + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 19 | + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 20 | + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 21 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 22 | + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 23 | + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 24 | + * SUCH DAMAGE. |
| 25 | + */ |
| 26 | + |
| 27 | +#include "php_hash.h" |
| 28 | +#include "php_hash_sha.h" |
| 29 | + |
| 30 | +#ifdef __SSE2__ |
| 31 | +# include <emmintrin.h> |
| 32 | + |
| 33 | +/* Original implementation from libcperciva follows. |
| 34 | + * |
| 35 | + * Modified to use `PHP_STATIC_RESTRICT` for MSVC compatibility. |
| 36 | + */ |
| 37 | + |
| 38 | +/** |
| 39 | + * mm_bswap_epi32(a): |
| 40 | + * Byte-swap each 32-bit word. |
| 41 | + */ |
| 42 | +static inline __m128i |
| 43 | +mm_bswap_epi32(__m128i a) |
| 44 | +{ |
| 45 | + |
| 46 | + /* Swap bytes in each 16-bit word. */ |
| 47 | + a = _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8)); |
| 48 | + |
| 49 | + /* Swap all 16-bit words. */ |
| 50 | + a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); |
| 51 | + a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1)); |
| 52 | + |
| 53 | + return (a); |
| 54 | +} |
| 55 | + |
/* SHA256 round constants (FIPS 180-4 section 4.2.2): the first 32 bits of
 * the fractional parts of the cube roots of the first 64 prime numbers.
 * One constant is consumed per round, indexed by round number. */
static const uint32_t Krnd[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
| 75 | + |
/* Elementary functions used by SHA256 (FIPS 180-4 section 4.1.2).
 * Every macro argument is parenthesized in the expansion so that compound
 * arguments (e.g. `ROTR(a ^ b, 3)`) associate correctly; the unparenthesized
 * forms silently mis-parse because shifts bind tighter than `^`/`|`. */
#define Ch(x, y, z)	(((x) & ((y) ^ (z))) ^ (z))
#define Maj(x, y, z)	(((x) & ((y) | (z))) | ((y) & (z)))
#define ROTR(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))

/* SHA256 round function.  Deliberately a multi-statement sequence (not
 * do/while-wrapped) so that `d` and `h` updates stay visible at the call
 * site; `h` and `d` must be modifiable lvalues. */
#define RND(a, b, c, d, e, f, g, h, k)			\
	h += S1(e) + Ch(e, f, g) + k;			\
	d += h;						\
	h += S0(a) + Maj(a, b, c)

/* Adjusted round function for rotating state: instead of shuffling the
 * eight working variables after each round, index them modulo 8 with an
 * offset that advances by one per round `i`. */
#define RNDr(S, W, i, ii)			\
	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
	    S[(66 - i) % 8], S[(67 - i) % 8],	\
	    S[(68 - i) % 8], S[(69 - i) % 8],	\
	    S[(70 - i) % 8], S[(71 - i) % 8],	\
	    W[i + ii] + Krnd[i + ii])

/* Message schedule computation: vectorized sigma0 over four words. */
#define SHR32(x, n)	(_mm_srli_epi32((x), (n)))
#define ROTR32(x, n)	(_mm_or_si128(SHR32(x, n), _mm_slli_epi32((x), (32 - (n)))))
#define s0_128(x)	_mm_xor_si128(_mm_xor_si128(			\
	ROTR32(x, 7), ROTR32(x, 18)), SHR32(x, 3))
| 102 | + |
/* Compute sigma1 (ROTR 17 ^ ROTR 19 ^ SHR 10) of the two LOW words of ${a},
 * delivering the results in the two HIGH output lanes (low lanes zero). */
static inline __m128i
s1_128_high(__m128i a)
{
	__m128i dup, r;

	/* Duplicate the two low words: lanes become {A0, A0, A1, A1}. */
	dup = _mm_unpacklo_epi32(a, a);

	/*
	 * 64-bit right shifts of a duplicated 32-bit pair act as 32-bit
	 * rotates in the even lanes; the odd lanes hold junk.
	 */
	r = _mm_xor_si128(_mm_srli_epi64(dup, 17), _mm_srli_epi64(dup, 19));

	/* XOR in the plain 10-bit shift; odd lanes are still junk. */
	r = _mm_xor_si128(r, _mm_srli_epi32(dup, 10));

	/* Gather the valid even lanes, then push them into the top half. */
	r = _mm_slli_si128(_mm_shuffle_epi32(r, _MM_SHUFFLE(2, 0, 2, 0)), 8);

	return (r);
}
| 122 | + |
/* Compute sigma1 (ROTR 17 ^ ROTR 19 ^ SHR 10) of the two HIGH words of ${a},
 * delivering the results in the two LOW output lanes (high lanes zero). */
static inline __m128i
s1_128_low(__m128i a)
{
	__m128i dup, r;

	/* Duplicate the two high words: lanes become {A2, A2, A3, A3}. */
	dup = _mm_unpackhi_epi32(a, a);

	/*
	 * 64-bit right shifts of a duplicated 32-bit pair act as 32-bit
	 * rotates in the even lanes; the odd lanes hold junk.
	 */
	r = _mm_xor_si128(_mm_srli_epi64(dup, 17), _mm_srli_epi64(dup, 19));

	/* XOR in the plain 10-bit shift; odd lanes are still junk. */
	r = _mm_xor_si128(r, _mm_srli_epi32(dup, 10));

	/* Gather the valid even lanes, then push them into the bottom half. */
	r = _mm_srli_si128(_mm_shuffle_epi32(r, _MM_SHUFFLE(2, 0, 2, 0)), 8);

	return (r);
}
| 142 | + |
/**
 * SPAN_ONE_THREE(a, b):
 * Combine the upper three words of ${a} with the lowest word of ${b}.  This
 * could also be thought of as returning bits [159:32] of the 256-bit value
 * consisting of (b[127:0] a[127:0]).  In other words, set:
 *     dst[31:0]   := a[63:32]
 *     dst[63:32]  := a[95:64]
 *     dst[95:64]  := a[127:96]
 *     dst[127:96] := b[31:0]
 * Implemented as _mm_move_ss (copy b's low word into a's low lane, via
 * float casts that only reinterpret bits) followed by a one-lane rotate.
 */
#define SPAN_ONE_THREE(a, b) (_mm_shuffle_epi32(_mm_castps_si128(	\
	_mm_move_ss(_mm_castsi128_ps(a), _mm_castsi128_ps(b))),		\
	_MM_SHUFFLE(0, 3, 2, 1)))
| 156 | + |
| 157 | +/** |
| 158 | + * MSG4(X0, X1, X2, X3): |
| 159 | + * Calculate the next four values of the message schedule. If we define |
| 160 | + * ${W[j]} as the first unknown value in the message schedule, then the input |
| 161 | + * arguments are: |
| 162 | + * X0 = W[j - 16] : W[j - 13] |
| 163 | + * X1 = W[j - 12] : W[j - 9] |
| 164 | + * X2 = W[j - 8] : W[j - 5] |
| 165 | + * X3 = W[j - 4] : W[j - 1] |
| 166 | + * This function therefore calculates: |
| 167 | + * X4 = W[j + 0] : W[j + 3] |
| 168 | + */ |
| 169 | +static inline __m128i |
| 170 | +MSG4(__m128i X0, __m128i X1, __m128i X2, __m128i X3) |
| 171 | +{ |
| 172 | + __m128i X4; |
| 173 | + __m128i Xj_minus_seven, Xj_minus_fifteen; |
| 174 | + |
| 175 | + /* Set up variables which span X values. */ |
| 176 | + Xj_minus_seven = SPAN_ONE_THREE(X2, X3); |
| 177 | + Xj_minus_fifteen = SPAN_ONE_THREE(X0, X1); |
| 178 | + |
| 179 | + /* Begin computing X4. */ |
| 180 | + X4 = _mm_add_epi32(X0, Xj_minus_seven); |
| 181 | + X4 = _mm_add_epi32(X4, s0_128(Xj_minus_fifteen)); |
| 182 | + |
| 183 | + /* First half of s1. */ |
| 184 | + X4 = _mm_add_epi32(X4, s1_128_low(X3)); |
| 185 | + |
| 186 | + /* Second half of s1; this depends on the above value of X4. */ |
| 187 | + X4 = _mm_add_epi32(X4, s1_128_high(X4)); |
| 188 | + |
| 189 | + return (X4); |
| 190 | +} |
| 191 | + |
/**
 * SHA256_Transform_sse2(state, block, W, S):
 * Compute the SHA256 block compression function, transforming ${state} using
 * the data in ${block}.  This implementation uses x86 SSE2 instructions, and
 * should only be compiled/used when SSE2 is available (guarded by __SSE2__).
 * The arrays W (message schedule) and S (working state) are caller-provided
 * scratch space; this function does NOT clear them, so they may be left
 * holding sensitive data after it returns.
 */
void
SHA256_Transform_sse2(uint32_t state[PHP_STATIC_RESTRICT 8],
    const uint8_t block[PHP_STATIC_RESTRICT 64], uint32_t W[PHP_STATIC_RESTRICT 64],
    uint32_t S[PHP_STATIC_RESTRICT 8])
{
	__m128i Y[4];	/* Y[t] caches the 4 most recent schedule words W[.. + 4t]. */
	int i;

	/* 1. Prepare the first part of the message schedule W.
	 * Load the 64-byte block as 16 words, byte-swapped to big-endian
	 * order as SHA-256 requires; unaligned loads/stores since neither
	 * `block` nor `W` is guaranteed to be 16-byte aligned. */
	Y[0] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[0]));
	_mm_storeu_si128((__m128i *)&W[0], Y[0]);
	Y[1] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[16]));
	_mm_storeu_si128((__m128i *)&W[4], Y[1]);
	Y[2] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[32]));
	_mm_storeu_si128((__m128i *)&W[8], Y[2]);
	Y[3] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[48]));
	_mm_storeu_si128((__m128i *)&W[12], Y[3]);

	/* 2. Initialize working variables: copy the 8 x 32-bit state words. */
	memcpy(S, state, 32);

	/* 3. Mix: 64 rounds, 16 per iteration.  RNDr rotates which element
	 * of S plays each of the roles a..h, so no shuffling of working
	 * variables is needed between rounds. */
	for (i = 0; i < 64; i += 16) {
		RNDr(S, W, 0, i);
		RNDr(S, W, 1, i);
		RNDr(S, W, 2, i);
		RNDr(S, W, 3, i);
		RNDr(S, W, 4, i);
		RNDr(S, W, 5, i);
		RNDr(S, W, 6, i);
		RNDr(S, W, 7, i);
		RNDr(S, W, 8, i);
		RNDr(S, W, 9, i);
		RNDr(S, W, 10, i);
		RNDr(S, W, 11, i);
		RNDr(S, W, 12, i);
		RNDr(S, W, 13, i);
		RNDr(S, W, 14, i);
		RNDr(S, W, 15, i);

		/* After the final 16 rounds the schedule needs no extension
		 * (only W[0..63] are ever consumed). */
		if (i == 48)
			break;
		/* Extend the message schedule: compute W[16+i .. 31+i],
		 * four words at a time, storing to W for the RNDr reads. */
		Y[0] = MSG4(Y[0], Y[1], Y[2], Y[3]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 0], Y[0]);
		Y[1] = MSG4(Y[1], Y[2], Y[3], Y[0]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 4], Y[1]);
		Y[2] = MSG4(Y[2], Y[3], Y[0], Y[1]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 8], Y[2]);
		Y[3] = MSG4(Y[3], Y[0], Y[1], Y[2]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 12], Y[3]);
	}

	/* 4. Mix local working variables into global state (mod 2^32 adds). */
	for (i = 0; i < 8; i++)
		state[i] += S[i];
}
| 256 | + |
| 257 | +#endif |