diff --git a/build/php.m4 b/build/php.m4
index 16c5e25fbaab0..42616f6f213ec 100644
--- a/build/php.m4
+++ b/build/php.m4
@@ -2703,3 +2703,83 @@ AC_DEFUN([PHP_PATCH_CONFIG_HEADERS], [
  $SED -e 's/^#undef PACKAGE_[^ ]*/\/\* & \*\//g' < $srcdir/$1 \
    > $srcdir/$1.tmp && mv $srcdir/$1.tmp $srcdir/$1
])

dnl
dnl PHP_CHECK_X86_TARGET
dnl
dnl Check whether the compiler targets x86/x86_64. Sets the cache variable
dnl ac_cv_target_x86 to "yes" or "no".
dnl
dnl The probe is compile-only: the predefined macros are tested by the
dnl preprocessor and a mismatch is turned into a compile error. Nothing is
dnl executed, so the answer is also correct when cross-compiling (a run test
dnl can only take its cross-compiling fallback there).
dnl
AC_DEFUN([PHP_CHECK_X86_TARGET], [
  AC_CACHE_CHECK([for x86 target], [ac_cv_target_x86], [
    AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
#if !defined(__x86_64__) && !defined(__i386__)
#error not an x86 target
#endif
int main(void) {
  return 0;
}
]])], [
      ac_cv_target_x86=yes
    ], [
      ac_cv_target_x86=no
    ])
  ])
])

dnl
dnl PHP_CHECK_WINDOWS_TARGET
dnl
dnl Check whether the compiler targets Windows (_WIN32 predefined). Sets the
dnl cache variable ac_cv_target_windows to "yes" or "no". Compile-only, see
dnl PHP_CHECK_X86_TARGET.
dnl
AC_DEFUN([PHP_CHECK_WINDOWS_TARGET], [
  AC_CACHE_CHECK([for windows target], [ac_cv_target_windows], [
    AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
#if !defined(_WIN32)
#error not a windows target
#endif
int main(void) {
  return 0;
}
]])], [
      ac_cv_target_windows=yes
    ], [
      ac_cv_target_windows=no
    ])
  ])
])

dnl
dnl PHP_CHECK_UNIX_TARGET
dnl
dnl Check whether the compiler targets a unix-ish system (unix, __unix or
dnl __unix__ predefined). Sets the cache variable ac_cv_target_unix to "yes"
dnl or "no". Compile-only, see PHP_CHECK_X86_TARGET.
dnl
AC_DEFUN([PHP_CHECK_UNIX_TARGET], [
  AC_CACHE_CHECK([for unix-ish target], [ac_cv_target_unix], [
    AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
#if !defined(unix) && !defined(__unix) && !defined(__unix__)
#error not a unix-ish target
#endif
int main(void) {
  return 0;
}
]])], [
      ac_cv_target_unix=yes
    ], [
      ac_cv_target_unix=no
    ])
  ])
])
diff --git a/configure.ac b/configure.ac
index 6b33047538fe2..9d51d88e64cbd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -324,6 +324,15 @@ PHP_EBCDIC
dnl Check whether the system byte ordering is bigendian.
PHP_C_BIGENDIAN +dnl Check if we're targeting x86 / x86_64 +PHP_CHECK_X86_TARGET + +dnl Check if we're targeting Windows +PHP_CHECK_WINDOWS_TARGET + +dnl Check whether we're targeting a unix-ish system +PHP_CHECK_UNIX_TARGET + dnl Check whether writing to stdout works. PHP_TEST_WRITE_STDOUT diff --git a/ext/hash/blake3/blake3.c b/ext/hash/blake3/blake3.c new file mode 100644 index 0000000000000..7abf5324ecd05 --- /dev/null +++ b/ext/hash/blake3/blake3.c @@ -0,0 +1,607 @@ +#include +#include +#include + +#include "blake3.h" +#include "blake3_impl.h" + +const char * blake3_version(void) { + return BLAKE3_VERSION_STRING; +} + +INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], + uint64_t chunk_counter) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state *self) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, + const uint8_t *input, size_t input_len) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t 
block_len; + uint8_t flags; +} output_t; + +INLINE output_t make_output(const uint32_t input_cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. +INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, + size_t out_len) { + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, + output_block_counter, self->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, + size_t input_len) { + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len 
-= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; +} + +INLINE output_t chunk_state_output(const blake3_chunk_state *self) { + uint8_t block_flags = + self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, + block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_len(size_t content_len) { + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. 
+INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(0 < input_len); + assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); +#endif + + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. 
+INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, + size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(2 <= num_chaining_values); + assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); +#endif + + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement exendable ouput.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. 
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
//
// Parameters:
//   input, input_len - the bytes of this subtree.
//   key, flags       - passed through unchanged to the chunk/parent hashing.
//   chunk_counter    - index of the first chunk of `input`.
//   out              - receives the resulting chaining values, BLAKE3_OUT_LEN
//                      bytes each.
// Returns the number of chaining values written to `out`.
static size_t blake3_compress_subtree_wide(const uint8_t *input,
                                           size_t input_len,
                                           const uint32_t key[8],
                                           uint64_t chunk_counter,
                                           uint8_t flags, uint8_t *out) {
  // Note that the single chunk case does *not* bump the SIMD degree up to 2
  // when it is 1. If this implementation adds multi-threading in the future,
  // this gives us the option of multi-threading even the 2-chunk case, which
  // can help performance on smaller platforms.
  if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
    // Base case: few enough chunks to hash them all in one batch.
    return compress_chunks_parallel(input, input_len, key, chunk_counter, flags,
                                    out);
  }

  // With more than simd_degree chunks, we need to recurse. Start by dividing
  // the input into left and right subtrees. (Note that this is only optimal
  // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
  // of 3 or something, we'll need a more complicated strategy.)
  size_t left_input_len = left_len(input_len);
  size_t right_input_len = input_len - left_input_len;
  const uint8_t *right_input = &input[left_input_len];
  // The right subtree's chunks are numbered after all chunks on the left.
  uint64_t right_chunk_counter =
      chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);

  // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
  // account for the special case of returning 2 outputs when the SIMD degree
  // is 1.
  uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
  size_t degree = blake3_simd_degree();
  if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
    // The special case: We always use a degree of at least two, to make
    // sure there are two outputs. Except, as noted above, at the chunk
    // level, where we allow degree=1. (Note that the 1-chunk-input case is
    // a different codepath.)
    degree = 2;
  }
  // The right child's CVs are written `degree` slots after the start of the
  // array, leaving the first `degree` slots for the left child.
  uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];

  // Recurse! If this implementation adds multi-threading support in the
  // future, this is where it will go.
  size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
                                               chunk_counter, flags, cv_array);
  size_t right_n = blake3_compress_subtree_wide(
      right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);

  // The special case again. If simd_degree=1, then we'll have left_n=1 and
  // right_n=1. Rather than compressing them into a single output, return
  // them directly, to make sure we always have at least two outputs.
  if (left_n == 1) {
    // NOTE(review): copying two adjacent CVs relies on right_cvs sitting
    // directly after the single left CV, i.e. degree == 1 on this path.
    memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
    return 2;
  }

  // Otherwise, do one layer of parent node compression.
  size_t num_chaining_values = left_n + right_n;
  return compress_parents_parallel(cv_array, num_chaining_values, key, flags,
                                   out);
}

// Hash a subtree with compress_subtree_wide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
+INLINE void compress_subtree_to_parent_node( + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { +#if defined(BLAKE3_TESTING) + assert(input_len > BLAKE3_CHUNK_LEN); +#endif + + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void blake3_hasher_init_derive_key(blake3_hasher *self, const 
char *context) { + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. 
//    In this case, the root node of the 64 KiB subtree would be the root node
//    of the whole tree, and it would need to be ROOT finalized. We can't
//    compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
//    similarly going to be the root of the whole tree. For example, maybe
//    we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
//    node at the root of the 256 KiB subtree until we know how to finalize it.
//
// The second problem is solved with "lazy merging". That is, when we're about
// to add a CV to the stack, we don't merge it with anything first, as the
// reference impl does. Instead we do merges using the *previous* CV that was
// added, which is sitting on top of the stack, and we put the new CV
// (unmerged) on top of the stack afterwards. This guarantees that we never
// merge the root node until finalize().
//
// Solving the first problem requires an additional tool,
// compress_subtree_to_parent_node(). That function always returns the top
// *two* chaining values of the subtree it's compressing. We then do lazy
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when we're
// hashing an input all-at-once.)
//
// `chunk_counter` is the number of chunks that precede the subtree `new_cv`
// stands for; it drives the merge of the entries already on the stack.
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
                           uint64_t chunk_counter) {
  hasher_merge_cv_stack(self, chunk_counter);
  memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv,
         BLAKE3_OUT_LEN);
  self->cv_stack_len += 1;
}

// Absorb `input_len` bytes of `input` into the hasher. May be called any
// number of times; a zero-length call is a no-op.
void blake3_hasher_update(blake3_hasher *self, const void *input,
                          size_t input_len) {
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
  //   blake3_hasher_update(&hasher, v.data(), v.size());
  if (input_len == 0) {
    return;
  }

  const uint8_t *input_bytes = (const uint8_t *)input;

  // If we have some partial chunk bytes in the internal chunk_state, we need
  // to finish that chunk first.
  if (chunk_state_len(&self->chunk) > 0) {
    size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
    if (take > input_len) {
      take = input_len;
    }
    chunk_state_update(&self->chunk, input_bytes, take);
    input_bytes += take;
    input_len -= take;
    // If we've filled the current chunk and there's more coming, finalize this
    // chunk and proceed. In this case we know it's not the root.
    if (input_len > 0) {
      output_t output = chunk_state_output(&self->chunk);
      uint8_t chunk_cv[32];
      output_chaining_value(&output, chunk_cv);
      hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
      chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
    } else {
      return;
    }
  }

  // Now the chunk_state is clear, and we have more input. If there's more than
  // a single chunk (so, definitely not the root chunk), hash the largest whole
  // subtree we can, with the full benefits of SIMD (and maybe in the future,
  // multi-threading) parallelism. Two restrictions:
  // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
  //   the right edge can be incomplete, and we don't know where the right edge
  //   is going to be until we get to finalize().
  // - The subtree must evenly divide the total number of chunks up until this
  //   point (if total is not 0). If the current incomplete subtree is only
  //   waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
  //   to complete the current subtree first.
  // Because we might need to break up the input to form powers of 2, or to
  // evenly divide what we already have, this part runs in a loop.
  while (input_len > BLAKE3_CHUNK_LEN) {
    size_t subtree_len = round_down_to_power_of_2(input_len);
    uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
    // Shrink the subtree_len until it evenly divides the count so far. We know
    // that subtree_len itself is a power of 2, so we can use a bitmasking
    // trick instead of an actual remainder operation. (Note that if the caller
    // consistently passes power-of-2 inputs of the same size, as is hopefully
    // typical, this loop condition will always fail, and subtree_len will
    // always be the full length of the input.)
    //
    // An aside: We don't have to shrink subtree_len quite this much. For
    // example, if count_so_far is 1, we could pass 2 chunks to
    // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
    // get the right answer in the end, and we might get to use 2-way SIMD
    // parallelism. The problem with this optimization, is that it gets us
    // stuck always hashing 2 chunks. The total number of chunks will remain
    // odd, and we'll never graduate to higher degrees of parallelism. See
    // https://github.com/BLAKE3-team/BLAKE3/issues/69.
    while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
      subtree_len /= 2;
    }
    // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
    // chunk by itself. Otherwise, compress the subtree into a pair of CVs.
    uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
    if (subtree_len <= BLAKE3_CHUNK_LEN) {
      // A temporary chunk state is used so that self->chunk stays empty.
      blake3_chunk_state chunk_state;
      chunk_state_init(&chunk_state, self->key, self->chunk.flags);
      chunk_state.chunk_counter = self->chunk.chunk_counter;
      chunk_state_update(&chunk_state, input_bytes, subtree_len);
      output_t output = chunk_state_output(&chunk_state);
      uint8_t cv[BLAKE3_OUT_LEN];
      output_chaining_value(&output, cv);
      hasher_push_cv(self, cv, chunk_state.chunk_counter);
    } else {
      // This is the high-performance happy path, though getting here depends
      // on the caller giving us a long enough input.
      uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
      compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
                                      self->chunk.chunk_counter,
                                      self->chunk.flags, cv_pair);
      // Push the two halves separately; the second half starts
      // subtree_chunks / 2 chunks after the first.
      hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
      hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
                     self->chunk.chunk_counter + (subtree_chunks / 2));
    }
    self->chunk.chunk_counter += subtree_chunks;
    input_bytes += subtree_len;
    input_len -= subtree_len;
  }

  // If there's any remaining input less than a full chunk, add it to the chunk
  // state. In that case, also do a final merge loop to make sure the subtree
  // stack doesn't contain any unmerged pairs. The remaining input means we
  // know these merges are non-root. This merge loop isn't strictly necessary
  // here, because hasher_push_cv already does its own merge loop, but it
  // simplifies blake3_hasher_finalize below.
  if (input_len > 0) {
    chunk_state_update(&self->chunk, input_bytes, input_len);
    hasher_merge_cv_stack(self, self->chunk.chunk_counter);
  }
}

// Write the first `out_len` bytes of the hash output to `out`. Equivalent to
// blake3_hasher_finalize_seek() with seek = 0. The hasher is not modified.
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
                            size_t out_len) {
  blake3_hasher_finalize_seek(self, 0, out, out_len);
}

// Write `out_len` bytes of the hash output, starting at byte offset `seek` of
// the output stream, to `out`. The hasher is taken by const pointer and is
// not modified.
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
                                 uint8_t *out, size_t out_len) {
  // Explicitly checking for zero avoids causing UB by passing a null pointer
  // to memcpy. This comes up in practice with things like:
  //   std::vector<uint8_t> v;
  //   blake3_hasher_finalize(&hasher, v.data(), v.size());
  if (out_len == 0) {
    return;
  }

  // If the subtree stack is empty, then the current chunk is the root.
  if (self->cv_stack_len == 0) {
    output_t output = chunk_state_output(&self->chunk);
    output_root_bytes(&output, seek, out, out_len);
    return;
  }
  // If there are any bytes in the chunk state, finalize that chunk and do a
  // roll-up merge between that chunk hash and every subtree in the stack. In
  // this case, the extra merge loop at the end of blake3_hasher_update
  // guarantees that none of the subtrees in the stack need to be merged with
  // each other first. Otherwise, if there are no bytes in the chunk state,
  // then the top of the stack is a chunk hash, and we start the merge from
  // that.
  output_t output;
  size_t cvs_remaining;
  if (chunk_state_len(&self->chunk) > 0) {
    cvs_remaining = self->cv_stack_len;
    output = chunk_state_output(&self->chunk);
  } else {
    // There are always at least 2 CVs in the stack in this case.
+ cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, + self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} diff --git a/ext/hash/blake3/blake3.h b/ext/hash/blake3/blake3.h new file mode 100644 index 0000000000000..57ebd5adc2c35 --- /dev/null +++ b/ext/hash/blake3/blake3.h @@ -0,0 +1,60 @@ +#ifndef BLAKE3_H +#define BLAKE3_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "0.3.7" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 +#define BLAKE3_MAX_SIMD_DEGREE 16 + +// This struct is a private implementation detail. It has to be here because +// it's part of blake3_hasher below. +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. 
+ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +const char * blake3_version(void); +void blake3_hasher_init(blake3_hasher *self); +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len); +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/ext/hash/blake3/blake3_dispatch.c b/ext/hash/blake3/blake3_dispatch.c new file mode 100644 index 0000000000000..f7e6a787612e8 --- /dev/null +++ b/ext/hash/blake3/blake3_dispatch.c @@ -0,0 +1,277 @@ +#include +#include +#include + +#include "blake3_impl.h" + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include +#elif defined(__GNUC__) +#include +#else +#error "Unimplemented!" 
+#endif
+#endif
+/* Evaluates x and discards the result; used to mark a variable as intentionally unused. */
+#define MAYBE_UNUSED(x) (void)((x))
+/* The helpers below wrap the XGETBV and CPUID instructions; they are compiled on x86 only. */
+#if defined(IS_X86)
+static uint64_t xgetbv(void) { /* XGETBV with ECX = 0; result returned as EDX:EAX */
+#if defined(_MSC_VER)
+  return _xgetbv(0);
+#else
+  uint32_t eax = 0, edx = 0;
+  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
+  return ((uint64_t)edx << 32) | eax;
+#endif
+}
+/* Runs CPUID for leaf `id`; out[0..3] receive EAX, EBX, ECX and EDX, in that order. */
+static void cpuid(uint32_t out[4], uint32_t id) {
+#if defined(_MSC_VER)
+  __cpuid((int *)out, id);
+#elif defined(__i386__) || defined(_M_IX86)
+  __asm__ __volatile__("movl %%ebx, %1\n"  /* park the caller's EBX in operand 1 */
+                       "cpuid\n"
+                       "xchgl %1, %%ebx\n" /* operand 1 <- CPUID's EBX, EBX restored */
+                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                       : "a"(id)); /* presumably because EBX is reserved (PIC) on i386 - confirm */
+#else
+  __asm__ __volatile__("cpuid\n"
+                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+                       : "a"(id));
+#endif
+}
+/* Same as cpuid(), but additionally passes the sub-leaf `sid` in ECX. */
+static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
+#if defined(_MSC_VER)
+  __cpuidex((int *)out, id, sid);
+#elif defined(__i386__) || defined(_M_IX86)
+  __asm__ __volatile__("movl %%ebx, %1\n"  /* same EBX save/restore dance as in cpuid() */
+                       "cpuid\n"
+                       "xchgl %1, %%ebx\n"
+                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                       : "a"(id), "c"(sid));
+#else
+  __asm__ __volatile__("cpuid\n"
+                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
+                       : "a"(id), "c"(sid));
+#endif
+}
+
+#endif /* IS_X86 */
+/* Bit flags describing the x86 SIMD extensions found by get_cpu_features(). */
+enum cpu_feature {
+  SSE2 = 1 << 0,
+  SSSE3 = 1 << 1,
+  SSE41 = 1 << 2,
+  AVX = 1 << 3,
+  AVX2 = 1 << 4,
+  AVX512F = 1 << 5,
+  AVX512VL = 1 << 6,
+  /* ... */
+  UNDEFINED = 1 << 30 /* sentinel: nothing has been probed yet */
+};
+/* Cache for get_cpu_features(); it stays UNDEFINED until the x86 probe has run once. */
+#if !defined(BLAKE3_TESTING)
+static /* Allow the variable to be controlled manually for testing */
+#endif
+    enum cpu_feature g_cpu_features = UNDEFINED;
+/* Returns the detected cpu_feature bits: x86 probes CPUID once and caches; other targets return 0. */
+#if !defined(BLAKE3_TESTING)
+static
+#endif
+    enum cpu_feature
+    get_cpu_features(void) {
+
+  if (g_cpu_features != UNDEFINED) {
+    return g_cpu_features;
+  } else {
+#if defined(IS_X86)
+    uint32_t regs[4] = {0};
+    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
+    (void)edx; /* edx is only read in the non-x86-64 branch below */
+    enum cpu_feature features = 0;
+    cpuid(regs, 0); /* leaf 0: EAX = highest supported standard leaf */
+    const int max_id = *eax;
+    cpuid(regs, 1); /* leaf 1: feature bits in ECX / EDX */
+#if defined(__amd64__) || defined(_M_X64)
+    features |= SSE2; /* taken as always present on x86-64 */
+#else
+    if (*edx & (1UL << 26))
+      features |= SSE2;
+#endif
+    if (*ecx & (1UL << 9)) /* SSSE3 is ECX bit 9; bit 0 (tested previously) is SSE3 */
+      features |= SSSE3;
+    if (*ecx & (1UL << 19))
+      features |= SSE41;
+
+    if (*ecx & (1UL << 27)) { // OSXSAVE
+      const uint64_t mask = xgetbv(); /* XCR0: register states enabled by the OS */
+      if ((mask & 6) == 6) { // SSE and AVX states
+        if (*ecx & (1UL << 28))
+          features |= AVX;
+        if (max_id >= 7) {
+          cpuidex(regs, 7, 0); /* leaf 7, sub-leaf 0: extended feature bits in EBX */
+          if (*ebx & (1UL << 5))
+            features |= AVX2;
+          if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
+            if (*ebx & (1UL << 31))
+              features |= AVX512VL;
+            if (*ebx & (1UL << 16))
+              features |= AVX512F;
+          }
+        }
+      }
+    }
+    g_cpu_features = features; /* NOTE(review): plain, unsynchronized store to the cache */
+    return features;
+#else
+    /* How to detect NEON? */
+    return 0;
+#endif
+  }
+}
+/* Compresses one block into cv in place: tries AVX-512, then SSE4.1, then SSE2, else portable C. */
+void blake3_compress_in_place(uint32_t cv[8],
+                              const uint8_t block[BLAKE3_BLOCK_LEN],
+                              uint8_t block_len, uint64_t counter,
+                              uint8_t flags) {
+#if defined(IS_X86)
+  const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features); /* every branch below may be compiled out by BLAKE3_NO_* */
+#if !defined(BLAKE3_NO_AVX512)
+  if (features & AVX512VL) {
+    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+  if (features & SSE41) {
+    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+  if (features & SSE2) {
+    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
+    return;
+  }
+#endif
+#endif
+  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
+}
+/* Like blake3_compress_in_place(), but writes the 64-byte result to `out` and leaves cv untouched. */
+void blake3_compress_xof(const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags,
+                         uint8_t out[64]) {
+#if defined(IS_X86)
+  const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+  if (features & AVX512VL) {
+    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+  if (features & SSE41) {
+    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+  if (features & SSE2) {
+    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
+    return;
+  }
+#endif
+#endif
+  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
+}
+/* Hashes several inputs via the first available backend: AVX-512, AVX2, SSE4.1, SSE2, NEON, portable. */
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+                      size_t blocks, const uint32_t key[8], uint64_t counter,
+                      bool increment_counter, uint8_t flags,
+                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+#if defined(IS_X86)
+  const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
+#if !defined(BLAKE3_NO_AVX512)
+  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { /* needs both F and VL */
+    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+                            increment_counter, flags, flags_start, flags_end,
+                            out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+  if (features & AVX2) {
+    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
+                          increment_counter, flags, flags_start, flags_end,
+                          out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+  if (features & SSE41) {
+    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+                           increment_counter, flags, flags_start, flags_end,
+                           out);
+    return;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+  if (features & SSE2) {
+    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
+                          increment_counter, flags, flags_start, flags_end,
+                          out);
+    return;
+  }
+#endif
+#endif
+  /* NEON is chosen at compile time: with BLAKE3_USE_NEON the portable call below is never reached. */
+#if defined(BLAKE3_USE_NEON)
+  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
+                        increment_counter, flags, flags_start, flags_end, out);
+  return;
+#endif
+
+  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+                            increment_counter, flags, flags_start, flags_end,
+                            out);
+}
+
+// The dynamically detected SIMD degree of the current platform.
+size_t blake3_simd_degree(void) {
+#if defined(IS_X86)
+  const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features); /* every branch below may be compiled out by BLAKE3_NO_* */
+#if !defined(BLAKE3_NO_AVX512)
+  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { /* same test as blake3_hash_many */
+    return 16;
+  }
+#endif
+#if !defined(BLAKE3_NO_AVX2)
+  if (features & AVX2) {
+    return 8;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE41)
+  if (features & SSE41) {
+    return 4;
+  }
+#endif
+#if !defined(BLAKE3_NO_SSE2)
+  if (features & SSE2) {
+    return 4;
+  }
+#endif
+#endif
+#if defined(BLAKE3_USE_NEON)
+  return 4;
+#endif
+  return 1; /* no SIMD backend selected */
+}
+
diff --git a/ext/hash/blake3/blake3_impl.h b/ext/hash/blake3/blake3_impl.h
new file mode 100644
index 0000000000000..86ab6aa253a38
--- /dev/null
+++ b/ext/hash/blake3/blake3_impl.h
@@ -0,0 +1,269 @@
+#ifndef BLAKE3_IMPL_H
+#define BLAKE3_IMPL_H
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "blake3.h"
+
+// internal flags
+enum blake3_flags {
+  CHUNK_START         = 1 << 0,
+  CHUNK_END           = 1 << 1,
+  PARENT              = 1 << 2,
+  ROOT                = 1 << 3,
+  KEYED_HASH          = 1 << 4,
+  DERIVE_KEY_CONTEXT  = 1 << 5,
+  DERIVE_KEY_MATERIAL = 1 << 6,
+};
+
+// This C implementation tries to support recent versions of GCC, Clang, and
+// MSVC.
+#if defined(_MSC_VER)
+#define INLINE static __forceinline
+#else
+#define INLINE static inline __attribute__((always_inline))
+#endif
+// IS_X86 is defined for both x86 widths; IS_X86_64 / IS_X86_32 tell them apart.
+#if defined(__x86_64__) || defined(_M_X64)
+#define IS_X86
+#define IS_X86_64
+#endif
+
+#if defined(__i386__) || defined(_M_IX86)
+#define IS_X86
+#define IS_X86_32
+#endif
+
+#if defined(IS_X86)
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+#include <immintrin.h>
+#endif
+// Upper bound on blake3_simd_degree() for this build; usable as a static array size.
+#if defined(IS_X86)
+#define MAX_SIMD_DEGREE 16
+#elif defined(BLAKE3_USE_NEON)
+#define MAX_SIMD_DEGREE 4
+#else
+#define MAX_SIMD_DEGREE 1
+#endif
+
+// There are some places where we want a static size that's equal to the
+// MAX_SIMD_DEGREE, but also at least 2.
+#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
+// Constant words named IV; blake3_portable.c loads IV[0..3] into state[8..11] of each compression.
+static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
+                               0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
+                               0x1F83D9ABUL, 0x5BE0CD19UL};
+// Row r lists the order in which round r of the portable code consumes the 16 message words.
+static const uint8_t MSG_SCHEDULE[7][16] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
+    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
+    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
+    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
+    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
+    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
+};
+
+/* Find index of the highest set bit */
+/* x is assumed to be nonzero. */
+static unsigned int highest_one(uint64_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+  return 63 ^ __builtin_clzll(x); /* equals 63 - clz, since clz is in 0..63 here */
+#elif defined(_MSC_VER) && defined(IS_X86_64)
+  unsigned long index;
+  _BitScanReverse64(&index, x);
+  return index;
+#elif defined(_MSC_VER) && defined(IS_X86_32)
+  if(x >> 32) { /* no 64-bit scan on 32-bit MSVC: scan the upper half first */
+    unsigned long index;
+    _BitScanReverse(&index, (unsigned long)(x >> 32));
+    return 32 + index;
+  } else {
+    unsigned long index;
+    _BitScanReverse(&index, (unsigned long)x); /* upper half is zero, so nothing is lost */
+    return index;
+  }
+#else
+  unsigned int c = 0; /* binary search: halve the window, accumulating the shift */
+  if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
+  if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
+  if(x & 0x000000000000ff00ULL) { x >>=  8; c +=  8; }
+  if(x & 0x00000000000000f0ULL) { x >>=  4; c +=  4; }
+  if(x & 0x000000000000000cULL) { x >>=  2; c +=  2; }
+  if(x & 0x0000000000000002ULL) {           c +=  1; }
+  return c;
+#endif
+}
+
+// Count the number of 1 bits.
+INLINE unsigned int popcnt(uint64_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+  return __builtin_popcountll(x);
+#else
+  unsigned int count = 0;
+  while (x != 0) {
+    count += 1;
+    x &= x - 1; /* clears the lowest set bit, so the loop runs once per 1 bit */
+  }
+  return count;
+#endif
+}
+
+// Largest power of two less than or equal to x. As a special case, returns 1
+// when x is 0.
+INLINE uint64_t round_down_to_power_of_2(uint64_t x) { + return 1ULL << highest_one(x | 1); +} + +INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } + +INLINE uint32_t counter_high(uint64_t counter) { + return (uint32_t)(counter >> 32); +} + +INLINE uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +INLINE void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]); + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, 
uint8_t flags_end, uint8_t *out); + +size_t blake3_simd_degree(void); + + +// Declarations for implementation-specific functions. +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); + +#if defined(IS_X86) +#if !defined(BLAKE3_NO_SSE2) +void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_SSE41) +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX2) +void blake3_hash_many_avx2(const uint8_t *const *inputs, 
size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX512) +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#endif + +#if defined(BLAKE3_USE_NEON) +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif + + +#endif /* BLAKE3_IMPL_H */ diff --git a/ext/hash/blake3/blake3_portable.c b/ext/hash/blake3/blake3_portable.c new file mode 100644 index 0000000000000..062dd1b47fb64 --- /dev/null +++ b/ext/hash/blake3/blake3_portable.c @@ -0,0 +1,160 @@ +#include "blake3_impl.h" +#include + +INLINE uint32_t rotr32(uint32_t w, uint32_t c) { + return (w >> c) | (w << (32 - c)); +} + +INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) { + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { + // Select the message schedule based on the 
round. + const uint8_t *schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the rows. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = IV[0]; + state[9] = IV[1]; + state[10] = IV[2]; + state[11] = IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + 
round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +INLINE void hash_one_portable(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= 
flags_end; + } + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs > 0) { + hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/ext/hash/blake3/helper_script_blake3_version_fetcher.php b/ext/hash/blake3/helper_script_blake3_version_fetcher.php new file mode 100644 index 0000000000000..9b149bcc5062d --- /dev/null +++ b/ext/hash/blake3/helper_script_blake3_version_fetcher.php @@ -0,0 +1,61 @@ + $url, + CURLOPT_USERAGENT => 'php-src/ext/hash/blake3/helper_script bot', + CURLOPT_ENCODING => '', + CURLOPT_RETURNTRANSFER => 1 + )); + $ret = curl_exec($ch); + if (curl_errno($ch) !== CURLE_OK || curl_getinfo($ch, CURLINFO_HTTP_CODE) !== 200) { + throw new \LogicException(); + } + curl_close($ch); + return $ret; +} + +$files = json_decode(fetch("https://api.github.com/repos/BLAKE3-team/BLAKE3/contents/c?ref=" . 
urlencode(BLAKE3_TARGET_VERSION)), true); +foreach ($files as $file) { + // $file looks like: + array( + 'name' => 'blake3_avx512.c', + 'path' => 'c/blake3_avx512.c', + 'sha' => '77a5c385cb897f712d55dd884d8ae54f14ed2570', + 'size' => 47960, + 'url' => 'https://api.github.com/repos/BLAKE3-team/BLAKE3/contents/c/blake3_avx512.c?ref=0.3.7', + 'html_url' => 'https://github.com/BLAKE3-team/BLAKE3/blob/0.3.7/c/blake3_avx512.c', + 'git_url' => 'https://api.github.com/repos/BLAKE3-team/BLAKE3/git/blobs/77a5c385cb897f712d55dd884d8ae54f14ed2570', + 'download_url' => 'https://raw.githubusercontent.com/BLAKE3-team/BLAKE3/0.3.7/c/blake3_avx512.c', + 'type' => 'file', + '_links' => array( + 'self' => 'https://api.github.com/repos/BLAKE3-team/BLAKE3/contents/c/blake3_avx512.c?ref=0.3.7', + 'git' => 'https://api.github.com/repos/BLAKE3-team/BLAKE3/git/blobs/77a5c385cb897f712d55dd884d8ae54f14ed2570', + 'html' => 'https://github.com/BLAKE3-team/BLAKE3/blob/0.3.7/c/blake3_avx512.c' + ) + ); + $name = $file["name"]; + if ($file["type"] !== "file" || ! preg_match('/^blake3.*\\.(?:c|h|S|asm)$/', $name)) { + // the dir contains several things we're not interested in. 
+ continue; + } + var_dump($file["name"]); + $content = fetch($file['download_url']); + file_put_contents($name, $content, LOCK_EX); +} diff --git a/ext/hash/config.m4 b/ext/hash/config.m4 index d84e89cc6b333..43cab675d56ba 100644 --- a/ext/hash/config.m4 +++ b/ext/hash/config.m4 @@ -11,7 +11,28 @@ if test "$PHP_MHASH" != "no"; then AC_DEFINE(PHP_MHASH_BC, 1, [ ]) fi + + +PHP_ARG_WITH([blake3-upstream-c-source-dir], + [for upstream blake3 implementation, github.com/blake3-team/BLAKE3/tree/0.3.7/c ], + [AS_HELP_STRING([[--with-blake3-upstream-c-source-dir=DIR]], + [Include upstream blake3 source code, github.com/blake3-team/BLAKE3/tree/0.3.7/c])], + "", "") + +if test "$PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR" = ""; then + EXT_HASH_BLAKE3_SOURCES="hash_blake3.c blake3/blake3.c blake3/blake3_dispatch.c blake3/blake3_portable.c" + EXT_HASH_BLAKE3_CFLAGS="-I@ext_srcdir@/blake3" + PHP_ADD_BUILD_DIR(ext/hash/blake3, 1) +else + AC_DEFINE(PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR, $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR, [ ]) + EXT_HASH_BLAKE3_SOURCES="hash_blake3.c $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3.c $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_dispatch.c $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_portable.c" + PHP_ADD_BUILD_DIR($PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR, 1) + EXT_HASH_BLAKE3_CFLAGS="-I$PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR" +fi + + if test $ac_cv_c_bigendian_php = yes; then + dnl todo: check if $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_neon.c is applicable ? EXT_HASH_SHA3_SOURCES="hash_sha3.c" AC_DEFINE(HAVE_SLOW_HASH3, 1, [Define is hash3 algo is available]) AC_MSG_WARN("Use SHA3 slow implementation on bigendian") @@ -20,26 +41,53 @@ else AC_MSG_CHECKING([if we're at 64-bit platform]) AS_IF([test "$ac_cv_sizeof_long" -eq 4],[ AC_MSG_RESULT([no]) + if test $ac_cv_target_x86 = yes; then + dnl i think there are some 32bit x86 cpus with SSE2 but.. 
cba + EXT_HASH_BLAKE3_CFLAGS="$EXT_HASH_BLAKE3_CFLAGS -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512" + fi SHA3_DIR="sha3/generic32lc" SHA3_OPT_SRC="$SHA3_DIR/KeccakP-1600-inplace32BI.c" ],[ AC_MSG_RESULT([yes]) SHA3_DIR="sha3/generic64lc" SHA3_OPT_SRC="$SHA3_DIR/KeccakP-1600-opt64.c" + if test $ac_cv_target_x86 = yes; then + if test "$PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR" = ""; then + dnl AC_MSG_WARN("Use BLAKE3 slow/portable implementation") + EXT_HASH_BLAKE3_CFLAGS="$EXT_HASH_BLAKE3_CFLAGS -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512" + else + if test $ac_cv_target_windows = yes; then + dnl x86_64 windows gnuc + EXT_HASH_BLAKE3_SOURCES="$EXT_HASH_BLAKE3_SOURCES $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_avx512_x86-64_windows_gnu.S $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_avx2_x86-64_windows_gnu.S $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_sse41_x86-64_windows_gnu.S $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_sse2_x86-64_windows_gnu.S" + else + if test $ac_cv_target_unix = yes; then + dnl x86_64 unix gnuc + EXT_HASH_BLAKE3_SOURCES="$EXT_HASH_BLAKE3_SOURCES $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_avx512_x86-64_unix.S $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_avx2_x86-64_unix.S $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_sse41_x86-64_unix.S $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_sse2_x86-64_unix.S" + else + dnl blake still has C intrinsics versions (eg $PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR/blake3_avx512.c ) which COULD be used in this situation, + dnl we're targeting x86_64 but neither windows nor unix-ish.. + dnl i have nothing to test this on, so i'm just disabling it for now, feel free to fix it. 
+ EXT_HASH_BLAKE3_CFLAGS="$EXT_HASH_BLAKE3_CFLAGS -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512" + fi + fi + fi + fi ]) EXT_HASH_SHA3_SOURCES="$SHA3_OPT_SRC $SHA3_DIR/KeccakHash.c $SHA3_DIR/KeccakSponge.c hash_sha3.c" PHP_HASH_CFLAGS="-I@ext_srcdir@/$SHA3_DIR -DKeccakP200_excluded -DKeccakP400_excluded -DKeccakP800_excluded -DZEND_ENABLE_STATIC_TSRMLS_CACHE=1" PHP_ADD_BUILD_DIR(ext/hash/$SHA3_DIR, 1) fi +PHP_HASH_CFLAGS="$PHP_HASH_CFLAGS $EXT_HASH_BLAKE3_CFLAGS" EXT_HASH_SOURCES="hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c \ hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c hash_adler32.c \ - hash_crc32.c hash_fnv.c hash_joaat.c $EXT_HASH_SHA3_SOURCES" + hash_crc32.c hash_fnv.c hash_joaat.c $EXT_HASH_SHA3_SOURCES \ + $EXT_HASH_BLAKE3_SOURCES" EXT_HASH_HEADERS="php_hash.h php_hash_md.h php_hash_sha.h php_hash_ripemd.h \ php_hash_haval.h php_hash_tiger.h php_hash_gost.h php_hash_snefru.h \ php_hash_whirlpool.h php_hash_adler32.h php_hash_crc32.h \ - php_hash_fnv.h php_hash_joaat.h php_hash_sha3.h" + php_hash_fnv.h php_hash_joaat.h php_hash_sha3.h php_hash_blake3.h" PHP_NEW_EXTENSION(hash, $EXT_HASH_SOURCES, 0,,$PHP_HASH_CFLAGS) PHP_INSTALL_HEADERS(ext/hash, $EXT_HASH_HEADERS) diff --git a/ext/hash/config.w32 b/ext/hash/config.w32 index 5d6be6b92249a..bf7fb71832acf 100644 --- a/ext/hash/config.w32 +++ b/ext/hash/config.w32 @@ -11,7 +11,9 @@ PHP_HASH = 'yes'; EXTENSION('hash', 'hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c ' + 'hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c ' + 'hash_adler32.c hash_crc32.c hash_joaat.c hash_fnv.c ' + - 'hash_sha3.c', false); + 'hash_sha3.c hash_blake3.c', false); + +ADD_SOURCES('ext/hash/blake3', 'blake3.c blake3_dispatch.c blake3_portable.c'); var hash_sha3_dir = 'ext/hash/sha3/generic' + (X64 ? 
'64' : '32') + 'lc'; @@ -25,10 +27,18 @@ if (!CHECK_HEADER_ADD_INCLUDE('KeccakHash.h', 'CFLAGS_HASH', hash_sha3_dir)) { // Should NEVER happen ERROR('Unable to locate SHA3 headers'); } +var hash_blake3_dir = 'ext/hash/blake3'; +if (!CHECK_HEADER_ADD_INCLUDE('blake3.h', 'CFLAGS_HASH', hash_blake3_dir)) { + // Should NEVER happen + ERROR('Unable to locate BLAKE3 headers'); +} + -ADD_FLAG('CFLAGS_HASH', '/DKeccakP200_excluded /DKeccakP400_excluded /DKeccakP800_excluded /DZEND_ENABLE_STATIC_TSRMLS_CACHE=1'); +ADD_FLAG('CFLAGS_HASH', '/DKeccakP200_excluded /DKeccakP400_excluded /DKeccakP800_excluded /DZEND_ENABLE_STATIC_TSRMLS_CACHE=1 ' + + '/DBLAKE3_NO_SSE2 /DBLAKE3_NO_SSE41 /DBLAKE3_NO_AVX2 /DBLAKE3_NO_AVX512'); PHP_INSTALL_HEADERS('ext/hash/', 'php_hash.h php_hash_md.h php_hash_sha.h ' + 'php_hash_ripemd.h php_hash_haval.h php_hash_tiger.h ' + 'php_hash_gost.h php_hash_snefru.h php_hash_whirlpool.h ' + - 'php_hash_adler32.h php_hash_crc32.h php_hash_sha3.h'); + 'php_hash_adler32.h php_hash_crc32.h php_hash_sha3.h ' + + 'php_hash_blake3.h'); diff --git a/ext/hash/hash.c b/ext/hash/hash.c index 4e3820f35e6b4..f920f8a3784b8 100644 --- a/ext/hash/hash.c +++ b/ext/hash/hash.c @@ -19,6 +19,8 @@ #include "config.h" #endif +#include "blake3.h" + #include #include "php_hash.h" #include "ext/standard/info.h" @@ -1586,6 +1588,7 @@ PHP_MINIT_FUNCTION(hash) php_hash_register_algo("fnv164", &php_hash_fnv164_ops); php_hash_register_algo("fnv1a64", &php_hash_fnv1a64_ops); php_hash_register_algo("joaat", &php_hash_joaat_ops); + php_hash_register_algo("blake3", &php_hash_blake3_ops); PHP_HASH_HAVAL_REGISTER(3,128); PHP_HASH_HAVAL_REGISTER(3,160); @@ -1650,6 +1653,11 @@ PHP_MINFO_FUNCTION(hash) php_info_print_table_start(); php_info_print_table_row(2, "hash support", "enabled"); php_info_print_table_row(2, "Hashing Engines", buffer); +#ifdef PHP_BLAKE3_UPSTREAM_C_SOURCE_DIR + php_info_print_table_row(2, "blake3 implementation", "upstream " BLAKE3_VERSION_STRING); +#else + 
php_info_print_table_row(2, "blake3 implementation", "portable " BLAKE3_VERSION_STRING); +#endif php_info_print_table_end(); #ifdef PHP_MHASH_BC diff --git a/ext/hash/hash_blake3.c b/ext/hash/hash_blake3.c new file mode 100644 index 0000000000000..360ddaac47834 --- /dev/null +++ b/ext/hash/hash_blake3.c @@ -0,0 +1,27 @@ +#include "php_hash.h" +#include "php_hash_blake3.h" +#include "blake3.h" + + +#include // memcpy + +PHP_HASH_API void PHP_BLAKE3Init(PHP_BLAKE3_CTX *context) +{ + blake3_hasher_init(context); +} + +PHP_HASH_API void PHP_BLAKE3Update(PHP_BLAKE3_CTX *context, const unsigned char *input, size_t len) +{ + blake3_hasher_update(context, input, len); +} + +PHP_HASH_API void PHP_BLAKE3Final(unsigned char digest[BLAKE3_OUT_LEN/*32*/], PHP_BLAKE3_CTX *context) +{ + blake3_hasher_finalize(context, digest, BLAKE3_OUT_LEN); +} + +PHP_HASH_API int PHP_BLAKE3Copy(const php_hash_ops *ops, PHP_BLAKE3_CTX *orig_context, PHP_BLAKE3_CTX *copy_context) +{ + memcpy(copy_context, orig_context, sizeof(*orig_context)); + return SUCCESS; +} diff --git a/ext/hash/php_hash.h b/ext/hash/php_hash.h index 33cc6a473a7a1..70807b7f1ba6c 100644 --- a/ext/hash/php_hash.h +++ b/ext/hash/php_hash.h @@ -105,6 +105,7 @@ extern const php_hash_ops php_hash_fnv1a32_ops; extern const php_hash_ops php_hash_fnv164_ops; extern const php_hash_ops php_hash_fnv1a64_ops; extern const php_hash_ops php_hash_joaat_ops; +extern const php_hash_ops php_hash_blake3_ops; #define PHP_HASH_HAVAL_OPS(p,b) extern const php_hash_ops php_hash_##p##haval##b##_ops; diff --git a/ext/hash/php_hash_blake3.h b/ext/hash/php_hash_blake3.h new file mode 100644 index 0000000000000..f977b6818eb4c --- /dev/null +++ b/ext/hash/php_hash_blake3.h @@ -0,0 +1,35 @@ +#ifndef PHP_HASH_BLAKE3_H +#define PHP_HASH_BLAKE3_H + +#include "ext/standard/basic_functions.h" +#include "php_hash.h" +#include "blake3.h" + + +// typedef struct blake3_hasher PHP_BLAKE3_CTX; +#define PHP_BLAKE3_CTX blake3_hasher +// help: is V correct? 
+#define PHP_BLAKE3_SPEC "b8b8qb64bbbbb1760" + +PHP_HASH_API void PHP_BLAKE3Init(PHP_BLAKE3_CTX *context); +PHP_HASH_API void PHP_BLAKE3Update(PHP_BLAKE3_CTX *context, const unsigned char *input, size_t len); +PHP_HASH_API void PHP_BLAKE3Final(unsigned char digest[BLAKE3_OUT_LEN/*32*/], PHP_BLAKE3_CTX *context); +PHP_HASH_API int PHP_BLAKE3Copy(const php_hash_ops *ops, PHP_BLAKE3_CTX *orig_context, PHP_BLAKE3_CTX *copy_context); + +const php_hash_ops php_hash_blake3_ops = { + "blake3", + (php_hash_init_func_t) PHP_BLAKE3Init, + (php_hash_update_func_t) PHP_BLAKE3Update, + (php_hash_final_func_t) PHP_BLAKE3Final, + (php_hash_copy_func_t) PHP_BLAKE3Copy, + php_hash_serialize, + php_hash_unserialize, + PHP_BLAKE3_SPEC, // << don't know what this should be, hopefully a dev that knows can remove this comment + BLAKE3_OUT_LEN /*32*/, + BLAKE3_CHUNK_LEN /*1024*/, + sizeof(PHP_BLAKE3_CTX), + 1 +}; + +#endif + diff --git a/ext/hash/tests/hash-clone.phpt b/ext/hash/tests/hash-clone.phpt index 57567c0bc5014..271e310418346 100644 --- a/ext/hash/tests/hash-clone.phpt +++ b/ext/hash/tests/hash-clone.phpt @@ -143,6 +143,9 @@ string(16) "bebc746a33b6ab62" string(5) "joaat" string(8) "aaebf370" string(8) "aaebf370" +string(6) "blake3" +string(64) "e232493e3d416e24ffc588b24a1772ccc6f80290f1920cf15f21313bc3543a51" +string(64) "e232493e3d416e24ffc588b24a1772ccc6f80290f1920cf15f21313bc3543a51" string(10) "haval128,3" string(32) "86362472c8895e68e223ef8b3711d8d9" string(32) "86362472c8895e68e223ef8b3711d8d9" @@ -302,6 +305,9 @@ string(16) "893899e4415a920f" string(5) "joaat" string(8) "aaebf370" string(8) "836fb0e5" +string(6) "blake3" +string(64) "e232493e3d416e24ffc588b24a1772ccc6f80290f1920cf15f21313bc3543a51" +string(64) "dbdea45e5a6c3bad18a4f96d9d4b9105e4cceaa4fc06568f69829435c47587fb" string(10) "haval128,3" string(32) "86362472c8895e68e223ef8b3711d8d9" string(32) "ebeeeb05c18af1e53d2d127b561d5e0d" diff --git a/ext/hash/tests/hash_algos.phpt b/ext/hash/tests/hash_algos.phpt index 
cff71a7771235..daf9a4fc9b4a7 100644 --- a/ext/hash/tests/hash_algos.phpt +++ b/ext/hash/tests/hash_algos.phpt @@ -9,7 +9,7 @@ var_dump(hash_algos()); ?> --EXPECTF-- *** Testing hash_algos() : basic functionality *** -array(53) { +array(54) { [%d]=> string(3) "md2" [%d]=> @@ -87,6 +87,8 @@ array(53) { [%d]=> string(5) "joaat" [%d]=> + string(6) "blake3" + [%d]=> string(10) "haval128,3" [%d]=> string(10) "haval160,3" diff --git a/ext/hash/tests/hash_copy_001.phpt b/ext/hash/tests/hash_copy_001.phpt index 271326178d523..edc66f04f92a0 100644 --- a/ext/hash/tests/hash_copy_001.phpt +++ b/ext/hash/tests/hash_copy_001.phpt @@ -143,6 +143,9 @@ string(16) "bebc746a33b6ab62" string(5) "joaat" string(8) "aaebf370" string(8) "aaebf370" +string(6) "blake3" +string(64) "e232493e3d416e24ffc588b24a1772ccc6f80290f1920cf15f21313bc3543a51" +string(64) "e232493e3d416e24ffc588b24a1772ccc6f80290f1920cf15f21313bc3543a51" string(10) "haval128,3" string(32) "86362472c8895e68e223ef8b3711d8d9" string(32) "86362472c8895e68e223ef8b3711d8d9" @@ -302,6 +305,9 @@ string(16) "893899e4415a920f" string(5) "joaat" string(8) "aaebf370" string(8) "836fb0e5" +string(6) "blake3" +string(64) "e232493e3d416e24ffc588b24a1772ccc6f80290f1920cf15f21313bc3543a51" +string(64) "dbdea45e5a6c3bad18a4f96d9d4b9105e4cceaa4fc06568f69829435c47587fb" string(10) "haval128,3" string(32) "86362472c8895e68e223ef8b3711d8d9" string(32) "ebeeeb05c18af1e53d2d127b561d5e0d" diff --git a/ext/hash/tests/hash_hmac_algos.phpt b/ext/hash/tests/hash_hmac_algos.phpt index 16aec26df6d9d..9435bae60e655 100644 --- a/ext/hash/tests/hash_hmac_algos.phpt +++ b/ext/hash/tests/hash_hmac_algos.phpt @@ -37,6 +37,7 @@ Array [%d] => snefru256 [%d] => gost [%d] => gost-crypto + [%d] => blake3 [%d] => haval128,3 [%d] => haval160,3 [%d] => haval192,3 diff --git a/ext/hash/tests/hash_serialize_001.phpt b/ext/hash/tests/hash_serialize_001.phpt index d515d2c5c55cd..4ef4c518b702a 100644 --- a/ext/hash/tests/hash_serialize_001.phpt +++ 
b/ext/hash/tests/hash_serialize_001.phpt @@ -154,6 +154,9 @@ string(16) "bebc746a33b6ab62" string(5) "joaat" string(8) "aaebf370" string(8) "aaebf370" +string(6) "blake3" +string(64) "e232493e3d416e24ffc588b24a1772ccc6f80290f1920cf15f21313bc3543a51" +string(64) "e232493e3d416e24ffc588b24a1772ccc6f80290f1920cf15f21313bc3543a51" string(10) "haval128,3" string(32) "86362472c8895e68e223ef8b3711d8d9" string(32) "86362472c8895e68e223ef8b3711d8d9" @@ -313,6 +316,9 @@ string(16) "893899e4415a920f" string(5) "joaat" string(8) "836fb0e5" string(8) "836fb0e5" +string(6) "blake3" +string(64) "dbdea45e5a6c3bad18a4f96d9d4b9105e4cceaa4fc06568f69829435c47587fb" +string(64) "dbdea45e5a6c3bad18a4f96d9d4b9105e4cceaa4fc06568f69829435c47587fb" string(10) "haval128,3" string(32) "ebeeeb05c18af1e53d2d127b561d5e0d" string(32) "ebeeeb05c18af1e53d2d127b561d5e0d" diff --git a/ext/hash/tests/hash_serialize_002.phpt b/ext/hash/tests/hash_serialize_002.phpt index b6b4ccfb5eab7..20e3533ae1f23 100644 --- a/ext/hash/tests/hash_serialize_002.phpt +++ b/ext/hash/tests/hash_serialize_002.phpt @@ -83,6 +83,8 @@ string(4) "gost" HashContext with HASH_HMAC option cannot be serialized string(11) "gost-crypto" HashContext with HASH_HMAC option cannot be serialized +string(6) "blake3" +HashContext with HASH_HMAC option cannot be serialized string(10) "haval128,3" HashContext with HASH_HMAC option cannot be serialized string(10) "haval160,3"