|
41 | 41 | #ifdef __SSE2__
|
42 | 42 | #include <emmintrin.h>
|
43 | 43 | #endif
|
| 44 | +#if defined(__aarch64__) || defined(_M_ARM64) |
| 45 | +#include <arm_neon.h> |
| 46 | +#endif |
44 | 47 |
|
45 | 48 | #if defined(ZEND_WIN32) && !defined(ZTS) && defined(_MSC_VER)
|
46 | 49 | /* This performance improvement of tolower() on Windows gives 10-18% on bench.php */
|
@@ -105,7 +108,30 @@ static _locale_t current_locale = NULL;
|
105 | 108 | __m128i blconv_result = _mm_add_epi8(blconv_operand, blconv_add); \
|
106 | 109 | _mm_storeu_si128((__m128i *)(dest), blconv_result);
|
107 | 110 |
|
108 |
| -#endif /* __SSE2__ */ |
#elif defined(__aarch64__) || defined(_M_ARM64)
/* AArch64 NEON implementation of the 16-byte block-conversion primitives,
 * mirroring the SSE2 versions above.  The blconv_* locals declared by one
 * macro are deliberately consumed by the later macros in the same scope. */
#define HAVE_BLOCKCONV

/* Map the byte range [start, end] into the signed-compare domain: adding
 * blconv_offset shifts 'start' onto SCHAR_MIN, so a single signed
 * "less than blconv_threshold" test detects membership in the range. */
#define BLOCKCONV_INIT_RANGE(start, end) \
	const int8x16_t blconv_offset = vdupq_n_s8((signed char)(SCHAR_MIN - start)); \
	const int8x16_t blconv_threshold = vdupq_n_s8(SCHAR_MIN + (end - start) + 1);

/* Number of bytes processed per iteration (16 for a NEON q-register). */
#define BLOCKCONV_STRIDE sizeof(int8x16_t)

/* Broadcast the per-byte adjustment (e.g. 'a' - 'A' for tolower). */
#define BLOCKCONV_INIT_DELTA(delta) \
	const int8x16_t blconv_delta = vdupq_n_s8(delta);

/* Load 16 bytes (vld1q_s8 has no alignment requirement) and compute an
 * all-ones-per-lane mask of the bytes that fall inside the range
 * configured by BLOCKCONV_INIT_RANGE. */
#define BLOCKCONV_LOAD(input) \
	int8x16_t blconv_operand = vld1q_s8((const int8_t*)(input)); \
	uint8x16_t blconv_mask = vcltq_s8(vaddq_s8(blconv_operand, blconv_offset), blconv_threshold);

/* Non-zero iff any loaded byte is in range.  vmaxvq_u8 is a horizontal
 * max reduction, available on AArch64 (which this branch is limited to). */
#define BLOCKCONV_FOUND() vmaxvq_u8(blconv_mask)

/* Apply blconv_delta to the in-range bytes only (mask is all-ones or
 * all-zeros per lane, so AND selects the delta) and store the result. */
#define BLOCKCONV_STORE(dest) \
	int8x16_t blconv_add = vandq_s8(vreinterpretq_s8_u8(blconv_mask), blconv_delta); \
	int8x16_t blconv_result = vaddq_s8(blconv_operand, blconv_add); \
	vst1q_s8((int8_t *)(dest), blconv_result);

#endif /* defined(__aarch64__) || defined(_M_ARM64) */
109 | 135 |
|
110 | 136 | ZEND_API const unsigned char zend_tolower_map[256] = {
|
111 | 137 | 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
|
|
0 commit comments