Commit b817f87

ggml : convert interleaved addressing to sequential addressing for reduce functions (leejet#117)
* Convert interleaved addressing to sequential addressing for REDUCE
* update addressing on new archs
1 parent b9b23b4 commit b817f87
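The change replaces the interleaved pairwise additions (stride-2, stride-4, stride-8 indices) with a sequential pattern that repeatedly folds the upper half of the accumulator array onto the lower half. Below is a minimal scalar sketch of the two access patterns, written for this page as an illustration only: the names reduce_interleaved and reduce_sequential and the use of plain floats in place of vector registers are assumptions, not code from the commit. It assumes the accumulator count N is a power of two, as GGML_F32_ARR and GGML_F16_ARR are in ggml.

// Illustration only: scalar stand-in for the SIMD reduce macros in this commit.
#include <stdio.h>

#define N 8  // assumed accumulator count, power of two

// Old pattern: interleaved addressing. Each pass adds neighbouring pairs in
// place, so the live elements end up strided across the array (x[0], x[2], ...).
static float reduce_interleaved(float x[N]) {
    for (int i = 0; i < N/2; ++i) x[2*i] = x[2*i] + x[2*i+1];
    for (int i = 0; i < N/4; ++i) x[4*i] = x[4*i] + x[4*i+2];
    for (int i = 0; i < N/8; ++i) x[8*i] = x[8*i] + x[8*i+4];
    return x[0];
}

// New pattern: sequential addressing. Each pass folds the upper half onto the
// lower half, so the live elements stay contiguous (x[0] .. x[offset-1]).
static float reduce_sequential(float x[N]) {
    int offset = N >> 1;
    for (int i = 0; i < offset; ++i) x[i] += x[offset+i];
    offset >>= 1;
    for (int i = 0; i < offset; ++i) x[i] += x[offset+i];
    offset >>= 1;
    for (int i = 0; i < offset; ++i) x[i] += x[offset+i];
    return x[0];
}

int main(void) {
    float a[N] = {1, 2, 3, 4, 5, 6, 7, 8};
    float b[N] = {1, 2, 3, 4, 5, 6, 7, 8};
    // Both variants produce the same total (36 here); only the index pattern
    // used during the passes differs.
    printf("%g %g\n", reduce_interleaved(a), reduce_sequential(b));
    return 0;
}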

File tree

1 file changed: +63 −42 lines


src/ggml.c

Lines changed: 63 additions & 42 deletions
@@ -1613,14 +1613,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]);  \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f32(x[i], x[offset+i]);   \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]);  \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f32(x[i], x[offset+i]);   \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]);  \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f32(x[i], x[offset+i]);   \
     }                                          \
     res = GGML_F32x4_REDUCE_ONE(x[0]);         \
 }
@@ -1651,14 +1654,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F16x8_MUL vmulq_f16
 #define GGML_F16x8_REDUCE(res, x)                              \
 {                                                              \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) {                 \
-        x[2*i] = vaddq_f16(x[2*i], x[2*i+1]);                  \
+    int offset = GGML_F16_ARR >> 1;                            \
+    for (int i = 0; i < offset; ++i) {                         \
+        x[i] = vaddq_f16(x[i], x[offset+i]);                   \
     }                                                          \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) {                 \
-        x[4*i] = vaddq_f16(x[4*i], x[4*i+2]);                  \
+    offset >>= 1;                                              \
+    for (int i = 0; i < offset; ++i) {                         \
+        x[i] = vaddq_f16(x[i], x[offset+i]);                   \
     }                                                          \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) {                 \
-        x[8*i] = vaddq_f16(x[8*i], x[8*i+4]);                  \
+    offset >>= 1;                                              \
+    for (int i = 0; i < offset; ++i) {                         \
+        x[i] = vaddq_f16(x[i], x[offset+i]);                   \
     }                                                          \
     const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
     const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1725,14 +1731,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x8_MUL _mm256_mul_ps
 #define GGML_F32x8_REDUCE(res, x)                                 \
 {                                                                 \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {                    \
-        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]);                 \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
     }                                                             \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {                    \
-        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]);                 \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
     }                                                             \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {                    \
-        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]);                 \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
     }                                                             \
     const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),   \
                                  _mm256_extractf128_ps(x[0], 1)); \
@@ -1822,14 +1831,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL vec_mul
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vec_add(x[2*i], x[2*i+1]);    \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vec_add(x[4*i], x[4*i+2]);    \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vec_add(x[8*i], x[8*i+4]);    \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
     }                                          \
     res = vec_extract(x[0], 0) +               \
           vec_extract(x[0], 1) +               \
@@ -1885,14 +1897,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL wasm_f32x4_mul
 #define GGML_F32x4_REDUCE(res, x)                  \
 {                                                  \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
     res = wasm_f32x4_extract_lane(x[0], 0) +       \
           wasm_f32x4_extract_lane(x[0], 1) +       \
@@ -1947,14 +1962,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16x4_MUL wasm_f32x4_mul
 #define GGML_F16x4_REDUCE(res, x)                  \
 {                                                  \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) {     \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F16_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) {     \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) {     \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
     res = wasm_f32x4_extract_lane(x[0], 0) +       \
           wasm_f32x4_extract_lane(x[0], 1) +       \
@@ -1996,14 +2014,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F32x4_MUL _mm_mul_ps
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);  \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);  \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);  \
     }                                          \
     const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
     res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));  \
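For reference, a standalone sketch of the updated AVX pattern that can be compiled outside of ggml (e.g. gcc -mavx -O2 reduce_demo.c). GGML_F32_ARR = 4 is assumed here to mirror ggml's AVX configuration (four __m256 accumulators of eight floats); the demo data and the main() wrapper are illustrative and not part of the commit.

// Sketch only: exercises the sequential-addressing reduce from GGML_F32x8_REDUCE.
#include <immintrin.h>
#include <stdio.h>

#define GGML_F32_ARR 4  // assumed accumulator count for the AVX path

int main(void) {
    __m256 x[GGML_F32_ARR];

    // Fill the four accumulators with the values 1..32.
    float init[32];
    for (int i = 0; i < 32; ++i) init[i] = (float)(i + 1);
    for (int k = 0; k < GGML_F32_ARR; ++k) x[k] = _mm256_loadu_ps(init + 8*k);

    // Sequential addressing: fold the upper half of the accumulator array
    // onto the lower half until only x[0] remains.
    int offset = GGML_F32_ARR >> 1;
    for (int i = 0; i < offset; ++i) x[i] = _mm256_add_ps(x[i], x[offset+i]);
    offset >>= 1;
    for (int i = 0; i < offset; ++i) x[i] = _mm256_add_ps(x[i], x[offset+i]);
    offset >>= 1;
    for (int i = 0; i < offset; ++i) x[i] = _mm256_add_ps(x[i], x[offset+i]);

    // Horizontal reduction of x[0] into a single float, as in the macro tail.
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),
                                 _mm256_extractf128_ps(x[0], 1));
    const __m128 t1 = _mm_hadd_ps(t0, t0);
    float res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));

    printf("sum = %f (expected 528)\n", res); // 1 + 2 + ... + 32 = 528
    return 0;
}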
