@@ -1613,14 +1613,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
     } \
     res = GGML_F32x4_REDUCE_ONE(x[0]); \
 }
@@ -1651,14 +1654,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F16x8_MUL vmulq_f16
 #define GGML_F16x8_REDUCE(res, x) \
 { \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
-        x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
+    int offset = GGML_F16_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
-        x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
-        x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
     } \
     const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
     const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1725,14 +1731,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x8_MUL _mm256_mul_ps
 #define GGML_F32x8_REDUCE(res, x) \
 { \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
     } \
     const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                  _mm256_extractf128_ps(x[0], 1)); \
@@ -1822,14 +1831,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL vec_mul
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vec_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vec_add(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vec_add(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
     } \
     res = vec_extract(x[0], 0) + \
           vec_extract(x[0], 1) + \
@@ -1885,14 +1897,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL wasm_f32x4_mul
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
     res = wasm_f32x4_extract_lane(x[0], 0) + \
           wasm_f32x4_extract_lane(x[0], 1) + \
@@ -1947,14 +1962,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16x4_MUL wasm_f32x4_mul
 #define GGML_F16x4_REDUCE(res, x) \
 { \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F16_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
     res = wasm_f32x4_extract_lane(x[0], 0) + \
           wasm_f32x4_extract_lane(x[0], 1) + \
@@ -1996,14 +2014,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F32x4_MUL _mm_mul_ps
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
     const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
     res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
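
Note: every hunk above applies the same rewrite. The strided pairwise sums (x[2*i] += x[2*i+1], then x[4*i] += x[4*i+2], then x[8*i] += x[8*i+4]) are replaced by a halving reduction that keeps folding the upper half of the accumulator array onto the lower half until the total lands in x[0]. Below is a minimal scalar sketch of that loop shape, assuming the accumulator count is a power of two (as GGML_F32_ARR/GGML_F16_ARR are here); the function name reduce_f32 and the use of plain float instead of SIMD registers are illustrative only, not part of the commit.

#include <stdio.h>

/* Scalar model of the new reduce loop shape: each pass adds the upper half
 * of the accumulators onto the lower half, so after log2(n) passes the
 * running total sits in x[0]. In the macros, each x[i] is a SIMD register
 * and the loop is unrolled into three fixed halvings; n must be a power
 * of two. */
static float reduce_f32(float *x, int n) {
    for (int offset = n >> 1; offset > 0; offset >>= 1) {
        for (int i = 0; i < offset; ++i) {
            x[i] += x[offset + i];
        }
    }
    return x[0];
}

int main(void) {
    float acc[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    printf("%f\n", reduce_f32(acc, 8)); /* prints 36.000000 */
    return 0;
}

For the power-of-two array sizes used here, the old and new indexing compute the same sum; the new form keeps the active accumulators contiguous at the front of the array and reuses one loop body for every pass, which is why the three unrolled stages in each macro are identical apart from the offset update.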