Skip to content

Commit 1b2db39

Browse files
committed
[#24465] Fix Usearch / Hnswlib precision discrepancy
Summary: Fixing some inconsistencies in index parameters that are causing a discrepancy between Usearch and Hnswlib performance: - Correctly specifying connectivity for hnswlib as num_neighbors_per_vertex instead of max_neighbors_per_vertex. - Passing the ef option into hnswlib configuration. Adding internal statistics introspection to Usearch and Hnswlib index wrappers. PR for hnswlib changes: nmslib/hnswlib#594. PR for usearch changes: unum-cloud/usearch#508 Also allow specifying multiple values of k to pass in as input, as long as they are not greater than the precomputed ground truth result list size. Updating hnsw_tool to always convert uint8_t coordinates to float32 when using Hnswlib to have a fair comparison with Usearch on the SIFT1B dataset. Usearch does not currently support the uint8_t type natively. The changes to src/inline-thirdparty will be pushed as separate commits generated by `build-support/thirdparty_tool --sync-inline-thirdparty`. Test Plan: Jenkins Manual testing using hnsw_tool - hnswlib: https://gist.githubusercontent.com/mbautin/d21580dcac0b51ad2d7bc9fc130c5f9e/raw ``` Hnswlib index with 5 levels max_elements: 1000000 M: 16 maxM: 16 maxM0: 32 ef_construction: 128 ef: 10 mult: 0.360674 Level 0: 1000000 nodes, 21613828 edges, 21.61 average edges per node Level 1: 62323 nodes, 885027 edges, 14.20 average edges per node Level 2: 3855 nodes, 50515 edges, 13.10 average edges per node Level 3: 238 nodes, 2543 edges, 10.68 average edges per node Level 4: 17 nodes, 244 edges, 14.35 average edges per node Totals: 1066433 nodes, 22552157 edges, 21.15 average edges per node i-recall @ 50, i=1..10: 1-recall @ 50: 0.9695000052 2-recall @ 50: 0.9645000100 3-recall @ 50: 0.9604333043 4-recall @ 50: 0.9568499923 5-recall @ 50: 0.9541400075 6-recall @ 50: 0.9504333138 7-recall @ 50: 0.9467428327 8-recall @ 50: 0.9435999990 9-recall @ 50: 0.9406333566 10-recall @ 50: 0.9377999902 ``` - usearch: https://gist.githubusercontent.com/mbautin/74948b310780562e74831eb29e43cb13/raw ``` Usearch index with 4 levels connectivity: 16 connectivity_base: 32 expansion_add: 128 expansion_search: 10 inverse_log_connectivity: 0.360674 Level 0: 1000000 nodes, 20973352 edges, 20.97 average edges per node Level 1: 64036 nodes, 890428 edges, 13.91 average edges per node Level 2: 5090 nodes, 66295 edges, 13.02 average edges per node Level 3: 481 nodes, 5304 edges, 11.03 average edges per node Totals: 1069607 nodes, 21935379 edges, 20.51 average edges per node i-recall@50, i=1..10: 1-recall @ 40: 0.9305999875 2-recall @ 40: 0.9201999903 3-recall @ 40: 0.9141333103 4-recall @ 40: 0.9085000157 5-recall @ 40: 0.9036399722 6-recall @ 40: 0.8987166882 7-recall @ 40: 0.8932142854 8-recall @ 40: 0.8890249729 9-recall @ 40: 0.8852999806 10-recall @ 40: 0.8813199997 ``` Reviewers: sergei, aleksandr.ponomarenko Reviewed By: sergei, aleksandr.ponomarenko Subscribers: ybase Differential Revision: https://phorge.dev.yugabyte.com/D38977
1 parent 9007253 commit 1b2db39

17 files changed

+608
-115
lines changed

build-support/inline_thirdparty.yml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515

1616
dependencies:
1717
- name: usearch
18-
git_url: https://github.com/unum-cloud/usearch
19-
commit: 191d9bb46fe5e2a44d1505ce7563ed51c7e55868
18+
git_url: https://github.com/yugabyte/usearch
19+
tag: v2.15.3-yb-2
2020
src_dir: include
2121
dest_dir: usearch
2222

@@ -27,8 +27,10 @@ dependencies:
2727
dest_dir: fp16
2828

2929
- name: hnswlib
30-
git_url: https://github.com/nmslib/hnswlib
31-
commit: 2142dc6f4dd08e64ab727a7bbd93be7f732e80b0
30+
git_url: https://github.com/yugabyte/hnswlib
31+
# c1b9b79a is the commit in the upstream repo that we branched off of. We apply our patches to
32+
# our branch c1b9b79a-yb while waiting for our PRs to be merged in the official Hnswlib.
33+
tag: vc1b9b79a-yb-1
3234
src_dir: hnswlib
3335
dest_dir: hnswlib/hnswlib
3436

src/yb/gutil/stl_util.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -898,6 +898,51 @@ struct PointerHash {
898898
template <class Value>
899899
using UnorderedStringMap = std::unordered_map<std::string, Value, StringHash, std::equal_to<void>>;
900900

901+
// Define a concept that ensures Container's value_type is T
902+
template<typename Container, typename T>
903+
concept ContainerOf = requires {
904+
typename Container::value_type;
905+
} && std::same_as<typename Container::value_type, T>;
906+
907+
// A unified way to insert values into supported containers
908+
909+
// Overload for std::set
910+
template <typename Element, typename Compare, typename Allocator>
911+
void InsertIntoContainer(
912+
std::set<Element, Compare, Allocator>& container,
913+
const Element& value) {
914+
container.insert(value);
915+
}
916+
917+
template <typename Element, typename Compare, typename Allocator>
918+
void InsertIntoContainer(
919+
std::set<Element, Compare, Allocator>& container,
920+
Element&& value) {
921+
container.insert(std::move(value));
922+
}
923+
924+
// Overload for std::vector
925+
template <typename Element, typename Allocator>
926+
void InsertIntoContainer(
927+
std::vector<Element, Allocator>& container,
928+
const Element& value) {
929+
container.push_back(value);
930+
}
931+
932+
template <typename Element, typename Allocator>
933+
void InsertIntoContainer(
934+
std::vector<Element, Allocator>& container,
935+
Element&& value) {
936+
container.push_back(std::move(value));
937+
}
938+
939+
// Fallback to generate a compile-time error for unsupported containers
940+
template <typename Container, typename Element>
941+
void InsertIntoContainer(Container&, const Element&) {
942+
static_assert(sizeof(Container) == 0,
943+
"InsertIntoContainer is not supported for this container type.");
944+
}
945+
901946
} // namespace yb
902947

903948
// For backward compatibility

src/yb/tools/hnsw_tool.cc

Lines changed: 112 additions & 70 deletions
Large diffs are not rendered by default.

src/yb/util/stol_utils-test.cc

Lines changed: 167 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,21 +21,27 @@ namespace util {
2121
class StolUtilsTest : public YBTest {
2222
protected:
2323
void TestStoi(const std::string& str, const int32_t int_val) {
24-
auto decoded_int = CheckedStoi(str);
25-
ASSERT_OK(decoded_int);
26-
ASSERT_EQ(int_val, *decoded_int);
24+
auto decoded_int = ASSERT_RESULT(CheckedStoi(str));
25+
ASSERT_EQ(int_val, decoded_int);
2726
}
2827

2928
void TestStoll(const std::string& str, const int64_t int_val) {
30-
auto decoded_int = CheckedStoll(str);
31-
ASSERT_OK(decoded_int);
32-
ASSERT_EQ(int_val, *decoded_int);
29+
auto decoded_int = ASSERT_RESULT(CheckedStoll(str));
30+
ASSERT_EQ(int_val, decoded_int);
31+
}
32+
33+
void TestStoull(const std::string& str, const uint64_t int_val) {
34+
auto decoded_int = ASSERT_RESULT(CheckedStoull(str));
35+
ASSERT_EQ(int_val, decoded_int);
3336
}
3437

3538
void TestStold(const std::string& str, const long double double_val) {
36-
auto decoded_double = CheckedStold(str);
37-
ASSERT_OK(decoded_double);
38-
ASSERT_DOUBLE_EQ(double_val, *decoded_double);
39+
auto decoded_double = ASSERT_RESULT(CheckedStold(str));
40+
if (std::isnan(double_val) && std::isnan(decoded_double)) {
41+
// Equality might not work for two NaN values, but we consider them equal here.
42+
return;
43+
}
44+
ASSERT_DOUBLE_EQ(double_val, decoded_double);
3945
}
4046
};
4147

@@ -70,11 +76,31 @@ TEST_F(StolUtilsTest, TestCheckedStoild) {
7076
ASSERT_NOK(CheckedStoll("123 123"));
7177
ASSERT_NOK(CheckedStoll(""));
7278

79+
TestStoull("0", 0);
80+
TestStoull("123", 123);
81+
TestStoull("9223372036854775808", 9223372036854775808ULL);
82+
TestStoull("18446744073709551615", 18446744073709551615ULL);
83+
ASSERT_NOK(CheckedStoull("abcd"));
84+
ASSERT_NOK(CheckedStoull(" 123"));
85+
ASSERT_NOK(CheckedStoull("123abc"));
86+
ASSERT_NOK(CheckedStoull("-1"));
87+
ASSERT_NOK(CheckedStoull("-9223372036854775809"));
88+
ASSERT_NOK(CheckedStoull("123.1"));
89+
ASSERT_NOK(CheckedStoull("123456789123456789123456789"));
90+
ASSERT_NOK(CheckedStoull("123 123"));
91+
ASSERT_NOK(CheckedStoull(""));
92+
7393
TestStold("123", 123);
7494
TestStold("-123", -123);
7595
TestStold("123.1", 123.1);
7696
TestStold("1.7e308", 1.7e308);
7797
TestStold("-1.7e308", -1.7e308);
98+
TestStold("inf", std::numeric_limits<long double>::infinity());
99+
TestStold("InF", std::numeric_limits<long double>::infinity());
100+
TestStold("-inf", -std::numeric_limits<long double>::infinity());
101+
TestStold("-iNf", -std::numeric_limits<long double>::infinity());
102+
TestStold("nAn", std::numeric_limits<long double>::quiet_NaN());
103+
TestStold("nan", std::numeric_limits<long double>::quiet_NaN());
78104
ASSERT_NOK(CheckedStold("abcd"));
79105
ASSERT_NOK(CheckedStold(" 123"));
80106
ASSERT_NOK(CheckedStold("123abc"));
@@ -84,5 +110,137 @@ TEST_F(StolUtilsTest, TestCheckedStoild) {
84110
ASSERT_NOK(CheckedStold(""));
85111
}
86112

113+
114+
TEST_F(StolUtilsTest, TestCheckedParseNumber) {
115+
// Test with double
116+
{
117+
auto result = CheckedParseNumber<double>("123.456");
118+
ASSERT_OK(result);
119+
ASSERT_DOUBLE_EQ(123.456, *result);
120+
121+
result = CheckedParseNumber<double>("-123.456");
122+
ASSERT_OK(result);
123+
ASSERT_DOUBLE_EQ(-123.456, *result);
124+
125+
result = CheckedParseNumber<double>("1.7e308");
126+
ASSERT_OK(result);
127+
ASSERT_DOUBLE_EQ(1.7e308, *result);
128+
129+
result = CheckedParseNumber<double>("inf");
130+
ASSERT_OK(result);
131+
ASSERT_EQ(std::numeric_limits<double>::infinity(), *result);
132+
133+
result = CheckedParseNumber<double>("-inf");
134+
ASSERT_OK(result);
135+
ASSERT_EQ(-std::numeric_limits<double>::infinity(), *result);
136+
137+
result = CheckedParseNumber<double>("nan");
138+
ASSERT_OK(result);
139+
ASSERT_TRUE(std::isnan(*result));
140+
141+
result = CheckedParseNumber<double>("1.8e308");
142+
ASSERT_NOK(result); // Out of range (infinite)
143+
144+
result = CheckedParseNumber<double>("abc");
145+
ASSERT_NOK(result); // Invalid input
146+
}
147+
148+
// Test with float
149+
{
150+
auto result = CheckedParseNumber<float>("123.456");
151+
ASSERT_OK(result);
152+
ASSERT_FLOAT_EQ(123.456f, *result);
153+
154+
result = CheckedParseNumber<float>("-123.456");
155+
ASSERT_OK(result);
156+
ASSERT_FLOAT_EQ(-123.456f, *result);
157+
158+
result = CheckedParseNumber<float>("3.4e38");
159+
ASSERT_OK(result);
160+
ASSERT_FLOAT_EQ(3.4e38f, *result);
161+
162+
result = CheckedParseNumber<float>("inf");
163+
ASSERT_OK(result);
164+
ASSERT_EQ(std::numeric_limits<float>::infinity(), *result);
165+
166+
result = CheckedParseNumber<float>("-inf");
167+
ASSERT_OK(result);
168+
ASSERT_EQ(-std::numeric_limits<float>::infinity(), *result);
169+
170+
result = CheckedParseNumber<float>("nan");
171+
ASSERT_OK(result);
172+
ASSERT_TRUE(std::isnan(*result));
173+
174+
result = CheckedParseNumber<float>("3.5e38");
175+
ASSERT_NOK(result); // Out of range (infinite)
176+
177+
result = CheckedParseNumber<float>("abc");
178+
ASSERT_NOK(result); // Invalid input
179+
}
180+
}
181+
182+
TEST_F(StolUtilsTest, TestParseCommaSeparatedListOfNumbers) {
183+
// Test with vector<int32_t>
184+
{
185+
auto result =
186+
ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("1,2,3");
187+
ASSERT_OK(result);
188+
ASSERT_EQ((std::vector<int32_t>{1, 2, 3}), *result);
189+
190+
// Test with bounds
191+
result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("4,5,6", 4, 6);
192+
ASSERT_OK(result);
193+
ASSERT_EQ((std::vector<int32_t>{4, 5, 6}), *result);
194+
195+
// Test with out-of-bounds value
196+
result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("3,4,5", 4, 6);
197+
ASSERT_NOK(result); // 3 is less than lower bound 4
198+
}
199+
200+
// Test with set<double>
201+
{
202+
auto result = ParseCommaSeparatedListOfNumbers<double, std::set<double>>("1.1,2.2,3.3");
203+
ASSERT_OK(result);
204+
ASSERT_EQ((std::set<double>{1.1, 2.2, 3.3}), *result);
205+
206+
// Test with bounds
207+
result = ParseCommaSeparatedListOfNumbers<double, std::set<double>>("2.5,3.5", 2.0, 4.0);
208+
ASSERT_OK(result);
209+
ASSERT_EQ((std::set<double>{2.5, 3.5}), *result);
210+
211+
// Test with out-of-bounds value
212+
result = ParseCommaSeparatedListOfNumbers<double, std::set<double>>("1.5,2.5", 2.0, 4.0);
213+
ASSERT_NOK(result); // 1.5 is less than lower bound 2.0
214+
}
215+
216+
// Test with vector<uint64_t>
217+
{
218+
auto result =
219+
ParseCommaSeparatedListOfNumbers<uint64_t, std::vector<uint64_t>>("10000000000,20000000000");
220+
ASSERT_OK(result);
221+
ASSERT_EQ((std::vector<uint64_t>{10000000000ULL, 20000000000ULL}), *result);
222+
}
223+
224+
// Test with invalid input
225+
{
226+
auto result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("1,abc,3");
227+
ASSERT_NOK(result); // "abc" cannot be parsed as int32_t
228+
}
229+
230+
// Test with empty input
231+
{
232+
auto result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>("");
233+
ASSERT_OK(result);
234+
ASSERT_TRUE(result->empty());
235+
}
236+
237+
// Test with spaces and tabs
238+
{
239+
auto result = ParseCommaSeparatedListOfNumbers<int32_t, std::vector<int32_t>>(" 7 ,\t8 , 9 ");
240+
ASSERT_OK(result);
241+
ASSERT_EQ((std::vector<int32_t>{7, 8, 9}), *result);
242+
}
243+
}
244+
87245
} // namespace util
88246
} // namespace yb

src/yb/util/stol_utils.cc

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,19 @@
1515

1616
#include <cstring>
1717

18+
#include "yb/util/string_util.h"
19+
1820
using namespace std::placeholders;
1921

2022
namespace yb {
2123

2224
namespace {
2325

24-
Status CreateInvalid(Slice input, int err = 0) {
26+
Status CreateInvalid(Slice input, int err = 0, const char* type_name = "") {
2527
auto message = Format("$0 is not a valid number", input.ToDebugString());
28+
if (*type_name) {
29+
message += Format(" for type $0", type_name);
30+
}
2631
if (err != 0) {
2732
message += ": ";
2833
message += std::strerror(err);
@@ -41,17 +46,31 @@ Status CheckNotSpace(Slice slice) {
4146
template <typename T, typename StrToT>
4247
Result<T> CheckedSton(Slice slice, StrToT str_to_t) {
4348
RETURN_NOT_OK(CheckNotSpace(slice));
49+
if constexpr (std::is_floating_point_v<T>) {
50+
auto maybe_special_value = TryParsingNonNumberValue<T>(slice);
51+
if (maybe_special_value) {
52+
return *maybe_special_value;
53+
}
54+
}
55+
4456
char* str_end;
4557
errno = 0;
4658
T result = str_to_t(slice.cdata(), &str_end);
4759
// Check errno.
4860
if (errno != 0) {
49-
return CreateInvalid(slice, errno);
61+
return CreateInvalid(slice, errno, typeid(T).name());
62+
}
63+
if constexpr (std::is_unsigned<T>::value) {
64+
// Do not allow any minus signs for unsigned types.
65+
for (auto* p = slice.cdata(); p != str_end; ++p) {
66+
if (*p == '-') {
67+
return CreateInvalid(slice, /* err= */ 0, typeid(T).name());
68+
}
69+
}
5070
}
51-
5271
// Check that entire string was processed.
5372
if (str_end != slice.cend()) {
54-
return CreateInvalid(slice);
73+
return CreateInvalid(slice, /* err= */ 0, typeid(T).name());
5574
}
5675

5776
return result;

0 commit comments

Comments
 (0)