Compare unicode character points instead of strcmp

peytondmurray · peytondmurray · commit bce48f6edf68 · 2023-02-21T20:50:38.000-08:00
diff --git a/stringdtype/stringdtype/src/casts.c b/stringdtype/stringdtype/src/casts.c
@@ -295,7 +295,7 @@ string_to_unicode_resolve_descriptors(PyObject *NPY_UNUSED(self),
 // codepoint for the next character, returning the size of the character in
 // bytes. Does not do any validation or error checking: assumes *c* is valid
 // utf-8
-static size_t
+size_t
 utf8_char_to_ucs4_code(unsigned char *c, Py_UCS4 *code)
 {
     if (c[0] <= 0x7F) {
diff --git a/stringdtype/stringdtype/src/casts.h b/stringdtype/stringdtype/src/casts.h
@@ -13,4 +13,7 @@
 PyArrayMethod_Spec **
 get_casts(void);
 
+size_t
+utf8_char_to_ucs4_code(unsigned char *, Py_UCS4 *);
+
 #endif /* _NPY_CASTS_H */
diff --git a/stringdtype/stringdtype/src/dtype.c b/stringdtype/stringdtype/src/dtype.c
@@ -156,12 +156,45 @@ stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
 }
 
 // Implementation of PyArray_CompareFunc.
+// Compares UTF-8 strings by Unicode code point; returns -1, 0 or 1.
 int
-compare_strings(char **a, char **b, PyArrayObject *arr)
+compare_strings(char **a, char **b, PyArrayObject *NPY_UNUSED(arr))
 {
     ss *ss_a = (ss *)*a;
     ss *ss_b = (ss *)*b;
-    return strcmp(ss_a->buf, ss_b->buf);
+
+    // Byte offsets into each UTF-8 buffer (size_t, like the decoder's return)
+    size_t i_a = 0;
+    size_t i_b = 0;
+
+    Py_UCS4 code_a;
+    Py_UCS4 code_b;
+
+    while (i_a < ss_a->len && i_b < ss_b->len) {
+        // Decode in place: a multi-byte character needs the bytes after c[0]
+        unsigned char *ca = (unsigned char *)&ss_a->buf[i_a];
+        unsigned char *cb = (unsigned char *)&ss_b->buf[i_b];
+        i_a += utf8_char_to_ucs4_code(ca, &code_a);
+        i_b += utf8_char_to_ucs4_code(cb, &code_b);
+
+        // Only move on to the next code point if these two are identical
+        if (code_a > code_b) {
+            return 1;
+        }
+        else if (code_a < code_b) {
+            return -1;
+        }
+    }
+
+    if (i_a == ss_a->len) {
+        if (i_b == ss_b->len) {
+            return 0;
+        }
+        return -1;
+    }
+    else {
+        return 1;
+    }
 }
 
 static StringDTypeObject *
@@ -321,7 +354,6 @@ init_string_dtype(void)
     /* Loaded dynamically, so may need to be set here: */
     ((PyObject *)&StringDType)->ob_type = &PyArrayDTypeMeta_Type;
     ((PyTypeObject *)&StringDType)->tp_base = &PyArrayDescr_Type;
-
     if (PyType_Ready((PyTypeObject *)&StringDType) < 0) {
         return -1;
     }
diff --git a/stringdtype/tests/test_stringdtype.py b/stringdtype/tests/test_stringdtype.py
@@ -1,7 +1,6 @@
 import concurrent.futures
 import os
 import pickle
-import string
 import tempfile
 
 import numpy as np
@@ -16,14 +15,8 @@ def string_list():
 
 
 @pytest.fixture
-def string_list_long():
-    abcs = string.ascii_lowercase
-
-    pairs = []
-    for pair in zip(abcs, abcs[1:] + abcs[0]):
-        pairs.append("".join(pair))
-
-    return pairs
+def string_list_similar():
+    return ["left", "right", "leftovers", "righty", "up", "down"]
 
 
 def test_scalar_creation():
@@ -175,10 +168,10 @@ def test_pickle(string_list):
     os.remove(f.name)
 
 
-def test_sort(string_list_long):
+def test_sort(string_list_similar):
     """Test that sorting matches python's internal sorting."""
-    arr = np.array(string_list_long, dtype=StringDType())
-    arr_sorted = np.array(sorted(string_list_long), dtype=StringDType())
+    arr = np.array(string_list_similar, dtype=StringDType())
+    arr_sorted = np.array(sorted(string_list_similar), dtype=StringDType())
 
     np.random.default_rng().shuffle(arr)
     arr.sort()

Original file line number	Diff line number	Diff line change
`@@ -295,7 +295,7 @@ string_to_unicode_resolve_descriptors(PyObject *NPY_UNUSED(self),`
`295`	`295`	`// codepoint for the next character, returning the size of the character in`
`296`	`296`	`// bytes. Does not do any validation or error checking: assumes *c* is valid`
`297`	`297`	`// utf-8`
`298`		`-static size_t`
	`298`	`+size_t`
`299`	`299`	`utf8_char_to_ucs4_code(unsigned char *c, Py_UCS4 *code)`
`300`	`300`	`{`
`301`	`301`	`if (c[0] <= 0x7F) {`