Skip to content

Store contents of static string struct in the array buffer #42

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Feb 28, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 28 additions & 29 deletions stringdtype/stringdtype/src/casts.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,26 @@ string_to_string_resolve_descriptors(PyObject *NPY_UNUSED(self),
}

static int
string_to_string(PyArrayMethod_Context *context, char *const data[],
npy_intp const dimensions[], npy_intp const strides[],
NpyAuxData *NPY_UNUSED(auxdata))
string_to_string(PyArrayMethod_Context *NPY_UNUSED(context),
char *const data[], npy_intp const dimensions[],
npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata))
{
npy_intp N = dimensions[0];
ss **in = (ss **)data[0];
ss **out = (ss **)data[1];
// strides are in bytes but pointer offsets are in pointer widths, so
// divide by the element size (one pointer width) to get the pointer offset
npy_intp in_stride = strides[0] / context->descriptors[0]->elsize;
npy_intp out_stride = strides[1] / context->descriptors[1]->elsize;
char *in = data[0];
char *out = data[1];
npy_intp in_stride = strides[0];
npy_intp out_stride = strides[1];

ss *s = NULL, *os = NULL;

while (N--) {
out[0] = ssdup(in[0]);
if (out[0] == NULL) {
load_string(in, &s);
load_string(out, &os);
if (ssdup(s, os) == -1) {
gil_error(PyExc_MemoryError, "ssdup failed");
return -1;
}

in += in_stride;
out += out_stride;
}
Expand Down Expand Up @@ -114,7 +116,7 @@ unicode_to_string_resolve_descriptors(PyObject *NPY_UNUSED(self),
// is the number of codepoints that are not trailing null codepoints. Returns
// 0 on success and -1 when an invalid code point is found.
static int
utf8_size(Py_UCS4 *codepoints, long max_length, size_t *num_codepoints,
utf8_size(const Py_UCS4 *codepoints, long max_length, size_t *num_codepoints,
size_t *utf8_bytes)
{
size_t ucs4len = max_length;
Expand All @@ -126,7 +128,7 @@ utf8_size(Py_UCS4 *codepoints, long max_length, size_t *num_codepoints,

size_t num_bytes = 0;

for (int i = 0; i < ucs4len; i++) {
for (size_t i = 0; i < ucs4len; i++) {
Py_UCS4 code = codepoints[i];

if (code <= 0x7F) {
Expand Down Expand Up @@ -201,13 +203,11 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],

npy_intp N = dimensions[0];
Py_UCS4 *in = (Py_UCS4 *)data[0];
ss **out = (ss **)data[1];
char *out = data[1];

// 4 bytes per UCS4 character
npy_intp in_stride = strides[0] / 4;
// strides are in bytes but pointer offsets are in pointer widths, so
// divide by the element size (one pointer width) to get the pointer offset
npy_intp out_stride = strides[1] / context->descriptors[1]->elsize;
npy_intp out_stride = strides[1];

while (N--) {
size_t out_num_bytes = 0;
Expand All @@ -217,9 +217,9 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],
gil_error(PyExc_TypeError, "Invalid unicode code point found");
return -1;
}
ss *out_ss = ssnewempty(out_num_bytes);
if (out_ss == NULL) {
gil_error(PyExc_MemoryError, "ssnewempty failed");
ss *out_ss = (ss *)out;
if (ssnewemptylen(out_num_bytes, out_ss) == -1) {
gil_error(PyExc_MemoryError, "ssnewemptylen failed");
return -1;
}
char *out_buf = out_ss->buf;
Expand Down Expand Up @@ -247,9 +247,6 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],
// pad string with null character
out_buf[out_num_bytes] = '\0';

// set out to the address of the beginning of the string
out[0] = out_ss;

in += in_stride;
out += out_stride;
}
Expand Down Expand Up @@ -329,19 +326,20 @@ string_to_unicode(PyArrayMethod_Context *context, char *const data[],
NpyAuxData *NPY_UNUSED(auxdata))
{
npy_intp N = dimensions[0];
ss **in = (ss **)data[0];
char *in = data[0];
Py_UCS4 *out = (Py_UCS4 *)data[1];
// strides are in bytes but pointer offsets are in pointer widths, so
// divide by the element size (one pointer width) to get the pointer offset
npy_intp in_stride = strides[0] / context->descriptors[0]->elsize;
npy_intp in_stride = strides[0];
// 4 bytes per UCS4 character
npy_intp out_stride = strides[1] / 4;
// max number of 4 byte UCS4 characters that can fit in the output
long max_out_size = (context->descriptors[1]->elsize) / 4;

ss *s = NULL;

while (N--) {
unsigned char *this_string = (unsigned char *)((*in)->buf);
size_t n_bytes = (*in)->len;
load_string(in, &s);
unsigned char *this_string = (unsigned char *)(s->buf);
size_t n_bytes = s->len;
size_t tot_n_bytes = 0;

for (int i = 0; i < max_out_size; i++) {
Expand All @@ -363,6 +361,7 @@ string_to_unicode(PyArrayMethod_Context *context, char *const data[],
break;
}
}

in += in_stride;
out += out_stride;
}
Expand Down
43 changes: 26 additions & 17 deletions stringdtype/stringdtype/src/dtype.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ new_stringdtype_instance(void)
if (new == NULL) {
return NULL;
}
new->base.elsize = sizeof(ss *);
new->base.alignment = _Alignof(ss *);
new->base.elsize = sizeof(ss);
new->base.alignment = _Alignof(ss);
new->base.flags |= NPY_NEEDS_INIT;
new->base.flags |= NPY_LIST_PICKLE;
new->base.flags |= NPY_ITEM_REFCOUNT;
Expand Down Expand Up @@ -119,26 +119,35 @@ stringdtype_setitem(StringDTypeObject *NPY_UNUSED(descr), PyObject *obj,
return -1;
}

ss *str_val = ssnewlen(val, length);
if (str_val == NULL) {
// free if dataptr holds preexisting string data,
// ssfree does a NULL check
ssfree((ss *)dataptr);

// copies contents of val into item_val->buf
if (ssnewlen(val, length, (ss *)dataptr) == -1) {
PyErr_SetString(PyExc_MemoryError, "ssnewlen failed");
return -1;
}
// the dtype instance has the NPY_NEEDS_INIT flag set,
// so if *dataptr is NULL, that means we're initializing
// the array and don't need to free an existing string
if (*dataptr != NULL) {
free((ss *)*dataptr);
}
*dataptr = (char *)str_val;

// val_obj must stay alive until here to ensure *val* doesn't get
// deallocated
Py_DECREF(val_obj);
return 0;
}

static PyObject *
stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
stringdtype_getitem(StringDTypeObject *NPY_UNUSED(descr), char **dataptr)
{
PyObject *val_obj = PyUnicode_FromString(((ss *)*dataptr)->buf);
char *data;

if (*dataptr == NULL) {
data = "\0";
}
else {
data = ((ss *)dataptr)->buf;
}

PyObject *val_obj = PyUnicode_FromString(data);

if (val_obj == NULL) {
return NULL;
Expand All @@ -161,8 +170,8 @@ stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
int
compare_strings(char **a, char **b, PyArrayObject *NPY_UNUSED(arr))
{
ss *ss_a = (ss *)*a;
ss *ss_b = (ss *)*b;
ss *ss_a = (ss *)a;
ss *ss_b = (ss *)b;
return strcmp(ss_a->buf, ss_b->buf);
}

Expand All @@ -181,8 +190,8 @@ stringdtype_clear_loop(void *NPY_UNUSED(traverse_context),
{
while (size--) {
if (data != NULL) {
free(*(ss **)data);
*(ss **)data = NULL;
ssfree((ss *)data);
memset(data, 0, sizeof(ss));
}
data += stride;
}
Expand Down
7 changes: 3 additions & 4 deletions stringdtype/stringdtype/src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,15 @@ _memory_usage(PyObject *NPY_UNUSED(self), PyObject *obj)

// initialize with the size of the internal buffer
size_t memory_usage = PyArray_NBYTES(arr);
size_t struct_size = sizeof(ss);

do {
ss **in = (ss **)*dataptr;
npy_intp stride = *strideptr / descr->elsize;
char *in = dataptr[0];
npy_intp stride = *strideptr;
npy_intp count = *innersizeptr;

while (count--) {
// +1 byte for the null terminator
memory_usage += (*in)->len + struct_size + 1;
memory_usage += ((ss *)in)->len + 1;
in += stride;
}

Expand Down
79 changes: 57 additions & 22 deletions stringdtype/stringdtype/src/static_string.c
Original file line number Diff line number Diff line change
@@ -1,41 +1,76 @@
#include "static_string.h"

// allocates a new ss string of length len, filling with the contents of init
ss *
ssnewlen(const char *init, size_t len)
int
ssnewlen(const char *init, size_t len, ss *to_init)
{
if ((to_init->buf != NULL) || (to_init->len != 0)) {
return -1;
}

// one extra byte for null terminator
ss *ret = (ss *)malloc(sizeof(ss) + sizeof(char) * (len + 1));
char *ret_buf = (char *)malloc(sizeof(char) * (len + 1));

if (ret == NULL) {
return NULL;
if (ret_buf == NULL) {
return -1;
}

ret->len = len;
to_init->len = len;

if (len > 0) {
memcpy(ret->buf, init, len);
memcpy(ret_buf, init, len);
}

ret->buf[len] = '\0';
ret_buf[len] = '\0';

to_init->buf = ret_buf;

return 0;
}

return ret;
void
ssfree(ss *str)
{
if (str->buf != NULL) {
free(str->buf);
str->buf = NULL;
}
str->len = 0;
}

// returns a new heap-allocated copy of input string *s*
ss *
ssdup(const ss *s)
int
ssdup(ss *in, ss *out)
{
return ssnewlen(s->buf, s->len);
return ssnewlen(in->buf, in->len, out);
}

// returns a new, empty string of length len
// does not do any initialization, the caller must
// initialize and null-terminate the string
ss *
ssnewempty(size_t len)
int
ssnewemptylen(size_t num_bytes, ss *out)
{
ss *ret = (ss *)malloc(sizeof(ss) + sizeof(char) * (len + 1));
ret->len = len;
return ret;
if (out->len != 0 || out->buf != NULL) {
return -1;
}

char *buf = (char *)malloc(sizeof(char) * (num_bytes + 1));

if (buf == NULL) {
return -1;
}

out->buf = buf;
out->len = num_bytes;

return 0;
}

static ss EMPTY = {0, "\0"};

void
load_string(char *data, ss **out)
{
if (data == NULL) {
*out = &EMPTY;
}
else {
*out = (ss *)data;
}
}
43 changes: 31 additions & 12 deletions stringdtype/stringdtype/src/static_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,40 @@

typedef struct ss {
size_t len;
char buf[];
char *buf;
} ss;

// allocates a new ss string of length len, filling with the contents of init
ss *
ssnewlen(const char *init, size_t len);
// Allocates a new buffer for *to_init*, filling with the copied contents of
// *init* and sets *to_init->len* to *len*.
int
ssnewlen(const char *init, size_t len, ss *to_init);

// returns a new heap-allocated copy of input string *s*
ss *
ssdup(const ss *s);
// Frees the internal string buffer held by *str*. If str->buf is NULL, does
// nothing.
void
ssfree(ss *str);

// returns a new, empty string of length len
// does not do any initialization, the caller must
// initialize and null-terminate the string
ss *
ssnewempty(size_t len);
// copies the contents out *in* into *out*. Allocates a new string buffer for
// *out*, checks that *out->buf* is NULL and *out->len* is zero and errors
// otherwise. Also errors if malloc fails.
int
ssdup(ss *in, ss *out);

// Allocates a new string buffer for *out* with enough capacity to store
// *num_bytes* of text. The actual allocation is num_bytes + 1 bytes, to
// account for the null terminator. Does not do any initialization, the caller
// must initialize and null-terminate the string buffer. Errors if *out*
// already contains data or if malloc fails.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚀 This is super valuable documentation. Thank you!

int
ssnewemptylen(size_t num_bytes, ss *out);

// Interpret the contents of buffer *data* as an ss struct and set *out* to
// that struct. If *data* is NULL, set *out* to point to a statically
// allocated, empty SS struct. Since this function may set *out* to point to
// statically allocated data, do not ever free memory owned by an output of
// this function. That means this function is most useful for read-only
// applications.
void
load_string(char *data, ss **out);

#endif /*_NPY_STATIC_STRING_H */
Loading