diff --git a/pandas/_libs/include/pandas/parser/io.h b/pandas/_libs/include/pandas/parser/io.h index cbe6bc04b7663..c707c23b567d2 100644 --- a/pandas/_libs/include/pandas/parser/io.h +++ b/pandas/_libs/include/pandas/parser/io.h @@ -25,7 +25,7 @@ typedef struct _rd_source { void *new_rd_source(PyObject *obj); -int del_rd_source(void *src); +void del_rd_source(void *src); -void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, +char *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h index 61f15dcef8d27..58a09ae1bba39 100644 --- a/pandas/_libs/include/pandas/parser/pd_parser.h +++ b/pandas/_libs/include/pandas/parser/pd_parser.h @@ -20,8 +20,8 @@ typedef struct { int (*to_double)(char *, double *, char, char, int *); int (*floatify)(PyObject *, double *, int *); void *(*new_rd_source)(PyObject *); - int (*del_rd_source)(void *); - void *(*buffer_rd_bytes)(void *, size_t, size_t *, int *, const char *); + void (*del_rd_source)(void *); + char *(*buffer_rd_bytes)(void *, size_t, size_t *, int *, const char *); void (*uint_state_init)(uint_state *); int (*uint64_conflict)(uint_state *); void (*coliter_setup)(coliter_t *, parser_t *, int64_t, int64_t); @@ -30,7 +30,7 @@ typedef struct { void (*parser_free)(parser_t *); void (*parser_del)(parser_t *); int (*parser_add_skiprow)(parser_t *, int64_t); - int (*parser_set_skipfirstnrows)(parser_t *, int64_t); + void (*parser_set_skipfirstnrows)(parser_t *, int64_t); void (*parser_set_default_options)(parser_t *); int (*parser_consume_rows)(parser_t *, size_t); int (*parser_trim_buffers)(parser_t *); diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h index 6a46ad637a401..ade783e3716de 100644 --- a/pandas/_libs/include/pandas/parser/tokenizer.h +++ b/pandas/_libs/include/pandas/parser/tokenizer.h @@ -84,9 +84,9 @@ typedef enum { typedef enum { ERROR, WARN, SKIP } BadLineHandleMethod; -typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, +typedef char *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); -typedef int (*io_cleanup)(void *src); +typedef void (*io_cleanup)(void *src); typedef struct parser_t { void *source; @@ -187,7 +187,7 @@ int parser_trim_buffers(parser_t *self); int parser_add_skiprow(parser_t *self, int64_t row); -int parser_set_skipfirstnrows(parser_t *self, int64_t nrows); +void parser_set_skipfirstnrows(parser_t *self, int64_t nrows); void parser_free(parser_t *self); diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index ab28b34be58f2..204b242d9eb73 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -152,9 +152,9 @@ cdef extern from "pandas/parser/tokenizer.h": WARN, SKIP - ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, + ctypedef char* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) - ctypedef int (*io_cleanup)(void *src) + ctypedef void (*io_cleanup)(void *src) ctypedef struct parser_t: void *source @@ -247,9 +247,9 @@ cdef extern from "pandas/parser/tokenizer.h": cdef extern from "pandas/parser/pd_parser.h": void *new_rd_source(object obj) except NULL - int del_rd_source(void *src) + void del_rd_source(void *src) - void* buffer_rd_bytes(void *source, size_t nbytes, + char* buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) void uint_state_init(uint_state *self) @@ -266,7 +266,7 @@ cdef extern from "pandas/parser/pd_parser.h": void parser_del(parser_t *self) nogil int parser_add_skiprow(parser_t *self, int64_t row) - int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) + void parser_set_skipfirstnrows(parser_t *self, int64_t nrows) void parser_set_default_options(parser_t *self) @@ -318,13 +318,13 @@ cdef double round_trip_wrapper(const char *p, char **q, char decimal, return round_trip(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) -cdef void* buffer_rd_bytes_wrapper(void *source, size_t nbytes, +cdef char* buffer_rd_bytes_wrapper(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) noexcept: return buffer_rd_bytes(source, nbytes, bytes_read, status, encoding_errors) -cdef int del_rd_source_wrapper(void *src) noexcept: - return del_rd_source(src) +cdef void del_rd_source_wrapper(void *src) noexcept: + del_rd_source(src) cdef class TextReader: diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 29c2c8d095907..851901481d222 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -35,12 +35,10 @@ void *new_rd_source(PyObject *obj) { */ -int del_rd_source(void *rds) { +void del_rd_source(void *rds) { Py_XDECREF(RDS(rds)->obj); Py_XDECREF(RDS(rds)->buffer); free(rds); - - return 0; } /* @@ -49,26 +47,20 @@ int del_rd_source(void *rds) { */ -void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, +char *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) { - PyGILState_STATE state; - PyObject *result, *func, *args, *tmp; - - void *retval; - - size_t length; rd_source *src = RDS(source); - state = PyGILState_Ensure(); + PyGILState_STATE state = PyGILState_Ensure(); /* delete old object */ Py_XDECREF(src->buffer); src->buffer = NULL; - args = Py_BuildValue("(i)", nbytes); + PyObject *args = Py_BuildValue("(i)", nbytes); - func = PyObject_GetAttrString(src->obj, "read"); + PyObject *func = PyObject_GetAttrString(src->obj, "read"); /* Note: PyObject_CallObject requires the GIL */ - result = PyObject_CallObject(func, args); + PyObject *result = PyObject_CallObject(func, args); Py_XDECREF(args); Py_XDECREF(func); @@ -78,7 +70,7 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, *status = CALLING_READ_FAILED; return NULL; } else if (!PyBytes_Check(result)) { - tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors); + PyObject *tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors); Py_DECREF(result); if (tmp == NULL) { PyGILState_Release(state); @@ -87,7 +79,7 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, result = tmp; } - length = PySequence_Length(result); + const size_t length = PySequence_Length(result); if (length == 0) *status = REACHED_EOF; @@ -96,7 +88,7 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, /* hang on to the Python object */ src->buffer = result; - retval = (void *)PyBytes_AsString(result); + char *retval = PyBytes_AsString(result); PyGILState_Release(state); diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c index 41689704ccffc..88b6603c3c6f9 100644 --- a/pandas/_libs/src/parser/pd_parser.c +++ b/pandas/_libs/src/parser/pd_parser.c @@ -24,7 +24,6 @@ static int to_double(char *item, double *p_value, char sci, char decimal, } static int floatify(PyObject *str, double *result, int *maybe_int) { - int status; char *data; PyObject *tmp = NULL; const char sci = 'E'; @@ -43,7 +42,7 @@ static int floatify(PyObject *str, double *result, int *maybe_int) { return -1; } - status = to_double(data, result, sci, dec, maybe_int); + const int status = to_double(data, result, sci, dec, maybe_int); if (!status) { /* handle inf/-inf infinity/-infinity */ diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c9466c485ae94..efe448b034806 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -22,6 +22,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include #include +#include #include "pandas/portable.h" @@ -107,18 +108,7 @@ void parser_set_default_options(parser_t *self) { parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); } -int parser_clear_data_buffers(parser_t *self) { - free_if_not_null((void *)&self->stream); - free_if_not_null((void *)&self->words); - free_if_not_null((void *)&self->word_starts); - free_if_not_null((void *)&self->line_start); - free_if_not_null((void *)&self->line_fields); - return 0; -} - -int parser_cleanup(parser_t *self) { - int status = 0; - +static void parser_cleanup(parser_t *self) { // XXX where to put this free_if_not_null((void *)&self->error_msg); free_if_not_null((void *)&self->warn_msg); @@ -128,23 +118,13 @@ int parser_cleanup(parser_t *self) { self->skipset = NULL; } - if (parser_clear_data_buffers(self) < 0) { - status = -1; - } - if (self->cb_cleanup != NULL) { - if (self->cb_cleanup(self->source) < 0) { - status = -1; - } + self->cb_cleanup(self->source); self->cb_cleanup = NULL; } - - return status; } int parser_init(parser_t *self) { - int64_t sz; - /* Initialize data buffers */ @@ -167,8 +147,9 @@ int parser_init(parser_t *self) { self->stream_len = 0; // word pointers and metadata - sz = STREAM_INIT_SIZE / 10; - sz = sz ? sz : 1; + _Static_assert(STREAM_INIT_SIZE / 10 > 0, + "STREAM_INIT_SIZE must be defined and >= 10"); + const int64_t sz = STREAM_INIT_SIZE / 10; self->words = malloc(sz * sizeof(char *)); self->word_starts = malloc(sz * sizeof(int64_t)); self->max_words_cap = sz; @@ -220,17 +201,14 @@ void parser_free(parser_t *self) { void parser_del(parser_t *self) { free(self); } static int make_stream_space(parser_t *self, size_t nbytes) { - uint64_t i, cap, length; - int status; - void *orig_ptr, *newptr; - // Can we fit potentially nbytes tokens (+ null terminators) in the stream? /* TOKEN STREAM */ - orig_ptr = (void *)self->stream; + int status; + char *orig_ptr = (void *)self->stream; TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) self->stream = @@ -248,7 +226,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { if (self->stream != orig_ptr) { self->pword_start = self->stream + self->word_start; - for (i = 0; i < self->words_len; ++i) { + for (uint64_t i = 0; i < self->words_len; ++i) { self->words[i] = self->stream + self->word_starts[i]; } } @@ -257,7 +235,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { WORD VECTORS */ - cap = self->words_cap; + const uint64_t words_cap = self->words_cap; /** * If we are reading in chunks, we need to be aware of the maximum number @@ -267,11 +245,9 @@ static int make_stream_space(parser_t *self, size_t nbytes) { * Otherwise, we risk a buffer overflow if we mistakenly under-allocate * just because a recent chunk did not have as many words. */ - if (self->words_len + nbytes < self->max_words_cap) { - length = self->max_words_cap - nbytes - 1; - } else { - length = self->words_len; - } + const uint64_t length = self->words_len + nbytes < self->max_words_cap + ? self->max_words_cap - nbytes - 1 + : self->words_len; self->words = (char **)grow_buffer((void *)self->words, length, &self->words_cap, @@ -284,23 +260,23 @@ static int make_stream_space(parser_t *self, size_t nbytes) { } // realloc took place - if (cap != self->words_cap) { + if (words_cap != self->words_cap) { TRACE(("make_stream_space: cap != self->words_cap, nbytes = %d, " "self->words_cap=%d\n", nbytes, self->words_cap)) - newptr = - realloc((void *)self->word_starts, sizeof(int64_t) * self->words_cap); + int64_t *newptr = (int64_t *)realloc(self->word_starts, + sizeof(int64_t) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->word_starts = (int64_t *)newptr; + self->word_starts = newptr; } } /* LINE VECTORS */ - cap = self->lines_cap; + const uint64_t lines_cap = self->lines_cap; self->line_start = (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, &self->lines_cap, nbytes, sizeof(int64_t), &status); @@ -312,14 +288,14 @@ static int make_stream_space(parser_t *self, size_t nbytes) { } // realloc took place - if (cap != self->lines_cap) { + if (lines_cap != self->lines_cap) { TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) - newptr = - realloc((void *)self->line_fields, sizeof(int64_t) * self->lines_cap); + int64_t *newptr = (int64_t *)realloc(self->line_fields, + sizeof(int64_t) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - self->line_fields = (int64_t *)newptr; + self->line_fields = newptr; } } @@ -333,7 +309,7 @@ static int push_char(parser_t *self, char c) { TRACE(("push_char: ERROR!!! self->stream_len(%d) >= " "self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) - int64_t bufsize = 100; + const size_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); @@ -349,7 +325,7 @@ int PANDAS_INLINE end_field(parser_t *self) { TRACE(("end_field: ERROR!!! self->words_len(%zu) >= " "self->words_cap(%zu)\n", self->words_len, self->words_cap)) - int64_t bufsize = 100; + const size_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); @@ -381,30 +357,24 @@ int PANDAS_INLINE end_field(parser_t *self) { } static void append_warning(parser_t *self, const char *msg) { - int64_t ex_length; - int64_t length = strlen(msg); - void *newptr; + const int64_t length = strlen(msg); if (self->warn_msg == NULL) { self->warn_msg = malloc(length + 1); snprintf(self->warn_msg, length + 1, "%s", msg); } else { - ex_length = strlen(self->warn_msg); - newptr = realloc(self->warn_msg, ex_length + length + 1); + const int64_t ex_length = strlen(self->warn_msg); + char *newptr = (char *)realloc(self->warn_msg, ex_length + length + 1); if (newptr != NULL) { - self->warn_msg = (char *)newptr; + self->warn_msg = newptr; snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); } } } static int end_line(parser_t *self) { - char *msg; - int64_t fields; int64_t ex_fields = self->expected_fields; - int64_t bufsize = 100; // for error or warning messages - - fields = self->line_fields[self->lines]; + int64_t fields = self->line_fields[self->lines]; TRACE(("end_line: Line end, nfields: %d\n", fields)); @@ -447,6 +417,7 @@ static int end_line(parser_t *self) { // file_lines is now the actual file line number (starting at 1) if (self->on_bad_lines == ERROR) { + const size_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Expected %" PRId64 " fields in line %" PRIu64 ", saw %" PRId64 @@ -460,7 +431,8 @@ static int end_line(parser_t *self) { // simply skip bad lines if (self->on_bad_lines == WARN) { // pass up error message - msg = malloc(bufsize); + const size_t bufsize = 100; + char *msg = (char *)malloc(bufsize); snprintf(msg, bufsize, "Skipping line %" PRIu64 ": expected %" PRId64 " fields, saw %" PRId64 "\n", @@ -474,7 +446,7 @@ static int end_line(parser_t *self) { if ((self->lines >= self->header_end + 1) && fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { - int64_t bufsize = 100; + const size_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; @@ -494,7 +466,7 @@ static int end_line(parser_t *self) { if (self->lines >= self->lines_cap) { TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) - int64_t bufsize = 100; + const size_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - " @@ -532,13 +504,11 @@ int parser_add_skiprow(parser_t *self, int64_t row) { return 0; } -int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { +void parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { // self->file_lines is zero based so subtract 1 from nrows if (nrows > 0) { self->skip_first_N_rows = nrows - 1; } - - return 0; } static int parser_buffer_bytes(parser_t *self, size_t nbytes, @@ -556,7 +526,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, self->datalen = bytes_read; if (status != REACHED_EOF && self->data == NULL) { - int64_t bufsize = 200; + const size_t bufsize = 200; self->error_msg = malloc(bufsize); if (status == CALLING_READ_FAILED) { @@ -586,7 +556,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, if (slen >= self->stream_cap) { \ TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ self->stream_cap)) \ - int64_t bufsize = 100; \ + const size_t bufsize = 100; \ self->error_msg = malloc(bufsize); \ snprintf(self->error_msg, bufsize, \ "Buffer overflow caught - possible malformed input file.\n"); \ @@ -664,22 +634,14 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, self->datapos += 3; \ } -int skip_this_line(parser_t *self, int64_t rownum) { - int should_skip; - PyObject *result; - PyGILState_STATE state; - +static int skip_this_line(parser_t *self, int64_t rownum) { if (self->skipfunc != NULL) { - state = PyGILState_Ensure(); - result = PyObject_CallFunction(self->skipfunc, "i", rownum); + PyGILState_STATE state = PyGILState_Ensure(); + PyObject *result = PyObject_CallFunction(self->skipfunc, "i", rownum); // Error occurred. It will be processed // and caught at the Cython level. - if (result == NULL) { - should_skip = -1; - } else { - should_skip = PyObject_IsTrue(result); - } + const int should_skip = result == NULL ? -1 : PyObject_IsTrue(result); Py_XDECREF(result); PyGILState_Release(state); @@ -693,12 +655,8 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { - int64_t i; - uint64_t slen; - int should_skip; - char c; - char *stream; +static int tokenize_bytes(parser_t *self, size_t line_limit, + uint64_t start_lines) { char *buf = self->data + self->datapos; const char lineterminator = @@ -716,14 +674,14 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { (self->escapechar != '\0') ? self->escapechar : 1000; if (make_stream_space(self, self->datalen - self->datapos) < 0) { - int64_t bufsize = 100; + const size_t bufsize = 100; self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; } - stream = self->stream + self->stream_len; - slen = self->stream_len; + char *stream = self->stream + self->stream_len; + uint64_t slen = self->stream_len; TRACE(("%s\n", buf)); @@ -731,6 +689,8 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { CHECK_FOR_BOM(); } + char c; + int64_t i; for (i = self->datapos; i < self->datalen; ++i) { // next character in file c = *buf++; @@ -840,9 +800,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { break; } - case START_RECORD: + case START_RECORD: { // start of record - should_skip = skip_this_line(self, self->file_lines); + const int should_skip = skip_this_line(self, self->file_lines); if (should_skip == -1) { goto parsingerror; @@ -894,7 +854,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { // normal character - fall through // to handle as START_FIELD self->state = START_FIELD; - + } case START_FIELD: // expecting field if (IS_TERMINATOR(c)) { @@ -1111,7 +1071,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { } static int parser_handle_eof(parser_t *self) { - int64_t bufsize = 100; + const size_t bufsize = 100; TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) @@ -1155,9 +1115,6 @@ static int parser_handle_eof(parser_t *self) { } int parser_consume_rows(parser_t *self, size_t nrows) { - int64_t offset, word_deletions; - uint64_t char_count, i; - if (nrows > self->lines) { nrows = self->lines; } @@ -1167,15 +1124,15 @@ int parser_consume_rows(parser_t *self, size_t nrows) { return 0; /* cannot guarantee that nrows + 1 has been observed */ - word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; - if (word_deletions >= 1) { - char_count = (self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1); - } else { - /* if word_deletions == 0 (i.e. this case) then char_count must - * be 0 too, as no data needs to be skipped */ - char_count = 0; - } + const int64_t word_deletions = + self->line_start[nrows - 1] + self->line_fields[nrows - 1]; + + /* if word_deletions == 0 (i.e. this case) then char_count must + * be 0 too, as no data needs to be skipped */ + const int64_t char_count = word_deletions >= 1 + ? (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1) + : 0; TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); @@ -1191,7 +1148,8 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* move token metadata */ // Note: We should always have words_len < word_deletions, so this // subtraction will remain appropriately-typed. - for (i = 0; i < self->words_len - word_deletions; ++i) { + int64_t offset; + for (uint64_t i = 0; i < self->words_len - word_deletions; ++i) { offset = i + word_deletions; self->words[i] = self->words[offset] - char_count; @@ -1206,7 +1164,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* move line metadata */ // Note: We should always have self->lines - nrows + 1 >= 0, so this // subtraction will remain appropriately-typed. - for (i = 0; i < self->lines - nrows + 1; ++i) { + for (uint64_t i = 0; i < self->lines - nrows + 1; ++i) { offset = i + nrows; self->line_start[i] = self->line_start[offset] - word_deletions; self->line_fields[i] = self->line_fields[offset]; @@ -1227,10 +1185,6 @@ int parser_trim_buffers(parser_t *self) { /* Free memory */ - size_t new_cap; - void *newptr; - - uint64_t i; /** * Before we free up space and trim, we should @@ -1246,7 +1200,7 @@ int parser_trim_buffers(parser_t *self) { } /* trim words, word_starts */ - new_cap = _next_pow2(self->words_len) + 1; + size_t new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); self->words = realloc(self->words, new_cap * sizeof(char *)); @@ -1268,7 +1222,7 @@ int parser_trim_buffers(parser_t *self) { if (new_cap < self->stream_cap) { TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling " "realloc\n")); - newptr = realloc(self->stream, new_cap); + void *newptr = realloc(self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -1280,7 +1234,7 @@ int parser_trim_buffers(parser_t *self) { if (self->stream != newptr) { self->pword_start = (char *)newptr + self->word_start; - for (i = 0; i < self->words_len; ++i) { + for (uint64_t i = 0; i < self->words_len; ++i) { self->words[i] = (char *)newptr + self->word_starts[i]; } } @@ -1294,7 +1248,7 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); + void *newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -1317,10 +1271,10 @@ int parser_trim_buffers(parser_t *self) { all : tokenize all the data vs. certain number of rows */ -int _tokenize_helper(parser_t *self, size_t nrows, int all, - const char *encoding_errors) { +static int _tokenize_helper(parser_t *self, size_t nrows, int all, + const char *encoding_errors) { int status = 0; - uint64_t start_lines = self->lines; + const uint64_t start_lines = self->lines; if (self->state == FINISHED) { return 0; @@ -1367,13 +1321,11 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all, } int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) { - int status = _tokenize_helper(self, nrows, 0, encoding_errors); - return status; + return _tokenize_helper(self, nrows, 0, encoding_errors); } int tokenize_all_rows(parser_t *self, const char *encoding_errors) { - int status = _tokenize_helper(self, -1, 1, encoding_errors); - return status; + return _tokenize_helper(self, -1, 1, encoding_errors); } /* @@ -1449,22 +1401,9 @@ int to_boolean(const char *item, uint8_t *val) { // * Add tsep argument for thousands separator // -// pessimistic but quick assessment, -// assuming that each decimal digit requires 4 bits to store -const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; - double xstrtod(const char *str, char **endptr, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { - double number; - unsigned int i_number = 0; - int exponent; - int negative; - char *p = (char *)str; - double p10; - int n; - int num_digits; - int num_decimals; - + const char *p = str; if (maybe_int != NULL) *maybe_int = 1; // Skip leading whitespace. @@ -1472,7 +1411,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, p++; // Handle optional sign. - negative = 0; + int negative = 0; switch (*p) { case '-': negative = 1; // Fall through to increment position. @@ -1480,11 +1419,17 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, p++; } - exponent = 0; - num_digits = 0; - num_decimals = 0; + int exponent = 0; + int num_digits = 0; + int num_decimals = 0; + + // pessimistic but quick assessment, + // assuming that each decimal digit requires 4 bits to store + // TODO: C23 has UINT64_WIDTH macro that can be used at compile time + const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; // Process string of digits. + unsigned int i_number = 0; while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) { i_number = i_number * 10 + (*p - '0'); p++; @@ -1492,7 +1437,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, p += (tsep != '\0' && *p == tsep); } - number = i_number; + double number = i_number; if (num_digits > max_int_decimal_digits) { // process what's left as double @@ -1546,7 +1491,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits. num_digits = 0; - n = 0; + int n = 0; while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; @@ -1569,8 +1514,8 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } // Scale the result. - p10 = 10.; - n = exponent; + double p10 = 10.; + int n = exponent; if (n < 0) n = -n; while (n) { @@ -1595,21 +1540,15 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } if (endptr) - *endptr = p; + *endptr = (char *)p; return number; } double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { - double number; - int exponent; - int negative; - char *p = (char *)str; - int num_digits; - int num_decimals; - int max_digits = 17; - int n; + const char *p = str; + const int max_digits = 17; if (maybe_int != NULL) *maybe_int = 1; @@ -1652,7 +1591,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, p++; // Handle optional sign. - negative = 0; + int negative = 0; switch (*p) { case '-': negative = 1; // Fall through to increment position. @@ -1660,10 +1599,10 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, p++; } - number = 0.; - exponent = 0; - num_digits = 0; - num_decimals = 0; + double number = 0.; + int exponent = 0; + int num_digits = 0; + int num_decimals = 0; // Process string of digits. while (isdigit_ascii(*p)) { @@ -1723,7 +1662,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits. num_digits = 0; - n = 0; + int n = 0; while (num_digits < max_digits && isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; @@ -1767,7 +1706,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } if (endptr) - *endptr = p; + *endptr = (char *)p; return number; } @@ -1777,10 +1716,10 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, with a call to `free`. */ -char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, - char tsep) { +static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, + char tsep) { const char *p = s; - size_t length = strlen(s); + const size_t length = strlen(s); char *s_copy = malloc(length + 1); char *dst = s_copy; // Skip leading whitespace. @@ -1830,10 +1769,9 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls - PyGILState_STATE gstate; - gstate = PyGILState_Ensure(); + PyGILState_STATE gstate = PyGILState_Ensure(); char *endpc; - double r = PyOS_string_to_double(pc, &endpc, 0); + const double r = PyOS_string_to_double(pc, &endpc, 0); // PyOS_string_to_double needs to consume the whole string if (endpc == pc + strlen(pc)) { if (q != NULL) { @@ -1882,20 +1820,15 @@ int uint64_conflict(uint_state *self) { int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { const char *p = p_item; - int isneg = 0; - int64_t number = 0; - int d; - // Skip leading spaces. while (isspace_ascii(*p)) { ++p; } // Handle sign. - if (*p == '-') { - isneg = 1; - ++p; - } else if (*p == '+') { + const bool isneg = *p == '-' ? true : false; + // Handle sign. + if (isneg || (*p == '+')) { p++; } @@ -1906,6 +1839,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, return 0; } + int64_t number = 0; if (isneg) { // If number is greater than pre_min, at least one more digit // can be processed without overflowing. @@ -1913,7 +1847,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int64_t pre_min = int_min / 10; // Process the digits. - d = *p; + char d = *p; if (tsep != '\0') { while (1) { if (d == tsep) { @@ -1950,7 +1884,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int dig_pre_max = int_max % 10; // Process the digits. - d = *p; + char d = *p; if (tsep != '\0') { while (1) { if (d == tsep) { @@ -2002,11 +1936,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { const char *p = p_item; - uint64_t pre_max = uint_max / 10; - int dig_pre_max = uint_max % 10; - uint64_t number = 0; - int d; - // Skip leading spaces. while (isspace_ascii(*p)) { ++p; @@ -2032,7 +1961,10 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, // can be processed without overflowing. // // Process the digits. - d = *p; + uint64_t number = 0; + const uint64_t pre_max = uint_max / 10; + const uint64_t dig_pre_max = uint_max % 10; + char d = *p; if (tsep != '\0') { while (1) { if (d == tsep) {