From 96308eb9ef6c01aa3e1fe9ccce20656806fd5178 Mon Sep 17 00:00:00 2001 From: Graham Inggs Date: Sun, 22 Apr 2018 14:08:35 +0200 Subject: [PATCH] BUG: Switch more size_t references to int64_t (#20785) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/src/parser/tokenizer.c | 34 ++++++++++++++--------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index bcc442189bf11..71ac8712eea0a 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1140,6 +1140,7 @@ I/O - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`) - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`) - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`) +- Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`) - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`) - Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`) - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 6e8c220eab6b8..25eede6c286dc 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -69,9 +69,9 @@ static void free_if_not_null(void **ptr) { */ -static void *grow_buffer(void *buffer, size_t length, size_t *capacity, - size_t space, size_t elsize, int *error) { - size_t cap = *capacity; +static void *grow_buffer(void *buffer, int64_t length, int64_t *capacity, + int64_t space, int64_t elsize, int *error) { + int64_t cap = *capacity; void *newbuffer = buffer; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? @@ -169,7 +169,7 @@ int parser_cleanup(parser_t *self) { } int parser_init(parser_t *self) { - size_t sz; + int64_t sz; /* Initialize data buffers @@ -353,7 +353,7 @@ static int push_char(parser_t *self, char c) { ("push_char: ERROR!!! self->stream_len(%d) >= " "self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) - size_t bufsize = 100; + int64_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); @@ -370,7 +370,7 @@ int P_INLINE end_field(parser_t *self) { ("end_field: ERROR!!! self->words_len(%zu) >= " "self->words_cap(%zu)\n", self->words_len, self->words_cap)) - size_t bufsize = 100; + int64_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); @@ -402,8 +402,8 @@ int P_INLINE end_field(parser_t *self) { } static void append_warning(parser_t *self, const char *msg) { - size_t ex_length; - size_t length = strlen(msg); + int64_t ex_length; + int64_t length = strlen(msg); void *newptr; if (self->warn_msg == NULL) { @@ -423,7 +423,7 @@ static int end_line(parser_t *self) { char *msg; int64_t fields; int ex_fields = self->expected_fields; - size_t bufsize = 100; // for error or warning messages + int64_t bufsize = 100; // for error or warning messages fields = self->line_fields[self->lines]; @@ -495,7 +495,7 @@ static int end_line(parser_t *self) { fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { - size_t bufsize = 100; + int64_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; @@ -516,7 +516,7 @@ static int end_line(parser_t *self) { TRACE(( "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) - size_t bufsize = 100; + int64_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - " @@ -577,7 +577,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { self->datalen = bytes_read; if (status != REACHED_EOF && self->data == NULL) { - size_t bufsize = 200; + int64_t bufsize = 200; self->error_msg = (char *)malloc(bufsize); if (status == CALLING_READ_FAILED) { @@ -608,7 +608,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { if (slen >= self->stream_cap) { \ TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ self->stream_cap)) \ - size_t bufsize = 100; \ + int64_t bufsize = 100; \ self->error_msg = (char *)malloc(bufsize); \ snprintf(self->error_msg, bufsize, \ "Buffer overflow caught - possible malformed input file.\n");\ @@ -729,7 +729,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) { char *buf = self->data + self->datapos; if (make_stream_space(self, self->datalen - self->datapos) < 0) { - size_t bufsize = 100; + int64_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; @@ -1036,7 +1036,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) { PUSH_CHAR(c); self->state = IN_FIELD; } else { - size_t bufsize = 100; + int64_t bufsize = 100; self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, "delimiter expected after quote in quote"); @@ -1132,7 +1132,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) { } static int parser_handle_eof(parser_t *self) { - size_t bufsize = 100; + int64_t bufsize = 100; TRACE( ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) @@ -1177,7 +1177,7 @@ static int parser_handle_eof(parser_t *self) { } int parser_consume_rows(parser_t *self, size_t nrows) { - size_t i, offset, word_deletions, char_count; + int64_t i, offset, word_deletions, char_count; if (nrows > self->lines) { nrows = self->lines;