From 96308eb9ef6c01aa3e1fe9ccce20656806fd5178 Mon Sep 17 00:00:00 2001
From: Graham Inggs <ginggs@debian.org>
Date: Sun, 22 Apr 2018 14:08:35 +0200
Subject: [PATCH] BUG: Switch more size_t references to int64_t (#20785)

---
 doc/source/whatsnew/v0.23.0.txt     |  1 +
 pandas/_libs/src/parser/tokenizer.c | 34 ++++++++++++++---------------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index bcc442189bf11..71ac8712eea0a 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1140,6 +1140,7 @@ I/O
 - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`)
 - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
 - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`)
+- Bug in :func:`read_csv` causing heap corruption on 32-bit, big-endian architectures (:issue:`20785`)
 - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
 - Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
 - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`)
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 6e8c220eab6b8..25eede6c286dc 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -69,9 +69,9 @@ static void free_if_not_null(void **ptr) {
 
 */
 
-static void *grow_buffer(void *buffer, size_t length, size_t *capacity,
-                         size_t space, size_t elsize, int *error) {
-    size_t cap = *capacity;
+static void *grow_buffer(void *buffer, int64_t length, int64_t *capacity,
+                         int64_t space, int64_t elsize, int *error) {
+    int64_t cap = *capacity;
     void *newbuffer = buffer;
 
     // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
@@ -169,7 +169,7 @@ int parser_cleanup(parser_t *self) {
 }
 
 int parser_init(parser_t *self) {
-    size_t sz;
+    int64_t sz;
 
     /*
       Initialize data buffers
@@ -353,7 +353,7 @@ static int push_char(parser_t *self, char c) {
             ("push_char: ERROR!!! self->stream_len(%d) >= "
              "self->stream_cap(%d)\n",
              self->stream_len, self->stream_cap))
-        size_t bufsize = 100;
+        int64_t bufsize = 100;
         self->error_msg = (char *)malloc(bufsize);
         snprintf(self->error_msg, bufsize,
                  "Buffer overflow caught - possible malformed input file.\n");
@@ -370,7 +370,7 @@ int P_INLINE end_field(parser_t *self) {
             ("end_field: ERROR!!! self->words_len(%zu) >= "
              "self->words_cap(%zu)\n",
              self->words_len, self->words_cap))
-        size_t bufsize = 100;
+        int64_t bufsize = 100;
         self->error_msg = (char *)malloc(bufsize);
         snprintf(self->error_msg, bufsize,
                  "Buffer overflow caught - possible malformed input file.\n");
@@ -402,8 +402,8 @@ int P_INLINE end_field(parser_t *self) {
 }
 
 static void append_warning(parser_t *self, const char *msg) {
-    size_t ex_length;
-    size_t length = strlen(msg);
+    int64_t ex_length;
+    int64_t length = strlen(msg);
     void *newptr;
 
     if (self->warn_msg == NULL) {
@@ -423,7 +423,7 @@ static int end_line(parser_t *self) {
     char *msg;
     int64_t fields;
     int ex_fields = self->expected_fields;
-    size_t bufsize = 100;  // for error or warning messages
+    int64_t bufsize = 100;  // for error or warning messages
 
     fields = self->line_fields[self->lines];
 
@@ -495,7 +495,7 @@ static int end_line(parser_t *self) {
                 fields < ex_fields) {
             // might overrun the buffer when closing fields
             if (make_stream_space(self, ex_fields - fields) < 0) {
-                size_t bufsize = 100;
+                int64_t bufsize = 100;
                 self->error_msg = (char *)malloc(bufsize);
                 snprintf(self->error_msg, bufsize, "out of memory");
                 return -1;
@@ -516,7 +516,7 @@ static int end_line(parser_t *self) {
             TRACE((
                 "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n",
                 self->lines, self->lines_cap))
-            size_t bufsize = 100;
+            int64_t bufsize = 100;
             self->error_msg = (char *)malloc(bufsize);
             snprintf(self->error_msg, bufsize,
                      "Buffer overflow caught - "
@@ -577,7 +577,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     self->datalen = bytes_read;
 
     if (status != REACHED_EOF && self->data == NULL) {
-        size_t bufsize = 200;
+        int64_t bufsize = 200;
         self->error_msg = (char *)malloc(bufsize);
 
         if (status == CALLING_READ_FAILED) {
@@ -608,7 +608,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     if (slen >= self->stream_cap) {                                           \
         TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen,      \
                self->stream_cap))                                             \
-        size_t bufsize = 100;                                                 \
+        int64_t bufsize = 100;                                                \
         self->error_msg = (char *)malloc(bufsize);                            \
         snprintf(self->error_msg, bufsize,                                    \
                  "Buffer overflow caught - possible malformed input file.\n");\
@@ -729,7 +729,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
     char *buf = self->data + self->datapos;
 
     if (make_stream_space(self, self->datalen - self->datapos) < 0) {
-        size_t bufsize = 100;
+        int64_t bufsize = 100;
         self->error_msg = (char *)malloc(bufsize);
         snprintf(self->error_msg, bufsize, "out of memory");
         return -1;
@@ -1036,7 +1036,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
                     PUSH_CHAR(c);
                     self->state = IN_FIELD;
                 } else {
-                    size_t bufsize = 100;
+                    int64_t bufsize = 100;
                     self->error_msg = (char *)malloc(bufsize);
                     snprintf(self->error_msg, bufsize,
                             "delimiter expected after quote in quote");
@@ -1132,7 +1132,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
 }
 
 static int parser_handle_eof(parser_t *self) {
-    size_t bufsize = 100;
+    int64_t bufsize = 100;
 
     TRACE(
         ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
@@ -1177,7 +1177,7 @@ static int parser_handle_eof(parser_t *self) {
 }
 
 int parser_consume_rows(parser_t *self, size_t nrows) {
-    size_t i, offset, word_deletions, char_count;
+    int64_t i, offset, word_deletions, char_count;
 
     if (nrows > self->lines) {
         nrows = self->lines;