Skip to content

Commit e9bc234

Browse files
committed
REF: refactor parser IO to callback structure. tests pass
1 parent 5e5ef0e commit e9bc234

File tree

6 files changed

+541
-554
lines changed

6 files changed

+541
-554
lines changed

pandas/src/parser.pyx

Lines changed: 48 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@ from cpython cimport (PyObject, PyBytes_FromString,
1212

1313

1414
cdef extern from "Python.h":
15-
ctypedef struct FILE
16-
1715
object PyUnicode_FromString(char *v)
1816

1917
object PyUnicode_Decode(char *v, Py_ssize_t size, char *encoding,
@@ -78,9 +76,14 @@ cdef extern from "parser/parser.h":
7876
EAT_WHITESPACE
7977
FINISHED
8078

79+
ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
80+
int *status)
81+
ctypedef int (*io_cleanup)(void *src)
82+
8183
ctypedef struct parser_t:
8284
void *source
83-
char sourcetype # 'M' for mmap, 'F' for FILE, 'A' for array
85+
io_callback cb_io
86+
io_cleanup cb_cleanup
8487

8588
int chunksize # Number of bytes to prepare for each chunk
8689
char *data # pointer to data to be processed
@@ -127,8 +130,6 @@ cdef extern from "parser/parser.h":
127130
int error_bad_lines
128131
int warn_bad_lines
129132

130-
int infer_types
131-
132133
# floating point options
133134
char decimal
134135
char sci
@@ -160,10 +161,6 @@ cdef extern from "parser/parser.h":
160161

161162
void parser_set_default_options(parser_t *self)
162163

163-
int parser_file_source_init(parser_t *self, FILE* fp)
164-
int parser_mmap_init(parser_t *self, FILE* fp)
165-
int parser_rd_source_init(parser_t *self, object source)
166-
167164
int parser_consume_rows(parser_t *self, size_t nrows)
168165

169166
int parser_trim_buffers(parser_t *self)
@@ -187,7 +184,28 @@ cdef extern from "parser/parser.h":
187184
inline int to_boolean(char *item, uint8_t *val)
188185

189186

190-
DEFAULT_CHUNKSIZE = 1024 * 1024
187+
cdef extern from "parser/io.h":
188+
void *new_mmap(char *fname)
189+
190+
void *new_file_source(char *fname, size_t buffer_size)
191+
192+
void *new_rd_source(object obj)
193+
194+
int del_file_source(void *src)
195+
int del_mmap(void *src)
196+
int del_rd_source(void *src)
197+
198+
void* buffer_file_bytes(void *source, size_t nbytes,
199+
size_t *bytes_read, int *status)
200+
201+
void* buffer_rd_bytes(void *source, size_t nbytes,
202+
size_t *bytes_read, int *status)
203+
204+
void* buffer_mmap_bytes(void *source, size_t nbytes,
205+
size_t *bytes_read, int *status)
206+
207+
208+
DEFAULT_CHUNKSIZE = 256 * 1024
191209

192210
# common NA values
193211
# no longer excluding inf representations
@@ -206,12 +224,11 @@ cdef class TextReader:
206224

207225
cdef:
208226
parser_t *parser
209-
object file_handle, should_close
227+
object file_handle
210228
bint factorize, na_filter, verbose, has_usecols
211229
int parser_start
212230
list clocks
213231
char *c_encoding
214-
FILE *fp
215232

216233
cdef public:
217234
int leading_cols, table_width, skip_footer, buffer_lines
@@ -330,8 +347,6 @@ cdef class TextReader:
330347
self.parser.error_bad_lines = 0
331348
self.parser.warn_bad_lines = 0
332349

333-
self.should_close = False
334-
335350
self.delimiter = delimiter
336351
self.delim_whitespace = delim_whitespace
337352

@@ -406,10 +421,6 @@ cdef class TextReader:
406421
def __dealloc__(self):
407422
parser_free(self.parser)
408423

409-
def __del__(self):
410-
if self.should_close:
411-
fclose(self.fp)
412-
413424
def set_error_bad_lines(self, int status):
414425
self.parser.error_bad_lines = status
415426

@@ -423,31 +434,37 @@ cdef class TextReader:
423434
cdef _setup_parser_source(self, source):
424435
cdef:
425436
int status
426-
427-
self.fp = NULL
437+
void *ptr
428438

429439
if isinstance(source, basestring):
430440
if not isinstance(source, bytes):
431441
source = source.encode('utf-8')
432442

433-
self.should_close = True
434-
self.fp = fopen(source, b'rb')
435-
stdio.setbuf(self.fp, NULL)
436-
437443
if self.memory_map:
438-
status = parser_mmap_init(self.parser, self.fp)
444+
ptr = new_mmap(source)
445+
self.parser.cb_io = &buffer_mmap_bytes
446+
self.parser.cb_cleanup = &del_mmap
439447
else:
440-
status = parser_file_source_init(self.parser, self.fp)
448+
ptr = new_file_source(source, self.parser.chunksize)
449+
self.parser.cb_io = &buffer_file_bytes
450+
self.parser.cb_cleanup = &del_file_source
441451

442-
if status != 0:
452+
if ptr == NULL:
443453
raise Exception('Initializing from file failed')
454+
455+
self.parser.source = ptr
456+
444457
elif hasattr(source, 'read'):
445458
# e.g., StringIO
446459

447-
status = parser_rd_source_init(self.parser, source)
448-
if status != 0:
460+
ptr = new_rd_source(source)
461+
if ptr == NULL:
449462
raise Exception('Initializing parser from file-like '
450463
'object failed')
464+
465+
self.parser.source = ptr
466+
self.parser.cb_io = &buffer_rd_bytes
467+
self.parser.cb_cleanup = &del_rd_source
451468
else:
452469
raise Exception('Expected file path name or file-like object,'
453470
' got %s type' % type(source))
@@ -1185,14 +1202,14 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
11851202
continue
11861203

11871204
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
1188-
&error, parser.thousands);
1205+
&error, parser.thousands)
11891206
if error != 0:
11901207
return None, None
11911208
else:
11921209
for i in range(lines):
11931210
word = COLITER_NEXT(it)
11941211
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
1195-
&error, parser.thousands);
1212+
&error, parser.thousands)
11961213
if error != 0:
11971214
return None, None
11981215

0 commit comments

Comments
 (0)