Skip to content

Commit 77b4139

Browse files
committed
move io.c from using unbuffered fread()s to read()s.
pandas already buffers reads coming from io.c itself, so it previously used setbuf() to disable buffering inside fread(). however, certain implementations of unbuffered stdio reads are sub-optimal. for example, fread() on solaris ends up doing a read() for each individual byte from the underlying file descriptor, which turns out to be very slow. instead, this code now open()s a file descriptor and read()s directly into the buffer that pandas has already allocated. this is effectively what other libcs (e.g., glibc) do underneath an unbuffered fread() anyway, but this is more explicit. while here, this tweaks the mmap backend to use open() too, and also properly checks for mmap failure by comparing its result to MAP_FAILED instead of NULL.
1 parent cd1031f commit 77b4139

File tree

2 files changed

+88
-79
lines changed

2 files changed

+88
-79
lines changed

pandas/_libs/src/parser/io.c

Lines changed: 82 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -9,33 +9,40 @@ The full license is in the LICENSE file, distributed with this software.
99

1010
#include "io.h"
1111

12+
#include <sys/types.h>
13+
#include <sys/stat.h>
14+
#include <fcntl.h>
15+
1216
/*
1317
On-disk FILE, uncompressed
1418
*/
1519

1620
void *new_file_source(char *fname, size_t buffer_size) {
1721
file_source *fs = (file_source *)malloc(sizeof(file_source));
18-
fs->fp = fopen(fname, "rb");
19-
20-
if (fs->fp == NULL) {
21-
free(fs);
22+
if (fs == NULL) {
2223
return NULL;
2324
}
24-
setbuf(fs->fp, NULL);
2525

26-
fs->initial_file_pos = ftell(fs->fp);
26+
fs->fd = open(fname, O_RDONLY);
27+
if (fs->fd == -1) {
28+
goto err_free;
29+
}
2730

2831
// Only allocate this heap memory if we are not memory-mapping the file
2932
fs->buffer = (char *)malloc((buffer_size + 1) * sizeof(char));
3033

3134
if (fs->buffer == NULL) {
32-
return NULL;
35+
goto err_free;
3336
}
3437

35-
memset(fs->buffer, 0, buffer_size + 1);
36-
fs->buffer[buffer_size] = '\0';
38+
memset(fs->buffer, '\0', buffer_size + 1);
39+
fs->size = buffer_size;
3740

3841
return (void *)fs;
42+
43+
err_free:
44+
free(fs);
45+
return NULL;
3946
}
4047

4148
void *new_rd_source(PyObject *obj) {
@@ -56,12 +63,12 @@ void *new_rd_source(PyObject *obj) {
5663
5764
*/
5865

59-
int del_file_source(void *fs) {
66+
int del_file_source(void *ptr) {
67+
file_source *fs = ptr;
6068
if (fs == NULL) return 0;
6169

62-
/* allocated on the heap */
63-
free(FS(fs)->buffer);
64-
fclose(FS(fs)->fp);
70+
free(fs->buffer);
71+
close(fs->fd);
6572
free(fs);
6673

6774
return 0;
@@ -83,17 +90,31 @@ int del_rd_source(void *rds) {
8390

8491
void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
8592
int *status) {
86-
file_source *src = FS(source);
93+
file_source *fs = FS(source);
94+
ssize_t rv;
8795

88-
*bytes_read = fread((void *)src->buffer, sizeof(char), nbytes, src->fp);
96+
if (nbytes > fs->size) {
97+
nbytes = fs->size;
98+
}
8999

90-
if (*bytes_read == 0) {
100+
rv = read(fs->fd, fs->buffer, nbytes);
101+
switch (rv) {
102+
case -1:
103+
*status = CALLING_READ_FAILED;
104+
*bytes_read = 0;
105+
return NULL;
106+
case 0:
91107
*status = REACHED_EOF;
92-
} else {
108+
*bytes_read = 0;
109+
return NULL;
110+
default:
93111
*status = 0;
112+
*bytes_read = rv;
113+
fs->buffer[rv] = '\0';
114+
break;
94115
}
95116

96-
return (void *)src->buffer;
117+
return (void *)fs->buffer;
97118
}
98119

99120
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
@@ -155,77 +176,81 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
155176
#include <sys/stat.h>
156177

157178
void *new_mmap(char *fname) {
158-
struct stat buf;
159-
int fd;
160179
memory_map *mm;
161-
off_t filesize;
180+
struct stat stat;
181+
size_t filesize;
162182

163183
mm = (memory_map *)malloc(sizeof(memory_map));
164-
mm->fp = fopen(fname, "rb");
165-
166-
fd = fileno(mm->fp);
167-
if (fstat(fd, &buf) == -1) {
168-
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno);
169-
return NULL;
170-
}
171-
filesize = buf.st_size; /* XXX This might be 32 bits. */
172-
173184
if (mm == NULL) {
174-
/* XXX Eventually remove this print statement. */
175185
fprintf(stderr, "new_file_buffer: malloc() failed.\n");
176-
return NULL;
186+
return (NULL);
187+
}
188+
mm->fd = open(fname, O_RDONLY);
189+
if (mm->fd == -1) {
190+
fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", fname, errno);
191+
goto err_free;
177192
}
178-
mm->size = (off_t)filesize;
179-
mm->line_number = 0;
180193

181-
mm->fileno = fd;
182-
mm->position = ftell(mm->fp);
183-
mm->last_pos = (off_t)filesize;
194+
if (fstat(mm->fd, &stat) == -1) {
195+
fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno);
196+
goto err_close;
197+
}
198+
filesize = stat.st_size; /* XXX This might be 32 bits. */
184199

185-
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
186-
if (mm->memmap == NULL) {
200+
mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0);
201+
if (mm->memmap == MAP_FAILED) {
187202
/* XXX Eventually remove this print statement. */
188203
fprintf(stderr, "new_file_buffer: mmap() failed.\n");
189-
free(mm);
190-
mm = NULL;
204+
goto err_close;
191205
}
192206

193-
return (void *)mm;
207+
mm->size = (off_t)filesize;
208+
mm->position = 0;
209+
210+
return mm;
211+
212+
err_close:
213+
close(mm->fd);
214+
err_free:
215+
free(mm);
216+
return NULL;
194217
}
195218

196-
int del_mmap(void *src) {
197-
munmap(MM(src)->memmap, MM(src)->size);
219+
int del_mmap(void *ptr) {
220+
memory_map *mm = ptr;
221+
222+
if (mm == NULL) return 0;
198223

199-
fclose(MM(src)->fp);
200-
free(src);
224+
munmap(mm->memmap, mm->size);
225+
close(mm->fd);
226+
free(mm);
201227

202228
return 0;
203229
}
204230

205231
void *buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read,
206232
int *status) {
207233
void *retval;
208-
memory_map *src = MM(source);
234+
memory_map *src = source;
235+
size_t remaining = src->size - src->position;
209236

210-
if (src->position == src->last_pos) {
237+
if (remaining == 0) {
211238
*bytes_read = 0;
212239
*status = REACHED_EOF;
213240
return NULL;
214241
}
215242

216-
retval = src->memmap + src->position;
217-
218-
if (src->position + (off_t)nbytes > src->last_pos) {
219-
// fewer than nbytes remaining
220-
*bytes_read = src->last_pos - src->position;
221-
} else {
222-
*bytes_read = nbytes;
243+
if (nbytes > remaining) {
244+
nbytes = remaining;
223245
}
224246

225-
*status = 0;
247+
retval = src->memmap + src->position;
226248

227249
/* advance position in mmap data structure */
228-
src->position += *bytes_read;
250+
src->position += nbytes;
251+
252+
*bytes_read = nbytes;
253+
*status = 0;
229254

230255
return retval;
231256
}

pandas/_libs/src/parser/io.h

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,10 @@ The full license is in the LICENSE file, distributed with this software.
1515

1616
typedef struct _file_source {
1717
/* The file being read. */
18-
FILE *fp;
18+
int fd;
1919

2020
char *buffer;
21-
22-
/* file position when the file_buffer was created. */
23-
off_t initial_file_pos;
24-
25-
/* Offset in the file of the data currently in the buffer. */
26-
off_t buffer_file_pos;
27-
28-
/* Actual number of bytes in the current buffer. (Can be less than
29-
* buffer_size.) */
30-
off_t last_pos;
21+
size_t size;
3122
} file_source;
3223

3324
#define FS(source) ((file_source *)source)
@@ -37,20 +28,13 @@ typedef struct _file_source {
3728
#endif
3829

3930
typedef struct _memory_map {
40-
FILE *fp;
31+
int fd;
4132

4233
/* Size of the file, in bytes. */
43-
off_t size;
44-
45-
/* file position when the file_buffer was created. */
46-
off_t initial_file_pos;
47-
48-
int line_number;
49-
50-
int fileno;
51-
off_t position;
52-
off_t last_pos;
5334
char *memmap;
35+
size_t size;
36+
37+
size_t position;
5438
} memory_map;
5539

5640
#define MM(src) ((memory_map *)src)

0 commit comments

Comments
 (0)