Skip to content

Commit 9116930

Browse files
committed
Merge remote-tracking branch 'upstream/master' into ea-repr
2 parents 0f4083e + 011b79f commit 9116930

File tree

12 files changed

+227
-28
lines changed

12 files changed

+227
-28
lines changed

doc/source/io.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2854,6 +2854,11 @@ It is often the case that users will insert columns to do temporary computations
28542854
in Excel and you may not want to read in those columns. ``read_excel`` takes
28552855
a ``usecols`` keyword to allow you to specify a subset of columns to parse.
28562856

2857+
.. deprecated:: 0.24.0
2858+
2859+
Passing in an integer for ``usecols`` has been deprecated. Please pass in a list
2860+
of ints from 0 to ``usecols`` inclusive instead.
2861+
28572862
If ``usecols`` is an integer, then it is assumed to indicate the last column
28582863
to be parsed.
28592864

doc/source/whatsnew/v0.24.0.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -972,6 +972,7 @@ Deprecations
972972
- Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of
973973
`use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`)
974974
- :meth:`ExtensionArray._formatting_values` is deprecated. Use `ExtensionArray._formatter` instead. (:issue:`23601`)
975+
- :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`)
975976
- Constructing a :class:`TimedeltaIndex` from data with ``datetime64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23539`)
976977

977978
.. _whatsnew_0240.deprecations.datetimelike_int_ops:
@@ -1301,6 +1302,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
13011302
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
13021303
- :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
13031304
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
1305+
- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`)
13041306
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
13051307
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
13061308
- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)

pandas/_libs/parsers.pyx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ cdef extern from "parser/tokenizer.h":
132132
int64_t *word_starts # where we are in the stream
133133
int64_t words_len
134134
int64_t words_cap
135+
int64_t max_words_cap # maximum word cap encountered
135136

136137
char *pword_start # pointer to stream start of current field
137138
int64_t word_start # position start of current field

pandas/_libs/src/parser/tokenizer.c

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ int parser_init(parser_t *self) {
197197
sz = sz ? sz : 1;
198198
self->words = (char **)malloc(sz * sizeof(char *));
199199
self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t));
200+
self->max_words_cap = sz;
200201
self->words_cap = sz;
201202
self->words_len = 0;
202203

@@ -247,7 +248,7 @@ void parser_del(parser_t *self) {
247248
}
248249

249250
static int make_stream_space(parser_t *self, size_t nbytes) {
250-
int64_t i, cap;
251+
int64_t i, cap, length;
251252
int status;
252253
void *orig_ptr, *newptr;
253254

@@ -287,8 +288,23 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
287288
*/
288289

289290
cap = self->words_cap;
291+
292+
/**
293+
* If we are reading in chunks, we need to be aware of the maximum number
294+
* of words we have seen in previous chunks (self->max_words_cap), so
295+
* that way, we can properly allocate when reading subsequent ones.
296+
*
297+
* Otherwise, we risk a buffer overflow if we mistakenly under-allocate
298+
* just because a recent chunk did not have as many words.
299+
*/
300+
if (self->words_len + nbytes < self->max_words_cap) {
301+
length = self->max_words_cap - nbytes;
302+
} else {
303+
length = self->words_len;
304+
}
305+
290306
self->words =
291-
(char **)grow_buffer((void *)self->words, self->words_len,
307+
(char **)grow_buffer((void *)self->words, length,
292308
(int64_t*)&self->words_cap, nbytes,
293309
sizeof(char *), &status);
294310
TRACE(
@@ -1241,6 +1257,19 @@ int parser_trim_buffers(parser_t *self) {
12411257

12421258
int64_t i;
12431259

1260+
/**
1261+
* Before we free up space and trim, we should
1262+
* save how many words we saw when parsing, if
1263+
* it exceeds the maximum number we saw before.
1264+
*
1265+
* This is important for when we read in chunks,
1266+
* so that we can inform subsequent chunk parsing
1267+
* as to how many words we could possibly see.
1268+
*/
1269+
if (self->words_cap > self->max_words_cap) {
1270+
self->max_words_cap = self->words_cap;
1271+
}
1272+
12441273
/* trim words, word_starts */
12451274
new_cap = _next_pow2(self->words_len) + 1;
12461275
if (new_cap < self->words_cap) {

pandas/_libs/src/parser/tokenizer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ typedef struct parser_t {
142142
int64_t *word_starts; // where we are in the stream
143143
int64_t words_len;
144144
int64_t words_cap;
145+
int64_t max_words_cap; // maximum word cap encountered
145146

146147
char *pword_start; // pointer to stream start of current field
147148
int64_t word_start; // position start of current field

pandas/core/arrays/sparse.py

Lines changed: 91 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,83 @@ def is_dtype(cls, dtype):
284284
return True
285285
return isinstance(dtype, np.dtype) or dtype == 'Sparse'
286286

287+
def update_dtype(self, dtype):
288+
"""Convert the SparseDtype to a new dtype.
289+
290+
This takes care of converting the ``fill_value``.
291+
292+
Parameters
293+
----------
294+
dtype : Union[str, numpy.dtype, SparseDtype]
295+
The new dtype to use.
296+
297+
* For a SparseDtype, it is simply returned
298+
* For a NumPy dtype (or str), the current fill value
299+
is converted to the new dtype, and a SparseDtype
300+
with `dtype` and the new fill value is returned.
301+
302+
Returns
303+
-------
304+
SparseDtype
305+
A new SparseDtype with the correct `dtype` and fill value
306+
for that `dtype`.
307+
308+
Raises
309+
------
310+
ValueError
311+
When the current fill value cannot be converted to the
312+
new `dtype` (e.g. trying to convert ``np.nan`` to an
313+
integer dtype).
314+
315+
316+
Examples
317+
--------
318+
>>> SparseDtype(int, 0).update_dtype(float)
319+
Sparse[float64, 0.0]
320+
321+
>>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
322+
Sparse[float64, nan]
323+
"""
324+
cls = type(self)
325+
dtype = pandas_dtype(dtype)
326+
327+
if not isinstance(dtype, cls):
328+
fill_value = astype_nansafe(np.array(self.fill_value),
329+
dtype).item()
330+
dtype = cls(dtype, fill_value=fill_value)
331+
332+
return dtype
333+
334+
@property
335+
def _subtype_with_str(self):
336+
"""
337+
Whether the SparseDtype's subtype should be considered ``str``.
338+
339+
Typically, pandas will store string data in an object-dtype array.
340+
When converting values to a dtype, e.g. in ``.astype``, we need to
341+
be more specific, we need the actual underlying type.
342+
343+
Returns
344+
-------
345+
346+
>>> SparseDtype(int, 1)._subtype_with_str
347+
dtype('int64')
348+
349+
>>> SparseDtype(object, 1)._subtype_with_str
350+
dtype('O')
351+
352+
>>> dtype = SparseDtype(str, '')
353+
>>> dtype.subtype
354+
dtype('O')
355+
356+
>>> dtype._subtype_with_str
357+
str
358+
"""
359+
if isinstance(self.fill_value, compat.string_types):
360+
return type(self.fill_value)
361+
return self.subtype
362+
363+
287364
# ----------------------------------------------------------------------------
288365
# Array
289366

@@ -614,7 +691,7 @@ def __array__(self, dtype=None, copy=True):
614691
# Can't put pd.NaT in a datetime64[ns]
615692
fill_value = np.datetime64('NaT')
616693
try:
617-
dtype = np.result_type(self.sp_values.dtype, fill_value)
694+
dtype = np.result_type(self.sp_values.dtype, type(fill_value))
618695
except TypeError:
619696
dtype = object
620697

@@ -996,7 +1073,7 @@ def _take_with_fill(self, indices, fill_value=None):
9961073
if len(self) == 0:
9971074
# Empty... Allow taking only if all empty
9981075
if (indices == -1).all():
999-
dtype = np.result_type(self.sp_values, fill_value)
1076+
dtype = np.result_type(self.sp_values, type(fill_value))
10001077
taken = np.empty_like(indices, dtype=dtype)
10011078
taken.fill(fill_value)
10021079
return taken
@@ -1009,7 +1086,7 @@ def _take_with_fill(self, indices, fill_value=None):
10091086
if self.sp_index.npoints == 0:
10101087
# Avoid taking from the empty self.sp_values
10111088
taken = np.full(sp_indexer.shape, fill_value=fill_value,
1012-
dtype=np.result_type(fill_value))
1089+
dtype=np.result_type(type(fill_value)))
10131090
else:
10141091
taken = self.sp_values.take(sp_indexer)
10151092

@@ -1030,12 +1107,13 @@ def _take_with_fill(self, indices, fill_value=None):
10301107
result_type = taken.dtype
10311108

10321109
if m0.any():
1033-
result_type = np.result_type(result_type, self.fill_value)
1110+
result_type = np.result_type(result_type,
1111+
type(self.fill_value))
10341112
taken = taken.astype(result_type)
10351113
taken[old_fill_indices] = self.fill_value
10361114

10371115
if m1.any():
1038-
result_type = np.result_type(result_type, fill_value)
1116+
result_type = np.result_type(result_type, type(fill_value))
10391117
taken = taken.astype(result_type)
10401118
taken[new_fill_indices] = fill_value
10411119

@@ -1061,7 +1139,7 @@ def _take_without_fill(self, indices):
10611139
# edge case in take...
10621140
# I think just return
10631141
out = np.full(indices.shape, self.fill_value,
1064-
dtype=np.result_type(self.fill_value))
1142+
dtype=np.result_type(type(self.fill_value)))
10651143
arr, sp_index, fill_value = make_sparse(out,
10661144
fill_value=self.fill_value)
10671145
return type(self)(arr, sparse_index=sp_index,
@@ -1073,7 +1151,7 @@ def _take_without_fill(self, indices):
10731151

10741152
if fillable.any():
10751153
# TODO: may need to coerce array to fill value
1076-
result_type = np.result_type(taken, self.fill_value)
1154+
result_type = np.result_type(taken, type(self.fill_value))
10771155
taken = taken.astype(result_type)
10781156
taken[fillable] = self.fill_value
10791157

@@ -1093,7 +1171,9 @@ def _concat_same_type(cls, to_concat):
10931171

10941172
fill_value = fill_values[0]
10951173

1096-
if len(set(fill_values)) > 1:
1174+
# np.nan isn't a singleton, so we may end up with multiple
1175+
# NaNs here, so we ignore the all-NA case too.
1176+
if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
10971177
warnings.warn("Concatenating sparse arrays with multiple fill "
10981178
"values: '{}'. Picking the first and "
10991179
"converting the rest.".format(fill_values),
@@ -1212,13 +1292,10 @@ def astype(self, dtype=None, copy=True):
12121292
IntIndex
12131293
Indices: array([2, 3], dtype=int32)
12141294
"""
1215-
dtype = pandas_dtype(dtype)
1216-
1217-
if not isinstance(dtype, SparseDtype):
1218-
dtype = SparseDtype(dtype, fill_value=self.fill_value)
1219-
1295+
dtype = self.dtype.update_dtype(dtype)
1296+
subtype = dtype._subtype_with_str
12201297
sp_values = astype_nansafe(self.sp_values,
1221-
dtype.subtype,
1298+
subtype,
12221299
copy=copy)
12231300
if sp_values is self.sp_values and copy:
12241301
sp_values = sp_values.copy()

pandas/io/excel.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@
9595
usecols : int, str, list-like, or callable default None
9696
* If None, then parse all columns,
9797
* If int, then indicates last column to be parsed
98+
99+
.. deprecated:: 0.24.0
100+
Pass in a list of ints instead from 0 to `usecols` inclusive.
101+
98102
* If string, then indicates comma separated list of Excel column letters
99103
and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
100104
both sides.
@@ -778,6 +782,10 @@ def _maybe_convert_usecols(usecols):
778782
return usecols
779783

780784
if is_integer(usecols):
785+
warnings.warn(("Passing in an integer for `usecols` has been "
786+
"deprecated. Please pass in a list of ints from "
787+
"0 to `usecols` inclusive instead."),
788+
FutureWarning, stacklevel=2)
781789
return lrange(usecols + 1)
782790

783791
if isinstance(usecols, compat.string_types):

pandas/io/stata.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,8 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
461461

462462
excessive_string_length_error = """
463463
Fixed width strings in Stata .dta files are limited to 244 (or fewer)
464-
characters. Column '%s' does not satisfy this restriction.
464+
characters. Column '%s' does not satisfy this restriction. Use the
465+
'version=117' parameter to write the newer (Stata 13 and later) format.
465466
"""
466467

467468

pandas/tests/arrays/sparse/test_array.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,34 @@ def test_astype_all(self, any_real_dtype):
477477
tm.assert_numpy_array_equal(np.asarray(res.values),
478478
vals.astype(typ))
479479

480+
@pytest.mark.parametrize('array, dtype, expected', [
481+
(SparseArray([0, 1]), 'float',
482+
SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))),
483+
(SparseArray([0, 1]), bool, SparseArray([False, True])),
484+
(SparseArray([0, 1], fill_value=1), bool,
485+
SparseArray([False, True], dtype=SparseDtype(bool, True))),
486+
pytest.param(
487+
SparseArray([0, 1]), 'datetime64[ns]',
488+
SparseArray(np.array([0, 1], dtype='datetime64[ns]'),
489+
dtype=SparseDtype('datetime64[ns]',
490+
pd.Timestamp('1970'))),
491+
marks=[pytest.mark.xfail(reason="NumPy-7619", strict=True)],
492+
),
493+
(SparseArray([0, 1, 10]), str,
494+
SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))),
495+
(SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])),
496+
(SparseArray([0, 1, 0]), object,
497+
SparseArray([0, 1, 0], dtype=SparseDtype(object, 0))),
498+
])
499+
def test_astype_more(self, array, dtype, expected):
500+
result = array.astype(dtype)
501+
tm.assert_sp_array_equal(result, expected)
502+
503+
def test_astype_nan_raises(self):
504+
arr = SparseArray([1.0, np.nan])
505+
with pytest.raises(ValueError, match='Cannot convert non-finite'):
506+
arr.astype(int)
507+
480508
def test_set_fill_value(self):
481509
arr = SparseArray([1., np.nan, 2.], fill_value=np.nan)
482510
arr.fill_value = 2

pandas/tests/arrays/sparse/test_dtype.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,3 +139,23 @@ def test_parse_subtype(string, expected):
139139
def test_construct_from_string_fill_value_raises(string):
140140
with pytest.raises(TypeError, match='fill_value in the string is not'):
141141
SparseDtype.construct_from_string(string)
142+
143+
144+
@pytest.mark.parametrize('original, dtype, expected', [
145+
(SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
146+
(SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
147+
(SparseDtype(int, 1), str, SparseDtype(object, '1')),
148+
(SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
149+
])
150+
def test_update_dtype(original, dtype, expected):
151+
result = original.update_dtype(dtype)
152+
assert result == expected
153+
154+
155+
@pytest.mark.parametrize("original, dtype", [
156+
(SparseDtype(float, np.nan), int),
157+
(SparseDtype(str, 'abc'), int),
158+
])
159+
def test_update_dtype_raises(original, dtype):
160+
with pytest.raises(ValueError):
161+
original.update_dtype(dtype)

pandas/tests/io/parser/common.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,22 @@ def test_read_chunksize_generated_index(self):
458458

459459
tm.assert_frame_equal(pd.concat(reader), df)
460460

461+
def test_read_chunksize_jagged_names(self):
462+
# see gh-23509
463+
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
464+
reader = self.read_csv(StringIO(data), names=range(10), chunksize=4)
465+
466+
expected = DataFrame()
467+
468+
for i in range(10):
469+
if i == 0:
470+
expected[i] = [0] * 8
471+
else:
472+
expected[i] = [np.nan] * 7 + [0]
473+
474+
result = pd.concat(reader)
475+
tm.assert_frame_equal(result, expected)
476+
461477
def test_read_text_list(self):
462478
data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
463479
as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',

0 commit comments

Comments
 (0)