diff --git a/pandas/io/tests/parser/__init__.py b/pandas/io/tests/parser/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py new file mode 100644 index 0000000000000..24c670abe8158 --- /dev/null +++ b/pandas/io/tests/parser/c_parser_only.py @@ -0,0 +1,521 @@ +# -*- coding: utf-8 -*- + +""" +Tests that apply specifically to the CParser. Unless specifically stated +as a CParser-specific issue, the goal is to eventually move as many of +these tests out of this module as soon as the Python parser can accept +further arguments when parsing. +""" + +import nose +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas import DataFrame, Series, Index, MultiIndex +from pandas import compat +from pandas.compat import StringIO, range, lrange + + +class CParserTests(object): + def test_buffer_overflow(self): + # see gh-9205: test certain malformed input files that cause + # buffer overflows in tokenizer.c + + malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer + malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer + malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer + + cperr = 'Buffer overflow caught - possible malformed input file.' + + for malf in (malfw, malfs, malfl): + try: + self.read_table(StringIO(malf)) + except Exception as err: + self.assertIn(cperr, str(err)) + + def test_buffer_rd_bytes(self): + # see gh-12098: src->buffer in the C parser can be freed twice leading + # to a segfault if a corrupt gzip file is read with 'read_csv' and the + # buffer is filled more than once before gzip throws an exception + + data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \ + '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \ + '\xA6\x4D' + '\x55' * 267 + \ + '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \ + '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO' + for i in range(100): + try: + self.read_csv(StringIO(data), + compression='gzip', + delim_whitespace=True) + except Exception: + pass + + def test_delim_whitespace_custom_terminator(self): + # See gh-12912 + data = """a b c~1 2 3~4 5 6~7 8 9""" + df = self.read_csv(StringIO(data), lineterminator='~', + delim_whitespace=True) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=['a', 'b', 'c']) + tm.assert_frame_equal(df, expected) + + def test_parse_dates_empty_string(self): + # see gh-2263 + s = StringIO("Date, test\n2012-01-01, 1\n,2") + result = self.read_csv(s, parse_dates=["Date"], na_filter=False) + self.assertTrue(result['Date'].isnull()[1]) + + def test_dtype_and_names_error(self): + # see gh-8833: passing both dtype and names + # resulting in an error reporting issue + data = """ +1.0 1 +2.0 2 +3.0 3 +""" + # base cases + result = self.read_csv(StringIO(data), sep='\s+', header=None) + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), sep='\s+', + header=None, names=['a', 'b']) + expected = DataFrame( + [[1.0, 1], [2.0, 2], [3.0, 3]], columns=['a', 'b']) + tm.assert_frame_equal(result, expected) + + # fallback casting + result = self.read_csv(StringIO( + data), sep='\s+', header=None, + names=['a', 'b'], dtype={'a': np.int32}) + expected = DataFrame([[1, 1], [2, 2], [3, 3]], + columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.int32) + tm.assert_frame_equal(result, expected) + + data = """ +1.0 1 +nan 2 +3.0 3 +""" + # 
fallback casting, but not castable + with tm.assertRaisesRegexp(ValueError, 'cannot safely convert'): + self.read_csv(StringIO(data), sep='\s+', header=None, + names=['a', 'b'], dtype={'a': np.int32}) + + def test_passing_dtype(self): + # see gh-6607 + df = DataFrame(np.random.rand(5, 2), columns=list( + 'AB'), index=['1A', '1B', '1C', '1D', '1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # see gh-3795: passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + tm.assert_series_equal(result.dtypes, Series( + {'A': 'object', 'B': 'object'})) + + # we expect all object columns, so need to + # convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result, df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'foo', 'B': 'float64'}, + index_col=0) + + # valid but we don't support it (date) + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'datetime64', 'B': 'float64'}, + index_col=0) + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'datetime64', 'B': 'float64'}, + index_col=0, parse_dates=['B']) + + # valid but we don't support it + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'timedelta64', 'B': 'float64'}, + index_col=0) + + # see gh-12048: empty frame + actual = self.read_csv(StringIO('A,B'), dtype=str) + expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) + tm.assert_frame_equal(actual, expected) + + def test_precise_conversion(self): + # see gh-8002 + tm._skip_if_32bit() + from decimal import Decimal + + normal_errors = [] + precise_errors = [] + + # test numbers between 1 and 2 + for num in np.linspace(1., 2., num=500): + # 25 decimal digits of precision + text = 'a\n{0:.25}'.format(num) + + normal_val = float(self.read_csv(StringIO(text))['a'][0]) + precise_val = float(self.read_csv( + StringIO(text), float_precision='high')['a'][0]) + roundtrip_val = float(self.read_csv( + StringIO(text), float_precision='round_trip')['a'][0]) + actual_val = Decimal(text[2:]) + + def error(val): + return abs(Decimal('{0:.100}'.format(val)) - actual_val) + + normal_errors.append(error(normal_val)) + precise_errors.append(error(precise_val)) + + # round-trip should match float() + self.assertEqual(roundtrip_val, float(text[2:])) + + self.assertTrue(sum(precise_errors) <= sum(normal_errors)) + self.assertTrue(max(precise_errors) <= max(normal_errors)) + + def test_compact_ints(self): + if compat.is_platform_windows() and not self.low_memory: + raise nose.SkipTest( + "segfaults on win-64, only when all tests are run") + + data = ('0,1,0,0\n' + '1,1,0,0\n' + '0,1,0,1') + + result = self.read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + result = self.read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + def test_compact_ints_as_recarray(self): + if compat.is_platform_windows() and self.low_memory: + raise nose.SkipTest( + "segfaults on win-64, only when all tests are run") + + data = ('0,1,0,0\n' + '1,1,0,0\n' + '0,1,0,1') + + result = self.read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + 
self.assertEqual(result.dtype, ex_dtype) + + result = self.read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + def test_pass_dtype(self): + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) + self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'object') + + def test_pass_dtype_as_recarray(self): + if compat.is_platform_windows() and self.low_memory: + raise nose.SkipTest( + "segfaults on win-64, only when all tests are run") + + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}, + as_recarray=True) + self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'S1') + + def test_empty_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) + + expected = DataFrame({'one': np.empty(0, dtype='u1'), + 'two': np.empty(0, dtype=np.object)}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_index_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), index_col=['one'], + dtype={'one': 'u1', 1: 'f'}) + + expected = DataFrame({'two': np.empty(0, dtype='f')}, + index=Index([], dtype='u1', name='one')) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_multiindex_pass_dtype(self): + data = 'one,two,three' + result = self.read_csv(StringIO(data), index_col=['one', 'two'], + dtype={'one': 'u1', 1: 'f8'}) + + exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), + np.empty(0, dtype='O')], + names=['one', 'two']) + expected = DataFrame( + {'three': np.empty(0, dtype=np.object)}, index=exp_idx) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_names(self): + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={ + 'one': 'u1', 'one.1': 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_indexes(self): + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_dup_column_pass_dtype_by_names(self): + data = 'one,one' + result = self.read_csv( + StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'}) + expected = pd.concat([Series([], name='one', dtype='u1')] * 2, axis=1) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_dup_column_pass_dtype_by_indexes(self): + # FIXME in gh-9424 + raise nose.SkipTest( + "gh-9424; known failure read_csv with duplicate columns") + + data = 'one,one' + result = self.read_csv( + StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'}) + expected = pd.concat([Series([], name='one', dtype='u1'), + Series([], name='one', dtype='f')], axis=1) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_usecols_dtypes(self): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + + result = self.read_csv(StringIO(data), usecols=(0, 1, 2), + names=('a', 'b', 
'c'), + header=None, + converters={'a': str}, + dtype={'b': int, 'c': float}, + ) + result2 = self.read_csv(StringIO(data), usecols=(0, 2), + names=('a', 'b', 'c'), + header=None, + converters={'a': str}, + dtype={'b': int, 'c': float}, + ) + self.assertTrue((result.dtypes == [object, np.int, np.float]).all()) + self.assertTrue((result2.dtypes == [object, np.float]).all()) + + def test_memory_map(self): + # it works! + self.read_csv(self.csv1, memory_map=True) + + def test_disable_bool_parsing(self): + # #2090 + + data = """A,B,C +Yes,No,Yes +No,Yes,Yes +Yes,,Yes +No,No,No""" + + result = self.read_csv(StringIO(data), dtype=object) + self.assertTrue((result.dtypes == object).all()) + + result = self.read_csv(StringIO(data), dtype=object, na_filter=False) + self.assertEqual(result['B'][2], '') + + def test_euro_decimal_format(self): + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + df2 = self.read_csv(StringIO(data), sep=';', decimal=',') + self.assertEqual(df2['Number1'].dtype, float) + self.assertEqual(df2['Number2'].dtype, float) + self.assertEqual(df2['Number3'].dtype, float) + + def test_custom_lineterminator(self): + data = 'a,b,c~1,2,3~4,5,6' + + result = self.read_csv(StringIO(data), lineterminator='~') + expected = self.read_csv(StringIO(data.replace('~', '\n'))) + + tm.assert_frame_equal(result, expected) + + def test_raise_on_passed_int_dtype_with_nas(self): + # see gh-2631 + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + self.assertRaises(ValueError, self.read_csv, StringIO(data), + sep=",", skipinitialspace=True, + dtype={'DOY': np.int64}) + + def test_na_trailing_columns(self): + data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax +2012-03-14,USD,AAPL,BUY,1000 +2012-05-12,USD,SBUX,SELL,500""" + + result = self.read_csv(StringIO(data)) + self.assertEqual(result['Date'][1], '2012-05-12') + self.assertTrue(result['UnitPrice'].isnull().all()) + + def test_parse_ragged_csv(self): + data = """1,2,3 +1,2,3,4 +1,2,3,4,5 +1,2 +1,2,3,4""" + + nice_data = """1,2,3,, +1,2,3,4, +1,2,3,4,5 +1,2,,, +1,2,3,4,""" + result = self.read_csv(StringIO(data), header=None, + names=['a', 'b', 'c', 'd', 'e']) + + expected = self.read_csv(StringIO(nice_data), header=None, + names=['a', 'b', 'c', 'd', 'e']) + + tm.assert_frame_equal(result, expected) + + # too many columns, cause segfault if not careful + data = "1,2\n3,4,5" + + result = self.read_csv(StringIO(data), header=None, + names=lrange(50)) + expected = self.read_csv(StringIO(data), header=None, + names=lrange(3)).reindex(columns=lrange(50)) + + tm.assert_frame_equal(result, expected) + + def test_tokenize_CR_with_quoting(self): + # see gh-3453 + + data = ' a,b,c\r"a,b","e,d","f,f"' + + result = self.read_csv(StringIO(data), header=None) + expected = self.read_csv(StringIO(data.replace('\r', '\n')), + header=None) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data)) + expected = self.read_csv(StringIO(data.replace('\r', '\n'))) + tm.assert_frame_equal(result, expected) + + def test_raise_on_no_columns(self): + # single newline + data = "\n" + self.assertRaises(ValueError, self.read_csv, StringIO(data)) + + # test with more than a single newline + data = "\n\n\n" + self.assertRaises(ValueError, self.read_csv, StringIO(data)) + + def test_1000_sep_with_decimal(self): + data = """A|B|C +1|2,334.01|5 +10|13|10. 
+""" + expected = DataFrame({ + 'A': [1, 10], + 'B': [2334.01, 13], + 'C': [5, 10.] + }) + + tm.assert_equal(expected.A.dtype, 'int64') + tm.assert_equal(expected.B.dtype, 'float') + tm.assert_equal(expected.C.dtype, 'float') + + df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data), sep='|', + thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + data_with_odd_sep = """A|B|C +1|2.334,01|5 +10|13|10, +""" + df = self.read_csv(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + def test_grow_boundary_at_cap(self): + # See gh-12494 + # + # Cause of error was that the C parser + # was not increasing the buffer size when + # the desired space would fill the buffer + # to capacity, which would later cause a + # buffer overflow error when checking the + # EOF terminator of the CSV stream + def test_empty_header_read(count): + s = StringIO(',' * count) + expected = DataFrame(columns=[ + 'Unnamed: {i}'.format(i=i) + for i in range(count + 1)]) + df = self.read_csv(s) + tm.assert_frame_equal(df, expected) + + for count in range(1, 101): + test_empty_header_read(count) + + def test_inf_parsing(self): + data = """\ +,A +a,inf +b,-inf +c,Inf +d,-Inf +e,INF +f,-INF +g,INf +h,-INf +i,inF +j,-inF""" + inf = float('inf') + expected = Series([inf, -inf] * 5) + + df = self.read_csv(StringIO(data), index_col=0) + tm.assert_almost_equal(df['A'].values, expected.values) + + df = self.read_csv(StringIO(data), index_col=0, na_filter=False) + tm.assert_almost_equal(df['A'].values, expected.values) diff --git a/pandas/io/tests/parser/comment.py b/pandas/io/tests/parser/comment.py new file mode 100644 index 0000000000000..07fc6a167a6c0 --- /dev/null +++ b/pandas/io/tests/parser/comment.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +""" +Tests that comments are properly handled during parsing +for all of the parsers defined in parsers.py +""" + +import numpy as np +import pandas.util.testing as tm + +from pandas import DataFrame +from pandas.compat import StringIO + + +class CommentTests(object): + + def test_comment(self): + data = """A,B,C +1,2.,4.#hello world +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + + df = self.read_table(StringIO(data), sep=',', comment='#', + na_values=['NaN']) + tm.assert_almost_equal(df.values, expected) + + def test_line_comment(self): + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + + # check with delim_whitespace=True + df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', + delim_whitespace=True) + tm.assert_almost_equal(df.values, expected) + + # custom line terminator is not supported + # with the Python parser yet + if self.engine == 'c': + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data.replace('\n', '*')), + comment='#', lineterminator='*') + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows(self): + data = """# empty +random line +# second empty line +1,2,3 +A,B,C +1,2.,4. 
+5.,NaN,10.0 +""" + # this should ignore the first four lines (including comments) + expected = [[1., 2., 4.], [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#', skiprows=4) + tm.assert_almost_equal(df.values, expected) + + def test_comment_header(self): + data = """# empty +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # header should begin at the second non-comment line + expected = [[1., 2., 4.], [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#', header=1) + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows_header(self): + data = """# empty +# second empty line +# third empty line +X,Y,Z +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # skiprows should skip the first 4 lines (including comments), while + # header should start from the second non-commented line starting + # with line 5 + expected = [[1., 2., 4.], [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + tm.assert_almost_equal(df.values, expected) + + def test_custom_comment_char(self): + data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" + + result = self.read_csv(StringIO(data), comment='#') + expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py new file mode 100644 index 0000000000000..a9d4ca2e3621e --- /dev/null +++ b/pandas/io/tests/parser/common.py @@ -0,0 +1,1238 @@ +# -*- coding: utf-8 -*- + +import csv +import os +import platform + +import re +import sys +from datetime import datetime + +import nose +import numpy as np +from numpy.testing.decorators import slow +from pandas.lib import Timestamp + +import pandas as pd +import pandas.util.testing as tm +from pandas import DataFrame, Series, Index, MultiIndex +from pandas import compat +from pandas.compat import(StringIO, BytesIO, PY3, + range, lrange, u) +from pandas.io.common import DtypeWarning, EmptyDataError, URLError +from pandas.io.parsers import TextFileReader, TextParser + + +class ParserTests(object): + """ + Want to be able to test either C+Cython or Python+Cython parsers + """ + data1 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + def test_empty_decimal_marker(self): + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + # C parser: supports only length-1 decimals + # Python parser: 'decimal' not supported yet + self.assertRaises(ValueError, self.read_csv, + StringIO(data), decimal='') + + def test_read_csv(self): + if not compat.PY3: + if compat.is_platform_windows(): + prefix = u("file:///") + else: + prefix = u("file://") + + fname = prefix + compat.text_type(self.csv1) + self.read_csv(fname, index_col=0, parse_dates=True) + + def test_dialect(self): + data = """\ +label1,label2,label3 +index1,"a,c,e +index2,b,d,f +""" + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + df = self.read_csv(StringIO(data), dialect=dia) + + data = '''\ +label1,label2,label3 +index1,a,c,e +index2,b,d,f +''' + exp = self.read_csv(StringIO(data)) + exp.replace('a', '"a', inplace=True) + tm.assert_frame_equal(df, exp) + + def test_dialect_str(self): + data = """\ +fruit:vegetable +apple:brocolli +pear:tomato +""" + exp = DataFrame({ + 'fruit': ['apple', 'pear'], + 'vegetable': ['brocolli', 'tomato'] + }) + dia = csv.register_dialect('mydialect', delimiter=':') # noqa + df = self.read_csv(StringIO(data), dialect='mydialect') + tm.assert_frame_equal(df, exp) + csv.unregister_dialect('mydialect') + + def test_1000_sep(self): + data = """A|B|C +1|2,334|5 +10|13|10. +""" + expected = DataFrame({ + 'A': [1, 10], + 'B': [2334, 13], + 'C': [5, 10.] + }) + + df = self.read_csv(StringIO(data), sep='|', thousands=',') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data), sep='|', thousands=',') + tm.assert_frame_equal(df, expected) + + def test_squeeze(self): + data = """\ +a,1 +b,2 +c,3 +""" + idx = Index(['a', 'b', 'c'], name=0) + expected = Series([1, 2, 3], name=1, index=idx) + result = self.read_table(StringIO(data), sep=',', index_col=0, + header=None, squeeze=True) + tm.assertIsInstance(result, Series) + tm.assert_series_equal(result, expected) + + def test_squeeze_no_view(self): + # see gh-8217 + # Series should not be a view + data = """time,data\n0,10\n1,11\n2,12\n4,14\n5,15\n3,13""" + result = self.read_csv(StringIO(data), index_col='time', squeeze=True) + self.assertFalse(result._is_view) + + def test_multiple_skts_example(self): + # TODO: Complete this + data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11." 
# noqa + pass + + def test_malformed(self): + # see gh-6607 + + # all + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + msg = 'Expected 3 fields in line 4, saw 5' + with tm.assertRaisesRegexp(Exception, msg): + self.read_table(StringIO(data), sep=',', + header=1, comment='#') + + # first chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + msg = 'Expected 3 fields in line 6, saw 5' + with tm.assertRaisesRegexp(Exception, msg): + it = self.read_table(StringIO(data), sep=',', + header=1, comment='#', + iterator=True, chunksize=1, + skiprows=[2]) + it.read(5) + + # middle chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + msg = 'Expected 3 fields in line 6, saw 5' + with tm.assertRaisesRegexp(Exception, msg): + it = self.read_table(StringIO(data), sep=',', header=1, + comment='#', iterator=True, chunksize=1, + skiprows=[2]) + it.read(3) + + # last chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + msg = 'Expected 3 fields in line 6, saw 5' + with tm.assertRaisesRegexp(Exception, msg): + it = self.read_table(StringIO(data), sep=',', header=1, + comment='#', iterator=True, chunksize=1, + skiprows=[2]) + it.read() + + # skip_footer is not supported with the C parser yet + if self.engine == 'python': + # skip_footer + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +footer +""" + msg = 'Expected 3 fields in line 4, saw 5' + with tm.assertRaisesRegexp(Exception, msg): + self.read_table(StringIO(data), sep=',', + header=1, comment='#', + skip_footer=1) + + def test_quoting(self): + bad_line_small = """printer\tresult\tvariant_name +Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jacob +Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jakob +Klosterdruckerei\tKlosterdruckerei (1609-1805)\t"Furststiftische Hofdruckerei, (1609-1805)\tGaller, Alois +Klosterdruckerei\tKlosterdruckerei (1609-1805)\tHochfurstliche Buchhandlung """ # noqa + self.assertRaises(Exception, self.read_table, StringIO(bad_line_small), + sep='\t') + + good_line_small = bad_line_small + '"' + df = self.read_table(StringIO(good_line_small), sep='\t') + self.assertEqual(len(df), 3) + + def test_unnamed_columns(self): + data = """A,B,C,, +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + expected = [[1, 2, 3, 4, 5.], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]] + df = self.read_table(StringIO(data), sep=',') + tm.assert_almost_equal(df.values, expected) + self.assert_numpy_array_equal(df.columns, + ['A', 'B', 'C', 'Unnamed: 3', + 'Unnamed: 4']) + + def test_duplicate_columns(self): + data = """A,A,B,B,B +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + + for method in ('read_csv', 'read_table'): + + # check default behavior + df = getattr(self, method)(StringIO(data), sep=',') + self.assertEqual(list(df.columns), + ['A', 'A.1', 'B', 'B.1', 'B.2']) + + df = getattr(self, method)(StringIO(data), sep=',', + mangle_dupe_cols=False) + self.assertEqual(list(df.columns), + ['A', 'A', 'B', 'B', 'B']) + + df = getattr(self, method)(StringIO(data), sep=',', + mangle_dupe_cols=True) + self.assertEqual(list(df.columns), + ['A', 'A.1', 'B', 'B.1', 'B.2']) + + def test_csv_mixed_type(self): + data = """A,B,C +a,1,2 +b,3,4 +c,4,5 +""" + # TODO: complete this + df = self.read_csv(StringIO(data)) # noqa + + def test_read_csv_dataframe(self): + df = self.read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = self.read_table(self.csv1, sep=',', index_col=0, + parse_dates=True) + 
self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D']) + self.assertEqual(df.index.name, 'index') + self.assertIsInstance( + df.index[0], (datetime, np.datetime64, Timestamp)) + self.assertEqual(df.values.dtype, np.float64) + tm.assert_frame_equal(df, df2) + + def test_read_csv_no_index_name(self): + df = self.read_csv(self.csv2, index_col=0, parse_dates=True) + df2 = self.read_table(self.csv2, sep=',', index_col=0, + parse_dates=True) + self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']) + self.assertIsInstance( + df.index[0], (datetime, np.datetime64, Timestamp)) + self.assertEqual(df.ix[ + :, ['A', 'B', 'C', 'D'] + ].values.dtype, np.float64) + tm.assert_frame_equal(df, df2) + + def test_read_table_unicode(self): + fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8')) + df1 = self.read_table(fin, sep=";", encoding="utf-8", header=None) + tm.assertIsInstance(df1[0].values[0], compat.text_type) + + def test_read_table_wrong_num_columns(self): + # too few! + data = """A,B,C,D,E,F +1,2,3,4,5,6 +6,7,8,9,10,11,12 +11,12,13,14,15,16 +""" + self.assertRaises(ValueError, self.read_csv, StringIO(data)) + + def test_read_duplicate_index_explicit(self): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + + result = self.read_csv(StringIO(data), index_col=0) + expected = self.read_csv(StringIO(data)).set_index( + 'index', verify_integrity=False) + tm.assert_frame_equal(result, expected) + + result = self.read_table(StringIO(data), sep=',', index_col=0) + expected = self.read_table(StringIO(data), sep=',', ).set_index( + 'index', verify_integrity=False) + tm.assert_frame_equal(result, expected) + + def test_read_duplicate_index_implicit(self): + data = """A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + + # make sure an error isn't thrown + self.read_csv(StringIO(data)) + self.read_table(StringIO(data), sep=',') + + def test_parse_bools(self): + data = """A,B +True,1 +False,2 +True,3 +""" + data = self.read_csv(StringIO(data)) + self.assertEqual(data['A'].dtype, np.bool_) + + data = """A,B +YES,1 +no,2 +yes,3 +No,3 +Yes,3 +""" + data = self.read_csv(StringIO(data), + true_values=['yes', 'Yes', 'YES'], + false_values=['no', 'NO', 'No']) + self.assertEqual(data['A'].dtype, np.bool_) + + data = """A,B +TRUE,1 +FALSE,2 +TRUE,3 +""" + data = self.read_csv(StringIO(data)) + self.assertEqual(data['A'].dtype, np.bool_) + + data = """A,B +foo,bar +bar,foo""" + result = self.read_csv(StringIO(data), true_values=['foo'], + false_values=['bar']) + expected = DataFrame({'A': [True, False], 'B': [False, True]}) + tm.assert_frame_equal(result, expected) + + def test_int_conversion(self): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + data = self.read_csv(StringIO(data)) + self.assertEqual(data['A'].dtype, np.float64) + self.assertEqual(data['B'].dtype, np.int64) + + def test_read_nrows(self): + df = self.read_csv(StringIO(self.data1), nrows=3) + expected = self.read_csv(StringIO(self.data1))[:3] + tm.assert_frame_equal(df, expected) + + def test_read_chunksize(self): + reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2) + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + def test_read_chunksize_named(self): + reader = self.read_csv( + StringIO(self.data1), index_col='index', 
chunksize=2) + df = self.read_csv(StringIO(self.data1), index_col='index') + + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + def test_get_chunk_passed_chunksize(self): + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +1,2,3""" + result = self.read_csv(StringIO(data), chunksize=2) + + piece = result.get_chunk() + self.assertEqual(len(piece), 2) + + def test_read_text_list(self): + data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" + as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', + '4', '5', '6']] + df = self.read_csv(StringIO(data), index_col=0) + + parser = TextParser(as_list, index_col=0, chunksize=2) + chunk = parser.read(None) + + tm.assert_frame_equal(chunk, df) + + def test_iterator(self): + # See gh-6607 + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True) + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunk = reader.read(3) + tm.assert_frame_equal(chunk, df[:3]) + + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, df[3:]) + + # pass list + lines = list(csv.reader(StringIO(self.data1))) + parser = TextParser(lines, index_col=0, chunksize=2) + + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + # pass skiprows + parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[1:3]) + + treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, + iterator=True) + tm.assertIsInstance(treader, TextFileReader) + + # gh-3967: stopping iteration when chunksize is specified + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + reader = self.read_csv(StringIO(data), iterator=True) + result = list(reader) + expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ + 3, 6, 9]), index=['foo', 'bar', 'baz']) + tm.assert_frame_equal(result[0], expected) + + # chunksize = 1 + reader = self.read_csv(StringIO(data), chunksize=1) + result = list(reader) + expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ + 3, 6, 9]), index=['foo', 'bar', 'baz']) + self.assertEqual(len(result), 3) + tm.assert_frame_equal(pd.concat(result), expected) + + # skip_footer is not supported with the C parser yet + if self.engine == 'python': + # test bad parameter (skip_footer) + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True, skip_footer=True) + self.assertRaises(ValueError, reader.read, 3) + + def test_pass_names_with_index(self): + lines = self.data1.split('\n') + no_header = '\n'.join(lines[1:]) + + # regular index + names = ['index', 'A', 'B', 'C', 'D'] + df = self.read_csv(StringIO(no_header), index_col=0, names=names) + expected = self.read_csv(StringIO(self.data1), index_col=0) + tm.assert_frame_equal(df, expected) + + # multi index + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['index1', 'index2', 'A', 'B', 'C', 'D'] + df = self.read_csv(StringIO(no_header), index_col=[0, 1], + names=names) + expected = self.read_csv(StringIO(data), index_col=[0, 1]) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(data), index_col=['index1', 'index2']) + tm.assert_frame_equal(df, expected) + + def 
test_multi_index_no_level_names(self): + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + data2 = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['A', 'B', 'C', 'D'] + + df = self.read_csv(StringIO(no_header), index_col=[0, 1], + header=None, names=names) + expected = self.read_csv(StringIO(data), index_col=[0, 1]) + tm.assert_frame_equal(df, expected, check_names=False) + + # 2 implicit first cols + df2 = self.read_csv(StringIO(data2)) + tm.assert_frame_equal(df2, df) + + # reverse order of index + df = self.read_csv(StringIO(no_header), index_col=[1, 0], names=names, + header=None) + expected = self.read_csv(StringIO(data), index_col=[1, 0]) + tm.assert_frame_equal(df, expected, check_names=False) + + def test_no_unnamed_index(self): + data = """ id c0 c1 c2 +0 1 0 a b +1 2 0 c d +2 2 2 e f +""" + df = self.read_table(StringIO(data), sep=' ') + self.assertIsNone(df.index.name) + + def test_read_csv_parse_simple_list(self): + text = """foo +bar baz +qux foo +foo +bar""" + df = self.read_csv(StringIO(text), header=None) + expected = DataFrame({0: ['foo', 'bar baz', 'qux foo', + 'foo', 'bar']}) + tm.assert_frame_equal(df, expected) + + @tm.network + def test_url(self): + # HTTP(S) + url = ('https://raw.github.com/pydata/pandas/master/' + 'pandas/io/tests/data/salary.table') + url_table = self.read_table(url) + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'salary.table') + local_table = self.read_table(localtable) + tm.assert_frame_equal(url_table, local_table) + # TODO: ftp testing + + @slow + def test_file(self): + + # FILE + if sys.version_info[:2] < (2, 6): + raise nose.SkipTest("file:// not supported with Python < 2.6") + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'salary.table') + local_table = self.read_table(localtable) + + try: + url_table = self.read_table('file://localhost/' + localtable) + except URLError: + # fails on some systems + raise nose.SkipTest("failing on %s" % + ' '.join(platform.uname()).strip()) + + tm.assert_frame_equal(url_table, local_table) + + def test_nonexistent_path(self): + # don't segfault pls #2428 + path = '%s.csv' % tm.rands(10) + self.assertRaises(IOError, self.read_csv, path) + + def test_missing_trailing_delimiters(self): + data = """A,B,C,D +1,2,3,4 +1,3,3, +1,4,5""" + result = self.read_csv(StringIO(data)) + self.assertTrue(result['D'].isnull()[1:].all()) + + def test_skipinitialspace(self): + s = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' + '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, ' + '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, ' + '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, ' + '0.212036, 14.7674, 41.605, -9999.0, -9999.0, ' + '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128') + + sfile = StringIO(s) + # it's 33 columns + result = self.read_csv(sfile, names=lrange(33), na_values=['-9999.0'], + header=None, skipinitialspace=True) + self.assertTrue(pd.isnull(result.ix[0, 29])) + + def test_utf16_bom_skiprows(self): + # #2298 + data = u("""skip this +skip this too +A\tB\tC +1\t2\t3 +4\t5\t6""") + + data2 = u("""skip this +skip this too +A,B,C +1,2,3 +4,5,6""") + + path = '__%s__.csv' % tm.rands(10) + + with tm.ensure_clean(path) as path: + for sep, dat in [('\t', data), (',', data2)]: + for enc in ['utf-16', 'utf-16le', 
'utf-16be']: + bytes = dat.encode(enc) + with open(path, 'wb') as f: + f.write(bytes) + + s = BytesIO(dat.encode('utf-8')) + if compat.PY3: + # somewhat False since the code never sees bytes + from io import TextIOWrapper + s = TextIOWrapper(s, encoding='utf-8') + + result = self.read_csv(path, encoding=enc, skiprows=2, + sep=sep) + expected = self.read_csv(s, encoding='utf-8', skiprows=2, + sep=sep) + s.close() + + tm.assert_frame_equal(result, expected) + + def test_utf16_example(self): + path = tm.get_data_path('utf16_ex.txt') + + # it works! and is the right length + result = self.read_table(path, encoding='utf-16') + self.assertEqual(len(result), 50) + + if not compat.PY3: + buf = BytesIO(open(path, 'rb').read()) + result = self.read_table(buf, encoding='utf-16') + self.assertEqual(len(result), 50) + + def test_unicode_encoding(self): + pth = tm.get_data_path('unicode_series.csv') + + result = self.read_csv(pth, header=None, encoding='latin-1') + result = result.set_index(0) + + got = result[1][1632] + expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)') + + self.assertEqual(got, expected) + + def test_trailing_delimiters(self): + # #2442. grumble grumble + data = """A,B,C +1,2,3, +4,5,6, +7,8,9,""" + result = self.read_csv(StringIO(data), index_col=False) + + expected = DataFrame({'A': [1, 4, 7], 'B': [2, 5, 8], + 'C': [3, 6, 9]}) + + tm.assert_frame_equal(result, expected) + + def test_escapechar(self): + # http://stackoverflow.com/questions/13824840/feature-request-for- + # pandas-read-csv + data = '''SEARCH_TERM,ACTUAL_URL +"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa + + result = self.read_csv(StringIO(data), escapechar='\\', + quotechar='"', encoding='utf-8') + self.assertEqual(result['SEARCH_TERM'][2], + 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie') + self.assertTrue(np.array_equal(result.columns, + ['SEARCH_TERM', 'ACTUAL_URL'])) + + def test_int64_min_issues(self): + # #2599 + data = 'A,B\n0,0\n0,' + + result = self.read_csv(StringIO(data)) + expected = DataFrame({'A': [0, 0], 'B': [0, np.nan]}) + + tm.assert_frame_equal(result, expected) + + def test_parse_integers_above_fp_precision(self): + data = """Numbers +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000194""" + + result = self.read_csv(StringIO(data)) + expected = DataFrame({'Numbers': [17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000194]}) + + self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers'])) + + def test_chunks_have_consistent_numerical_type(self): + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) + + with tm.assert_produces_warning(False): + df = self.read_csv(StringIO(data)) + # Assert that types were coerced. 
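+            # The integer chunks and the two "1.0"/"2.0" rows parse to
+            # different numeric types, but int64 and float64 can be
+            # reconciled numerically, so the whole column is upcast to
+            # float64 and no DtypeWarning is raised; contrast
+            # test_warn_if_chunks_have_mismatched_type below, where a
+            # str/int mix cannot be coerced to a single numerical type.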
+ self.assertTrue(type(df.a[0]) is np.float64) + self.assertEqual(df.a.dtype, np.float) + + def test_warn_if_chunks_have_mismatched_type(self): + warning_type = False + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ['a', 'b'] + integers) + + # see gh-3866: if chunks are different types and can't + # be coerced using numerical types, then issue warning. + if self.engine == 'c' and self.low_memory: + warning_type = DtypeWarning + + with tm.assert_produces_warning(warning_type): + df = self.read_csv(StringIO(data)) + self.assertEqual(df.a.dtype, np.object) + + def test_integer_overflow_bug(self): + # see gh-2601 + data = "65248E10 11\n55555E55 22\n" + + result = self.read_csv(StringIO(data), header=None, sep=' ') + self.assertTrue(result[0].dtype == np.float64) + + result = self.read_csv(StringIO(data), header=None, sep='\s+') + self.assertTrue(result[0].dtype == np.float64) + + def test_catch_too_many_names(self): + # see gh-5156 + data = """\ +1,2,3 +4,,6 +7,8,9 +10,11,12\n""" + tm.assertRaises(ValueError, self.read_csv, StringIO(data), + header=0, names=['a', 'b', 'c', 'd']) + + def test_ignore_leading_whitespace(self): + # see gh-3374, gh-6607 + data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9' + result = self.read_table(StringIO(data), sep='\s+') + expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + def test_nrows_and_chunksize_raises_notimplemented(self): + data = 'a b c' + self.assertRaises(NotImplementedError, self.read_csv, StringIO(data), + nrows=10, chunksize=5) + + def test_chunk_begins_with_newline_whitespace(self): + # see gh-10022 + data = '\n hello\nworld\n' + result = self.read_csv(StringIO(data), header=None) + self.assertEqual(len(result), 2) + + # see gh-9735: this issue is C parser-specific (bug when + # parsing whitespace and characters at chunk boundary) + if self.engine == 'c': + chunk1 = 'a' * (1024 * 256 - 2) + '\na' + chunk2 = '\n a' + result = self.read_csv(StringIO(chunk1 + chunk2), header=None) + expected = DataFrame(['a' * (1024 * 256 - 2), 'a', ' a']) + tm.assert_frame_equal(result, expected) + + def test_empty_with_index(self): + # see gh-10184 + data = 'x,y' + result = self.read_csv(StringIO(data), index_col=0) + expected = DataFrame([], columns=['y'], index=Index([], name='x')) + tm.assert_frame_equal(result, expected) + + def test_empty_with_multiindex(self): + # see gh-10467 + data = 'x,y,z' + result = self.read_csv(StringIO(data), index_col=['x', 'y']) + expected = DataFrame([], columns=['z'], + index=MultiIndex.from_arrays( + [[]] * 2, names=['x', 'y'])) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_reversed_multiindex(self): + data = 'x,y,z' + result = self.read_csv(StringIO(data), index_col=[1, 0]) + expected = DataFrame([], columns=['z'], + index=MultiIndex.from_arrays( + [[]] * 2, names=['y', 'x'])) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_float_parser(self): + # see gh-9565 + data = '45e-1,4.5,45.,inf,-inf' + result = self.read_csv(StringIO(data), header=None) + expected = DataFrame([[float(s) for s in data.split(',')]]) + tm.assert_frame_equal(result, expected) + + def test_scientific_no_exponent(self): + # see gh-12215 + df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']), + ('y', ['42e']), ('z', ['632E'])]) + data = df.to_csv(index=False) + for prec in self.float_precision_choices: + df_roundtrip = self.read_csv( + StringIO(data), float_precision=prec) + 
tm.assert_frame_equal(df_roundtrip, df) + + def test_int64_overflow(self): + data = """ID +00013007854817840016671868 +00013007854817840016749251 +00013007854817840016754630 +00013007854817840016781876 +00013007854817840017028824 +00013007854817840017963235 +00013007854817840018860166""" + + result = self.read_csv(StringIO(data)) + self.assertTrue(result['ID'].dtype == object) + + self.assertRaises(OverflowError, self.read_csv, + StringIO(data), converters={'ID': np.int64}) + + # Just inside int64 range: parse as integer + i_max = np.iinfo(np.int64).max + i_min = np.iinfo(np.int64).min + for x in [i_max, i_min]: + result = self.read_csv(StringIO(str(x)), header=None) + expected = DataFrame([x]) + tm.assert_frame_equal(result, expected) + + # Just outside int64 range: parse as string + too_big = i_max + 1 + too_small = i_min - 1 + for x in [too_big, too_small]: + result = self.read_csv(StringIO(str(x)), header=None) + expected = DataFrame([str(x)]) + tm.assert_frame_equal(result, expected) + + def test_empty_with_nrows_chunksize(self): + # see gh-9535 + expected = DataFrame([], columns=['foo', 'bar']) + result = self.read_csv(StringIO('foo,bar\n'), nrows=10) + tm.assert_frame_equal(result, expected) + + result = next(iter(self.read_csv( + StringIO('foo,bar\n'), chunksize=10))) + tm.assert_frame_equal(result, expected) + + # 'as_recarray' is not supported yet for the Python parser + if self.engine == 'c': + result = self.read_csv(StringIO('foo,bar\n'), + nrows=10, as_recarray=True) + result = DataFrame(result[2], columns=result[1], + index=result[0]) + tm.assert_frame_equal(DataFrame.from_records( + result), expected, check_index_type=False) + + result = next(iter(self.read_csv( + StringIO('foo,bar\n'), chunksize=10, as_recarray=True))) + result = DataFrame(result[2], columns=result[1], index=result[0]) + tm.assert_frame_equal(DataFrame.from_records( + result), expected, check_index_type=False) + + def test_eof_states(self): + # see gh-10728, gh-10548 + + # With skip_blank_lines = True + expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) + + # gh-10728: WHITESPACE_LINE + data = 'a,b,c\n4,5,6\n ' + result = self.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + # gh-10548: EAT_LINE_COMMENT + data = 'a,b,c\n4,5,6\n#comment' + result = self.read_csv(StringIO(data), comment='#') + tm.assert_frame_equal(result, expected) + + # EAT_CRNL_NOP + data = 'a,b,c\n4,5,6\n\r' + result = self.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + # EAT_COMMENT + data = 'a,b,c\n4,5,6#comment' + result = self.read_csv(StringIO(data), comment='#') + tm.assert_frame_equal(result, expected) + + # SKIP_LINE + data = 'a,b,c\n4,5,6\nskipme' + result = self.read_csv(StringIO(data), skiprows=[2]) + tm.assert_frame_equal(result, expected) + + # With skip_blank_lines = False + + # EAT_LINE_COMMENT + data = 'a,b,c\n4,5,6\n#comment' + result = self.read_csv( + StringIO(data), comment='#', skip_blank_lines=False) + expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + # IN_FIELD + data = 'a,b,c\n4,5,6\n ' + result = self.read_csv(StringIO(data), skip_blank_lines=False) + expected = DataFrame( + [['4', 5, 6], [' ', None, None]], columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + # EAT_CRNL + data = 'a,b,c\n4,5,6\n\r' + result = self.read_csv(StringIO(data), skip_blank_lines=False) + expected = DataFrame( + [[4, 5, 6], [None, None, None]], columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + 
# Should produce exceptions + + # ESCAPED_CHAR + data = "a,b,c\n4,5,6\n\\" + self.assertRaises(Exception, self.read_csv, + StringIO(data), escapechar='\\') + + # ESCAPE_IN_QUOTED_FIELD + data = 'a,b,c\n4,5,6\n"\\' + self.assertRaises(Exception, self.read_csv, + StringIO(data), escapechar='\\') + + # IN_QUOTED_FIELD + data = 'a,b,c\n4,5,6\n"' + self.assertRaises(Exception, self.read_csv, + StringIO(data), escapechar='\\') + + def test_uneven_lines_with_usecols(self): + # See gh-12203 + csv = r"""a,b,c + 0,1,2 + 3,4,5,6,7 + 8,9,10 + """ + + # make sure that an error is still thrown + # when the 'usecols' parameter is not provided + msg = "Expected \d+ fields in line \d+, saw \d+" + with tm.assertRaisesRegexp(ValueError, msg): + df = self.read_csv(StringIO(csv)) + + expected = DataFrame({ + 'a': [0, 3, 8], + 'b': [1, 4, 9] + }) + + usecols = [0, 1] + df = self.read_csv(StringIO(csv), usecols=usecols) + tm.assert_frame_equal(df, expected) + + usecols = ['a', 'b'] + df = self.read_csv(StringIO(csv), usecols=usecols) + tm.assert_frame_equal(df, expected) + + def test_read_empty_with_usecols(self): + # See gh-12493 + names = ['Dummy', 'X', 'Dummy_2'] + usecols = names[1:2] # ['X'] + + # first, check to see that the response of + # parser when faced with no provided columns + # throws the correct error, with or without usecols + errmsg = "No columns to parse from file" + + with tm.assertRaisesRegexp(EmptyDataError, errmsg): + self.read_csv(StringIO('')) + + with tm.assertRaisesRegexp(EmptyDataError, errmsg): + self.read_csv(StringIO(''), usecols=usecols) + + expected = DataFrame(columns=usecols, index=[0], dtype=np.float64) + df = self.read_csv(StringIO(',,'), names=names, usecols=usecols) + tm.assert_frame_equal(df, expected) + + expected = DataFrame(columns=usecols) + df = self.read_csv(StringIO(''), names=names, usecols=usecols) + tm.assert_frame_equal(df, expected) + + def test_trailing_spaces(self): + data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa + expected = DataFrame([[1., 2., 4.], + [5.1, np.nan, 10.]]) + + # gh-8661, gh-8679: this should ignore six lines including + # lines with trailing whitespace and blank lines + df = self.read_csv(StringIO(data.replace(',', ' ')), + header=None, delim_whitespace=True, + skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + df = self.read_table(StringIO(data.replace(',', ' ')), + header=None, delim_whitespace=True, + skiprows=[0, 1, 2, 3, 5, 6], + skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + + # gh-8983: test skipping set of rows after a row with trailing spaces + expected = DataFrame({"A": [1., 5.1], "B": [2., np.nan], + "C": [4., 10]}) + df = self.read_table(StringIO(data.replace(',', ' ')), + delim_whitespace=True, + skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + + def test_raise_on_sep_with_delim_whitespace(self): + # see gh-6607 + data = 'a b c\n1 2 3' + with tm.assertRaisesRegexp(ValueError, 'you can only specify one'): + self.read_table(StringIO(data), sep='\s', delim_whitespace=True) + + def test_single_char_leading_whitespace(self): + # see gh-9710 + data = """\ +MyColumn + a + b + a + b\n""" + + expected = DataFrame({'MyColumn': list('abab')}) + + result = self.read_csv(StringIO(data), delim_whitespace=True, + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), skipinitialspace=True) + 
tm.assert_frame_equal(result, expected) + + def test_empty_lines(self): + data = """\ +A,B,C +1,2.,4. + + +5.,NaN,10.0 + +-70,.4,1 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') + tm.assert_almost_equal(df.values, expected) + expected = [[1., 2., 4.], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5., np.nan, 10.], + [np.nan, np.nan, np.nan], + [-70., .4, 1.]] + df = self.read_csv(StringIO(data), skip_blank_lines=False) + tm.assert_almost_equal(list(df.values), list(expected)) + + def test_whitespace_lines(self): + data = """ + +\t \t\t + \t +A,B,C + \t 1,2.,4. +5.,NaN,10.0 +""" + expected = [[1, 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + + def test_regex_separator(self): + # see gh-6607 + data = """ A B C D +a 1 2 3 4 +b 1 2 3 4 +c 1 2 3 4 +""" + df = self.read_table(StringIO(data), sep='\s+') + expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)), + index_col=0) + self.assertIsNone(expected.index.name) + tm.assert_frame_equal(df, expected) + + data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9' + result = self.read_table(StringIO(data), sep='\s+') + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=['a', 'b', 'c']) + tm.assert_frame_equal(result, expected) + + def test_verbose_import(self): + text = """a,b,c,d +one,1,2,3 +one,1,2,3 +,1,2,3 +one,1,2,3 +,1,2,3 +,1,2,3 +one,1,2,3 +two,1,2,3""" + + buf = StringIO() + sys.stdout = buf + + try: # engines are verbose in different ways + self.read_csv(StringIO(text), verbose=True) + if self.engine == 'c': + self.assertIn('Tokenization took:', buf.getvalue()) + self.assertIn('Parser memory cleanup took:', buf.getvalue()) + else: # Python engine + self.assertEqual(buf.getvalue(), + 'Filled 3 NA values in column a\n') + finally: + sys.stdout = sys.__stdout__ + + buf = StringIO() + sys.stdout = buf + + text = """a,b,c,d +one,1,2,3 +two,1,2,3 +three,1,2,3 +four,1,2,3 +five,1,2,3 +,1,2,3 +seven,1,2,3 +eight,1,2,3""" + + try: # engines are verbose in different ways + self.read_csv(StringIO(text), verbose=True, index_col=0) + if self.engine == 'c': + self.assertIn('Tokenization took:', buf.getvalue()) + self.assertIn('Parser memory cleanup took:', buf.getvalue()) + else: # Python engine + self.assertEqual(buf.getvalue(), + 'Filled 1 NA values in column a\n') + finally: + sys.stdout = sys.__stdout__ + + def test_iteration_open_handle(self): + if PY3: + raise nose.SkipTest( + "won't work in Python 3 {0}".format(sys.version_info)) + + with tm.ensure_clean() as path: + with open(path, 'wb') as f: + f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG') + + with open(path, 'rb') as f: + for line in f: + if 'CCC' in line: + break + + if self.engine == 'c': + tm.assertRaises(Exception, self.read_table, + f, squeeze=True, header=None) + else: + result = self.read_table(f, squeeze=True, header=None) + expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0) + tm.assert_series_equal(result, expected) diff --git a/pandas/io/tests/parser/compression.py b/pandas/io/tests/parser/compression.py new file mode 100644 index 0000000000000..47ae7be1cbf05 --- /dev/null +++ b/pandas/io/tests/parser/compression.py @@ -0,0 +1,170 @@ +# -*- coding: utf-8 -*- + +""" +Tests compressed data parsing functionality for all +of the parsers defined in parsers.py +""" + +import nose + +import pandas.util.testing as tm +from pandas 
import compat
+
+
+class CompressionTests(object):
+    def test_zip(self):
+        try:
+            import zipfile
+        except ImportError:
+            raise nose.SkipTest('need zipfile to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean('test_file.zip') as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.writestr('test_file', data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='zip')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+            if self.engine != 'python':
+                with open(path, 'rb') as f:
+                    result = self.read_csv(f, compression='zip')
+                    tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('combined_zip.zip') as path:
+            inner_file_names = ['test_file', 'second_file']
+            tmp = zipfile.ZipFile(path, mode='w')
+            for file_name in inner_file_names:
+                tmp.writestr(file_name, data)
+            tmp.close()
+
+            self.assertRaisesRegexp(ValueError, 'Multiple files',
+                                    self.read_csv, path, compression='zip')
+
+            self.assertRaisesRegexp(ValueError, 'Multiple files',
+                                    self.read_csv, path, compression='infer')
+
+        with tm.ensure_clean() as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.close()
+
+            self.assertRaisesRegexp(ValueError, 'Zero files',
+                                    self.read_csv, path, compression='zip')
+
+        with tm.ensure_clean() as path:
+            with open(path, 'wb') as f:
+                self.assertRaises(zipfile.BadZipfile, self.read_csv,
+                                  f, compression='zip')
+
+    def test_gzip(self):
+        try:
+            import gzip
+        except ImportError:
+            raise nose.SkipTest('need gzip to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='gzip')
+            tm.assert_frame_equal(result, expected)
+
+            with open(path, 'rb') as f:
+                result = self.read_csv(f, compression='gzip')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('test.gz') as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+    def test_bz2(self):
+        try:
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need bz2 to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='bz2')
+            tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+            with open(path, 'rb') as fin:
+                if compat.PY3:
+                    result = self.read_csv(fin, compression='bz2')
+                    tm.assert_frame_equal(result, expected)
+                elif self.engine != 'python':
+                    self.assertRaises(ValueError, self.read_csv,
+                                      fin, compression='bz2')
+
+        with tm.ensure_clean('test.bz2') as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+    def test_xz(self):
+        lzma = tm._skip_if_no_lzma()
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = lzma.LZMAFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='xz')
+            tm.assert_frame_equal(result, expected)
+
+            with open(path, 'rb') as f:
+                result = self.read_csv(f, compression='xz')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('test.xz') as path:
+            tmp = lzma.LZMAFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+    def test_read_csv_infer_compression(self):
+        # see gh-9770
+        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
+
+        inputs = [self.csv1, self.csv1 + '.gz',
+                  self.csv1 + '.bz2', open(self.csv1)]
+
+        for f in inputs:
+            df = self.read_csv(f, index_col=0, parse_dates=True,
+                               compression='infer')
+
+            tm.assert_frame_equal(expected, df)
+
+        inputs[3].close()
diff --git a/pandas/io/tests/parser/converters.py b/pandas/io/tests/parser/converters.py
new file mode 100644
index 0000000000000..68231d67534ee
--- /dev/null
+++ b/pandas/io/tests/parser/converters.py
@@ -0,0 +1,152 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests column conversion functionality during parsing
+for all of the parsers defined in parsers.py
+"""
+
+from datetime import datetime
+
+import nose
+
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
+
+from pandas.lib import Timestamp
+from pandas import DataFrame, Index
+from pandas.compat import parse_date, StringIO, lmap
+
+
+class ConverterTests(object):
+    def test_converters_type_must_be_dict(self):
+        data = """index,A,B,C,D
+foo,2,3,4,5
+"""
+        with tm.assertRaisesRegexp(TypeError, 'Type converters.+'):
+            self.read_csv(StringIO(data), converters=0)
+
+    def test_converters(self):
+        data = """A,B,C,D
+a,1,2,01/01/2009
+b,3,4,01/02/2009
+c,4,5,01/03/2009
+"""
+        result = self.read_csv(StringIO(data), converters={'D': parse_date})
+        result2 = self.read_csv(StringIO(data), converters={3: parse_date})
+
+        expected = self.read_csv(StringIO(data))
+        expected['D'] = expected['D'].map(parse_date)
+
+        tm.assertIsInstance(result['D'][0], (datetime, Timestamp))
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result2, expected)
+
+        # produce integer
+        converter = lambda x: int(x.split('/')[2])
+        result = self.read_csv(StringIO(data), converters={'D': converter})
+        expected = self.read_csv(StringIO(data))
+        expected['D'] = expected['D'].map(converter)
+        tm.assert_frame_equal(result, expected)
+
+    def test_converters_no_implicit_conv(self):
+        # see gh-2184
+        data = """000102,1.2,A\n001245,2,B"""
+        f = lambda x: x.strip()
+        converter = {0: f}
+        df = self.read_csv(StringIO(data), header=None, converters=converter)
+        self.assertEqual(df[0].dtype, object)
+
+    def test_converters_euro_decimal_format(self):
+        data = """Id;Number1;Number2;Text1;Text2;Number3
+1;1521,1541;187101,9543;ABC;poi;4,738797819
+2;121,12;14897,76;DEF;uyt;0,377320872
+3;878,158;108013,434;GHI;rez;2,735694704"""
+        f = lambda x: float(x.replace(",", "."))
+        converter = {'Number1': f, 'Number2': f, 'Number3': f}
+        df2 = self.read_csv(StringIO(data), sep=';', converters=converter)
+        self.assertEqual(df2['Number1'].dtype, float)
+        self.assertEqual(df2['Number2'].dtype, float)
+        self.assertEqual(df2['Number3'].dtype, float)
+
+    def test_converter_return_string_bug(self):
+        # see gh-583
+        data = """Id;Number1;Number2;Text1;Text2;Number3
+1;1521,1541;187101,9543;ABC;poi;4,738797819
+2;121,12;14897,76;DEF;uyt;0,377320872
+3;878,158;108013,434;GHI;rez;2,735694704"""
+        f = lambda x: float(x.replace(",", "."))
+        converter = {'Number1': f, 'Number2': f, 'Number3': f}
+        df2 = self.read_csv(StringIO(data), sep=';',
converters=converter) + self.assertEqual(df2['Number1'].dtype, float) + + def test_converters_corner_with_nas(self): + # skip aberration observed on Win64 Python 3.2.2 + if hash(np.int64(-1)) != -2: + raise nose.SkipTest("skipping because of windows hash on Python" + " 3.2.2") + + data = """id,score,days +1,2,12 +2,2-5, +3,,14+ +4,6-12,2""" + + def convert_days(x): + x = x.strip() + if not x: + return np.nan + + is_plus = x.endswith('+') + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + return x + + def convert_days_sentinel(x): + x = x.strip() + if not x: + return np.nan + + is_plus = x.endswith('+') + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + return x + + def convert_score(x): + x = x.strip() + if not x: + return np.nan + if x.find('-') > 0: + valmin, valmax = lmap(int, x.split('-')) + val = 0.5 * (valmin + valmax) + else: + val = float(x) + + return val + + fh = StringIO(data) + result = self.read_csv(fh, converters={'score': convert_score, + 'days': convert_days}, + na_values=['', None]) + self.assertTrue(pd.isnull(result['days'][1])) + + fh = StringIO(data) + result2 = self.read_csv(fh, converters={'score': convert_score, + 'days': convert_days_sentinel}, + na_values=['', None]) + tm.assert_frame_equal(result, result2) + + def test_converter_index_col_bug(self): + # see gh-1835 + data = "A;B\n1;2\n3;4" + + rs = self.read_csv(StringIO(data), sep=';', index_col='A', + converters={'A': lambda x: x}) + + xp = DataFrame({'B': [2, 4]}, index=Index([1, 3], name='A')) + tm.assert_frame_equal(rs, xp) + self.assertEqual(rs.index.name, xp.index.name) diff --git a/pandas/io/tests/parser/data/iris.csv b/pandas/io/tests/parser/data/iris.csv new file mode 100644 index 0000000000000..c19b9c3688515 --- /dev/null +++ b/pandas/io/tests/parser/data/iris.csv @@ -0,0 +1,151 @@ +SepalLength,SepalWidth,PetalLength,PetalWidth,Name +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor 
+4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/pandas/io/tests/parser/data/salary.table b/pandas/io/tests/parser/data/salary.table new file mode 100644 index 0000000000000..ea7803339e98d --- /dev/null +++ b/pandas/io/tests/parser/data/salary.table @@ -0,0 +1,47 @@ +S X E M +13876 1 1 1 +11608 1 3 0 +18701 1 3 1 +11283 1 2 0 +11767 1 3 0 +20872 2 2 1 +11772 2 2 0 +10535 2 1 0 +12195 2 3 0 +12313 3 2 0 +14975 3 1 1 +21371 3 2 1 +19800 3 3 1 +11417 4 1 0 +20263 4 3 1 +13231 4 3 0 +12884 4 2 0 +13245 5 2 0 +13677 5 3 0 +15965 5 1 1 +12336 
6 1 0 +21352 6 3 1 +13839 6 2 0 +22884 6 2 1 +16978 7 1 1 +14803 8 2 0 +17404 8 1 1 +22184 8 3 1 +13548 8 1 0 +14467 10 1 0 +15942 10 2 0 +23174 10 3 1 +23780 10 2 1 +25410 11 2 1 +14861 11 1 0 +16882 12 2 0 +24170 12 3 1 +15990 13 1 0 +26330 13 2 1 +17949 14 2 0 +25685 15 3 1 +27837 16 2 1 +18838 16 2 0 +17483 16 1 0 +19207 17 2 0 +19346 20 1 0 diff --git a/pandas/io/tests/parser/data/test1.csv b/pandas/io/tests/parser/data/test1.csv new file mode 100644 index 0000000000000..4bdb62943c4c8 --- /dev/null +++ b/pandas/io/tests/parser/data/test1.csv @@ -0,0 +1,8 @@ +index,A,B,C,D +2000-01-03 00:00:00,0.980268513777,3.68573087906,-0.364216805298,-1.15973806169 +2000-01-04 00:00:00,1.04791624281,-0.0412318367011,-0.16181208307,0.212549316967 +2000-01-05 00:00:00,0.498580885705,0.731167677815,-0.537677223318,1.34627041952 +2000-01-06 00:00:00,1.12020151869,1.56762092543,0.00364077397681,0.67525259227 +2000-01-07 00:00:00,-0.487094399463,0.571454623474,-1.6116394093,0.103468562917 +2000-01-10 00:00:00,0.836648671666,0.246461918642,0.588542635376,1.0627820613 +2000-01-11 00:00:00,-0.157160753327,1.34030689438,1.19577795622,-1.09700699751 \ No newline at end of file diff --git a/pandas/io/tests/data/test1.csv.bz2 b/pandas/io/tests/parser/data/test1.csv.bz2 similarity index 100% rename from pandas/io/tests/data/test1.csv.bz2 rename to pandas/io/tests/parser/data/test1.csv.bz2 diff --git a/pandas/io/tests/data/test1.csv.gz b/pandas/io/tests/parser/data/test1.csv.gz similarity index 100% rename from pandas/io/tests/data/test1.csv.gz rename to pandas/io/tests/parser/data/test1.csv.gz diff --git a/pandas/io/tests/data/test2.csv b/pandas/io/tests/parser/data/test2.csv similarity index 100% rename from pandas/io/tests/data/test2.csv rename to pandas/io/tests/parser/data/test2.csv diff --git a/pandas/io/tests/parser/data/tips.csv b/pandas/io/tests/parser/data/tips.csv new file mode 100644 index 0000000000000..856a65a69e647 --- /dev/null +++ b/pandas/io/tests/parser/data/tips.csv @@ -0,0 +1,245 @@ +total_bill,tip,sex,smoker,day,time,size +16.99,1.01,Female,No,Sun,Dinner,2 +10.34,1.66,Male,No,Sun,Dinner,3 +21.01,3.5,Male,No,Sun,Dinner,3 +23.68,3.31,Male,No,Sun,Dinner,2 +24.59,3.61,Female,No,Sun,Dinner,4 +25.29,4.71,Male,No,Sun,Dinner,4 +8.77,2.0,Male,No,Sun,Dinner,2 +26.88,3.12,Male,No,Sun,Dinner,4 +15.04,1.96,Male,No,Sun,Dinner,2 +14.78,3.23,Male,No,Sun,Dinner,2 +10.27,1.71,Male,No,Sun,Dinner,2 +35.26,5.0,Female,No,Sun,Dinner,4 +15.42,1.57,Male,No,Sun,Dinner,2 +18.43,3.0,Male,No,Sun,Dinner,4 +14.83,3.02,Female,No,Sun,Dinner,2 +21.58,3.92,Male,No,Sun,Dinner,2 +10.33,1.67,Female,No,Sun,Dinner,3 +16.29,3.71,Male,No,Sun,Dinner,3 +16.97,3.5,Female,No,Sun,Dinner,3 +20.65,3.35,Male,No,Sat,Dinner,3 +17.92,4.08,Male,No,Sat,Dinner,2 +20.29,2.75,Female,No,Sat,Dinner,2 +15.77,2.23,Female,No,Sat,Dinner,2 +39.42,7.58,Male,No,Sat,Dinner,4 +19.82,3.18,Male,No,Sat,Dinner,2 +17.81,2.34,Male,No,Sat,Dinner,4 +13.37,2.0,Male,No,Sat,Dinner,2 +12.69,2.0,Male,No,Sat,Dinner,2 +21.7,4.3,Male,No,Sat,Dinner,2 +19.65,3.0,Female,No,Sat,Dinner,2 +9.55,1.45,Male,No,Sat,Dinner,2 +18.35,2.5,Male,No,Sat,Dinner,4 +15.06,3.0,Female,No,Sat,Dinner,2 +20.69,2.45,Female,No,Sat,Dinner,4 +17.78,3.27,Male,No,Sat,Dinner,2 +24.06,3.6,Male,No,Sat,Dinner,3 +16.31,2.0,Male,No,Sat,Dinner,3 +16.93,3.07,Female,No,Sat,Dinner,3 +18.69,2.31,Male,No,Sat,Dinner,3 +31.27,5.0,Male,No,Sat,Dinner,3 +16.04,2.24,Male,No,Sat,Dinner,3 +17.46,2.54,Male,No,Sun,Dinner,2 +13.94,3.06,Male,No,Sun,Dinner,2 +9.68,1.32,Male,No,Sun,Dinner,2 +30.4,5.6,Male,No,Sun,Dinner,4 
+18.29,3.0,Male,No,Sun,Dinner,2 +22.23,5.0,Male,No,Sun,Dinner,2 +32.4,6.0,Male,No,Sun,Dinner,4 +28.55,2.05,Male,No,Sun,Dinner,3 +18.04,3.0,Male,No,Sun,Dinner,2 +12.54,2.5,Male,No,Sun,Dinner,2 +10.29,2.6,Female,No,Sun,Dinner,2 +34.81,5.2,Female,No,Sun,Dinner,4 +9.94,1.56,Male,No,Sun,Dinner,2 +25.56,4.34,Male,No,Sun,Dinner,4 +19.49,3.51,Male,No,Sun,Dinner,2 +38.01,3.0,Male,Yes,Sat,Dinner,4 +26.41,1.5,Female,No,Sat,Dinner,2 +11.24,1.76,Male,Yes,Sat,Dinner,2 +48.27,6.73,Male,No,Sat,Dinner,4 +20.29,3.21,Male,Yes,Sat,Dinner,2 +13.81,2.0,Male,Yes,Sat,Dinner,2 +11.02,1.98,Male,Yes,Sat,Dinner,2 +18.29,3.76,Male,Yes,Sat,Dinner,4 +17.59,2.64,Male,No,Sat,Dinner,3 +20.08,3.15,Male,No,Sat,Dinner,3 +16.45,2.47,Female,No,Sat,Dinner,2 +3.07,1.0,Female,Yes,Sat,Dinner,1 +20.23,2.01,Male,No,Sat,Dinner,2 +15.01,2.09,Male,Yes,Sat,Dinner,2 +12.02,1.97,Male,No,Sat,Dinner,2 +17.07,3.0,Female,No,Sat,Dinner,3 +26.86,3.14,Female,Yes,Sat,Dinner,2 +25.28,5.0,Female,Yes,Sat,Dinner,2 +14.73,2.2,Female,No,Sat,Dinner,2 +10.51,1.25,Male,No,Sat,Dinner,2 +17.92,3.08,Male,Yes,Sat,Dinner,2 +27.2,4.0,Male,No,Thur,Lunch,4 +22.76,3.0,Male,No,Thur,Lunch,2 +17.29,2.71,Male,No,Thur,Lunch,2 +19.44,3.0,Male,Yes,Thur,Lunch,2 +16.66,3.4,Male,No,Thur,Lunch,2 +10.07,1.83,Female,No,Thur,Lunch,1 +32.68,5.0,Male,Yes,Thur,Lunch,2 +15.98,2.03,Male,No,Thur,Lunch,2 +34.83,5.17,Female,No,Thur,Lunch,4 +13.03,2.0,Male,No,Thur,Lunch,2 +18.28,4.0,Male,No,Thur,Lunch,2 +24.71,5.85,Male,No,Thur,Lunch,2 +21.16,3.0,Male,No,Thur,Lunch,2 +28.97,3.0,Male,Yes,Fri,Dinner,2 +22.49,3.5,Male,No,Fri,Dinner,2 +5.75,1.0,Female,Yes,Fri,Dinner,2 +16.32,4.3,Female,Yes,Fri,Dinner,2 +22.75,3.25,Female,No,Fri,Dinner,2 +40.17,4.73,Male,Yes,Fri,Dinner,4 +27.28,4.0,Male,Yes,Fri,Dinner,2 +12.03,1.5,Male,Yes,Fri,Dinner,2 +21.01,3.0,Male,Yes,Fri,Dinner,2 +12.46,1.5,Male,No,Fri,Dinner,2 +11.35,2.5,Female,Yes,Fri,Dinner,2 +15.38,3.0,Female,Yes,Fri,Dinner,2 +44.3,2.5,Female,Yes,Sat,Dinner,3 +22.42,3.48,Female,Yes,Sat,Dinner,2 +20.92,4.08,Female,No,Sat,Dinner,2 +15.36,1.64,Male,Yes,Sat,Dinner,2 +20.49,4.06,Male,Yes,Sat,Dinner,2 +25.21,4.29,Male,Yes,Sat,Dinner,2 +18.24,3.76,Male,No,Sat,Dinner,2 +14.31,4.0,Female,Yes,Sat,Dinner,2 +14.0,3.0,Male,No,Sat,Dinner,2 +7.25,1.0,Female,No,Sat,Dinner,1 +38.07,4.0,Male,No,Sun,Dinner,3 +23.95,2.55,Male,No,Sun,Dinner,2 +25.71,4.0,Female,No,Sun,Dinner,3 +17.31,3.5,Female,No,Sun,Dinner,2 +29.93,5.07,Male,No,Sun,Dinner,4 +10.65,1.5,Female,No,Thur,Lunch,2 +12.43,1.8,Female,No,Thur,Lunch,2 +24.08,2.92,Female,No,Thur,Lunch,4 +11.69,2.31,Male,No,Thur,Lunch,2 +13.42,1.68,Female,No,Thur,Lunch,2 +14.26,2.5,Male,No,Thur,Lunch,2 +15.95,2.0,Male,No,Thur,Lunch,2 +12.48,2.52,Female,No,Thur,Lunch,2 +29.8,4.2,Female,No,Thur,Lunch,6 +8.52,1.48,Male,No,Thur,Lunch,2 +14.52,2.0,Female,No,Thur,Lunch,2 +11.38,2.0,Female,No,Thur,Lunch,2 +22.82,2.18,Male,No,Thur,Lunch,3 +19.08,1.5,Male,No,Thur,Lunch,2 +20.27,2.83,Female,No,Thur,Lunch,2 +11.17,1.5,Female,No,Thur,Lunch,2 +12.26,2.0,Female,No,Thur,Lunch,2 +18.26,3.25,Female,No,Thur,Lunch,2 +8.51,1.25,Female,No,Thur,Lunch,2 +10.33,2.0,Female,No,Thur,Lunch,2 +14.15,2.0,Female,No,Thur,Lunch,2 +16.0,2.0,Male,Yes,Thur,Lunch,2 +13.16,2.75,Female,No,Thur,Lunch,2 +17.47,3.5,Female,No,Thur,Lunch,2 +34.3,6.7,Male,No,Thur,Lunch,6 +41.19,5.0,Male,No,Thur,Lunch,5 +27.05,5.0,Female,No,Thur,Lunch,6 +16.43,2.3,Female,No,Thur,Lunch,2 +8.35,1.5,Female,No,Thur,Lunch,2 +18.64,1.36,Female,No,Thur,Lunch,3 +11.87,1.63,Female,No,Thur,Lunch,2 +9.78,1.73,Male,No,Thur,Lunch,2 +7.51,2.0,Male,No,Thur,Lunch,2 +14.07,2.5,Male,No,Sun,Dinner,2 
+13.13,2.0,Male,No,Sun,Dinner,2 +17.26,2.74,Male,No,Sun,Dinner,3 +24.55,2.0,Male,No,Sun,Dinner,4 +19.77,2.0,Male,No,Sun,Dinner,4 +29.85,5.14,Female,No,Sun,Dinner,5 +48.17,5.0,Male,No,Sun,Dinner,6 +25.0,3.75,Female,No,Sun,Dinner,4 +13.39,2.61,Female,No,Sun,Dinner,2 +16.49,2.0,Male,No,Sun,Dinner,4 +21.5,3.5,Male,No,Sun,Dinner,4 +12.66,2.5,Male,No,Sun,Dinner,2 +16.21,2.0,Female,No,Sun,Dinner,3 +13.81,2.0,Male,No,Sun,Dinner,2 +17.51,3.0,Female,Yes,Sun,Dinner,2 +24.52,3.48,Male,No,Sun,Dinner,3 +20.76,2.24,Male,No,Sun,Dinner,2 +31.71,4.5,Male,No,Sun,Dinner,4 +10.59,1.61,Female,Yes,Sat,Dinner,2 +10.63,2.0,Female,Yes,Sat,Dinner,2 +50.81,10.0,Male,Yes,Sat,Dinner,3 +15.81,3.16,Male,Yes,Sat,Dinner,2 +7.25,5.15,Male,Yes,Sun,Dinner,2 +31.85,3.18,Male,Yes,Sun,Dinner,2 +16.82,4.0,Male,Yes,Sun,Dinner,2 +32.9,3.11,Male,Yes,Sun,Dinner,2 +17.89,2.0,Male,Yes,Sun,Dinner,2 +14.48,2.0,Male,Yes,Sun,Dinner,2 +9.6,4.0,Female,Yes,Sun,Dinner,2 +34.63,3.55,Male,Yes,Sun,Dinner,2 +34.65,3.68,Male,Yes,Sun,Dinner,4 +23.33,5.65,Male,Yes,Sun,Dinner,2 +45.35,3.5,Male,Yes,Sun,Dinner,3 +23.17,6.5,Male,Yes,Sun,Dinner,4 +40.55,3.0,Male,Yes,Sun,Dinner,2 +20.69,5.0,Male,No,Sun,Dinner,5 +20.9,3.5,Female,Yes,Sun,Dinner,3 +30.46,2.0,Male,Yes,Sun,Dinner,5 +18.15,3.5,Female,Yes,Sun,Dinner,3 +23.1,4.0,Male,Yes,Sun,Dinner,3 +15.69,1.5,Male,Yes,Sun,Dinner,2 +19.81,4.19,Female,Yes,Thur,Lunch,2 +28.44,2.56,Male,Yes,Thur,Lunch,2 +15.48,2.02,Male,Yes,Thur,Lunch,2 +16.58,4.0,Male,Yes,Thur,Lunch,2 +7.56,1.44,Male,No,Thur,Lunch,2 +10.34,2.0,Male,Yes,Thur,Lunch,2 +43.11,5.0,Female,Yes,Thur,Lunch,4 +13.0,2.0,Female,Yes,Thur,Lunch,2 +13.51,2.0,Male,Yes,Thur,Lunch,2 +18.71,4.0,Male,Yes,Thur,Lunch,3 +12.74,2.01,Female,Yes,Thur,Lunch,2 +13.0,2.0,Female,Yes,Thur,Lunch,2 +16.4,2.5,Female,Yes,Thur,Lunch,2 +20.53,4.0,Male,Yes,Thur,Lunch,4 +16.47,3.23,Female,Yes,Thur,Lunch,3 +26.59,3.41,Male,Yes,Sat,Dinner,3 +38.73,3.0,Male,Yes,Sat,Dinner,4 +24.27,2.03,Male,Yes,Sat,Dinner,2 +12.76,2.23,Female,Yes,Sat,Dinner,2 +30.06,2.0,Male,Yes,Sat,Dinner,3 +25.89,5.16,Male,Yes,Sat,Dinner,4 +48.33,9.0,Male,No,Sat,Dinner,4 +13.27,2.5,Female,Yes,Sat,Dinner,2 +28.17,6.5,Female,Yes,Sat,Dinner,3 +12.9,1.1,Female,Yes,Sat,Dinner,2 +28.15,3.0,Male,Yes,Sat,Dinner,5 +11.59,1.5,Male,Yes,Sat,Dinner,2 +7.74,1.44,Male,Yes,Sat,Dinner,2 +30.14,3.09,Female,Yes,Sat,Dinner,4 +12.16,2.2,Male,Yes,Fri,Lunch,2 +13.42,3.48,Female,Yes,Fri,Lunch,2 +8.58,1.92,Male,Yes,Fri,Lunch,1 +15.98,3.0,Female,No,Fri,Lunch,3 +13.42,1.58,Male,Yes,Fri,Lunch,2 +16.27,2.5,Female,Yes,Fri,Lunch,2 +10.09,2.0,Female,Yes,Fri,Lunch,2 +20.45,3.0,Male,No,Sat,Dinner,4 +13.28,2.72,Male,No,Sat,Dinner,2 +22.12,2.88,Female,Yes,Sat,Dinner,2 +24.01,2.0,Male,Yes,Sat,Dinner,4 +15.69,3.0,Male,Yes,Sat,Dinner,3 +11.61,3.39,Male,No,Sat,Dinner,2 +10.77,1.47,Male,No,Sat,Dinner,2 +15.53,3.0,Male,Yes,Sat,Dinner,2 +10.07,1.25,Male,No,Sat,Dinner,2 +12.6,1.0,Male,Yes,Sat,Dinner,2 +32.83,1.17,Male,Yes,Sat,Dinner,2 +35.83,4.67,Female,No,Sat,Dinner,3 +29.03,5.92,Male,No,Sat,Dinner,3 +27.18,2.0,Female,Yes,Sat,Dinner,2 +22.67,2.0,Male,Yes,Sat,Dinner,2 +17.82,1.75,Male,No,Sat,Dinner,2 +18.78,3.0,Female,No,Thur,Dinner,2 diff --git a/pandas/io/tests/data/unicode_series.csv b/pandas/io/tests/parser/data/unicode_series.csv similarity index 100% rename from pandas/io/tests/data/unicode_series.csv rename to pandas/io/tests/parser/data/unicode_series.csv diff --git a/pandas/io/tests/data/utf16_ex.txt b/pandas/io/tests/parser/data/utf16_ex.txt similarity index 100% rename from pandas/io/tests/data/utf16_ex.txt rename to 
pandas/io/tests/parser/data/utf16_ex.txt diff --git a/pandas/io/tests/parser/header.py b/pandas/io/tests/parser/header.py new file mode 100644 index 0000000000000..e3c408f0af907 --- /dev/null +++ b/pandas/io/tests/parser/header.py @@ -0,0 +1,275 @@ +# -*- coding: utf-8 -*- + +""" +Tests that the file header is properly handled or inferred +during parsing for all of the parsers defined in parsers.py +""" + +import numpy as np +import pandas.util.testing as tm + +from pandas import DataFrame, Index, MultiIndex +from pandas.compat import StringIO, lrange, u + + +class HeaderTests(object): + + def test_read_with_bad_header(self): + errmsg = "but only \d+ lines in file" + + with tm.assertRaisesRegexp(ValueError, errmsg): + s = StringIO(',,') + self.read_csv(s, header=[10]) + + def test_bool_header_arg(self): + # see gh-6114 + data = """\ +MyColumn + a + b + a + b""" + for arg in [True, False]: + with tm.assertRaises(TypeError): + self.read_csv(StringIO(data), header=arg) + with tm.assertRaises(TypeError): + self.read_table(StringIO(data), header=arg) + + def test_no_header_prefix(self): + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + df_pref = self.read_table(StringIO(data), sep=',', prefix='Field', + header=None) + + expected = [[1, 2, 3, 4, 5.], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]] + tm.assert_almost_equal(df_pref.values, expected) + + self.assert_numpy_array_equal( + df_pref.columns, ['Field0', 'Field1', 'Field2', + 'Field3', 'Field4']) + + def test_header_with_index_col(self): + data = """foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + names = ['A', 'B', 'C'] + df = self.read_csv(StringIO(data), names=names) + + self.assertEqual(names, ['A', 'B', 'C']) + + values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + expected = DataFrame(values, index=['foo', 'bar', 'baz'], + columns=['A', 'B', 'C']) + tm.assert_frame_equal(df, expected) + + def test_header_not_first_line(self): + data = """got,to,ignore,this,line +got,to,ignore,this,line +index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + data2 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + + df = self.read_csv(StringIO(data), header=2, index_col=0) + expected = self.read_csv(StringIO(data2), header=0, index_col=0) + tm.assert_frame_equal(df, expected) + + def test_header_multi_index(self): + expected = tm.makeCustomDataframe( + 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[ + 0, 1], tupleize_cols=False) + tm.assert_frame_equal(df, expected) + + # skipping lines in the header + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[ + 0, 1], tupleize_cols=False) + tm.assert_frame_equal(df, expected) + + # INVALID OPTIONS + + # no as_recarray + self.assertRaises(ValueError, self.read_csv, + StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1], as_recarray=True, + tupleize_cols=False) + + # names + self.assertRaises(ValueError, self.read_csv, + StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1], names=['foo', 'bar'], + tupleize_cols=False) + + # usecols + self.assertRaises(ValueError, self.read_csv, + StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1], usecols=['foo', 'bar'], + tupleize_cols=False) + + # non-numeric 
index_col + self.assertRaises(ValueError, self.read_csv, + StringIO(data), header=[0, 1, 2, 3], + index_col=['foo', 'bar'], tupleize_cols=False) + + def test_header_multiindex_common_format(self): + + df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=['one', 'two'], + columns=MultiIndex.from_tuples( + [('a', 'q'), ('a', 'r'), ('a', 's'), + ('b', 't'), ('c', 'u'), ('c', 'v')])) + + # to_csv + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +,,,,,, +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(df, result) + + # common + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(df, result) + + # common, no index_col + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=None) + tm.assert_frame_equal(df.reset_index(drop=True), result) + + # malformed case 1 + expected = DataFrame(np.array( + [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], + [u('r'), u('s'), u('t'), + u('u'), u('v')]], + labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[u('a'), u('q')])) + + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + # malformed case 2 + expected = DataFrame(np.array( + [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], + [u('r'), u('s'), u('t'), + u('u'), u('v')]], + labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[None, u('q')])) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + # mi on columns and index (malformed) + expected = DataFrame(np.array( + [[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'), + index=MultiIndex(levels=[[1, 7], [2, 8]], + labels=[[0, 1], [0, 1]]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], + [u('s'), u('t'), u('u'), u('v')]], + labels=[[0, 1, 2, 2], [0, 1, 2, 3]], + names=[None, u('q')])) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) + tm.assert_frame_equal(expected, result) + + def test_header_names_backward_compat(self): + # #2539 + data = '1,2,3\n4,5,6' + + result = self.read_csv(StringIO(data), names=['a', 'b', 'c']) + expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None) + tm.assert_frame_equal(result, expected) + + data2 = 'foo,bar,baz\n' + data + result = self.read_csv(StringIO(data2), names=['a', 'b', 'c'], + header=0) + tm.assert_frame_equal(result, expected) + + def test_read_only_header_no_rows(self): + # See gh-7773 + expected = DataFrame(columns=['a', 'b', 'c']) + + df = self.read_csv(StringIO('a,b,c')) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO('a,b,c'), index_col=False) + tm.assert_frame_equal(df, expected) + + def test_no_header(self): + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + df = self.read_table(StringIO(data), sep=',', header=None) + df_pref = self.read_table(StringIO(data), sep=',', prefix='X', + header=None) + + names = ['foo', 'bar', 'baz', 'quux', 'panda'] + df2 
= self.read_table(StringIO(data), sep=',', names=names) + expected = [[1, 2, 3, 4, 5.], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]] + tm.assert_almost_equal(df.values, expected) + tm.assert_almost_equal(df.values, df2.values) + + self.assert_numpy_array_equal(df_pref.columns, + ['X0', 'X1', 'X2', 'X3', 'X4']) + self.assert_numpy_array_equal(df.columns, lrange(5)) + + self.assert_numpy_array_equal(df2.columns, names) diff --git a/pandas/io/tests/parser/index_col.py b/pandas/io/tests/parser/index_col.py new file mode 100644 index 0000000000000..6eb15eb3e043c --- /dev/null +++ b/pandas/io/tests/parser/index_col.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- + +""" +Tests that the specified index column (a.k.a 'index_col') +is properly handled or inferred during parsing for all of +the parsers defined in parsers.py +""" + +import pandas.util.testing as tm + +from pandas import DataFrame, Index, MultiIndex +from pandas.compat import StringIO + + +class IndexColTests(object): + + def test_index_col_named(self): + no_header = """\ +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa + + h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa + data = h + no_header + rs = self.read_csv(StringIO(data), index_col='ID') + xp = self.read_csv(StringIO(data), header=0).set_index('ID') + tm.assert_frame_equal(rs, xp) + + self.assertRaises(ValueError, self.read_csv, StringIO(no_header), + index_col='ID') + + data = """\ +1,2,3,4,hello +5,6,7,8,world +9,10,11,12,foo +""" + names = ['a', 'b', 'c', 'd', 'message'] + xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11], + 'd': [4, 8, 12]}, + index=Index(['hello', 'world', 'foo'], name='message')) + rs = self.read_csv(StringIO(data), names=names, index_col=['message']) + tm.assert_frame_equal(xp, rs) + self.assertEqual(xp.index.name, rs.index.name) + + rs = self.read_csv(StringIO(data), names=names, index_col='message') + tm.assert_frame_equal(xp, rs) + self.assertEqual(xp.index.name, rs.index.name) + + def test_index_col_is_true(self): + # see gh-9798 + self.assertRaises(ValueError, self.read_csv, + StringIO(self.ts_data), index_col=True) + + def test_infer_index_col(self): + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + data = self.read_csv(StringIO(data)) + self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz']))) + + def test_empty_index_col_scenarios(self): + data = 'x,y,z' + + # None, no index + index_col, expected = None, DataFrame([], columns=list('xyz')), + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # False, no index + index_col, expected = False, DataFrame([], columns=list('xyz')), + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # int, first column + index_col, expected = 0, DataFrame( + [], columns=['y', 'z'], index=Index([], name='x')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # int, not first column + index_col, expected = 1, DataFrame( + [], columns=['x', 'z'], index=Index([], name='y')) + tm.assert_frame_equal(self.read_csv( + 
StringIO(data), index_col=index_col), expected) + + # str, first column + index_col, expected = 'x', DataFrame( + [], columns=['y', 'z'], index=Index([], name='x')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # str, not the first column + index_col, expected = 'y', DataFrame( + [], columns=['x', 'z'], index=Index([], name='y')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) + + # list of int + index_col, expected = [0, 1], DataFrame( + [], columns=['z'], index=MultiIndex.from_arrays( + [[]] * 2, names=['x', 'y'])) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), + expected, check_index_type=False) + + # list of str + index_col = ['x', 'y'] + expected = DataFrame([], columns=['z'], + index=MultiIndex.from_arrays( + [[]] * 2, names=['x', 'y'])) + tm.assert_frame_equal(self.read_csv(StringIO( + data), index_col=index_col), + expected, check_index_type=False) + + # list of int, reversed sequence + index_col = [1, 0] + expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( + [[]] * 2, names=['y', 'x'])) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), + expected, check_index_type=False) + + # list of str, reversed sequence + index_col = ['y', 'x'] + expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( + [[]] * 2, names=['y', 'x'])) + tm.assert_frame_equal(self.read_csv(StringIO( + data), index_col=index_col), + expected, check_index_type=False) + + def test_empty_with_index_col_false(self): + # see gh-10413 + data = 'x,y' + result = self.read_csv(StringIO(data), index_col=False) + expected = DataFrame([], columns=['x', 'y']) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/multithread.py b/pandas/io/tests/parser/multithread.py new file mode 100644 index 0000000000000..2aaef889db6de --- /dev/null +++ b/pandas/io/tests/parser/multithread.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +""" +Tests multithreading behaviour for reading and +parsing files for each parser defined in parsers.py +""" + +from __future__ import division +from multiprocessing.pool import ThreadPool + +import numpy as np +import pandas as pd +import pandas.util.testing as tm + +from pandas import DataFrame +from pandas.compat import BytesIO, range + + +def _construct_dataframe(num_rows): + + df = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde')) + df['foo'] = 'foo' + df['bar'] = 'bar' + df['baz'] = 'baz' + df['date'] = pd.date_range('20000101 09:00:00', + periods=num_rows, + freq='s') + df['int'] = np.arange(num_rows, dtype='int64') + return df + + +class MultithreadTests(object): + + def _generate_multithread_dataframe(self, path, num_rows, num_tasks): + + def reader(arg): + start, nrows = arg + + if not start: + return self.read_csv(path, index_col=0, header=0, + nrows=nrows, parse_dates=['date']) + + return self.read_csv(path, + index_col=0, + header=None, + skiprows=int(start) + 1, + nrows=nrows, + parse_dates=[9]) + + tasks = [ + (num_rows * i // num_tasks, + num_rows // num_tasks) for i in range(num_tasks) + ] + + pool = ThreadPool(processes=num_tasks) + + results = pool.map(reader, tasks) + + header = results[0].columns + for r in results[1:]: + r.columns = header + + final_dataframe = pd.concat(results) + + return final_dataframe + + def test_multithread_stringio_read_csv(self): + # see gh-11786 + max_row_range = 10000 + num_files = 100 + + bytes_to_df = [ + '\n'.join( + ['%d,%d,%d' % (i, i, i) 
for i in range(max_row_range)] + ).encode() for j in range(num_files)] + files = [BytesIO(b) for b in bytes_to_df] + + # read all files in many threads + pool = ThreadPool(8) + results = pool.map(self.read_csv, files) + first_result = results[0] + + for result in results: + tm.assert_frame_equal(first_result, result) + + def test_multithread_path_multipart_read_csv(self): + # see gh-11786 + num_tasks = 4 + file_name = '__threadpool_reader__.csv' + num_rows = 100000 + + df = _construct_dataframe(num_rows) + + with tm.ensure_clean(file_name) as path: + df.to_csv(path) + + final_dataframe = self._generate_multithread_dataframe( + path, num_rows, num_tasks) + tm.assert_frame_equal(df, final_dataframe) diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py new file mode 100644 index 0000000000000..853e6242751c9 --- /dev/null +++ b/pandas/io/tests/parser/na_values.py @@ -0,0 +1,365 @@ +# -*- coding: utf-8 -*- + +""" +Tests that NA values are properly handled during +parsing for all of the parsers defined in parsers.py +""" + +import numpy as np +from numpy import nan + +import pandas.io.parsers as parsers +import pandas.util.testing as tm + +from pandas import DataFrame, MultiIndex, read_csv +from pandas.compat import StringIO, range + + +class NAvaluesTests(object): + + def test_string_nas(self): + data = """A,B,C +a,b,c +d,,f +,g,h +""" + result = self.read_csv(StringIO(data)) + expected = DataFrame([['a', 'b', 'c'], + ['d', np.nan, 'f'], + [np.nan, 'g', 'h']], + columns=['A', 'B', 'C']) + + tm.assert_frame_equal(result, expected) + + def test_detect_string_na(self): + data = """A,B +foo,bar +NA,baz +NaN,nan +""" + expected = [['foo', 'bar'], [nan, 'baz'], [nan, nan]] + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + + def test_non_string_na_values(self): + # see gh-3611, na_values that are not a string are an issue + with tm.ensure_clean('__non_string_na_values__.csv') as path: + df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]}) + df.to_csv(path, sep=' ', index=False) + result1 = self.read_csv(path, sep=' ', header=0, + na_values=['-999.0', '-999']) + result2 = self.read_csv(path, sep=' ', header=0, + na_values=[-999, -999.0]) + result3 = self.read_csv(path, sep=' ', header=0, + na_values=[-999.0, -999]) + tm.assert_frame_equal(result1, result2) + tm.assert_frame_equal(result2, result3) + + result4 = self.read_csv( + path, sep=' ', header=0, na_values=['-999.0']) + result5 = self.read_csv( + path, sep=' ', header=0, na_values=['-999']) + result6 = self.read_csv( + path, sep=' ', header=0, na_values=[-999.0]) + result7 = self.read_csv( + path, sep=' ', header=0, na_values=[-999]) + tm.assert_frame_equal(result4, result3) + tm.assert_frame_equal(result5, result3) + tm.assert_frame_equal(result6, result3) + tm.assert_frame_equal(result7, result3) + + good_compare = result3 + + # with an odd float format, so we can't match the string 999.0 + # exactly, but need float matching + # TODO: change these to self.read_csv when Python bug is squashed + df.to_csv(path, sep=' ', index=False, float_format='%.3f') + result1 = read_csv(path, sep=' ', header=0, + na_values=['-999.0', '-999']) + result2 = read_csv(path, sep=' ', header=0, + na_values=[-999.0, -999]) + tm.assert_frame_equal(result1, good_compare) + tm.assert_frame_equal(result2, good_compare) + + result3 = read_csv(path, sep=' ', + header=0, na_values=['-999.0']) + result4 = read_csv(path, sep=' ', + header=0, na_values=['-999']) + result5 = read_csv(path, sep=' ', + 
header=0, na_values=[-999.0]) + result6 = read_csv(path, sep=' ', + header=0, na_values=[-999]) + tm.assert_frame_equal(result3, good_compare) + tm.assert_frame_equal(result4, good_compare) + tm.assert_frame_equal(result5, good_compare) + tm.assert_frame_equal(result6, good_compare) + + def test_default_na_values(self): + _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', + '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'NaN', + 'nan', '-NaN', '-nan', '#N/A N/A', '']) + self.assertEqual(_NA_VALUES, parsers._NA_VALUES) + nv = len(_NA_VALUES) + + def f(i, v): + if i == 0: + buf = '' + elif i > 0: + buf = ''.join([','] * i) + + buf = "{0}{1}".format(buf, v) + + if i < nv - 1: + buf = "{0}{1}".format(buf, ''.join([','] * (nv - i - 1))) + + return buf + + data = StringIO('\n'.join([f(i, v) for i, v in enumerate(_NA_VALUES)])) + expected = DataFrame(np.nan, columns=range(nv), index=range(nv)) + df = self.read_csv(data, header=None) + tm.assert_frame_equal(df, expected) + + def test_custom_na_values(self): + data = """A,B,C +ignore,this,row +1,NA,3 +-1.#IND,5,baz +7,8,NaN +""" + expected = [[1., nan, 3], + [nan, 5, nan], + [7, 8, nan]] + + df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) + tm.assert_almost_equal(df.values, expected) + + df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'], + skiprows=[1]) + tm.assert_almost_equal(df2.values, expected) + + df3 = self.read_table(StringIO(data), sep=',', na_values='baz', + skiprows=[1]) + tm.assert_almost_equal(df3.values, expected) + + def test_bool_na_values(self): + data = """A,B,C +True,False,True +NA,True,False +False,NA,True""" + + result = self.read_csv(StringIO(data)) + expected = DataFrame({'A': np.array([True, nan, False], dtype=object), + 'B': np.array([False, True, nan], dtype=object), + 'C': [True, False, True]}) + + tm.assert_frame_equal(result, expected) + + def test_na_value_dict(self): + data = """A,B,C +foo,bar,NA +bar,foo,foo +foo,bar,NA +bar,foo,foo""" + + df = self.read_csv(StringIO(data), + na_values={'A': ['foo'], 'B': ['bar']}) + expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'], + 'B': [np.nan, 'foo', np.nan, 'foo'], + 'C': [np.nan, 'foo', np.nan, 'foo']}) + tm.assert_frame_equal(df, expected) + + data = """\ +a,b,c,d +0,NA,1,5 +""" + xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0]) + xp.index.name = 'a' + df = self.read_csv(StringIO(data), na_values={}, index_col=0) + tm.assert_frame_equal(df, xp) + + xp = DataFrame({'b': [np.nan], 'd': [5]}, + MultiIndex.from_tuples([(0, 1)])) + xp.index.names = ['a', 'c'] + df = self.read_csv(StringIO(data), na_values={}, index_col=[0, 2]) + tm.assert_frame_equal(df, xp) + + xp = DataFrame({'b': [np.nan], 'd': [5]}, + MultiIndex.from_tuples([(0, 1)])) + xp.index.names = ['a', 'c'] + df = self.read_csv(StringIO(data), na_values={}, index_col=['a', 'c']) + tm.assert_frame_equal(df, xp) + + def test_na_values_keep_default(self): + data = """\ +One,Two,Three +a,1,one +b,2,two +,3,three +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = self.read_csv(StringIO(data)) + xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['one', 'two', 'three', np.nan, 'five', + np.nan, 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []}, + keep_default_na=False) + xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['one', 'two', 'three', 'nan', 'five', + '', 
'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = self.read_csv( + StringIO(data), na_values=['a'], keep_default_na=False) + xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['one', 'two', 'three', 'nan', 'five', '', + 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []}) + xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['one', 'two', 'three', np.nan, 'five', + np.nan, 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + # see gh-4318: passing na_values=None and + # keep_default_na=False yields 'None' as a na_value + data = """\ +One,Two,Three +a,1,None +b,2,two +,3,None +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = self.read_csv( + StringIO(data), keep_default_na=False) + xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['None', 'two', 'None', 'nan', 'five', '', + 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + def test_skiprow_with_newline(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line 11 +line 12",2 +2,"line 21 +line 22",2 +3,"line 31",1""" + expected = [[2, 'line 21\nline 22', 2], + [3, 'line 31', 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = ('a,b,c\n~a\n b~,~e\n d~,' + '~f\n f~\n1,2,~12\n 13\n 14~') + expected = [['a\n b', 'e\n d', 'f\n f']] + expected = DataFrame(expected, columns=[ + 'a', 'b', 'c']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[2]) + tm.assert_frame_equal(df, expected) + + data = ('Text,url\n~example\n ' + 'sentence\n one~,url1\n~' + 'example\n sentence\n two~,url2\n~' + 'example\n sentence\n three~,url3') + expected = [['example\n sentence\n two', 'url2']] + expected = DataFrame(expected, columns=[ + 'Text', 'url']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[1, 3]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_quote(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line '11' line 12",2 +2,"line '21' line 22",2 +3,"line '31' line 32",1""" + expected = [[2, "line '21' line 22", 2], + [3, "line '31' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_newline_and_quote(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line \n'11' line 12",2 +2,"line \n'21' line 22",2 +3,"line \n'31' line 32",1""" + expected = [[2, "line \n'21' line 22", 2], + [3, "line \n'31' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' line 12",2 +2,"line '21\n' line 22",2 +3,"line '31\n' line 32",1""" + expected = [[2, "line '21\n' line 22", 2], + [3, "line '31\n' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' \r\tline 12",2 +2,"line '21\n' \r\tline 22",2 +3,"line '31\n' \r\tline 32",1""" + expected = 
[[2, "line '21\n' \r\tline 22", 2], + [3, "line '31\n' \r\tline 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + def test_skiprows_lineterminator(self): + # see gh-9079 + data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', + '2007/01/01 01:00 0.2140 U M ', + '2007/01/01 02:00 0.2141 M O ', + '2007/01/01 04:00 0.2142 D M ']) + expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], + ['2007/01/01', '02:00', 0.2141, 'M', 'O'], + ['2007/01/01', '04:00', 0.2142, 'D', 'M']], + columns=['date', 'time', 'var', 'flag', + 'oflag']) + + # test with default line terminators "LF" and "CRLF" + df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(data.replace('\n', '\r\n')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + + # "CR" is not respected with the Python parser yet + if self.engine == 'c': + df = self.read_csv(StringIO(data.replace('\n', '\r')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py new file mode 100644 index 0000000000000..ec368bb358ad5 --- /dev/null +++ b/pandas/io/tests/parser/parse_dates.py @@ -0,0 +1,469 @@ +# -*- coding: utf-8 -*- + +""" +Tests date parsing functionality for all of the +parsers defined in parsers.py +""" + +from distutils.version import LooseVersion +from datetime import datetime + +import nose +import numpy as np +import pandas.lib as lib +from pandas.lib import Timestamp + +import pandas as pd +import pandas.io.parsers as parsers +import pandas.tseries.tools as tools +import pandas.util.testing as tm + +from pandas import DataFrame, Series, Index, DatetimeIndex +from pandas import compat +from pandas.compat import(parse_date, StringIO, + lrange, lmap) +from pandas.tseries.index import date_range + + +class ParseDatesTests(object): + def test_separator_date_conflict(self): + # Regression test for gh-4678: make sure thousands separator and + # date parsing do not conflict. 
+ data = '06-02-2013;13:00;1-000.215' + expected = DataFrame( + [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], + columns=['Date', 2] + ) + + df = self.read_csv(StringIO(data), sep=';', thousands='-', + parse_dates={'Date': [0, 1]}, header=None) + tm.assert_frame_equal(df, expected) + + def test_multiple_date_col(self): + # Can use multiple date parsers + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + def func(*date_cols): + return lib.try_parse_dates(parsers._concat_date_cols(date_cols)) + + df = self.read_csv(StringIO(data), header=None, + date_parser=func, + prefix='X', + parse_dates={'nominal': [1, 2], + 'actual': [1, 3]}) + self.assertIn('nominal', df) + self.assertIn('actual', df) + self.assertNotIn('X1', df) + self.assertNotIn('X2', df) + self.assertNotIn('X3', df) + + d = datetime(1999, 1, 27, 19, 0) + self.assertEqual(df.ix[0, 'nominal'], d) + + df = self.read_csv(StringIO(data), header=None, + date_parser=func, + parse_dates={'nominal': [1, 2], + 'actual': [1, 3]}, + keep_date_col=True) + self.assertIn('nominal', df) + self.assertIn('actual', df) + + self.assertIn(1, df) + self.assertIn(2, df) + self.assertIn(3, df) + + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + df = self.read_csv(StringIO(data), header=None, + prefix='X', parse_dates=[[1, 2], [1, 3]]) + + self.assertIn('X1_X2', df) + self.assertIn('X1_X3', df) + self.assertNotIn('X1', df) + self.assertNotIn('X2', df) + self.assertNotIn('X3', df) + + d = datetime(1999, 1, 27, 19, 0) + self.assertEqual(df.ix[0, 'X1_X2'], d) + + df = self.read_csv(StringIO(data), header=None, + parse_dates=[[1, 2], [1, 3]], keep_date_col=True) + + self.assertIn('1_2', df) + self.assertIn('1_3', df) + self.assertIn(1, df) + self.assertIn(2, df) + self.assertIn(3, df) + + data = '''\ +KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +''' + df = self.read_csv(StringIO(data), sep=',', header=None, + parse_dates=[1], index_col=1) + d = datetime(1999, 1, 27, 19, 0) + self.assertEqual(df.index[0], d) + + def test_multiple_date_cols_int_cast(self): + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + date_spec = {'nominal': 
[1, 2], 'actual': [1, 3]} + import pandas.io.date_converters as conv + + # it works! + df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec, + date_parser=conv.parse_date_time) + self.assertIn('nominal', df) + + def test_multiple_date_col_timestamp_parse(self): + data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 +05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" + result = self.read_csv(StringIO(data), sep=',', header=None, + parse_dates=[[0, 1]], date_parser=Timestamp) + + ex_val = Timestamp('05/31/2012 15:30:00.029') + self.assertEqual(result['0_1'][0], ex_val) + + def test_multiple_date_cols_with_header(self): + data = """\ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) + self.assertNotIsInstance(df.nominal[0], compat.string_types) + + ts_data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + def test_multiple_date_col_name_collision(self): + self.assertRaises(ValueError, self.read_csv, StringIO(self.ts_data), + parse_dates={'ID': [1, 2]}) + + data = """\ +date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa + + self.assertRaises(ValueError, self.read_csv, StringIO(data), + parse_dates=[[1, 2]]) + + def test_date_parser_int_bug(self): + # See gh-3071 + log_file = StringIO( + 'posix_timestamp,elapsed,sys,user,queries,query_time,rows,' + 'accountid,userid,contactid,level,silo,method\n' + '1343103150,0.062353,0,4,6,0.01690,3,' + '12345,1,-1,3,invoice_InvoiceResource,search\n' + ) + + def f(posix_string): + return datetime.utcfromtimestamp(int(posix_string)) + + # it works! 
+ self.read_csv(log_file, index_col=0, parse_dates=[0], date_parser=f) + + def test_nat_parse(self): + # See gh-3062 + df = DataFrame(dict({ + 'A': np.asarray(lrange(10), dtype='float64'), + 'B': pd.Timestamp('20010101')})) + df.iloc[3:6, :] = np.nan + + with tm.ensure_clean('__nat_parse_.csv') as path: + df.to_csv(path) + result = self.read_csv(path, index_col=0, parse_dates=['B']) + tm.assert_frame_equal(result, df) + + expected = Series(dict(A='float64', B='datetime64[ns]')) + tm.assert_series_equal(expected, result.dtypes) + + # test with NaT for the nan_rep + # we don't have a method to specify the Datetime na_rep (it defaults + # to '') + df.to_csv(path) + result = self.read_csv(path, index_col=0, parse_dates=['B']) + tm.assert_frame_equal(result, df) + + def test_csv_custom_parser(self): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + f = lambda x: datetime.strptime(x, '%Y%m%d') + df = self.read_csv(StringIO(data), date_parser=f) + expected = self.read_csv(StringIO(data), parse_dates=True) + tm.assert_frame_equal(df, expected) + + def test_parse_dates_implicit_first_col(self): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + df = self.read_csv(StringIO(data), parse_dates=True) + expected = self.read_csv(StringIO(data), index_col=0, parse_dates=True) + self.assertIsInstance( + df.index[0], (datetime, np.datetime64, Timestamp)) + tm.assert_frame_equal(df, expected) + + def test_parse_dates_string(self): + data = """date,A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + rs = self.read_csv( + StringIO(data), index_col='date', parse_dates=['date']) + idx = date_range('1/1/2009', periods=3) + idx.name = 'date' + xp = DataFrame({'A': ['a', 'b', 'c'], + 'B': [1, 3, 4], + 'C': [2, 4, 5]}, idx) + tm.assert_frame_equal(rs, xp) + + def test_yy_format_with_yearfirst(self): + data = """date,time,B,C +090131,0010,1,2 +090228,1020,3,4 +090331,0830,5,6 +""" + + # See gh-217 + import dateutil + if dateutil.__version__ >= LooseVersion('2.5.0'): + raise nose.SkipTest("testing yearfirst=True is not supported " + "on dateutil < 2.5.0; this works but " + "is wrong") + + rs = self.read_csv(StringIO(data), index_col=0, + parse_dates=[['date', 'time']]) + idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), + datetime(2009, 2, 28, 10, 20, 0), + datetime(2009, 3, 31, 8, 30, 0)], + dtype=object, name='date_time') + xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) + tm.assert_frame_equal(rs, xp) + + rs = self.read_csv(StringIO(data), index_col=0, + parse_dates=[[0, 1]]) + idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), + datetime(2009, 2, 28, 10, 20, 0), + datetime(2009, 3, 31, 8, 30, 0)], + dtype=object, name='date_time') + xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) + tm.assert_frame_equal(rs, xp) + + def test_parse_dates_column_list(self): + from pandas.core.datetools import to_datetime + + data = '''date;destination;ventilationcode;unitcode;units;aux_date +01/01/2010;P;P;50;1;12/1/2011 +01/01/2010;P;R;50;1;13/1/2011 +15/01/2010;P;P;50;1;14/1/2011 +01/05/2010;P;P;50;1;15/1/2011''' + + expected = self.read_csv(StringIO(data), sep=";", index_col=lrange(4)) + + lev = expected.index.levels[0] + levels = list(expected.index.levels) + levels[0] = lev.to_datetime(dayfirst=True) + # hack to get this to work - remove for final test + levels[0].name = lev.name + expected.index.set_levels(levels, inplace=True) + expected['aux_date'] = to_datetime(expected['aux_date'], + dayfirst=True) + expected['aux_date'] = lmap(Timestamp, expected['aux_date'])
+ tm.assertIsInstance(expected['aux_date'][0], datetime) + + df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4), + parse_dates=[0, 5], dayfirst=True) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4), + parse_dates=['date', 'aux_date'], dayfirst=True) + tm.assert_frame_equal(df, expected) + + def test_multi_index_parse_dates(self): + data = """index1,index2,A,B,C +20090101,one,a,1,2 +20090101,two,b,3,4 +20090101,three,c,4,5 +20090102,one,a,1,2 +20090102,two,b,3,4 +20090102,three,c,4,5 +20090103,one,a,1,2 +20090103,two,b,3,4 +20090103,three,c,4,5 +""" + df = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True) + self.assertIsInstance(df.index.levels[0][0], + (datetime, np.datetime64, Timestamp)) + + # specify columns out of order! + df2 = self.read_csv(StringIO(data), index_col=[1, 0], parse_dates=True) + self.assertIsInstance(df2.index.levels[1][0], + (datetime, np.datetime64, Timestamp)) + + def test_parse_dates_custom_euroformat(self): + text = """foo,bar,baz +31/01/2010,1,2 +01/02/2010,1,NA +02/02/2010,1,2 +""" + parser = lambda d: parse_date(d, dayfirst=True) + df = self.read_csv(StringIO(text), + names=['time', 'Q', 'NTU'], header=0, + index_col=0, parse_dates=True, + date_parser=parser, na_values=['NA']) + + exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1), + datetime(2010, 2, 2)], name='time') + expected = DataFrame({'Q': [1, 1, 1], 'NTU': [2, np.nan, 2]}, + index=exp_index, columns=['Q', 'NTU']) + tm.assert_frame_equal(df, expected) + + parser = lambda d: parse_date(d, day_first=True) + self.assertRaises(TypeError, self.read_csv, + StringIO(text), skiprows=[0], + names=['time', 'Q', 'NTU'], index_col=0, + parse_dates=True, date_parser=parser, + na_values=['NA']) + + def test_parse_tz_aware(self): + # See gh-1693 + import pytz + data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5") + + # it works + result = self.read_csv(data, index_col=0, parse_dates=True) + stamp = result.index[0] + self.assertEqual(stamp.minute, 39) + try: + self.assertIs(result.index.tz, pytz.utc) + except AssertionError: # hello Yaroslav + arr = result.index.to_pydatetime() + result = tools.to_datetime(arr, utc=True)[0] + self.assertEqual(stamp.minute, result.minute) + self.assertEqual(stamp.hour, result.hour) + self.assertEqual(stamp.day, result.day) + + def test_multiple_date_cols_index(self): + data = """ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + xp = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) + df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, + index_col='nominal') + tm.assert_frame_equal(xp.set_index('nominal'), df) + df2 = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, + index_col=0) + tm.assert_frame_equal(df2, df) + + df3 = self.read_csv(StringIO(data), parse_dates=[[1, 2]], index_col=0) + tm.assert_frame_equal(df3, df, check_names=False) + + def test_multiple_date_cols_chunked(self): + df = self.read_csv(StringIO(self.ts_data), parse_dates={ + 'nominal': [1, 
2]}, index_col='nominal') + reader = self.read_csv(StringIO(self.ts_data), + parse_dates={'nominal': [1, 2]}, + index_col='nominal', chunksize=2) + + chunks = list(reader) + + self.assertNotIn('nominalTime', df) + + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + def test_multiple_date_col_named_components(self): + xp = self.read_csv(StringIO(self.ts_data), + parse_dates={'nominal': [1, 2]}, + index_col='nominal') + colspec = {'nominal': ['date', 'nominalTime']} + df = self.read_csv(StringIO(self.ts_data), parse_dates=colspec, + index_col='nominal') + tm.assert_frame_equal(df, xp) + + def test_multiple_date_col_multiple_index(self): + df = self.read_csv(StringIO(self.ts_data), + parse_dates={'nominal': [1, 2]}, + index_col=['nominal', 'ID']) + + xp = self.read_csv(StringIO(self.ts_data), + parse_dates={'nominal': [1, 2]}) + + tm.assert_frame_equal(xp.set_index(['nominal', 'ID']), df) + + def test_read_with_parse_dates_scalar_non_bool(self): + # See gh-5636 + errmsg = ("Only booleans, lists, and " + "dictionaries are accepted " + "for the 'parse_dates' parameter") + data = """A,B,C + 1,2,2003-11-1""" + + tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, + StringIO(data), parse_dates="C") + tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, + StringIO(data), parse_dates="C", + index_col="C") + + def test_read_with_parse_dates_invalid_type(self): + errmsg = ("Only booleans, lists, and " + "dictionaries are accepted " + "for the 'parse_dates' parameter") + data = """A,B,C + 1,2,2003-11-1""" + + tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, + StringIO(data), parse_dates=(1,)) + tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, + StringIO(data), parse_dates=np.array([4, 5])) + tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, + StringIO(data), parse_dates=set([1, 3, 3])) diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py new file mode 100644 index 0000000000000..7d1793c429f4e --- /dev/null +++ b/pandas/io/tests/parser/python_parser_only.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- + +""" +Tests that apply specifically to the Python parser. Unless specifically +stated as a Python-specific issue, the goal is to eventually move as many of +these tests out of this module as soon as the C parser can accept further +arguments when parsing. +""" + +import sys +import nose + +import pandas.util.testing as tm +from pandas import DataFrame, Index +from pandas import compat +from pandas.compat import StringIO, BytesIO, u + + +class PythonParserTests(object): + def test_negative_skipfooter_raises(self): + text = """#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + + with tm.assertRaisesRegexp( + ValueError, 'skip footer cannot be negative'): + self.read_csv(StringIO(text), skipfooter=-1) + + def test_sniff_delimiter(self): + text = """index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + data = self.read_csv(StringIO(text), index_col=0, sep=None) + self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz']))) + + data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|') + tm.assert_frame_equal(data, data2) + + text = """ignore this +ignore this too +index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + data3 = self.read_csv(StringIO(text), index_col=0, + sep=None, skiprows=2) + tm.assert_frame_equal(data, data3) + + text = u("""ignore this +ignore this too +index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""").encode('utf-8') + + s = BytesIO(text) + if compat.PY3: + # somewhat False since the code never sees bytes + from io import TextIOWrapper + s = TextIOWrapper(s, encoding='utf-8') + + data4 = self.read_csv(s, index_col=0, sep=None, skiprows=2, + encoding='utf-8') + tm.assert_frame_equal(data, data4) + + def test_BytesIO_input(self): + if not compat.PY3: + raise nose.SkipTest( + "Bytes-related test - only needs to work on Python 3") + + data = BytesIO("שלום::1234\n562::123".encode('cp1255')) + result = self.read_table(data, sep="::", encoding='cp1255') + expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) + tm.assert_frame_equal(result, expected) + + def test_single_line(self): + # see gh-6607: sniff separator + + buf = StringIO() + sys.stdout = buf + + try: + df = self.read_csv(StringIO('1,2'), names=['a', 'b'], + header=None, sep=None) + tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df) + finally: + sys.stdout = sys.__stdout__ + + def test_skip_footer(self): + # see gh-6607 + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +want to skip this +also also skip this +""" + result = self.read_csv(StringIO(data), skip_footer=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = self.read_csv(StringIO(no_footer)) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), nrows=3) + tm.assert_frame_equal(result, expected) + + # skipfooter alias + result = self.read_csv(StringIO(data), skipfooter=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = self.read_csv(StringIO(no_footer)) + tm.assert_frame_equal(result, expected) + + def test_decompression_regex_sep(self): + # see gh-6607 + + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest('need gzip and bz2 to run') + + data = open(self.csv1, 'rb').read() + data = data.replace(b',', b'::') + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + tmp = gzip.GzipFile(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, sep='::', compression='gzip') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + tmp = bz2.BZ2File(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, sep='::', compression='bz2') + tm.assert_frame_equal(result, expected) + + self.assertRaises(ValueError, self.read_csv, + path, compression='bz3') + + def test_read_table_buglet_4x_multiindex(self): + # see gh-6607 + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + df = self.read_table(StringIO(text), sep='\s+') + self.assertEqual(df.index.names, ('one', 'two', 'three', 
+                                         'four'))
+
+        # see gh-6893
+        data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
+        expected = DataFrame.from_records(
+            [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
+            columns=list('abcABC'), index=list('abc'))
+        actual = self.read_table(StringIO(data), sep='\s+')
+        tm.assert_frame_equal(actual, expected)
diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py
new file mode 100644
index 0000000000000..3e585a9a623c9
--- /dev/null
+++ b/pandas/io/tests/parser/skiprows.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that skipped rows are properly handled during
+parsing for all of the parsers defined in parsers.py
+"""
+
+from datetime import datetime
+
+import numpy as np
+
+import pandas.util.testing as tm
+
+from pandas import DataFrame
+from pandas.compat import StringIO, range, lrange
+
+
+class SkipRowsTests(object):
+
+    def test_skiprows_bug(self):
+        # see gh-505
+        text = """#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+#foo,a,b,c
+1/1/2000,1.,2.,3.
+1/2/2000,4,5,6
+1/3/2000,7,8,9
+"""
+        data = self.read_csv(StringIO(text), skiprows=lrange(6), header=None,
+                             index_col=0, parse_dates=True)
+
+        data2 = self.read_csv(StringIO(text), skiprows=6, header=None,
+                              index_col=0, parse_dates=True)
+
+        expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
+                             columns=[1, 2, 3],
+                             index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
+                                    datetime(2000, 1, 3)])
+        expected.index.name = 0
+        tm.assert_frame_equal(data, expected)
+        tm.assert_frame_equal(data, data2)
+
+    def test_deep_skiprows(self):
+        # see gh-4382
+        text = "a,b,c\n" + \
+            "\n".join([",".join([str(i), str(i + 1), str(i + 2)])
+                       for i in range(10)])
+        condensed_text = "a,b,c\n" + \
+            "\n".join([",".join([str(i), str(i + 1), str(i + 2)])
+                       for i in [0, 1, 2, 3, 4, 6, 8, 9]])
+        data = self.read_csv(StringIO(text), skiprows=[6, 8])
+        condensed_data = self.read_csv(StringIO(condensed_text))
+        tm.assert_frame_equal(data, condensed_data)
+
+    def test_skiprows_blank(self):
+        # see gh-9832
+        text = """#foo,a,b,c
+#foo,a,b,c
+
+#foo,a,b,c
+#foo,a,b,c
+
+1/1/2000,1.,2.,3.
+1/2/2000,4,5,6
+1/3/2000,7,8,9
+"""
+        data = self.read_csv(StringIO(text), skiprows=6, header=None,
+                             index_col=0, parse_dates=True)
+
+        expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
+                             columns=[1, 2, 3],
+                             index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
+                                    datetime(2000, 1, 3)])
+        expected.index.name = 0
+        tm.assert_frame_equal(data, expected)
diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py
new file mode 100644
index 0000000000000..f0c8f417eb92f
--- /dev/null
+++ b/pandas/io/tests/parser/test_network.py
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests parsers' ability to read and parse non-local files;
+these tests hence require a network connection to run.
+""" + +import os +import nose + +import pandas.util.testing as tm +from pandas import DataFrame +from pandas import compat +from pandas.io.parsers import read_csv, read_table + + +class TestUrlGz(tm.TestCase): + + def setUp(self): + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'salary.table') + self.local_table = read_table(localtable) + + @tm.network + def test_url_gz(self): + url = ('https://raw.github.com/pydata/pandas/' + 'master/pandas/io/tests/data/salary.table.gz') + url_table = read_table(url, compression="gzip", engine="python") + tm.assert_frame_equal(url_table, self.local_table) + + @tm.network + def test_url_gz_infer(self): + url = 'https://s3.amazonaws.com/pandas-test/salary.table.gz' + url_table = read_table(url, compression="infer", engine="python") + tm.assert_frame_equal(url_table, self.local_table) + + +class TestS3(tm.TestCase): + + def setUp(self): + try: + import boto # noqa + except ImportError: + raise nose.SkipTest("boto not installed") + + @tm.network + def test_parse_public_s3_bucket(self): + for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: + if comp == 'bz2' and compat.PY2: + # The Python 2 C parser can't read bz2 from S3. + self.assertRaises(ValueError, read_csv, + 's3://pandas-test/tips.csv' + ext, + compression=comp) + else: + df = read_csv('s3://pandas-test/tips.csv' + + ext, compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')), df) + + # Read public file from bucket with not-public contents + df = read_csv('s3://cant_get_it/tips.csv') + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df) + + @tm.network + def test_parse_public_s3n_bucket(self): + # Read from AWS s3 as "s3n" URL + df = read_csv('s3n://pandas-test/tips.csv', nrows=10) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) + + @tm.network + def test_parse_public_s3a_bucket(self): + # Read from AWS s3 as "s3a" URL + df = read_csv('s3a://pandas-test/tips.csv', nrows=10) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) + + @tm.network + def test_parse_public_s3_bucket_nrows(self): + for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: + if comp == 'bz2' and compat.PY2: + # The Python 2 C parser can't read bz2 from S3. + self.assertRaises(ValueError, read_csv, + 's3://pandas-test/tips.csv' + ext, + compression=comp) + else: + df = read_csv('s3://pandas-test/tips.csv' + + ext, nrows=10, compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) + + @tm.network + def test_parse_public_s3_bucket_chunked(self): + # Read with a chunksize + chunksize = 5 + local_tips = read_csv(tm.get_data_path('tips.csv')) + for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: + if comp == 'bz2' and compat.PY2: + # The Python 2 C parser can't read bz2 from S3. 
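+                # (presumably because Python 2's bz2 module cannot wrap an
+                # already-open file object, the streamed S3 data cannot be
+                # decompressed on the fly, and read_csv raises ValueError)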
+ self.assertRaises(ValueError, read_csv, + 's3://pandas-test/tips.csv' + ext, + compression=comp) + else: + df_reader = read_csv('s3://pandas-test/tips.csv' + ext, + chunksize=chunksize, compression=comp) + self.assertEqual(df_reader.chunksize, chunksize) + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them + # properly. + df = df_reader.get_chunk() + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + true_df = local_tips.iloc[ + chunksize * i_chunk: chunksize * (i_chunk + 1)] + # Chunking doesn't preserve row numbering + true_df = true_df.reset_index().drop('index', axis=1) + tm.assert_frame_equal(true_df, df) + + @tm.network + def test_parse_public_s3_bucket_chunked_python(self): + # Read with a chunksize using the Python parser + chunksize = 5 + local_tips = read_csv(tm.get_data_path('tips.csv')) + for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: + df_reader = read_csv('s3://pandas-test/tips.csv' + ext, + chunksize=chunksize, compression=comp, + engine='python') + self.assertEqual(df_reader.chunksize, chunksize) + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them properly. + df = df_reader.get_chunk() + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + true_df = local_tips.iloc[ + chunksize * i_chunk: chunksize * (i_chunk + 1)] + # Chunking doesn't preserve row numbering + true_df = true_df.reset_index().drop('index', axis=1) + tm.assert_frame_equal(true_df, df) + + @tm.network + def test_parse_public_s3_bucket_python(self): + for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: + df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', + compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')), df) + + @tm.network + def test_infer_s3_compression(self): + for ext in ['', '.gz', '.bz2']: + df = read_csv('s3://pandas-test/tips.csv' + ext, + engine='python', compression='infer') + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')), df) + + @tm.network + def test_parse_public_s3_bucket_nrows_python(self): + for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: + df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', + nrows=10, compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) + + @tm.network + def test_s3_fails(self): + import boto + with tm.assertRaisesRegexp(boto.exception.S3ResponseError, + 'S3ResponseError: 404 Not Found'): + read_csv('s3://nyqpug/asdf.csv') + + # Receive a permission error when trying to read a private bucket. + # It's irrelevant here that this isn't actually a table. 
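+        # (the 403 should surface while the key is opened, before any
+        # parsing starts, so the object's contents never matter)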
+ with tm.assertRaisesRegexp(boto.exception.S3ResponseError, + 'S3ResponseError: 403 Forbidden'): + read_csv('s3://cant_get_it/') + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py new file mode 100644 index 0000000000000..374485b5ddaad --- /dev/null +++ b/pandas/io/tests/parser/test_parsers.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +import os +import nose + +import pandas.util.testing as tm + +from pandas import read_csv, read_table +from pandas.core.common import AbstractMethodError + +from .common import ParserTests +from .header import HeaderTests +from .comment import CommentTests +from .usecols import UsecolsTests +from .skiprows import SkipRowsTests +from .index_col import IndexColTests +from .na_values import NAvaluesTests +from .converters import ConverterTests +from .c_parser_only import CParserTests +from .parse_dates import ParseDatesTests +from .compression import CompressionTests +from .multithread import MultithreadTests +from .python_parser_only import PythonParserTests + + +class BaseParser(CommentTests, CompressionTests, + ConverterTests, HeaderTests, + IndexColTests, MultithreadTests, + NAvaluesTests, ParseDatesTests, + ParserTests, SkipRowsTests, + UsecolsTests): + def read_csv(self, *args, **kwargs): + raise NotImplementedError + + def read_table(self, *args, **kwargs): + raise NotImplementedError + + def float_precision_choices(self): + raise AbstractMethodError(self) + + def setUp(self): + self.dirpath = tm.get_data_path() + self.csv1 = os.path.join(self.dirpath, 'test1.csv') + self.csv2 = os.path.join(self.dirpath, 'test2.csv') + self.xls1 = os.path.join(self.dirpath, 'test.xls') + + +class TestCParserHighMemory(BaseParser, CParserTests, tm.TestCase): + engine = 'c' + low_memory = False + float_precision_choices = [None, 'high', 'round_trip'] + + def read_csv(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = self.engine + kwds['low_memory'] = self.low_memory + return read_csv(*args, **kwds) + + def read_table(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = self.engine + kwds['low_memory'] = self.low_memory + return read_table(*args, **kwds) + + +class TestCParserLowMemory(BaseParser, CParserTests, tm.TestCase): + engine = 'c' + low_memory = True + float_precision_choices = [None, 'high', 'round_trip'] + + def read_csv(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = self.engine + kwds['low_memory'] = self.low_memory + kwds['buffer_lines'] = 2 + return read_csv(*args, **kwds) + + def read_table(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = self.engine + kwds['low_memory'] = True + kwds['buffer_lines'] = 2 + return read_table(*args, **kwds) + + +class TestPythonParser(BaseParser, PythonParserTests, tm.TestCase): + """ + Class for Python parser testing. Unless specifically stated + as a PythonParser-specific issue, the goal is to eventually move + as many of these tests into ParserTests as soon as the C parser + can accept further specific arguments when parsing. 
+ """ + + engine = 'python' + float_precision_choices = [None] + + def read_csv(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = self.engine + return read_csv(*args, **kwds) + + def read_table(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = self.engine + return read_table(*args, **kwds) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/io/tests/parser/test_read_fwf.py new file mode 100644 index 0000000000000..5599188400368 --- /dev/null +++ b/pandas/io/tests/parser/test_read_fwf.py @@ -0,0 +1,347 @@ +# -*- coding: utf-8 -*- + +""" +Tests the 'read_fwf' function in parsers.py. This +test suite is independent of the others because the +engine is set to 'python-fwf' internally. +""" + +from datetime import datetime + +import nose +import numpy as np +import pandas as pd +import pandas.util.testing as tm + +from pandas import DataFrame +from pandas import compat +from pandas.compat import StringIO, BytesIO +from pandas.io.parsers import read_csv, read_fwf + + +class TestFwfParsing(tm.TestCase): + + def test_fwf(self): + data_expected = """\ +2011,58,360.242940,149.910199,11950.7 +2011,59,444.953632,166.985655,11788.4 +2011,60,364.136849,183.628767,11806.2 +2011,61,413.836124,184.375703,11916.8 +2011,62,502.953953,173.237159,12468.3 +""" + expected = read_csv(StringIO(data_expected), + engine='python', header=None) + + data1 = """\ +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + df = read_fwf(StringIO(data1), colspecs=colspecs, header=None) + tm.assert_frame_equal(df, expected) + + data2 = """\ +2011 58 360.242940 149.910199 11950.7 +2011 59 444.953632 166.985655 11788.4 +2011 60 364.136849 183.628767 11806.2 +2011 61 413.836124 184.375703 11916.8 +2011 62 502.953953 173.237159 12468.3 +""" + df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None) + tm.assert_frame_equal(df, expected) + + # From Thomas Kluyver: apparently some non-space filler characters can + # be seen, this is supported by specifying the 'delimiter' character: + # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html + data3 = """\ +201158~~~~360.242940~~~149.910199~~~11950.7 +201159~~~~444.953632~~~166.985655~~~11788.4 +201160~~~~364.136849~~~183.628767~~~11806.2 +201161~~~~413.836124~~~184.375703~~~11916.8 +201162~~~~502.953953~~~173.237159~~~12468.3 +""" + df = read_fwf( + StringIO(data3), colspecs=colspecs, delimiter='~', header=None) + tm.assert_frame_equal(df, expected) + + with tm.assertRaisesRegexp(ValueError, "must specify only one of"): + read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7]) + + with tm.assertRaisesRegexp(ValueError, "Must specify either"): + read_fwf(StringIO(data3), colspecs=None, widths=None) + + def test_BytesIO_input(self): + if not compat.PY3: + raise nose.SkipTest( + "Bytes-related test - only needs to work on Python 3") + + result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[ + 2, 2], encoding='utf8') + expected = DataFrame([["של", "ום"]], columns=["של", "ום"]) + tm.assert_frame_equal(result, expected) + + def test_fwf_colspecs_is_list_or_tuple(self): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 
+qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + with tm.assertRaisesRegexp(TypeError, + 'column specifications must be a list or ' + 'tuple.+'): + pd.io.parsers.FixedWidthReader(StringIO(data), + {'a': 1}, ',', '#') + + def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + with tm.assertRaisesRegexp(TypeError, + 'Each column specification must be.+'): + read_fwf(StringIO(data), [('a', 1)]) + + def test_fwf_colspecs_None(self): + # GH 7079 + data = """\ +123456 +456789 +""" + colspecs = [(0, 3), (3, None)] + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + expected = DataFrame([[123, 456], [456, 789]]) + tm.assert_frame_equal(result, expected) + + colspecs = [(None, 3), (3, 6)] + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + expected = DataFrame([[123, 456], [456, 789]]) + tm.assert_frame_equal(result, expected) + + colspecs = [(0, None), (3, None)] + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + expected = DataFrame([[123456, 456], [456789, 789]]) + tm.assert_frame_equal(result, expected) + + colspecs = [(None, None), (3, 6)] + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + expected = DataFrame([[123456, 456], [456789, 789]]) + tm.assert_frame_equal(result, expected) + + def test_fwf_regression(self): + # GH 3594 + # turns out 'T060' is parsable as a datetime slice! + + tzlist = [1, 10, 20, 30, 60, 80, 100] + ntz = len(tzlist) + tcolspecs = [16] + [8] * ntz + tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]] + data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192 + 2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869 + 2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657 + 2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 + 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 +""" + + df = read_fwf(StringIO(data), + index_col=0, + header=None, + names=tcolnames, + widths=tcolspecs, + parse_dates=True, + date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S')) + + for c in df.columns: + res = df.loc[:, c] + self.assertTrue(len(res)) + + def test_fwf_for_uint8(self): + data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 +1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa + df = read_fwf(StringIO(data), + colspecs=[(0, 17), (25, 26), (33, 37), + (49, 51), (58, 62), (63, 1000)], + names=['time', 'pri', 'pgn', 'dst', 'src', 'data'], + converters={ + 'pgn': lambda x: int(x, 16), + 'src': lambda x: int(x, 16), + 'dst': lambda x: int(x, 16), + 'data': lambda x: len(x.split(' '))}) + + expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8], + [1421302964.226776, 6, 61442, None, 71, 8]], + columns=["time", "pri", "pgn", + "dst", "src", "data"]) + expected["dst"] = expected["dst"].astype(object) + + tm.assert_frame_equal(df, expected) + + def test_fwf_compression(self): + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest("Need gzip and bz2 to run this test") + + data = """1111111111 + 2222222222 + 3333333333""".strip() + widths = [5, 5] + names = ['one', 'two'] + expected = read_fwf(StringIO(data), widths=widths, names=names) + if compat.PY3: + data = bytes(data, encoding='utf-8') + comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)] + for comp_name, compresser in comps: + with 
tm.ensure_clean() as path:
+                tmp = compresser(path, mode='wb')
+                tmp.write(data)
+                tmp.close()
+                result = read_fwf(path, widths=widths, names=names,
+                                  compression=comp_name)
+                tm.assert_frame_equal(result, expected)
+
+    def test_comment_fwf(self):
+        data = """
+ 1 2. 4 #hello world
+ 5 NaN 10.0
+"""
+        expected = [[1, 2., 4],
+                    [5, np.nan, 10.]]
+        df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)],
+                      comment='#')
+        tm.assert_almost_equal(df.values, expected)
+
+    def test_1000_fwf(self):
+        data = """
+ 1 2,334.0 5
+10 13 10.
+"""
+        expected = [[1, 2334., 5],
+                    [10, 13, 10]]
+        df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)],
+                      thousands=',')
+        tm.assert_almost_equal(df.values, expected)
+
+    def test_bool_header_arg(self):
+        # see gh-6114
+        data = """\
+MyColumn
+ a
+ b
+ a
+ b"""
+        for arg in [True, False]:
+            with tm.assertRaises(TypeError):
+                read_fwf(StringIO(data), header=arg)
+
+    def test_full_file(self):
+        # File with all values
+        test = '''index A B C
+2000-01-03T00:00:00 0.980268513777 3 foo
+2000-01-04T00:00:00 1.04791624281 -4 bar
+2000-01-05T00:00:00 0.498580885705 73 baz
+2000-01-06T00:00:00 1.12020151869 1 foo
+2000-01-07T00:00:00 0.487094399463 0 bar
+2000-01-10T00:00:00 0.836648671666 2 baz
+2000-01-11T00:00:00 0.157160753327 34 foo'''
+        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_full_file_with_missing(self):
+        # File with missing values
+        test = '''index A B C
+2000-01-03T00:00:00 0.980268513777 3 foo
+2000-01-04T00:00:00 1.04791624281 -4 bar
+ 0.498580885705 73 baz
+2000-01-06T00:00:00 1.12020151869 1 foo
+2000-01-07T00:00:00 0 bar
+2000-01-10T00:00:00 0.836648671666 2 baz
+ 34'''
+        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_full_file_with_spaces(self):
+        # File with spaces in columns
+        test = '''
+Account Name Balance CreditLimit AccountCreated
+101 Keanu Reeves 9315.45 10000.00 1/17/1998
+312 Gerard Butler 90.00 1000.00 8/6/2003
+868 Jennifer Love Hewitt 0 17000.00 5/25/1985
+761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
+317 Bill Murray 789.65 5000.00 2/5/2007
+'''.strip('\r\n')
+        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_full_file_with_spaces_and_missing(self):
+        # File with spaces and missing values in columns
+        test = '''
+Account Name Balance CreditLimit AccountCreated
+101 10000.00 1/17/1998
+312 Gerard Butler 90.00 1000.00 8/6/2003
+868 5/25/1985
+761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
+317 Bill Murray 789.65
+'''.strip('\r\n')
+        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_messed_up_data(self):
+        # Completely messed up file
+        test = '''
+ Account Name Balance Credit Limit Account Created
+ 101 10000.00 1/17/1998
+ 312 Gerard Butler 90.00 1000.00
+
+ 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
+ 317 Bill Murray 789.65
+'''.strip('\r\n')
+        colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_multiple_delimiters(self):
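+        # Note: as the filler-character test in test_fwf above suggests,
+        # read_fwf's 'delimiter' is best thought of as a set of filler
+        # characters to strip rather than a single separator; here each of
+        # the characters in ' +~.\' counts as padding around the fields.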
+ test = r''' +col1~~~~~col2 col3++++++++++++++++++col4 +~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves + 33+++122.33\\\bar.........Gerard Butler +++44~~~~12.01 baz~~Jennifer Love Hewitt +~~55 11+++foo++++Jada Pinkett-Smith +..66++++++.03~~~bar Bill Murray +'''.strip('\r\n') + colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) + expected = read_fwf(StringIO(test), colspecs=colspecs, + delimiter=' +~.\\') + tm.assert_frame_equal(expected, read_fwf(StringIO(test), + delimiter=' +~.\\')) + + def test_variable_width_unicode(self): + if not compat.PY3: + raise nose.SkipTest( + 'Bytes-related test - only needs to work on Python 3') + test = ''' +שלום שלום +ום שלל +של ום +'''.strip('\r\n') + expected = read_fwf(BytesIO(test.encode('utf8')), + colspecs=[(0, 4), (5, 9)], + header=None, encoding='utf8') + tm.assert_frame_equal(expected, read_fwf( + BytesIO(test.encode('utf8')), header=None, encoding='utf8')) diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/parser/test_textreader.py similarity index 98% rename from pandas/io/tests/test_cparser.py rename to pandas/io/tests/parser/test_textreader.py index ce6fce7b792b5..f3de604f1ec48 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/parser/test_textreader.py @@ -1,12 +1,15 @@ +# -*- coding: utf-8 -*- + """ -C/Cython ascii file parser tests +Tests the TextReader class in parsers.pyx, which +is integral to the C engine in parsers.py """ from pandas.compat import StringIO, BytesIO, map from pandas import compat + import os import sys - import nose from numpy import nan @@ -22,7 +25,7 @@ import pandas.parser as parser -class TestCParser(tm.TestCase): +class TestTextReader(tm.TestCase): def setUp(self): self.dirpath = tm.get_data_path() diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py new file mode 100644 index 0000000000000..1813a95d7a306 --- /dev/null +++ b/pandas/io/tests/parser/test_unsupported.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +""" +Tests that features that are currently unsupported in +either the Python or C parser are actually enforced +and are clearly communicated to the user. + +Ultimately, the goal is to remove test cases from this +test suite as new feature support is added to the parsers. 
+""" + +import nose + +import pandas.io.parsers as parsers +import pandas.util.testing as tm + +from pandas.compat import StringIO +from pandas.io.common import CParserError +from pandas.io.parsers import read_csv, read_table + + +class TestUnsupportedFeatures(tm.TestCase): + def test_c_engine(self): + # see gh-6607 + data = 'a b c\n1 2 3' + msg = 'does not support' + + # specify C-unsupported options with python-unsupported option + # (options will be ignored on fallback, raise) + with tm.assertRaisesRegexp(ValueError, msg): + read_table(StringIO(data), sep=None, + delim_whitespace=False, dtype={'a': float}) + with tm.assertRaisesRegexp(ValueError, msg): + read_table(StringIO(data), sep='\s', dtype={'a': float}) + with tm.assertRaisesRegexp(ValueError, msg): + read_table(StringIO(data), skip_footer=1, dtype={'a': float}) + + # specify C engine with unsupported options (raise) + with tm.assertRaisesRegexp(ValueError, msg): + read_table(StringIO(data), engine='c', + sep=None, delim_whitespace=False) + with tm.assertRaisesRegexp(ValueError, msg): + read_table(StringIO(data), engine='c', sep='\s') + with tm.assertRaisesRegexp(ValueError, msg): + read_table(StringIO(data), engine='c', skip_footer=1) + + # specify C-unsupported options without python-unsupported options + with tm.assert_produces_warning(parsers.ParserWarning): + read_table(StringIO(data), sep=None, delim_whitespace=False) + with tm.assert_produces_warning(parsers.ParserWarning): + read_table(StringIO(data), sep='\s') + with tm.assert_produces_warning(parsers.ParserWarning): + read_table(StringIO(data), skip_footer=1) + + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + msg = 'Error tokenizing data' + + with tm.assertRaisesRegexp(CParserError, msg): + read_table(StringIO(text), sep='\s+') + with tm.assertRaisesRegexp(CParserError, msg): + read_table(StringIO(text), engine='c', sep='\s+') + + msg = "Only length-1 thousands markers supported" + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + with tm.assertRaisesRegexp(ValueError, msg): + read_csv(StringIO(data), thousands=',,') + with tm.assertRaisesRegexp(ValueError, msg): + read_csv(StringIO(data), thousands='') + + msg = "Only length-1 line terminators supported" + data = 'a,b,c~~1,2,3~~4,5,6' + with tm.assertRaisesRegexp(ValueError, msg): + read_csv(StringIO(data), lineterminator='~~') + + def test_python_engine(self): + from pandas.io.parsers import _python_unsupported as py_unsupported + + data = """1,2,3,, +1,2,3,4, +1,2,3,4,5 +1,2,,, +1,2,3,4,""" + engines = 'python', 'python-fwf' + + for engine in engines: + for default in py_unsupported: + msg = ('The %r option is not supported ' + 'with the %r engine' % (default, engine)) + + kwargs = {default: object()} + with tm.assertRaisesRegexp(ValueError, msg): + read_csv(StringIO(data), engine=engine, **kwargs) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py new file mode 100644 index 0000000000000..06275c168becd --- /dev/null +++ b/pandas/io/tests/parser/usecols.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- + +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" + +from datetime import datetime + +import pandas.util.testing as tm + +from pandas import DataFrame +from pandas.lib import Timestamp +from pandas.compat import StringIO + + +class UsecolsTests(object): + + def test_raise_on_mixed_dtype_usecols(self): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + msg = ("The elements of \'usecols\' " + "must either be all strings " + "or all integers") + usecols = [0, 'b', 2] + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(data), usecols=usecols) + + def test_usecols(self): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + + result = self.read_csv(StringIO(data), usecols=(1, 2)) + result2 = self.read_csv(StringIO(data), usecols=('b', 'c')) + exp = self.read_csv(StringIO(data)) + + self.assertEqual(len(result.columns), 2) + self.assertTrue((result['b'] == exp['b']).all()) + self.assertTrue((result['c'] == exp['c']).all()) + + tm.assert_frame_equal(result, result2) + + result = self.read_csv(StringIO(data), usecols=[1, 2], header=0, + names=['foo', 'bar']) + expected = self.read_csv(StringIO(data), usecols=[1, 2]) + expected.columns = ['foo', 'bar'] + tm.assert_frame_equal(result, expected) + + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + result = self.read_csv(StringIO(data), names=['b', 'c'], + header=None, usecols=[1, 2]) + + expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None) + expected = expected[['b', 'c']] + tm.assert_frame_equal(result, expected) + + result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None, usecols=['b', 'c']) + tm.assert_frame_equal(result2, result) + + # see gh-5766 + result = self.read_csv(StringIO(data), names=['a', 'b'], + header=None, usecols=[0, 1]) + + expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None) + expected = expected[['a', 'b']] + tm.assert_frame_equal(result, expected) + + # length conflict, passed names and usecols disagree + self.assertRaises(ValueError, self.read_csv, StringIO(data), + names=['a', 'b'], usecols=[1], header=None) + + def test_usecols_index_col_False(self): + # see gh-9082 + s = "a,b,c,d\n1,2,3,4\n5,6,7,8" + s_malformed = "a,b,c,d\n1,2,3,4,\n5,6,7,8," + cols = ['a', 'c', 'd'] + 
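+        # With index_col=False no column is promoted to the index, so the
+        # usecols selection should be identical with or without a trailing
+        # delimiter on each row (the "malformed" variant).
+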
expected = DataFrame({'a': [1, 5], 'c': [3, 7], 'd': [4, 8]}) + df = self.read_csv(StringIO(s), usecols=cols, index_col=False) + tm.assert_frame_equal(expected, df) + df = self.read_csv(StringIO(s_malformed), + usecols=cols, index_col=False) + tm.assert_frame_equal(expected, df) + + def test_usecols_index_col_conflict(self): + # see gh-4201: test that index_col as integer reflects usecols + data = """SecId,Time,Price,P2,P3 +10000,2013-5-11,100,10,1 +500,2013-5-12,101,11,1 +""" + expected = DataFrame({'Price': [100, 101]}, index=[ + datetime(2013, 5, 11), datetime(2013, 5, 12)]) + expected.index.name = 'Time' + + df = self.read_csv(StringIO(data), usecols=[ + 'Time', 'Price'], parse_dates=True, index_col=0) + tm.assert_frame_equal(expected, df) + + df = self.read_csv(StringIO(data), usecols=[ + 'Time', 'Price'], parse_dates=True, index_col='Time') + tm.assert_frame_equal(expected, df) + + df = self.read_csv(StringIO(data), usecols=[ + 1, 2], parse_dates=True, index_col='Time') + tm.assert_frame_equal(expected, df) + + df = self.read_csv(StringIO(data), usecols=[ + 1, 2], parse_dates=True, index_col=0) + tm.assert_frame_equal(expected, df) + + expected = DataFrame( + {'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) + expected = expected.set_index(['Price', 'P2']) + df = self.read_csv(StringIO(data), usecols=[ + 'Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) + tm.assert_frame_equal(expected, df) + + def test_usecols_implicit_index_col(self): + # see gh-2654 + data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + + result = self.read_csv(StringIO(data), usecols=['a', 'b']) + expected = DataFrame({'a': ['apple', 'orange'], + 'b': ['bat', 'cow']}, index=[4, 8]) + + tm.assert_frame_equal(result, expected) + + def test_usecols_regex_sep(self): + # see gh-2733 + data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' + + df = self.read_csv(StringIO(data), sep='\s+', usecols=('a', 'b')) + + expected = DataFrame({'a': ['apple', 'orange'], + 'b': ['bat', 'cow']}, index=[4, 8]) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_whitespace(self): + data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' + + result = self.read_csv(StringIO(data), delim_whitespace=True, + usecols=('a', 'b')) + expected = DataFrame({'a': ['apple', 'orange'], + 'b': ['bat', 'cow']}, index=[4, 8]) + + tm.assert_frame_equal(result, expected) + + def test_usecols_with_integer_like_header(self): + data = """2,0,1 + 1000,2000,3000 + 4000,5000,6000 + """ + + usecols = [0, 1] # column selection by index + expected = DataFrame(data=[[1000, 2000], + [4000, 5000]], + columns=['2', '0']) + df = self.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(df, expected) + + usecols = ['0', '1'] # column selection by name + expected = DataFrame(data=[[2000, 3000], + [5000, 6000]], + columns=['0', '1']) + df = self.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_parse_dates(self): + # See gh-9755 + s = """a,b,c,d,e + 0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + + cols = { + 'a': [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + df = self.read_csv(StringIO(s), usecols=[0, 2, 3], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(s), usecols=[3, 0, 2], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_parse_dates_and_full_names(self): + # 
See gh-9755 + s = """0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + names = list('abcde') + + cols = { + 'a': [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + df = self.read_csv(StringIO(s), names=names, + usecols=[0, 2, 3], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(s), names=names, + usecols=[3, 0, 2], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_parse_dates_and_usecol_names(self): + # See gh-9755 + s = """0,1,20140101,0900,4 + 0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + names = list('acd') + + cols = { + 'a': [0, 0], + 'c_d': [ + Timestamp('2014-01-01 09:00:00'), + Timestamp('2014-01-02 10:00:00') + ] + } + expected = DataFrame(cols, columns=['c_d', 'a']) + + df = self.read_csv(StringIO(s), names=names, + usecols=[0, 2, 3], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(s), names=names, + usecols=[3, 0, 2], + parse_dates=parse_dates) + tm.assert_frame_equal(df, expected) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py deleted file mode 100755 index 3c1a918bd5628..0000000000000 --- a/pandas/io/tests/test_parsers.py +++ /dev/null @@ -1,5055 +0,0 @@ -# -*- coding: utf-8 -*- -# pylint: disable=E1101 - -# flake8: noqa - -import csv -import os -import platform -from distutils.version import LooseVersion - -import re -import sys -from datetime import datetime -from multiprocessing.pool import ThreadPool - -import nose -import numpy as np -import pandas.lib as lib -from numpy import nan -from numpy.testing.decorators import slow -from pandas.lib import Timestamp - -import pandas as pd -import pandas.io.parsers as parsers -import pandas.tseries.tools as tools -import pandas.util.testing as tm -from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex -from pandas import compat -from pandas.compat import( - StringIO, BytesIO, PY3, range, long, lrange, lmap, u -) -from pandas.compat import parse_date -from pandas.core.common import AbstractMethodError -from pandas.io.common import (CParserError, DtypeWarning, - EmptyDataError, URLError) -from pandas.io.parsers import (read_csv, read_table, read_fwf, - TextFileReader, TextParser) -from pandas.tseries.index import date_range - - -class ParseDatesTests(object): - def test_separator_date_conflict(self): - # Regression test for issue #4678: make sure thousands separator and - # date parsing do not conflict. 
- data = '06-02-2013;13:00;1-000.215' - expected = DataFrame( - [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], - columns=['Date', 2] - ) - - df = self.read_csv(StringIO(data), sep=';', thousands='-', - parse_dates={'Date': [0, 1]}, header=None) - tm.assert_frame_equal(df, expected) - - def test_multiple_date_col(self): - # Can use multiple date parsers - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - def func(*date_cols): - return lib.try_parse_dates(parsers._concat_date_cols(date_cols)) - - df = self.read_csv(StringIO(data), header=None, - date_parser=func, - prefix='X', - parse_dates={'nominal': [1, 2], - 'actual': [1, 3]}) - self.assertIn('nominal', df) - self.assertIn('actual', df) - self.assertNotIn('X1', df) - self.assertNotIn('X2', df) - self.assertNotIn('X3', df) - - d = datetime(1999, 1, 27, 19, 0) - self.assertEqual(df.ix[0, 'nominal'], d) - - df = self.read_csv(StringIO(data), header=None, - date_parser=func, - parse_dates={'nominal': [1, 2], - 'actual': [1, 3]}, - keep_date_col=True) - self.assertIn('nominal', df) - self.assertIn('actual', df) - - self.assertIn(1, df) - self.assertIn(2, df) - self.assertIn(3, df) - - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - df = read_csv(StringIO(data), header=None, - prefix='X', - parse_dates=[[1, 2], [1, 3]]) - - self.assertIn('X1_X2', df) - self.assertIn('X1_X3', df) - self.assertNotIn('X1', df) - self.assertNotIn('X2', df) - self.assertNotIn('X3', df) - - d = datetime(1999, 1, 27, 19, 0) - self.assertEqual(df.ix[0, 'X1_X2'], d) - - df = read_csv(StringIO(data), header=None, - parse_dates=[[1, 2], [1, 3]], keep_date_col=True) - - self.assertIn('1_2', df) - self.assertIn('1_3', df) - self.assertIn(1, df) - self.assertIn(2, df) - self.assertIn(3, df) - - data = '''\ -KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -''' - df = self.read_csv(StringIO(data), sep=',', header=None, - parse_dates=[1], index_col=1) - d = datetime(1999, 1, 27, 19, 0) - self.assertEqual(df.index[0], d) - - def test_multiple_date_cols_int_cast(self): - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") - date_spec = {'nominal': [1, 2], 
'actual': [1, 3]} - import pandas.io.date_converters as conv - - # it works! - df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec, - date_parser=conv.parse_date_time) - self.assertIn('nominal', df) - - def test_multiple_date_col_timestamp_parse(self): - data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 -05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - result = self.read_csv(StringIO(data), sep=',', header=None, - parse_dates=[[0, 1]], date_parser=Timestamp) - - ex_val = Timestamp('05/31/2012 15:30:00.029') - self.assertEqual(result['0_1'][0], ex_val) - - def test_multiple_date_cols_with_header(self): - data = """\ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - - df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) - self.assertNotIsInstance(df.nominal[0], compat.string_types) - - ts_data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - def test_multiple_date_col_name_collision(self): - self.assertRaises(ValueError, self.read_csv, StringIO(self.ts_data), - parse_dates={'ID': [1, 2]}) - - data = """\ -date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa - - self.assertRaises(ValueError, self.read_csv, StringIO(data), - parse_dates=[[1, 2]]) - - def test_date_parser_int_bug(self): - # #3071 - log_file = StringIO( - 'posix_timestamp,elapsed,sys,user,queries,query_time,rows,' - 'accountid,userid,contactid,level,silo,method\n' - '1343103150,0.062353,0,4,6,0.01690,3,' - '12345,1,-1,3,invoice_InvoiceResource,search\n' - ) - - def f(posix_string): - return datetime.utcfromtimestamp(int(posix_string)) - - # it works! 
- read_csv(log_file, index_col=0, parse_dates=[0], date_parser=f) - - def test_nat_parse(self): - - # GH 3062 - df = DataFrame(dict({ - 'A': np.asarray(lrange(10), dtype='float64'), - 'B': pd.Timestamp('20010101')})) - df.iloc[3:6, :] = np.nan - - with tm.ensure_clean('__nat_parse_.csv') as path: - df.to_csv(path) - result = read_csv(path, index_col=0, parse_dates=['B']) - tm.assert_frame_equal(result, df) - - expected = Series(dict(A='float64', B='datetime64[ns]')) - tm.assert_series_equal(expected, result.dtypes) - - # test with NaT for the nan_rep - # we don't have a method to specif the Datetime na_rep (it defaults - # to '') - df.to_csv(path) - result = read_csv(path, index_col=0, parse_dates=['B']) - tm.assert_frame_equal(result, df) - - def test_csv_custom_parser(self): - data = """A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - f = lambda x: datetime.strptime(x, '%Y%m%d') - df = self.read_csv(StringIO(data), date_parser=f) - expected = self.read_csv(StringIO(data), parse_dates=True) - tm.assert_frame_equal(df, expected) - - def test_parse_dates_implicit_first_col(self): - data = """A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - df = self.read_csv(StringIO(data), parse_dates=True) - expected = self.read_csv(StringIO(data), index_col=0, parse_dates=True) - self.assertIsInstance( - df.index[0], (datetime, np.datetime64, Timestamp)) - tm.assert_frame_equal(df, expected) - - def test_parse_dates_string(self): - data = """date,A,B,C -20090101,a,1,2 -20090102,b,3,4 -20090103,c,4,5 -""" - rs = self.read_csv( - StringIO(data), index_col='date', parse_dates=['date']) - idx = date_range('1/1/2009', periods=3) - idx.name = 'date' - xp = DataFrame({'A': ['a', 'b', 'c'], - 'B': [1, 3, 4], - 'C': [2, 4, 5]}, idx) - tm.assert_frame_equal(rs, xp) - - def test_yy_format_with_yearfirst(self): - data = """date,time,B,C -090131,0010,1,2 -090228,1020,3,4 -090331,0830,5,6 -""" - - # https://github.com/dateutil/dateutil/issues/217 - import dateutil - if dateutil.__version__ >= LooseVersion('2.5.0'): - raise nose.SkipTest("testing yearfirst=True not-support" - "on datetutil < 2.5.0 this works but" - "is wrong") - - rs = self.read_csv(StringIO(data), index_col=0, - parse_dates=[['date', 'time']]) - idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0)], - dtype=object, name='date_time') - xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) - tm.assert_frame_equal(rs, xp) - - rs = self.read_csv(StringIO(data), index_col=0, - parse_dates=[[0, 1]]) - idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0)], - dtype=object, name='date_time') - xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) - tm.assert_frame_equal(rs, xp) - - def test_parse_dates_column_list(self): - from pandas.core.datetools import to_datetime - - data = '''date;destination;ventilationcode;unitcode;units;aux_date -01/01/2010;P;P;50;1;12/1/2011 -01/01/2010;P;R;50;1;13/1/2011 -15/01/2010;P;P;50;1;14/1/2011 -01/05/2010;P;P;50;1;15/1/2011''' - - expected = self.read_csv(StringIO(data), sep=";", index_col=lrange(4)) - - lev = expected.index.levels[0] - levels = list(expected.index.levels) - levels[0] = lev.to_datetime(dayfirst=True) - # hack to get this to work - remove for final test - levels[0].name = lev.name - expected.index.set_levels(levels, inplace=True) - expected['aux_date'] = to_datetime(expected['aux_date'], - dayfirst=True) - expected['aux_date'] = lmap(Timestamp, 
expected['aux_date']) - tm.assertIsInstance(expected['aux_date'][0], datetime) - - df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4), - parse_dates=[0, 5], dayfirst=True) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4), - parse_dates=['date', 'aux_date'], dayfirst=True) - tm.assert_frame_equal(df, expected) - - def test_multi_index_parse_dates(self): - data = """index1,index2,A,B,C -20090101,one,a,1,2 -20090101,two,b,3,4 -20090101,three,c,4,5 -20090102,one,a,1,2 -20090102,two,b,3,4 -20090102,three,c,4,5 -20090103,one,a,1,2 -20090103,two,b,3,4 -20090103,three,c,4,5 -""" - df = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True) - self.assertIsInstance(df.index.levels[0][0], - (datetime, np.datetime64, Timestamp)) - - # specify columns out of order! - df2 = self.read_csv(StringIO(data), index_col=[1, 0], parse_dates=True) - self.assertIsInstance(df2.index.levels[1][0], - (datetime, np.datetime64, Timestamp)) - - def test_parse_dates_custom_euroformat(self): - text = """foo,bar,baz -31/01/2010,1,2 -01/02/2010,1,NA -02/02/2010,1,2 -""" - parser = lambda d: parse_date(d, dayfirst=True) - df = self.read_csv(StringIO(text), - names=['time', 'Q', 'NTU'], header=0, - index_col=0, parse_dates=True, - date_parser=parser, na_values=['NA']) - - exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1), - datetime(2010, 2, 2)], name='time') - expected = DataFrame({'Q': [1, 1, 1], 'NTU': [2, np.nan, 2]}, - index=exp_index, columns=['Q', 'NTU']) - tm.assert_frame_equal(df, expected) - - parser = lambda d: parse_date(d, day_first=True) - self.assertRaises(TypeError, self.read_csv, - StringIO(text), skiprows=[0], - names=['time', 'Q', 'NTU'], index_col=0, - parse_dates=True, date_parser=parser, - na_values=['NA']) - - def test_parse_tz_aware(self): - import pytz - # #1693 - data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5") - - # it works - result = read_csv(data, index_col=0, parse_dates=True) - stamp = result.index[0] - self.assertEqual(stamp.minute, 39) - try: - self.assertIs(result.index.tz, pytz.utc) - except AssertionError: # hello Yaroslav - arr = result.index.to_pydatetime() - result = tools.to_datetime(arr, utc=True)[0] - self.assertEqual(stamp.minute, result.minute) - self.assertEqual(stamp.hour, result.hour) - self.assertEqual(stamp.day, result.day) - - def test_multiple_date_cols_index(self): - data = """\ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - - xp = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) - df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, - index_col='nominal') - tm.assert_frame_equal(xp.set_index('nominal'), df) - df2 = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, - index_col=0) - tm.assert_frame_equal(df2, df) - - df3 = self.read_csv(StringIO(data), parse_dates=[[1, 2]], index_col=0) - tm.assert_frame_equal(df3, df, check_names=False) - - def test_multiple_date_cols_chunked(self): - df = self.read_csv(StringIO(self.ts_data), parse_dates={ - 
'nominal': [1, 2]}, index_col='nominal') - reader = self.read_csv(StringIO(self.ts_data), parse_dates={'nominal': - [1, 2]}, index_col='nominal', chunksize=2) - - chunks = list(reader) - - self.assertNotIn('nominalTime', df) - - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) - - def test_multiple_date_col_named_components(self): - xp = self.read_csv(StringIO(self.ts_data), - parse_dates={'nominal': [1, 2]}, - index_col='nominal') - colspec = {'nominal': ['date', 'nominalTime']} - df = self.read_csv(StringIO(self.ts_data), parse_dates=colspec, - index_col='nominal') - tm.assert_frame_equal(df, xp) - - def test_multiple_date_col_multiple_index(self): - df = self.read_csv(StringIO(self.ts_data), - parse_dates={'nominal': [1, 2]}, - index_col=['nominal', 'ID']) - - xp = self.read_csv(StringIO(self.ts_data), - parse_dates={'nominal': [1, 2]}) - - tm.assert_frame_equal(xp.set_index(['nominal', 'ID']), df) - - def test_read_with_parse_dates_scalar_non_bool(self): - # See gh-5636 - errmsg = ("Only booleans, lists, and " - "dictionaries are accepted " - "for the 'parse_dates' parameter") - data = """A,B,C - 1,2,2003-11-1""" - - tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, - StringIO(data), parse_dates="C") - tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, - StringIO(data), parse_dates="C", - index_col="C") - - def test_read_with_parse_dates_invalid_type(self): - errmsg = ("Only booleans, lists, and " - "dictionaries are accepted " - "for the 'parse_dates' parameter") - data = """A,B,C - 1,2,2003-11-1""" - - tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, - StringIO(data), parse_dates=(1,)) - tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, - StringIO(data), parse_dates=np.array([4, 5])) - tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, - StringIO(data), parse_dates=set([1, 3, 3])) - - -class ParserTests(ParseDatesTests): - """ - Want to be able to test either C+Cython or Python+Cython parsers - """ - data1 = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - def read_csv(self, *args, **kwargs): - raise NotImplementedError - - def read_table(self, *args, **kwargs): - raise NotImplementedError - - def setUp(self): - import warnings - warnings.filterwarnings(action='ignore', category=FutureWarning) - - self.dirpath = tm.get_data_path() - self.csv1 = os.path.join(self.dirpath, 'test1.csv') - self.csv2 = os.path.join(self.dirpath, 'test2.csv') - self.xls1 = os.path.join(self.dirpath, 'test.xls') - - def construct_dataframe(self, num_rows): - - df = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde')) - df['foo'] = 'foo' - df['bar'] = 'bar' - df['baz'] = 'baz' - df['date'] = pd.date_range('20000101 09:00:00', - periods=num_rows, - freq='s') - df['int'] = np.arange(num_rows, dtype='int64') - return df - - def generate_multithread_dataframe(self, path, num_rows, num_tasks): - - def reader(arg): - start, nrows = arg - - if not start: - return pd.read_csv(path, index_col=0, header=0, nrows=nrows, - parse_dates=['date']) - - return pd.read_csv(path, - index_col=0, - header=None, - skiprows=int(start) + 1, - nrows=nrows, - parse_dates=[9]) - - tasks = [ - (num_rows * i / num_tasks, - num_rows / num_tasks) for i in range(num_tasks) - ] - - pool = ThreadPool(processes=num_tasks) - - results = pool.map(reader, tasks) - - header = results[0].columns - for r in results[1:]: - r.columns = header - - final_dataframe 
= pd.concat(results) - - return final_dataframe - - def test_converters_type_must_be_dict(self): - with tm.assertRaisesRegexp(TypeError, 'Type converters.+'): - self.read_csv(StringIO(self.data1), converters=0) - - def test_empty_decimal_marker(self): - data = """A|B|C -1|2,334|5 -10|13|10. -""" - self.assertRaises(ValueError, read_csv, StringIO(data), decimal='') - - def test_empty_thousands_marker(self): - data = """A|B|C -1|2,334|5 -10|13|10. -""" - self.assertRaises(ValueError, read_csv, StringIO(data), thousands='') - - def test_multi_character_decimal_marker(self): - data = """A|B|C -1|2,334|5 -10|13|10. -""" - self.assertRaises(ValueError, read_csv, StringIO(data), thousands=',,') - - def test_empty_string(self): - data = """\ -One,Two,Three -a,1,one -b,2,two -,3,three -d,4,nan -e,5,five -nan,6, -g,7,seven -""" - df = self.read_csv(StringIO(data)) - xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['one', 'two', 'three', np.nan, 'five', - np.nan, 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []}, - keep_default_na=False) - xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['one', 'two', 'three', 'nan', 'five', - '', 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - df = self.read_csv( - StringIO(data), na_values=['a'], keep_default_na=False) - xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['one', 'two', 'three', 'nan', 'five', '', - 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []}) - xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['one', 'two', 'three', np.nan, 'five', - np.nan, 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - # GH4318, passing na_values=None and keep_default_na=False yields - # 'None' as a na_value - data = """\ -One,Two,Three -a,1,None -b,2,two -,3,None -d,4,nan -e,5,five -nan,6, -g,7,seven -""" - df = self.read_csv( - StringIO(data), keep_default_na=False) - xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'], - 'Two': [1, 2, 3, 4, 5, 6, 7], - 'Three': ['None', 'two', 'None', 'nan', 'five', '', - 'seven']}) - tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - def test_read_csv(self): - if not compat.PY3: - if compat.is_platform_windows(): - prefix = u("file:///") - else: - prefix = u("file://") - fname = prefix + compat.text_type(self.csv1) - # it works! 
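# --- editor's sketch (not part of this diff) -------------------------------
# test_empty_string above pivots on the na_values / keep_default_na
# interplay: with keep_default_na=False nothing is treated as NaN unless
# explicitly listed. Standalone illustration (era pandas; data illustrative):
import pandas as pd
from pandas.compat import StringIO

_data = "One,Two\na,1\n,2\nnan,3"
assert pd.read_csv(StringIO(_data))['One'].isnull().sum() == 2
_kept = pd.read_csv(StringIO(_data), keep_default_na=False)
assert list(_kept['One']) == ['a', '', 'nan']
# ---------------------------------------------------------------------------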
- read_csv(fname, index_col=0, parse_dates=True) - - def test_dialect(self): - data = """\ -label1,label2,label3 -index1,"a,c,e -index2,b,d,f -""" - - dia = csv.excel() - dia.quoting = csv.QUOTE_NONE - df = self.read_csv(StringIO(data), dialect=dia) - - data = '''\ -label1,label2,label3 -index1,a,c,e -index2,b,d,f -''' - exp = self.read_csv(StringIO(data)) - exp.replace('a', '"a', inplace=True) - tm.assert_frame_equal(df, exp) - - def test_dialect_str(self): - data = """\ -fruit:vegetable -apple:brocolli -pear:tomato -""" - exp = DataFrame({ - 'fruit': ['apple', 'pear'], - 'vegetable': ['brocolli', 'tomato'] - }) - dia = csv.register_dialect('mydialect', delimiter=':') # noqa - df = self.read_csv(StringIO(data), dialect='mydialect') - tm.assert_frame_equal(df, exp) - csv.unregister_dialect('mydialect') - - def test_1000_sep(self): - data = """A|B|C -1|2,334|5 -10|13|10. -""" - expected = DataFrame({ - 'A': [1, 10], - 'B': [2334, 13], - 'C': [5, 10.] - }) - - df = self.read_csv(StringIO(data), sep='|', thousands=',') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data), sep='|', thousands=',') - tm.assert_frame_equal(df, expected) - - def test_1000_sep_with_decimal(self): - data = """A|B|C -1|2,334.01|5 -10|13|10. -""" - expected = DataFrame({ - 'A': [1, 10], - 'B': [2334.01, 13], - 'C': [5, 10.] - }) - - tm.assert_equal(expected.A.dtype, 'int64') - tm.assert_equal(expected.B.dtype, 'float') - tm.assert_equal(expected.C.dtype, 'float') - - df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data), sep='|', - thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - data_with_odd_sep = """A|B|C -1|2.334,01|5 -10|13|10, -""" - df = self.read_csv(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - - def test_squeeze(self): - data = """\ -a,1 -b,2 -c,3 -""" - idx = Index(['a', 'b', 'c'], name=0) - expected = Series([1, 2, 3], name=1, index=idx) - result = self.read_table(StringIO(data), sep=',', index_col=0, - header=None, squeeze=True) - tm.assertIsInstance(result, Series) - tm.assert_series_equal(result, expected) - - def test_squeeze_no_view(self): - - # GH 8217 - # series should not be a view - - data = """time,data\n0,10\n1,11\n2,12\n4,14\n5,15\n3,13""" - result = self.read_csv(StringIO(data), index_col='time', squeeze=True) - self.assertFalse(result._is_view) - - def test_inf_parsing(self): - data = """\ -,A -a,inf -b,-inf -c,Inf -d,-Inf -e,INF -f,-INF -g,INf -h,-INf -i,inF -j,-inF""" - inf = float('inf') - expected = Series([inf, -inf] * 5) - df = read_csv(StringIO(data), index_col=0) - tm.assert_almost_equal(df['A'].values, expected.values) - df = read_csv(StringIO(data), index_col=0, na_filter=False) - tm.assert_almost_equal(df['A'].values, expected.values) - - def test_single_line(self): - # GH 6607 - # Test currently only valid with python engine because sep=None and - # delim_whitespace=False. Temporarily copied to TestPythonParser. - # Test for ValueError with other engines: - - with tm.assertRaisesRegexp(ValueError, - 'sep=None with delim_whitespace=False'): - # sniff separator - buf = StringIO() - sys.stdout = buf - - # printing warning message when engine == 'c' for now - - try: - # it works! 
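# --- editor's sketch (not part of this diff) -------------------------------
# test_1000_sep_with_decimal above exercises swapped European separators:
# '.' as the thousands marker and ',' as the decimal point. Standalone
# illustration (era pandas; values illustrative):
import pandas as pd
from pandas.compat import StringIO

_df = pd.read_csv(StringIO("A|B\n1|2.334,01\n10|13"),
                  sep='|', thousands='.', decimal=',')
assert _df['B'].tolist() == [2334.01, 13.0]
# ---------------------------------------------------------------------------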
- df = self.read_csv(StringIO('1,2'), names=['a', 'b'], - header=None, sep=None) - tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df) - finally: - sys.stdout = sys.__stdout__ - - def test_index_col_named(self): - no_header = """\ -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - - h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" - data = h + no_header - rs = self.read_csv(StringIO(data), index_col='ID') - xp = self.read_csv(StringIO(data), header=0).set_index('ID') - tm.assert_frame_equal(rs, xp) - - self.assertRaises(ValueError, self.read_csv, StringIO(no_header), - index_col='ID') - - data = """\ -1,2,3,4,hello -5,6,7,8,world -9,10,11,12,foo -""" - names = ['a', 'b', 'c', 'd', 'message'] - xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11], - 'd': [4, 8, 12]}, - index=Index(['hello', 'world', 'foo'], name='message')) - rs = self.read_csv(StringIO(data), names=names, index_col=['message']) - tm.assert_frame_equal(xp, rs) - self.assertEqual(xp.index.name, rs.index.name) - - rs = self.read_csv(StringIO(data), names=names, index_col='message') - tm.assert_frame_equal(xp, rs) - self.assertEqual(xp.index.name, rs.index.name) - - def test_usecols_index_col_False(self): - # Issue 9082 - s = "a,b,c,d\n1,2,3,4\n5,6,7,8" - s_malformed = "a,b,c,d\n1,2,3,4,\n5,6,7,8," - cols = ['a', 'c', 'd'] - expected = DataFrame({'a': [1, 5], 'c': [3, 7], 'd': [4, 8]}) - df = self.read_csv(StringIO(s), usecols=cols, index_col=False) - tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(s_malformed), - usecols=cols, index_col=False) - tm.assert_frame_equal(expected, df) - - def test_index_col_is_True(self): - # Issue 9798 - self.assertRaises(ValueError, self.read_csv, StringIO(self.ts_data), - index_col=True) - - def test_converter_index_col_bug(self): - # 1835 - data = "A;B\n1;2\n3;4" - - rs = self.read_csv(StringIO(data), sep=';', index_col='A', - converters={'A': lambda x: x}) - - xp = DataFrame({'B': [2, 4]}, index=Index([1, 3], name='A')) - tm.assert_frame_equal(rs, xp) - self.assertEqual(rs.index.name, xp.index.name) - - def test_multiple_skts_example(self): - data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11." - pass - - def test_malformed(self): - # all - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -""" - - try: - df = self.read_table( - StringIO(data), sep=',', header=1, comment='#') - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) - - # skip_footer - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -footer -""" - - # GH 6607 - # Test currently only valid with python engine because - # skip_footer != 0. Temporarily copied to TestPythonParser. 
- # Test for ValueError with other engines: - - try: - with tm.assertRaisesRegexp(ValueError, 'skip_footer'): # XXX - df = self.read_table( - StringIO(data), sep=',', header=1, comment='#', - skip_footer=1) - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) - - # first chunk - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - try: - it = self.read_table(StringIO(data), sep=',', - header=1, comment='#', - iterator=True, chunksize=1, - skiprows=[2]) - df = it.read(5) - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) - - # middle chunk - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - try: - it = self.read_table(StringIO(data), sep=',', header=1, - comment='#', iterator=True, chunksize=1, - skiprows=[2]) - df = it.read(1) - it.read(2) - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) - - # last chunk - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - try: - it = self.read_table(StringIO(data), sep=',', - header=1, comment='#', - iterator=True, chunksize=1, skiprows=[2]) - df = it.read(1) - it.read() - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) - - def test_passing_dtype(self): - # GH 6607 - # Passing dtype is currently only supported by the C engine. - # Temporarily copied to TestCParser*. - # Test for ValueError with other engines: - - with tm.assertRaisesRegexp(ValueError, - "The 'dtype' option is not supported"): - - df = DataFrame(np.random.rand(5, 2), columns=list( - 'AB'), index=['1A', '1B', '1C', '1D', '1E']) - - with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: - df.to_csv(path) - - # GH 3795 - # passing 'str' as the dtype - result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes, Series( - {'A': 'object', 'B': 'object'})) - - # we expect all object columns, so need to convert to test for - # equivalence - result = result.astype(float) - tm.assert_frame_equal(result, df) - - # invalid dtype - self.assertRaises(TypeError, self.read_csv, path, - dtype={'A': 'foo', 'B': 'float64'}, - index_col=0) - - # valid but we don't support it (date) - self.assertRaises(TypeError, self.read_csv, path, - dtype={'A': 'datetime64', 'B': 'float64'}, - index_col=0) - self.assertRaises(TypeError, self.read_csv, path, - dtype={'A': 'datetime64', 'B': 'float64'}, - index_col=0, parse_dates=['B']) - - # valid but we don't support it - self.assertRaises(TypeError, self.read_csv, path, - dtype={'A': 'timedelta64', 'B': 'float64'}, - index_col=0) - - with tm.assertRaisesRegexp(ValueError, - "The 'dtype' option is not supported"): - - # empty frame - # GH12048 - self.read_csv(StringIO('A,B'), dtype=str) - - - def test_quoting(self): - bad_line_small = """printer\tresult\tvariant_name -Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jacob -Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jakob -Klosterdruckerei\tKlosterdruckerei (1609-1805)\t"Furststiftische Hofdruckerei, (1609-1805)\tGaller, Alois -Klosterdruckerei\tKlosterdruckerei (1609-1805)\tHochfurstliche Buchhandlung """ - self.assertRaises(Exception, self.read_table, StringIO(bad_line_small), - sep='\t') - - good_line_small = bad_line_small + '"' - df = self.read_table(StringIO(good_line_small), sep='\t') - 
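# --- editor's sketch (not part of this diff) -------------------------------
# test_quoting above hinges on an unterminated quoted field: the parser
# errors when it reaches EOF inside an open quote, and appending the closing
# '"' makes the same input parse. Standalone illustration (era pandas; the
# exact exception type and message are version-dependent):
import pandas as pd
from pandas.compat import StringIO

_bad = 'a\tb\nx\t"open quote'
try:
    pd.read_csv(StringIO(_bad), sep='\t')
    _raised = False
except Exception:
    _raised = True
assert _raised
_df = pd.read_csv(StringIO(_bad + '"'), sep='\t')
assert _df['b'][0] == 'open quote'
# ---------------------------------------------------------------------------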
self.assertEqual(len(df), 3) - - def test_non_string_na_values(self): - # GH3611, na_values that are not a string are an issue - with tm.ensure_clean('__non_string_na_values__.csv') as path: - df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]}) - df.to_csv(path, sep=' ', index=False) - result1 = read_csv(path, sep=' ', header=0, - na_values=['-999.0', '-999']) - result2 = read_csv(path, sep=' ', header=0, - na_values=[-999, -999.0]) - result3 = read_csv(path, sep=' ', header=0, - na_values=[-999.0, -999]) - tm.assert_frame_equal(result1, result2) - tm.assert_frame_equal(result2, result3) - - result4 = read_csv(path, sep=' ', header=0, na_values=['-999.0']) - result5 = read_csv(path, sep=' ', header=0, na_values=['-999']) - result6 = read_csv(path, sep=' ', header=0, na_values=[-999.0]) - result7 = read_csv(path, sep=' ', header=0, na_values=[-999]) - tm.assert_frame_equal(result4, result3) - tm.assert_frame_equal(result5, result3) - tm.assert_frame_equal(result6, result3) - tm.assert_frame_equal(result7, result3) - - good_compare = result3 - - # with an odd float format, so we can't match the string 999.0 - # exactly, but need float matching - df.to_csv(path, sep=' ', index=False, float_format='%.3f') - result1 = read_csv(path, sep=' ', header=0, - na_values=['-999.0', '-999']) - result2 = read_csv(path, sep=' ', header=0, - na_values=[-999, -999.0]) - result3 = read_csv(path, sep=' ', header=0, - na_values=[-999.0, -999]) - tm.assert_frame_equal(result1, good_compare) - tm.assert_frame_equal(result2, good_compare) - tm.assert_frame_equal(result3, good_compare) - - result4 = read_csv(path, sep=' ', header=0, na_values=['-999.0']) - result5 = read_csv(path, sep=' ', header=0, na_values=['-999']) - result6 = read_csv(path, sep=' ', header=0, na_values=[-999.0]) - result7 = read_csv(path, sep=' ', header=0, na_values=[-999]) - tm.assert_frame_equal(result4, good_compare) - tm.assert_frame_equal(result5, good_compare) - tm.assert_frame_equal(result6, good_compare) - tm.assert_frame_equal(result7, good_compare) - - def test_default_na_values(self): - _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', - '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'NaN', - 'nan', '-NaN', '-nan', '#N/A N/A', '']) - self.assertEqual(_NA_VALUES, parsers._NA_VALUES) - nv = len(_NA_VALUES) - - def f(i, v): - if i == 0: - buf = '' - elif i > 0: - buf = ''.join([','] * i) - - buf = "{0}{1}".format(buf, v) - - if i < nv - 1: - buf = "{0}{1}".format(buf, ''.join([','] * (nv - i - 1))) - - return buf - - data = StringIO('\n'.join([f(i, v) for i, v in enumerate(_NA_VALUES)])) - expected = DataFrame(np.nan, columns=range(nv), index=range(nv)) - df = self.read_csv(data, header=None) - tm.assert_frame_equal(df, expected) - - def test_custom_na_values(self): - data = """A,B,C -ignore,this,row -1,NA,3 --1.#IND,5,baz -7,8,NaN -""" - expected = [[1., nan, 3], - [nan, 5, nan], - [7, 8, nan]] - - df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) - tm.assert_almost_equal(df.values, expected) - - df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'], - skiprows=[1]) - tm.assert_almost_equal(df2.values, expected) - - df3 = self.read_table(StringIO(data), sep=',', na_values='baz', - skiprows=[1]) - tm.assert_almost_equal(df3.values, expected) - - def test_skiprows_bug(self): - # GH #505 - text = """#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -1/1/2000,1.,2.,3. 
-1/2/2000,4,5,6
-1/3/2000,7,8,9
-"""
-        data = self.read_csv(StringIO(text), skiprows=lrange(6), header=None,
-                             index_col=0, parse_dates=True)
-
-        data2 = self.read_csv(StringIO(text), skiprows=6, header=None,
-                              index_col=0, parse_dates=True)
-
-        expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
-                             columns=[1, 2, 3],
-                             index=[datetime(2000, 1, 1),
-                                    datetime(2000, 1, 2),
-                                    datetime(2000, 1, 3)])
-        expected.index.name = 0
-        tm.assert_frame_equal(data, expected)
-        tm.assert_frame_equal(data, data2)
-
-    def test_deep_skiprows(self):
-        # GH #4382
-        text = "a,b,c\n" + \
-               "\n".join([",".join([str(i), str(i + 1), str(i + 2)])
-                          for i in range(10)])
-        condensed_text = "a,b,c\n" + \
-                         "\n".join([",".join([str(i), str(i + 1), str(i + 2)])
-                                    for i in [0, 1, 2, 3, 4, 6, 8, 9]])
-        data = self.read_csv(StringIO(text), skiprows=[6, 8])
-        condensed_data = self.read_csv(StringIO(condensed_text))
-        tm.assert_frame_equal(data, condensed_data)
-
-    def test_skiprows_blank(self):
-        # GH 9832
-        text = """#foo,a,b,c
-#foo,a,b,c
-
-#foo,a,b,c
-#foo,a,b,c
-
-1/1/2000,1.,2.,3.
-1/2/2000,4,5,6
-1/3/2000,7,8,9
-"""
-        data = self.read_csv(StringIO(text), skiprows=6, header=None,
-                             index_col=0, parse_dates=True)
-
-        expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
-                             columns=[1, 2, 3],
-                             index=[datetime(2000, 1, 1),
-                                    datetime(2000, 1, 2),
-                                    datetime(2000, 1, 3)])
-        expected.index.name = 0
-        tm.assert_frame_equal(data, expected)
-
-    def test_detect_string_na(self):
-        data = """A,B
-foo,bar
-NA,baz
-NaN,nan
-"""
-        expected = [['foo', 'bar'],
-                    [nan, 'baz'],
-                    [nan, nan]]
-
-        df = self.read_csv(StringIO(data))
-        tm.assert_almost_equal(df.values, expected)
-
-    def test_unnamed_columns(self):
-        data = """A,B,C,,
-1,2,3,4,5
-6,7,8,9,10
-11,12,13,14,15
-"""
-        expected = [[1, 2, 3, 4, 5.],
-                    [6, 7, 8, 9, 10],
-                    [11, 12, 13, 14, 15]]
-        df = self.read_table(StringIO(data), sep=',')
-        tm.assert_almost_equal(df.values, expected)
-        self.assert_numpy_array_equal(df.columns,
-                                      ['A', 'B', 'C', 'Unnamed: 3',
-                                       'Unnamed: 4'])
-
-    def test_string_nas(self):
-        data = """A,B,C
-a,b,c
-d,,f
-,g,h
-"""
-        result = self.read_csv(StringIO(data))
-        expected = DataFrame([['a', 'b', 'c'],
-                              ['d', np.nan, 'f'],
-                              [np.nan, 'g', 'h']],
-                             columns=['A', 'B', 'C'])
-
-        tm.assert_frame_equal(result, expected)
-
-    def test_duplicate_columns(self):
-        for engine in ['python', 'c']:
-            data = """A,A,B,B,B
-    1,2,3,4,5
-    6,7,8,9,10
-    11,12,13,14,15
-    """
-            # check default behaviour
-            df = self.read_table(StringIO(data), sep=',', engine=engine)
-            self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
-
-            df = self.read_table(StringIO(data), sep=',',
-                                 engine=engine, mangle_dupe_cols=False)
-            self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])
-
-            df = self.read_table(StringIO(data), sep=',',
-                                 engine=engine, mangle_dupe_cols=True)
-            self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
-
-    def test_csv_mixed_type(self):
-        data = """A,B,C
-a,1,2
-b,3,4
-c,4,5
-"""
-        df = self.read_csv(StringIO(data))
-        # TODO
-
-    def test_no_header(self):
-        data = """1,2,3,4,5
-6,7,8,9,10
-11,12,13,14,15
-"""
-        df = self.read_table(StringIO(data), sep=',', header=None)
-        df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
-                                  header=None)
-
-        names = ['foo', 'bar', 'baz', 'quux', 'panda']
-        df2 = self.read_table(StringIO(data), sep=',', names=names)
-        expected = [[1, 2, 3, 4, 5.],
-                    [6, 7, 8, 9, 10],
-                    [11, 12, 13, 14, 15]]
-        tm.assert_almost_equal(df.values, expected)
-        tm.assert_almost_equal(df.values, df2.values)
-
-
self.assert_numpy_array_equal(df_pref.columns, - ['X0', 'X1', 'X2', 'X3', 'X4']) - self.assert_numpy_array_equal(df.columns, lrange(5)) - - self.assert_numpy_array_equal(df2.columns, names) - - def test_no_header_prefix(self): - data = """1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - df_pref = self.read_table(StringIO(data), sep=',', prefix='Field', - header=None) - - expected = [[1, 2, 3, 4, 5.], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]] - tm.assert_almost_equal(df_pref.values, expected) - - self.assert_numpy_array_equal(df_pref.columns, - ['Field0', 'Field1', 'Field2', 'Field3', 'Field4']) - - def test_header_with_index_col(self): - data = """foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - names = ['A', 'B', 'C'] - df = self.read_csv(StringIO(data), names=names) - - self.assertEqual(names, ['A', 'B', 'C']) - - values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - expected = DataFrame(values, index=['foo', 'bar', 'baz'], - columns=['A', 'B', 'C']) - tm.assert_frame_equal(df, expected) - - def test_read_csv_dataframe(self): - df = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = self.read_table(self.csv1, sep=',', index_col=0, - parse_dates=True) - self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D']) - self.assertEqual(df.index.name, 'index') - self.assertIsInstance( - df.index[0], (datetime, np.datetime64, Timestamp)) - self.assertEqual(df.values.dtype, np.float64) - tm.assert_frame_equal(df, df2) - - def test_read_csv_no_index_name(self): - df = self.read_csv(self.csv2, index_col=0, parse_dates=True) - df2 = self.read_table(self.csv2, sep=',', index_col=0, - parse_dates=True) - self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']) - self.assertIsInstance( - df.index[0], (datetime, np.datetime64, Timestamp)) - self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D'] - ].values.dtype, np.float64) - tm.assert_frame_equal(df, df2) - - def test_read_csv_infer_compression(self): - # GH 9770 - expected = self.read_csv(self.csv1, index_col=0, parse_dates=True) - - inputs = [self.csv1, self.csv1 + '.gz', - self.csv1 + '.bz2', open(self.csv1)] - - for f in inputs: - df = self.read_csv(f, index_col=0, parse_dates=True, - compression='infer') - - tm.assert_frame_equal(expected, df) - - inputs[3].close() - - def test_read_table_unicode(self): - fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8')) - df1 = read_table(fin, sep=";", encoding="utf-8", header=None) - tm.assertIsInstance(df1[0].values[0], compat.text_type) - - def test_read_table_wrong_num_columns(self): - # too few! - data = """A,B,C,D,E,F -1,2,3,4,5,6 -6,7,8,9,10,11,12 -11,12,13,14,15,16 -""" - self.assertRaises(ValueError, self.read_csv, StringIO(data)) - - def test_read_table_duplicate_index(self): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo,12,13,14,15 -bar,12,13,14,15 -""" - - result = self.read_csv(StringIO(data), index_col=0) - expected = self.read_csv(StringIO(data)).set_index('index', - verify_integrity=False) - tm.assert_frame_equal(result, expected) - - def test_read_table_duplicate_index_implicit(self): - data = """A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo,12,13,14,15 -bar,12,13,14,15 -""" - - # it works! 
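# --- editor's sketch (not part of this diff) -------------------------------
# test_no_header_prefix above relies on auto-generated column names for
# headerless input. Standalone illustration (era pandas; 'prefix' was a
# supported keyword at the time):
import pandas as pd
from pandas.compat import StringIO

_df = pd.read_csv(StringIO("1,2,3\n4,5,6"), header=None, prefix='X')
assert list(_df.columns) == ['X0', 'X1', 'X2']
# ---------------------------------------------------------------------------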
- result = self.read_csv(StringIO(data)) - - def test_parse_bools(self): - data = """A,B -True,1 -False,2 -True,3 -""" - data = self.read_csv(StringIO(data)) - self.assertEqual(data['A'].dtype, np.bool_) - - data = """A,B -YES,1 -no,2 -yes,3 -No,3 -Yes,3 -""" - data = self.read_csv(StringIO(data), - true_values=['yes', 'Yes', 'YES'], - false_values=['no', 'NO', 'No']) - self.assertEqual(data['A'].dtype, np.bool_) - - data = """A,B -TRUE,1 -FALSE,2 -TRUE,3 -""" - data = self.read_csv(StringIO(data)) - self.assertEqual(data['A'].dtype, np.bool_) - - data = """A,B -foo,bar -bar,foo""" - result = self.read_csv(StringIO(data), true_values=['foo'], - false_values=['bar']) - expected = DataFrame({'A': [True, False], 'B': [False, True]}) - tm.assert_frame_equal(result, expected) - - def test_int_conversion(self): - data = """A,B -1.0,1 -2.0,2 -3.0,3 -""" - data = self.read_csv(StringIO(data)) - self.assertEqual(data['A'].dtype, np.float64) - self.assertEqual(data['B'].dtype, np.int64) - - def test_infer_index_col(self): - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - data = self.read_csv(StringIO(data)) - self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz']))) - - def test_read_nrows(self): - df = self.read_csv(StringIO(self.data1), nrows=3) - expected = self.read_csv(StringIO(self.data1))[:3] - tm.assert_frame_equal(df, expected) - - def test_read_chunksize(self): - reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2) - df = self.read_csv(StringIO(self.data1), index_col=0) - - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) - - def test_read_chunksize_named(self): - reader = self.read_csv( - StringIO(self.data1), index_col='index', chunksize=2) - df = self.read_csv(StringIO(self.data1), index_col='index') - - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) - - def test_get_chunk_passed_chunksize(self): - data = """A,B,C -1,2,3 -4,5,6 -7,8,9 -1,2,3""" - result = self.read_csv(StringIO(data), chunksize=2) - - piece = result.get_chunk() - self.assertEqual(len(piece), 2) - - def test_read_text_list(self): - data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" - as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', - '4', '5', '6']] - df = self.read_csv(StringIO(data), index_col=0) - - parser = TextParser(as_list, index_col=0, chunksize=2) - chunk = parser.read(None) - - tm.assert_frame_equal(chunk, df) - - def test_iterator(self): - # GH 6607 - # Test currently only valid with python engine because - # skip_footer != 0. Temporarily copied to TestPythonParser. 
- # Test for ValueError with other engines: - - with tm.assertRaisesRegexp(ValueError, 'skip_footer'): - reader = self.read_csv(StringIO(self.data1), index_col=0, - iterator=True) - df = self.read_csv(StringIO(self.data1), index_col=0) - - chunk = reader.read(3) - tm.assert_frame_equal(chunk, df[:3]) - - last_chunk = reader.read(5) - tm.assert_frame_equal(last_chunk, df[3:]) - - # pass list - lines = list(csv.reader(StringIO(self.data1))) - parser = TextParser(lines, index_col=0, chunksize=2) - - df = self.read_csv(StringIO(self.data1), index_col=0) - - chunks = list(parser) - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) - - # pass skiprows - parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) - chunks = list(parser) - tm.assert_frame_equal(chunks[0], df[1:3]) - - # test bad parameter (skip_footer) - reader = self.read_csv(StringIO(self.data1), index_col=0, - iterator=True, skip_footer=True) - self.assertRaises(ValueError, reader.read, 3) - - treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, - iterator=True) - tm.assertIsInstance(treader, TextFileReader) - - # stopping iteration when on chunksize is specified, GH 3967 - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - reader = self.read_csv(StringIO(data), iterator=True) - result = list(reader) - expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ - 3, 6, 9]), index=['foo', 'bar', 'baz']) - tm.assert_frame_equal(result[0], expected) - - # chunksize = 1 - reader = self.read_csv(StringIO(data), chunksize=1) - result = list(reader) - expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ - 3, 6, 9]), index=['foo', 'bar', 'baz']) - self.assertEqual(len(result), 3) - tm.assert_frame_equal(pd.concat(result), expected) - - def test_header_not_first_line(self): - data = """got,to,ignore,this,line -got,to,ignore,this,line -index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -""" - data2 = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -""" - - df = self.read_csv(StringIO(data), header=2, index_col=0) - expected = self.read_csv(StringIO(data2), header=0, index_col=0) - tm.assert_frame_equal(df, expected) - - def test_header_multi_index(self): - expected = tm.makeCustomDataframe( - 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - - data = """\ -C0,,C_l0_g0,C_l0_g1,C_l0_g2 - -C1,,C_l1_g0,C_l1_g1,C_l1_g2 -C2,,C_l2_g0,C_l2_g1,C_l2_g2 -C3,,C_l3_g0,C_l3_g1,C_l3_g2 -R0,R1,,, -R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 -R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 -R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 -R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 -R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 -""" - - df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[ - 0, 1], tupleize_cols=False) - tm.assert_frame_equal(df, expected) - - # skipping lines in the header - df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[ - 0, 1], tupleize_cols=False) - tm.assert_frame_equal(df, expected) - - #### invalid options #### - - # no as_recarray - self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1], as_recarray=True, tupleize_cols=False) - - # names - self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1], names=['foo', 'bar'], tupleize_cols=False) - # usecols - self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1], usecols=['foo', 'bar'], tupleize_cols=False) - # non-numeric index_col - self.assertRaises(ValueError, 
self.read_csv, StringIO(data), header=[0, 1, 2, 3], - index_col=['foo', 'bar'], tupleize_cols=False) - - def test_header_multiindex_common_format(self): - - df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=['one', 'two'], - columns=MultiIndex.from_tuples([('a', 'q'), ('a', 'r'), ('a', 's'), - ('b', 't'), ('c', 'u'), ('c', 'v')])) - - # to_csv - data = """,a,a,a,b,c,c -,q,r,s,t,u,v -,,,,,, -one,1,2,3,4,5,6 -two,7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(df, result) - - # common - data = """,a,a,a,b,c,c -,q,r,s,t,u,v -one,1,2,3,4,5,6 -two,7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(df, result) - - # common, no index_col - data = """a,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=None) - tm.assert_frame_equal(df.reset_index(drop=True), result) - - # malformed case 1 - expected = DataFrame(np.array([[2, 3, 4, 5, 6], - [8, 9, 10, 11, 12]], dtype='int64'), - index=Index([1, 7]), - columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], - labels=[[0, 0, 1, 2, 2], [ - 0, 1, 2, 3, 4]], - names=[u('a'), u('q')])) - - data = """a,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(expected, result) - - # malformed case 2 - expected = DataFrame(np.array([[2, 3, 4, 5, 6], - [8, 9, 10, 11, 12]], dtype='int64'), - index=Index([1, 7]), - columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], - labels=[[0, 0, 1, 2, 2], [ - 0, 1, 2, 3, 4]], - names=[None, u('q')])) - - data = """,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(expected, result) - - # mi on columns and index (malformed) - expected = DataFrame(np.array([[3, 4, 5, 6], - [9, 10, 11, 12]], dtype='int64'), - index=MultiIndex(levels=[[1, 7], [2, 8]], - labels=[[0, 1], [0, 1]]), - columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('s'), u('t'), u('u'), u('v')]], - labels=[[0, 1, 2, 2], - [0, 1, 2, 3]], - names=[None, u('q')])) - - data = """,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) - tm.assert_frame_equal(expected, result) - - def test_pass_names_with_index(self): - lines = self.data1.split('\n') - no_header = '\n'.join(lines[1:]) - - # regular index - names = ['index', 'A', 'B', 'C', 'D'] - df = self.read_csv(StringIO(no_header), index_col=0, names=names) - expected = self.read_csv(StringIO(self.data1), index_col=0) - tm.assert_frame_equal(df, expected) - - # multi index - data = """index1,index2,A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - lines = data.split('\n') - no_header = '\n'.join(lines[1:]) - names = ['index1', 'index2', 'A', 'B', 'C', 'D'] - df = self.read_csv(StringIO(no_header), index_col=[0, 1], - names=names) - expected = self.read_csv(StringIO(data), index_col=[0, 1]) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(data), index_col=['index1', 'index2']) - tm.assert_frame_equal(df, expected) - - def test_multi_index_no_level_names(self): - data = """index1,index2,A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 
-bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - - data2 = """A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - - lines = data.split('\n') - no_header = '\n'.join(lines[1:]) - names = ['A', 'B', 'C', 'D'] - - df = self.read_csv(StringIO(no_header), index_col=[0, 1], - header=None, names=names) - expected = self.read_csv(StringIO(data), index_col=[0, 1]) - tm.assert_frame_equal(df, expected, check_names=False) - - # 2 implicit first cols - df2 = self.read_csv(StringIO(data2)) - tm.assert_frame_equal(df2, df) - - # reverse order of index - df = self.read_csv(StringIO(no_header), index_col=[1, 0], names=names, - header=None) - expected = self.read_csv(StringIO(data), index_col=[1, 0]) - tm.assert_frame_equal(df, expected, check_names=False) - - def test_skip_footer(self): - # GH 6607 - # Test currently only valid with python engine because - # skip_footer != 0. Temporarily copied to TestPythonParser. - # Test for ValueError with other engines: - - with tm.assertRaisesRegexp(ValueError, 'skip_footer'): - data = """A,B,C -1,2,3 -4,5,6 -7,8,9 -want to skip this -also also skip this -""" - result = self.read_csv(StringIO(data), skip_footer=2) - no_footer = '\n'.join(data.split('\n')[:-3]) - expected = self.read_csv(StringIO(no_footer)) - - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), nrows=3) - tm.assert_frame_equal(result, expected) - - # skipfooter alias - result = read_csv(StringIO(data), skipfooter=2) - no_footer = '\n'.join(data.split('\n')[:-3]) - expected = read_csv(StringIO(no_footer)) - - tm.assert_frame_equal(result, expected) - - def test_no_unnamed_index(self): - data = """ id c0 c1 c2 -0 1 0 a b -1 2 0 c d -2 2 2 e f -""" - df = self.read_table(StringIO(data), sep=' ') - self.assertIsNone(df.index.name) - - def test_converters(self): - data = """A,B,C,D -a,1,2,01/01/2009 -b,3,4,01/02/2009 -c,4,5,01/03/2009 -""" - from pandas.compat import parse_date - - result = self.read_csv(StringIO(data), converters={'D': parse_date}) - result2 = self.read_csv(StringIO(data), converters={3: parse_date}) - - expected = self.read_csv(StringIO(data)) - expected['D'] = expected['D'].map(parse_date) - - tm.assertIsInstance(result['D'][0], (datetime, Timestamp)) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result2, expected) - - # produce integer - converter = lambda x: int(x.split('/')[2]) - result = self.read_csv(StringIO(data), converters={'D': converter}) - expected = self.read_csv(StringIO(data)) - expected['D'] = expected['D'].map(converter) - tm.assert_frame_equal(result, expected) - - def test_converters_no_implicit_conv(self): - # GH2184 - data = """000102,1.2,A\n001245,2,B""" - f = lambda x: x.strip() - converter = {0: f} - df = self.read_csv(StringIO(data), header=None, converters=converter) - self.assertEqual(df[0].dtype, object) - - def test_converters_euro_decimal_format(self): - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - f = lambda x: float(x.replace(",", ".")) - converter = {'Number1': f, 'Number2': f, 'Number3': f} - df2 = self.read_csv(StringIO(data), sep=';', converters=converter) - self.assertEqual(df2['Number1'].dtype, float) - self.assertEqual(df2['Number2'].dtype, float) - self.assertEqual(df2['Number3'].dtype, float) - - def test_converter_return_string_bug(self): - # GH #583 - data = 
"""Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - f = lambda x: float(x.replace(",", ".")) - converter = {'Number1': f, 'Number2': f, 'Number3': f} - df2 = self.read_csv(StringIO(data), sep=';', converters=converter) - self.assertEqual(df2['Number1'].dtype, float) - - def test_read_table_buglet_4x_multiindex(self): - # GH 6607 - # Parsing multi-level index currently causes an error in the C parser. - # Temporarily copied to TestPythonParser. - # Here test that CParserError is raised: - - with tm.assertRaises(CParserError): - text = """ A B C D E -one two three four -a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 -a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 -x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - - # it works! - df = self.read_table(StringIO(text), sep='\s+') - self.assertEqual(df.index.names, ('one', 'two', 'three', 'four')) - - def test_comment_skiprows(self): - data = """# empty -random line -# second empty line -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - # this should ignore the first four lines (including comments) - df = self.read_csv(StringIO(data), comment='#', skiprows=4) - tm.assert_almost_equal(df.values, expected) - - def test_comment_header(self): - data = """# empty -# second empty line -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - # header should begin at the second non-comment line - df = self.read_csv(StringIO(data), comment='#', header=1) - tm.assert_almost_equal(df.values, expected) - - def test_comment_skiprows_header(self): - data = """# empty -# second empty line -# third empty line -X,Y,Z -1,2,3 -A,B,C -1,2.,4. 
-5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - # skiprows should skip the first 4 lines (including comments), while - # header should start from the second non-commented line starting - # with line 5 - df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) - tm.assert_almost_equal(df.values, expected) - - def test_read_csv_parse_simple_list(self): - text = """foo -bar baz -qux foo -foo -bar""" - df = read_csv(StringIO(text), header=None) - expected = DataFrame({0: ['foo', 'bar baz', 'qux foo', - 'foo', 'bar']}) - tm.assert_frame_equal(df, expected) - - def test_na_value_dict(self): - data = """A,B,C -foo,bar,NA -bar,foo,foo -foo,bar,NA -bar,foo,foo""" - - df = self.read_csv(StringIO(data), - na_values={'A': ['foo'], 'B': ['bar']}) - expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'], - 'B': [np.nan, 'foo', np.nan, 'foo'], - 'C': [np.nan, 'foo', np.nan, 'foo']}) - tm.assert_frame_equal(df, expected) - - data = """\ -a,b,c,d -0,NA,1,5 -""" - xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0]) - xp.index.name = 'a' - df = self.read_csv(StringIO(data), na_values={}, index_col=0) - tm.assert_frame_equal(df, xp) - - xp = DataFrame({'b': [np.nan], 'd': [5]}, - MultiIndex.from_tuples([(0, 1)])) - xp.index.names = ['a', 'c'] - df = self.read_csv(StringIO(data), na_values={}, index_col=[0, 2]) - tm.assert_frame_equal(df, xp) - - xp = DataFrame({'b': [np.nan], 'd': [5]}, - MultiIndex.from_tuples([(0, 1)])) - xp.index.names = ['a', 'c'] - df = self.read_csv(StringIO(data), na_values={}, index_col=['a', 'c']) - tm.assert_frame_equal(df, xp) - - @tm.network - def test_url(self): - # HTTP(S) - url = ('https://raw.github.com/pydata/pandas/master/' - 'pandas/io/tests/data/salary.table') - url_table = self.read_table(url) - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salary.table') - local_table = self.read_table(localtable) - tm.assert_frame_equal(url_table, local_table) - # TODO: ftp testing - - @slow - def test_file(self): - - # FILE - if sys.version_info[:2] < (2, 6): - raise nose.SkipTest("file:// not supported with Python < 2.6") - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salary.table') - local_table = self.read_table(localtable) - - try: - url_table = self.read_table('file://localhost/' + localtable) - except URLError: - # fails on some systems - raise nose.SkipTest("failing on %s" % - ' '.join(platform.uname()).strip()) - - tm.assert_frame_equal(url_table, local_table) - - def test_comment(self): - data = """A,B,C -1,2.,4.#hello world -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - df = self.read_csv(StringIO(data), comment='#') - tm.assert_almost_equal(df.values, expected) - - df = self.read_table(StringIO(data), sep=',', comment='#', - na_values=['NaN']) - tm.assert_almost_equal(df.values, expected) - - def test_bool_na_values(self): - data = """A,B,C -True,False,True -NA,True,False -False,NA,True""" - - result = self.read_csv(StringIO(data)) - expected = DataFrame({'A': np.array([True, nan, False], dtype=object), - 'B': np.array([False, True, nan], dtype=object), - 'C': [True, False, True]}) - - tm.assert_frame_equal(result, expected) - - def test_nonexistent_path(self): - # don't segfault pls #2428 - path = '%s.csv' % tm.rands(10) - self.assertRaises(IOError, self.read_csv, path) - - def test_missing_trailing_delimiters(self): - data = """A,B,C,D -1,2,3,4 -1,3,3, -1,4,5""" - result = self.read_csv(StringIO(data)) - self.assertTrue(result['D'].isnull()[1:].all()) - - def 
test_skipinitialspace(self): - s = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' - '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, ' - '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, ' - '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, ' - '0.212036, 14.7674, 41.605, -9999.0, -9999.0, ' - '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128') - - sfile = StringIO(s) - # it's 33 columns - result = self.read_csv(sfile, names=lrange(33), na_values=['-9999.0'], - header=None, skipinitialspace=True) - self.assertTrue(pd.isnull(result.ix[0, 29])) - - def test_utf16_bom_skiprows(self): - # #2298 - data = u("""skip this -skip this too -A\tB\tC -1\t2\t3 -4\t5\t6""") - - data2 = u("""skip this -skip this too -A,B,C -1,2,3 -4,5,6""") - - path = '__%s__.csv' % tm.rands(10) - - with tm.ensure_clean(path) as path: - for sep, dat in [('\t', data), (',', data2)]: - for enc in ['utf-16', 'utf-16le', 'utf-16be']: - bytes = dat.encode(enc) - with open(path, 'wb') as f: - f.write(bytes) - - s = BytesIO(dat.encode('utf-8')) - if compat.PY3: - # somewhat False since the code never sees bytes - from io import TextIOWrapper - s = TextIOWrapper(s, encoding='utf-8') - - result = self.read_csv(path, encoding=enc, skiprows=2, - sep=sep) - expected = self.read_csv(s, encoding='utf-8', skiprows=2, - sep=sep) - s.close() - - tm.assert_frame_equal(result, expected) - - def test_utf16_example(self): - path = tm.get_data_path('utf16_ex.txt') - - # it works! and is the right length - result = self.read_table(path, encoding='utf-16') - self.assertEqual(len(result), 50) - - if not compat.PY3: - buf = BytesIO(open(path, 'rb').read()) - result = self.read_table(buf, encoding='utf-16') - self.assertEqual(len(result), 50) - - def test_converters_corner_with_nas(self): - # skip aberration observed on Win64 Python 3.2.2 - if hash(np.int64(-1)) != -2: - raise nose.SkipTest("skipping because of windows hash on Python" - " 3.2.2") - - csv = """id,score,days -1,2,12 -2,2-5, -3,,14+ -4,6-12,2""" - - def convert_days(x): - x = x.strip() - if not x: - return np.nan - - is_plus = x.endswith('+') - if is_plus: - x = int(x[:-1]) + 1 - else: - x = int(x) - return x - - def convert_days_sentinel(x): - x = x.strip() - if not x: - return np.nan - - is_plus = x.endswith('+') - if is_plus: - x = int(x[:-1]) + 1 - else: - x = int(x) - return x - - def convert_score(x): - x = x.strip() - if not x: - return np.nan - if x.find('-') > 0: - valmin, valmax = lmap(int, x.split('-')) - val = 0.5 * (valmin + valmax) - else: - val = float(x) - - return val - - fh = StringIO(csv) - result = self.read_csv(fh, converters={'score': convert_score, - 'days': convert_days}, - na_values=['', None]) - self.assertTrue(pd.isnull(result['days'][1])) - - fh = StringIO(csv) - result2 = self.read_csv(fh, converters={'score': convert_score, - 'days': convert_days_sentinel}, - na_values=['', None]) - tm.assert_frame_equal(result, result2) - - def test_unicode_encoding(self): - pth = tm.get_data_path('unicode_series.csv') - - result = self.read_csv(pth, header=None, encoding='latin-1') - result = result.set_index(0) - - got = result[1][1632] - expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)') - - self.assertEqual(got, expected) - - def test_trailing_delimiters(self): - # #2442. 
grumble grumble - data = """A,B,C -1,2,3, -4,5,6, -7,8,9,""" - result = self.read_csv(StringIO(data), index_col=False) - - expected = DataFrame({'A': [1, 4, 7], 'B': [2, 5, 8], - 'C': [3, 6, 9]}) - - tm.assert_frame_equal(result, expected) - - def test_escapechar(self): - # http://stackoverflow.com/questions/13824840/feature-request-for- - # pandas-read-csv - data = '''SEARCH_TERM,ACTUAL_URL -"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' - - result = self.read_csv(StringIO(data), escapechar='\\', - quotechar='"', encoding='utf-8') - self.assertEqual(result['SEARCH_TERM'][2], - 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie') - self.assertTrue(np.array_equal(result.columns, - ['SEARCH_TERM', 'ACTUAL_URL'])) - - def test_header_names_backward_compat(self): - # #2539 - data = '1,2,3\n4,5,6' - - result = self.read_csv(StringIO(data), names=['a', 'b', 'c']) - expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None) - tm.assert_frame_equal(result, expected) - - data2 = 'foo,bar,baz\n' + data - result = self.read_csv(StringIO(data2), names=['a', 'b', 'c'], - header=0) - tm.assert_frame_equal(result, expected) - - def test_int64_min_issues(self): - # #2599 - data = 'A,B\n0,0\n0,' - - result = self.read_csv(StringIO(data)) - expected = DataFrame({'A': [0, 0], 'B': [0, np.nan]}) - - tm.assert_frame_equal(result, expected) - - def test_parse_integers_above_fp_precision(self): - data = """Numbers -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000194""" - - result = self.read_csv(StringIO(data)) - expected = DataFrame({'Numbers': [17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000194]}) - - self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers'])) - - def test_usecols_index_col_conflict(self): - # Issue 4201 Test that index_col as integer reflects usecols - data = """SecId,Time,Price,P2,P3 -10000,2013-5-11,100,10,1 -500,2013-5-12,101,11,1 -""" - expected = DataFrame({'Price': [100, 101]}, index=[ - datetime(2013, 5, 11), datetime(2013, 5, 12)]) - expected.index.name = 'Time' - - df = self.read_csv(StringIO(data), usecols=[ - 'Time', 'Price'], parse_dates=True, index_col=0) - tm.assert_frame_equal(expected, df) - - df = self.read_csv(StringIO(data), usecols=[ - 'Time', 'Price'], parse_dates=True, index_col='Time') - tm.assert_frame_equal(expected, df) - - df = self.read_csv(StringIO(data), usecols=[ - 1, 2], parse_dates=True, index_col='Time') - tm.assert_frame_equal(expected, df) - - df = self.read_csv(StringIO(data), usecols=[ - 1, 2], parse_dates=True, index_col=0) - tm.assert_frame_equal(expected, df) - - expected = DataFrame( - {'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) - expected = expected.set_index(['Price', 'P2']) - df = self.read_csv(StringIO(data), usecols=[ - 'Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) - 
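# --- editor's sketch (not part of this diff) -------------------------------
# test_escapechar above parses backslash-escaped quotes inside quoted
# fields. Standalone illustration (era pandas; data illustrative):
import pandas as pd
from pandas.compat import StringIO

_data = 'a,b\n"say \\"hi\\"",2'
_df = pd.read_csv(StringIO(_data), escapechar='\\', quotechar='"')
assert _df['a'][0] == 'say "hi"'
# ---------------------------------------------------------------------------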
tm.assert_frame_equal(expected, df) - - def test_chunks_have_consistent_numerical_type(self): - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) - - with tm.assert_produces_warning(False): - df = self.read_csv(StringIO(data)) - # Assert that types were coerced. - self.assertTrue(type(df.a[0]) is np.float64) - self.assertEqual(df.a.dtype, np.float) - - def test_warn_if_chunks_have_mismatched_type(self): - # See test in TestCParserLowMemory. - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ['a', 'b'] + integers) - - with tm.assert_produces_warning(False): - df = self.read_csv(StringIO(data)) - self.assertEqual(df.a.dtype, np.object) - - def test_usecols(self): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - - result = self.read_csv(StringIO(data), usecols=(1, 2)) - result2 = self.read_csv(StringIO(data), usecols=('b', 'c')) - exp = self.read_csv(StringIO(data)) - - self.assertEqual(len(result.columns), 2) - self.assertTrue((result['b'] == exp['b']).all()) - self.assertTrue((result['c'] == exp['c']).all()) - - tm.assert_frame_equal(result, result2) - - result = self.read_csv(StringIO(data), usecols=[1, 2], header=0, - names=['foo', 'bar']) - expected = self.read_csv(StringIO(data), usecols=[1, 2]) - expected.columns = ['foo', 'bar'] - tm.assert_frame_equal(result, expected) - - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - result = self.read_csv(StringIO(data), names=['b', 'c'], - header=None, usecols=[1, 2]) - - expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None) - expected = expected[['b', 'c']] - tm.assert_frame_equal(result, expected) - - result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None, usecols=['b', 'c']) - tm.assert_frame_equal(result2, result) - - # 5766 - result = self.read_csv(StringIO(data), names=['a', 'b'], - header=None, usecols=[0, 1]) - - expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None) - expected = expected[['a', 'b']] - tm.assert_frame_equal(result, expected) - - # length conflict, passed names and usecols disagree - self.assertRaises(ValueError, self.read_csv, StringIO(data), - names=['a', 'b'], usecols=[1], header=None) - - def test_integer_overflow_bug(self): - # #2601 - data = "65248E10 11\n55555E55 22\n" - - result = self.read_csv(StringIO(data), header=None, sep=' ') - self.assertTrue(result[0].dtype == np.float64) - - result = self.read_csv(StringIO(data), header=None, sep='\s+') - self.assertTrue(result[0].dtype == np.float64) - - def test_catch_too_many_names(self): - # Issue 5156 - data = """\ -1,2,3 -4,,6 -7,8,9 -10,11,12\n""" - tm.assertRaises(ValueError, read_csv, StringIO(data), - header=0, names=['a', 'b', 'c', 'd']) - - def test_ignore_leading_whitespace(self): - # GH 6607, GH 3374 - data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9' - result = self.read_table(StringIO(data), sep='\s+') - expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]}) - tm.assert_frame_equal(result, expected) - - def test_nrows_and_chunksize_raises_notimplemented(self): - data = 'a b c' - self.assertRaises(NotImplementedError, self.read_csv, StringIO(data), - nrows=10, chunksize=5) - - def test_chunk_begins_with_newline_whitespace(self): - # GH 10022 - data = '\n hello\nworld\n' - result = self.read_csv(StringIO(data), header=None) - self.assertEqual(len(result), 2) - - # GH 9735 - chunk1 = 'a' * (1024 * 256 - 2) + '\na' - chunk2 = '\n a' - result = pd.read_csv(StringIO(chunk1 + chunk2), 
header=None) - expected = pd.DataFrame(['a' * (1024 * 256 - 2), 'a', ' a']) - tm.assert_frame_equal(result, expected) - - def test_empty_with_index(self): - # GH 10184 - data = 'x,y' - result = self.read_csv(StringIO(data), index_col=0) - expected = DataFrame([], columns=['y'], index=Index([], name='x')) - tm.assert_frame_equal(result, expected) - - def test_emtpy_with_multiindex(self): - # GH 10467 - data = 'x,y,z' - result = self.read_csv(StringIO(data), index_col=['x', 'y']) - expected = DataFrame([], columns=['z'], - index=MultiIndex.from_arrays([[]] * 2, names=['x', 'y'])) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_reversed_multiindex(self): - data = 'x,y,z' - result = self.read_csv(StringIO(data), index_col=[1, 0]) - expected = DataFrame([], columns=['z'], - index=MultiIndex.from_arrays([[]] * 2, names=['y', 'x'])) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_index_col_scenarios(self): - data = 'x,y,z' - - # None, no index - index_col, expected = None, DataFrame([], columns=list('xyz')), - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # False, no index - index_col, expected = False, DataFrame([], columns=list('xyz')), - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # int, first column - index_col, expected = 0, DataFrame( - [], columns=['y', 'z'], index=Index([], name='x')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # int, not first column - index_col, expected = 1, DataFrame( - [], columns=['x', 'z'], index=Index([], name='y')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # str, first column - index_col, expected = 'x', DataFrame( - [], columns=['y', 'z'], index=Index([], name='x')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # str, not the first column - index_col, expected = 'y', DataFrame( - [], columns=['x', 'z'], index=Index([], name='y')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # list of int - index_col, expected = [0, 1], DataFrame([], columns=['z'], - index=MultiIndex.from_arrays([[]] * 2, names=['x', 'y'])) - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected, - check_index_type=False) - - # list of str - index_col = ['x', 'y'] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( - [[]] * 2, names=['x', 'y'])) - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected, - check_index_type=False) - - # list of int, reversed sequence - index_col = [1, 0] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( - [[]] * 2, names=['y', 'x'])) - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected, - check_index_type=False) - - # list of str, reversed sequence - index_col = ['y', 'x'] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( - [[]] * 2, names=['y', 'x'])) - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected, - check_index_type=False) - - def test_empty_with_index_col_false(self): - # GH 10413 - data = 'x,y' - result = self.read_csv(StringIO(data), index_col=False) - expected = DataFrame([], columns=['x', 'y']) - tm.assert_frame_equal(result, expected) - - def test_float_parser(self): - # GH 9565 - data = '45e-1,4.5,45.,inf,-inf' 
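# Sketch of the empty-frame behaviour covered by the index_col scenarios
# above (illustrative input, not from this diff): a header-only CSV still
# honours 'index_col' and yields an empty frame whose (Multi)Index
# carries the requested names.
from io import StringIO
import pandas as pd

df = pd.read_csv(StringIO("x,y,z"), index_col=["x", "y"])
print(df.index.names)    # -> ['x', 'y']
print(list(df.columns))  # -> ['z']
print(len(df))           # -> 0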
- result = self.read_csv(StringIO(data), header=None) - expected = pd.DataFrame([[float(s) for s in data.split(',')]]) - tm.assert_frame_equal(result, expected) - - def float_precision_choices(self): - raise AbstractMethodError(self) - - def test_scientific_no_exponent(self): - # See PR 12215 - df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']), - ('y', ['42e']), ('z', ['632E'])]) - data = df.to_csv(index=False) - for prec in self.float_precision_choices(): - df_roundtrip = self.read_csv(StringIO(data), float_precision=prec) - tm.assert_frame_equal(df_roundtrip, df) - - def test_int64_overflow(self): - data = """ID -00013007854817840016671868 -00013007854817840016749251 -00013007854817840016754630 -00013007854817840016781876 -00013007854817840017028824 -00013007854817840017963235 -00013007854817840018860166""" - - result = self.read_csv(StringIO(data)) - self.assertTrue(result['ID'].dtype == object) - - self.assertRaises(OverflowError, self.read_csv, - StringIO(data), converters={'ID': np.int64}) - - # Just inside int64 range: parse as integer - i_max = np.iinfo(np.int64).max - i_min = np.iinfo(np.int64).min - for x in [i_max, i_min]: - result = pd.read_csv(StringIO(str(x)), header=None) - expected = pd.DataFrame([x]) - tm.assert_frame_equal(result, expected) - - # Just outside int64 range: parse as string - too_big = i_max + 1 - too_small = i_min - 1 - for x in [too_big, too_small]: - result = pd.read_csv(StringIO(str(x)), header=None) - expected = pd.DataFrame([str(x)]) - tm.assert_frame_equal(result, expected) - - def test_empty_with_nrows_chunksize(self): - # GH 9535 - expected = pd.DataFrame([], columns=['foo', 'bar']) - - result = self.read_csv(StringIO('foo,bar\n'), nrows=10) - tm.assert_frame_equal(result, expected) - - result = next(iter(pd.read_csv(StringIO('foo,bar\n'), chunksize=10))) - tm.assert_frame_equal(result, expected) - - result = pd.read_csv(StringIO('foo,bar\n'), nrows=10, as_recarray=True) - result = pd.DataFrame(result[2], columns=result[1], index=result[0]) - tm.assert_frame_equal(pd.DataFrame.from_records( - result), expected, check_index_type=False) - - result = next( - iter(pd.read_csv(StringIO('foo,bar\n'), chunksize=10, as_recarray=True))) - result = pd.DataFrame(result[2], columns=result[1], index=result[0]) - tm.assert_frame_equal(pd.DataFrame.from_records( - result), expected, check_index_type=False) - - def test_eof_states(self): - # GH 10728 and 10548 - - # With skip_blank_lines = True - expected = pd.DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) - - # GH 10728 - # WHITESPACE_LINE - data = 'a,b,c\n4,5,6\n ' - result = self.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - # GH 10548 - # EAT_LINE_COMMENT - data = 'a,b,c\n4,5,6\n#comment' - result = self.read_csv(StringIO(data), comment='#') - tm.assert_frame_equal(result, expected) - - # EAT_CRNL_NOP - data = 'a,b,c\n4,5,6\n\r' - result = self.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - # EAT_COMMENT - data = 'a,b,c\n4,5,6#comment' - result = self.read_csv(StringIO(data), comment='#') - tm.assert_frame_equal(result, expected) - - # SKIP_LINE - data = 'a,b,c\n4,5,6\nskipme' - result = self.read_csv(StringIO(data), skiprows=[2]) - tm.assert_frame_equal(result, expected) - - # With skip_blank_lines = False - - # EAT_LINE_COMMENT - data = 'a,b,c\n4,5,6\n#comment' - result = self.read_csv( - StringIO(data), comment='#', skip_blank_lines=False) - expected = pd.DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) - tm.assert_frame_equal(result, expected) - - # 
IN_FIELD - data = 'a,b,c\n4,5,6\n ' - result = self.read_csv(StringIO(data), skip_blank_lines=False) - expected = pd.DataFrame( - [['4', 5, 6], [' ', None, None]], columns=['a', 'b', 'c']) - tm.assert_frame_equal(result, expected) - - # EAT_CRNL - data = 'a,b,c\n4,5,6\n\r' - result = self.read_csv(StringIO(data), skip_blank_lines=False) - expected = pd.DataFrame( - [[4, 5, 6], [None, None, None]], columns=['a', 'b', 'c']) - tm.assert_frame_equal(result, expected) - - # Should produce exceptions - - # ESCAPED_CHAR - data = "a,b,c\n4,5,6\n\\" - self.assertRaises(Exception, self.read_csv, - StringIO(data), escapechar='\\') - - # ESCAPE_IN_QUOTED_FIELD - data = 'a,b,c\n4,5,6\n"\\' - self.assertRaises(Exception, self.read_csv, - StringIO(data), escapechar='\\') - - # IN_QUOTED_FIELD - data = 'a,b,c\n4,5,6\n"' - self.assertRaises(Exception, self.read_csv, - StringIO(data), escapechar='\\') - - def test_grow_boundary_at_cap(self): - # See gh-12494 - # - # Cause of error was the fact that pandas - # was not increasing the buffer size when - # the desired space would fill the buffer - # to capacity, which later would cause a - # buffer overflow error when checking the - # EOF terminator of the CSV stream - def test_empty_header_read(count): - s = StringIO(',' * count) - expected = DataFrame(columns=[ - 'Unnamed: {i}'.format(i=i) - for i in range(count + 1)]) - df = read_csv(s) - tm.assert_frame_equal(df, expected) - - for count in range(1, 101): - test_empty_header_read(count) - - def test_uneven_lines_with_usecols(self): - # See gh-12203 - csv = r"""a,b,c - 0,1,2 - 3,4,5,6,7 - 8,9,10 - """ - - # make sure that an error is still thrown - # when the 'usecols' parameter is not provided - msg = "Expected \d+ fields in line \d+, saw \d+" - with tm.assertRaisesRegexp(ValueError, msg): - df = self.read_csv(StringIO(csv)) - - expected = DataFrame({ - 'a': [0, 3, 8], - 'b': [1, 4, 9] - }) - - usecols = [0, 1] - df = self.read_csv(StringIO(csv), usecols=usecols) - tm.assert_frame_equal(df, expected) - - usecols = ['a', 'b'] - df = self.read_csv(StringIO(csv), usecols=usecols) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_parse_dates(self): - # See gh-9755 - s = """a,b,c,d,e - 0,1,20140101,0900,4 - 0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - - cols = { - 'a' : [0, 0], - 'c_d': [ - Timestamp('2014-01-01 09:00:00'), - Timestamp('2014-01-02 10:00:00') - ] - } - expected = DataFrame(cols, columns=['c_d', 'a']) - - df = self.read_csv(StringIO(s), usecols=[0, 2, 3], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(s), usecols=[3, 0, 2], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_parse_dates_and_full_names(self): - # See gh-9755 - s = """0,1,20140101,0900,4 - 0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - names = list('abcde') - - cols = { - 'a' : [0, 0], - 'c_d': [ - Timestamp('2014-01-01 09:00:00'), - Timestamp('2014-01-02 10:00:00') - ] - } - expected = DataFrame(cols, columns=['c_d', 'a']) - - df = self.read_csv(StringIO(s), names=names, - usecols=[0, 2, 3], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(s), names=names, - usecols=[3, 0, 2], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_parse_dates_and_usecol_names(self): - # See gh-9755 - s = """0,1,20140101,0900,4 - 0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - names = list('acd') - - cols = { - 'a' : [0, 0], - 'c_d': [ - 
Timestamp('2014-01-01 09:00:00'), - Timestamp('2014-01-02 10:00:00') - ] - } - expected = DataFrame(cols, columns=['c_d', 'a']) - - df = self.read_csv(StringIO(s), names=names, - usecols=[0, 2, 3], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(s), names=names, - usecols=[3, 0, 2], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - def test_mixed_dtype_usecols(self): - # See gh-12678 - data = """a,b,c - 1000,2000,3000 - 4000,5000,6000 - """ - msg = ("The elements of \'usecols\' " - "must either be all strings " - "or all integers") - usecols = [0, 'b', 2] - - with tm.assertRaisesRegexp(ValueError, msg): - self.read_csv(StringIO(data), usecols=usecols) - - def test_usecols_with_integer_like_header(self): - data = """2,0,1 - 1000,2000,3000 - 4000,5000,6000 - """ - - usecols = [0, 1] # column selection by index - expected = DataFrame(data=[[1000, 2000], - [4000, 5000]], - columns=['2', '0']) - df = self.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(df, expected) - - usecols = ['0', '1'] # column selection by name - expected = DataFrame(data=[[2000, 3000], - [5000, 6000]], - columns=['0', '1']) - df = self.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(df, expected) - - def test_read_empty_with_usecols(self): - # See gh-12493 - names = ['Dummy', 'X', 'Dummy_2'] - usecols = names[1:2] # ['X'] - - # first, check to see that the response of - # parser when faced with no provided columns - # throws the correct error, with or without usecols - errmsg = "No columns to parse from file" - - with tm.assertRaisesRegexp(EmptyDataError, errmsg): - self.read_csv(StringIO('')) - - with tm.assertRaisesRegexp(EmptyDataError, errmsg): - self.read_csv(StringIO(''), usecols=usecols) - - expected = DataFrame(columns=usecols, index=[0], dtype=np.float64) - df = self.read_csv(StringIO(',,'), names=names, usecols=usecols) - tm.assert_frame_equal(df, expected) - - expected = DataFrame(columns=usecols) - df = self.read_csv(StringIO(''), names=names, usecols=usecols) - tm.assert_frame_equal(df, expected) - - def test_read_with_bad_header(self): - errmsg = "but only \d+ lines in file" - - with tm.assertRaisesRegexp(ValueError, errmsg): - s = StringIO(',,') - self.read_csv(s, header=[10]) - - def test_read_only_header_no_rows(self): - # See gh-7773 - expected = DataFrame(columns=['a', 'b', 'c']) - - df = self.read_csv(StringIO('a,b,c')) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO('a,b,c'), index_col=False) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_newline(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line 11 -line 12",2 -2,"line 21 -line 22",2 -3,"line 31",1""" - expected = [[2, 'line 21\nline 22', 2], - [3, 'line 31', 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = ('a,b,c\n~a\n b~,~e\n d~,' - '~f\n f~\n1,2,~12\n 13\n 14~') - expected = [['a\n b', 'e\n d', 'f\n f']] - expected = DataFrame(expected, columns=[ - 'a', 'b', 'c']) - df = self.read_csv(StringIO(data), - quotechar="~", - skiprows=[2]) - tm.assert_frame_equal(df, expected) - - data = ('Text,url\n~example\n ' - 'sentence\n one~,url1\n~' - 'example\n sentence\n two~,url2\n~' - 'example\n sentence\n three~,url3') - expected = [['example\n sentence\n two', 'url2']] - expected = DataFrame(expected, columns=[ - 'Text', 'url']) - df = self.read_csv(StringIO(data), 
- quotechar="~", - skiprows=[1, 3]) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line '11' line 12",2 -2,"line '21' line 22",2 -3,"line '31' line 32",1""" - expected = [[2, "line '21' line 22", 2], - [3, "line '31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_newline_and_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line \n'11' line 12",2 -2,"line \n'21' line 22",2 -3,"line \n'31' line 32",1""" - expected = [[2, "line \n'21' line 22", 2], - [3, "line \n'31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = """id,text,num_lines -1,"line '11\n' line 12",2 -2,"line '21\n' line 22",2 -3,"line '31\n' line 32",1""" - expected = [[2, "line '21\n' line 22", 2], - [3, "line '31\n' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = """id,text,num_lines -1,"line '11\n' \r\tline 12",2 -2,"line '21\n' \r\tline 22",2 -3,"line '31\n' \r\tline 32",1""" - expected = [[2, "line '21\n' \r\tline 22", 2], - [3, "line '31\n' \r\tline 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - def test_line_comment(self): - data = """# empty -A,B,C -1,2.,4.#hello world -#ignore this line -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - df = self.read_csv(StringIO(data), comment='#') - tm.assert_almost_equal(df.values, expected) - # check with delim_whitespace=True - df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', - delim_whitespace=True) - tm.assert_almost_equal(df.values, expected) - - def test_skiprows_lineterminator(self): - # see gh-9079 - data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', - '2007/01/01 01:00 0.2140 U M ', - '2007/01/01 02:00 0.2141 M O ', - '2007/01/01 04:00 0.2142 D M ']) - expected = pd.DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], - ['2007/01/01', '02:00', 0.2141, 'M', 'O'], - ['2007/01/01', '04:00', 0.2142, 'D', 'M']], - columns=['date', 'time', 'var', 'flag', - 'oflag']) - # test with default lineterminators LF and CRLF - # "CR" is not respected with the Python parser, so - # there is a separate test "test_skiprows_lineterminator_cr" - # in the C engine for that - df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - df = self.read_csv(StringIO(data.replace('\n', '\r\n')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - def test_trailing_spaces(self): - data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" - expected = pd.DataFrame([[1., 2., 4.], - [5.1, np.nan, 10.]]) - - # gh-8661, gh-8679: this should ignore six lines including - # lines with trailing whitespace and blank lines - df = self.read_csv(StringIO(data.replace(',', ' ')), - header=None, delim_whitespace=True, - skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) - 
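# Hedged sketch of what the skiprow tests above assert (gh-12775,
# gh-10911): 'skiprows' counts CSV records, so a quoted field spanning
# several physical lines is skipped as a single row. The data here is
# invented for illustration.
from io import StringIO
import pandas as pd

data = 'id,text\n1,"line 11\nline 12"\n2,"line 21"\n'
df = pd.read_csv(StringIO(data), skiprows=[1])
print(df.values.tolist())  # -> [[2, 'line 21']]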
tm.assert_frame_equal(df, expected) - df = self.read_table(StringIO(data.replace(',', ' ')), - header=None, delim_whitespace=True, - skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) - tm.assert_frame_equal(df, expected) - - # gh-8983: test skipping set of rows after a row with trailing spaces - expected = pd.DataFrame({"A": [1., 5.1], "B": [2., np.nan], - "C": [4., 10]}) - df = self.read_table(StringIO(data.replace(',', ' ')), - delim_whitespace=True, - skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True) - tm.assert_frame_equal(df, expected) - - def test_raise_on_sep_with_delim_whitespace(self): - # see gh-6607 - data = 'a b c\n1 2 3' - with tm.assertRaisesRegexp(ValueError, 'you can only specify one'): - self.read_table(StringIO(data), sep='\s', delim_whitespace=True) - - def test_single_char_leading_whitespace(self): - # see gh-9710 - data = """\ -MyColumn - a - b - a - b\n""" - - expected = DataFrame({'MyColumn': list('abab')}) - - result = self.read_csv(StringIO(data), delim_whitespace=True, - skipinitialspace=True) - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), skipinitialspace=True) - tm.assert_frame_equal(result, expected) - - def test_usecols_with_whitespace(self): - data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' - - result = self.read_csv(StringIO(data), delim_whitespace=True, - usecols=('a', 'b')) - expected = DataFrame({'a': ['apple', 'orange'], - 'b': ['bat', 'cow']}, index=[4, 8]) - - tm.assert_frame_equal(result, expected) - - -class CompressionTests(object): - def test_zip(self): - try: - import zipfile - except ImportError: - raise nose.SkipTest('need zipfile to run') - - with open(self.csv1, 'rb') as data_file: - data = data_file.read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean('test_file.zip') as path: - tmp = zipfile.ZipFile(path, mode='w') - tmp.writestr('test_file', data) - tmp.close() - - result = self.read_csv(path, compression='zip') - tm.assert_frame_equal(result, expected) - - result = self.read_csv(path, compression='infer') - tm.assert_frame_equal(result, expected) - - if self.engine is not 'python': - with open(path, 'rb') as f: - result = self.read_csv(f, compression='zip') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean('combined_zip.zip') as path: - inner_file_names = ['test_file', 'second_file'] - tmp = zipfile.ZipFile(path, mode='w') - for file_name in inner_file_names: - tmp.writestr(file_name, data) - tmp.close() - - self.assertRaisesRegexp(ValueError, 'Multiple files', self.read_csv, - path, compression='zip') - - self.assertRaisesRegexp(ValueError, 'Multiple files', self.read_csv, - path, compression='infer') - - with tm.ensure_clean() as path: - tmp = zipfile.ZipFile(path, mode='w') - tmp.close() - - self.assertRaisesRegexp(ValueError, 'Zero files',self.read_csv, - path, compression='zip') - - with tm.ensure_clean() as path: - with open(path, 'wb') as f: - self.assertRaises(zipfile.BadZipfile, self.read_csv, f, compression='zip') - - - def test_gzip(self): - try: - import gzip - except ImportError: - raise nose.SkipTest('need gzip to run') - - with open(self.csv1, 'rb') as data_file: - data = data_file.read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, compression='gzip') - tm.assert_frame_equal(result, expected) - - with open(path, 'rb') as f: - result = self.read_csv(f, compression='gzip') - tm.assert_frame_equal(result, expected) - - with 
tm.ensure_clean('test.gz') as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - result = self.read_csv(path, compression='infer') - tm.assert_frame_equal(result, expected) - - def test_bz2(self): - try: - import bz2 - except ImportError: - raise nose.SkipTest('need bz2 to run') - - with open(self.csv1, 'rb') as data_file: - data = data_file.read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, compression='bz2') - tm.assert_frame_equal(result, expected) - - self.assertRaises(ValueError, self.read_csv, - path, compression='bz3') - - with open(path, 'rb') as fin: - if compat.PY3: - result = self.read_csv(fin, compression='bz2') - tm.assert_frame_equal(result, expected) - elif self.engine is not 'python': - self.assertRaises(ValueError, self.read_csv, - fin, compression='bz2') - - with tm.ensure_clean('test.bz2') as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - result = self.read_csv(path, compression='infer') - tm.assert_frame_equal(result, expected) - - def test_xz(self): - lzma = tm._skip_if_no_lzma() - - with open(self.csv1, 'rb') as data_file: - data = data_file.read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = lzma.LZMAFile(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, compression='xz') - tm.assert_frame_equal(result, expected) - - with open(path, 'rb') as f: - result = self.read_csv(f, compression='xz') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean('test.xz') as path: - tmp = lzma.LZMAFile(path, mode='wb') - tmp.write(data) - tmp.close() - result = self.read_csv(path, compression='infer') - tm.assert_frame_equal(result, expected) - - def test_decompression_regex_sep(self): - try: - import gzip - import bz2 - except ImportError: - raise nose.SkipTest('need gzip and bz2 to run') - - with open(self.csv1, 'rb') as data_file: - data = data_file.read() - data = data.replace(b',', b'::') - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - - # GH 6607 - # Test currently only valid with the python engine because of - # regex sep. Temporarily copied to TestPythonParser. - # Here test for ValueError when passing regex sep: - - with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX - result = self.read_csv(path, sep='::', compression='gzip', engine='c') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean() as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - - # GH 6607 - with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX - result = self.read_csv(path, sep='::', compression='bz2', engine='c') - tm.assert_frame_equal(result, expected) - - self.assertRaises(ValueError, self.read_csv, - path, compression='bz3') - - -class TestPythonParser(ParserTests, CompressionTests, tm.TestCase): - - engine = 'python' - - def test_negative_skipfooter_raises(self): - text = """#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -#foo,a,b,c -1/1/2000,1.,2.,3. 
-1/2/2000,4,5,6 -1/3/2000,7,8,9 -""" - - with tm.assertRaisesRegexp(ValueError, - 'skip footer cannot be negative'): - df = self.read_csv(StringIO(text), skipfooter=-1) - - def read_csv(self, *args, **kwds): - kwds = kwds.copy() - kwds['engine'] = self.engine - return read_csv(*args, **kwds) - - def read_table(self, *args, **kwds): - kwds = kwds.copy() - kwds['engine'] = self.engine - return read_table(*args, **kwds) - - def float_precision_choices(self): - return [None] - - def test_sniff_delimiter(self): - text = """index|A|B|C -foo|1|2|3 -bar|4|5|6 -baz|7|8|9 -""" - data = self.read_csv(StringIO(text), index_col=0, sep=None) - self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz']))) - - data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|') - tm.assert_frame_equal(data, data2) - - text = """ignore this -ignore this too -index|A|B|C -foo|1|2|3 -bar|4|5|6 -baz|7|8|9 -""" - data3 = self.read_csv(StringIO(text), index_col=0, - sep=None, skiprows=2) - tm.assert_frame_equal(data, data3) - - text = u("""ignore this -ignore this too -index|A|B|C -foo|1|2|3 -bar|4|5|6 -baz|7|8|9 -""").encode('utf-8') - - s = BytesIO(text) - if compat.PY3: - # somewhat False since the code never sees bytes - from io import TextIOWrapper - s = TextIOWrapper(s, encoding='utf-8') - - data4 = self.read_csv(s, index_col=0, sep=None, skiprows=2, - encoding='utf-8') - tm.assert_frame_equal(data, data4) - - def test_regex_separator(self): - data = """ A B C D -a 1 2 3 4 -b 1 2 3 4 -c 1 2 3 4 -""" - df = self.read_table(StringIO(data), sep='\s+') - expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)), - index_col=0) - self.assertIsNone(expected.index.name) - tm.assert_frame_equal(df, expected) - - def test_1000_fwf(self): - data = """ - 1 2,334.0 5 -10 13 10. -""" - expected = [[1, 2334., 5], - [10, 13, 10]] - df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)], - thousands=',') - tm.assert_almost_equal(df.values, expected) - - def test_1000_sep_with_decimal(self): - data = """A|B|C -1|2,334.01|5 -10|13|10. -""" - - expected = DataFrame({ - 'A': [1, 10], - 'B': [2334.01, 13], - 'C': [5, 10.] - }) - - df = self.read_csv(StringIO(data), sep='|', thousands=',') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data), sep='|', thousands=',') - tm.assert_frame_equal(df, expected) - - def test_comment_fwf(self): - data = """ - 1 2. 
4 #hello world - 5 NaN 10.0 -""" - expected = [[1, 2., 4], - [5, np.nan, 10.]] - df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)], - comment='#') - tm.assert_almost_equal(df.values, expected) - - def test_fwf(self): - data_expected = """\ -2011,58,360.242940,149.910199,11950.7 -2011,59,444.953632,166.985655,11788.4 -2011,60,364.136849,183.628767,11806.2 -2011,61,413.836124,184.375703,11916.8 -2011,62,502.953953,173.237159,12468.3 -""" - expected = self.read_csv(StringIO(data_expected), header=None) - - data1 = """\ -201158 360.242940 149.910199 11950.7 -201159 444.953632 166.985655 11788.4 -201160 364.136849 183.628767 11806.2 -201161 413.836124 184.375703 11916.8 -201162 502.953953 173.237159 12468.3 -""" - colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] - df = read_fwf(StringIO(data1), colspecs=colspecs, header=None) - tm.assert_frame_equal(df, expected) - - data2 = """\ -2011 58 360.242940 149.910199 11950.7 -2011 59 444.953632 166.985655 11788.4 -2011 60 364.136849 183.628767 11806.2 -2011 61 413.836124 184.375703 11916.8 -2011 62 502.953953 173.237159 12468.3 -""" - df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None) - tm.assert_frame_equal(df, expected) - - # From Thomas Kluyver: apparently some non-space filler characters can - # be seen, this is supported by specifying the 'delimiter' character: - # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html - data3 = """\ -201158~~~~360.242940~~~149.910199~~~11950.7 -201159~~~~444.953632~~~166.985655~~~11788.4 -201160~~~~364.136849~~~183.628767~~~11806.2 -201161~~~~413.836124~~~184.375703~~~11916.8 -201162~~~~502.953953~~~173.237159~~~12468.3 -""" - df = read_fwf( - StringIO(data3), colspecs=colspecs, delimiter='~', header=None) - tm.assert_frame_equal(df, expected) - - with tm.assertRaisesRegexp(ValueError, "must specify only one of"): - read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7]) - - with tm.assertRaisesRegexp(ValueError, "Must specify either"): - read_fwf(StringIO(data3), colspecs=None, widths=None) - - def test_fwf_colspecs_is_list_or_tuple(self): - with tm.assertRaisesRegexp(TypeError, - 'column specifications must be a list or ' - 'tuple.+'): - pd.io.parsers.FixedWidthReader(StringIO(self.data1), - {'a': 1}, ',', '#') - - def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self): - with tm.assertRaisesRegexp(TypeError, - 'Each column specification must be.+'): - read_fwf(StringIO(self.data1), [('a', 1)]) - - def test_fwf_colspecs_None(self): - # GH 7079 - data = """\ -123456 -456789 -""" - colspecs = [(0, 3), (3, None)] - result = read_fwf(StringIO(data), colspecs=colspecs, header=None) - expected = DataFrame([[123, 456], [456, 789]]) - tm.assert_frame_equal(result, expected) - - colspecs = [(None, 3), (3, 6)] - result = read_fwf(StringIO(data), colspecs=colspecs, header=None) - expected = DataFrame([[123, 456], [456, 789]]) - tm.assert_frame_equal(result, expected) - - colspecs = [(0, None), (3, None)] - result = read_fwf(StringIO(data), colspecs=colspecs, header=None) - expected = DataFrame([[123456, 456], [456789, 789]]) - tm.assert_frame_equal(result, expected) - - colspecs = [(None, None), (3, 6)] - result = read_fwf(StringIO(data), colspecs=colspecs, header=None) - expected = DataFrame([[123456, 456], [456789, 789]]) - tm.assert_frame_equal(result, expected) - - def test_fwf_regression(self): - # GH 3594 - # turns out 'T060' is parsable as a datetime slice! 
- - tzlist = [1, 10, 20, 30, 60, 80, 100] - ntz = len(tzlist) - tcolspecs = [16] + [8] * ntz - tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]] - data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192 - 2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869 - 2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657 - 2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 - 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 -""" - - df = read_fwf(StringIO(data), - index_col=0, - header=None, - names=tcolnames, - widths=tcolspecs, - parse_dates=True, - date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S')) - - for c in df.columns: - res = df.loc[:, c] - self.assertTrue(len(res)) - - def test_fwf_for_uint8(self): - data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 -1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" - df = read_fwf(StringIO(data), - colspecs=[(0, 17), (25, 26), (33, 37), - (49, 51), (58, 62), (63, 1000)], - names=['time', 'pri', 'pgn', 'dst', 'src', 'data'], - converters={ - 'pgn': lambda x: int(x, 16), - 'src': lambda x: int(x, 16), - 'dst': lambda x: int(x, 16), - 'data': lambda x: len(x.split(' '))}) - - expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8], - [1421302964.226776, 6, 61442, None, 71, 8]], - columns=["time", "pri", "pgn", "dst", "src", "data"]) - expected["dst"] = expected["dst"].astype(object) - - tm.assert_frame_equal(df, expected) - - def test_fwf_compression(self): - try: - import gzip - import bz2 - except ImportError: - raise nose.SkipTest("Need gzip and bz2 to run this test") - - data = """1111111111 - 2222222222 - 3333333333""".strip() - widths = [5, 5] - names = ['one', 'two'] - expected = read_fwf(StringIO(data), widths=widths, names=names) - if compat.PY3: - data = bytes(data, encoding='utf-8') - comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)] - for comp_name, compresser in comps: - with tm.ensure_clean() as path: - tmp = compresser(path, mode='wb') - tmp.write(data) - tmp.close() - result = read_fwf(path, widths=widths, names=names, - compression=comp_name) - tm.assert_frame_equal(result, expected) - - def test_BytesIO_input(self): - if not compat.PY3: - raise nose.SkipTest( - "Bytes-related test - only needs to work on Python 3") - result = pd.read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[ - 2, 2], encoding='utf8') - expected = pd.DataFrame([["של", "ום"]], columns=["של", "ום"]) - tm.assert_frame_equal(result, expected) - data = BytesIO("שלום::1234\n562::123".encode('cp1255')) - result = pd.read_table(data, sep="::", engine='python', - encoding='cp1255') - expected = pd.DataFrame([[562, 123]], columns=["שלום", "1234"]) - tm.assert_frame_equal(result, expected) - - def test_verbose_import(self): - text = """a,b,c,d -one,1,2,3 -one,1,2,3 -,1,2,3 -one,1,2,3 -,1,2,3 -,1,2,3 -one,1,2,3 -two,1,2,3""" - - buf = StringIO() - sys.stdout = buf - - try: - # it works! - df = self.read_csv(StringIO(text), verbose=True) - self.assertEqual( - buf.getvalue(), 'Filled 3 NA values in column a\n') - finally: - sys.stdout = sys.__stdout__ - - buf = StringIO() - sys.stdout = buf - - text = """a,b,c,d -one,1,2,3 -two,1,2,3 -three,1,2,3 -four,1,2,3 -five,1,2,3 -,1,2,3 -seven,1,2,3 -eight,1,2,3""" - - try: - # it works! 
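# Sketch of fixed-width parsing as exercised by the read_fwf tests
# above: 'colspecs' are half-open (start, stop) character offsets.
# Input and column names are invented for illustration.
from io import StringIO
import pandas as pd

data = "201158 360.242940\n201159 444.953632\n"
df = pd.read_fwf(StringIO(data), colspecs=[(0, 4), (4, 6), (6, 17)],
                 names=["year", "week", "value"], header=None)
print(df.loc[0, "value"])  # -> 360.24294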
- df = self.read_csv(StringIO(text), verbose=True, index_col=0) - self.assertEqual( - buf.getvalue(), 'Filled 1 NA values in column a\n') - finally: - sys.stdout = sys.__stdout__ - - def test_float_precision_specified(self): - # Should raise an error if float_precision (C parser option) is - # specified - with tm.assertRaisesRegexp(ValueError, "The 'float_precision' option " - "is not supported with the 'python' engine"): - self.read_csv(StringIO('a,b,c\n1,2,3'), float_precision='high') - - def test_iteration_open_handle(self): - if PY3: - raise nose.SkipTest( - "won't work in Python 3 {0}".format(sys.version_info)) - - with tm.ensure_clean() as path: - with open(path, 'wb') as f: - f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG') - - with open(path, 'rb') as f: - for line in f: - if 'CCC' in line: - break - - try: - read_table(f, squeeze=True, header=None, engine='c') - except Exception: - pass - else: - raise ValueError('this should not happen') - - result = read_table(f, squeeze=True, header=None, - engine='python') - - expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0) - tm.assert_series_equal(result, expected) - - def test_iterator(self): - # GH 6607 - # This is a copy which should eventually be merged into ParserTests - # when the issue with the C parser is fixed - - reader = self.read_csv(StringIO(self.data1), index_col=0, - iterator=True) - df = self.read_csv(StringIO(self.data1), index_col=0) - - chunk = reader.read(3) - tm.assert_frame_equal(chunk, df[:3]) - - last_chunk = reader.read(5) - tm.assert_frame_equal(last_chunk, df[3:]) - - # pass list - lines = list(csv.reader(StringIO(self.data1))) - parser = TextParser(lines, index_col=0, chunksize=2) - - df = self.read_csv(StringIO(self.data1), index_col=0) - - chunks = list(parser) - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) - - # pass skiprows - parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) - chunks = list(parser) - tm.assert_frame_equal(chunks[0], df[1:3]) - - # test bad parameter (skip_footer) - reader = self.read_csv(StringIO(self.data1), index_col=0, - iterator=True, skip_footer=True) - self.assertRaises(ValueError, reader.read, 3) - - treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, - iterator=True) - tm.assertIsInstance(treader, TextFileReader) - - # stopping iteration when on chunksize is specified, GH 3967 - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - reader = self.read_csv(StringIO(data), iterator=True) - result = list(reader) - expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ - 3, 6, 9]), index=['foo', 'bar', 'baz']) - tm.assert_frame_equal(result[0], expected) - - # chunksize = 1 - reader = self.read_csv(StringIO(data), chunksize=1) - result = list(reader) - expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ - 3, 6, 9]), index=['foo', 'bar', 'baz']) - self.assertEqual(len(result), 3) - tm.assert_frame_equal(pd.concat(result), expected) - - def test_single_line(self): - # GH 6607 - # This is a copy which should eventually be merged into ParserTests - # when the issue with the C parser is fixed - - # sniff separator - buf = StringIO() - sys.stdout = buf - - # printing warning message when engine == 'c' for now - - try: - # it works! 
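# Sketch of the iterator protocol the test above relies on:
# 'iterator=True' returns a reader exposing .read(n), while
# 'chunksize=n' yields successive DataFrame chunks. Toy data below.
from io import StringIO
import pandas as pd

data = "A,B\n1,2\n3,4\n5,6\n"
reader = pd.read_csv(StringIO(data), iterator=True)
first = reader.read(2)  # first two rows
rest = reader.read()    # the remaining row
assert len(first) == 2 and len(rest) == 1

chunks = list(pd.read_csv(StringIO(data), chunksize=1))
assert len(chunks) == 3  # one single-row DataFrame per chunk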
- df = self.read_csv(StringIO('1,2'), names=['a', 'b'], - header=None, sep=None) - tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df) - finally: - sys.stdout = sys.__stdout__ - - def test_malformed(self): - # GH 6607 - # This is a copy which should eventually be merged into ParserTests - # when the issue with the C parser is fixed - - # all - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -""" - - try: - df = self.read_table( - StringIO(data), sep=',', header=1, comment='#') - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) - - # skip_footer - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -footer -""" - - try: - df = self.read_table( - StringIO(data), sep=',', header=1, comment='#', - skip_footer=1) - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) - - # first chunk - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - try: - it = self.read_table(StringIO(data), sep=',', - header=1, comment='#', iterator=True, chunksize=1, - skiprows=[2]) - df = it.read(5) - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) - - # middle chunk - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - try: - it = self.read_table(StringIO(data), sep=',', header=1, - comment='#', iterator=True, chunksize=1, - skiprows=[2]) - df = it.read(1) - it.read(2) - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) - - # last chunk - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - try: - it = self.read_table(StringIO(data), sep=',', - header=1, comment='#', iterator=True, chunksize=1, - skiprows=[2]) - df = it.read(1) - it.read() - self.assertTrue(False) - except Exception as inst: - self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) - - def test_skip_footer(self): - # GH 6607 - # This is a copy which should eventually be merged into ParserTests - # when the issue with the C parser is fixed - - data = """A,B,C -1,2,3 -4,5,6 -7,8,9 -want to skip this -also also skip this -""" - result = self.read_csv(StringIO(data), skip_footer=2) - no_footer = '\n'.join(data.split('\n')[:-3]) - expected = self.read_csv(StringIO(no_footer)) - - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), nrows=3) - tm.assert_frame_equal(result, expected) - - # skipfooter alias - result = self.read_csv(StringIO(data), skipfooter=2) - no_footer = '\n'.join(data.split('\n')[:-3]) - expected = self.read_csv(StringIO(no_footer)) - - tm.assert_frame_equal(result, expected) - - def test_decompression_regex_sep(self): - # GH 6607 - # This is a copy which should eventually be moved to ParserTests - # when the issue with the C parser is fixed - - try: - import gzip - import bz2 - except ImportError: - raise nose.SkipTest('need gzip and bz2 to run') - - data = open(self.csv1, 'rb').read() - data = data.replace(b',', b'::') - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, sep='::', compression='gzip') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean() as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, sep='::', compression='bz2') - 
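# Sketch of the engine split behind the regex-sep tests above:
# multi-character separators are a Python-engine feature, and asking
# for the C engine explicitly raises. Toy data for illustration.
from io import StringIO
import pandas as pd

data = "a::b\n1::2\n"
df = pd.read_csv(StringIO(data), sep="::", engine="python")
print(df.columns.tolist())  # -> ['a', 'b']

try:
    pd.read_csv(StringIO(data), sep="::", engine="c")
except ValueError as err:
    print(err)  # the C engine rejects regex separators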
tm.assert_frame_equal(result, expected) - - self.assertRaises(ValueError, self.read_csv, - path, compression='bz3') - - def test_read_table_buglet_4x_multiindex(self): - # GH 6607 - # This is a copy which should eventually be merged into ParserTests - # when the issue with multi-level index is fixed in the C parser. - - text = """ A B C D E -one two three four -a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 -a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 -x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - - # it works! - df = self.read_table(StringIO(text), sep='\s+') - self.assertEqual(df.index.names, ('one', 'two', 'three', 'four')) - - # GH 6893 - data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9' - expected = DataFrame.from_records([(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], - columns=list('abcABC'), index=list('abc')) - actual = self.read_table(StringIO(data), sep='\s+') - tm.assert_frame_equal(actual, expected) - - def test_empty_lines(self): - data = """\ -A,B,C -1,2.,4. - - -5.,NaN,10.0 - --70,.4,1 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.], - [-70., .4, 1.]] - df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, expected) - df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') - tm.assert_almost_equal(df.values, expected) - expected = [[1., 2., 4.], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [5., np.nan, 10.], - [np.nan, np.nan, np.nan], - [-70., .4, 1.]] - df = self.read_csv(StringIO(data), skip_blank_lines=False) - tm.assert_almost_equal(list(df.values), list(expected)) - - def test_whitespace_lines(self): - data = """ - -\t \t\t - \t -A,B,C - \t 1,2.,4. -5.,NaN,10.0 -""" - expected = [[1, 2., 4.], - [5., np.nan, 10.]] - df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, expected) - - -class TestFwfColspaceSniffing(tm.TestCase): - - def test_full_file(self): - # File with all values - test = '''index A B C -2000-01-03T00:00:00 0.980268513777 3 foo -2000-01-04T00:00:00 1.04791624281 -4 bar -2000-01-05T00:00:00 0.498580885705 73 baz -2000-01-06T00:00:00 1.12020151869 1 foo -2000-01-07T00:00:00 0.487094399463 0 bar -2000-01-10T00:00:00 0.836648671666 2 baz -2000-01-11T00:00:00 0.157160753327 34 foo''' - colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - tm.assert_frame_equal(expected, read_fwf(StringIO(test))) - - def test_full_file_with_missing(self): - # File with missing values - test = '''index A B C -2000-01-03T00:00:00 0.980268513777 3 foo -2000-01-04T00:00:00 1.04791624281 -4 bar - 0.498580885705 73 baz -2000-01-06T00:00:00 1.12020151869 1 foo -2000-01-07T00:00:00 0 bar -2000-01-10T00:00:00 0.836648671666 2 baz - 34''' - colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - tm.assert_frame_equal(expected, read_fwf(StringIO(test))) - - def test_full_file_with_spaces(self): - # File with spaces in columns - test = ''' -Account Name Balance CreditLimit AccountCreated -101 Keanu Reeves 9315.45 10000.00 1/17/1998 -312 Gerard Butler 90.00 1000.00 8/6/2003 -868 Jennifer Love Hewitt 0 17000.00 5/25/1985 -761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 -317 Bill Murray 789.65 5000.00 2/5/2007 -'''.strip('\r\n') - colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - tm.assert_frame_equal(expected, read_fwf(StringIO(test))) - - def test_full_file_with_spaces_and_missing(self): - # File with spaces and missing values in columsn - 
test = ''' -Account Name Balance CreditLimit AccountCreated -101 10000.00 1/17/1998 -312 Gerard Butler 90.00 1000.00 8/6/2003 -868 5/25/1985 -761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 -317 Bill Murray 789.65 -'''.strip('\r\n') - colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - tm.assert_frame_equal(expected, read_fwf(StringIO(test))) - - def test_messed_up_data(self): - # Completely messed up file - test = ''' - Account Name Balance Credit Limit Account Created - 101 10000.00 1/17/1998 - 312 Gerard Butler 90.00 1000.00 - - 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 - 317 Bill Murray 789.65 -'''.strip('\r\n') - colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) - expected = read_fwf(StringIO(test), colspecs=colspecs) - tm.assert_frame_equal(expected, read_fwf(StringIO(test))) - - def test_multiple_delimiters(self): - test = r''' -col1~~~~~col2 col3++++++++++++++++++col4 -~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves - 33+++122.33\\\bar.........Gerard Butler -++44~~~~12.01 baz~~Jennifer Love Hewitt -~~55 11+++foo++++Jada Pinkett-Smith -..66++++++.03~~~bar Bill Murray -'''.strip('\r\n') - colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) - expected = read_fwf(StringIO(test), colspecs=colspecs, - delimiter=' +~.\\') - tm.assert_frame_equal(expected, read_fwf(StringIO(test), - delimiter=' +~.\\')) - - def test_variable_width_unicode(self): - if not compat.PY3: - raise nose.SkipTest( - 'Bytes-related test - only needs to work on Python 3') - test = ''' -שלום שלום -ום שלל -של ום -'''.strip('\r\n') - expected = pd.read_fwf(BytesIO(test.encode('utf8')), - colspecs=[(0, 4), (5, 9)], header=None, encoding='utf8') - tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')), - header=None, encoding='utf8')) - -class CParserTests(ParserTests): - """ base class for CParser Testsing """ - - def float_precision_choices(self): - return [None, 'high', 'round_trip'] - - def test_buffer_overflow(self): - # GH9205 - # test certain malformed input files that cause buffer overflows in - # tokenizer.c - malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer - malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer - malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer - for malf in (malfw, malfs, malfl): - try: - df = self.read_table(StringIO(malf)) - except Exception as cperr: - self.assertIn( - 'Buffer overflow caught - possible malformed input file.', str(cperr)) - - def test_buffer_rd_bytes(self): - # GH 12098 - # src->buffer can be freed twice leading to a segfault if a corrupt - # gzip file is read with read_csv and the buffer is filled more than - # once before gzip throws an exception - - data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \ - '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \ - '\xA6\x4D' + '\x55' * 267 + \ - '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \ - '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO' - for i in range(100): - try: - _ = self.read_csv(StringIO(data), - compression='gzip', - delim_whitespace=True) - except Exception as e: - pass - - def test_delim_whitespace_custom_terminator(self): - # See gh-12912 - data = """a b c~1 2 3~4 5 6~7 8 9""" - df = self.read_csv(StringIO(data), lineterminator='~', - delim_whitespace=True) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=['a', 'b', 'c']) - tm.assert_frame_equal(df, expected) - - def test_line_comment_customterm(self): - # TODO: move into 
ParserTests once Python supports custom terminator - data = """# empty -A,B,C -1,2.,4.#hello world -#ignore this line -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#', - lineterminator='*') - tm.assert_almost_equal(df.values, expected) - - def test_skiprows_lineterminator_cr(self): - # see gh-9079 - # TODO: move into ParserTests once Python supports custom terminator - data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', - '2007/01/01 01:00 0.2140 U M ', - '2007/01/01 02:00 0.2141 M O ', - '2007/01/01 04:00 0.2142 D M ']) - expected = pd.DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], - ['2007/01/01', '02:00', 0.2141, 'M', 'O'], - ['2007/01/01', '04:00', 0.2142, 'D', 'M']], - columns=['date', 'time', 'var', 'flag', - 'oflag']) - # test with the three default lineterminators LF, CR and CRLF - df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - df = self.read_csv(StringIO(data.replace('\n', '\r')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - df = self.read_csv(StringIO(data.replace('\n', '\r\n')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - -class TestCParserHighMemory(CParserTests, CompressionTests, tm.TestCase): - engine = 'c' - - def read_csv(self, *args, **kwds): - kwds = kwds.copy() - kwds['engine'] = self.engine - kwds['low_memory'] = False - return read_csv(*args, **kwds) - - def read_table(self, *args, **kwds): - kwds = kwds.copy() - kwds['engine'] = self.engine - kwds['low_memory'] = False - return read_table(*args, **kwds) - - def test_compact_ints(self): - if compat.is_platform_windows(): - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - result = read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - def test_parse_dates_empty_string(self): - # #2263 - s = StringIO("Date, test\n2012-01-01, 1\n,2") - result = self.read_csv(s, parse_dates=["Date"], na_filter=False) - self.assertTrue(result['Date'].isnull()[1]) - - def test_usecols(self): - raise nose.SkipTest( - "Usecols is not supported in C High Memory engine.") - - def test_comment_skiprows(self): - data = """# empty -random line -# second empty line -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - # this should ignore the first four lines (including comments) - df = self.read_csv(StringIO(data), comment='#', skiprows=4) - tm.assert_almost_equal(df.values, expected) - - def test_comment_header(self): - data = """# empty -# second empty line -1,2,3 -A,B,C -1,2.,4. 
-5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - # header should begin at the second non-comment line - df = self.read_csv(StringIO(data), comment='#', header=1) - tm.assert_almost_equal(df.values, expected) - - def test_comment_skiprows_header(self): - data = """# empty -# second empty line -# third empty line -X,Y,Z -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - # skiprows should skip the first 4 lines (including comments), while - # header should start from the second non-commented line starting - # with line 5 - df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) - tm.assert_almost_equal(df.values, expected) - - def test_empty_lines(self): - data = """\ -A,B,C -1,2.,4. - - -5.,NaN,10.0 - --70,.4,1 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.], - [-70., .4, 1.]] - df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, expected) - df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') - tm.assert_almost_equal(df.values, expected) - expected = [[1., 2., 4.], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [5., np.nan, 10.], - [np.nan, np.nan, np.nan], - [-70., .4, 1.]] - df = self.read_csv(StringIO(data), skip_blank_lines=False) - tm.assert_almost_equal(list(df.values), list(expected)) - - def test_whitespace_lines(self): - data = """ - -\t \t\t - \t -A,B,C - \t 1,2.,4. -5.,NaN,10.0 -""" - expected = [[1, 2., 4.], - [5., np.nan, 10.]] - df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, expected) - - def test_passing_dtype(self): - # GH 6607 - # This is a copy which should eventually be merged into ParserTests - # when the dtype argument is supported by all engines. - - df = DataFrame(np.random.rand(5, 2), columns=list( - 'AB'), index=['1A', '1B', '1C', '1D', '1E']) - - with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: - df.to_csv(path) - - # GH 3795 - # passing 'str' as the dtype - result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes, Series( - {'A': 'object', 'B': 'object'})) - - # we expect all object columns, so need to convert to test for - # equivalence - result = result.astype(float) - tm.assert_frame_equal(result, df) - - # invalid dtype - self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'foo', 'B': 'float64'}, - index_col=0) - - # valid but we don't support it (date) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, - index_col=0) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, - index_col=0, parse_dates=['B']) - - # valid but we don't support it - self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'timedelta64', 'B': 'float64'}, - index_col=0) - - # empty frame - # GH12048 - actual = self.read_csv(StringIO('A,B'), dtype=str) - expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) - tm.assert_frame_equal(actual, expected) - - def test_dtype_and_names_error(self): - - # GH 8833 - # passing both dtype and names resulting in an error reporting issue - - data = """ -1.0 1 -2.0 2 -3.0 3 -""" - # base cases - result = self.read_csv(StringIO(data), sep='\s+', header=None) - expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), sep='\s+', - header=None, names=['a', 'b']) - expected = DataFrame( - [[1.0, 1], [2.0, 2], [3.0, 3]], columns=['a', 'b']) - tm.assert_frame_equal(result, expected) - 
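# Sketch of the comment/header interaction asserted above: lines that
# are entirely comments are dropped before 'header' (and 'skiprows')
# are applied, so header=1 picks the second *non-comment* line.
# Illustrative input only.
from io import StringIO
import pandas as pd

data = "# banner\n# more banner\n1,2,3\nA,B,C\n1,2.,4.\n"
df = pd.read_csv(StringIO(data), comment="#", header=1)
print(df.columns.tolist())  # -> ['A', 'B', 'C']
print(df.values.tolist())   # -> [[1.0, 2.0, 4.0]]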
- # fallback casting - result = self.read_csv(StringIO( - data), sep='\s+', header=None, names=['a', 'b'], dtype={'a': np.int32}) - expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.int32) - tm.assert_frame_equal(result, expected) - - data = """ -1.0 1 -nan 2 -3.0 3 -""" - # fallback casting, but not castable - with tm.assertRaisesRegexp(ValueError, 'cannot safely convert'): - self.read_csv(StringIO(data), sep='\s+', header=None, - names=['a', 'b'], dtype={'a': np.int32}) - - def test_fallback_to_python(self): - # GH 6607 - data = 'a b c\n1 2 3' - - # specify C engine with unsupported options (raise) - with tm.assertRaisesRegexp(ValueError, 'does not support'): - self.read_table(StringIO(data), engine='c', sep=None, - delim_whitespace=False) - with tm.assertRaisesRegexp(ValueError, 'does not support'): - self.read_table(StringIO(data), engine='c', sep='\s') - with tm.assertRaisesRegexp(ValueError, 'does not support'): - self.read_table(StringIO(data), engine='c', skip_footer=1) - - -class TestCParserLowMemory(CParserTests, CompressionTests, tm.TestCase): - - engine = 'c' - - def read_csv(self, *args, **kwds): - kwds = kwds.copy() - kwds['engine'] = self.engine - kwds['low_memory'] = True - kwds['buffer_lines'] = 2 - return read_csv(*args, **kwds) - - def read_table(self, *args, **kwds): - kwds = kwds.copy() - kwds['engine'] = self.engine - kwds['low_memory'] = True - kwds['buffer_lines'] = 2 - return read_table(*args, **kwds) - - def test_compact_ints(self): - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - result = read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.to_records(index=False).dtype, ex_dtype) - - result = read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.to_records(index=False).dtype, ex_dtype) - - def test_compact_ints_as_recarray(self): - if compat.is_platform_windows(): - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - result = read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - def test_precise_conversion(self): - # GH #8002 - tm._skip_if_32bit() - from decimal import Decimal - normal_errors = [] - precise_errors = [] - for num in np.linspace(1., 2., num=500): # test numbers between 1 and 2 - text = 'a\n{0:.25}'.format(num) # 25 decimal digits of precision - normal_val = float(self.read_csv(StringIO(text))['a'][0]) - precise_val = float(self.read_csv( - StringIO(text), float_precision='high')['a'][0]) - roundtrip_val = float(self.read_csv( - StringIO(text), float_precision='round_trip')['a'][0]) - actual_val = Decimal(text[2:]) - - def error(val): - return abs(Decimal('{0:.100}'.format(val)) - actual_val) - normal_errors.append(error(normal_val)) - precise_errors.append(error(precise_val)) - # round-trip should match float() - self.assertEqual(roundtrip_val, float(text[2:])) - self.assertTrue(sum(precise_errors) <= sum(normal_errors)) 
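# Sketch of the float_precision trade-off measured above: the default
# C tokenizer is fast but may be an ulp off, while 'round_trip' is
# asserted to match Python's own float(). The literal below is
# arbitrary.
from io import StringIO
import pandas as pd

text = "a\n1.2345678901234567890123456\n"
roundtrip = pd.read_csv(StringIO(text),
                        float_precision="round_trip")["a"][0]
assert roundtrip == float("1.2345678901234567890123456")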
- self.assertTrue(max(precise_errors) <= max(normal_errors)) - - def test_pass_dtype(self): - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) - self.assertEqual(result['one'].dtype, 'u1') - self.assertEqual(result['two'].dtype, 'object') - - def test_pass_dtype_as_recarray(self): - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - if compat.is_platform_windows(): - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}, - as_recarray=True) - self.assertEqual(result['one'].dtype, 'u1') - self.assertEqual(result['two'].dtype, 'S1') - - def test_empty_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) - - expected = DataFrame({'one': np.empty(0, dtype='u1'), - 'two': np.empty(0, dtype=np.object)}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_index_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), index_col=['one'], - dtype={'one': 'u1', 1: 'f'}) - - expected = DataFrame({'two': np.empty(0, dtype='f')}, - index=Index([], dtype='u1', name='one')) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_multiindex_pass_dtype(self): - data = 'one,two,three' - result = self.read_csv(StringIO(data), index_col=['one', 'two'], - dtype={'one': 'u1', 1: 'f8'}) - - exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), np.empty(0, dtype='O')], - names=['one', 'two']) - expected = DataFrame( - {'three': np.empty(0, dtype=np.object)}, index=exp_idx) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={ - 'one': 'u1', 'one.1': 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_indexes(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_dup_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv( - StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'}) - expected = pd.concat([Series([], name='one', dtype='u1')] * 2, axis=1) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_dup_column_pass_dtype_by_indexes(self): - ### FIXME in GH9424 - raise nose.SkipTest( - "GH 9424; known failure read_csv with duplicate columns") - - data = 'one,one' - result = self.read_csv( - StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'}) - expected = pd.concat([Series([], name='one', dtype='u1'), - Series([], name='one', dtype='f')], axis=1) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_usecols_dtypes(self): - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - - result = self.read_csv(StringIO(data), usecols=(0, 1, 2), - names=('a', 'b', 'c'), - header=None, - converters={'a': str}, - dtype={'b': int, 'c': float}, - ) - result2 = self.read_csv(StringIO(data), usecols=(0, 2), - names=('a', 'b', 'c'), - header=None, - converters={'a': str}, - dtype={'b': int, 'c': 
float}, - ) - self.assertTrue((result.dtypes == [object, np.int, np.float]).all()) - self.assertTrue((result2.dtypes == [object, np.float]).all()) - - def test_usecols_implicit_index_col(self): - # #2654 - data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' - - result = self.read_csv(StringIO(data), usecols=['a', 'b']) - expected = DataFrame({'a': ['apple', 'orange'], - 'b': ['bat', 'cow']}, index=[4, 8]) - - tm.assert_frame_equal(result, expected) - - def test_usecols_regex_sep(self): - # #2733 - data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' - - df = self.read_csv(StringIO(data), sep='\s+', usecols=('a', 'b')) - - expected = DataFrame({'a': ['apple', 'orange'], - 'b': ['bat', 'cow']}, index=[4, 8]) - tm.assert_frame_equal(df, expected) - - def test_pure_python_failover(self): - data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" - - result = self.read_csv(StringIO(data), comment='#') - expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}) - tm.assert_frame_equal(result, expected) - - - def test_memory_map(self): - # it works! - result = self.read_csv(self.csv1, memory_map=True) - - def test_disable_bool_parsing(self): - # #2090 - - data = """A,B,C -Yes,No,Yes -No,Yes,Yes -Yes,,Yes -No,No,No""" - - result = read_csv(StringIO(data), dtype=object) - self.assertTrue((result.dtypes == object).all()) - - result = read_csv(StringIO(data), dtype=object, na_filter=False) - self.assertEqual(result['B'][2], '') - - def test_euro_decimal_format(self): - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - - df2 = self.read_csv(StringIO(data), sep=';', decimal=',') - self.assertEqual(df2['Number1'].dtype, float) - self.assertEqual(df2['Number2'].dtype, float) - self.assertEqual(df2['Number3'].dtype, float) - - def test_custom_lineterminator(self): - data = 'a,b,c~1,2,3~4,5,6' - - result = self.read_csv(StringIO(data), lineterminator='~') - expected = self.read_csv(StringIO(data.replace('~', '\n'))) - - tm.assert_frame_equal(result, expected) - - data2 = data.replace('~', '~~') - result = self.assertRaises(ValueError, read_csv, StringIO(data2), - lineterminator='~~') - - def test_raise_on_passed_int_dtype_with_nas(self): - # #2631 - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - self.assertRaises(ValueError, read_csv, StringIO(data), sep=",", - skipinitialspace=True, - dtype={'DOY': np.int64}) - - def test_na_trailing_columns(self): - data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax -2012-03-14,USD,AAPL,BUY,1000 -2012-05-12,USD,SBUX,SELL,500""" - - result = self.read_csv(StringIO(data)) - self.assertEqual(result['Date'][1], '2012-05-12') - self.assertTrue(result['UnitPrice'].isnull().all()) - - def test_parse_ragged_csv(self): - data = """1,2,3 -1,2,3,4 -1,2,3,4,5 -1,2 -1,2,3,4""" - - nice_data = """1,2,3,, -1,2,3,4, -1,2,3,4,5 -1,2,,, -1,2,3,4,""" - result = self.read_csv(StringIO(data), header=None, - names=['a', 'b', 'c', 'd', 'e']) - - expected = self.read_csv(StringIO(nice_data), header=None, - names=['a', 'b', 'c', 'd', 'e']) - - tm.assert_frame_equal(result, expected) - - # too many columns, cause segfault if not careful - data = "1,2\n3,4,5" - - result = self.read_csv(StringIO(data), header=None, - names=lrange(50)) - expected = self.read_csv(StringIO(data), header=None, - names=lrange(3)).reindex(columns=lrange(50)) - - tm.assert_frame_equal(result, expected) - - def test_tokenize_CR_with_quoting(self): - # #3453, 
this doesn't work with Python parser for some reason - - data = ' a,b,c\r"a,b","e,d","f,f"' - - result = self.read_csv(StringIO(data), header=None) - expected = self.read_csv(StringIO(data.replace('\r', '\n')), - header=None) - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data)) - expected = self.read_csv(StringIO(data.replace('\r', '\n'))) - tm.assert_frame_equal(result, expected) - - def test_raise_on_no_columns(self): - # single newline - data = "\n" - self.assertRaises(ValueError, self.read_csv, StringIO(data)) - - # test with more than a single newline - data = "\n\n\n" - self.assertRaises(ValueError, self.read_csv, StringIO(data)) - - def test_warn_if_chunks_have_mismatched_type(self): - # Issue #3866 If chunks are different types and can't - # be coerced using numerical types, then issue warning. - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ['a', 'b'] + integers) - - with tm.assert_produces_warning(DtypeWarning): - df = self.read_csv(StringIO(data)) - self.assertEqual(df.a.dtype, np.object) - - def test_invalid_c_parser_opts_with_not_c_parser(self): - from pandas.io.parsers import _c_parser_defaults as c_defaults - from pandas.io.parsers import _python_unsupported as py_unsupported - - data = """1,2,3,, -1,2,3,4, -1,2,3,4,5 -1,2,,, -1,2,3,4,""" - - engines = 'python', 'python-fwf' - for default in c_defaults: - for engine in engines: - if 'python' in engine and default not in py_unsupported: - continue - - kwargs = {default: object()} - with tm.assertRaisesRegexp(ValueError, - 'The %r option is not supported ' - 'with the %r engine' % (default, - engine)): - read_csv(StringIO(data), engine=engine, **kwargs) - - def test_passing_dtype(self): - # GH 6607 - # This is a copy which should eventually be merged into ParserTests - # when the dtype argument is supported by all engines. 
- - df = DataFrame(np.random.rand(5, 2), columns=list( - 'AB'), index=['1A', '1B', '1C', '1D', '1E']) - - with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: - df.to_csv(path) - - # GH 3795 - # passing 'str' as the dtype - result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes, Series( - {'A': 'object', 'B': 'object'})) - - # we expect all object columns, so need to convert to test for - # equivalence - result = result.astype(float) - tm.assert_frame_equal(result, df) - - # invalid dtype - self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'foo', 'B': 'float64'}, - index_col=0) - - # valid but we don't support it (date) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, - index_col=0) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, - index_col=0, parse_dates=['B']) - - # valid but we don't support it - self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'timedelta64', 'B': 'float64'}, - index_col=0) - - def test_fallback_to_python(self): - # GH 6607 - data = 'a b c\n1 2 3' - - # specify C engine with C-unsupported options (raise) - with tm.assertRaisesRegexp(ValueError, 'does not support'): - self.read_table(StringIO(data), engine='c', sep=None, - delim_whitespace=False) - with tm.assertRaisesRegexp(ValueError, 'does not support'): - self.read_table(StringIO(data), engine='c', sep='\s') - with tm.assertRaisesRegexp(ValueError, 'does not support'): - self.read_table(StringIO(data), engine='c', skip_footer=1) - - def test_bool_header_arg(self): - # GH 6114 - data = """\ -MyColumn - a - b - a - b""" - for arg in [True, False]: - with tm.assertRaises(TypeError): - pd.read_csv(StringIO(data), header=arg) - with tm.assertRaises(TypeError): - pd.read_table(StringIO(data), header=arg) - with tm.assertRaises(TypeError): - pd.read_fwf(StringIO(data), header=arg) - - def test_multithread_stringio_read_csv(self): - # GH 11786 - max_row_range = 10000 - num_files = 100 - - bytes_to_df = [ - '\n'.join( - ['%d,%d,%d' % (i, i, i) for i in range(max_row_range)] - ).encode() for j in range(num_files)] - files = [BytesIO(b) for b in bytes_to_df] - - # Read all files in many threads - pool = ThreadPool(8) - results = pool.map(pd.read_csv, files) - first_result = results[0] - - for result in results: - tm.assert_frame_equal(first_result, result) - - def test_multithread_path_multipart_read_csv(self): - # GH 11786 - num_tasks = 4 - file_name = '__threadpool_reader__.csv' - num_rows = 100000 - - df = self.construct_dataframe(num_rows) - - with tm.ensure_clean(file_name) as path: - df.to_csv(path) - - final_dataframe = self.generate_multithread_dataframe(path, - num_rows, - num_tasks) - tm.assert_frame_equal(df, final_dataframe) - - -class TestMiscellaneous(tm.TestCase): - - # for tests that don't fit into any of the other classes, e.g. 
those that - # compare results for different engines or test the behavior when 'engine' - # is not passed - - def test_compare_whitespace_regex(self): - # GH 6607 - data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9' - result_c = pd.read_table(StringIO(data), sep='\s+', engine='c') - result_py = pd.read_table(StringIO(data), sep='\s+', engine='python') - print(result_c) - tm.assert_frame_equal(result_c, result_py) - - def test_fallback_to_python(self): - # GH 6607 - data = 'a b c\n1 2 3' - - # specify C-unsupported options with python-unsupported option - # (options will be ignored on fallback, raise) - with tm.assertRaisesRegexp(ValueError, 'Falling back'): - pd.read_table(StringIO(data), sep=None, - delim_whitespace=False, dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, 'Falling back'): - pd.read_table(StringIO(data), sep='\s', dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, 'Falling back'): - pd.read_table(StringIO(data), skip_footer=1, dtype={'a': float}) - - # specify C-unsupported options without python-unsupported options - with tm.assert_produces_warning(parsers.ParserWarning): - pd.read_table(StringIO(data), sep=None, delim_whitespace=False) - with tm.assert_produces_warning(parsers.ParserWarning): - pd.read_table(StringIO(data), sep='\s') - with tm.assert_produces_warning(parsers.ParserWarning): - pd.read_table(StringIO(data), skip_footer=1) - - -class TestParseSQL(tm.TestCase): - - def test_convert_sql_column_floats(self): - arr = np.array([1.5, None, 3, 4.2], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_strings(self): - arr = np.array(['1.5', None, '3', '4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) - assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_unicode(self): - arr = np.array([u('1.5'), None, u('3'), u('4.2')], - dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], - dtype=object) - assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_ints(self): - arr = np.array([1, 2, 3, 4], dtype='O') - arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') - result = lib.convert_sql_column(arr) - result2 = lib.convert_sql_column(arr2) - expected = np.array([1, 2, 3, 4], dtype='i8') - assert_same_values_and_dtype(result, expected) - assert_same_values_and_dtype(result2, expected) - - arr = np.array([1, 2, 3, None, 4], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_longs(self): - arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, 4], dtype='i8') - assert_same_values_and_dtype(result, expected) - - arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_bools(self): - arr = np.array([True, False, True, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, True, False], dtype=bool) - assert_same_values_and_dtype(result, expected) - - arr = np.array([True, False, None, False], dtype='O') - result = 
lib.convert_sql_column(arr) - expected = np.array([True, False, np.nan, False], dtype=object) - assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_decimals(self): - from decimal import Decimal - arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - assert_same_values_and_dtype(result, expected) - - -class TestUrlGz(tm.TestCase): - - def setUp(self): - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salary.table') - self.local_table = read_table(localtable) - - @tm.network - def test_url_gz(self): - url = 'https://raw.github.com/pydata/pandas/master/pandas/io/tests/data/salary.table.gz' - url_table = read_table(url, compression="gzip", engine="python") - tm.assert_frame_equal(url_table, self.local_table) - - @tm.network - def test_url_gz_infer(self): - url = ('https://s3.amazonaws.com/pandas-test/salary.table.gz') - url_table = read_table(url, compression="infer", engine="python") - tm.assert_frame_equal(url_table, self.local_table) - - -class TestS3(tm.TestCase): - - def setUp(self): - try: - import boto - except ImportError: - raise nose.SkipTest("boto not installed") - - @tm.network - def test_parse_public_s3_bucket(self): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. - self.assertRaises(ValueError, pd.read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df = pd.read_csv('s3://pandas-test/tips.csv' + - ext, compression=comp) - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv( - tm.get_data_path('tips.csv')), df) - - # Read public file from bucket with not-public contents - df = pd.read_csv('s3://cant_get_it/tips.csv') - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df) - - @tm.network - def test_parse_public_s3n_bucket(self): - # Read from AWS s3 as "s3n" URL - df = pd.read_csv('s3n://pandas-test/tips.csv', nrows=10) - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) - - @tm.network - def test_parse_public_s3a_bucket(self): - # Read from AWS s3 as "s3a" URL - df = pd.read_csv('s3a://pandas-test/tips.csv', nrows=10) - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) - - @tm.network - def test_parse_public_s3_bucket_nrows(self): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. 
- self.assertRaises(ValueError, pd.read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df = pd.read_csv('s3://pandas-test/tips.csv' + - ext, nrows=10, compression=comp) - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) - - @tm.network - def test_parse_public_s3_bucket_chunked(self): - # Read with a chunksize - chunksize = 5 - local_tips = pd.read_csv(tm.get_data_path('tips.csv')) - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. - self.assertRaises(ValueError, pd.read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df_reader = pd.read_csv('s3://pandas-test/tips.csv' + ext, - chunksize=chunksize, compression=comp) - self.assertEqual(df_reader.chunksize, chunksize) - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them - # properly. - df = df_reader.get_chunk() - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - true_df = local_tips.iloc[ - chunksize * i_chunk: chunksize * (i_chunk + 1)] - # Chunking doesn't preserve row numbering - true_df = true_df.reset_index().drop('index', axis=1) - tm.assert_frame_equal(true_df, df) - - @tm.network - def test_parse_public_s3_bucket_chunked_python(self): - # Read with a chunksize using the Python parser - chunksize = 5 - local_tips = pd.read_csv(tm.get_data_path('tips.csv')) - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df_reader = pd.read_csv('s3://pandas-test/tips.csv' + ext, - chunksize=chunksize, compression=comp, - engine='python') - self.assertEqual(df_reader.chunksize, chunksize) - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them properly. 
- df = df_reader.get_chunk() - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - true_df = local_tips.iloc[ - chunksize * i_chunk: chunksize * (i_chunk + 1)] - # Chunking doesn't preserve row numbering - true_df = true_df.reset_index().drop('index', axis=1) - tm.assert_frame_equal(true_df, df) - - @tm.network - def test_parse_public_s3_bucket_python(self): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = pd.read_csv('s3://pandas-test/tips.csv' + ext, engine='python', - compression=comp) - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv( - tm.get_data_path('tips.csv')), df) - - @tm.network - def test_infer_s3_compression(self): - for ext in ['', '.gz', '.bz2']: - df = pd.read_csv('s3://pandas-test/tips.csv' + ext, - engine='python', compression='infer') - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv( - tm.get_data_path('tips.csv')), df) - - @tm.network - def test_parse_public_s3_bucket_nrows_python(self): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = pd.read_csv('s3://pandas-test/tips.csv' + ext, engine='python', - nrows=10, compression=comp) - self.assertTrue(isinstance(df, pd.DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) - - @tm.network - def test_s3_fails(self): - import boto - with tm.assertRaisesRegexp(boto.exception.S3ResponseError, - 'S3ResponseError: 404 Not Found'): - pd.read_csv('s3://nyqpug/asdf.csv') - - # Receive a permission error when trying to read a private bucket. - # It's irrelevant here that this isn't actually a table. - with tm.assertRaisesRegexp(boto.exception.S3ResponseError, - 'S3ResponseError: 403 Forbidden'): - pd.read_csv('s3://cant_get_it/') - - -def assert_same_values_and_dtype(res, exp): - tm.assert_equal(res.dtype, exp.dtype) - tm.assert_almost_equal(res, exp) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 5b1c82f8ff5e7..6912e3a7ff68c 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -1,12 +1,18 @@ # -*- coding: utf-8 -*- + from datetime import datetime, timedelta, date, time import numpy as np - import pandas as pd import pandas.lib as lib import pandas.util.testing as tm -from pandas.compat import u, PY2 + +from pandas.compat import long, u, PY2 + + +def _assert_same_values_and_dtype(res, exp): + tm.assert_equal(res.dtype, exp.dtype) + tm.assert_almost_equal(res, exp) class TestMisc(tm.TestCase): @@ -249,6 +255,71 @@ def test_lisscalar_pandas_containers(self): self.assertFalse(lib.isscalar(pd.Index([1]))) +class TestParseSQL(tm.TestCase): + + def test_convert_sql_column_floats(self): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + _assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_strings(self): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + _assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_unicode(self): + arr = np.array([u('1.5'), None, u('3'), u('4.2')], + dtype=object) + result = lib.convert_sql_column(arr) + expected = 
np.array([u('1.5'), np.nan, u('3'), u('4.2')], + dtype=object) + _assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_ints(self): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + _assert_same_values_and_dtype(result, expected) + _assert_same_values_and_dtype(result2, expected) + + arr = np.array([1, 2, 3, None, 4], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + _assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_longs(self): + arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, 4], dtype='i8') + _assert_same_values_and_dtype(result, expected) + + arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + _assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_bools(self): + arr = np.array([True, False, True, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + _assert_same_values_and_dtype(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + _assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_decimals(self): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + _assert_same_values_and_dtype(result, expected) + if __name__ == '__main__': import nose diff --git a/setup.py b/setup.py index 29d6ce2ab5b46..3da90bc446aae 100755 --- a/setup.py +++ b/setup.py @@ -584,6 +584,8 @@ def pxd(name): 'pandas.types', 'pandas.io.tests', 'pandas.io.tests.json', + 'pandas.io.tests.parser', + 'pandas.io.tests.sas', 'pandas.stats.tests', 'pandas.msgpack' ],