--- build/html/user_guide/Orig_io.html 2019-08-23 21:06:44.000000000 +0530
+++ build/html/user_guide/io.html 2019-08-22 20:09:45.000000000 +0530
@@ -7,7 +7,7 @@
- IO tools (text, CSV, HDF5, …) — pandas 0.24.0.dev0+2875.g37d420afc.dirty documentation
+ IO tools (text, CSV, HDF5, …) — pandas 0.24.0.dev0+2867.g035e35b36.dirty documentation
    @@ -359,23 +359,27 @@ ['bar', 'foo'] order.

    If callable, the callable function will be evaluated against the column names, returning names where the callable function evaluates to True:

    -
    In [1]: from io import StringIO, BytesIO
    +
    In [1]: import pandas as pd
     
    -In [2]: data = ('col1,col2,col3\n'
    +In [2]: pd.options.display.max_rows = 15
    +
    +In [3]: from io import StringIO
    +
    +In [4]: data = ('col1,col2,col3\n'
        ...:         'a,b,1\n'
        ...:         'a,b,2\n'
        ...:         'c,d,3')
        ...: 
     
    -In [3]: pd.read_csv(StringIO(data))
    -Out[3]: 
    +In [5]: pd.read_csv(StringIO(data))
    +Out[5]: 
       col1 col2  col3
     0    a    b     1
     1    a    b     2
     2    c    d     3
     
    -In [4]: pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])
    -Out[4]: 
    +In [6]: pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])
    +Out[6]: 
       col1  col3
     0    a     1
     1    a     2
    @@ -423,21 +427,21 @@
     of the file.

    If callable, the callable function will be evaluated against the row indices, returning True if the row should be skipped and False otherwise:

    -
    In [5]: data = ('col1,col2,col3\n'
    +
    In [7]: data = ('col1,col2,col3\n'
        ...:         'a,b,1\n'
        ...:         'a,b,2\n'
        ...:         'c,d,3')
        ...: 
     
    -In [6]: pd.read_csv(StringIO(data))
    -Out[6]: 
    +In [8]: pd.read_csv(StringIO(data))
    +Out[8]: 
       col1 col2  col3
     0    a    b     1
     1    a    b     2
     2    c    d     3
     
    -In [7]: pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)
    -Out[7]: 
    +In [9]: pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)
    +Out[9]: 
       col1 col2  col3
     0    a    b     2
     
    @@ -617,36 +621,40 @@

    Specifying column data types

    You can indicate the data type for the whole DataFrame or individual columns:

    -
    In [8]: data = ('a,b,c,d\n'
    -   ...:         '1,2,3,4\n'
    -   ...:         '5,6,7,8\n'
    -   ...:         '9,10,11')
    -   ...: 
    +
    In [10]: import numpy as np
    +
    +In [11]: np.set_printoptions(precision=4, suppress=True)
     
    -In [9]: print(data)
    +In [12]: data = ('a,b,c,d\n'
    +   ....:         '1,2,3,4\n'
    +   ....:         '5,6,7,8\n'
    +   ....:         '9,10,11')
    +   ....: 
    +
    +In [13]: print(data)
     a,b,c,d
     1,2,3,4
     5,6,7,8
     9,10,11
     
    -In [10]: df = pd.read_csv(StringIO(data), dtype=object)
    +In [14]: df = pd.read_csv(StringIO(data), dtype=object)
     
    -In [11]: df
    -Out[11]: 
    +In [15]: df
    +Out[15]: 
        a   b   c    d
     0  1   2   3    4
     1  5   6   7    8
     2  9  10  11  NaN
     
    -In [12]: df['a'][0]
    -Out[12]: '1'
    +In [16]: df['a'][0]
    +Out[16]: '1'
     
    -In [13]: df = pd.read_csv(StringIO(data),
    +In [17]: df = pd.read_csv(StringIO(data),
        ....:                  dtype={'b': object, 'c': np.float64, 'd': 'Int64'})
        ....: 
     
    -In [14]: df.dtypes
    -Out[14]: 
    +In [18]: df.dtypes
    +Out[18]: 
     a      int64
     b     object
     c    float64
    @@ -661,45 +669,45 @@
     pandas.

    For instance, you can use the converters argument of read_csv():

    -
    In [15]: data = ("col_1\n"
    +
    In [19]: data = ("col_1\n"
        ....:         "1\n"
        ....:         "2\n"
        ....:         "'A'\n"
        ....:         "4.22")
        ....: 
     
    -In [16]: df = pd.read_csv(StringIO(data), converters={'col_1': str})
    +In [20]: df = pd.read_csv(StringIO(data), converters={'col_1': str})
     
    -In [17]: df
    -Out[17]: 
    +In [21]: df
    +Out[21]: 
       col_1
     0     1
     1     2
     2   'A'
     3  4.22
     
    -In [18]: df['col_1'].apply(type).value_counts()
    -Out[18]: 
    +In [22]: df['col_1'].apply(type).value_counts()
    +Out[22]: 
     <class 'str'>    4
     Name: col_1, dtype: int64
     

    Or you can use the to_numeric() function to coerce the dtypes after reading in the data,

    -
    In [19]: df2 = pd.read_csv(StringIO(data))
    +
    In [23]: df2 = pd.read_csv(StringIO(data))
     
    -In [20]: df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce')
    +In [24]: df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce')
     
    -In [21]: df2
    -Out[21]: 
    +In [25]: df2
    +Out[25]: 
        col_1
     0   1.00
     1   2.00
     2    NaN
     3   4.22
     
    -In [22]: df2['col_1'].apply(type).value_counts()
    -Out[22]: 
    +In [26]: df2['col_1'].apply(type).value_counts()
    +Out[26]: 
     <class 'float'>    4
     Name: col_1, dtype: int64
     
    @@ -725,22 +733,22 @@ dtypes of your columns, the parsing engine will go and infer the dtypes for different chunks of the data, rather than the whole dataset at once. Consequently, you can end up with column(s) with mixed dtypes. For example,

    -
    In [23]: col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000))
    +
    In [27]: col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000))
     
    -In [24]: df = pd.DataFrame({'col_1': col_1})
    +In [28]: df = pd.DataFrame({'col_1': col_1})
     
    -In [25]: df.to_csv('foo.csv')
    +In [29]: df.to_csv('foo.csv')
     
    -In [26]: mixed_df = pd.read_csv('foo.csv')
    +In [30]: mixed_df = pd.read_csv('foo.csv')
     
    -In [27]: mixed_df['col_1'].apply(type).value_counts()
    -Out[27]: 
    +In [31]: mixed_df['col_1'].apply(type).value_counts()
    +Out[31]: 
     <class 'int'>    737858
     <class 'str'>    262144
     Name: col_1, dtype: int64
     
    -In [28]: mixed_df['col_1'].dtype
    -Out[28]: dtype('O')
    +In [32]: mixed_df['col_1'].dtype
    +Out[32]: dtype('O')
     

will result in mixed_df containing an int dtype for certain chunks @@ -753,28 +761,28 @@
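One way to avoid the mixed result is to pin the column's type up front, or to disable the chunked type inference altogether. A minimal sketch, assuming the same foo.csv written above (dtype and low_memory are both ordinary read_csv keywords):

import pandas as pd

# Pin the column type so every chunk is parsed the same way.
clean_df = pd.read_csv('foo.csv', dtype={'col_1': str})

# Or turn off chunked inference so the whole column is inferred in one pass
# (this uses more memory).
clean_df = pd.read_csv('foo.csv', low_memory=False)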

    Specifying categorical dtype

    Categorical columns can be parsed directly by specifying dtype='category' or dtype=CategoricalDtype(categories, ordered).

    -
    In [29]: data = ('col1,col2,col3\n'
    +
    In [33]: data = ('col1,col2,col3\n'
        ....:         'a,b,1\n'
        ....:         'a,b,2\n'
        ....:         'c,d,3')
        ....: 
     
    -In [30]: pd.read_csv(StringIO(data))
    -Out[30]: 
    +In [34]: pd.read_csv(StringIO(data))
    +Out[34]: 
       col1 col2  col3
     0    a    b     1
     1    a    b     2
     2    c    d     3
     
    -In [31]: pd.read_csv(StringIO(data)).dtypes
    -Out[31]: 
    +In [35]: pd.read_csv(StringIO(data)).dtypes
    +Out[35]: 
     col1    object
     col2    object
     col3     int64
     dtype: object
     
    -In [32]: pd.read_csv(StringIO(data), dtype='category').dtypes
    -Out[32]: 
    +In [36]: pd.read_csv(StringIO(data), dtype='category').dtypes
    +Out[36]: 
     col1    category
     col2    category
     col3    category
    @@ -783,8 +791,8 @@
     

    Individual columns can be parsed as a Categorical using a dict specification:

    -
    In [33]: pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
    -Out[33]: 
    +
    In [37]: pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
    +Out[37]: 
     col1    category
     col2      object
     col3       int64
    @@ -799,12 +807,12 @@
     control on the categories and order, create a
     CategoricalDtype ahead of time, and pass that for
     that column’s dtype.

    -
    In [34]: from pandas.api.types import CategoricalDtype
    +
    In [38]: from pandas.api.types import CategoricalDtype
     
    -In [35]: dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True)
    +In [39]: dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True)
     
    -In [36]: pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes
    -Out[36]: 
    +In [40]: pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes
    +Out[40]: 
     col1    category
     col2      object
     col3       int64
    @@ -813,10 +821,10 @@
     

    When using dtype=CategoricalDtype, “unexpected” values outside of dtype.categories are treated as missing values.

    -
    In [37]: dtype = CategoricalDtype(['a', 'b', 'd'])  # No 'c'
    +
    In [41]: dtype = CategoricalDtype(['a', 'b', 'd'])  # No 'c'
     
    -In [38]: pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1
    -Out[38]: 
    +In [42]: pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1
    +Out[42]: 
     0      a
     1      a
     2    NaN
    @@ -833,27 +841,27 @@
     converter such as to_datetime().

    When dtype is a CategoricalDtype with homogeneous categories (all numeric, all datetimes, etc.), the conversion is done automatically.
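    For instance, a minimal sketch (assuming the same data string as above): passing integer categories means col3 is converted on read, with no manual to_numeric step afterwards.

from io import StringIO
import pandas as pd
from pandas.api.types import CategoricalDtype

data = ('col1,col2,col3\n'
        'a,b,1\n'
        'a,b,2\n'
        'c,d,3')

# Integer categories are homogeneous, so the parsed values are cast for us.
dtype = CategoricalDtype([1, 2, 3])
df = pd.read_csv(StringIO(data), dtype={'col3': dtype})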

    -
    In [39]: df = pd.read_csv(StringIO(data), dtype='category')
    +
    In [43]: df = pd.read_csv(StringIO(data), dtype='category')
     
    -In [40]: df.dtypes
    -Out[40]: 
    +In [44]: df.dtypes
    +Out[44]: 
     col1    category
     col2    category
     col3    category
     dtype: object
     
    -In [41]: df['col3']
    -Out[41]: 
    +In [45]: df['col3']
    +Out[45]: 
     0    1
     1    2
     2    3
     Name: col3, dtype: category
     Categories (3, object): [1, 2, 3]
     
    -In [42]: df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
    +In [46]: df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
     
    -In [43]: df['col3']
    -Out[43]: 
    +In [47]: df['col3']
    +Out[47]: 
     0    1
     1    2
     2    3
    @@ -869,20 +877,20 @@
     

    Handling column names

    A file may or may not have a header row. pandas assumes the first row should be used as the column names:

    -
    In [44]: data = ('a,b,c\n'
    +
    In [48]: data = ('a,b,c\n'
        ....:         '1,2,3\n'
        ....:         '4,5,6\n'
        ....:         '7,8,9')
        ....: 
     
    -In [45]: print(data)
    +In [49]: print(data)
     a,b,c
     1,2,3
     4,5,6
     7,8,9
     
    -In [46]: pd.read_csv(StringIO(data))
    -Out[46]: 
    +In [50]: pd.read_csv(StringIO(data))
    +Out[50]: 
        a  b  c
     0  1  2  3
     1  4  5  6
    @@ -892,21 +900,21 @@
     

    By specifying the names argument in conjunction with header you can indicate other names to use and whether or not to throw away the header row (if any):

    -
    In [47]: print(data)
    +
    In [51]: print(data)
     a,b,c
     1,2,3
     4,5,6
     7,8,9
     
    -In [48]: pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=0)
    -Out[48]: 
    +In [52]: pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=0)
    +Out[52]: 
        foo  bar  baz
     0    1    2    3
     1    4    5    6
     2    7    8    9
     
    -In [49]: pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=None)
    -Out[49]: 
    +In [53]: pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=None)
    +Out[53]: 
       foo bar baz
     0   a   b   c
     1   1   2   3
    @@ -916,15 +924,15 @@
     

    If the header is in a row other than the first, pass the row number to header. This will skip the preceding rows:

    -
    In [50]: data = ('skip this skip it\n'
    +
    In [54]: data = ('skip this skip it\n'
        ....:         'a,b,c\n'
        ....:         '1,2,3\n'
        ....:         '4,5,6\n'
        ....:         '7,8,9')
        ....: 
     
    -In [51]: pd.read_csv(StringIO(data), header=1)
    -Out[51]: 
    +In [55]: pd.read_csv(StringIO(data), header=1)
    +Out[55]: 
        a  b  c
     0  1  2  3
     1  4  5  6
    @@ -945,13 +953,13 @@
     

    Duplicate names parsing

    If the file or header contains duplicate names, pandas will by default distinguish between them so as to prevent overwriting data:

    -
    In [52]: data = ('a,b,a\n'
    +
    In [56]: data = ('a,b,a\n'
        ....:         '0,1,2\n'
        ....:         '3,4,5')
        ....: 
     
    -In [53]: pd.read_csv(StringIO(data))
    -Out[53]: 
    +In [57]: pd.read_csv(StringIO(data))
    +Out[57]: 
        a  b  a.1
     0  0  1    2
     1  3  4    5
    @@ -984,31 +992,31 @@
     

    New in version 0.20.0: support for callable usecols arguments

    -
    In [54]: data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
    +
    In [58]: data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
     
    -In [55]: pd.read_csv(StringIO(data))
    -Out[55]: 
    +In [59]: pd.read_csv(StringIO(data))
    +Out[59]: 
        a  b  c    d
     0  1  2  3  foo
     1  4  5  6  bar
     2  7  8  9  baz
     
    -In [56]: pd.read_csv(StringIO(data), usecols=['b', 'd'])
    -Out[56]: 
    +In [60]: pd.read_csv(StringIO(data), usecols=['b', 'd'])
    +Out[60]: 
        b    d
     0  2  foo
     1  5  bar
     2  8  baz
     
    -In [57]: pd.read_csv(StringIO(data), usecols=[0, 2, 3])
    -Out[57]: 
    +In [61]: pd.read_csv(StringIO(data), usecols=[0, 2, 3])
    +Out[61]: 
        a  c    d
     0  1  3  foo
     1  4  6  bar
     2  7  9  baz
     
    -In [58]: pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C'])
    -Out[58]: 
    +In [62]: pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C'])
    +Out[62]: 
        a  c
     0  1  3
     1  4  6
    @@ -1017,8 +1025,8 @@
     

    The usecols argument can also be used to specify which columns not to use in the final result:

    -
    In [59]: pd.read_csv(StringIO(data), usecols=lambda x: x not in ['a', 'c'])
    -Out[59]: 
    +
    In [63]: pd.read_csv(StringIO(data), usecols=lambda x: x not in ['a', 'c'])
    +Out[63]: 
        b    d
     0  2  foo
     1  5  bar
    @@ -1035,7 +1043,7 @@
     

    Ignoring line comments and empty lines

    If the comment parameter is specified, then completely commented lines will be ignored. By default, completely blank lines will be ignored as well.

    -
    In [60]: data = ('\n'
    +
    In [64]: data = ('\n'
        ....:         'a,b,c\n'
        ....:         '  \n'
        ....:         '# commented line\n'
    @@ -1044,7 +1052,7 @@
        ....:         '4,5,6')
        ....: 
     
    -In [61]: print(data)
    +In [65]: print(data)
     
     a,b,c
       
    @@ -1053,15 +1061,15 @@
     
     4,5,6
     
    -In [62]: pd.read_csv(StringIO(data), comment='#')
    -Out[62]: 
    +In [66]: pd.read_csv(StringIO(data), comment='#')
    +Out[66]: 
        a  b  c
     0  1  2  3
     1  4  5  6
     

    If skip_blank_lines=False, then read_csv will not ignore blank lines:

    -
    In [63]: data = ('a,b,c\n'
    +
    In [67]: data = ('a,b,c\n'
        ....:         '\n'
        ....:         '1,2,3\n'
        ....:         '\n'
    @@ -1069,8 +1077,8 @@
        ....:         '4,5,6')
        ....: 
     
    -In [64]: pd.read_csv(StringIO(data), skip_blank_lines=False)
    -Out[64]: 
    +In [68]: pd.read_csv(StringIO(data), skip_blank_lines=False)
    +Out[68]: 
          a    b    c
     0  NaN  NaN  NaN
     1  1.0  2.0  3.0
    @@ -1084,25 +1092,25 @@
     

    The presence of ignored lines might create ambiguities involving line numbers; the parameter header uses row numbers (ignoring commented/empty lines), while skiprows uses line numbers (including commented/empty lines):

    -
    In [65]: data = ('#comment\n'
    +
    In [69]: data = ('#comment\n'
        ....:         'a,b,c\n'
        ....:         'A,B,C\n'
        ....:         '1,2,3')
        ....: 
     
    -In [66]: pd.read_csv(StringIO(data), comment='#', header=1)
    -Out[66]: 
    +In [70]: pd.read_csv(StringIO(data), comment='#', header=1)
    +Out[70]: 
        A  B  C
     0  1  2  3
     
    -In [67]: data = ('A,B,C\n'
    +In [71]: data = ('A,B,C\n'
        ....:         '#comment\n'
        ....:         'a,b,c\n'
        ....:         '1,2,3')
        ....: 
     
    -In [68]: pd.read_csv(StringIO(data), comment='#', skiprows=2)
    -Out[68]: 
    +In [72]: pd.read_csv(StringIO(data), comment='#', skiprows=2)
    +Out[72]: 
        a  b  c
     0  1  2  3
     
    @@ -1110,7 +1118,7 @@

    If both header and skiprows are specified, header will be relative to the end of skiprows. For example:

    -
    In [69]: data = ('# empty\n'
    +
    In [73]: data = ('# empty\n'
        ....:         '# second empty line\n'
        ....:         '# third emptyline\n'
        ....:         'X,Y,Z\n'
    @@ -1120,7 +1128,7 @@
        ....:         '5.,NaN,10.0\n')
        ....: 
     
    -In [70]: print(data)
    +In [74]: print(data)
     # empty
     # second empty line
     # third emptyline
    @@ -1131,8 +1139,8 @@
     5.,NaN,10.0
     
     
    -In [71]: pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1)
    -Out[71]: 
    +In [75]: pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1)
    +Out[75]: 
          A    B     C
     0  1.0  2.0   4.0
     1  5.0  NaN  10.0
    @@ -1142,7 +1150,7 @@
     

    Comments

    Sometimes comments or metadata may be included in a file:

    -
    In [72]: print(open('tmp.csv').read())
    +
    In [76]: print(open('tmp.csv').read())
     ID,level,category
     Patient1,123000,x # really unpleasant
     Patient2,23000,y # wouldn't take his medicine
    @@ -1150,10 +1158,10 @@
     

    By default, the parser includes the comments in the output:

    -
    In [73]: df = pd.read_csv('tmp.csv')
    +
    In [77]: df = pd.read_csv('tmp.csv')
     
    -In [74]: df
    -Out[74]: 
    +In [78]: df
    +Out[78]: 
              ID    level                        category
     0  Patient1   123000           x # really unpleasant
     1  Patient2    23000  y # wouldn't take his medicine
    @@ -1161,10 +1169,10 @@
     

    We can suppress the comments using the comment keyword:

    -
    In [75]: df = pd.read_csv('tmp.csv', comment='#')
    +
    In [79]: df = pd.read_csv('tmp.csv', comment='#')
     
    -In [76]: df
    -Out[76]: 
    +In [80]: df
    +Out[80]: 
              ID    level category
     0  Patient1   123000       x 
     1  Patient2    23000       y 
    @@ -1177,23 +1185,25 @@
     

    Dealing with Unicode data

    The encoding argument should be used for encoded unicode data, which will result in byte strings being decoded to unicode in the result:

    -
    In [77]: data = (b'word,length\n'
    +
    In [81]: from io import BytesIO
    +
    +In [82]: data = (b'word,length\n'
        ....:         b'Tr\xc3\xa4umen,7\n'
        ....:         b'Gr\xc3\xbc\xc3\x9fe,5')
        ....: 
     
    -In [78]: data = data.decode('utf8').encode('latin-1')
    +In [83]: data = data.decode('utf8').encode('latin-1')
     
    -In [79]: df = pd.read_csv(BytesIO(data), encoding='latin-1')
    +In [84]: df = pd.read_csv(BytesIO(data), encoding='latin-1')
     
    -In [80]: df
    -Out[80]: 
    +In [85]: df
    +Out[85]: 
           word  length
     0  Träumen       7
     1    Grüße       5
     
    -In [81]: df['word'][1]
    -Out[81]: 'Grüße'
    +In [86]: df['word'][1]
    +Out[86]: 'Grüße'
     

    Some formats which encode all characters as multiple bytes, like UTF-16, won’t @@ -1204,25 +1214,25 @@
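    One way to handle such multi-byte data, sketched here with illustrative values, is to decode the bytes yourself and hand read_csv a text buffer:

from io import StringIO
import pandas as pd

raw = 'word,length\nTräumen,7\nGrüße,5'.encode('utf-16')

# Decoding explicitly side-steps any parser limitations around multi-byte
# encodings; read_csv then only ever sees text.
df = pd.read_csv(StringIO(raw.decode('utf-16')))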

    Index columns and trailing delimiters

    If a file has one more column of data than the number of column names, the first column will be used as the DataFrame’s row names:

    -
    In [82]: data = ('a,b,c\n'
    +
    In [87]: data = ('a,b,c\n'
        ....:         '4,apple,bat,5.7\n'
        ....:         '8,orange,cow,10')
        ....: 
     
    -In [83]: pd.read_csv(StringIO(data))
    -Out[83]: 
    +In [88]: pd.read_csv(StringIO(data))
    +Out[88]: 
             a    b     c
     4   apple  bat   5.7
     8  orange  cow  10.0
     
    -
    In [84]: data = ('index,a,b,c\n'
    +
    In [89]: data = ('index,a,b,c\n'
        ....:         '4,apple,bat,5.7\n'
        ....:         '8,orange,cow,10')
        ....: 
     
    -In [85]: pd.read_csv(StringIO(data), index_col=0)
    -Out[85]: 
    +In [90]: pd.read_csv(StringIO(data), index_col=0)
    +Out[90]: 
                 a    b     c
     index                   
     4       apple  bat   5.7
    @@ -1233,24 +1243,24 @@
     

    There are some exception cases when a file has been prepared with delimiters at the end of each data line, confusing the parser. To explicitly disable the index column inference and discard the last column, pass index_col=False:

    -
    In [86]: data = ('a,b,c\n'
    +
    In [91]: data = ('a,b,c\n'
        ....:         '4,apple,bat,\n'
        ....:         '8,orange,cow,')
        ....: 
     
    -In [87]: print(data)
    +In [92]: print(data)
     a,b,c
     4,apple,bat,
     8,orange,cow,
     
    -In [88]: pd.read_csv(StringIO(data))
    -Out[88]: 
    +In [93]: pd.read_csv(StringIO(data))
    +Out[93]: 
             a    b   c
     4   apple  bat NaN
     8  orange  cow NaN
     
    -In [89]: pd.read_csv(StringIO(data), index_col=False)
    -Out[89]: 
    +In [94]: pd.read_csv(StringIO(data), index_col=False)
    +Out[94]: 
        a       b    c
     0  4   apple  bat
     1  8  orange  cow
    @@ -1258,24 +1268,24 @@
     

    If a subset of data is being parsed using the usecols option, the index_col specification is based on that subset, not the original data.

    -
    In [90]: data = ('a,b,c\n'
    +
    In [95]: data = ('a,b,c\n'
        ....:         '4,apple,bat,\n'
        ....:         '8,orange,cow,')
        ....: 
     
    -In [91]: print(data)
    +In [96]: print(data)
     a,b,c
     4,apple,bat,
     8,orange,cow,
     
    -In [92]: pd.read_csv(StringIO(data), usecols=['b', 'c'])
    -Out[92]: 
    +In [97]: pd.read_csv(StringIO(data), usecols=['b', 'c'])
    +Out[97]: 
          b   c
     4  bat NaN
     8  cow NaN
     
    -In [93]: pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0)
    -Out[93]: 
    +In [98]: pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0)
    +Out[98]: 
          b   c
     4  bat NaN
     8  cow NaN
    @@ -1292,10 +1302,10 @@
     input text data into datetime objects.

    The simplest case is to just pass in parse_dates=True:

    # Use a column as an index, and parse it as dates.
    -In [94]: df = pd.read_csv('foo.csv', index_col=0, parse_dates=True)
    +In [99]: df = pd.read_csv('foo.csv', index_col=0, parse_dates=True)
     
    -In [95]: df
    -Out[95]: 
    +In [100]: df
    +Out[100]: 
                 A  B  C
     date               
     2009-01-01  a  1  2
    @@ -1303,8 +1313,8 @@
     2009-01-03  c  4  5
     
     # These are Python datetime objects
    -In [96]: df.index
    -Out[96]: DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', name='date', freq=None)
    +In [101]: df.index
    +Out[101]: DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', name='date', freq=None)
     

    It is often the case that we may want to store date and time data separately, @@ -1314,7 +1324,7 @@ columns will be prepended to the output (so as to not affect the existing column order) and the new column names will be the concatenation of the component column names:

    -
    In [97]: print(open('tmp.csv').read())
    +
    In [102]: print(open('tmp.csv').read())
     KORD,19990127, 19:00:00, 18:56:00, 0.8100
     KORD,19990127, 20:00:00, 19:56:00, 0.0100
     KORD,19990127, 21:00:00, 20:56:00, -0.5900
    @@ -1322,10 +1332,10 @@
     KORD,19990127, 22:00:00, 21:56:00, -0.5900
     KORD,19990127, 23:00:00, 22:56:00, -0.5900
     
    -In [98]: df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]])
    +In [103]: df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]])
     
    -In [99]: df
    -Out[99]: 
    +In [104]: df
    +Out[104]: 
                       1_2                 1_3     0     4
     0 1999-01-27 19:00:00 1999-01-27 18:56:00  KORD  0.81
     1 1999-01-27 20:00:00 1999-01-27 19:56:00  KORD  0.01
    @@ -1337,12 +1347,12 @@
     

    By default the parser removes the component date columns, but you can choose to retain them via the keep_date_col keyword:

    -
    In [100]: df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]],
    +
    In [105]: df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]],
        .....:                  keep_date_col=True)
        .....: 
     
    -In [101]: df
    -Out[101]: 
    +In [106]: df
    +Out[106]: 
                       1_2                 1_3     0         1          2          3     4
     0 1999-01-27 19:00:00 1999-01-27 18:56:00  KORD  19990127   19:00:00   18:56:00  0.81
     1 1999-01-27 20:00:00 1999-01-27 19:56:00  KORD  19990127   20:00:00   19:56:00  0.01
    @@ -1358,12 +1368,12 @@
     while parse_dates=[[1, 2]] means the two columns should be parsed into a
     single column.

    You can also use a dict to specify custom names for the combined date columns:

    -
    In [102]: date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
    +
    In [107]: date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
     
    -In [103]: df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec)
    +In [108]: df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec)
     
    -In [104]: df
    -Out[104]: 
    +In [109]: df
    +Out[109]: 
                   nominal              actual     0     4
     0 1999-01-27 19:00:00 1999-01-27 18:56:00  KORD  0.81
     1 1999-01-27 20:00:00 1999-01-27 19:56:00  KORD  0.01
    @@ -1377,14 +1387,14 @@
     a single date column, then a new column is prepended to the data. The index_col
     specification is based on this new set of columns rather than the original
     data columns:

    -
    In [105]: date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
    +
    In [110]: date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
     
    -In [106]: df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec,
    +In [111]: df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec,
        .....:                  index_col=0)  # index is the nominal column
        .....: 
     
    -In [107]: df
    -Out[107]: 
    +In [112]: df
    +Out[112]: 
                                      actual     0     4
     nominal                                            
     1999-01-27 19:00:00 1999-01-27 18:56:00  KORD  0.81
    @@ -1422,12 +1432,12 @@
     

    Date parsing functions

    Finally, the parser allows you to specify a custom date_parser function to take full advantage of the flexibility of the date parsing API:

    -
    In [108]: df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec,
    +
    In [113]: df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec,
        .....:                  date_parser=pd.io.date_converters.parse_date_time)
        .....: 
     
    -In [109]: df
    -Out[109]: 
    +In [114]: df
    +Out[114]: 
                   nominal              actual     0     4
     0 1999-01-27 19:00:00 1999-01-27 18:56:00  KORD  0.81
     1 1999-01-27 20:00:00 1999-01-27 19:56:00  KORD  0.01
    @@ -1472,16 +1482,16 @@
     

    Pandas cannot natively represent a column or index with mixed timezones. If your CSV file contains columns with a mixture of timezones, the default result will be an object-dtype column with strings, even with parse_dates.

    -
    In [110]: content = """\
    +
    In [115]: content = """\
        .....: a
        .....: 2000-01-01T00:00:00+05:00
        .....: 2000-01-01T00:00:00+06:00"""
        .....: 
     
    -In [111]: df = pd.read_csv(StringIO(content), parse_dates=['a'])
    +In [116]: df = pd.read_csv(StringIO(content), parse_dates=['a'])
     
    -In [112]: df['a']
    -Out[112]: 
    +In [117]: df['a']
    +Out[117]: 
     0    2000-01-01 00:00:00+05:00
     1    2000-01-01 00:00:00+06:00
     Name: a, dtype: object
    @@ -1489,12 +1499,12 @@
     

    To parse the mixed-timezone values as a datetime column, pass a partially-applied to_datetime() with utc=True as the date_parser.

    -
    In [113]: df = pd.read_csv(StringIO(content), parse_dates=['a'],
    +
    In [118]: df = pd.read_csv(StringIO(content), parse_dates=['a'],
        .....:                  date_parser=lambda col: pd.to_datetime(col, utc=True))
        .....: 
     
    -In [114]: df['a']
    -Out[114]: 
    +In [119]: df['a']
    +Out[119]: 
     0   1999-12-31 19:00:00+00:00
     1   1999-12-31 18:00:00+00:00
     Name: a, dtype: datetime64[ns, UTC]
    @@ -1526,12 +1536,12 @@
     dayfirst=True, it will guess “01/12/2011” to be December 1st. With
     dayfirst=False (default) it will guess “01/12/2011” to be January 12th.

    # Try to infer the format for the index column
    -In [115]: df = pd.read_csv('foo.csv', index_col=0, parse_dates=True,
    +In [120]: df = pd.read_csv('foo.csv', index_col=0, parse_dates=True,
        .....:                  infer_datetime_format=True)
        .....: 
     
    -In [116]: df
    -Out[116]: 
    +In [121]: df
    +Out[121]: 
                 A  B  C
     date               
     2009-01-01  a  1  2
    @@ -1544,21 +1554,21 @@
     

    International date formats

    While US date formats tend to be MM/DD/YYYY, many international formats use DD/MM/YYYY instead. For convenience, a dayfirst keyword is provided:

    -
    In [117]: print(open('tmp.csv').read())
    +
    In [122]: print(open('tmp.csv').read())
     date,value,cat
     1/6/2000,5,a
     2/6/2000,10,b
     3/6/2000,15,c
     
    -In [118]: pd.read_csv('tmp.csv', parse_dates=[0])
    -Out[118]: 
    +In [123]: pd.read_csv('tmp.csv', parse_dates=[0])
    +Out[123]: 
             date  value cat
     0 2000-01-06      5   a
     1 2000-02-06     10   b
     2 2000-03-06     15   c
     
    -In [119]: pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0])
    -Out[119]: 
    +In [124]: pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0])
    +Out[124]: 
             date  value cat
     0 2000-06-01      5   a
     1 2000-06-02     10   b
    @@ -1574,24 +1584,24 @@
     The options are the ordinary converter, the high-precision converter, and
     the round-trip converter (which is guaranteed to round-trip values after
     writing to a file). For example:

    -
    In [120]: val = '0.3066101993807095471566981359501369297504425048828125'
    +
    In [125]: val = '0.3066101993807095471566981359501369297504425048828125'
     
    -In [121]: data = 'a,b,c\n1,2,{0}'.format(val)
    +In [126]: data = 'a,b,c\n1,2,{0}'.format(val)
     
    -In [122]: abs(pd.read_csv(StringIO(data), engine='c',
    +In [127]: abs(pd.read_csv(StringIO(data), engine='c',
        .....:                 float_precision=None)['c'][0] - float(val))
        .....: 
    -Out[122]: 1.1102230246251565e-16
    +Out[127]: 1.1102230246251565e-16
     
    -In [123]: abs(pd.read_csv(StringIO(data), engine='c',
    +In [128]: abs(pd.read_csv(StringIO(data), engine='c',
        .....:                 float_precision='high')['c'][0] - float(val))
        .....: 
    -Out[123]: 5.551115123125783e-17
    +Out[128]: 5.551115123125783e-17
     
    -In [124]: abs(pd.read_csv(StringIO(data), engine='c',
    +In [129]: abs(pd.read_csv(StringIO(data), engine='c',
        .....:                 float_precision='round_trip')['c'][0] - float(val))
        .....: 
    -Out[124]: 0.0
    +Out[129]: 0.0
     
    @@ -1601,43 +1611,43 @@ set the thousands keyword to a string of length 1 so that integers will be parsed correctly:

    By default, numbers with a thousands separator will be parsed as strings:

    -
    In [125]: print(open('tmp.csv').read())
    +
    In [130]: print(open('tmp.csv').read())
     ID|level|category
     Patient1|123,000|x
     Patient2|23,000|y
     Patient3|1,234,018|z
     
    -In [126]: df = pd.read_csv('tmp.csv', sep='|')
    +In [131]: df = pd.read_csv('tmp.csv', sep='|')
     
    -In [127]: df
    -Out[127]: 
    +In [132]: df
    +Out[132]: 
              ID      level category
     0  Patient1    123,000        x
     1  Patient2     23,000        y
     2  Patient3  1,234,018        z
     
    -In [128]: df.level.dtype
    -Out[128]: dtype('O')
    +In [133]: df.level.dtype
    +Out[133]: dtype('O')
     

    The thousands keyword allows integers to be parsed correctly:

    -
    In [129]: print(open('tmp.csv').read())
    +
    In [134]: print(open('tmp.csv').read())
     ID|level|category
     Patient1|123,000|x
     Patient2|23,000|y
     Patient3|1,234,018|z
     
    -In [130]: df = pd.read_csv('tmp.csv', sep='|', thousands=',')
    +In [135]: df = pd.read_csv('tmp.csv', sep='|', thousands=',')
     
    -In [131]: df
    -Out[131]: 
    +In [136]: df
    +Out[136]: 
              ID    level category
     0  Patient1   123000        x
     1  Patient2    23000        y
     2  Patient3  1234018        z
     
    -In [132]: df.level.dtype
    -Out[132]: dtype('int64')
    +In [137]: df.level.dtype
    +Out[137]: dtype('int64')
     
    @@ -1682,23 +1692,23 @@

    Returning Series

    Using the squeeze keyword, the parser will return output with a single column as a Series:

    -
    In [133]: print(open('tmp.csv').read())
    +
    In [138]: print(open('tmp.csv').read())
     level
     Patient1,123000
     Patient2,23000
     Patient3,1234018
     
    -In [134]: output = pd.read_csv('tmp.csv', squeeze=True)
    +In [139]: output = pd.read_csv('tmp.csv', squeeze=True)
     
    -In [135]: output
    -Out[135]: 
    +In [140]: output
    +Out[140]: 
     Patient1     123000
     Patient2      23000
     Patient3    1234018
     Name: level, dtype: int64
     
    -In [136]: type(output)
    -Out[136]: pandas.core.series.Series
    +In [141]: type(output)
    +Out[141]: pandas.core.series.Series
     
    @@ -1708,24 +1718,24 @@ recognized as boolean. Occasionally you might want to recognize other values as being boolean. To do this, use the true_values and false_values options as follows:

    -
    In [137]: data = ('a,b,c\n'
    +
    In [142]: data = ('a,b,c\n'
        .....:         '1,Yes,2\n'
        .....:         '3,No,4')
        .....: 
     
    -In [138]: print(data)
    +In [143]: print(data)
     a,b,c
     1,Yes,2
     3,No,4
     
    -In [139]: pd.read_csv(StringIO(data))
    -Out[139]: 
    +In [144]: pd.read_csv(StringIO(data))
    +Out[144]: 
        a    b  c
     0  1  Yes  2
     1  3   No  4
     
    -In [140]: pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No'])
    -Out[140]: 
    +In [145]: pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No'])
    +Out[145]: 
        a      b  c
     0  1   True  2
     1  3  False  4
    @@ -1737,16 +1747,16 @@
     

    Some files may have malformed lines with too few fields or too many. Lines with too few fields will have NA values filled in the trailing fields. Lines with too many fields will raise an error by default:

    -
    In [141]: data = ('a,b,c\n'
    +
    In [146]: data = ('a,b,c\n'
        .....:         '1,2,3\n'
        .....:         '4,5,6,7\n'
        .....:         '8,9,10')
        .....: 
     
    -In [142]: pd.read_csv(StringIO(data))
    +In [147]: pd.read_csv(StringIO(data))
     ---------------------------------------------------------------------------
     ParserError                               Traceback (most recent call last)
    -<ipython-input-142-6388c394e6b8> in <module>
    +<ipython-input-147-6388c394e6b8> in <module>
     ----> 1 pd.read_csv(StringIO(data))
     
     ~/Documents/GitHub/pandas-tanyaacjain/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
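    If you would rather drop such lines than raise, a minimal sketch (error_bad_lines and warn_bad_lines are among the keywords listed in the signature above):

# Skip the offending lines instead of raising; each dropped line is
# reported as a warning because warn_bad_lines defaults to True.
pd.read_csv(StringIO(data), error_bad_lines=False)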
    @@ -1818,7 +1828,7 @@
     By default it uses the Excel dialect but you can specify either the dialect name
     or a csv.Dialect instance.

    Suppose you had data with unenclosed quotes:

    -
    In [143]: print(data)
    +
    In [148]: print(data)
     label1,label2,label3
     index1,"a,c,e
     index2,b,d,f
    @@ -1828,24 +1838,24 @@
     the quote character, which causes it to fail when it finds a newline before it
     finds the closing double quote.

    We can get around this using dialect:

    -
    In [144]: import csv
    +
    In [149]: import csv
     
    -In [145]: dia = csv.excel()
    +In [150]: dia = csv.excel()
     
    -In [146]: dia.quoting = csv.QUOTE_NONE
    +In [151]: dia.quoting = csv.QUOTE_NONE
     
    -In [147]: pd.read_csv(StringIO(data), dialect=dia)
    -Out[147]: 
    +In [152]: pd.read_csv(StringIO(data), dialect=dia)
    +Out[152]: 
            label1 label2 label3
     index1     "a      c      e
     index2      b      d      f
     

    All of the dialect options can be specified separately by keyword arguments:

    -
    In [148]: data = 'a,b,c~1,2,3~4,5,6'
    +
    In [153]: data = 'a,b,c~1,2,3~4,5,6'
     
    -In [149]: pd.read_csv(StringIO(data), lineterminator='~')
    -Out[149]: 
    +In [154]: pd.read_csv(StringIO(data), lineterminator='~')
    +Out[154]: 
        a  b  c
     0  1  2  3
     1  4  5  6
    @@ -1853,15 +1863,15 @@
     

    Another common dialect option is skipinitialspace, to skip any whitespace after a delimiter:

    -
    In [150]: data = 'a, b, c\n1, 2, 3\n4, 5, 6'
    +
    In [155]: data = 'a, b, c\n1, 2, 3\n4, 5, 6'
     
    -In [151]: print(data)
    +In [156]: print(data)
     a, b, c
     1, 2, 3
     4, 5, 6
     
    -In [152]: pd.read_csv(StringIO(data), skipinitialspace=True)
    -Out[152]: 
    +In [157]: pd.read_csv(StringIO(data), skipinitialspace=True)
    +Out[157]: 
        a  b  c
     0  1  2  3
     1  4  5  6
    @@ -1877,14 +1887,14 @@
     

    Quotes (and other escape characters) in embedded fields can be handled in any number of ways. One way is to use backslashes; to properly parse this data, you should pass the escapechar option:

    -
    In [153]: data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'
    +
    In [158]: data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'
     
    -In [154]: print(data)
    +In [159]: print(data)
     a,b
     "hello, \"Bob\", nice to see you",5
     
    -In [155]: pd.read_csv(StringIO(data), escapechar='\\')
    -Out[155]: 
    +In [160]: pd.read_csv(StringIO(data), escapechar='\\')
    +Out[160]: 
                                    a  b
     0  hello, "Bob", nice to see you  5
     
    @@ -1909,7 +1919,7 @@ if it is not spaces (e.g., ‘~’).

    Consider a typical fixed-width data file:

    -
    In [156]: print(open('bar.csv').read())
    +
    In [161]: print(open('bar.csv').read())
     id8141    360.242940   149.910199   11950.7
     id1594    444.953632   166.985655   11788.4
     id1849    364.136849   183.628767   11806.2
    @@ -1920,12 +1930,12 @@
     

    In order to parse this file into a DataFrame, we simply need to supply the column specifications to the read_fwf function along with the file name:

    # Column specifications are a list of half-intervals
    -In [157]: colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]
    +In [162]: colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]
     
    -In [158]: df = pd.read_fwf('bar.csv', colspecs=colspecs, header=None, index_col=0)
    +In [163]: df = pd.read_fwf('bar.csv', colspecs=colspecs, header=None, index_col=0)
     
    -In [159]: df
    -Out[159]: 
    +In [164]: df
    +Out[164]: 
                      1           2        3
     0                                      
     id8141  360.242940  149.910199  11950.7
    @@ -1939,12 +1949,12 @@
     header=None argument is specified. Alternatively, you can supply just the
     column widths for contiguous columns:

    # Widths are a list of integers
    -In [160]: widths = [6, 14, 13, 10]
    +In [165]: widths = [6, 14, 13, 10]
     
    -In [161]: df = pd.read_fwf('bar.csv', widths=widths, header=None)
    +In [166]: df = pd.read_fwf('bar.csv', widths=widths, header=None)
     
    -In [162]: df
    -Out[162]: 
    +In [167]: df
    +Out[167]: 
             0           1           2        3
     0  id8141  360.242940  149.910199  11950.7
     1  id1594  444.953632  166.985655  11788.4
    @@ -1959,10 +1969,10 @@
     first 100 rows of the file. It can do so only when the columns are
     aligned and correctly separated by the provided delimiter (the default
     delimiter is whitespace).

    -
    In [163]: df = pd.read_fwf('bar.csv', header=None, index_col=0)
    +
    In [168]: df = pd.read_fwf('bar.csv', header=None, index_col=0)
     
    -In [164]: df
    -Out[164]: 
    +In [169]: df
    +Out[169]: 
                      1           2        3
     0                                      
     id8141  360.242940  149.910199  11950.7
    @@ -1977,15 +1987,15 @@
     

    read_fwf supports the dtype parameter for specifying the types of parsed columns to be different from the inferred type.

    -
    In [165]: pd.read_fwf('bar.csv', header=None, index_col=0).dtypes
    -Out[165]: 
    +
    In [170]: pd.read_fwf('bar.csv', header=None, index_col=0).dtypes
    +Out[170]: 
     1    float64
     2    float64
     3    float64
     dtype: object
     
    -In [166]: pd.read_fwf('bar.csv', header=None, dtype={2: 'object'}).dtypes
    -Out[166]: 
    +In [171]: pd.read_fwf('bar.csv', header=None, dtype={2: 'object'}).dtypes
    +Out[171]: 
     0     object
     1    float64
     2     object
    @@ -2000,7 +2010,7 @@
     

    Files with an “implicit” index column

    Consider a file with one less entry in the header than the number of data columns:

    -
    In [167]: print(open('foo.csv').read())
    +
    In [172]: print(open('foo.csv').read())
     A,B,C
     20090101,a,1,2
     20090102,b,3,4
    @@ -2009,8 +2019,8 @@
     

    In this special case, read_csv assumes that the first column is to be used as the index of the DataFrame:

    -
    In [168]: pd.read_csv('foo.csv')
    -Out[168]: 
    +
    In [173]: pd.read_csv('foo.csv')
    +Out[173]: 
               A  B  C
     20090101  a  1  2
     20090102  b  3  4
    @@ -2019,17 +2029,17 @@
     

    Note that the dates weren’t automatically parsed. In that case you would need to do as before:

    -
    In [169]: df = pd.read_csv('foo.csv', parse_dates=True)
    +
    In [174]: df = pd.read_csv('foo.csv', parse_dates=True)
     
    -In [170]: df.index
    -Out[170]: DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', freq=None)
    +In [175]: df.index
    +Out[175]: DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', freq=None)
     

    Reading an index with a MultiIndex

    Suppose you have data indexed by two columns:

    -
    In [171]: print(open('data/mindex_ex.csv').read())
    +
    In [176]: print(open('data/mindex_ex.csv').read())
     year,indiv,zit,xit
     1977,"A",1.2,.6
     1977,"B",1.5,.5
    @@ -2051,10 +2061,10 @@
     

    The index_col argument to read_csv can take a list of column numbers to turn multiple columns into a MultiIndex for the index of the returned object:

    -
    In [172]: df = pd.read_csv("data/mindex_ex.csv", index_col=[0, 1])
    +
    In [177]: df = pd.read_csv("data/mindex_ex.csv", index_col=[0, 1])
     
    -In [173]: df
    -Out[173]: 
    +In [178]: df
    +Out[178]: 
                  zit   xit
     year indiv            
     1977 A      1.20  0.60
    @@ -2073,8 +2083,8 @@
          H      5.40  2.70
          I      6.40  1.20
     
    -In [174]: df.loc[1978]
    -Out[174]: 
    +In [179]: df.loc[1978]
    +Out[179]: 
            zit   xit
     indiv           
     A      0.2  0.06
    @@ -2090,13 +2100,13 @@
     

    By specifying a list of row locations for the header argument, you can read in a MultiIndex for the columns. Specifying non-consecutive rows will skip the intervening rows.

    -
    In [175]: from pandas.util.testing import makeCustomDataframe as mkdf
    +
    In [180]: from pandas.util.testing import makeCustomDataframe as mkdf
     
    -In [176]: df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
    +In [181]: df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
     
    -In [177]: df.to_csv('mi.csv')
    +In [182]: df.to_csv('mi.csv')
     
    -In [178]: print(open('mi.csv').read())
    +In [183]: print(open('mi.csv').read())
     C0,,C_l0_g0,C_l0_g1,C_l0_g2
     C1,,C_l1_g0,C_l1_g1,C_l1_g2
     C2,,C_l2_g0,C_l2_g1,C_l2_g2
    @@ -2109,8 +2119,8 @@
     R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
     
     
    -In [179]: pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1])
    -Out[179]: 
    +In [184]: pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1])
    +Out[184]: 
     C0              C_l0_g0 C_l0_g1 C_l0_g2
     C1              C_l1_g0 C_l1_g1 C_l1_g2
     C2              C_l2_g0 C_l2_g1 C_l2_g2
    @@ -2125,14 +2135,14 @@
     

    read_csv is also able to interpret a more common format of multi-column indices.

    -
    In [180]: print(open('mi2.csv').read())
    +
    In [185]: print(open('mi2.csv').read())
     ,a,a,a,b,c,c
     ,q,r,s,t,u,v
     one,1,2,3,4,5,6
     two,7,8,9,10,11,12
     
    -In [181]: pd.read_csv('mi2.csv', header=[0, 1], index_col=0)
    -Out[181]: 
    +In [186]: pd.read_csv('mi2.csv', header=[0, 1], index_col=0)
    +Out[186]: 
          a         b   c    
          q  r  s   t   u   v
     one  1  2  3   4   5   6
    @@ -2148,7 +2158,7 @@
     

    read_csv is capable of inferring delimited (not necessarily comma-separated) files, as pandas uses the csv.Sniffer class of the csv module. For this, you have to specify sep=None.

    -
    In [182]: print(open('tmp2.sv').read())
    +
    In [187]: print(open('tmp2.sv').read())
     :0:1:2:3
     0:0.4691122999071863:-0.2828633443286633:-1.5090585031735124:-1.1356323710171934
     1:1.2121120250208506:-0.17321464905330858:0.11920871129693428:-1.0442359662799567
    @@ -2162,8 +2172,8 @@
     9:0.35702056413309086:-0.6746001037299882:-1.776903716971867:-0.9689138124473498
     
     
    -In [183]: pd.read_csv('tmp2.sv', sep=None, engine='python')
    -Out[183]: 
    +In [188]: pd.read_csv('tmp2.sv', sep=None, engine='python')
    +Out[188]: 
        Unnamed: 0         0         1         2         3
     0           0  0.469112 -0.282863 -1.509059 -1.135632
     1           1  1.212112 -0.173215  0.119209 -1.044236
    @@ -2187,7 +2197,7 @@
     

    Iterating through files chunk by chunk

    Suppose you wish to iterate through a (potentially very large) file lazily rather than reading the entire file into memory, such as the following:

    -
    In [184]: print(open('tmp.sv').read())
    +
    In [189]: print(open('tmp.sv').read())
     |0|1|2|3
     0|0.4691122999071863|-0.2828633443286633|-1.5090585031735124|-1.1356323710171934
     1|1.2121120250208506|-0.17321464905330858|0.11920871129693428|-1.0442359662799567
    @@ -2201,10 +2211,10 @@
     9|0.35702056413309086|-0.6746001037299882|-1.776903716971867|-0.9689138124473498
     
     
    -In [185]: table = pd.read_csv('tmp.sv', sep='|')
    +In [190]: table = pd.read_csv('tmp.sv', sep='|')
     
    -In [186]: table
    -Out[186]: 
    +In [191]: table
    +Out[191]: 
        Unnamed: 0         0         1         2         3
     0           0  0.469112 -0.282863 -1.509059 -1.135632
     1           1  1.212112 -0.173215  0.119209 -1.044236
    @@ -2220,12 +2230,12 @@
     

    By specifying a chunksize to read_csv, the return value will be an iterable object of type TextFileReader:

    -
    In [187]: reader = pd.read_csv('tmp.sv', sep='|', chunksize=4)
    +
    In [192]: reader = pd.read_csv('tmp.sv', sep='|', chunksize=4)
     
    -In [188]: reader
    -Out[188]: <pandas.io.parsers.TextFileReader at 0x116e53ad0>
    +In [193]: reader
    +Out[193]: <pandas.io.parsers.TextFileReader at 0x123034f10>
     
    -In [189]: for chunk in reader:
    +In [194]: for chunk in reader:
        .....:     print(chunk)
        .....: 
        Unnamed: 0         0         1         2         3
    @@ -2244,10 +2254,10 @@
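    A common pattern when iterating, sketched here against the same tmp.sv file (the filter on column '0' is purely illustrative), is to reduce each chunk and concatenate the results:

import pandas as pd

reader = pd.read_csv('tmp.sv', sep='|', chunksize=4)

# Keep only the rows each chunk contributes, then stitch the pieces together.
filtered = pd.concat(chunk[chunk['0'] > 0] for chunk in reader)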
     

    Specifying iterator=True will also return the TextFileReader object:

    -
    In [190]: reader = pd.read_csv('tmp.sv', sep='|', iterator=True)
    +
    In [195]: reader = pd.read_csv('tmp.sv', sep='|', iterator=True)
     
    -In [191]: reader.get_chunk(5)
    -Out[191]: 
    +In [196]: reader.get_chunk(5)
    +Out[196]: 
        Unnamed: 0         0         1         2         3
     0           0  0.469112 -0.282863 -1.509059 -1.135632
     1           1  1.212112 -0.173215  0.119209 -1.044236
    @@ -2411,33 +2421,33 @@
     
     
     

    Note that NaN, NaT and None will be converted to null, and datetime objects will be converted based on the date_format and date_unit parameters.
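    For example, a minimal sketch with an illustrative frame; both flavors of missing value serialize as null:

import numpy as np
import pandas as pd

# None and NaN both come out as null in the resulting JSON string.
pd.DataFrame({'A': [1.0, None, np.nan]}).to_json()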

    -
    In [192]: dfj = pd.DataFrame(np.random.randn(5, 2), columns=list('AB'))
    +
    In [197]: dfj = pd.DataFrame(np.random.randn(5, 2), columns=list('AB'))
     
    -In [193]: json = dfj.to_json()
    +In [198]: json = dfj.to_json()
     
    -In [194]: json
    -Out[194]: '{"A":{"0":-1.2945235903,"1":0.2766617129,"2":-0.0139597524,"3":-0.0061535699,"4":0.8957173022},"B":{"0":0.4137381054,"1":-0.472034511,"2":-0.3625429925,"3":-0.923060654,"4":0.8052440254}}'
    +In [199]: json
    +Out[199]: '{"A":{"0":-1.2945235903,"1":0.2766617129,"2":-0.0139597524,"3":-0.0061535699,"4":0.8957173022},"B":{"0":0.4137381054,"1":-0.472034511,"2":-0.3625429925,"3":-0.923060654,"4":0.8052440254}}'
     

    Orient options

    There are a number of different options for the format of the resulting JSON file / string. Consider the following DataFrame and Series:

    -
    In [195]: dfjo = pd.DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)),
    +
    In [200]: dfjo = pd.DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)),
        .....:                     columns=list('ABC'), index=list('xyz'))
        .....: 
     
    -In [196]: dfjo
    -Out[196]: 
    +In [201]: dfjo
    +Out[201]: 
        A  B  C
     x  1  4  7
     y  2  5  8
     z  3  6  9
     
    -In [197]: sjo = pd.Series(dict(x=15, y=16, z=17), name='D')
    +In [202]: sjo = pd.Series(dict(x=15, y=16, z=17), name='D')
     
    -In [198]: sjo
    -Out[198]: 
    +In [203]: sjo
    +Out[203]: 
     x    15
     y    16
     z    17
    @@ -2446,46 +2456,46 @@
     

    Column oriented (the default for DataFrame) serializes the data as nested JSON objects with column labels acting as the primary index:

    -
    In [199]: dfjo.to_json(orient="columns")
    -Out[199]: '{"A":{"x":1,"y":2,"z":3},"B":{"x":4,"y":5,"z":6},"C":{"x":7,"y":8,"z":9}}'
    +
    In [204]: dfjo.to_json(orient="columns")
    +Out[204]: '{"A":{"x":1,"y":2,"z":3},"B":{"x":4,"y":5,"z":6},"C":{"x":7,"y":8,"z":9}}'
     
     # Not available for Series
     

    Index oriented (the default for Series) is similar to column oriented but the index labels are now primary:

    -
    In [200]: dfjo.to_json(orient="index")
    -Out[200]: '{"x":{"A":1,"B":4,"C":7},"y":{"A":2,"B":5,"C":8},"z":{"A":3,"B":6,"C":9}}'
    +
    In [205]: dfjo.to_json(orient="index")
    +Out[205]: '{"x":{"A":1,"B":4,"C":7},"y":{"A":2,"B":5,"C":8},"z":{"A":3,"B":6,"C":9}}'
     
    -In [201]: sjo.to_json(orient="index")
    -Out[201]: '{"x":15,"y":16,"z":17}'
    +In [206]: sjo.to_json(orient="index")
    +Out[206]: '{"x":15,"y":16,"z":17}'
     

    Record oriented serializes the data to a JSON array of column -> value records; index labels are not included. This is useful for passing DataFrame data to plotting libraries, for example the JavaScript library d3.js:

    -
    In [202]: dfjo.to_json(orient="records")
    -Out[202]: '[{"A":1,"B":4,"C":7},{"A":2,"B":5,"C":8},{"A":3,"B":6,"C":9}]'
    +
    In [207]: dfjo.to_json(orient="records")
    +Out[207]: '[{"A":1,"B":4,"C":7},{"A":2,"B":5,"C":8},{"A":3,"B":6,"C":9}]'
     
    -In [203]: sjo.to_json(orient="records")
    -Out[203]: '[15,16,17]'
    +In [208]: sjo.to_json(orient="records")
    +Out[208]: '[15,16,17]'
     

    Value oriented is a bare-bones option which serializes to nested JSON arrays of values only; column and index labels are not included:

    -
    In [204]: dfjo.to_json(orient="values")
    -Out[204]: '[[1,4,7],[2,5,8],[3,6,9]]'
    +
    In [209]: dfjo.to_json(orient="values")
    +Out[209]: '[[1,4,7],[2,5,8],[3,6,9]]'
     
     # Not available for Series
     

    Split oriented serializes to a JSON object containing separate entries for values, index and columns. Name is also included for Series:

    -
    In [205]: dfjo.to_json(orient="split")
    -Out[205]: '{"columns":["A","B","C"],"index":["x","y","z"],"data":[[1,4,7],[2,5,8],[3,6,9]]}'
    +
    In [210]: dfjo.to_json(orient="split")
    +Out[210]: '{"columns":["A","B","C"],"index":["x","y","z"],"data":[[1,4,7],[2,5,8],[3,6,9]]}'
     
    -In [206]: sjo.to_json(orient="split")
    -Out[206]: '{"name":"D","index":["x","y","z"],"data":[15,16,17]}'
    +In [211]: sjo.to_json(orient="split")
    +Out[211]: '{"name":"D","index":["x","y","z"],"data":[15,16,17]}'
     

    Table oriented serializes to the JSON Table Schema, allowing for the @@ -2500,46 +2510,46 @@
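    A minimal sketch, reusing the dfjo frame from above:

# The Table Schema layout embeds a 'schema' block describing the fields
# alongside a 'data' block holding the records themselves.
dfjo.to_json(orient='table')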

    Date handling

    Writing in ISO date format:

    -
    In [207]: dfd = pd.DataFrame(np.random.randn(5, 2), columns=list('AB'))
    +
    In [212]: dfd = pd.DataFrame(np.random.randn(5, 2), columns=list('AB'))
     
    -In [208]: dfd['date'] = pd.Timestamp('20130101')
    +In [213]: dfd['date'] = pd.Timestamp('20130101')
     
    -In [209]: dfd = dfd.sort_index(1, ascending=False)
    +In [214]: dfd = dfd.sort_index(1, ascending=False)
     
    -In [210]: json = dfd.to_json(date_format='iso')
    +In [215]: json = dfd.to_json(date_format='iso')
     
    -In [211]: json
    -Out[211]: '{"date":{"0":"2013-01-01T00:00:00.000Z","1":"2013-01-01T00:00:00.000Z","2":"2013-01-01T00:00:00.000Z","3":"2013-01-01T00:00:00.000Z","4":"2013-01-01T00:00:00.000Z"},"B":{"0":2.5656459463,"1":1.3403088498,"2":-0.2261692849,"3":0.8138502857,"4":-0.8273169356},"A":{"0":-1.2064117817,"1":1.4312559863,"2":-1.1702987971,"3":0.4108345112,"4":0.1320031703}}'
    +In [216]: json
    +Out[216]: '{"date":{"0":"2013-01-01T00:00:00.000Z","1":"2013-01-01T00:00:00.000Z","2":"2013-01-01T00:00:00.000Z","3":"2013-01-01T00:00:00.000Z","4":"2013-01-01T00:00:00.000Z"},"B":{"0":2.5656459463,"1":1.3403088498,"2":-0.2261692849,"3":0.8138502857,"4":-0.8273169356},"A":{"0":-1.2064117817,"1":1.4312559863,"2":-1.1702987971,"3":0.4108345112,"4":0.1320031703}}'
     

    Writing in ISO date format, with microseconds:

    -
    In [212]: json = dfd.to_json(date_format='iso', date_unit='us')
    +
    In [217]: json = dfd.to_json(date_format='iso', date_unit='us')
     
    -In [213]: json
    -Out[213]: '{"date":{"0":"2013-01-01T00:00:00.000000Z","1":"2013-01-01T00:00:00.000000Z","2":"2013-01-01T00:00:00.000000Z","3":"2013-01-01T00:00:00.000000Z","4":"2013-01-01T00:00:00.000000Z"},"B":{"0":2.5656459463,"1":1.3403088498,"2":-0.2261692849,"3":0.8138502857,"4":-0.8273169356},"A":{"0":-1.2064117817,"1":1.4312559863,"2":-1.1702987971,"3":0.4108345112,"4":0.1320031703}}'
    +In [218]: json
    +Out[218]: '{"date":{"0":"2013-01-01T00:00:00.000000Z","1":"2013-01-01T00:00:00.000000Z","2":"2013-01-01T00:00:00.000000Z","3":"2013-01-01T00:00:00.000000Z","4":"2013-01-01T00:00:00.000000Z"},"B":{"0":2.5656459463,"1":1.3403088498,"2":-0.2261692849,"3":0.8138502857,"4":-0.8273169356},"A":{"0":-1.2064117817,"1":1.4312559863,"2":-1.1702987971,"3":0.4108345112,"4":0.1320031703}}'
     

    Epoch timestamps, in seconds:

    -
    In [214]: json = dfd.to_json(date_format='epoch', date_unit='s')
    +
    In [219]: json = dfd.to_json(date_format='epoch', date_unit='s')
     
    -In [215]: json
    -Out[215]: '{"date":{"0":1356998400,"1":1356998400,"2":1356998400,"3":1356998400,"4":1356998400},"B":{"0":2.5656459463,"1":1.3403088498,"2":-0.2261692849,"3":0.8138502857,"4":-0.8273169356},"A":{"0":-1.2064117817,"1":1.4312559863,"2":-1.1702987971,"3":0.4108345112,"4":0.1320031703}}'
    +In [220]: json
    +Out[220]: '{"date":{"0":1356998400,"1":1356998400,"2":1356998400,"3":1356998400,"4":1356998400},"B":{"0":2.5656459463,"1":1.3403088498,"2":-0.2261692849,"3":0.8138502857,"4":-0.8273169356},"A":{"0":-1.2064117817,"1":1.4312559863,"2":-1.1702987971,"3":0.4108345112,"4":0.1320031703}}'
     

    Writing to a file, with a date index and a date column:

    -
    In [216]: dfj2 = dfj.copy()
    +
    In [221]: dfj2 = dfj.copy()
     
    -In [217]: dfj2['date'] = pd.Timestamp('20130101')
    +In [222]: dfj2['date'] = pd.Timestamp('20130101')
     
    -In [218]: dfj2['ints'] = list(range(5))
    +In [223]: dfj2['ints'] = list(range(5))
     
    -In [219]: dfj2['bools'] = True
    +In [224]: dfj2['bools'] = True
     
    -In [220]: dfj2.index = pd.date_range('20130101', periods=5)
    +In [225]: dfj2.index = pd.date_range('20130101', periods=5)
     
    -In [221]: dfj2.to_json('test.json')
    +In [226]: dfj2.to_json('test.json')
     
    -In [222]: with open('test.json') as fh:
    +In [227]: with open('test.json') as fh:
        .....:     print(fh.read())
        .....: 
     {"A":{"1356998400000":-1.2945235903,"1357084800000":0.2766617129,"1357171200000":-0.0139597524,"1357257600000":-0.0061535699,"1357344000000":0.8957173022},"B":{"1356998400000":0.4137381054,"1357084800000":-0.472034511,"1357171200000":-0.3625429925,"1357257600000":-0.923060654,"1357344000000":0.8052440254},"date":{"1356998400000":1356998400000,"1357084800000":1356998400000,"1357171200000":1356998400000,"1357257600000":1356998400000,"1357344000000":1356998400000},"ints":{"1356998400000":0,"1357084800000":1,"1357171200000":2,"1357257600000":3,"1357344000000":4},"bools":{"1356998400000":true,"1357084800000":true,"1357171200000":true,"1357257600000":true,"1357344000000":true}}
    @@ -2573,8 +2583,8 @@
     

    can be dealt with by specifying a simple default_handler:

    -
    In [223]: pd.DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json(default_handler=str)
    -Out[223]: '{"0":{"0":"(1+0j)","1":"(2+0j)","2":"(1+2j)"}}'
    +
    In [228]: pd.DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json(default_handler=str)
    +Out[228]: '{"0":{"0":"(1+0j)","1":"(2+0j)","2":"(1+2j)"}}'
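
    The handler does not have to be str; any callable that maps an otherwise unserializable object to something JSON-friendly works. A minimal sketch, with an illustrative lambda that is not part of the documented example:

    import pandas as pd

    # represent each complex value as a [real, imag] pair instead of a string
    df_c = pd.DataFrame([1.0, 2.0, complex(1.0, 2.0)])
    df_c.to_json(default_handler=lambda obj: [obj.real, obj.imag])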
     
    @@ -2697,8 +2707,8 @@

    Thus there are times when you may want to specify particular dtypes via the dtype keyword argument.

    Reading from a JSON string:

    -
    In [224]: pd.read_json(json)
    -Out[224]: 
    +
    In [229]: pd.read_json(json)
    +Out[229]: 
             date         B         A
     0 2013-01-01  2.565646 -1.206412
     1 2013-01-01  1.340309  1.431256
    @@ -2708,8 +2718,8 @@
     

    Reading from a file:

    -
    In [225]: pd.read_json('test.json')
    -Out[225]: 
    +
    In [230]: pd.read_json('test.json')
    +Out[230]: 
                        A         B       date  ints  bools
     2013-01-01 -1.294524  0.413738 2013-01-01     0   True
     2013-01-02  0.276662 -0.472035 2013-01-01     1   True
    @@ -2719,8 +2729,8 @@
     

    Don’t convert any data (but still convert axes and dates):

    -
    In [226]: pd.read_json('test.json', dtype=object).dtypes
    -Out[226]: 
    +
    In [231]: pd.read_json('test.json', dtype=object).dtypes
    +Out[231]: 
     A        object
     B        object
     date     object
    @@ -2730,8 +2740,8 @@
     

    Specify dtypes for conversion:

    -
    In [227]: pd.read_json('test.json', dtype={'A': 'float32', 'bools': 'int8'}).dtypes
    -Out[227]: 
    +
    In [232]: pd.read_json('test.json', dtype={'A': 'float32', 'bools': 'int8'}).dtypes
    +Out[232]: 
     A               float32
     B               float64
     date     datetime64[ns]
    @@ -2741,51 +2751,51 @@
     

    Preserve string indices:

    -
    In [228]: si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)),
    +
    In [233]: si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)),
        .....:                   index=[str(i) for i in range(4)])
        .....: 
     
    -In [229]: si
    -Out[229]: 
    +In [234]: si
    +Out[234]: 
          0    1    2    3
     0  0.0  0.0  0.0  0.0
     1  0.0  0.0  0.0  0.0
     2  0.0  0.0  0.0  0.0
     3  0.0  0.0  0.0  0.0
     
    -In [230]: si.index
    -Out[230]: Index(['0', '1', '2', '3'], dtype='object')
    +In [235]: si.index
    +Out[235]: Index(['0', '1', '2', '3'], dtype='object')
     
    -In [231]: si.columns
    -Out[231]: Int64Index([0, 1, 2, 3], dtype='int64')
    +In [236]: si.columns
    +Out[236]: Int64Index([0, 1, 2, 3], dtype='int64')
     
    -In [232]: json = si.to_json()
    +In [237]: json = si.to_json()
     
    -In [233]: sij = pd.read_json(json, convert_axes=False)
    +In [238]: sij = pd.read_json(json, convert_axes=False)
     
    -In [234]: sij
    -Out[234]: 
    +In [239]: sij
    +Out[239]: 
        0  1  2  3
     0  0  0  0  0
     1  0  0  0  0
     2  0  0  0  0
     3  0  0  0  0
     
    -In [235]: sij.index
    -Out[235]: Index(['0', '1', '2', '3'], dtype='object')
    +In [240]: sij.index
    +Out[240]: Index(['0', '1', '2', '3'], dtype='object')
     
    -In [236]: sij.columns
    -Out[236]: Index(['0', '1', '2', '3'], dtype='object')
    +In [241]: sij.columns
    +Out[241]: Index(['0', '1', '2', '3'], dtype='object')
     

    Dates written in nanoseconds need to be read back in nanoseconds:

    -
    In [237]: json = dfj2.to_json(date_unit='ns')
    +
    In [242]: json = dfj2.to_json(date_unit='ns')
     
     # Try to parse timestamps as milliseconds -> Won't Work
    -In [238]: dfju = pd.read_json(json, date_unit='ms')
    +In [243]: dfju = pd.read_json(json, date_unit='ms')
     
    -In [239]: dfju
    -Out[239]: 
    +In [244]: dfju
    +Out[244]: 
                                 A         B                 date  ints  bools
     1356998400000000000 -1.294524  0.413738  1356998400000000000     0   True
     1357084800000000000  0.276662 -0.472035  1356998400000000000     1   True
    @@ -2794,10 +2804,10 @@
     1357344000000000000  0.895717  0.805244  1356998400000000000     4   True
     
     # Let pandas detect the correct precision
    -In [240]: dfju = pd.read_json(json)
    +In [245]: dfju = pd.read_json(json)
     
    -In [241]: dfju
    -Out[241]: 
    +In [246]: dfju
    +Out[246]: 
                        A         B       date  ints  bools
     2013-01-01 -1.294524  0.413738 2013-01-01     0   True
     2013-01-02  0.276662 -0.472035 2013-01-01     1   True
    @@ -2806,10 +2816,10 @@
     2013-01-05  0.895717  0.805244 2013-01-01     4   True
     
     # Or specify that all timestamps are in nanoseconds
    -In [242]: dfju = pd.read_json(json, date_unit='ns')
    +In [247]: dfju = pd.read_json(json, date_unit='ns')
     
    -In [243]: dfju
    -Out[243]: 
    +In [248]: dfju
    +Out[248]: 
                        A         B       date  ints  bools
     2013-01-01 -1.294524  0.413738 2013-01-01     0   True
     2013-01-02  0.276662 -0.472035 2013-01-01     1   True
    @@ -2830,33 +2840,33 @@
     to NumPy arrays, bypassing the need for intermediate Python objects.

    This can provide speedups if you are deserialising a large amount of numeric data:

    -
    In [244]: randfloats = np.random.uniform(-100, 1000, 10000)
    +
    In [249]: randfloats = np.random.uniform(-100, 1000, 10000)
     
    -In [245]: randfloats.shape = (1000, 10)
    +In [250]: randfloats.shape = (1000, 10)
     
    -In [246]: dffloats = pd.DataFrame(randfloats, columns=list('ABCDEFGHIJ'))
    +In [251]: dffloats = pd.DataFrame(randfloats, columns=list('ABCDEFGHIJ'))
     
    -In [247]: jsonfloats = dffloats.to_json()
    +In [252]: jsonfloats = dffloats.to_json()
     
    -
    In [248]: %timeit pd.read_json(jsonfloats)
    -17.2 ms +- 128 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
    +
    In [253]: %timeit pd.read_json(jsonfloats)
    +7.88 ms +- 31.4 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
     
    -
    In [249]: %timeit pd.read_json(jsonfloats, numpy=True)
    -12.9 ms +- 149 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
    +
    In [254]: %timeit pd.read_json(jsonfloats, numpy=True)
    +5.8 ms +- 50.2 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
     

    The speedup is less noticeable for smaller datasets:

    -
    In [250]: jsonfloats = dffloats.head(100).to_json()
    +
    In [255]: jsonfloats = dffloats.head(100).to_json()
     
    -
    In [251]: %timeit pd.read_json(jsonfloats)
    -12.1 ms +- 431 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
    +
    In [256]: %timeit pd.read_json(jsonfloats)
    +5.36 ms +- 19.5 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
     
    -
    In [252]: %timeit pd.read_json(jsonfloats, numpy=True)
    -10.4 ms +- 225 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
    +
    In [257]: %timeit pd.read_json(jsonfloats, numpy=True)
    +4.67 ms +- 27.1 us per loop (mean +- std. dev. of 7 runs, 100 loops each)
     
    @@ -2882,22 +2892,22 @@

    Normalization

    pandas provides a utility function to take a dict or list of dicts and normalize this semi-structured data into a flat table.

    -
    In [253]: from pandas.io.json import json_normalize
    +
    In [258]: from pandas.io.json import json_normalize
     
    -In [254]: data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
    +In [259]: data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
        .....:         {'name': {'given': 'Mose', 'family': 'Regner'}},
        .....:         {'id': 2, 'name': 'Faye Raker'}]
        .....: 
     
    -In [255]: json_normalize(data)
    -Out[255]: 
    +In [260]: json_normalize(data)
    +Out[260]: 
         id name.first name.last name.given name.family        name
     0  1.0     Coleen      Volk        NaN         NaN         NaN
     1  NaN        NaN       NaN       Mose      Regner         NaN
     2  2.0        NaN       NaN        NaN         NaN  Faye Raker
     
    -
    In [256]: data = [{'state': 'Florida',
    +
    In [261]: data = [{'state': 'Florida',
        .....:          'shortname': 'FL',
        .....:          'info': {'governor': 'Rick Scott'},
        .....:          'counties': [{'name': 'Dade', 'population': 12345},
    @@ -2910,8 +2920,8 @@
        .....:                       {'name': 'Cuyahoga', 'population': 1337}]}]
        .....: 
     
    -In [257]: json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
    -Out[257]: 
    +In [262]: json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
    +Out[262]: 
              name  population    state shortname info.governor
     0        Dade       12345  Florida        FL    Rick Scott
     1     Broward       40000  Florida        FL    Rick Scott
    @@ -2922,7 +2932,7 @@
     

    The max_level parameter provides more control over the level at which normalization ends. With max_level=1 the following snippet normalizes only up to the first nesting level of the provided dict.

    -
    In [258]: data = [{'CreatedBy': {'Name': 'User001'},
    +
    In [263]: data = [{'CreatedBy': {'Name': 'User001'},
        .....:          'Lookup': {'TextField': 'Some text',
        .....:                     'UserField': {'Id': 'ID001',
        .....:                                   'Name': 'Name001'}},
    @@ -2930,8 +2940,8 @@
        .....:          }]
        .....: 
     
    -In [259]: json_normalize(data, max_level=1)
    -Out[259]: 
    +In [264]: json_normalize(data, max_level=1)
    +Out[264]: 
       CreatedBy.Name Lookup.TextField                    Lookup.UserField Image.a
     0        User001        Some text  {'Id': 'ID001', 'Name': 'Name001'}       b
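
    The separator used when joining nested keys into column names can be changed through the sep argument; a minimal sketch (the underscore separator is purely illustrative):

    from pandas.io.json import json_normalize

    data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}]
    # columns become 'id', 'name_first', 'name_last'
    json_normalize(data, sep='_')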
     
    @@ -2945,30 +2955,30 @@

    New in version 0.21.0.

    For line-delimited json files, pandas can also return an iterator which reads in chunksize lines at a time. This can be useful for large files or to read from a stream.

    -
    In [260]: jsonl = '''
    +
    In [265]: jsonl = '''
        .....:     {"a": 1, "b": 2}
        .....:     {"a": 3, "b": 4}
        .....: '''
        .....: 
     
    -In [261]: df = pd.read_json(jsonl, lines=True)
    +In [266]: df = pd.read_json(jsonl, lines=True)
     
    -In [262]: df
    -Out[262]: 
    +In [267]: df
    +Out[267]: 
        a  b
     0  1  2
     1  3  4
     
    -In [263]: df.to_json(orient='records', lines=True)
    -Out[263]: '{"a":1,"b":2}\n{"a":3,"b":4}'
    +In [268]: df.to_json(orient='records', lines=True)
    +Out[268]: '{"a":1,"b":2}\n{"a":3,"b":4}'
     
     # reader is an iterator that returns `chunksize` lines each iteration
    -In [264]: reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1)
    +In [269]: reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1)
     
    -In [265]: reader
    -Out[265]: <pandas.io.json._json.JsonReader at 0x11702a950>
    +In [270]: reader
    +Out[270]: <pandas.io.json._json.JsonReader at 0x122346550>
     
    -In [266]: for chunk in reader:
    +In [271]: for chunk in reader:
        .....:     print(chunk)
        .....: 
     Empty DataFrame
    @@ -2990,22 +3000,22 @@
     object. The JSON includes information on the field names, types, and
     other attributes. You can use the orient table to build
     a JSON string with two fields, schema and data.

    -
    In [267]: df = pd.DataFrame({'A': [1, 2, 3],
    +
    In [272]: df = pd.DataFrame({'A': [1, 2, 3],
        .....:                    'B': ['a', 'b', 'c'],
        .....:                    'C': pd.date_range('2016-01-01', freq='d', periods=3)},
        .....:                   index=pd.Index(range(3), name='idx'))
        .....: 
     
    -In [268]: df
    -Out[268]: 
    +In [273]: df
    +Out[273]: 
          A  B          C
     idx                 
     0    1  a 2016-01-01
     1    2  b 2016-01-02
     2    3  c 2016-01-03
     
    -In [269]: df.to_json(orient='table', date_format="iso")
    -Out[269]: '{"schema": {"fields":[{"name":"idx","type":"integer"},{"name":"A","type":"integer"},{"name":"B","type":"string"},{"name":"C","type":"datetime"}],"primaryKey":["idx"],"pandas_version":"0.20.0"}, "data": [{"idx":0,"A":1,"B":"a","C":"2016-01-01T00:00:00.000Z"},{"idx":1,"A":2,"B":"b","C":"2016-01-02T00:00:00.000Z"},{"idx":2,"A":3,"B":"c","C":"2016-01-03T00:00:00.000Z"}]}'
    +In [274]: df.to_json(orient='table', date_format="iso")
    +Out[274]: '{"schema": {"fields":[{"name":"idx","type":"integer"},{"name":"A","type":"integer"},{"name":"B","type":"string"},{"name":"C","type":"datetime"}],"primaryKey":["idx"],"pandas_version":"0.20.0"}, "data": [{"idx":0,"A":1,"B":"a","C":"2016-01-01T00:00:00.000Z"},{"idx":1,"A":2,"B":"b","C":"2016-01-02T00:00:00.000Z"},{"idx":2,"A":3,"B":"c","C":"2016-01-03T00:00:00.000Z"}]}'
     

    The schema field contains the fields key, which itself contains
    @@ -3061,12 +3071,12 @@

  • All dates are converted to UTC when serializing; even timezone-naive values are treated as UTC with an offset of 0.

    -
    In [270]: from pandas.io.json import build_table_schema
    +
    In [275]: from pandas.io.json import build_table_schema
     
    -In [271]: s = pd.Series(pd.date_range('2016', periods=4))
    +In [276]: s = pd.Series(pd.date_range('2016', periods=4))
     
    -In [272]: build_table_schema(s)
    -Out[272]: 
    +In [277]: build_table_schema(s)
    +Out[277]: 
     {'fields': [{'name': 'index', 'type': 'integer'},
       {'name': 'values', 'type': 'datetime'}],
      'primaryKey': ['index'],
    @@ -3076,12 +3086,12 @@
     
  • datetimes with a timezone (before serializing) include an additional field tz with the time zone name (e.g. 'US/Central').

    -
    In [273]: s_tz = pd.Series(pd.date_range('2016', periods=12,
    +
    In [278]: s_tz = pd.Series(pd.date_range('2016', periods=12,
        .....:                                tz='US/Central'))
        .....: 
     
    -In [274]: build_table_schema(s_tz)
    -Out[274]: 
    +In [279]: build_table_schema(s_tz)
    +Out[279]: 
     {'fields': [{'name': 'index', 'type': 'integer'},
       {'name': 'values', 'type': 'datetime', 'tz': 'US/Central'}],
      'primaryKey': ['index'],
    @@ -3092,12 +3102,12 @@
     
  • Periods are converted to timestamps before serialization, and so are likewise converted to UTC. In addition, periods will contain an additional field freq with the period’s frequency, e.g. 'A-DEC'.

    -
    In [275]: s_per = pd.Series(1, index=pd.period_range('2016', freq='A-DEC',
    +
    In [280]: s_per = pd.Series(1, index=pd.period_range('2016', freq='A-DEC',
        .....:                                            periods=4))
        .....: 
     
    -In [276]: build_table_schema(s_per)
    -Out[276]: 
    +In [281]: build_table_schema(s_per)
    +Out[281]: 
     {'fields': [{'name': 'index', 'type': 'datetime', 'freq': 'A-DEC'},
       {'name': 'values', 'type': 'integer'}],
      'primaryKey': ['index'],
    @@ -3107,10 +3117,10 @@
     
  • Categoricals use the any type and an enum constraint listing the set of possible values. Additionally, an ordered field is included:

    -
    In [277]: s_cat = pd.Series(pd.Categorical(['a', 'b', 'a']))
    +
    In [282]: s_cat = pd.Series(pd.Categorical(['a', 'b', 'a']))
     
    -In [278]: build_table_schema(s_cat)
    -Out[278]: 
    +In [283]: build_table_schema(s_cat)
    +Out[283]: 
     {'fields': [{'name': 'index', 'type': 'integer'},
       {'name': 'values',
        'type': 'any',
    @@ -3123,10 +3133,10 @@
     
  • A primaryKey field, containing an array of labels, is included only if the index is unique; it is omitted for a duplicated index:

    -
    In [279]: s_dupe = pd.Series([1, 2], index=[1, 1])
    +
    In [284]: s_dupe = pd.Series([1, 2], index=[1, 1])
     
    -In [280]: build_table_schema(s_dupe)
    -Out[280]: 
    +In [285]: build_table_schema(s_dupe)
    +Out[285]: 
     {'fields': [{'name': 'index', 'type': 'integer'},
       {'name': 'values', 'type': 'integer'}],
      'pandas_version': '0.20.0'}
    @@ -3135,12 +3145,12 @@
     
  • The primaryKey behavior is the same with MultiIndexes, but in this case the primaryKey is an array:

    -
    In [281]: s_multi = pd.Series(1, index=pd.MultiIndex.from_product([('a', 'b'),
    +
    In [286]: s_multi = pd.Series(1, index=pd.MultiIndex.from_product([('a', 'b'),
        .....:                                                          (0, 1)]))
        .....: 
     
    -In [282]: build_table_schema(s_multi)
    -Out[282]: 
    +In [287]: build_table_schema(s_multi)
    +Out[287]: 
     {'fields': [{'name': 'level_0', 'type': 'string'},
       {'name': 'level_1', 'type': 'integer'},
       {'name': 'values', 'type': 'integer'}],
    @@ -3170,15 +3180,15 @@
     the preservation of metadata such as dtypes and index names in a
     round-trippable manner.

    -
    In [283]: df = pd.DataFrame({'foo': [1, 2, 3, 4],
    +
    In [288]: df = pd.DataFrame({'foo': [1, 2, 3, 4],
        .....:        'bar': ['a', 'b', 'c', 'd'],
        .....:        'baz': pd.date_range('2018-01-01', freq='d', periods=4),
        .....:        'qux': pd.Categorical(['a', 'b', 'c', 'c'])
        .....:        }, index=pd.Index(range(4), name='idx'))
        .....: 
     
    -In [284]: df
    -Out[284]: 
    +In [289]: df
    +Out[289]: 
          foo bar        baz qux
     idx                        
     0      1   a 2018-01-01   a
    @@ -3186,20 +3196,20 @@
     2      3   c 2018-01-03   c
     3      4   d 2018-01-04   c
     
    -In [285]: df.dtypes
    -Out[285]: 
    +In [290]: df.dtypes
    +Out[290]: 
     foo             int64
     bar            object
     baz    datetime64[ns]
     qux          category
     dtype: object
     
    -In [286]: df.to_json('test.json', orient='table')
    +In [291]: df.to_json('test.json', orient='table')
     
    -In [287]: new_df = pd.read_json('test.json', orient='table')
    +In [292]: new_df = pd.read_json('test.json', orient='table')
     
    -In [288]: new_df
    -Out[288]: 
    +In [293]: new_df
    +Out[293]: 
          foo bar        baz qux
     idx                        
     0      1   a 2018-01-01   a
    @@ -3207,8 +3217,8 @@
     2      3   c 2018-01-03   c
     3      4   d 2018-01-04   c
     
    -In [289]: new_df.dtypes
    -Out[289]: 
    +In [294]: new_df.dtypes
    +Out[294]: 
     foo             int64
     bar            object
     baz    datetime64[ns]
    @@ -3221,13 +3231,13 @@
     is not round-trippable, nor are any names beginning with 'level_' within a
     MultiIndex. These are used by default in DataFrame.to_json() to
     indicate missing values and the subsequent read cannot distinguish the intent.

    -
    In [290]: df.index.name = 'index'
    +
    In [295]: df.index.name = 'index'
     
    -In [291]: df.to_json('test.json', orient='table')
    +In [296]: df.to_json('test.json', orient='table')
     
    -In [292]: new_df = pd.read_json('test.json', orient='table')
    +In [297]: new_df = pd.read_json('test.json', orient='table')
     
    -In [293]: print(new_df.index.name)
    +In [298]: print(new_df.index.name)
     None
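
    By contrast, an index name that is not one of the default placeholders survives the round trip; a minimal sketch reusing the frame from above:

    df.index.name = 'idx'
    df.to_json('test.json', orient='table')
    new_df = pd.read_json('test.json', orient='table')
    print(new_df.index.name)  # prints: idx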
     
    @@ -3251,14 +3261,14 @@ only a single table contained in the HTML content.

    Read a URL with no options:

    -
    In [294]: url = 'https://www.fdic.gov/bank/individual/failed/banklist.html'
    +
    In [299]: url = 'https://www.fdic.gov/bank/individual/failed/banklist.html'
     
    -In [295]: dfs = pd.read_html(url)
    +In [300]: dfs = pd.read_html(url)
     
    -In [296]: dfs
    -Out[296]: 
    +In [301]: dfs
    +Out[301]: 
     [                                             Bank Name        City  ST   CERT                Acquiring Institution       Closing Date       Updated Date
    - 0                                 The Enloe State Bank      Cooper  TX  10716                   Legend Bank, N. A.       May 31, 2019    August 22, 2019
    + 0                                 The Enloe State Bank      Cooper  TX  10716                   Legend Bank, N. A.       May 31, 2019      June 18, 2019
      1                  Washington Federal Bank for Savings     Chicago  IL  30570                   Royal Savings Bank  December 15, 2017      July 24, 2019
      2      The Farmers and Merchants State Bank of Argonia     Argonia  KS  17719                          Conway Bank   October 13, 2017    August 12, 2019
      3                                  Fayette County Bank  Saint Elmo  IL   1802            United Fidelity Bank, fsb       May 26, 2017   January 29, 2019
    @@ -3280,12 +3290,12 @@
     

    Read in the content of the file from the above URL and pass it to read_html as a string:

    -
    In [297]: with open(file_path, 'r') as f:
    +
    In [302]: with open(file_path, 'r') as f:
        .....:     dfs = pd.read_html(f.read())
        .....: 
     
    -In [298]: dfs
    -Out[298]: 
    +In [303]: dfs
    +Out[303]: 
     [                                    Bank Name          City  ST   CERT                Acquiring Institution       Closing Date       Updated Date
      0    Banks of Wisconsin d/b/a Bank of Kenosha       Kenosha  WI  35386                North Shore Bank, FSB       May 31, 2013       May 31, 2013
      1                        Central Arizona Bank    Scottsdale  AZ  34527                   Western State Bank       May 14, 2013       May 20, 2013
    @@ -3303,14 +3313,14 @@
     

    You can even pass in an instance of StringIO if you so desire:

    -
    In [299]: with open(file_path, 'r') as f:
    +
    In [304]: with open(file_path, 'r') as f:
        .....:     sio = StringIO(f.read())
        .....: 
     
    -In [300]: dfs = pd.read_html(sio)
    +In [305]: dfs = pd.read_html(sio)
     
    -In [301]: dfs
    -Out[301]: 
    +In [306]: dfs
    +Out[306]: 
     [                                    Bank Name          City  ST   CERT                Acquiring Institution       Closing Date       Updated Date
      0    Banks of Wisconsin d/b/a Bank of Kenosha       Kenosha  WI  35386                North Shore Bank, FSB       May 31, 2013       May 31, 2013
      1                        Central Arizona Bank    Scottsdale  AZ  34527                   Western State Bank       May 14, 2013       May 20, 2013
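
    read_html also accepts raw HTML passed directly as a string, and the match argument restricts the result to tables whose text matches a pattern; a minimal sketch using a small hypothetical inline table:

    html = ('<table>'
            '<tr><th>Bank Name</th><th>CERT</th></tr>'
            '<tr><td>First Example Bank</td><td>12345</td></tr>'
            '</table>')
    dfs = pd.read_html(html, match='Bank Name')  # a list containing a single DataFrame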
    @@ -3421,15 +3431,15 @@
     brevity’s sake. See to_html() for the
     full set of options.

    -
    In [302]: df = pd.DataFrame(np.random.randn(2, 2))
    +
    In [307]: df = pd.DataFrame(np.random.randn(2, 2))
     
    -In [303]: df
    -Out[303]: 
    +In [308]: df
    +Out[308]: 
               0         1
     0 -0.184744  0.496971
     1 -0.856240  1.857977
     
    -In [304]: print(df.to_html())  # raw html
    +In [309]: print(df.to_html())  # raw html
     <table border="1" class="dataframe">
       <thead>
         <tr style="text-align: right;">
    @@ -3475,7 +3485,7 @@
         
       
     

    The columns argument will limit the columns shown:

    -
    In [305]: print(df.to_html(columns=[0]))
    +
    In [310]: print(df.to_html(columns=[0]))
     <table border="1" class="dataframe">
       <thead>
         <tr style="text-align: right;">
    @@ -3516,7 +3526,7 @@
       
     

    float_format takes a Python callable to control the precision of floating point values:

    -
    In [306]: print(df.to_html(float_format='{0:.10f}'.format))
    +
    In [311]: print(df.to_html(float_format='{0:.10f}'.format))
     <table border="1" class="dataframe">
       <thead>
         <tr style="text-align: right;">
    @@ -3563,7 +3573,7 @@
       
     

    bold_rows will make the row labels bold by default, but you can turn that off:

    -
    In [307]: print(df.to_html(bold_rows=False))
    +
    In [312]: print(df.to_html(bold_rows=False))
     <table border="1" class="dataframe">
       <thead>
         <tr style="text-align: right;">
    @@ -3610,7 +3620,7 @@
     

    The classes argument provides the ability to give the resulting HTML table CSS classes. Note that these classes are appended to the existing 'dataframe' class.

    -
    In [308]: print(df.to_html(classes=['awesome_table_class', 'even_more_awesome_class']))
    +
    In [313]: print(df.to_html(classes=['awesome_table_class', 'even_more_awesome_class']))
     <table border="1" class="dataframe awesome_table_class even_more_awesome_class">
       <thead>
         <tr style="text-align: right;">
    @@ -3639,12 +3649,12 @@
     

    New in version 0.24.

    -
    In [309]: url_df = pd.DataFrame({
    +
    In [314]: url_df = pd.DataFrame({
        .....:     'name': ['Python', 'Pandas'],
        .....:     'url': ['https://www.python.org/', 'http://pandas.pydata.org']})
        .....: 
     
    -In [310]: print(url_df.to_html(render_links=True))
    +In [315]: print(url_df.to_html(render_links=True))
     <table border="1" class="dataframe">
       <thead>
         <tr style="text-align: right;">
    @@ -3692,11 +3702,11 @@
     

    Finally, the escape argument allows you to control whether the “<”, “>” and “&” characters are escaped in the resulting HTML (by default it is True). To get the HTML without escaped characters, pass escape=False.

    -
    In [311]: df = pd.DataFrame({'a': list('&<>'), 'b': np.random.randn(3)})
    +
    In [316]: df = pd.DataFrame({'a': list('&<>'), 'b': np.random.randn(3)})
     

    Escaped:

    -
    In [312]: print(df.to_html())
    +
    In [317]: print(df.to_html())
     <table border="1" class="dataframe">
       <thead>
         <tr style="text-align: right;">
    @@ -3751,7 +3761,7 @@
         
       
     

    Not escaped:

    -
    In [313]: print(df.to_html(escape=False))
    +
    In [318]: print(df.to_html(escape=False))
     <table border="1" class="dataframe">
       <thead>
         <tr style="text-align: right;">
    @@ -4005,16 +4015,16 @@
     or columns have serialized level names, those will be read in as well by specifying
     the rows/columns that make up the levels.

    For example, to read in a MultiIndex index without names:

    -
    In [314]: df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]},
    +
    In [319]: df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]},
        .....:                   index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']]))
        .....: 
     
    -In [315]: df.to_excel('path_to_file.xlsx')
    +In [320]: df.to_excel('path_to_file.xlsx')
     
    -In [316]: df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1])
    +In [321]: df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1])
     
    -In [317]: df
    -Out[317]: 
    +In [322]: df
    +Out[322]: 
          a  b
     a c  1  5
       d  2  6
    @@ -4024,14 +4034,14 @@
     

    If the index has level names, they will be parsed as well, using the same parameters.

    -
    In [318]: df.index = df.index.set_names(['lvl1', 'lvl2'])
    +
    In [323]: df.index = df.index.set_names(['lvl1', 'lvl2'])
     
    -In [319]: df.to_excel('path_to_file.xlsx')
    +In [324]: df.to_excel('path_to_file.xlsx')
     
    -In [320]: df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1])
    +In [325]: df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1])
     
    -In [321]: df
    -Out[321]: 
    +In [326]: df
    +Out[326]: 
                a  b
     lvl1 lvl2      
     a    c     1  5
    @@ -4042,16 +4052,16 @@
     

    If the source file has both a MultiIndex index and a MultiIndex column header, lists specifying each should be passed to index_col and header:

    -
    In [322]: df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']],
    +
    In [327]: df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']],
        .....:                                         names=['c1', 'c2'])
        .....: 
     
    -In [323]: df.to_excel('path_to_file.xlsx')
    +In [328]: df.to_excel('path_to_file.xlsx')
     
    -In [324]: df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1])
    +In [329]: df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1])
     
    -In [325]: df
    -Out[325]: 
    +In [330]: df
    +Out[330]: 
     c1         a   
     c2         b  d
     lvl1 lvl2      
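
    When several frames need to go into one workbook, an ExcelWriter can be used to write each to its own sheet; a minimal sketch (the frames and sheet names are illustrative):

    df1 = pd.DataFrame({'a': [1, 2]})
    df2 = pd.DataFrame({'b': [3, 4]})
    with pd.ExcelWriter('path_to_file.xlsx') as writer:
        df1.to_excel(writer, sheet_name='first')
        df2.to_excel(writer, sheet_name='second')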
    @@ -4341,8 +4351,8 @@
     

    Pickling

    All pandas objects are equipped with to_pickle methods which use Python’s pickle module to save data structures to disk using the pickle format.

    -
    In [326]: df
    -Out[326]: 
    +
    In [331]: df
    +Out[331]: 
     c1         a   
     c2         b  d
     lvl1 lvl2      
    @@ -4351,13 +4361,13 @@
     b    c     3  7
          d     4  8
     
    -In [327]: df.to_pickle('foo.pkl')
    +In [332]: df.to_pickle('foo.pkl')
     

    The read_pickle function in the pandas namespace can be used to load any pickled pandas object (or any other pickled object) from file:

    -
    In [328]: pd.read_pickle('foo.pkl')
    -Out[328]: 
    +
    In [333]: pd.read_pickle('foo.pkl')
    +Out[333]: 
     c1         a   
     c2         b  d
     lvl1 lvl2      
    @@ -4388,14 +4398,14 @@
     

    The compression type can be an explicit parameter or be inferred from the file extension. If ‘infer’, then gzip, bz2, zip, or xz is used when the filename ends in '.gz', '.bz2', '.zip', or '.xz', respectively.

    -
    In [329]: df = pd.DataFrame({
    +
    In [334]: df = pd.DataFrame({
        .....:     'A': np.random.randn(1000),
        .....:     'B': 'foo',
        .....:     'C': pd.date_range('20130101', periods=1000, freq='s')})
        .....: 
     
    -In [330]: df
    -Out[330]: 
    +In [335]: df
    +Out[335]: 
                 A    B                   C
     0   -0.288267  foo 2013-01-01 00:00:00
     1   -0.084905  foo 2013-01-01 00:00:01
    @@ -4413,12 +4423,12 @@
     

    Using an explicit compression type:

    -
    In [331]: df.to_pickle("data.pkl.compress", compression="gzip")
    +
    In [336]: df.to_pickle("data.pkl.compress", compression="gzip")
     
    -In [332]: rt = pd.read_pickle("data.pkl.compress", compression="gzip")
    +In [337]: rt = pd.read_pickle("data.pkl.compress", compression="gzip")
     
    -In [333]: rt
    -Out[333]: 
    +In [338]: rt
    +Out[338]: 
                 A    B                   C
     0   -0.288267  foo 2013-01-01 00:00:00
     1   -0.084905  foo 2013-01-01 00:00:01
    @@ -4436,12 +4446,12 @@
     

    Inferring compression type from the extension:

    -
    In [334]: df.to_pickle("data.pkl.xz", compression="infer")
    +
    In [339]: df.to_pickle("data.pkl.xz", compression="infer")
     
    -In [335]: rt = pd.read_pickle("data.pkl.xz", compression="infer")
    +In [340]: rt = pd.read_pickle("data.pkl.xz", compression="infer")
     
    -In [336]: rt
    -Out[336]: 
    +In [341]: rt
    +Out[341]: 
                 A    B                   C
     0   -0.288267  foo 2013-01-01 00:00:00
     1   -0.084905  foo 2013-01-01 00:00:01
    @@ -4459,12 +4469,12 @@
     

    The default is to ‘infer’:

    -
    In [337]: df.to_pickle("data.pkl.gz")
    +
    In [342]: df.to_pickle("data.pkl.gz")
     
    -In [338]: rt = pd.read_pickle("data.pkl.gz")
    +In [343]: rt = pd.read_pickle("data.pkl.gz")
     
    -In [339]: rt
    -Out[339]: 
    +In [344]: rt
    +Out[344]: 
                 A    B                   C
     0   -0.288267  foo 2013-01-01 00:00:00
     1   -0.084905  foo 2013-01-01 00:00:01
    @@ -4480,12 +4490,12 @@
     
     [1000 rows x 3 columns]
     
    -In [340]: df["A"].to_pickle("s1.pkl.bz2")
    +In [345]: df["A"].to_pickle("s1.pkl.bz2")
     
    -In [341]: rt = pd.read_pickle("s1.pkl.bz2")
    +In [346]: rt = pd.read_pickle("s1.pkl.bz2")
     
    -In [342]: rt
    -Out[342]: 
    +In [347]: rt
    +Out[347]: 
     0     -0.288267
     1     -0.084905
     2      0.004772
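
    to_pickle also accepts a protocol argument to pin the pickle protocol version explicitly; a minimal sketch (protocol 4 is an arbitrary illustrative choice):

    df.to_pickle('foo.pkl', protocol=4)  # write with an explicit pickle protocol
    rt = pd.read_pickle('foo.pkl')       # the protocol is detected automatically on read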
    @@ -4517,12 +4527,12 @@
     

    Warning

    read_msgpack() is only guaranteed to be backwards compatible back to pandas version 0.20.3.

    -
    In [343]: df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB'))
    +
    In [348]: df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB'))
     
    -In [344]: df.to_msgpack('foo.msg')
    +In [349]: df.to_msgpack('foo.msg')
     
    -In [345]: pd.read_msgpack('foo.msg')
    -Out[345]: 
    +In [350]: pd.read_msgpack('foo.msg')
    +Out[350]: 
               A         B
     0  0.275432  0.293583
     1  0.842639  0.165381
    @@ -4530,14 +4540,14 @@
     3  0.136543  0.029703
     4  0.318083  0.604870
     
    -In [346]: s = pd.Series(np.random.rand(5), index=pd.date_range('20130101', periods=5))
    +In [351]: s = pd.Series(np.random.rand(5), index=pd.date_range('20130101', periods=5))
     

    You can pass a list of objects and you will receive them back on deserialization.

    -
    In [347]: pd.to_msgpack('foo.msg', df, 'foo', np.array([1, 2, 3]), s)
    +
    In [352]: pd.to_msgpack('foo.msg', df, 'foo', np.array([1, 2, 3]), s)
     
    -In [348]: pd.read_msgpack('foo.msg')
    -Out[348]: 
    +In [353]: pd.read_msgpack('foo.msg')
    +Out[353]: 
     [          A         B
      0  0.275432  0.293583
      1  0.842639  0.165381
    @@ -4552,7 +4562,7 @@
     

    You can pass iterator=True to iterate over the unpacked results:

    -
    In [349]: for o in pd.read_msgpack('foo.msg', iterator=True):
    +
    In [354]: for o in pd.read_msgpack('foo.msg', iterator=True):
        .....:     print(o)
        .....: 
               A         B
    @@ -4572,10 +4582,10 @@
     

    You can pass append=True to the writer to append to an existing pack:

    -
    In [350]: df.to_msgpack('foo.msg', append=True)
    +
    In [355]: df.to_msgpack('foo.msg', append=True)
     
    -In [351]: pd.read_msgpack('foo.msg')
    -Out[351]: 
    +In [356]: pd.read_msgpack('foo.msg')
    +Out[356]: 
     [          A         B
      0  0.275432  0.293583
      1  0.842639  0.165381
    @@ -4598,12 +4608,12 @@
     df.to_msgpack() and using the top-level pd.to_msgpack(...) where you
     can pack arbitrary collections of Python lists, dicts, scalars, while intermixing
     pandas objects.

    -
    In [352]: pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'},
    +
    In [357]: pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'},
        .....:                                     {'scalar': 1.}, {'s': s}]})
        .....: 
     
    -In [353]: pd.read_msgpack('foo2.msg')
    -Out[353]: 
    +In [358]: pd.read_msgpack('foo2.msg')
    +Out[358]: 
     {'dict': ({'df':           A         B
        0  0.275432  0.293583
        1  0.842639  0.165381
    @@ -4623,13 +4633,13 @@
     

    Read/write API

    Msgpacks can also be read from and written to strings.

    -
    In [354]: df.to_msgpack()
    -Out[354]: b'\x84\xa3typ\xadblock_manager\xa5klass\xa9DataFrame\xa4axes\x92\x86\xa3typ\xa5index\xa5klass\xa5Index\xa4name\xc0\xa5dtype\xa6object\xa4data\x92\xa1A\xa1B\xa8compress\xc0\x86\xa3typ\xabrange_index\xa5klass\xaaRangeIndex\xa4name\xc0\xa5start\x00\xa4stop\x05\xa4step\x01\xa6blocks\x91\x86\xa4locs\x86\xa3typ\xa7ndarray\xa5shape\x91\x02\xa4ndim\x01\xa5dtype\xa5int64\xa4data\xd8\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\xa8compress\xc0\xa6values\xc7P\x00\xc84 \x84\xac\xa0\xd1?\x0f\xa4.\xb5\xe6\xf6\xea?\xb9\x85\x9aLO|\xe3?\xac\xf0\xd7\x81>z\xc1?\\\xca\x97\ty[\xd4?\x9c\x9b\x8a:\x11\xca\xd2?\x14zX\xd01+\xc5?4=\x19b\xad\xec\xe8?\xc0!\xe9\xf4\x8ej\x9e?\xa7>_\xac\x17[\xe3?\xa5shape\x92\x02\x05\xa5dtype\xa7float64\xa5klass\xaaFloatBlock\xa8compress\xc0'
    +
    In [359]: df.to_msgpack()
    +Out[359]: b'\x84\xa3typ\xadblock_manager\xa5klass\xa9DataFrame\xa4axes\x92\x86\xa3typ\xa5index\xa5klass\xa5Index\xa4name\xc0\xa5dtype\xa6object\xa4data\x92\xa1A\xa1B\xa8compress\xc0\x86\xa3typ\xabrange_index\xa5klass\xaaRangeIndex\xa4name\xc0\xa5start\x00\xa4stop\x05\xa4step\x01\xa6blocks\x91\x86\xa4locs\x86\xa3typ\xa7ndarray\xa5shape\x91\x02\xa4ndim\x01\xa5dtype\xa5int64\xa4data\xd8\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\xa8compress\xc0\xa6values\xc7P\x00\xc84 \x84\xac\xa0\xd1?\x0f\xa4.\xb5\xe6\xf6\xea?\xb9\x85\x9aLO|\xe3?\xac\xf0\xd7\x81>z\xc1?\\\xca\x97\ty[\xd4?\x9c\x9b\x8a:\x11\xca\xd2?\x14zX\xd01+\xc5?4=\x19b\xad\xec\xe8?\xc0!\xe9\xf4\x8ej\x9e?\xa7>_\xac\x17[\xe3?\xa5shape\x92\x02\x05\xa5dtype\xa7float64\xa5klass\xaaFloatBlock\xa8compress\xc0'
     

    Furthermore you can concatenate the strings to produce a list of the original objects.

    -
    In [355]: pd.read_msgpack(df.to_msgpack() + s.to_msgpack())
    -Out[355]: 
    +
    In [360]: pd.read_msgpack(df.to_msgpack() + s.to_msgpack())
    +Out[360]: 
     [          A         B
      0  0.275432  0.293583
      1  0.842639  0.165381
    @@ -4657,38 +4667,38 @@
     If you see a subset of results being returned, upgrade to PyTables >= 3.2.
     Stores created previously will need to be rewritten using the updated version.

    -
    In [356]: store = pd.HDFStore('store.h5')
    +
    In [361]: store = pd.HDFStore('store.h5')
     
    -In [357]: print(store)
    +In [362]: print(store)
     <class 'pandas.io.pytables.HDFStore'>
     File path: store.h5
     

    Objects can be written to the file just like adding key-value pairs to a dict:

    -
    In [358]: index = pd.date_range('1/1/2000', periods=8)
    +
    In [363]: index = pd.date_range('1/1/2000', periods=8)
     
    -In [359]: s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
    +In [364]: s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
     
    -In [360]: df = pd.DataFrame(np.random.randn(8, 3), index=index,
    +In [365]: df = pd.DataFrame(np.random.randn(8, 3), index=index,
        .....:                   columns=['A', 'B', 'C'])
        .....: 
     
     # store.put('s', s) is an equivalent method
    -In [361]: store['s'] = s
    +In [366]: store['s'] = s
     
    -In [362]: store['df'] = df
    +In [367]: store['df'] = df
     
    -In [363]: store
    -Out[363]: 
    +In [368]: store
    +Out[368]: 
     <class 'pandas.io.pytables.HDFStore'>
     File path: store.h5
     

    In a current or later Python session, you can retrieve stored objects:

    # store.get('df') is an equivalent method
    -In [364]: store['df']
    -Out[364]: 
    +In [369]: store['df']
    +Out[369]: 
                        A         B         C
     2000-01-01 -0.426936 -1.780784  0.322691
     2000-01-02  1.638174 -2.184251  0.049673
    @@ -4700,8 +4710,8 @@
     2000-01-08 -0.810399  0.254343 -0.875526
     
     # dotted (attribute) access provides get as well
    -In [365]: store.df
    -Out[365]: 
    +In [370]: store.df
    +Out[370]: 
                        A         B         C
     2000-01-01 -0.426936 -1.780784  0.322691
     2000-01-02  1.638174 -2.184251  0.049673
    @@ -4715,27 +4725,27 @@
     

    Deletion of the object specified by the key:

    # store.remove('df') is an equivalent method
    -In [366]: del store['df']
    +In [371]: del store['df']
     
    -In [367]: store
    -Out[367]: 
    +In [372]: store
    +Out[372]: 
     <class 'pandas.io.pytables.HDFStore'>
     File path: store.h5
     

    Closing a Store and using a context manager:

    -
    In [368]: store.close()
    +
    In [373]: store.close()
     
    -In [369]: store
    -Out[369]: 
    +In [374]: store
    +Out[374]: 
     <class 'pandas.io.pytables.HDFStore'>
     File path: store.h5
     
    -In [370]: store.is_open
    -Out[370]: False
    +In [375]: store.is_open
    +Out[375]: False
     
     # Working with, and automatically closing the store using a context manager
    -In [371]: with pd.HDFStore('store.h5') as store:
    +In [376]: with pd.HDFStore('store.h5') as store:
        .....:     store.keys()
        .....: 
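
    Inside the with block the store behaves like any other open HDFStore, so objects can be read or written before it is closed automatically; a minimal sketch:

    with pd.HDFStore('store.h5') as store:
        s_roundtrip = store['s']  # read back the Series stored earlier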
     
    @@ -4744,46 +4754,46 @@

    Read/write API

    HDFStore supports a top-level API using read_hdf for reading and to_hdf for writing, similar to how read_csv and to_csv work.

    -
    In [372]: df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))})
    +
    In [377]: df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))})
     
    -In [373]: df_tl.to_hdf('store_tl.h5', 'table', append=True)
    +In [378]: df_tl.to_hdf('store_tl.h5', 'table', append=True)
     
    -In [374]: pd.read_hdf('store_tl.h5', 'table', where=['index>2'])
    -Out[374]: 
    +In [379]: pd.read_hdf('store_tl.h5', 'table', where=['index>2'])
    +Out[379]: 
        A  B
     3  3  3
     4  4  4
     

    HDFStore will by default not drop rows that are all missing. This behavior can be changed by setting dropna=True.

    -
    In [375]: df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2],
    +
    In [380]: df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2],
        .....:                                 'col2': [1, np.nan, np.nan]})
        .....: 
     
    -In [376]: df_with_missing
    -Out[376]: 
    +In [381]: df_with_missing
    +Out[381]: 
        col1  col2
     0   0.0   1.0
     1   NaN   NaN
     2   2.0   NaN
     
    -In [377]: df_with_missing.to_hdf('file.h5', 'df_with_missing',
    +In [382]: df_with_missing.to_hdf('file.h5', 'df_with_missing',
        .....:                        format='table', mode='w')
        .....: 
     
    -In [378]: pd.read_hdf('file.h5', 'df_with_missing')
    -Out[378]: 
    +In [383]: pd.read_hdf('file.h5', 'df_with_missing')
    +Out[383]: 
        col1  col2
     0   0.0   1.0
     1   NaN   NaN
     2   2.0   NaN
     
    -In [379]: df_with_missing.to_hdf('file.h5', 'df_with_missing',
    +In [384]: df_with_missing.to_hdf('file.h5', 'df_with_missing',
        .....:                        format='table', mode='w', dropna=True)
        .....: 
     
    -In [380]: pd.read_hdf('file.h5', 'df_with_missing')
    -Out[380]: 
    +In [385]: pd.read_hdf('file.h5', 'df_with_missing')
    +Out[385]: 
        col1  col2
     0   0.0   1.0
     2   2.0   NaN
    @@ -4819,25 +4829,25 @@
     to append or put or to_hdf.

    This format can also be set as an option, pd.set_option('io.hdf.default_format', 'table'), to make put/append/to_hdf store in the table format by default.

    -
    In [381]: store = pd.HDFStore('store.h5')
    +
    In [386]: store = pd.HDFStore('store.h5')
     
    -In [382]: df1 = df[0:4]
    +In [387]: df1 = df[0:4]
     
    -In [383]: df2 = df[4:]
    +In [388]: df2 = df[4:]
     
     # append data (creates a table automatically)
    -In [384]: store.append('df', df1)
    +In [389]: store.append('df', df1)
     
    -In [385]: store.append('df', df2)
    +In [390]: store.append('df', df2)
     
    -In [386]: store
    -Out[386]: 
    +In [391]: store
    +Out[391]: 
     <class 'pandas.io.pytables.HDFStore'>
     File path: store.h5
     
     # select the entire object
    -In [387]: store.select('df')
    -Out[387]: 
    +In [392]: store.select('df')
    +Out[392]: 
                        A         B         C
     2000-01-01 -0.426936 -1.780784  0.322691
     2000-01-02  1.638174 -2.184251  0.049673
    @@ -4849,8 +4859,8 @@
     2000-01-08 -0.810399  0.254343 -0.875526
     
     # the type of stored data
    -In [388]: store.root.df._v_attrs.pandas_type
    -Out[388]: 'frame_table'
    +In [393]: store.root.df._v_attrs.pandas_type
    +Out[393]: 'frame_table'
     
    @@ -4866,26 +4876,26 @@ parlance). Keys can be specified without the leading ‘/’ and are always absolute (e.g. ‘foo’ refers to ‘/foo’). Removal operations can remove everything in the sub-store and below, so be careful.

    -
    In [389]: store.put('foo/bar/bah', df)
    +
    In [394]: store.put('foo/bar/bah', df)
     
    -In [390]: store.append('food/orange', df)
    +In [395]: store.append('food/orange', df)
     
    -In [391]: store.append('food/apple', df)
    +In [396]: store.append('food/apple', df)
     
    -In [392]: store
    -Out[392]: 
    +In [397]: store
    +Out[397]: 
     <class 'pandas.io.pytables.HDFStore'>
     File path: store.h5
     
     # a list of keys are returned
    -In [393]: store.keys()
    -Out[393]: ['/df', '/food/apple', '/food/orange', '/foo/bar/bah']
    +In [398]: store.keys()
    +Out[398]: ['/df', '/food/apple', '/food/orange', '/foo/bar/bah']
     
     # remove all nodes under this level
    -In [394]: store.remove('food')
    +In [399]: store.remove('food')
     
    -In [395]: store
    -Out[395]: 
    +In [400]: store
    +Out[400]: 
     <class 'pandas.io.pytables.HDFStore'>
     File path: store.h5
     
    @@ -4895,7 +4905,7 @@

    New in version 0.24.0.

    -
    In [396]: for (path, subgroups, subkeys) in store.walk():
    +
    In [401]: for (path, subgroups, subkeys) in store.walk():
        .....:     for subgroup in subgroups:
        .....:         print('GROUP: {}/{}'.format(path, subgroup))
        .....:     for subkey in subkeys:
    @@ -4941,8 +4951,8 @@
     

    Instead, use explicit string based keys:

    -
    In [397]: store['foo/bar/bah']
    -Out[397]: 
    +
    In [402]: store['foo/bar/bah']
    +Out[402]: 
                        A         B         C
     2000-01-01 -0.426936 -1.780784  0.322691
     2000-01-02  1.638174 -2.184251  0.049673
    @@ -4969,7 +4979,7 @@
     columns, passing nan_rep = 'nan' to append will change the default
     nan representation on disk (which converts to/from np.nan); this
     defaults to nan.

    -
    In [398]: df_mixed = pd.DataFrame({'A': np.random.randn(8),
    +
    In [403]: df_mixed = pd.DataFrame({'A': np.random.randn(8),
        .....:                          'B': np.random.randn(8),
        .....:                          'C': np.array(np.random.randn(8), dtype='float32'),
        .....:                          'string': 'string',
    @@ -4979,16 +4989,16 @@
        .....:                         index=list(range(8)))
        .....: 
     
    -In [399]: df_mixed.loc[df_mixed.index[3:5],
    +In [404]: df_mixed.loc[df_mixed.index[3:5],
        .....:              ['A', 'B', 'string', 'datetime64']] = np.nan
        .....: 
     
    -In [400]: store.append('df_mixed', df_mixed, min_itemsize={'values': 50})
    +In [405]: store.append('df_mixed', df_mixed, min_itemsize={'values': 50})
     
    -In [401]: df_mixed1 = store.select('df_mixed')
    +In [406]: df_mixed1 = store.select('df_mixed')
     
    -In [402]: df_mixed1
    -Out[402]: 
    +In [407]: df_mixed1
    +Out[407]: 
               A         B         C  string  int  bool datetime64
     0 -0.980856  0.298656  0.151508  string    1  True 2001-01-02
     1 -0.906920 -1.294022  0.587939  string    1  True 2001-01-02
    @@ -4999,8 +5009,8 @@
     6 -0.043509 -0.303900  0.567265  string    1  True 2001-01-02
     7  0.768606 -0.871948 -0.044348  string    1  True 2001-01-02
     
    -In [403]: df_mixed1.dtypes.value_counts()
    -Out[403]: 
    +In [408]: df_mixed1.dtypes.value_counts()
    +Out[408]: 
     float64           2
     float32           1
     int64             1
    @@ -5010,8 +5020,8 @@
     dtype: int64
     
     # we have provided a minimum string column size
    -In [404]: store.root.df_mixed.table
    -Out[404]: 
    +In [409]: store.root.df_mixed.table
    +Out[409]: 
     /df_mixed/table (Table(8,)) ''
       description := {
       "index": Int64Col(shape=(), dflt=0, pos=0),
    @@ -5033,19 +5043,19 @@
     

    Storing MultiIndex DataFrames

    Storing MultiIndex DataFrames as tables is very similar to storing/selecting from homogeneous index DataFrames.

    -
    In [405]: index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
    +
    In [410]: index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
        .....:                               ['one', 'two', 'three']],
        .....:                       codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
        .....:                              [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        .....:                       names=['foo', 'bar'])
        .....: 
     
    -In [406]: df_mi = pd.DataFrame(np.random.randn(10, 3), index=index,
    +In [411]: df_mi = pd.DataFrame(np.random.randn(10, 3), index=index,
        .....:                      columns=['A', 'B', 'C'])
        .....: 
     
    -In [407]: df_mi
    -Out[407]: 
    +In [412]: df_mi
    +Out[412]: 
                       A         B         C
     foo bar                                
     foo one    0.031885  0.641045  0.479460
    @@ -5059,10 +5069,10 @@
         two    0.170697 -0.200289  1.220322
         three -1.001273  0.162172  0.376816
     
    -In [408]: store.append('df_mi', df_mi)
    +In [413]: store.append('df_mi', df_mi)
     
    -In [409]: store.select('df_mi')
    -Out[409]: 
    +In [414]: store.select('df_mi')
    +Out[414]: 
                       A         B         C
     foo bar                                
     foo one    0.031885  0.641045  0.479460
    @@ -5077,8 +5087,8 @@
         three -1.001273  0.162172  0.376816
     
     # the levels are automatically included as data columns
    -In [410]: store.select('df_mi', 'foo=bar')
    -Out[410]: 
    +In [415]: store.select('df_mi', 'foo=bar')
    +Out[415]: 
                     A         B         C
     foo bar                              
     bar one  0.758552  0.384775 -1.133177
    @@ -5165,16 +5175,16 @@
     

    which will quote string.

    Here are some examples:

    -
    In [411]: dfq = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'),
    +
    In [416]: dfq = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'),
        .....:                    index=pd.date_range('20130101', periods=10))
        .....: 
     
    -In [412]: store.append('dfq', dfq, format='table', data_columns=True)
    +In [417]: store.append('dfq', dfq, format='table', data_columns=True)
     

    Use boolean expressions, with in-line function evaluation.

    -
    In [413]: store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']")
    -Out[413]: 
    +
    In [418]: store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']")
    +Out[418]: 
                        A         B
     2013-01-05  0.450263  0.755221
     2013-01-06  0.019915  0.300003
    @@ -5185,8 +5195,8 @@
     

    Use inline column reference.

    -
    In [414]: store.select('dfq', where="A>0 or C>0")
    -Out[414]: 
    +
    In [419]: store.select('dfq', where="A>0 or C>0")
    +Out[419]: 
                        A         B         C         D
     2013-01-01 -0.161614 -1.636805  0.835417  0.864817
     2013-01-02  0.843452 -0.122918 -0.026122 -1.507533
    @@ -5201,8 +5211,8 @@
     

    The columns keyword can be supplied to select a list of columns to be returned; this is equivalent to passing 'columns=list_of_columns_to_filter':

    -
    In [415]: store.select('df', "columns=['A', 'B']")
    -Out[415]: 
    +
    In [420]: store.select('df', "columns=['A', 'B']")
    +Out[420]: 
                        A         B
     2000-01-01 -0.426936 -1.780784
     2000-01-02  1.638174 -2.184251
    @@ -5229,18 +5239,18 @@
     

    You can store and query using the timedelta64[ns] type. Terms can be specified in the format: <float>(<unit>), where float may be signed (and fractional), and unit can be D,s,ms,us,ns for the timedelta. Here’s an example:

    -
    In [416]: from datetime import timedelta
    +
    In [421]: from datetime import timedelta
     
    -In [417]: dftd = pd.DataFrame({'A': pd.Timestamp('20130101'),
    +In [422]: dftd = pd.DataFrame({'A': pd.Timestamp('20130101'),
        .....:                      'B': [pd.Timestamp('20130101') + timedelta(days=i,
        .....:                                                                 seconds=10)
        .....:                            for i in range(10)]})
        .....: 
     
    -In [418]: dftd['C'] = dftd['A'] - dftd['B']
    +In [423]: dftd['C'] = dftd['A'] - dftd['B']
     
    -In [419]: dftd
    -Out[419]: 
    +In [424]: dftd
    +Out[424]: 
                A                   B                  C
     0 2013-01-01 2013-01-01 00:00:10  -1 days +23:59:50
     1 2013-01-01 2013-01-02 00:00:10  -2 days +23:59:50
    @@ -5253,10 +5263,10 @@
     8 2013-01-01 2013-01-09 00:00:10  -9 days +23:59:50
     9 2013-01-01 2013-01-10 00:00:10 -10 days +23:59:50
     
    -In [420]: store.append('dftd', dftd, data_columns=True)
    +In [425]: store.append('dftd', dftd, data_columns=True)
     
    -In [421]: store.select('dftd', "C<'-3.5D'")
    -Out[421]: 
    +In [426]: store.select('dftd', "C<'-3.5D'")
    +Out[426]: 
                A                   B                  C
     4 2013-01-01 2013-01-05 00:00:10  -5 days +23:59:50
     5 2013-01-01 2013-01-06 00:00:10  -6 days +23:59:50
    @@ -5281,33 +5291,33 @@
     index=False to append.

    # we have automagically already created an index (in the first section)
    -In [422]: i = store.root.df.table.cols.index.index
    +In [427]: i = store.root.df.table.cols.index.index
     
    -In [423]: i.optlevel, i.kind
    -Out[423]: (6, 'medium')
    +In [428]: i.optlevel, i.kind
    +Out[428]: (6, 'medium')
     
     # change an index by passing new parameters
    -In [424]: store.create_table_index('df', optlevel=9, kind='full')
    +In [429]: store.create_table_index('df', optlevel=9, kind='full')
     
    -In [425]: i = store.root.df.table.cols.index.index
    +In [430]: i = store.root.df.table.cols.index.index
     
    -In [426]: i.optlevel, i.kind
    -Out[426]: (9, 'full')
    +In [431]: i.optlevel, i.kind
    +Out[431]: (9, 'full')
     

    Oftentimes when appending large amounts of data to a store, it is useful to turn off index creation for each append, then recreate it at the end.

    -
    In [427]: df_1 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB'))
    +
    In [432]: df_1 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB'))
     
    -In [428]: df_2 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB'))
    +In [433]: df_2 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB'))
     
    -In [429]: st = pd.HDFStore('appends.h5', mode='w')
    +In [434]: st = pd.HDFStore('appends.h5', mode='w')
     
    -In [430]: st.append('df', df_1, data_columns=['B'], index=False)
    +In [435]: st.append('df', df_1, data_columns=['B'], index=False)
     
    -In [431]: st.append('df', df_2, data_columns=['B'], index=False)
    +In [436]: st.append('df', df_2, data_columns=['B'], index=False)
     
    -In [432]: st.get_storer('df').table
    -Out[432]: 
    +In [437]: st.get_storer('df').table
    +Out[437]: 
     /df/table (Table(20,)) ''
       description := {
       "index": Int64Col(shape=(), dflt=0, pos=0),
    @@ -5318,10 +5328,10 @@
     

    Then create the index when finished appending.

    -
    In [433]: st.create_table_index('df', columns=['B'], optlevel=9, kind='full')
    +
    In [438]: st.create_table_index('df', columns=['B'], optlevel=9, kind='full')
     
    -In [434]: st.get_storer('df').table
    -Out[434]: 
    +In [439]: st.get_storer('df').table
    +Out[439]: 
     /df/table (Table(20,)) ''
       description := {
       "index": Int64Col(shape=(), dflt=0, pos=0),
    @@ -5333,7 +5343,7 @@
       colindexes := {
         "B": Index(9, full, shuffle, zlib(1)).is_csi=True}
     
    -In [435]: st.close()
    +In [440]: st.close()
     

    See here for how to create a completely-sorted-index (CSI) on an existing store.

    @@ -5346,20 +5356,20 @@ operation, on-disk, and return just the frame that matches this query. You can specify data_columns = True to force all columns to be data_columns.

    -
    In [436]: df_dc = df.copy()
    +
    In [441]: df_dc = df.copy()
     
    -In [437]: df_dc['string'] = 'foo'
    +In [442]: df_dc['string'] = 'foo'
     
    -In [438]: df_dc.loc[df_dc.index[4:6], 'string'] = np.nan
    +In [443]: df_dc.loc[df_dc.index[4:6], 'string'] = np.nan
     
    -In [439]: df_dc.loc[df_dc.index[7:9], 'string'] = 'bar'
    +In [444]: df_dc.loc[df_dc.index[7:9], 'string'] = 'bar'
     
    -In [440]: df_dc['string2'] = 'cool'
    +In [445]: df_dc['string2'] = 'cool'
     
    -In [441]: df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0
    +In [446]: df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0
     
    -In [442]: df_dc
    -Out[442]: 
    +In [447]: df_dc
    +Out[447]: 
                        A         B         C string string2
     2000-01-01 -0.426936 -1.780784  0.322691    foo    cool
     2000-01-02  1.638174  1.000000  1.000000    foo    cool
    @@ -5371,10 +5381,10 @@
     2000-01-08 -0.810399  0.254343 -0.875526    bar    cool
     
     # on-disk operations
    -In [443]: store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2'])
    +In [448]: store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2'])
     
    -In [444]: store.select('df_dc', where='B > 0')
    -Out[444]: 
    +In [449]: store.select('df_dc', where='B > 0')
    +Out[449]: 
                        A         B         C string string2
     2000-01-02  1.638174  1.000000  1.000000    foo    cool
     2000-01-03 -1.022803  1.000000  1.000000    foo    cool
    @@ -5383,16 +5393,16 @@
     2000-01-08 -0.810399  0.254343 -0.875526    bar    cool
     
     # getting creative
    -In [445]: store.select('df_dc', 'B > 0 & C > 0 & string == foo')
    -Out[445]: 
    +In [450]: store.select('df_dc', 'B > 0 & C > 0 & string == foo')
    +Out[450]: 
                        A        B         C string string2
     2000-01-02  1.638174  1.00000  1.000000    foo    cool
     2000-01-03 -1.022803  1.00000  1.000000    foo    cool
     2000-01-07 -0.450781  1.06465  1.014927    foo    cool
     
     # this is in-memory version of this type of selection
    -In [446]: df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')]
    -Out[446]: 
    +In [451]: df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')]
    +Out[451]: 
                        A        B         C string string2
     2000-01-02  1.638174  1.00000  1.000000    foo    cool
     2000-01-03 -1.022803  1.00000  1.000000    foo    cool
    @@ -5400,8 +5410,8 @@
     
     # we have automagically created this index and the B/C/string/string2
     # columns are stored separately as ``PyTables`` columns
    -In [447]: store.root.df_dc.table
    -Out[447]: 
    +In [452]: store.root.df_dc.table
    +Out[452]: 
     /df_dc/table (Table(8,)) ''
       description := {
       "index": Int64Col(shape=(), dflt=0, pos=0),
    @@ -5432,7 +5442,7 @@
     

    You can pass iterator=True or chunksize=number_in_a_chunk to select and select_as_multiple to return an iterator on the results. The default is 50,000 rows returned in a chunk.
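    A minimal sketch of the iterator=True spelling, assuming the same 'df' table already stored above; it behaves like the chunksize example that follows:

        # iterator=True also yields DataFrame chunks; chunksize picks their size
        for chunk in store.select('df', iterator=True, chunksize=3):
            print(chunk.shape)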

    -
    In [448]: for df in store.select('df', chunksize=3):
    +
    In [453]: for df in store.select('df', chunksize=3):
        .....:     print(df)
        .....: 
                        A         B         C
    @@ -5462,10 +5472,10 @@
     and the query applied, returning an iterator on potentially unequal-sized chunks.

    Here is a recipe for generating a query and using it to create equal-sized return chunks.

    -
    In [449]: dfeq = pd.DataFrame({'number': np.arange(1, 11)})
    +
    In [454]: dfeq = pd.DataFrame({'number': np.arange(1, 11)})
     
    -In [450]: dfeq
    -Out[450]: 
    +In [455]: dfeq
    +Out[455]: 
        number
     0       1
     1       2
    @@ -5478,17 +5488,17 @@
     8       9
     9      10
     
    -In [451]: store.append('dfeq', dfeq, data_columns=['number'])
    +In [456]: store.append('dfeq', dfeq, data_columns=['number'])
     
    -In [452]: def chunks(l, n):
    +In [457]: def chunks(l, n):
        .....:     return [l[i:i + n] for i in range(0, len(l), n)]
        .....: 
     
    -In [453]: evens = [2, 4, 6, 8, 10]
    +In [458]: evens = [2, 4, 6, 8, 10]
     
    -In [454]: coordinates = store.select_as_coordinates('dfeq', 'number=evens')
    +In [459]: coordinates = store.select_as_coordinates('dfeq', 'number=evens')
     
    -In [455]: for c in chunks(coordinates, 2):
    +In [460]: for c in chunks(coordinates, 2):
        .....:     print(store.select('dfeq', where=c))
        .....: 
        number
    @@ -5510,8 +5520,8 @@
     method select_column. This will, for example, enable you to get the index
     very quickly. These return a Series of the result, indexed by the row number.
     These do not currently accept the where selector.

    -
    In [456]: store.select_column('df_dc', 'index')
    -Out[456]: 
    +
    In [461]: store.select_column('df_dc', 'index')
    +Out[461]: 
     0   2000-01-01
     1   2000-01-02
     2   2000-01-03
    @@ -5522,8 +5532,8 @@
     7   2000-01-08
     Name: index, dtype: datetime64[ns]
     
    -In [457]: store.select_column('df_dc', 'string')
    -Out[457]: 
    +In [462]: store.select_column('df_dc', 'string')
    +Out[462]: 
     0    foo
     1    foo
     2    foo
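    Because select_column returns a plain Series, it composes with ordinary pandas operations. A hedged sketch, reusing the df_dc table above, of getting the distinct values of a data column without loading the whole table:

        # distinct values of the 'string' data column, read column-wise from disk
        store.select_column('df_dc', 'string').unique()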
    @@ -5541,23 +5551,23 @@
     

    Sometimes you want to get the coordinates (a.k.a. the index locations) of your query. This returns an Int64Index of the resulting locations. These coordinates can also be passed to subsequent where operations.

    -
    In [458]: df_coord = pd.DataFrame(np.random.randn(1000, 2),
    +
    In [463]: df_coord = pd.DataFrame(np.random.randn(1000, 2),
        .....:                         index=pd.date_range('20000101', periods=1000))
        .....: 
     
    -In [459]: store.append('df_coord', df_coord)
    +In [464]: store.append('df_coord', df_coord)
     
    -In [460]: c = store.select_as_coordinates('df_coord', 'index > 20020101')
    +In [465]: c = store.select_as_coordinates('df_coord', 'index > 20020101')
     
    -In [461]: c
    -Out[461]: 
    +In [466]: c
    +Out[466]: 
     Int64Index([732, 733, 734, 735, 736, 737, 738, 739, 740, 741,
                 ...
                 990, 991, 992, 993, 994, 995, 996, 997, 998, 999],
                dtype='int64', length=268)
     
    -In [462]: store.select('df_coord', where=c)
    -Out[462]: 
    +In [467]: store.select('df_coord', where=c)
    +Out[467]: 
                        0         1
     2002-01-02  0.440865 -0.151651
     2002-01-03 -1.195089  0.285093
    @@ -5580,18 +5590,18 @@
     

    Sometimes your query can involve creating a list of rows to select. Usually this mask would be a resulting index from an indexing operation. This example selects the rows of a DatetimeIndex whose month is 5.

    -
    In [463]: df_mask = pd.DataFrame(np.random.randn(1000, 2),
    +
    In [468]: df_mask = pd.DataFrame(np.random.randn(1000, 2),
        .....:                        index=pd.date_range('20000101', periods=1000))
        .....: 
     
    -In [464]: store.append('df_mask', df_mask)
    +In [469]: store.append('df_mask', df_mask)
     
    -In [465]: c = store.select_column('df_mask', 'index')
    +In [470]: c = store.select_column('df_mask', 'index')
     
    -In [466]: where = c[pd.DatetimeIndex(c).month == 5].index
    +In [471]: where = c[pd.DatetimeIndex(c).month == 5].index
     
    -In [467]: store.select('df_mask', where=where)
    -Out[467]: 
    +In [472]: store.select('df_mask', where=where)
    +Out[472]: 
                        0         1
     2000-05-01 -1.199892  1.073701
     2000-05-02 -1.058552  0.658487
    @@ -5614,8 +5624,8 @@
     

    If you want to inspect the stored object, retrieve it via get_storer. You could use this programmatically to, say, get the number of rows in an object.

    -
    In [468]: store.get_storer('df_dc').nrows
    -Out[468]: 8
    +
    In [473]: store.get_storer('df_dc').nrows
    +Out[473]: 8
     
    @@ -5644,28 +5654,28 @@ you choose to call dropna=False, some tables may have more rows than others, and therefore select_as_multiple may not work or it may return unexpected results.

    -
    In [469]: df_mt = pd.DataFrame(np.random.randn(8, 6),
    +
    In [474]: df_mt = pd.DataFrame(np.random.randn(8, 6),
        .....:                      index=pd.date_range('1/1/2000', periods=8),
        .....:                      columns=['A', 'B', 'C', 'D', 'E', 'F'])
        .....: 
     
    -In [470]: df_mt['foo'] = 'bar'
    +In [475]: df_mt['foo'] = 'bar'
     
    -In [471]: df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan
    +In [476]: df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan
     
     # you can also create the tables individually
    -In [472]: store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None},
    +In [477]: store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None},
        .....:                          df_mt, selector='df1_mt')
        .....: 
     
    -In [473]: store
    -Out[473]: 
    +In [478]: store
    +Out[478]: 
     <class 'pandas.io.pytables.HDFStore'>
     File path: store.h5
     
     # individual tables were created
    -In [474]: store.select('df1_mt')
    -Out[474]: 
    +In [479]: store.select('df1_mt')
    +Out[479]: 
                        A         B
     2000-01-01  0.475158  0.427905
     2000-01-02       NaN       NaN
    @@ -5676,8 +5686,8 @@
     2000-01-07 -1.801014  0.244721
     2000-01-08  3.055033 -0.683085
     
    -In [475]: store.select('df2_mt')
    -Out[475]: 
    +In [480]: store.select('df2_mt')
    +Out[480]: 
                        C         D         E         F  foo
     2000-01-01  1.846285 -0.044826  0.074867  0.156213  bar
     2000-01-02  0.446978 -0.323516  0.311549 -0.661368  bar
    @@ -5689,10 +5699,10 @@
     2000-01-08 -0.922301  2.760888  0.515474 -0.129319  bar
     
     # as a multiple
    -In [476]: store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'],
    +In [481]: store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'],
        .....:                          selector='df1_mt')
        .....: 
    -Out[476]: 
    +Out[481]: 
                        A         B         C         D         E         F  foo
     2000-01-01  0.475158  0.427905  1.846285 -0.044826  0.074867  0.156213  bar
     2000-01-06  0.187880  1.536245  0.831475 -0.566063  1.130163 -1.004539  bar
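    On the dropna caveat noted above, a hedged sketch (the *_sync keys are made up): passing dropna=True to append_to_multiple drops rows consistently across the sub-tables so they stay row-aligned for select_as_multiple:

        # a row that is all-NaN in any one sub-table (here 2000-01-02 in df1_mt_sync)
        # is dropped from all of them
        store.append_to_multiple({'df1_mt_sync': ['A', 'B'], 'df2_mt_sync': None},
                                 df_mt, selector='df1_mt_sync', dropna=True)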
    @@ -5908,12 +5918,12 @@
     

    You can write data that contains category dtypes to an HDFStore. Queries work the same as if it were an object array, but the category-dtyped data is stored in a more efficient manner.

    -
    In [477]: dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'),
    +
    In [482]: dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'),
        .....:                       'B': np.random.randn(8)})
        .....: 
     
    -In [478]: dfcat
    -Out[478]: 
    +In [483]: dfcat
    +Out[483]: 
        A         B
     0  a  1.706605
     1  a  1.373485
    @@ -5924,28 +5934,28 @@
     6  b -0.453150
     7  a -0.827739
     
    -In [479]: dfcat.dtypes
    -Out[479]: 
    +In [484]: dfcat.dtypes
    +Out[484]: 
     A    category
     B     float64
     dtype: object
     
    -In [480]: cstore = pd.HDFStore('cats.h5', mode='w')
    +In [485]: cstore = pd.HDFStore('cats.h5', mode='w')
     
    -In [481]: cstore.append('dfcat', dfcat, format='table', data_columns=['A'])
    +In [486]: cstore.append('dfcat', dfcat, format='table', data_columns=['A'])
     
    -In [482]: result = cstore.select('dfcat', where="A in ['b', 'c']")
    +In [487]: result = cstore.select('dfcat', where="A in ['b', 'c']")
     
    -In [483]: result
    -Out[483]: 
    +In [488]: result
    +Out[488]: 
        A         B
     2  b -0.758424
     3  b -0.116984
     4  c -0.959461
     6  b -0.453150
     
    -In [484]: result.dtypes
    -Out[484]: 
    +In [489]: result.dtypes
    +Out[489]: 
     A    category
     B     float64
     dtype: object
    @@ -5969,10 +5979,10 @@
     

    Note

    If you are not passing any data_columns, then min_itemsize will be the maximum length of any string passed.

    -
    In [485]: dfs = pd.DataFrame({'A': 'foo', 'B': 'bar'}, index=list(range(5)))
    +
    In [490]: dfs = pd.DataFrame({'A': 'foo', 'B': 'bar'}, index=list(range(5)))
     
    -In [486]: dfs
    -Out[486]: 
    +In [491]: dfs
    +Out[491]: 
          A    B
     0  foo  bar
     1  foo  bar
    @@ -5981,10 +5991,10 @@
     4  foo  bar
     
     # A and B have a size of 30
    -In [487]: store.append('dfs', dfs, min_itemsize=30)
    +In [492]: store.append('dfs', dfs, min_itemsize=30)
     
    -In [488]: store.get_storer('dfs').table
    -Out[488]: 
    +In [493]: store.get_storer('dfs').table
    +Out[493]: 
     /dfs/table (Table(5,)) ''
       description := {
       "index": Int64Col(shape=(), dflt=0, pos=0),
    @@ -5997,10 +6007,10 @@
     
     # A is created as a data_column with a size of 30
     # B's size is calculated
    -In [489]: store.append('dfs2', dfs, min_itemsize={'A': 30})
    +In [494]: store.append('dfs2', dfs, min_itemsize={'A': 30})
     
    -In [490]: store.get_storer('dfs2').table
    -Out[490]: 
    +In [495]: store.get_storer('dfs2').table
    +Out[495]: 
     /dfs2/table (Table(5,)) ''
       description := {
       "index": Int64Col(shape=(), dflt=0, pos=0),
    @@ -6017,29 +6027,29 @@
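    One practical consequence of the fixed itemsize shown above (a hedged sketch, reusing the dfs table whose string columns were created with a size of 30): appending a longer string errors out rather than silently truncating.

        too_long = pd.DataFrame({'A': ['x' * 40], 'B': ['bar']})
        # store.append('dfs', too_long)   # would raise: 40 characters exceeds the itemsize of 30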
     

    nan_rep

    String columns will serialize an np.nan (a missing value) with the nan_rep string representation. This defaults to the string value 'nan'. You could inadvertently turn an actual 'nan' string into a missing value.

    -
    In [491]: dfss = pd.DataFrame({'A': ['foo', 'bar', 'nan']})
    +
    In [496]: dfss = pd.DataFrame({'A': ['foo', 'bar', 'nan']})
     
    -In [492]: dfss
    -Out[492]: 
    +In [497]: dfss
    +Out[497]: 
          A
     0  foo
     1  bar
     2  nan
     
    -In [493]: store.append('dfss', dfss)
    +In [498]: store.append('dfss', dfss)
     
    -In [494]: store.select('dfss')
    -Out[494]: 
    +In [499]: store.select('dfss')
    +Out[499]: 
          A
     0  foo
     1  bar
     2  NaN
     
     # here you need to specify a different nan rep
    -In [495]: store.append('dfss2', dfss, nan_rep='_nan_')
    +In [500]: store.append('dfss2', dfss, nan_rep='_nan_')
     
    -In [496]: store.select('dfss2')
    -Out[496]: 
    +In [501]: store.select('dfss2')
    +Out[501]: 
          A
     0  foo
     1  bar
    @@ -6056,14 +6066,14 @@
     tables.

    It is possible to write an HDFStore object that can easily be imported into R using the rhdf5 library (Package website). Create a table-format store like this:

    -
    In [497]: df_for_r = pd.DataFrame({"first": np.random.rand(100),
    +
    In [502]: df_for_r = pd.DataFrame({"first": np.random.rand(100),
        .....:                          "second": np.random.rand(100),
        .....:                          "class": np.random.randint(0, 2, (100, ))},
        .....:                         index=range(100))
        .....: 
     
    -In [498]: df_for_r.head()
    -Out[498]: 
    +In [503]: df_for_r.head()
    +Out[503]: 
           first    second  class
     0  0.366979  0.794525      0
     1  0.296639  0.635178      1
    @@ -6071,12 +6081,12 @@
     3  0.484648  0.970016      1
     4  0.810047  0.332303      0
     
    -In [499]: store_export = pd.HDFStore('export.h5')
    +In [504]: store_export = pd.HDFStore('export.h5')
     
    -In [500]: store_export.append('df_for_r', df_for_r, data_columns=df_dc.columns)
    +In [505]: store_export.append('df_for_r', df_for_r, data_columns=df_dc.columns)
     
    -In [501]: store_export
    -Out[501]: 
    +In [506]: store_export
    +Out[506]: 
     <class 'pandas.io.pytables.HDFStore'>
     File path: export.h5
     
    @@ -6182,7 +6192,7 @@ on an attempt at serialization.
  • See the Full Documentation.

    -
    In [502]: df = pd.DataFrame({'a': list('abc'),
    +
    In [507]: df = pd.DataFrame({'a': list('abc'),
        .....:                    'b': list(range(1, 4)),
        .....:                    'c': np.arange(3, 6).astype('u1'),
        .....:                    'd': np.arange(4.0, 7.0, dtype='float64'),
    @@ -6193,15 +6203,15 @@
        .....:                    'i': pd.date_range('20130101', periods=3, freq='ns')})
        .....: 
     
    -In [503]: df
    -Out[503]: 
    +In [508]: df
    +Out[508]: 
        a  b  c    d      e  f          g                         h                             i
     0  a  1  3  4.0   True  a 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00.000000000
     1  b  2  4  5.0  False  b 2013-01-02 2013-01-02 00:00:00-05:00 2013-01-01 00:00:00.000000001
     2  c  3  5  6.0   True  c 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-01 00:00:00.000000002
     
    -In [504]: df.dtypes
    -Out[504]: 
    +In [509]: df.dtypes
    +Out[509]: 
     a                        object
     b                         int64
     c                         uint8
    @@ -6215,22 +6225,22 @@
     

    Write to a feather file.

    -
    In [505]: df.to_feather('example.feather')
    +
    In [510]: df.to_feather('example.feather')
     

    Read from a feather file.

    -
    In [506]: result = pd.read_feather('example.feather')
    +
    In [511]: result = pd.read_feather('example.feather')
     
    -In [507]: result
    -Out[507]: 
    +In [512]: result
    +Out[512]: 
        a  b  c    d      e  f          g                         h                             i
     0  a  1  3  4.0   True  a 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00.000000000
     1  b  2  4  5.0  False  b 2013-01-02 2013-01-02 00:00:00-05:00 2013-01-01 00:00:00.000000001
     2  c  3  5  6.0   True  c 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-01 00:00:00.000000002
     
     # we preserve dtypes
    -In [508]: result.dtypes
    -Out[508]: 
    +In [513]: result.dtypes
    +Out[513]: 
     a                        object
     b                         int64
     c                         uint8
    @@ -6276,7 +6286,7 @@
     Currently pyarrow does not support timedelta data, while fastparquet>=0.1.4 supports timezone-aware datetimes.
     These libraries also differ in their underlying dependencies (fastparquet uses numba, while pyarrow uses a C library).
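    As a convenience, a hedged sketch (the io.parquet.engine option name is assumed here): rather than passing engine= on every call, you can set a session-wide default engine:

        # default engine for to_parquet / read_parquet; 'auto' tries pyarrow, then fastparquet
        pd.set_option('io.parquet.engine', 'pyarrow')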

    -
    In [509]: df = pd.DataFrame({'a': list('abc'),
    +
    In [514]: df = pd.DataFrame({'a': list('abc'),
        .....:                    'b': list(range(1, 4)),
        .....:                    'c': np.arange(3, 6).astype('u1'),
        .....:                    'd': np.arange(4.0, 7.0, dtype='float64'),
    @@ -6285,15 +6295,15 @@
        .....:                    'g': pd.date_range('20130101', periods=3, tz='US/Eastern')})
        .....: 
     
    -In [510]: df
    -Out[510]: 
    +In [515]: df
    +Out[515]: 
        a  b  c    d      e          f                         g
     0  a  1  3  4.0   True 2013-01-01 2013-01-01 00:00:00-05:00
     1  b  2  4  5.0  False 2013-01-02 2013-01-02 00:00:00-05:00
     2  c  3  5  6.0   True 2013-01-03 2013-01-03 00:00:00-05:00
     
    -In [511]: df.dtypes
    -Out[511]: 
    +In [516]: df.dtypes
    +Out[516]: 
     a                        object
     b                         int64
     c                         uint8
    @@ -6305,18 +6315,18 @@
     

    Write to a parquet file.

    -
    In [512]: df.to_parquet('example_pa.parquet', engine='pyarrow')
    +
    In [517]: df.to_parquet('example_pa.parquet', engine='pyarrow')
     
    -In [513]: df.to_parquet('example_fp.parquet', engine='fastparquet')
    +In [518]: df.to_parquet('example_fp.parquet', engine='fastparquet')
     

    Read from a parquet file.

    -
    In [514]: result = pd.read_parquet('example_fp.parquet', engine='fastparquet')
    +
    In [519]: result = pd.read_parquet('example_fp.parquet', engine='fastparquet')
     
    -In [515]: result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
    +In [520]: result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
     
    -In [516]: result.dtypes
    -Out[516]: 
    +In [521]: result.dtypes
    +Out[521]: 
     a                        object
     b                         int64
     c                         uint8
    @@ -6328,16 +6338,16 @@
     

    Read only certain columns of a parquet file.

    -
    In [517]: result = pd.read_parquet('example_fp.parquet',
    +
    In [522]: result = pd.read_parquet('example_fp.parquet',
        .....:                          engine='fastparquet', columns=['a', 'b'])
        .....: 
     
    -In [518]: result = pd.read_parquet('example_pa.parquet',
    +In [523]: result = pd.read_parquet('example_pa.parquet',
        .....:                          engine='pyarrow', columns=['a', 'b'])
        .....: 
     
    -In [519]: result.dtypes
    -Out[519]: 
    +In [524]: result.dtypes
    +Out[524]: 
     a    object
     b     int64
     dtype: object
    @@ -6347,9 +6357,9 @@
     

    Handling indexes

    Serializing a DataFrame to parquet may include the implicit index as one or more columns in the output file. Thus, this code:

    -
    In [520]: df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
    +
    In [525]: df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
     
    -In [521]: df.to_parquet('test.parquet', engine='pyarrow')
    +In [526]: df.to_parquet('test.parquet', engine='pyarrow')
     

    creates a parquet file with three columns if you use pyarrow for serialization:
    @@ -6360,7 +6370,7 @@ the file, because that column doesn’t exist in the target table.
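    To see the extra column for yourself, a hedged sketch using pyarrow directly (the '__index_level_0__' name is what pyarrow emitted for the default index at the time; newer versions may store it as metadata instead):

        import pyarrow.parquet as pq

        # expect something like ['a', 'b', '__index_level_0__']
        pq.read_table('test.parquet').schema.names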

    If you want to omit a dataframe’s indexes when writing, pass index=False to to_parquet():

    -
    In [522]: df.to_parquet('test.parquet', index=False)
    +
    In [527]: df.to_parquet('test.parquet', index=False)
     

    This creates a parquet file with just the two expected columns, a and b.
    @@ -6375,9 +6385,9 @@

    New in version 0.24.0.

    Parquet supports partitioning of data based on the values of one or more columns.

    -
    In [523]: df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]})
    +
    In [528]: df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]})
     
    -In [524]: df.to_parquet(fname='test', engine='pyarrow',
    +In [529]: df.to_parquet(fname='test', engine='pyarrow',
        .....:               partition_cols=['a'], compression=None)
        .....: 
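    The partition column values become directories on disk. A hedged sketch of inspecting the layout (the parquet file names generated inside each directory will differ from run to run):

        import os

        sorted(os.listdir('test'))   # e.g. ['a=0', 'a=1']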
     
    @@ -6452,10 +6462,10 @@ connecting to. For more information on create_engine() and the URI formatting, see the examples below and the SQLAlchemy documentation

    -
    In [525]: from sqlalchemy import create_engine
    +
    In [530]: from sqlalchemy import create_engine
     
     # Create your engine.
    -In [526]: engine = create_engine('sqlite:///:memory:')
    +In [531]: engine = create_engine('sqlite:///:memory:')
     

    If you want to manage your own connections you can pass one of those instead:
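    A minimal sketch of that, assuming the engine created above and the 'data' table written a little further below; the connection is closed when the with block exits:

        # pass an explicit connection instead of the engine
        with engine.connect() as conn:
            pd.read_sql_table('data', conn)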

    @@ -6504,21 +6514,21 @@
    -
    In [527]: data
    -Out[527]: 
    +
    In [532]: data
    +Out[532]: 
        id       Date Col_1  Col_2  Col_3
     0  26 2010-10-18     X  27.50   True
     1  42 2010-10-19     Y -12.50  False
     2  63 2010-10-20     Z   5.73   True
     
    -In [528]: data.to_sql('data', engine)
    +In [533]: data.to_sql('data', engine)
     

    With some databases, writing large DataFrames can result in errors due to packet size limitations being exceeded. This can be avoided by setting the chunksize parameter when calling to_sql. For example, the following writes data to the database in batches of 1000 rows at a time:

    -
    In [529]: data.to_sql('data_chunked', engine, chunksize=1000)
    +
    In [534]: data.to_sql('data_chunked', engine, chunksize=1000)
     
    @@ -6532,9 +6542,9 @@ fallback mode). For example, specifying to use the sqlalchemy String type instead of the default Text type for string columns:

    -
    In [530]: from sqlalchemy.types import String
    +
    In [535]: from sqlalchemy.types import String
     
    -In [531]: data.to_sql('data_dtype', engine, dtype={'Col_1': String})
    +In [536]: data.to_sql('data_dtype', engine, dtype={'Col_1': String})
     
    @@ -6650,8 +6660,8 @@

    In order to use read_sql_table(), you must have the SQLAlchemy optional dependency installed.

    -
    In [532]: pd.read_sql_table('data', engine)
    -Out[532]: 
    +
    In [537]: pd.read_sql_table('data', engine)
    +Out[537]: 
        index  id       Date Col_1  Col_2  Col_3
     0      0  26 2010-10-18     X  27.50   True
     1      1  42 2010-10-19     Y -12.50  False
    @@ -6660,16 +6670,16 @@
     

    You can also specify the name of the column as the DataFrame index, and specify a subset of columns to be read.

    -
    In [533]: pd.read_sql_table('data', engine, index_col='id')
    -Out[533]: 
    +
    In [538]: pd.read_sql_table('data', engine, index_col='id')
    +Out[538]: 
         index       Date Col_1  Col_2  Col_3
     id                                      
     26      0 2010-10-18     X  27.50   True
     42      1 2010-10-19     Y -12.50  False
     63      2 2010-10-20     Z   5.73   True
     
    -In [534]: pd.read_sql_table('data', engine, columns=['Col_1', 'Col_2'])
    -Out[534]: 
    +In [539]: pd.read_sql_table('data', engine, columns=['Col_1', 'Col_2'])
    +Out[539]: 
       Col_1  Col_2
     0     X  27.50
     1     Y -12.50
    @@ -6677,8 +6687,8 @@
     

    And you can explicitly force columns to be parsed as dates:

    -
    In [535]: pd.read_sql_table('data', engine, parse_dates=['Date'])
    -Out[535]: 
    +
    In [540]: pd.read_sql_table('data', engine, parse_dates=['Date'])
    +Out[540]: 
        index  id       Date Col_1  Col_2  Col_3
     0      0  26 2010-10-18     X  27.50   True
     1      1  42 2010-10-19     Y -12.50  False
    @@ -6711,8 +6721,8 @@
     In this case you must use the SQL variant appropriate for your database.
     When using SQLAlchemy, you can also pass SQLAlchemy Expression language constructs,
     which are database-agnostic.

    -
    In [536]: pd.read_sql_query('SELECT * FROM data', engine)
    -Out[536]: 
    +
    In [541]: pd.read_sql_query('SELECT * FROM data', engine)
    +Out[541]: 
        index  id                        Date Col_1  Col_2  Col_3
     0      0  26  2010-10-18 00:00:00.000000     X  27.50      1
     1      1  42  2010-10-19 00:00:00.000000     Y -12.50      0
    @@ -6720,20 +6730,20 @@
     

    Of course, you can specify a more “complex” query.

    -
    In [537]: pd.read_sql_query("SELECT id, Col_1, Col_2 FROM data WHERE id = 42;", engine)
    -Out[537]: 
    +
    In [542]: pd.read_sql_query("SELECT id, Col_1, Col_2 FROM data WHERE id = 42;", engine)
    +Out[542]: 
        id Col_1  Col_2
     0  42     Y  -12.5
     

    The read_sql_query() function supports a chunksize argument. Specifying this will return an iterator through chunks of the query result:

    -
    In [538]: df = pd.DataFrame(np.random.randn(20, 3), columns=list('abc'))
    +
    In [543]: df = pd.DataFrame(np.random.randn(20, 3), columns=list('abc'))
     
    -In [539]: df.to_sql('data_chunks', engine, index=False)
    +In [544]: df.to_sql('data_chunks', engine, index=False)
     
    -
    In [540]: for chunk in pd.read_sql_query("SELECT * FROM data_chunks",
    +
    In [545]: for chunk in pd.read_sql_query("SELECT * FROM data_chunks",
        .....:                                engine, chunksize=5):
        .....:     print(chunk)
        .....: 
    @@ -6804,20 +6814,20 @@
     

    Advanced SQLAlchemy queries

    You can use SQLAlchemy constructs to describe your query.

    Use sqlalchemy.text() to specify query parameters in a backend-neutral way.

    -
    In [541]: import sqlalchemy as sa
    +
    In [546]: import sqlalchemy as sa
     
    -In [542]: pd.read_sql(sa.text('SELECT * FROM data where Col_1=:col1'),
    +In [547]: pd.read_sql(sa.text('SELECT * FROM data where Col_1=:col1'),
        .....:             engine, params={'col1': 'X'})
        .....: 
    -Out[542]: 
    +Out[547]: 
        index  id                        Date Col_1  Col_2  Col_3
     0      0  26  2010-10-18 00:00:00.000000     X   27.5      1
     

    If you have an SQLAlchemy description of your database, you can express where conditions using SQLAlchemy expressions.

    -
    In [543]: metadata = sa.MetaData()
    +
    In [548]: metadata = sa.MetaData()
     
    -In [544]: data_table = sa.Table('data', metadata,
    +In [549]: data_table = sa.Table('data', metadata,
        .....:                       sa.Column('index', sa.Integer),
        .....:                       sa.Column('Date', sa.DateTime),
        .....:                       sa.Column('Col_1', sa.String),
    @@ -6826,20 +6836,20 @@
        .....:                       )
        .....: 
     
    -In [545]: pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 is True), engine)
    -Out[545]: 
    +In [550]: pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 is True), engine)
    +Out[550]: 
     Empty DataFrame
     Columns: [index, Date, Col_1, Col_2, Col_3]
     Index: []
     

    You can combine SQLAlchemy expressions with parameters passed to read_sql() using sqlalchemy.bindparam().

    -
    In [546]: import datetime as dt
    +
    In [551]: import datetime as dt
     
    -In [547]: expr = sa.select([data_table]).where(data_table.c.Date > sa.bindparam('date'))
    +In [552]: expr = sa.select([data_table]).where(data_table.c.Date > sa.bindparam('date'))
     
    -In [548]: pd.read_sql(expr, engine, params={'date': dt.datetime(2010, 10, 18)})
    -Out[548]: 
    +In [553]: pd.read_sql(expr, engine, params={'date': dt.datetime(2010, 10, 18)})
    +Out[553]: 
        index       Date Col_1  Col_2  Col_3
     0      1 2010-10-19     Y -12.50  False
     1      2 2010-10-20     Z   5.73   True
    @@ -6882,9 +6892,9 @@
     

    Writing to stata format

    The method to_stata() will write a DataFrame into a .dta file. The format version of this file is always 115 (Stata 12).

    -
    In [549]: df = pd.DataFrame(np.random.randn(10, 2), columns=list('AB'))
    +
    In [554]: df = pd.DataFrame(np.random.randn(10, 2), columns=list('AB'))
     
    -In [550]: df.to_stata('stata.dta')
    +In [555]: df.to_stata('stata.dta')
     

    Stata data files have limited data type support; only strings with
    @@ -6926,8 +6936,8 @@

    The top-level function read_stata will read a dta file and return either a DataFrame or a StataReader that can be used to read the file incrementally.

    -
    In [551]: pd.read_stata('stata.dta')
    -Out[551]: 
    +
    In [556]: pd.read_stata('stata.dta')
    +Out[556]: 
        index         A         B
     0      0  1.031231  0.196447
     1      1  0.190188  0.619078
    @@ -6945,9 +6955,9 @@
     StataReader instance that can be used to
     read chunksize lines from the file at a time.  The StataReader
     object can be used as an iterator.

    -
    In [552]: reader = pd.read_stata('stata.dta', chunksize=3)
    +
    In [557]: reader = pd.read_stata('stata.dta', chunksize=3)
     
    -In [553]: for df in reader:
    +In [558]: for df in reader:
        .....:     print(df.shape)
        .....: 
     (3, 3)
    @@ -6959,11 +6969,11 @@
     

    For more fine-grained control, use iterator=True and specify chunksize with each call to read().

    -
    In [554]: reader = pd.read_stata('stata.dta', iterator=True)
    +
    In [559]: reader = pd.read_stata('stata.dta', iterator=True)
     
    -In [555]: chunk1 = reader.read(5)
    +In [560]: chunk1 = reader.read(5)
     
    -In [556]: chunk2 = reader.read(5)
    +In [561]: chunk2 = reader.read(5)
     

    Currently the index is retrieved as a column.
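    A hedged sketch of putting it back as the index (this assumes read_stata's index_col argument and the 'index' column name visible in the output above):

        # restore the saved 'index' column as the DataFrame index
        pd.read_stata('stata.dta', index_col='index')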

    @@ -7116,10 +7126,10 @@

    Given the following test set:

    -
    from numpy.random import randn
    +
    import os
     
     sz = 1000000
    -df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz})
    +df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz})
     
     
     def test_sql_write(df):