From d1d3e4fe82d476575af9528b1a3dce1fd2ff978d Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 11 Apr 2021 12:53:58 -0500 Subject: [PATCH 01/35] ENH: Add IO support for R data files with pandas.read_rdata and DataFrame.to_rdata --- doc/source/getting_started/install.rst | 2 + doc/source/user_guide/io.rst | 300 +++ doc/source/whatsnew/v1.3.0.rst | 103 + environment.yml | 1 + pandas/__init__.py | 1 + pandas/core/frame.py | 225 ++ pandas/io/api.py | 1 + pandas/io/rdata.py | 1826 +++++++++++++++++ .../io/data/rdata/climate_non_utf8_df.rda | Bin 0 -> 423 bytes .../io/data/rdata/climate_non_utf8_df.rds | Bin 0 -> 400 bytes pandas/tests/io/data/rdata/env_data_dfs.rda | Bin 0 -> 7259 bytes .../tests/io/data/rdata/env_data_non_dfs.rda | Bin 0 -> 8948 bytes pandas/tests/io/data/rdata/env_data_objs.rda | Bin 0 -> 13735 bytes pandas/tests/io/data/rdata/ghg_df.rds | Bin 0 -> 1475 bytes pandas/tests/io/data/rdata/ghg_t_tests.rds | Bin 0 -> 1136 bytes pandas/tests/io/data/rdata/plants_arry.rds | Bin 0 -> 584 bytes pandas/tests/io/data/rdata/plants_df.rds | Bin 0 -> 325 bytes pandas/tests/io/data/rdata/ppm_df.csv | 757 +++++++ pandas/tests/io/data/rdata/ppm_ts.rds | Bin 0 -> 9004 bytes pandas/tests/io/data/rdata/sea_ice_df.rds | Bin 0 -> 5374 bytes pandas/tests/io/data/rdata/species_mtx.rds | Bin 0 -> 1907 bytes pandas/tests/io/rdata/test_pyreadr.py | 596 ++++++ pandas/tests/io/rdata/test_rscript.py | 972 +++++++++ requirements-dev.txt | 1 + 24 files changed, 4785 insertions(+) create mode 100644 pandas/io/rdata.py create mode 100644 pandas/tests/io/data/rdata/climate_non_utf8_df.rda create mode 100644 pandas/tests/io/data/rdata/climate_non_utf8_df.rds create mode 100644 pandas/tests/io/data/rdata/env_data_dfs.rda create mode 100644 pandas/tests/io/data/rdata/env_data_non_dfs.rda create mode 100644 pandas/tests/io/data/rdata/env_data_objs.rda create mode 100644 pandas/tests/io/data/rdata/ghg_df.rds create mode 100644 pandas/tests/io/data/rdata/ghg_t_tests.rds create mode 
100644 pandas/tests/io/data/rdata/plants_arry.rds create mode 100644 pandas/tests/io/data/rdata/plants_df.rds create mode 100644 pandas/tests/io/data/rdata/ppm_df.csv create mode 100644 pandas/tests/io/data/rdata/ppm_ts.rds create mode 100644 pandas/tests/io/data/rdata/sea_ice_df.rds create mode 100644 pandas/tests/io/data/rdata/species_mtx.rds create mode 100644 pandas/tests/io/rdata/test_pyreadr.py create mode 100644 pandas/tests/io/rdata/test_rscript.py diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index a9c3d637a41e3..99ebe01b0e53f 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -358,6 +358,8 @@ zlib Compression for HDF5 fastparquet 0.4.0 Parquet reading / writing pyarrow 0.15.0 Parquet, ORC, and feather reading / writing pyreadstat SPSS files (.sav) reading +pyreadr R files (.RData, .rda, .rds) reading / writing +Rscript R files (.RData, .rda, .rds) reading / writing ========================= ================== ============================================================= Access data in the cloud diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 3b7a6037a9715..75a3626ef80b5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -31,6 +31,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` binary;`ORC Format `__;:ref:`read_orc`; binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` + binary;`R `__;:ref:`read_rdata`;:ref:`to_rdata` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; binary;`SPSS `__;:ref:`read_spss`; @@ -5903,6 +5904,304 @@ respective functions from ``pandas-gbq``. Full documentation can be found `here `__. + +.. _io.rdata: + +R data format +------------- + +.. _io.rdata_reader: + +Reading R data +'''''''''''''' + +.. 
versionadded:: 1.3.0 + +The top-level function ``read_rdata`` will read the native serialization types +in the R language and environment. For .RData and its synonymous shorthand, .rda, +that can hold multiple R objects, method will return a ``dict`` of ``DataFrames``. +For .rds types that only contains a single R object, method will return a single +``DataFrame``. + +.. note:: + + Since *any* R object can be saved in these types, this method will only return + data.frame objects or objects coercible to data.frames including matrices, + tibbles, and data.tables even 3D arrays. Depending on engine used, either + an error raises for non-data.frame objects or such objects are ignored. + +For example, consider the following generated data.frames in R using samples from +US EPA, UK BGCI, and NOAA pubilc data: + +.. code-block:: r + + ghg_df <- data.frame( + gas = c("Carbon dioxide", "Methane", "Nitrous oxide", + "Fluorinated gases", "Total"), + year = c(2018, 2018, 2018, 2018, 2018), + emissions = c(5424.88150213288, 634.457127078267, 434.528555376666, + 182.782432461777, 6676.64961704959), + row.names = c(141:145), + stringsAsFactors = FALSE + ) + + saveRDS(ghg_df, file="ghg_df.rds") + + plants_df <- data.frame( + plant_group = c("Pteridophytes", "Pteridophytes", "Pteridophytes", + "Pteridophytes", "Pteridophytes"), + status = c("Data Deficient", "Extinct", "Not Threatened", + "Possibly Threatened", "Threatened"), + count = c(398, 65, 1294, 408, 1275), + row.names = c(16:20), + stringsAsFactors = FALSE + ) + + saveRDS(plants_df, file="plants_df.rds") + + sea_ice_df_new <- data.frame( + year = c(2016, 2017, 2018, 2019, 2020), + mo = c(12, 12, 12, 12, 12), + data.type = c("Goddard", "Goddard", "Goddard", "Goddard", "NRTSI-G"), + region = c("S", "S", "S", "S", "S"), + extent = c(8.28, 9.48, 9.19, 9.41, 10.44), + area = c(5.51, 6.23, 5.59, 6.59, 6.5), + row.names = c(1012:1016), + stringsAsFactors = FALSE + ) + + saveRDS(sea_ice_df, file="sea_ice_df.rds") + + save(ghg_df, 
plants_df, sea_ice_df, file="env_data_dfs.rda") + +Then in pandas you can read the .rds or .rda files: + +.. ipython:: python + :suppress: + + rel_path = os.path.join("..", "pandas", "tests", "io", "data", "rdata") + file_path = os.path.abspath(rel_path) + +.. ipython:: python + + rds_file = os.path.join(file_path, "ghg_df.rds") + ghg_df = pd.read_rdata(rds_file).tail() + ghg_df + + rda_file = os.path.join(file_path, "env_data_dfs.rda") + env_dfs = pd.read_rdata(rda_file) + env_dfs + +To ignore the rownames of data.frame, use option ``rownames=False``: + +.. ipython:: python + + rds_file = os.path.join(file_path, "plants_df.rds") + plants_df = pd.read_rdata(rds_file, rownames=False).tail() + plants_df + + +To select specific objects in .rda, pass a list of names into ``select_frames``: + +.. ipython:: python + + rda_file = os.path.join(file_path, "env_data_dfs.rda") + env_dfs = pd.read_rdata(rda_file, select_frames=["sea_ice_df"]) + env_dfs + +To read from URL, pass link directly into method: + +.. ipython:: python + + url = ("https://github.com/hadley/nycflights13/" + "blob/master/data/airlines.rda?raw=true") + + airlines = pd.read_rdata(url, file_format="rda") + airlines + +To read from a file-like object, read object in argument, ``path_or_buffer``: + +.. ipython:: python + + rds_file = os.path.join(file_path, "sea_ice_df.rds") + with open(rds_file, "rb") as f: + sea_ice_df = pd.read_rdata(f.read(), file_format="rds") + + sea_ice_df + +With ``rscript`` as ``engine``, a direct command line call to Rscript is run +to read data natively in R and transfer content with several options of ``mode``. + +.. note:: + + If you do not have R installed and attempt to use the ``rscript`` ``engine``, + then an ``ImportError`` will raise. Do note: Rscript must be recognized as a + top-level command on machine. Hence, R's bin folder must be in Path environment + variable for the OS. 
If Rscript is not recognized even if you have R installed, + you will receive same ``ImportError``. + +- For the ``csv`` mode (default), no other package in R is required. + Data types are adhered in this data exchange following a text approach. + +- For the ``feather`` mode, the ``arrow`` package in R must be installed. + Additionally, the counterpart ``pyarrow`` package in Python must be + installed. This binary approach allows faster data exchange than text approach. + +- For the ``parquet`` mode, again the ``arrow`` package in R must be installed. + and again ``pyarrow`` package in Python must be installed. Similarly, this + binary approach allows faster data exchange than text approach. + +- For the ``sqlite`` mode, the ``RSQLite`` package in R (part of DBI family of + database APIs) must be installed with no additional package needed for Python. + This database approach ensures data type integrity. + +.. ipython:: python + + rds_file = os.path.join(file_path, "plants_df.rds") + plants_df = pd.read_rdata(rds_file, engine="rscript", mode="sqlite").tail() + plants_df + +.. note:: + + The above selected options for ``mode`` will not generate such formats but + uses them under the hood in disk transfer of data between R and Python. + + +.. _io.rdata_writer: + +Writing R data +'''''''''''''' + +.. versionadded:: 1.3.0 + +The method :func:`~pandas.core.frame.DataFrame.to_rdata` will write a DataFrame +or multiple DataFrames into R data files (.Rdata, .rda, and .rds). + +For single object in rds type: + +.. ipython:: python + + plants_df.to_rdata("plants_df.rds") + +For multiple objects in RData or rda types using the ``rscript`` engine, +use the ``other_frames`` argument and be sure to provide ``rda_names`` for all +DataFrames: + +.. ipython:: python + + plants_df.to_rdata( + "env_dfs.rda", + engine="rscript", + other_frames=[ghg_df, sea_ice_df], + rda_names=["plants_df", "ghg_df", "sea_ice_df"] + ) + +With either engine, pandas index will not map into R rownames. 
Using the default +``index=True`` will output an index column or multiple columns for MultiIndex. + +.. ipython:: python + + (ghg_df.rename_axis(None) + .to_rdata("ghg_df.rds", engine="rscript") + ) + pd.read_rdata("ghg_df.rds").tail() + +Otherwise, use ``index=False``: + +.. ipython:: python + + (ghg_df.rename_axis(None) + .to_rdata("ghg_df.rds", engine="rscript", index=False) + ) + pd.read_rdata("ghg_df.rds").tail() + +With both engines, the default compression of R data files will be ``gzip``. +Notice the different sizes of compressed and uncompressed files: + +.. ipython:: python + + plants_df.to_rdata("plants_df_uncomp.rds", compress=False) + + os.stat("plants_df.rds").st_size + os.stat("plants_df_uncomp.rds").st_size + +The ``rscript`` engine supports all listed compression types including: +``gzip``, ``bzip2``, and ``xz``. + +Additionally, with ``rscript`` engine, data files can be written in ascii (text) +rather than default binary with ``ascii`` argument: + +.. ipython:: python + + sea_ice_df.to_rdata("sea_ice_df_ascii.rda", engine="rscript", + ascii=True, compress=False) + + with open("sea_ice_df_ascii.rda", "r") as f: + for i in range(10): + line = next(f).strip() + print(line) + +.. ipython:: python + :suppress: + + os.remove("ghg_df.rds") + os.remove("plants_df.rds") + os.remove("env_dfs.rda") + os.remove("plants_df_uncomp.rds") + os.remove("sea_ice_df_ascii.rda") + +Once exported, the single DataFrame can be read back in R or multiple DataFrames +loaded in R: + +.. 
code-block:: r + + plants_df <- readRDS("plants_df.rds") + tail(plants_df, 5) + plant_group status count + 16 Pteridophytes Data Deficient 398 + 17 Pteridophytes Extinct 65 + 18 Pteridophytes Not Threatened 1294 + 19 Pteridophytes Possibly Threatened 408 + 20 Pteridophytes Threatened 1275 + + + load("env_dfs.rda") + eapply(.GlobalEnv, tail, 5) + $plants_df + plant_group status count + 16 Pteridophytes Data Deficient 398 + 17 Pteridophytes Extinct 65 + 18 Pteridophytes Not Threatened 1294 + 19 Pteridophytes Possibly Threatened 408 + 20 Pteridophytes Threatened 1275 + + $sea_ice_df + year mo data.type region extent area + 1012 2016 12 Goddard S 8.28 5.51 + 1013 2017 12 Goddard S 9.48 6.23 + 1014 2018 12 Goddard S 9.19 5.59 + 1015 2019 12 Goddard S 9.41 6.59 + 1016 2020 12 NRTSI-G S 10.44 6.50 + + $ghg_df + gas year emissions + 141 Carbon dioxide 2018 5424.8815 + 142 Methane 2018 634.4571 + 143 Nitrous oxide 2018 434.5286 + 144 Fluorinated gases 2018 182.7824 + 145 Total 2018 6676.6496 + +For more information of ``pyreadr`` engine, see main page of `pyreadr`_ package for +further notes on support and limitations. For more information of R serialization +data types, see docs on `rds`_ and `rda`_ data files. + +.. _pyreadr: https://github.com/ofajardo/pyreadr + +.. _rds: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/readRDS + +.. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/save + + .. _io.stata: Stata format @@ -5958,6 +6257,7 @@ outside of this range, the variable is cast to ``int16``. 115 dta file format. Attempting to write *Stata* dta files with strings longer than 244 characters raises a ``ValueError``. + .. _io.stata_reader: Reading from Stata format diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 92efb225682b7..b85a773014c59 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -110,6 +110,109 @@ both XPath 1.0 and XSLT 1.0 is available. 
(:issue:`27554`) For more, see :ref:`io.xml` in the user guide on IO tools. +.. _whatsnew_130.read_to_rdata: + +Read and write R data files +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We added I/O support to read and write R data files (.rda, .Rdata, .rds) using +:func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Equipped with two engines, +`pyreadr`_ and command line caller, `rscript`_, these methods will maintain fast and +durable support for open source data migration between R and Python. (:issue:`40287`) + +.. _pyreadr: https://github.com/ofajardo/pyreadr +.. _rscript: https://www.rdocumentation.org/packages/utils/versions/3.6.2/topics/Rscript + +In R, the below generated data frame and matrix: + +.. code-block:: r + + In [1]: carbon_ppm_df <- data.frame( + ...: year = c(2020, 2020, 2020, 2021, 2021), + ...: month = c(10, 11, 12, 1, 2), + ...: monthly_average = c(411.51, 413.11, 414.25, 415.52, 416.75), + ...: num_days = c(30, 27, 30, 29, 28), + ...: st_dev_of_days = c(0.22, 0.8, 0.48, 0.44, 1.01), + ...: unc_mon_mean = c(0.08, 0.29, 0.17, 0.16, 0.36) + ...: ) + + In [2]: iucn_species_mtx <- matrix( + ...: c(102, 79, 159, 63, 30, 13, 267, 35, 85, + ...: 30, 10, 5, 1, 2, 7, 14, 2, 2, + ...: 409, 121, 22, 75, 40, 78, 134, 146, 28, + ...: 29, 6, 0, 0, 0, 12, 2, 1, 0, + ...: 3770, 627, 223, 365, 332, 699, 604, 663, 225, + ...: 6972, 989, 460, 730, 588, 1302, 518, 1060, 542, + ...: 7089, 1219, 798, 831, 538, 1051, 975, 719, 556, + ...: 2990, 4251, 52, 2819, 1220, 914, 1648, 1184, 845, + ...: 43885, 20685, 11158, 10865, 8492, 8192, 7326, 7212, 5940 + ...: ), + ...: ncol=9, nrow=9, + ...: dimnames = list( + ...: c("MAGNOLIOPSIDA", "ACTINOPTERYGII", "AVES", + ...: "INSECTA", "REPTILIA", "LILIOPSIDA", + ...: "GASTROPODA", "AMPHIBIA", "MAMMALIA"), + ...: c("EX", "EW", "CR(PE)", "CR(PEW)", "CR", + ...: "EN", "VU", "DD", "Total") + ...: ) + ...: ) + + In [3]: saveRDS(ppm_df, "ppm_df_r.rds") + In [4]: save(ppm_df, iucn_species_mtx, "env_objs_r.rda") + +Can then be read in 
pandas with either engine: + +.. code-block:: ipython + + In [1]: ppm_df = pd.read_rdata("ppm_df_r.rds", engine="pyreadr") + In [2]: ppm_df + Out[3]: + year month monthly_average num_days st_dev_of_days unc_mon_mean + 0 2020 10 411.51 30 0.22 0.08 + 1 2020 11 413.11 27 0.80 0.29 + 2 2020 12 414.25 30 0.48 0.17 + 3 2021 1 415.52 29 0.44 0.16 + 4 2021 2 416.75 28 1.01 0.36 + + In [4]: env_objs = pd.read_rdata("env_objs_r.rda", engine="rscript") + Out[5]: + {'carbon_ppm_df': + year month monthly_average num_days st_dev_of_days unc_mon_mean + 0 2020 10 411.51 30 0.22 0.08 + 1 2020 11 413.11 27 0.80 0.29 + 2 2020 12 414.25 30 0.48 0.17 + 3 2021 1 415.52 29 0.44 0.16 + 4 2021 2 416.75 28 1.01 0.36 + + [5 rows x 6 columns], + 'species_matrix': + EX EW CR(PE) CR(PEW) CR EN VU DD Total + rownames + MAGNOLIOPSIDA 102 30 409 29 3770 6972 7089 2990 43885 + ACTINOPTERYGII 79 10 121 6 627 989 1219 4251 20685 + AVES 159 5 22 0 223 460 798 52 11158 + INSECTA 63 1 75 0 365 730 831 2819 10865 + REPTILIA 30 2 40 0 332 588 538 1220 8492 + LILIOPSIDA 13 7 78 12 699 1302 1051 914 8192 + GASTROPODA 267 14 134 2 604 518 975 1648 7326 + AMPHIBIA 35 2 146 1 663 1060 719 1184 7212 + + [8 rows x 9 columns]} + +Even exported back out to R data files: + +.. code-block:: ipython + + In [5]: ppm_df.to_rdata("ppm_df_py.rds") + In [6]: ppm_df.to_rdata( + ...: "env_objs_py.rda", + ...: engine="rscript", + ...: other_frames=env_objs["species_matrix"], + ...: rda_names=["ppm_df", "species_mtx"] + ...: ) + +For more, see :ref:`io.read_rdata` in the user guide on IO tools. + .. 
_whatsnew_130.enhancements.other: Other enhancements diff --git a/environment.yml b/environment.yml index 1259d0dd4ae44..88df4ee035da0 100644 --- a/environment.yml +++ b/environment.yml @@ -110,6 +110,7 @@ dependencies: - xarray # DataFrame.to_xarray - cftime # Needed for downstream xarray.CFTimeIndex test - pyreadstat # pandas.read_spss + - pyreadr # pandas.read_rdata, DataFrame.to_rdata - tabulate>=0.8.3 # DataFrame.to_markdown - natsort # DataFrame.sort_values - pip: diff --git a/pandas/__init__.py b/pandas/__init__.py index 7cad3eded0585..5c18d6072b9a3 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -172,6 +172,7 @@ read_stata, read_sas, read_spss, + read_rdata, ) from pandas.io.json import _json_normalize as json_normalize diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 62341045413a7..46aecd6d3a087 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2270,6 +2270,231 @@ def _from_arrays( ) return cls(mgr) + @doc(storage_options=generic._shared_docs["storage_options"]) + def to_rdata( + self, + path_or_buffer: FilePathOrBuffer, + file_format: str = "infer", + engine: str = "pyreadr", + mode: str = "csv", + other_frames: Optional[List[DataFrame]] = None, + rda_names: List[str] = ["pandas_dataframe"], + index: bool = True, + ascii: bool = False, + compress: Union[bool, str] = "gzip", + encoding: str = "utf-8", + storage_options: StorageOptions = None, + ) -> None: + """ + Render one or more DataFrames to R data (.rda, .Rdata, .rds). + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : a valid str, path object or file-like object + Any valid string path is acceptable. + + file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + R serialization type generated from native commands: base::save + (that saves multiple objects) or base::saveRDS (that saves a + single object to disk). Default 'infer' will use extension in file + name to determine the format type. 
+ + engine : {{'pyreadr', 'rscript'}}, default 'pyreadr' + Engine used to write to R data files. Currently, two types are + supported: ``pyreadr`` which requires the pyreadr package to be + installed and ``rscript`` which requires R to be installed on machine. + For ``rscript``, be sure the R bin installation folder is included in + the system Path environment variable. The ``pyreadr`` is the faster + parser to handle most needs but ``rscript`` engine provides fuller + support of rda and rds formats since it calls native R commands. + + mode : {{'csv', 'parquet', 'feather'}}, default 'csv' + Python and R I/O transfer mode that only applies to ``rscript`` + engine (ignored for ``pyreadr``). Using ``csv`` (text approach), no + additional packages are required. Using ``parquet`` or ``feather`` + (binary approach) requires pyarrow installed in Python and arrow + package installed in R. Using ``sqlite`` (database approach) requires + RSQLite package installed in R. Binary will usually be faster to process + than text data. Database usually ensures data type integrity. + + other_frames : list, optional + Other DataFrames to be included in rda (not rds) files that can + contain multiple objects. Ignored ``pyreadr`` engine that currently + supports only a single DataFrame written to rda files. + + rda_names : list, default ["pandas_dataframe"] + Names for current and other DataFrames in rda file. The number of names + should equal the number of current DataFrame and ``other_frames``. + For ``pyreadr`` engine that can only write one DataFrame to rda file, + only the first name in list will be used. + + index : bool, default True + Include index or MulitIndex in output as separate columns. Since + DataFrame indexes can include multiple columns and R rownames can only + include one column, neither ``pyreadr`` nor ``rscript`` engines will + map DataFrame index to R data.frame rownames. + + ascii : bool, default False + Write data into ASCII (text) representation. 
Only supported with + ``rscript`` engine. + + compress : bool or {{'gzip', 'bzip2', 'xz'}}, default 'gzip' + Compression types for R data files. Use False for uncompressed + files. For ``pyreadr`` engine, False and 'gzip' is supported. + + encoding : str, optional, default 'utf-8' + Encoding of R data. + + {storage_options} + + Returns + ------- + None + Either None or ValueError is raised. + + See Also + -------- + to_stata : Convert DataFrame to a Stata dataset. + to_parquet : Convert DataFrame to parquet format. + to_feather: Convert DataFrame to feather formatt. + + Examples + -------- + To save an .rds file which only contains a single DataFrame: + + >>> ghg_df = pd.DataFrame( + ... {{'gas': ['Carbon dioxide', 'Methane', + ... 'Nitrous oxide', + ... 'Fluorinated gases', + ... 'Total'], + ... 'year': [2018, 2018, 2018, 2018, 2018], + ... 'emissions': [5424.88, 634.46, 434.53, + ... 182.78, 6676.65] + ... }}) + >>> ghg_df.to_rdata("ghg_df.rds") + + >>> R_code = ''' + ... ghg_df <- readRDS("ghg_df.rds") + ... ghg_df + ... index gas year emissions + ... 1 0 Carbon dioxide 2018 5424.88 + ... 2 1 Methane 2018 634.46 + ... 3 2 Nitrous oxide 2018 434.53 + ... 4 3 Fluorinated gases 2018 182.78 + ... 5 4 Total 2018 6676.65 + ... ''' + + To save an .rda or .RData file which can contains one or more + DataFrames: + + >>> plants_df = pd.DataFrame( + ... {{'plant_group': ['Pteridophytes', + ... 'Pteridophytes', + ... 'Pteridophytes', + ... 'Pteridophytes', + ... 'Pteridophytes'], + ... 'status': ['Data Deficient', + ... 'Extinct', + ... 'Not Threatened', + ... 'Possibly Threatened', + ... 'Threatened'], + ... 'count': [398, 65, 1294, 408, 1275] + ... }}) + >>> sea_ice_df = pd.DataFrame( + ... {{'year': [2016, 2017, 2018, 2019, 2020], + ... 'mo': [12, 12, 12, 12, 12], + ... 'data.type': ['Goddard', + ... 'Goddard', + ... 'Goddard', + ... 'Goddard', + ... 'NRTSI-G'], + ... 'region': ['S', 'S', 'S', 'S', 'S'], + ... 'extent': [8.28, 9.48, 9.19, 9.41, 10.44], + ... 
'area': [5.51, 6.23, 5.59, 6.59, 6.5] + ... }}) + >>> ghg_df.to_rdata( + ... "env_data_df.rda", + ... engine="rscript", + ... other_frames=[plants_df, sea_ice_df], + ... rda_names=["ghg_df", "plants_df", "sea_ice_df"] + ... ) # doctest: +SKIP + + >>> R_code = ''' + ... load("env_data_df.rds") + ... + ... mget(ls()) + ... $ghg_df + ... index gas year emissions + ... 1 0 Carbon dioxide 2018 5424.88 + ... 2 1 Methane 2018 634.46 + ... 3 2 Nitrous oxide 2018 434.53 + ... 4 3 Fluorinated gases 2018 182.78 + ... 5 4 Total 2018 6676.65 + ... + ... $plants_df + ... index plant_group status count + ... 1 0 Pteridophytes Data Deficient 398 + ... 2 1 Pteridophytes Extinct 65 + ... 3 2 Pteridophytes Not Threatened 1294 + ... 4 3 Pteridophytes Possibly Threatened 408 + ... 5 4 Pteridophytes Threatened 1275 + ... + ... $sea_ice_df + ... index year mo data.type region extent area + ... 1 0 2016 12 Goddard S 8.28 5.51 + ... 2 1 2017 12 Goddard S 9.48 6.23 + ... 3 2 2018 12 Goddard S 9.19 5.59 + ... 4 3 2019 12 Goddard S 9.41 6.59 + ... 5 4 2020 12 NRTSI-G S 10.44 6.50 + ... ''' + """ + from pandas.io.rdata import ( + RSCRIPT_EXISTS, + PyReadrWriter, + RscriptWriter, + ) + + pyreadr = import_optional_dependency("pyreadr", errors="ignore") + pyarrow = import_optional_dependency("pyarrow", errors="ignore") + + RDataWriter: Union[Type[PyReadrWriter], Type[RscriptWriter]] + + if engine == "pyreadr": + if pyreadr is None: + raise ImportError("pyreadr not found, please install for this engine.") + RDataWriter = PyReadrWriter + + elif engine == "rscript": + if RSCRIPT_EXISTS is None: + raise FileNotFoundError( + "R is either not installed on this system or its " + "bin folder is not in Path environment variable." 
+ ) + if pyarrow is None and mode in ["parquet", "feather"]: + raise ImportError("pyarrow not found, please install for this mode.") + RDataWriter = RscriptWriter + else: + raise ValueError(f"{engine} is not a supported engine.") + + rdata_writer = RDataWriter( + self, + path_or_buffer=path_or_buffer, + file_format=file_format, + engine=engine, + mode=mode, + other_frames=other_frames, + rda_names=rda_names, + index=index, + ascii=ascii, + compress=compress, + encoding=encoding, + storage_options=storage_options, + ) + + return rdata_writer.write_data() + @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( diff --git a/pandas/io/api.py b/pandas/io/api.py index 5926f2166ee9d..9cacb014e7dd0 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -29,6 +29,7 @@ HDFStore, read_hdf, ) +from pandas.io.rdata import read_rdata from pandas.io.sas import read_sas from pandas.io.spss import read_spss from pandas.io.sql import ( diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py new file mode 100644 index 0000000000000..ffd726e8cfbff --- /dev/null +++ b/pandas/io/rdata.py @@ -0,0 +1,1826 @@ +from datetime import datetime +import io +import os +import platform +import subprocess +from tempfile import TemporaryDirectory +from typing import ( + Dict, + List, + Optional, + Type, + Union, +) + +from pandas._typing import ( + Buffer, + FilePathOrBuffer, + StorageOptions, +) +from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + AbstractMethodError, + ParserError, +) +from pandas.util._decorators import doc + +from pandas.core.dtypes.common import is_list_like + +from pandas.core.frame import DataFrame +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + file_exists, + get_handle, + is_fsspec_url, + is_url, + stringify_path, +) +from pandas.io.feather_format import read_feather +from pandas.io.parquet import read_parquet +from 
pandas.io.parsers import read_csv +from pandas.io.sql import read_sql + + +class RScriptError(Exception): + """ + Exception raises when command line call to RScript throws a non-empty + error message. Message will capture verbatim R output in console. + """ + + pass + + +def _executable_exists(name) -> bool: + """ + Internal method to check if R exists on system. + + This method will return True if R is installed for Rscript command + line call and if machine recognizes Rscript in Path env variable. + """ + + WHICH_CMD = "where" if platform.system() == "Windows" else "which" + + return ( + subprocess.call( + [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + == 0 + ) + + +RSCRIPT_EXISTS = _executable_exists("Rscript") + + +@doc(storage_options=_shared_docs["storage_options"]) +def read_rdata( + path_or_buffer: FilePathOrBuffer, + file_format: str = "infer", + engine: str = "pyreadr", + mode: str = "csv", + select_frames: Optional[List[str]] = None, + rownames: bool = True, + encoding: str = "utf-8", + storage_options: StorageOptions = None, +) -> Union[DataFrame, Dict[str, DataFrame]]: + r""" + Read R data (.RData, .rda, .rds) into DataFrame or ``dict`` of DataFrames. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : str, path object, or file-like object + Any valid file path is acceptable. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. + + file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + R serialization type as output from R's base::save or base::saveRDS + commands. Default 'infer' will use extension in file name to + to determine the format type. + + engine : {{'pyreadr'. 'rscript'}}, default 'pyreadr' + Engine used to parse or read R data. Currently, two types are + supported: ``pyreadr`` which requires the pyreadr package to be + installed and ``rscript`` which requires R to be installed on machine. 
+ For ``rscript``, be sure the R bin installation folder is included in + the system Path environment variable. The ``pyreadr`` is the faster + parser to handle most needs but ``rscript`` engine provides fuller + support of rda and rds formats since it calls native R commands. + + mode : {{'csv', 'parquet', 'feather', 'sqlite'}}, default 'csv' + Python and R I/O transfer mode that only applies to ``rscript`` + engine (ignored for ``pyreadr``). Using ``csv`` (text approach), no + additional packages are required. Using ``parquet`` or ``feather`` + (binary approach) requires pyarrow installed in Python and arrow + package installed in R. Using ``sqlite`` (database approach) requires + RSQLite package installed in R. Binary will usually be faster to process + than text data. Database usually ensures data type integrity. + + select_frames : list, default None + Selected names of DataFrames to return from R rda and RData types that + can contain multiple objects. + + rownames : bool, default True + Include original rownames in R data frames to map into a DataFrame index. + + encoding : str, optional, default 'utf-8' + Encoding of R data. Currently, ``pyreadr`` engine only supports utf-8 + encoded data. + + {storage_options} + + Returns + ------- + DataFrame or dict of DataFrames + Depends on R data type where rds formats returns a single DataFrame and + rda or RData formats return ``dict`` of DataFrames. + + See Also + -------- + read_sas : Read SAS datasets into DataFrame. + read_stata : Read Stata datasets into DataFrame. + read_spss : Read SPSS datasets into DataFrame. + + Notes + ----- + For ``pyreadr`` engine, any R data file that contains a non-data.frame object + may raise parsing errors. For ``rscript`` engine, such objects will be + ignored. Both methods will or attempt to return data.frame objects or any + object that is coercible to R's data.frame such as matrix, tibble, + and data.table. 
For arrays, method will attempt to convert to 2D + structure and may not reproduce original R object representation. + + If object in rds types or all objects in rda or RData types are not data + frames, this method will raise an error and will not return None or an empty + dictionary. + + For ``pyreadr`` engine, ``select_frames`` above is synonymous to ``use_objects`` + in package's `read_r` method. Also, ``timezone`` argument defaults to current + system regional timezone in order to correspond to original date/times in R. + + Examples + -------- + To read an .rds file which only contains a single object, below returns a + DataFrame: + + >>> R_code = ''' + ... ghg_df <- data.frame( + ... gas = c('Carbon dioxide', + ... 'Methane', + ... 'Nitrous oxide', + ... 'Fluorinated gases', + ... 'Total'), + ... year = c(2018, + ... 2018, + ... 2018, + ... 2018, + ... 2018), + ... emissions = c(5424.88, + ... 634.46, + ... 434.53, + ... 182.78, + ... 6676.65) + ... ) + ... saveRDS(ghg_df, file="ghg_df.rds") + ... ''' + + >>> ghg_df = pd.read_rdata("ghg_df.rds") # doctest: +SKIP + >>> ghg_df # doctest: +SKIP + gas year emissions + rownames + 1 Carbon dioxide 2018 5424.88 + 2 Methane 2018 634.46 + 3 Nitrous oxide 2018 434.53 + 4 Fluorinated gases 2018 182.78 + 5 Total 2018 6676.65 + + To read an .rda or .RData file which can contain multiple objects, blue + returns a ``dict`` of DataFrames: + + >>> R_code = ''' + ... plants_df <- pd.DataFrame( + ... plant_group = c('Pteridophytes', + ... 'Pteridophytes', + ... 'Pteridophytes', + ... 'Pteridophytes', + ... 'Pteridophytes'), + ... status = c('Data Deficient', + ... 'Extinct', + ... 'Not Threatened', + ... 'Possibly Threatened', + ... 'Threatened'), + ... count = c(398, 65, 1294, 408, 1275) + ... ) + ... sea_ice_df <- pd.DataFrame( + ... year = c(2016, 2017, 2018, 2019, 2020), + ... mo = c(12, 12, 12, 12, 12], + ... data.type: c('Goddard', + ... 'Goddard', + ... 'Goddard', + ... 'Goddard', + ... 'NRTSI-G'), + ... 
region = c('S', 'S', 'S', 'S', 'S'), + ... extent = c(8.28, 9.48, 9.19, 9.41, 10.44), + ... area = c(5.51, 6.23, 5.59, 6.59, 6.5) + ... ) + ... save(ghg_df, plants_df, sea_ice_df, file="env_data_dfs.rda") + ... ''' + + >>> env_dfs = pd.read_rdata("env_data_dfs.rda") # doctest: +SKIP + >>> env_dfs # doctest: +SKIP + {{'ghg_df': + gas year emissions + rownames + 1 Carbon dioxide 2018 5424.88 + 2 Methane 2018 634.46 + 3 Nitrous oxide 2018 434.53 + 4 Fluorinated gases 2018 182.79 + 5 Total 2018 6676.65, + 'plants_df': + plant_group status count + rownames + 1 Pteridophytes Data Deficient 398 + 2 Pteridophytes Extinct 65 + 3 Pteridophytes Not Threatened 1294 + 4 Pteridophytes Possibly Threatened 408 + 5 Pteridophytes Threatened 1275, + 'sea_ice_df': + year mo data.type region extent area + rownames + 1 2016 12 Goddard S 8.28 5.51 + 2 2017 12 Goddard S 9.48 6.23 + 3 2018 12 Goddard S 9.19 5.59 + 4 2019 12 Goddard S 9.41 6.59 + 5 2020 12 NRTSI-G S 10.44 6.50}} + """ + + return _parse( + path_or_buffer=path_or_buffer, + file_format=file_format, + engine=engine, + mode=mode, + select_frames=select_frames, + rownames=rownames, + encoding=encoding, + storage_options=storage_options, + ) + + +def _parse( + path_or_buffer, + file_format, + engine, + mode, + select_frames, + rownames, + encoding, + storage_options, + **kwargs, +) -> Union[DataFrame, Dict[str, DataFrame]]: + """ + Call internal parser classes. + + This method will conditionally call internal parsers: + _PyReadrParser or _RscriptParser. + + Raises + ------ + FileNotFoundError + * If Rscript bin executable is not installed or found on machine. + + ImportError + * If pyreadr for engine and pyarrow for mode is not installed. + + ValueError + * If engine is neither pyreadr or rscript. 
+ """ + pyreadr = import_optional_dependency("pyreadr", errors="ignore") + pyarrow = import_optional_dependency("pyarrow", errors="ignore") + + RDataReader: Union[Type[_PyReadrParser], Type[_RscriptParser]] + + if engine == "pyreadr": + if pyreadr is None: + raise ImportError("pyreadr not found, please install for this engine.") + + RDataReader = _PyReadrParser + + elif engine == "rscript": + if RSCRIPT_EXISTS is None: + raise FileNotFoundError( + "R is either not installed on this system or its " + "bin folder is not in Path environment variable." + ) + + if pyarrow is None and mode in ["parquet", "feather"]: + raise ImportError("pyarrow not found, please install for this mode.") + + RDataReader = _RscriptParser + else: + raise ValueError(f"{engine} is not a supported engine.") + + rdr = RDataReader( + path_or_buffer, + file_format, + engine, + mode, + select_frames, + rownames, + encoding, + storage_options, + ) + + return rdr.parse_data() + + +def _get_data_from_filepath( + filepath_or_buffer, + encoding, + compression, + storage_options, +) -> Union[str, bytes, Buffer]: + """ + Extract raw R data. + + The method accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, BytesIO) + 3. R data file in ascii or binary content + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. 
+ """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + + if ( + not isinstance(filepath_or_buffer, str) + or is_url(filepath_or_buffer) + or is_fsspec_url(filepath_or_buffer) + or file_exists(filepath_or_buffer) + ): + with get_handle( + filepath_or_buffer, + "rb", + encoding=encoding, + compression=compression, + storage_options=storage_options, + is_text=False, + ) as handle_obj: + filepath_or_buffer = ( + handle_obj.handle.read() + if hasattr(handle_obj.handle, "read") + else handle_obj.handle + ) + else: + raise FileNotFoundError(f"{filepath_or_buffer} file cannot be found.") + + return filepath_or_buffer + + +def _preprocess_data(data) -> Union[io.StringIO, io.BytesIO]: + """ + Convert extracted raw data. + + This method will return underlying data of extracted R data formats. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is bytes that represents the R data. + """ + + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data + + +class _RDataReader: + """ + Internal subclass to parse R data files into dict of DataFrames. + + Parameters + ---------- + path_or_buffer : a valid str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. + + file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + R serialization type. + + engine : {{'pyreadr', 'rscript'}}, default 'pyreadr' + Engine used to parse or read R data. + + mode : {{'csv', 'parquet', 'feather', 'sqlite'}}, default 'csv' + Python and R i/o transfer mode. + + select_frames : list, default None + Selected names of DataFrames to return from R data. + + rownames : bool, default True + Include original rownames in R data frames. + + encoding : str, optional, default 'utf-8' + Encoding of R data. 
+ + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc., + + See also + -------- + pandas.io.rdata._PyReadrParser + pandas.io.rdata._RscriptParser + + Notes + ----- + To subclass this class effectively you must override the following methods:` + * :func:`handle_rownames` + * :func:`parse_data` + + See each method's respective documentation for details on their + functionality. + """ + + def __init__( + self, + path_or_buffer, + file_format, + engine, + mode, + select_frames, + rownames, + encoding, + storage_options, + ) -> None: + self.path_or_buffer = path_or_buffer + self.file_format = file_format.lower() + self.engine = engine + self.mode = mode + self.select_frames = select_frames + self.rownames = rownames + self.encoding = encoding + self.storage_options = storage_options + + def verify_params(self) -> None: + """ + Verify user entries of parameters. + + This method will check the values and types of select parameters + and raise appropriate errors. + """ + + if self.file_format not in ["infer", "rda", "rdata", "rds"]: + raise ValueError( + f"'{self.file_format}' is not a valid value for file_format" + ) + + if ( + self.file_format == "infer" + and isinstance(self.path_or_buffer, str) + and not self.path_or_buffer.lower().endswith((".rda", ".rdata", ".rds")) + ) or (self.file_format == "infer" and not isinstance(self.path_or_buffer, str)): + raise ValueError( + f"Unable to infer file format from file name: {self.path_or_buffer}. " + "Please use known R data type (.rda, .rdata, .rds)." 
+ ) + + if self.file_format == "infer": + self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:] + + if self.mode is not None and self.mode not in [ + "csv", + "feather", + "parquet", + "sqlite", + ]: + raise ValueError(f"'{self.mode}' is not supported value for mode.") + + if self.select_frames is not None and not is_list_like(self.select_frames): + raise TypeError( + f"{type(self.select_frames).__name__} is " + "not a valid type for select_frames" + ) + + def buffer_to_disk(self, tmp_dir: str) -> str: + """ + Convert path or buffer to disk file. + + This method will convert path_or_buffer to temp file + for pyreadr to parse and rscript to import. + """ + + r_temp = os.path.join(tmp_dir, "rdata.rda") + + handle_data = _get_data_from_filepath( + filepath_or_buffer=self.path_or_buffer, + encoding=self.encoding, + compression=None, + storage_options=self.storage_options, + ) + + with _preprocess_data(handle_data) as r_data: + mode = "wb" if isinstance(r_data, io.BytesIO) else "w" + with open(r_temp, mode) as f: + f.write(r_data.read()) + + return r_temp + + def handle_row_names(self) -> DataFrame: + """ + Migrate R rownames to DataFrame index. + + This method will conditionally adjust index to reflect + original R rownames. + """ + + raise AbstractMethodError(self) + + def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: + """ + Parse R data files. + + This method will run engine methods to return a single DataFrame + for rds type or dictionary of DataFrames for RData or rda types. + """ + + raise AbstractMethodError(self) + + +class _PyReadrParser(_RDataReader): + """ + Internal class to parse R data types using third-party + package, pyreadr. 
+ """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.verify_params() + + def handle_rownames(self, df) -> DataFrame: + if not self.rownames: + df = df.reset_index(drop=True) + df.index.name = None + + if self.rownames and df.index.name != "rownames": + df.index.name = "rownames" + if df.index[0] == 0: + df.index += 1 + + return df + + def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: + from pyreadr import read_r + + tz = datetime.now().astimezone().tzinfo + with TemporaryDirectory() as tmp_dir: + r_temp = self.buffer_to_disk(tmp_dir) + rdata = read_r(r_temp, use_objects=self.select_frames, timezone=tz) + + rdata = {k: self.handle_rownames(df) for k, df in rdata.items()} + rdata = rdata[None] if self.file_format == "rds" else dict(rdata) + + return rdata + + +class _RscriptParser(_RDataReader): + """ + Internal class to parse R data types using temp script and data + files and command line call to installed Rscript executable. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.verify_params() + + def handle_rownames(self, df) -> DataFrame: + if self.rownames: + df = df.set_index("rownames") + else: + df = df.drop(["rownames"], axis=1) + + return df + + def run_rscript(self, tmp_dir, r_batch, cmds) -> str: + """ + Run R script at command line. + + This method will call subprocess.Popen to run R script that + saves temp data and meta files and returns R's console output. 
+ """ + + with open(cmds[1], "w") as f: + f.write(r_batch) + + p = subprocess.Popen( + cmds, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=tmp_dir, + ) + output, error = p.communicate() + if len(error) != 0: + raise RScriptError(error.decode(self.encoding)) + + return output.decode(self.encoding) + + def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: + self.r_to_py_types = { + "logical": "bool", + "integer": "int64", + "numeric": "float64", + "factor": "category", + "character": "str", + "Date": "date", + "POSIXct": "date", + } + + switch_board = { + "rda": { + "csv": self.read_rdata_csv, + "feather": self.read_rdata_feather, + "parquet": self.read_rdata_parquet, + "sqlite": self.read_rdata_sqlite, + }, + "rdata": { + "csv": self.read_rdata_csv, + "feather": self.read_rdata_feather, + "parquet": self.read_rdata_parquet, + "sqlite": self.read_rdata_sqlite, + }, + "rds": { + "csv": self.read_rds_csv, + "feather": self.read_rds_feather, + "parquet": self.read_rds_parquet, + "sqlite": self.read_rds_sqlite, + }, + } + + rdata: Union[DataFrame, Dict[str, DataFrame], None] + rdata = switch_board[self.file_format][self.mode]() + + rdata = ( + {k: v for k, v in rdata.items() if k in self.select_frames} + if self.select_frames + else rdata + ) + rdata = {k: self.handle_rownames(df) for k, df in rdata.items()} + + rdata = rdata or None + rdata = ( + rdata["r_df"] + if (self.file_format == "rds" and rdata is not None) + else rdata + ) + + if rdata is None: + raise ValueError( + "No actual data frame or coercible data frames found in R data file." + ) + return rdata + + def read_rdata_csv(self) -> Dict[str, DataFrame]: + """ + Read R rda data via IO csv. + + This method will call `load` and `write.csv` in R to export all + data frames and metadata into temp csv files for pandas `read_csv`. . 
+ """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + load(args[1], temp_env <- new.env()) + + env_list <- as.list.environment(temp_env) + rm(temp_env) + + output_data_meta <- function(obj, nm) { + df <- tryCatch(data.frame(obj, + check.names=FALSE, + stringsAsFactors=FALSE + ), error=function(e) NULL) + + if (!is.null(df)) { + cat(nm, "\n", sep="") + + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + writeLines( + c(paste0(colnames(df), collapse=","), + paste0(sapply(df, + function(x) class(x)[1]), collapse=",")), + con=paste0("meta_", nm, ".txt") + ) + + write.csv(df, paste0("data_", nm, ".csv"), + row.names=FALSE, na="") + } + } + + output <- mapply(output_data_meta, env_list, names(env_list)) + """ + + with TemporaryDirectory() as tmp_dir: + r_file = os.path.join(tmp_dir, "r_batch.R") + rda_file = self.buffer_to_disk(tmp_dir) + + output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + + oline: str + dfs: Dict[str, DataFrame] = {} + for oline in filter(None, output.strip().split("\n")): + with open( + os.path.join(tmp_dir, f"meta_{oline}.txt"), + encoding=self.encoding, + ) as f: + flines = [fline.strip() for fline in f] + + r_hdrs: List[List[str]] = [h.split(",") for h in flines] + py_types = {n: self.r_to_py_types[d] for n, d in zip(*r_hdrs)} + + dt_cols = [col for col, d in py_types.items() if d == "date"] + py_types = {k: v for k, v in py_types.items() if v != "date"} + + try: + dfs[oline] = read_csv( + os.path.join(tmp_dir, f"data_{oline}.csv"), + dtype=py_types, # type: ignore[arg-type] + parse_dates=dt_cols, + encoding=self.encoding, + ) + except (ParserError, ValueError): + dfs[oline] = read_csv( + os.path.join(tmp_dir, f"data_{oline}.csv"), + encoding=self.encoding, + ) + + return dfs + + def read_rdata_feather(self) -> Dict[str, DataFrame]: + """ + Read R rda data via IO feather. 
+ + This method will call `readRDS` and `write_feather` in R to export all + data frames into temp feather files for pandas `read_feather`. + """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + + args <- commandArgs(trailingOnly=TRUE) + + load(args[1], temp_env <- new.env()) + env_list <- as.list.environment(temp_env) + rm(temp_env) + + output_data_meta <- function(obj, nm) { + df <- tryCatch(data.frame(obj, + check.names=FALSE, + stringsAsFactors=FALSE + ), error=function(e) NULL) + + if (!is.null(df)) { + cat(nm, "\n", sep="") + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + arrow::write_feather(df, paste0("data_", nm, ".feather")) + } + } + + output <- mapply(output_data_meta, env_list, names(env_list)) + """ + + with TemporaryDirectory() as tmp_dir: + r_file = os.path.join(tmp_dir, "r_batch.R") + rda_file = self.buffer_to_disk(tmp_dir) + + output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + + oline: str + dfs: Dict[str, DataFrame] = { + oline: read_feather(os.path.join(tmp_dir, f"data_{oline}.feather")) + for oline in filter(None, output.strip().split("\n")) + } + + return dfs + + def read_rdata_parquet(self) -> Dict[str, DataFrame]: + """ + Read R rda data via IO parquet. + + This method will call `load` and `write_parquet` in R to export all + data frames into temp parquet files for pandas `read_parquet`. 
+ """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + + args <- commandArgs(trailingOnly=TRUE) + + load(args[1], temp_env <- new.env()) + env_list <- as.list.environment(temp_env) + rm(temp_env) + + output_data_meta <- function(obj, nm) { + df <- tryCatch(data.frame(obj, + check.names=FALSE, + stringsAsFactors=FALSE + ), error=function(e) NULL) + + if (!is.null(df)) { + cat(nm, "\n", sep="") + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + arrow::write_parquet(df, paste0("data_", nm, ".parquet")) + } + } + + output <- mapply(output_data_meta, env_list, names(env_list)) + """ + + with TemporaryDirectory() as tmp_dir: + r_file = os.path.join(tmp_dir, "r_batch.R") + rda_file = self.buffer_to_disk(tmp_dir) + + output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + + oline: str + dfs: Dict[str, DataFrame] = { + oline: read_parquet(os.path.join(tmp_dir, f"data_{oline}.parquet")) + for oline in filter(None, output.strip().split("\n")) + } + + return dfs + + def read_rdata_sqlite(self) -> Dict[str, DataFrame]: + """ + Read R rda data via IO sql. + + This method will call `load` and `dbWriteTable` in R to export all + data frames into a temp SQLite database for pandas `read_sql`. 
+ """ + import sqlite3 + + r_batch = """ + suppressPackageStartupMessages(library(RSQLite)) + + args <- commandArgs(trailingOnly=TRUE) + + load(args[1], temp_env <- new.env()) + env_list <- as.list.environment(temp_env) + rm(temp_env) + + conn <- dbConnect(RSQLite::SQLite(), "r_data.db") + output_data_meta <- function(obj, nm) { + df <- tryCatch(data.frame(obj, + check.names=FALSE, + stringsAsFactors=FALSE + ), error=function(e) NULL) + + if (!is.null(df)) { + cat(nm, "\n", sep="") + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + dbWriteTable(conn, paste0("data_", nm), df, row.names=FALSE) + } + } + + output <- mapply(output_data_meta, env_list, names(env_list)) + dbDisconnect(conn) + """ + + with TemporaryDirectory() as tmp_dir: + r_db = os.path.join(tmp_dir, "r_data.db") + r_file = os.path.join(tmp_dir, "r_batch.R") + rda_file = self.buffer_to_disk(tmp_dir) + + output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + + oline: str + conn = sqlite3.connect(r_db) + dfs: Dict[str, DataFrame] = { + oline: read_sql(f"SELECT * FROM data_{oline}", conn) + for oline in filter(None, output.strip().split("\n")) + } + conn.close() + + return dfs + + def read_rds_csv(self) -> Dict[str, DataFrame]: + """ + Read R rds data via IO csv. + + This method will call `readRDS` and `write.csv` in R to export single + data frame and metadata into temp csv files for pandas `read_csv`. 
+ """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + + raw <- readRDS(args[1]) + df <- tryCatch(data.frame(raw, + check.names=FALSE, + stringsAsFactors=FALSE + ), error = function(e) NULL) + + if(!is.null(df)) { + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + write.csv(df, file=args[2], row.names=FALSE) + + cat(paste0(colnames(df), collapse=","),"|", + paste0(sapply(df, function(x) + class(x)[1]), collapse=","), + sep="") + } + """ + + dfs: Dict[str, DataFrame] = {} + with TemporaryDirectory() as tmp_dir: + r_data = os.path.join(tmp_dir, "r_data.csv") + r_file = os.path.join(tmp_dir, "r_batch.R") + + rds_file = self.buffer_to_disk(tmp_dir) + output = self.run_rscript( + tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data] + ) + + if os.path.isfile(r_data): + r_hdrs = [h.split(",") for h in output.split("|")] + n: str + py_types = {n: self.r_to_py_types[d] for n, d in zip(*r_hdrs)} + + dt_cols = [col for col, d in py_types.items() if d == "date"] + py_types = {k: v for k, v in py_types.items() if v != "date"} + + try: + dfs["r_df"] = read_csv( + r_data, + dtype=py_types, # type: ignore[arg-type] + parse_dates=dt_cols, + encoding=self.encoding, + ) + except (ParserError, ValueError): + dfs["r_df"] = read_csv(r_data) + + return dfs + + def read_rds_feather(self) -> Dict[str, DataFrame]: + """ + Read R rds data via IO feather. + + This method will call `readRDS` and `write_feather` in R to export single + data frame into a temp feather file for pandas `read_feather`. 
+ """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + raw <- readRDS(args[1]) + df <- tryCatch(data.frame(raw, + check.names=FALSE, + stringsAsFactors=FALSE + ), error = function(e) NULL) + + if(!is.null(df)) { + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + arrow::write_feather(df, args[2]) + } + """ + + with TemporaryDirectory() as tmp_dir: + r_data = os.path.join(tmp_dir, "r_data.feather") + r_file = os.path.join(tmp_dir, "r_batch.R") + + rds_file = self.buffer_to_disk(tmp_dir) + self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) + + dfs: Dict[str, DataFrame] = ( + {"r_df": read_feather(r_data)} if os.path.isfile(r_data) else {} + ) + + return dfs + + def read_rds_parquet(self) -> Dict[str, DataFrame]: + """ + Read R rds data via IO parquet. + + This method will call `readRDS` and `write_parquet` in R to export + single data frame into a temp parquet file for pandas `read_parquet`. + """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + raw <- readRDS(args[1]) + df <- tryCatch(data.frame(raw, + check.names=FALSE, + stringsAsFactors=FALSE + ), error = function(e) NULL) + + if(!is.null(df)) { + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + arrow::write_parquet(df, args[2]) + } + """ + + with TemporaryDirectory() as tmp_dir: + r_data = os.path.join(tmp_dir, "r_data.parquet") + r_file = os.path.join(tmp_dir, "r_batch.R") + + rds_file = self.buffer_to_disk(tmp_dir) + self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) + + dfs: Dict[str, DataFrame] = ( + {"r_df": read_parquet(r_data, engine="pyarrow")} + if os.path.isfile(r_data) + else {} + ) + + return dfs + + def read_rds_sqlite(self) -> Dict[str, DataFrame]: + """ + Read R rds data via IO sql. 
+ + This method will call `readRDS` and `dbWriteTable` in R to export + single data frame into a temp SQLite database for pandas `read_sql`. + """ + import sqlite3 + + r_batch = """ + suppressPackageStartupMessages(library(RSQLite)) + args <- commandArgs(trailingOnly=TRUE) + + raw <- readRDS(args[1]) + df <- tryCatch(data.frame(raw, + check.names=FALSE, + stringsAsFactors=FALSE + ), error = function(e) NULL) + + if(!is.null(df)) { + conn <- dbConnect(RSQLite::SQLite(), args[2]) + df <- data.frame(rownames = row.names(df), df, + check.names=FALSE, + stringsAsFactors=FALSE) + dbWriteTable(conn, "rdata", df, row.names=FALSE) + dbDisconnect(conn) + } + """ + + dfs: Dict[str, DataFrame] = {} + with TemporaryDirectory() as tmp_dir: + r_data = os.path.join(tmp_dir, "r_data.db") + r_file = os.path.join(tmp_dir, "r_batch.R") + + rds_file = self.buffer_to_disk(tmp_dir) + self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) + + if os.path.isfile(r_data): + conn = sqlite3.connect(r_data) + dfs["r_df"] = read_sql("SELECT * FROM rdata", conn) + conn.close() + + return dfs + + +class RDataWriter: + """ + Subclass to write pandas DataFrames into R data files. + + Parameters + ---------- + path_or_buffer : a valid str, path object or file-like object + Any valid string path is acceptable. + + file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + R serialization type. + + engine : {{'rscript','pyreadr'}}, default 'utf-8' + Engine used to write R data. + + mode : {{'csv', 'parquet', 'feather'}}, default 'csv' + Python and R i/o transfer mode. + + other_frames : list, optional + Other DataFrames to be included in rda (not rds) files + that can contain multiple objects. + + rda_names : list, default ["pandas_dataframe"] + Names for all exported objects in rda file. + + index : bool, default True + Include index or MultiIndex in output as separate columns. + + ascii : bool, default False + Write data in ASCII representation. 
+ + compress : bool or {{'gzip', 'bzip2', 'xz'}}, default 'gzip' + Compression types for R data. For pyreadr engine, only gzip + is supported. Use False for uncompressed files. + + encoding : str, optional, default 'utf-8' + Encoding of R data. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc. + + See also + -------- + pandas.io.rdata.PyReadrWriter + pandas.io.rdata.RscriptWriter + + Notes + ----- + To subclass this class effectively you must override the following methods:` + * :func:`write_data` + + See each method's respective documentation for details on their + functionality. + """ + + def __init__( + self, + frame: DataFrame, + path_or_buffer: FilePathOrBuffer, + file_format: str = "infer", + engine: str = "rscript", + mode: str = "csv", + other_frames: Optional[List[DataFrame]] = None, + rda_names: List[str] = ["pandas_dataframe"], + index: bool = True, + ascii: bool = False, + compress: Union[bool, str] = "gzip", + encoding: str = "utf-8", + storage_options: StorageOptions = None, + ) -> None: + self.frame = frame + self.path_or_buffer = path_or_buffer + self.file_format = file_format.lower() + self.engine = engine + self.mode = mode + self.other_frames = other_frames + self.rda_names = rda_names + self.index = index + self.ascii = ascii + self.compress = compress + self.encoding = encoding + self.storage_options = storage_options + + def verify_params(self) -> None: + """ + Verify user entries of parameters. + + This method will check the values and types of select parameters + and raise appropriate errors. + """ + + if self.file_format not in ["infer", "rda", "rdata", "rds"]: + raise ValueError( + f"{self.file_format} is not a valid value for file_format." 
+ ) + + if ( + self.file_format == "infer" + and isinstance(self.path_or_buffer, str) + and not self.path_or_buffer.lower().endswith((".rda", ".rdata", ".rds")) + ): + raise ValueError( + f"Unable to infer file format from file name: {self.path_or_buffer}" + "Please use known R data type (.rda, .rdata, .rds)." + ) + + if self.file_format == "infer" and isinstance(self.path_or_buffer, str): + self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:] + + if self.mode is not None and self.mode not in [ + "csv", + "feather", + "parquet", + "sqlite", + ]: + raise ValueError(f"{self.mode} is not supported value for mode.") + + if self.other_frames is not None and not is_list_like(self.other_frames): + raise TypeError( + f"{type(self.other_frames).__name__} is not " + " a valid type for other_frames." + ) + elif self.other_frames is not None: + for df in self.other_frames: + if not isinstance(df, DataFrame): + raise TypeError( + "One or more of the objects in " + "other_frames is not a DataFrame." + ) + + if self.rda_names is not None and not is_list_like(self.rda_names): + raise TypeError( + f"{type(self.rda_names).__name__} is not a valid type for rda_names." + ) + + if self.compress is not None and self.compress not in [ + True, + False, + "gzip", + "bzip2", + "xz", + ]: + raise ValueError(f"{self.compress} is not a supported value for compress.") + + def disk_to_buffer(self, r_file: str) -> None: + """ + Save temp file to path or buffer. + + This method will convert written R data to path_or_buffer. + """ + + with open(r_file, "rb") as rdata: + with get_handle( + self.path_or_buffer, + "wb", + compression=None, + storage_options=self.storage_options, + is_text=False, + ) as handles: + handles.handle.write(rdata.read()) # type: ignore[arg-type] + + return None + + def write_data(self) -> None: + """ + Write DataFrames to R data files. + + This method will run engine methods to export DataFrames + to R data files. 
+ """ + + raise AbstractMethodError(self) + + +class PyReadrWriter(RDataWriter): + """ + Main class called in `pandas.core.frame` to write DataFrame to R + data types using third-party package, pyreadr. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.verify_params() + + def write_data(self) -> None: + from pyreadr import ( + write_rdata, + write_rds, + ) + + self.frame = ( + self.frame.reset_index() + if self.index + else self.frame.reset_index(drop=True) + ) + + with TemporaryDirectory() as tmp_dir: + r_temp = os.path.join(tmp_dir, "rdata.rda") + + if self.file_format in ["rda", "rdata"]: + write_rdata( + path=r_temp, + df=self.frame, + df_name=self.rda_names[0], + compress=self.compress, + ) + elif self.file_format == "rds": + write_rds(path=r_temp, df=self.frame, compress=self.compress) + + self.disk_to_buffer(r_temp) + + return None + + +class RscriptWriter(RDataWriter): + """ + Main class called in `pandas.core.frame` to write DataFrame(s) to R + data types using command line to Rscript. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.verify_params() + self.handle_objects() + + def handle_objects(self) -> None: + + self.all_frames = ( + [self.frame] + self.other_frames if self.other_frames else [self.frame] + ) + + if len(self.rda_names) != len(self.all_frames): + raise ValueError( + f"Length of {self.rda_names} does not match number " + "of current DataFrame and other_frames" + ) + + return None + + def run_rscript(self, tmp_dir, r_batch, cmds) -> None: + """ + Run R script at command line. + + This method will call subprocess.Popen to run R script + and return only non-empty error R output in console. 
+ """ + + with open(cmds[1], "w") as f: + f.write(r_batch) + + a = subprocess.Popen( + cmds, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=tmp_dir, + ) + output, error = a.communicate() + if len(error) != 0: + raise RScriptError(error.decode(self.encoding)) + + return None + + def write_data(self) -> None: + self.py_to_r_types = { + "int32": "integer", + "int64": "integer", + "float64": "numeric", + "category": "factor", + "object": "character", + "bool": "logical", + "datetime64[ns]": "POSIXct", + } + + switch_board = { + "rda": { + "csv": self.write_rdata_csv, + "feather": self.write_rdata_feather, + "parquet": self.write_rdata_parquet, + "sqlite": self.write_rdata_sqlite, + }, + "rdata": { + "csv": self.write_rdata_csv, + "feather": self.write_rdata_feather, + "parquet": self.write_rdata_parquet, + "sqlite": self.write_rdata_sqlite, + }, + "rds": { + "csv": self.write_rds_csv, + "feather": self.write_rds_feather, + "parquet": self.write_rds_parquet, + "sqlite": self.write_rds_sqlite, + }, + } + + switch_board[self.file_format][self.mode]() + + return None + + def write_rdata_csv(self) -> None: + """ + Write R rda data via IO csv. + + This method will export one or more DataFrames into temp data + and metadata csv files and call `read.csv` and `save` in R. 
+ """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + + py_names <- strsplit(args[1], ",")[[1]] + + for(obj in py_names) { + meta <- paste0("meta_", obj, ".txt") + r_types <- strsplit(readLines(meta, n=-1, + warn=FALSE), ",")[[1]] + + data <- paste0("data_", obj, ".csv") + df <- tryCatch( + read.csv(data, colClasses=r_types), + error = function(e) read.csv(data) + ) + assign(obj, df) + rm(df) + } + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + dfs <- names(Filter(is.data.frame, mget(ls()))) + save(list=dfs, file=args[2], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + for nm, df in zip(self.rda_names, self.all_frames): + + data_file = os.path.join(tmp_dir, f"data_{nm}.csv") + meta_file = os.path.join(tmp_dir, f"meta_{nm}.txt") + r_code = os.path.join(tmp_dir, "rbatch.R") + r_temp = os.path.join(tmp_dir, "rdata.rda") + + df = df.reset_index() if self.index else df + df.to_csv(data_file, index=False) + + with open(meta_file, "w") as f: + f.write( + ",".join( + self.py_to_r_types[p] + for p in df.dtypes.astype(str).tolist() + ) + ) + + cmds = [ + "Rscript", + r_code, + ",".join(self.rda_names), + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + return None + + def write_rdata_feather(self) -> None: + """ + Write R rda data via IO feather. + + This method will export one or more DataFrames into temp + feather files and call `read_feather` and `save` in R. 
+ """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + py_names <- strsplit(args[1], ",")[[1]] + + for(obj in py_names) { + data <- paste0("data_", obj, ".feather") + df <- arrow::read_feather(data) + assign(obj, df) + rm(df) + } + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + dfs <- names(Filter(is.data.frame, mget(ls()))) + save(list=dfs, file=args[2], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + for nm, df in zip(self.rda_names, self.all_frames): + + data_file = os.path.join(tmp_dir, f"data_{nm}.feather") + r_code = os.path.join(tmp_dir, "rbatch.R") + r_temp = os.path.join(tmp_dir, "rdata.rda") + + df = df.reset_index() if self.index else df.reset_index(drop=True) + df.to_feather(data_file) + + cmds = [ + "Rscript", + r_code, + ",".join(self.rda_names), + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + def write_rdata_parquet(self) -> None: + """ + Write R rda data via IO parquet. + + This method will export one or more DataFrames into temp + parquet files and call `read_parquet` and `save` in R. 
+ """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + py_names <- strsplit(args[1], ",")[[1]] + + for(obj in py_names) { + data <- paste0("data_", obj, ".parquet") + df <- arrow::read_parquet(data) + assign(obj, df) + rm(df) + } + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + dfs <- names(Filter(is.data.frame, mget(ls()))) + save(list=dfs, file=args[2], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + for nm, df in zip(self.rda_names, self.all_frames): + + data_file = os.path.join(tmp_dir, f"data_{nm}.parquet") + r_code = os.path.join(tmp_dir, "rbatch.R") + r_temp = os.path.join(tmp_dir, "rdata.rda") + + df = df.reset_index() if self.index else df + df.to_parquet(data_file, index=False) + + cmds = [ + "Rscript", + r_code, + ",".join(self.rda_names), + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + def write_rdata_sqlite(self) -> None: + """ + Write R rda data via IO sql. + + This method will export one or more DataFrames into a temp + SQLite database and call `dbReadTable` and `save` in R. 
+ """ + import sqlite3 + + r_batch = """ + suppressPackageStartupMessages(library(RSQLite)) + args <- commandArgs(trailingOnly=TRUE) + + conn <- dbConnect(RSQLite::SQLite(), args[1]) + py_names <- strsplit(args[2], ",")[[1]] + + for(obj in py_names) { + data <- paste0("data_", obj) + df <- dbReadTable(conn, data) + assign(obj, df) + rm(df) + } + dbDisconnect(conn) + + r_ascii <- as.logical(args[4]) + r_compress <- ifelse(args[5] %in% c("True", "False"), + as.logical(args[5]), + args[5]) + + dfs <- names(Filter(is.data.frame, mget(ls()))) + save(list=dfs, file=args[3], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + r_db = os.path.join(tmp_dir, "rdata.db") + conn = sqlite3.connect(r_db) + + for nm, df in zip(self.rda_names, self.all_frames): + r_code = os.path.join(tmp_dir, "rbatch.R") + r_temp = os.path.join(tmp_dir, "rdata.rda") + + df = df.reset_index() if self.index else df + df.to_sql(f"data_{nm}", conn, index=False) + + conn.close() + cmds = [ + "Rscript", + r_code, + r_db, + ",".join(self.rda_names), + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + def write_rds_csv(self) -> None: + """ + Write R rds data via IO csv. + + This method will export a single DataFrame into temp csv + data and call `read.csv` and `saveRDS` in R. 
+ """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + py_data <- args[1] + r_types <- strsplit(args[2], ",")[[1]] + + df <- tryCatch( + read.csv(py_data, colClasses=r_types), + error = function(e) read.csv(py_data) + ) + + r_ascii <- as.logical(args[4]) + r_compress <- ifelse(args[5] %in% c("True", "False"), + as.logical(args[5]), + args[5]) + + saveRDS(df, file=args[3], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + r_code = os.path.join(tmp_dir, "rbatch.R") + py_data = os.path.join(tmp_dir, "pydata.csv") + r_temp = os.path.join(tmp_dir, "rdata.rds") + + py_df = self.frame.reset_index() if self.index else self.frame + r_types = ",".join(py_df.dtypes.astype(str).replace(self.py_to_r_types)) + + py_df.to_csv(py_data, index=False) + + cmds = [ + "Rscript", + r_code, + py_data, + r_types, + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + return None + + def write_rds_feather(self) -> None: + """ + Write R rds data via IO feather. + + This method will export a single DataFrame into a temp + feather file to call `read_feather` and `saveRDS` in R. 
+ """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + df <- arrow::read_feather(args[1]) + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + saveRDS(df, file=args[2], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + r_code = os.path.join(tmp_dir, "rbatch.R") + py_data = os.path.join(tmp_dir, "pydata.feather") + r_temp = os.path.join(tmp_dir, "rdata.rds") + + py_df = ( + self.frame.reset_index() + if self.index + else self.frame.reset_index(drop=True) + ) + + py_df.to_feather(py_data) + + cmds = [ + "Rscript", + r_code, + py_data, + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + def write_rds_parquet(self) -> None: + """ + Write R rds data via IO parquet. + + This method will export a single DataFrame into a temp + parquet file for `read_parquet` and `saveRDS` in R. + """ + + r_batch = """ + suppressPackageStartupMessages(library(arrow)) + args <- commandArgs(trailingOnly=TRUE) + + df <- arrow::read_parquet(args[1]) + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + saveRDS(df, file=args[2], + ascii=r_ascii, compress=r_compress) + """ + + with TemporaryDirectory() as tmp_dir: + r_code = os.path.join(tmp_dir, "rbatch.R") + py_data = os.path.join(tmp_dir, "pydata.parquet") + r_temp = os.path.join(tmp_dir, "rdata.rds") + + py_df = self.frame.reset_index() if self.index else self.frame + + py_df.to_parquet(py_data, index=False) + + cmds = [ + "Rscript", + r_code, + py_data, + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) + + def write_rds_sqlite(self) -> None: + """ + Write R rds data via IO sql. 
+ + This method will export a single DataFrame into a temp + parquet file for `dbReadTable` and `saveRDS` in R. + """ + import sqlite3 + + r_batch = """ + suppressPackageStartupMessages(library(RSQLite)) + args <- commandArgs(trailingOnly=TRUE) + + conn <- dbConnect(RSQLite::SQLite(), args[1]) + df <- dbReadTable(conn, "pydata") + + r_ascii <- as.logical(args[3]) + r_compress <- ifelse(args[4] %in% c("True", "False"), + as.logical(args[4]), + args[4]) + + saveRDS(df, file=args[2], + ascii=r_ascii, compress=r_compress) + dbDisconnect(conn) + """ + + with TemporaryDirectory() as tmp_dir: + r_code = os.path.join(tmp_dir, "rbatch.R") + py_data = os.path.join(tmp_dir, "pydata.db") + r_temp = os.path.join(tmp_dir, "rdata.rds") + + py_df = self.frame.reset_index() if self.index else self.frame + + conn = sqlite3.connect(py_data) + py_df.to_sql("pydata", conn, index=False) + conn.close() + + cmds = [ + "Rscript", + r_code, + py_data, + r_temp, + str(self.ascii), + str(self.compress), + ] + self.run_rscript(tmp_dir, r_batch, cmds) + + self.disk_to_buffer(r_temp) diff --git a/pandas/tests/io/data/rdata/climate_non_utf8_df.rda b/pandas/tests/io/data/rdata/climate_non_utf8_df.rda new file mode 100644 index 0000000000000000000000000000000000000000..a506806405f5e57cb8bb8a0e57788a8f1384f856 GIT binary patch literal 423 zcmV;Y0a*SYiwFP!000001C3HmOB_KEZTCa73ql}Q!M>PVgy2OG50XXkAS{c5w@`N1 zBnADksp(PJlYh;h;%^Xg$icOGrgtNoOAfU)UG?hK+p3yPZjZ*dW2IDI4b)Kehw`b0 zH`nK1j+Gkpcha!b}D8mh~s0 zizRjz(laNLcF`EJ7!tTbvUk-WCoI4k-92dVuy_e%pkNNZfxn}c%*P-pjX^(EEQt4n z-Kh4b$Q~+=b<$adWk|7ZYcWBH1*YHil{7G;p$c1|iTqdJKfODpm%-9X-`D>e#05tT z4%)&ZLJH>&7T`(#LTsme7wcaQiep@h_&h_~n8NJ}qX(wY)jV-ti{3bzVE~VH*?X^0 zZHGUy*3}gD=1uNvq&t@_cb7)BkDGMHLA|0|AEt!_hZRew4fu%T!{JInzSQ2M{4AHt R-{M}gy#a()t7QNK0084$%B=tZ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/climate_non_utf8_df.rds b/pandas/tests/io/data/rdata/climate_non_utf8_df.rds new file mode 100644 index 
0000000000000000000000000000000000000000..85a65550ad80f68eb9272937fddec42ae81e182c GIT binary patch literal 400 zcmV;B0dM{viwFP!000001C3HWO9Md=J@cgo1i?1oHrgrJ31Sfx#X<<8U_05H=)iu& zxxEOf{A>9s{szIq!Z>@`iy<}^?{;VQ&6{^S``FF0teLg4cGhgmHfulJUmcxFZcl=S zU`G&18!2uIT9GK|2sQ;2zL2EdSm%263mnpl?xG<~?3nUrX>R~DIa@(zq7 zjp_&U?gD9_B+f6fAQ$k7mxWOZ#+B+bq46o#cj27_NjtAK<>&&KB+|Prkpm`Tfo1{< zEKENfX-J%d3$R0~#H@F6q|m9^h=s8#*s93;17r&$_C-+9Ma~d>QPpAq=Mzn>^p#^^ zMolBEh}!e7zdyYfPH$XJwZD`9UBoqebOx%zWI_tZ2Bu(1e!*9_JoLp&iLx**O&s1~ zUh1T6lA;ABx2TzM9E+0aH>&5bP;Wc1Sfsf(l2pfi$yn=K=#9(RS4($JQ`Uwo1S+gw um(vD{TCPrZa|m!eV(GL--)8@MTsw#_wecpO`F#E<%l8kzoFhox0ssJiCc9|> literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/env_data_dfs.rda b/pandas/tests/io/data/rdata/env_data_dfs.rda new file mode 100644 index 0000000000000000000000000000000000000000..07fbef3ecb00decb344bf9f64777d67fdd609630 GIT binary patch literal 7259 zcmaJ@dpHw}+eV2}D5oS_rAU&zlCxDRZ+Wxil;e^j#B!XCZ73;XBqU}@QclBiJ`Cl2 z8gf2t&SMT6Gn;Mpy}$4K=lkQkzURKK=eh3dzMp@d>$#rm0mg|7{deqJ!@`4Xhw9s) zi^3x5W{VjaMoZHflW`T^EzixpHEezy^42`?UdZCkvtzE2`f~TmO@H>?Ee{Si{n2G; z;&lDCoZ7|mI&Wof zLs{F?dnWS+rY(7)!&1=Id54S$u+-oC&K3rMLp*AdlTgly4U3l^Rz33bgdv`cwbTCr z`|l#uiI#?SIZ#(9rZUiKqwE&e#keP+r$9(Ebdfb6sL>enhfcU4w%Jbsdn$w>_ zZPL)P3#p$x)F)-owj&voInxZAsinKHFNG^KNRXp9{-pw9GZ?&exUD*?&e#2COxrV% zVva8`hadEg)qZ@<)BUHi`%l4%9Deva7WMdAsrI;5>oW|dKkImoiNWnK?{};&?eRhP zpOEw^W3LR>C2Vf(bkx8 zHhpZO=h2gqthVW@j&Ar`J}*+uPgiMQ7SBSD-!#ej1HB3?TRR&#^HvKsyvIg&$Hw(Y z6?FA!t4#+y-BF)FKf20WFKN=ti|!sb8*XUW{fYY|;kmdW_U)0c<6zyIHI6voZI%N$zv#f-XVebtK4Afws{jU3xr%FrC^!q-3EJwe#h(OX~M+aqRVZ!J2z z*6w=gu%pM&t^}XdbUkiT=j7C~|Jz*rQwU3efNGl1gS_as)Cnr~iZ3kIeo63{Ea5YL z-81JhB^qR0+$y?@Cs?&3WIY1b!LlB82I)a`w)&fB-QRQ-T*qO8ZI3m9 zbInS}l=334uaKeX;^38vB?^Sg$?4B)DzX~Naya`5{%b`}dY9n+g|+85GV?lPyE#um z-DtNNUyl-K8%s~JjvVB>Y~;4^li&Rf@8TRzPX=9~UG;nRaJySJaJn|diCn#9?o|sq z%eHHU{Ila#*`r_0)Fmuetp18Y1-)`u+HlyqC9n*fcIq=Jr7+aGDN}o2{`@xjXJ_0& zYU;B4w=o;j%Gu>)yp9jlWOmhF!PTRe+d4sT?Y%oe-d@P#qZa)nTKp4vB!9aa~KURy2 
zaah(&npj=Ao}7ay)~W5^n(VAt8r@&em`T=;lYXs5Cy}Mre%Q zcz2+Q3VQj-2qBC~Irp#eFjEjL-`Ql{8*wm(b@Ab!3F6lRsJ{S2v6^v1c7>%~bT%bz z-vOmxMsjAetk|F?^IL6_LED-uJpEoUYo<+3d-=`J=6ds>V69KY?**RMW=4~`4#;nJ z+rtLBr-e5d65N`BYF_vIokG};ImoIJayI)%#cBL;f$V@N9l z%m0n)%Kqw(++f_WAavZUO{VA)_J0tjgq`z}QU?7T-w6k|>U*cpeSaFCoN8j}*sJeV zJ@<9tUjnjpY|{6Np8I<5xVi7FzBi5qSYO{+M{WG?Bw*VaZ)Wvz-YnN+vC2p+E`|skRaIXXDp-hYSW~bLj={e#YbzV!1Xjo>_l);@7lVA%gD-Pu}`oc z3_`83TMBV02(0Da7%4Zb{EuZytTrt7XiW(ugN%X02Jcq*L06zj{6{Pgm)!$A``R-{svzhF0zWqgUz8ZK(eH z2(WnxcZ^xP_kX>1P;Hp0no*L;!?EwO*SJK!oDAA)cWpt#^{-OU6uqn23!=j~=L`g( zcyJk578FT|3LK0uQQhON&wykEEO8+jnaSmMf-yQ-QU>zrH2~H2TV}a!5UQt5Zy6vB zoWBU@!TcbvHDX+@7#EU$e^$9V)-)23X@-_XI2|W9SIM(|6k8olpMCi?H8*`(ljPn8 zoFk3>6+GXU5Er7q`6eMzNn5k$*M5BITn^Oe#W?IvvU=olHeS37Vv zbUJo&T-A{7*fhAJ$RY55S6hNkvq?kamoJvwPL9Cs4?0J3FIaTx(r3QcpNdSbn#wj3 zHXbuuT|X-NarB5Jk&$`yCU99P)SpT~;6vDi-J3-1)y9>;Py2|X27lxI$3*!RW4*kJ z8|H$d7=r*CVo@h&sxuy5MDIVuAFvQXIL?6i`w`Eh@pkOL4;zH5eI8jk!dIne2osA) zDOJhjNF_A6l&Bw25du21w}}}B&gH6g=tNE)8%3V1Da<-c9+Ip(9wOi7T5c*s)na76 z)QWU%u@`|)EKzk&8zoPj!i?Tan4BBMZfnq!uW4-pwdp>{w#WkVTBu1HNE$>MiZXUD zG^*`Q3VY|187$id{w*0{y88DE&6AuXRHz%j9egt>V;e{siJEkZP^(S2;Tbo4aQDsh z1|<#R9Z5dVk``^?Im*Is^}{4KF;o~86YIy=$G76>&3i7n;g8bI6Wwq2l`0tlxbfG_ z_lH|iZYRO^!=uv`nx|FTO%9{oDA7so`?-=nrkj$4OU>=%^p#4BM_7QKu?jbMJ5a12K>ES!l#A5@Yyc^O+QjcN6Y^fVi>Co#uYayg4$^IHW z$wIn%e~A(Zafl)&LV7xL4Fg0k##{n%zXKcgjk@b>`n-mKHW`G53{f#Y=QBaEo;&kP zO`lqH5Xnw@DOH1P|JAk;HTMz#4BAah2Gc)x#Z4+DnahMwTc+z8y{_SotOmMLcOHwm zF6t%JAezWRlZfG%wC%C;yyiG0ymaYy>ySJC1UaJNM8CZX-;rr8g^j1y1FCf|Hyq#~ zZYd!A8d0=HYq3nkyz?(ael*c>cZgEtHf1MNc?fAK43!J9>Zp1!Zzx)4{h4sAO70y# zBJ1vI!%2fwod!wX09cv5a!zWuoLzf@RgIt~yRJ0Zt06wxAHi-r7xc%_->e@f)@{!_ zP;DvfryD-f2$q}M4ohJ_ahBtyJ`&4qEQ)COYcZvHwL#_}kKONs#*imIOP7737^{Ny-Z*1VU_tcD32B}~e ztAGquB){Q52)wD^-z&RmIVzNwInjrj`(%gR&dps&1dF3(9=S@_ z>|^Ih2Mm}>M`V3w$*k?$lxya^CzZ3Yhg7Y_2OuY}_Aw(Id?j6Ayqr}dE9%>3@J;`NtZzGY;+t=-#T3VCor#i~I!b-uEF%Ja$T&=w z8+lc-PpPd$5Z&){93EW?R`NU#5Ox(06K?{GZoY@8M?z?@2>aK_v_`aOfEJ8h5uo{W 
zste+UKL4xlAWD&S-@u!A{0TUXfqu@|mSk^m2&^4?H=FPYl}icGjPAQTfB&s% zLHTqNk;gbYZ_~}YLprVBY#&hHEU-?yu1iKSAx12XHDE2^D`t;K31_I&^I!Hw(?3@8HK1mt* zy`DbBO`H8B#9H)3Z)r3LgT;}rev1WhnSeaDG+bTs%`y_gHQ+w8;g+zF3!u+xU+P|N z9_fTW0*zPZ)a~2~g_Ogk7VJOdHA#MH*Xaz+5VJce?BcfIBTM|O*JM5U1$8*Q)1pNd z{TU}euWUX25B&Oq1Ig(j=Q#DC2JYqLM2Xo#q;HlD=iUO&L7}MI;=I-&7CREtBB0srj%N%9%cXTL`WF|l>i{Z7XOANRW+6n;yCz(op5$~4q3ODM;VtCuEZCZBG;((LZmJNFYw^@D;O?0`+09Bi ziI|mF95R2dVJ&b~cRN4uo?mnyS$`|J5Mm+5I%V*UE}_oE-Z1@vttmtu#Qyp`m+-QBytt+ZNnq^3x4I?IaRyO)5r!QR}?`*-k%P)=G8 zkcYyD!R})DJ{=&RhHb(oY;&i*Y1fKl)Nf%t6q5L>k_S_wb?o{N6DW@Q)Pf^nwn*#X zj92s=@KEJh;DUO1Ea-z+j3jRMz-}x!DpuiS!&=z0#9(qEZ$E9MuVJ}OaDJmO^lal^ zvA#J2EW{D}AZAsU%eI_rq?rSI3Y8X&Y+&N?v6WHm>{B7Cre>e=^ey0ju_Q8LUTQ6L zb!?3qHq43)8Qfs{nusE9kd$WO;$bS#M+*ftRyR5FEmoa@=Ox`y`#GaLQJ(sG>?yPp zK62x7i2QK)ZoZT{`5W&wwhxsX{-y`_88!rMZni+F|K^=S>3^+xziB|v+f7K>$0|U4 zD1x1ac8TrAo=I@N1@5u zWN;HRc~+}sEVeJ#enD;xF1dOkc50NDLs{Y9WDR!?;UmeOvp4x=4GWnyEsJz!MWhk9E5ZT7rpVh$6^kEWC*EQo!m zOd%g~hr01_MPe97vfZWk#8q&B40bfG++uN}rewHarSc1|>hznseTcqWeb-k{e@6V+ z+@eU7n2L-ODPZoyYaEA}$cE26EbCUQXtvgO{wFDOowMPAG`I?aWoCnF^s%Ymm`+%#!Jwy6}A$9uo>B;QB(x~ELg)$5q( zUp2NSJzX`MXT+S+Y+U2{-{g0_8OixecG)o}Q23eZm7IKyk!zksD{Y-pQRds_>LhC| zLz0hf$Z2+`$~Z53(}z8Af^MS@>d;@2^f(m$quB8+(9V?LyzSL$>$Z{7N$}31yady` z((`;2nJJIZ3-nyXZFN+sEHcH2{(SOl&7EAzY1oi|3gn`Ua;HL|ZM{|aa`g%zfAFDU zlDRs<8hK+o?ils{)OEPv@w|?rq3VmCUw*NX)=|7zcKv-M_2auN2^-*jmjlN6GU|OU zXE<(ZodpU7>steDwA*q#*VduMq=y`1+Nga7rF=G zM~AL|UAi!D(@`G9IM;|z#6DAw&u?STV-|34_3GHcxL&go!S@DAG+A{~mu8ZPY+UT` zFJK7|?fcX;@eVg)I{KB5srqryz#AxI3fvs2PEd^>L5T=h&NAW8AX|@duCocMs}6HE zc!hB67Rhu*l(jI(Z^|f*nO(*mM$s>03z&Pcko&x2 zHkqw^MS%ga$s*J{jLi5A*^$qBSe*D9t@<4q0jE@>1ui^viwV%$lZC3g+BhG`@)&}W z2+EkcU`uzAfMm3-%fsL&93>8P$X1Qi26d|LBz|u&JX+t0@tjkz#awJ&x83M@TDLNk zFR>(S&ou)Ce8AbeR9y@Dk`~TR#9{v?N~5;Y?f2u77nXuWp{ASN2mB<7OG0`;qwW?< z6UnJHpkYu2nnEAj%U0Q7O(pOvHUg+#0%?5!b#A;tY1f$27h#FXQIpv4ozI9g)j7F4 z4l@XNE{NER>8r{+$jwTrT~njAkK3w- z(3(u2w#hN0!L*x ze&PEHV>mx5bh5?VmDG60q0Fp2qzhAoXvx*%zA}Lvo;IBS 
zuHTb|m&MizR-AJcc&_!HwEB~Vw=lD{hVyoD#wSPxy*gK4jx6qEJ1vQ>?cHs-wghYz*B}EAV0!S7KR9{J7Il|nIR9lst=SMC!N0I^@^!Izzyp9Ie7c1C z0FbhDl~};&+)(cmM;@J*wcWGlfSatQKk2Lg`)^f7yJqyAOd$^MXt^yY_P^ve+@4F> zum)IdMgkVNA7Xxz=N_qPQ)9L9qT$-~-=EFM{FV=G=DoYqB-d<}Z{gbhj}9Z4*o@7g z-G;xHZoC4M?~HG}3oYgaKmTx&S^qKjWvCU|c9~mue2%GX8|9xr62Ay0%d|YqV{tDG zdmV34*q@>t@nZTzz7B+T zjm%l*1xV~U?)Jl=EUu0V{F2eoj%UUCbzaVfb`kQR7OEqwOCCk-H}-+H52BU|$hTrv z@6eMnR&{y}Q}=Az26ii#@IQ-7REG3|{Y>}QyRko@CHvEGspNjHqjZf8&L+#16OR2Q z`Ne)no$2xs3A}|ly{1&SZ|JQVyZRJr0tIe06nC@FhpZq+hb{J-6AQ9Vp`~y~xBGMc zJ>xkOJMQ$co#*UQvc-8^q4CjOJHYs2bk&7Rv%EZ{)X>iT-2`*;Aa!zCmRW3UuT1-Q zW1lSGgPvy9i&Q!?J}z$6$+Hcijz#(h*}K!sXyEi{YA|HfY^9x`B^ zaDhiMpLimS)5DBa1o+o#X;RySG*~dxA6H$KRvXou5Sj41ar2yuar+w0vE+NDd42KI zeJ&>ZMEd(jhGruA00$sf4tqajg|SvUVOgFBeGdd4GM5h83BI^~cPB)uQXuLZt$O_O Iu#nJy0pnhx_y7O^ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/env_data_non_dfs.rda b/pandas/tests/io/data/rdata/env_data_non_dfs.rda new file mode 100644 index 0000000000000000000000000000000000000000..e1b6bbb5e117ea72764d8dcc41593b9f24806b3b GIT binary patch literal 8948 zcmVnitj$_}qEGbJQ>86s*l&G}Wqa+<1OL1_{ zp_Chv7Sf(JlvL76X+bqpB<+f_l{HJ2>}vuiu}`|Y#t3Jik8ImY(iq{~WSOvPr|9)>prtkLW-5hP!Ru8URI|)>{v|kF)&+;1 zkqP_{;8gxm#>_jwm*K@RD$VWlT+^6;E1;SaB%e|S{{i$r2NdhdEySKTCXNlz#Y0^t z&(>;=4ba(1&L#avOeT%MhGUZF7{2enjrT1FnG1UF{3Sp$`jiu^@X=kF=LQ&892g<*}YtHu4Gd}*FcWh9CTJ8ILl8%lMN)3|}lN~||zX3)M>KU(- zy%zdkp~iJk6~+YR@Nx5BZ0Ub+r2OCRnf~-^jdmyKF}6!3caaEB9{t zr?S7~FaLZK|5JbY-`>5K1%f1fMbUp~{2ORoFd*=;J#_onXf$EmJ**qSJ;ryQy+sv^ zyW|&rqmglax6ypM4ffY3E|=}d`b&QD&v)-J`pN&*J}%Rqtfm{R_Ya?c1D`94^%ZTm z9~kQoM&&j9@-N27r`mL`1*=5kzVVr!HGcJGXoqh-HD873(Jw$H-(rNuIS#)fjj14&U020hit1n9QiU_8_M0G)`~S z7ko28>M!}sKWB1`KJ&lMrQzkC%H!sdi6NHA1_4&?`G)jb@BQkPpC^0N2`bG8+X8ilf`(e_4Hc`Ky9MLah z_z#tgHY=-&t(% zN2`r&jl@nUPS~ScSmO(oOT*p>w6Nhgm8>57Om(PgmZoO&{5_NxRiNBBvll8B)muIi zl!l7*EiR{Di9u;<(coQ~OsKL>|K!C39Qn8@CC0XIpu9f0(tcee9IsBb%QjsKg}j+N z>)g|!Tv5puKXL(-(_AUJ&_5L_%a2oN_(dH`rJCP|4QWuyD9JV0tQN}D-d=tpqY{e0 
zw{%@qcor(3&()TW4~5DPC3hm|0;M%J9&`9K11f2%3t1F!K?#vV1t-?%L8au9$|+Vhy45*S8#o;_N;8!5I*e+Eb=j=cPdL zB`(jL&s;)}>W}GFAN>gVp_Ttu!3an;z47Vzyq!=~B;?t+4Ghq`&9G^w;0MsbbENyw z^cv79X`{8%J0HAC|8T6?Bp&jo^zDh~mVknTLb?WB_Mp#e#+_(KHl!3>kUmkv1sZOu z*hg8tf)uF;7lWqtkWZf5U+}Rq6hTv|Gn&?ezT?IBT;~~s&X*b3zvvFAr>c=ilL_eR zTar>%_(90Ex$W#sFA+#|yXmTRJQF?RZoSg-NE&ia9a^p@bQts8Nyr}`;|yMGo$-;! z$Q-?Uy+G1Cu>&-2y}hPsjwPye^yBF{zYKNCmCGazOhL8MiynDIokc^E1*ypT4(JNF zJYS#Z3TUU^AhZ<&VUZ0#>1=umKxYlmCjrkgD=g2OK00}c?)F;n6iC&ZYu!%t{ z_}F=J)9I%RQM=X0Mi<5XsP}}^%l({Lr*=-MMU7GCTDe8*aYp*-3m$FCia;(|2g{&~I zNc0o!3JT`s7VC2ujK;jH*wG!UWidrJVXkRgGWv`yJaKVg4fN)wp5lUlyTgPiJcUy zQtyqJCnhRTpQj27o2EHvO%6bv1}~gxoiV6-ckzu~MjJ7&u$I*Y=x)V?mbGSW+?J3~x9$ma-{u?o>R8W#nB=N@_B(oN0F7rHKYuph0=d3E^9qPsda6 zQ~3rgwXeJJZ1y_LpL9Jr^*+LK_N_W|lQcLytO?E+;=+`7#xd>f4wzrg!S=2BJ1o(d z_EIL*2}=%b1)kfUVX1NZl-8UCSmtWXv}wkAnC$vGG~{#+=C_k43`rfuQhL$LmyEZ; zJm%E3s=GGgah;UylQMg;_>v`Jt2UZoF^8@IvxFmERA}l$BcO{nX(^TkuoQ?%)Yt9Ao-NC|}ulbn-Z^Ckt!449+C$WGj86F`bm01@d}Z#5XAD zfY#DABK#faA@4eCeHSmxZVDFJNE#UG3Ln`wRJXd`UUQ%~erVcqnffV}- zPEB=Cxc9Ym%~AKN?ezZQ$BiqxM`0nog4G--!+k|?lm}o zS-D{<5DP8{HMN*~4Y9L}jUz7#fnpbnD}CpL!9yR`P5WdiFwiH>m7s78^0p3`qU|RMkl*&khtBJvkbiD;VgTt1=ylbZ zX_eQA+Tz=WVhvxTL5!%SL<1LB&b<$k1XymBz=Z zM^=_VuKNlJCFjKfyH+Waeu1NZw|au_uUe4n(Y&ed29Y40*X2^Jj5OwHT+$1x<$@^p6y>|DD7Z)6>AG?03edy-YkkW5YRLCC;@0^tUMT+DNXr&cAb&wv zmzDW^&|PW4DhZp8N@brt7u&{#I*k1zP6ex?E|r#6*W~A*)@(vl0Gm(olDc z!`9McF(5m!Xw3;yH|S9T_u`t|pisUc>Fo46&|?>I-q1Ks-bn{^ ztj5nZj@p1W3HQ<{(-o-uD(WQD@-ctqR(4Kt6edL+5%;~e15+6F4eu(>pgw*Bp4^*F zpzFQcxg|PF1VA5&+ z9R(;)V5#Tr3)?`K#j#|(H5l_0&Y6-Rn}*7KCLLlv7y|dm_e55CzXM(Fs`=Y&b0BGB z#hkO-k3#~pF~+3o3}}}%WtWug!=&rY`5_b6V4mmfGiw7Dpn>%d8z(09ppIih+INkv zgD#zeis?6#FpqK-ulcw{OkliCaxQ%a>il!*R(EnB_d7n>xWlfH&p<1yrD-lC4D`w5 zFET+53iX$pRogK+B#USJ#o1WULg(P-S`kcEaC4g%`37}}ZGJDKy$CcZr<>&_nPIM! 
zU)p?zR$(q$((#@QLo|fuo?2i`0fl>WT5Rd(LBsfzG>a>pU_emm^{b&7sOK=%y!x&< z7V@;*x#^l8mb^sMcw)ynU(HfnRMnS)xmLZBy!mk-%Jy3kxmEu$78O-#pB1c%DUYN* zjuuOx-ohD|m5w?g_B3+FN>&A^mF{i3YV#4jjFWZ<-fV&SLL?rhb2t-`amyYpXVLb7D8=VgMI6cS7Hfm_l1kjbJqEV<~OF3jWHqmLPA}? z6{^oHA?})?4vJYSA+tLqQLA3~`|%zVQQI%Co2S+J$kJDd@t!T zD6!C3FoiONijDm&ONFCQS7X0u%6cjmkE1m=zV*gZS($0^q_I_*Fm{OHrj?26{z>r0=p#Z5-ea_dw`f; zf+Yfur(W8dfaO*?U1K_7ELpR}=*4UUOtM_{AmNxbdS|FRZkd!27B!ZVQj)I0LLm3l zD$P#REg#5}=9q>aoYyaZJ#P@S(Q;&7cr}AJTU~@J6ZRo?b5rJxvjo&vF^kkQqZJF+ zew_ULi3sM=xHWZ;j6EhRbU?>h zSLmJ8jqGzz6|gA(G4C~Td{|_0&g|IoA-5Y z6Vf7sF`sTyrtvaAEUGs@x!}GHmLx2>ml3Z3Ox zEa;=>>AZeSdGUUG;`JO%5eR4qalC;Vx9~=8<(xM%R<0Pby;O#BO6I!&=ZDAM>JJQWtfbo>Gq$F2!8kHL=~C7ekq?0WR4~#$lz?DaCakGcYCL zUXAe?C)697+7j**0`hl^cja}6$9y;MKYMiiFc#SK`1usK4%BOZ!h&J_LVZZRc9oy+xe#>m3*iC$x$44YH z{F55U1hVI@KfQs4q^`%9*2ZIELT*=o;9FExy@hsl{(VqwjqH~?8++TI-6$XZsg}{4|SaiU+E& zhyrQOMWYv(61rvIURD(PsAzEix~?RsUp0qvVv9K>+*%abU)>1WRMuZFrSpTcAL91M z<)~mm``0`aq?oW_QNJbs9)SQ#K)}T|DOMQLvcJRPJHT6!6Ar^BfbY{W;%pEoONh8=5 zlU_NpN}LCuojaXDq^mWcqnxR~QKW<*m7T zwIAqw-7|NVRV(CHba;={#)1AJ2G`oye$@HG?Z~mseW+(%ge6BxIc9VVzXb}b-B;+dv7NCJeYVGJ>$Fk)`GGG^sh3x7em63 z9IrBA0^c)8JY#Qef0_uI8wXabY1)r^#p&@a#2sj8$hd}_U4!bKoKSPu%z@vb*1 zRp1j-;zL@xCF&e+NYvbA$a!B#IM=acJ}B3Ko;fcgKzsHD!v2^6$U_dWdHB%>a`F0- z?XN8cH4j|XYg0r~$G&*gTPp~hbEt;d%Wb@<^*9?{v=BlU{F!u}sCv+`x)=FR1|e}9kT=oU4eFYYPF8DC z0T&z6rfI&ez=SoS+UH_9<5YVqSiAThXy3MLPw|c#$V({F)-m6QK1?gN_MWYXGJTd( z3oKP2VVkEwnwlZ#sJUBpfTRY+ZlyOyR+Am@DSSg8KMG#k9SxL9VIUKi@V zIX?T%jOVEMO46XF#ALv3uhPzq7=%J`Ue1xamq2;2q9?u44BbB1+|;!CASm5ipyj;& zDx^$ynGiHH0P-v3Y}NTV6^dysCo%U40`~aUzWXApP`j29nO1QJlkPCK7w7e%hAQb8 z>U}@Z-6_(MFI5DE7phY?+r9wRg2iU0n~0!hoXpKs(Q}}6-s+;1BmR(bsxupRB|~W) z+a96(cTj9mq}sjN2(Swdba;Qb1nQ{s4OhraLhYjC`_>hA#vFSn*tIxaHS`*kJj$$p*M1m|<9(&KB0B)eDBP2; ziZq0xZxbI%mz9E!vNY1I^R}SE)H`s3egWj(cyq?hcrPewR3KEbLhMi~t%TRXB!_xEMC6vsZuew2Z z78EjzD;jj%47&VmBBb_oKrTz_q%_;zP_VW$=-ss)P=Q*YkYwf!`O0Q2aeuKG3WV?r z{z58*5hF#-h2n4b9^XTn4W&|4B9_@D 
zLIq2urcY#dsA?(@zjsXxRNnt$$F7aVP^!&Pdr7_ll&HD9;=qGjP?*KMuIw5GMVAd3 zI(A?v*EGEtWYpv4IC=Eh{-!8_v;r5e+&mYiA_#qvZ z8EG?;AGwd0#K~M@eTR?FgHh`XT}xk7Lu&&Y{g5Ut9Oad)>|Y{<@@mNn`@_aTgZHK` zI|zZ$S!;@@5NGq7kmK{qhK;{+@>6#aK>#?InN84oQiiEPZ1&uEC8#)kn!~Qsf>2S& z1AJ2Q!iwq%5-UL&o+SA+EV6OI$l&JeJCQG%p4B=hruyM5&%E7p4$#tO57@1o`A|ur zbH1L$SEEY~5q^3@DfPMhXE%}*%|#;@Y+8TkZ6v?xsQY^xmYc4MDTCqrEdYR}@vAFzE^IcbLE8OJ0ty%=pSkSt)5$r%fj0z{twvY`5&|UUqb1JX{L-QbftQG z|LjVd%%Cy7+ybcHBL#x4zBG4tFIO)b$H$GV9jdn%$5W2Pw(u~$Jbb9%MLvzF(q;Mw zP#NFkgs5(7bw?KpxQ~w2;q(}&8$f4rhWI&7=%>*#0?Ys(A1WhA_jB~i?=Q9eys31S zv#X~U(|z^EGvYU(=wxujz)0uc_s5`N3iRL7`vM15IBQ z{CmD%cs`XyRhds4hHHc^(+w;yy6Z?*c$qIt#nzKS}+}kH~M9fHqXm~jz`*4!XJ%?r^6;{ zQX913z9|(h*5b+Vw1Yqjbz&u4Sg~N<_9T6%)!llNde#;)&G*yKXI+77w4~%zWn5P>bS1qq0_d*0zu!K7DmX9EPg38i0{SLjffe;1q4rSEIRm4EQ00={%4nkj zOmfX-C%ITbvfb6)yQ8mQ1>eV2DT!<>&UN~IS^f^ptJOB~0e2CWHqvuGWH1r(cB#5X zul9tp?)i6!drYC$j)OgpDjA^jA^$>8%R*GGJ(sAPB#mY1KBfxxQK+GP>&a9Z!T-M# zqaSzKINo77R#e~*P%hdIC12+!CM|OY+4hP1&i1{5(nW*TJM#}gT~Uo*US=~?4c>9& zN^&X|VDD-_yMF@|ythPhNWKD(S1fO*P4LFz7djMvy|okb`ll~g6t9dG*L}*8>r;i~ zNlQ(69j8F$ac83?-Adr3a{p&`CW4@AWrazy;8av!#tM89B!m@oJ!?%H!%>@`{Ds*Q zs{S8N%n!R7d1M>>s{yos80|WICh>J)(inl?9*v*2!H?$3`j-FQvEy@iErZ4!L$EAi O@c#gnnX0jnMF0RrD%-mN literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/env_data_objs.rda b/pandas/tests/io/data/rdata/env_data_objs.rda new file mode 100644 index 0000000000000000000000000000000000000000..61731d7774e4544850a6cb232ab1ac07625a06ba GIT binary patch literal 13735 zcmY+~do&d9;|K5}gir|~gd~woD&(@2q*7FpOR=8_No>e{FD^+Eo0P3wHc66La=$K- zJ4-I>zKdnucP(r0J3oE@`_1c|_sp3y=b3q)XU@!=c^t<*6enM`X3dd-e-G*BH7k^Xk>-?@q;zH%r_tb7Li94htO@l7Dr; zYVVz!2mZu_Vls&5x}Th4!im&crUf(HqLotR4dBOou@&QEvW$mJDCRHt&-BwH#`}lB z;^l*m)`frHPALQj81J8T(6Vjn_;H<)j6Hbj*YA7s-w%HI!5A7ybPUdP)(VeX`?4SH zecskh_0ikX&&Q0gj#&dm3TfmTv6|ehl}+Q)K*^0?B3AvWoMw7;o~4n;t|b2* zlI_qmCuCCfJ5D711?z;Kuh9PeFqkfp=;~Z?*hl0VL9cR#-2E&4fU)F(C+RxnjiX=? 
z_}avvkmuaY{?Mj?_e=6s1tPtH3Uf8)74jm&9E#j4@zla(jY0iDicEWFaLtpmS5+Nv zU7cF*9Z1Z~(lgh;ZF9}M{(5Z0lbV{`QT@>D8Mz_&C;A5)b>Uz4ZP6-^O@6qh)t-;m zsz5E7m>(~b*m!1q&-ub9xNJ5e-Re!j{~r1#=n#<9OJxudtPxXRghy}O*VDAJp8vGTGD_I#W}(O(NCTm4ho z*;mrMqOux)WhJD2Xf~6u)xA5NCfjqkuw}GuRJ$hpu@3^$BVT^4mECFWyWzRIKHK(= zT_?f1HKaj2BBtBbhIY97;l>xYBhvcJyHUB0v)9BJhf0gX81+pTJz6!@h}^!n)=7$} zim)dGaprfN%WDKuF*b*oKCK ztyvADPU^?R$V}NB^`K$aqk*af)6bphzDDDSlH=3B5gz6_kV?bs1Qh?pM;b#Ve4!G2e`;re4~rFeO%~s zge)C;QE;Ojr^q(a1iZ+=9yZugfPVu7k%2>Ou(be00z`PYC;&|Z=tqMFiaLAPsCPpK zUQc{BhG}`pKlZ~US$AoLqN{?jnjb zNWv6Y{)&Kolm!Ql*)bh%rb>!3mZdt)3Lw#Ehm26U787tWXj$ znkq(=X;9I|Q066bT%owhO!)v6r`(rB1UXfun(40$CQG z1davvAP(|QvL4VNjmVwb6>+3Nnvf!3A;d1;ZWfe;iv~mmrYtH67po=$mOvcf$S30&7o?;{~D5 zq!H&>izM$T%ua+T>m$iK1|yD;XE~F+qcKv5v#e1PB?=>qkPefj9VOjpOph@Vfl6fU z{0E`WswPolFk%Q*Ryc_gi;+QCgc;JjNIi||u|{H0=_#Q)kuXVG8tFvSy_iZNsK}H= z-I*|7+AztlaXhMWCvFE%h$h(x+J)Q8yGToH1PS4G*PRHvN2?$y#6tEU3|LwubQDAg zp~K20p<^nAp<+|gb>?A_v@McKqiIy=4(jfyeRY~)+B92IW}|6rs3>*Mlt`Uqm;$Yc zwo;Clu4phvJ>KKyQm^l zl64AU^8a%-Y+IG&gvgo@RccDP&L~X!fAqutQT>NCM?%L!q^HE{WWv;Fu>TRF{d4i3 zr$$xE6H;!w{y~Va&V<6%PIhF&-F&ZL;IKvtu5uzZ%2&pi|e^`fU7f7*oNecUne>2m{C~trvW?nN5E+Cyi$OAqBJMy4 zvo6sRNU}}PUATR`%e3@Hs1Qz^=S53zf{J2xS%|~-vrdpIqliKlim+3xhcq3MePdn} zQ39dMDkN1#6GaiqtUI)Pl6@0Y45z{ip_w*Pci@D1wlvcw>Mo3^g%nJQTCW;`8 zSd*mCSfVgOhGkA-#trTV~^-x zA~4sgJ8Awrv)ne8w-j~!$MDk2X}F(buW!=PgHzn>+V`B;AK%A3!@NOtGmY=Qy$IdM z81hQ-4FezYLhPXq>)eocTd$wyd!@Orq+VUeCM52=AsLVq8P}@D`i==C^qCllAqXN*3pB&+nYg zMgc}0xJJ%IsrpW9(^WqMm^aJNf_iL#=D_(E*pEX~p-Nj^S&rXA>CAEao6TH(mMClG z0xG9{v3PSm*X3D5kX|JBQ?yaZ%0fXbOzapzgA%bh9)pzbqeA5~%QlAM@Zq;zBDlRJ zV_c=bYhoiNkVfWS+9o#|+T1SksWl(UXs-s&z+-eV!V))l7L}n(L>wkf0hk$@R4nkx`J&O0Sygh4Y>js@ zUW0|)J(<@Y+V_Roc5iEgP(H<-4F@#o@TmJi>&*KrrJn>1Eo)*WZJ;y^PvvnD%Kjw@F9-TaX9U*eJ0q8K>zjy8O>09Mtn8+(w1v_E3LeF=uTYkh>4m- zH%`#sPN|&f+-`zM(R$2hK&KD&IV<*1HLX^{!fmrAjv1;+@L1S53Pb;0yFreO*^+&O z+<~Z7+HTe5de+>76_(j5F`M@$P?q-*#KT5|qj(o|lX(bUur@ENRT|ru72bwNwBl64 
zw;lj0W06~a6|m3;mCl>myy4r3UFO`Kz&@L)^Jal+TKWyGS>2(6(S?muFtnw11l1A` zKEc^BYj_?e_UT@P*e9oe{Z}-lneG5;x>hmrWD(!GFyeD`uH)yDIYMLgT_C#aXGrvQ zj@FuxA`MW+t$mq@oxt6r3J0I47os?YNEauIj!KQ+LfoMih1}C8{I<;6>oT zl%cASUrMc5#5)b}vR9)oTl?hmQ_Ad5Umz!J1n2!&#i%`HmAnH_*Bd@WD6ZDMlWi0( zW0IJOC%9ru_XLusFHgy{3Y9k<|3f#BG#5cphf-52Uqyg9%hHgo{OFB-Y@(b|v__}I z3f@e<>LVxio!$9RnJrIqnUns)=AXO15Th4DRRT+iyH92b8>V>dNXf z-!=ZXxqCou_Qc6VR=QZj+pP>sq4Wi&tgf4oO`}?qzwCFrMhH`by0OWWN6Z}B|6i8) ziWQ2QkW)D53=zg#hAQ)@)BGninc`XR&1Qi)tsvL>WY1g8uo|sU+JV+4Fl`@)`BJ&J zF#1GG2Ph5E`v9F2$oeJ@jz}Ti`#=zg;4zOSRCP+?RV(rszYD%k*73F<|#3+1*LYFeOKjoYQ zl*T2d-`d+E;W`!z5JBg8ZKz6-LQCur(x&C+9V83_G-HhUyJDD2S- zxk~k6ophotSs-fBrYD#lIAQUCuJ>RIqP}IC-P2zA0de;?MRKL9PAR!^EG|{g43;*B zKDn;ACNlXaEn|9s#Lpj+)eS`gB0VRBfGrJkW4V5YfM8IsG-Yn2c_^vN(U-1 zYvXO@df?PXLr3K1W}w@8Z@l0Hy>KHO_onVUnL)X??l%BBOs(yHOZay0vfKNO_e?lP z9WvvoLMzg+yOH&~OVHQL^(U*6g05|uxY2Fh=x3%F@6^Nt7rb`~wQ zaUqafYnKGrH&B`IRh75EQ6wM-YtT^Ro(S=R|DwunX#l9cRWaZr$8D1@Z4#Ho=&n_O z&k0C-5z`q^c1#ib zi2f+^xvV&|Dm1(py-@;6;2og&LzMU_cFf!qZJQ{IkGx0bbT6rPVy48@-WtPpo2;u1 zYu8yeZomP$r}qWqS+>crj~eAaqBFVnR3lzXX5@L@=Lz2ol-W;8LmIhjtPjL}YKkFB z)Za_N`AHGawvT6-{pZaJdCN152);y>rg?(by1o+EMhukef8|pSLyE|MN@CMPR%+pK z|AgC7Jmm<24b?M_S(X81W;Pa(1mAc!W=E_rkad) z!=2+{1b5;s8uq;4)`-XHO^@R7W1DK$Ca(ps1?Q`9J^obzWKG#W3d{{8`n7}D3;81q z|CPh+hr^ac&CO=>Z&XFHH-s9Q5*8*Mj@gO)fKc57FbjVHZxn_{5G!AXBo?=6kmX8u9U7HKHpaV9mSYzgROa3Fjv2V=ix+E@|0L_{a32#O5pDL z)pO4#^2W3_vZyjFK~g3nubw;rT*&eud&{;121HJ3d%>R+fIKm0r;XGOhs~QKEtk5H z0>gG@0D_6JMyws8W4kQ{UYi3kJc<=vm3NyJd;~Nqer{fy+Yl90hY4@Tn}5SZT7tuO z<1V=U(%vL%^6J~MZy}q^3q&DuwGe{5%2TC}cUudh2RrcWj~JftG(insq=7QqQTx2ju;?yXI!4!R4Ag)HnD$+b~E0|~%6C*aJ716C7;>PpYRH#9JmF5?aGL~1*ZA1aVdkexoAj*}?;{XZQm8_R1#pLT z3L(4#5n3@6A{z(b^BQ7MuL$aK=xJbsMi1XM68!3Rc1xB=`an^MvY3E zWG)P(p5rE;cYM0W_R9!RIr&Cg8DT;WopX!kUpcH$quU2Y)osj?Z@IP)#c!WasG9uV zgeuw|ziXsz7w3)G)*Ir9R(-Oy5dTdjjy$kvO{Wn0fSVjeAut`ZWgg|K0XR(=LUzFd z!BO`K_7sG{$$gxwEKdpg4NLvCt`#2R*-9@}UI^7p*|);lMYnjtdrL*`A+?ed$a*rOqJ@3UsoMD 
z#@sz(Pv_P4qk`^|4u{hrpHsQVhKY^D_GXhK3x`3-0Bye+7qD-J&A%|Ffaz#H8Z)xV?7cNgr+J z-+dFHas;JV_s+~uoBrE1oG5JZ6MtcN{qXFG55hRJ1x4ruBkCw1<_wAbiEYa6le$>6 zj~r+1yngT@M)i=GdH9dd=s+P}$MP;kwWbbQbu6sx9kVZaecwZ{KLzIuj^tTM1O*!N|GQ-Vz4V7abOYBpKyTfoO^p3gfd_rk90Tz5M`GeU}chBmVVhjnC7Zc z4)g5hxP-ujj6UT=CS9K8KNY5hPdM?&P zeN12OE}eeRuBpY|XG-lAwDjggrVQ92dHx#RCfe6|anSktLFjW3<-#lV7mLbeBeM$2ao2MQtNG)F>6bxOwEkHLD#zy7 z_+x5r{pPr{?jMMBVWuj&@EB-tY4!B|%>yQzVauO~=C+^fTu3E)&Cy3Y;-y&xZ#nEI zYnZ*vPvrku`)!BOKIao`ZYx5VwPh`@PoB7(j~9-pWAxFSc!Hs@zP7ktJ1T_petlR+ zwBEu$m>>B#x50K{>-8DISNOLrH#Kiy%`Cf19Tsf<%)-2!S(C{KC}QStFEF8YNw(y_ z6h~LvdiYO4u*oxUNdLmK$R&E~E3}i~l_%hYcoSirWXr@^5W6B>RVOWeL(q_^(Bx?5 z;M6jIX}Q54@3dU5kG9DA`i^}Yg?8F_Ic4{=D(yDaP7TYc%VA>M{C-yT6PKUWxVB8y z5WlK+sYaMBAo?jmlhP59=F$v?6%){U_p5h-vovc}8Dz9tZ?X2w#M7KGYdCE79iLjV zWpI(;?2ARUB^!+Wd0Uw}tUt_2%`Ybqm+5B)Xvi~m%Lo+1Z7K8gIXL3YFU!|87SP3Y z?6?hz>9p~E%Hpp1u!R+d`pj&&hDNPOEVo#kbz@?C9Z`mv0L}$<&S!NKu?hd$Ks;bk zvPozw@y9w*vYbL*mb)#?8VrYDQj4$Q)u?!6f;`v;@|mjG2-C@;8ncq&MST}nDtOp} zNt{ISblJhC3tWQJa?%?9Ah*Di0^EY*p8hg(&ida?3XFPsF@kA7@rGbzAxCg_Lvxv0 z@??S&lzsW*@=v0bw|_YTpU%!_)N0f=)N{^pYOXI+Ol}w!0D9jaJAPvXZ9e!2X$nne zFdNSjmoU__b8p$|o14B}m8uQH_f%35{FAGhS}3y;H$cq(tb+xm-Ql;Xrg zs!<^tMD5#cy|r(;IEK?(ABl~jGO%38dh8}p-ZzUn;f{RTZ@+SMtFk@N51uO!K^$m^ zC+TjFrI?Or1zt5xv7EWILfV+)zJE+N?7 zaGTZ*&|Af|2{f=RseOs&%xh-80Y?#X+$$4T%^KTO)$V<1OPX|R9d(q{2X+FKQr3(nS_u7mUT>#^qeY+~h@@xE$L{vhFMf`YSSi0|yhg_a_BJ7?T31ygY( zyx~pFf;PelyQs_0ItgyHuFRnkvE}1=i~OW{nG|bC}{T2fPUc1$s_t& z(z*35sjOQl@AYQ4LI@nEGkal0eM>If)2tn)8GyStU;{C-68L>O#PtM6xw#uf9(p|M zvT}@ju-}rW8=$+zbKWSRp@XkLv@Zd2Wt9Y3KX947a@&kiwnFVIvoENYlo3nTt3%vl zT5GETB7z$Zn-+gm#Ylf~BW+V{CqW7P7My!t^O(h@O?C<^HB8oGKj0pkpQ!rRZQgQJ z?d9ZGND4mLW7PZc_I<-YpCO2ud{ z8rI={D#dLw!6lU}N1}p_>%4-Z+Fw*RajWL|a#;mPp8XutuD-4VV+4oio=tw(P=-%p z1>lZWl1|yZ0?*ay20OPMaeH|G6|zM?J`8!dHS8A?SkR&}jEOlt&ERP@Meq2spxgz1 zy_aK2Da&fhD8{BPaZ*Md)efa9B;c@T>Nok=mcj=70o>I(q@50bVHx!^e#U#!_6rlV zux`lbA6fp`l3n4}#Q|;9x1bQc=-aI+_4sd@3+jQ18DA=Y&i;0sSu|yhvG*n@7r>aG 
zJ@Nluh7ZHHA3FlP{wlmDaJXFLrw6qd(ca+vG#k=#PLm=uO1$dY>(k|gb{ud6c}~6+ zylGCekWR?p2FV>V4GJzv&(OXEp4Jq|3ui-bk)m-%#tyxQg@#iXx^_&ekoCGo)wAx}Z zqY!TAgOVwUty3^rG28#lT|32FnCsUcS$?)5x1MFca0%iAqULTN9P!E`#>VUlym}sz zVn1_jW<$l#qJOS`_I_nSOX()g({U4?wf!fpdgQ{je(PTKnn?!h0B2|ncFu7l&jMIj z$8tU6hHF=DF`S$}H@bH_+``bZmlwpatOY^6}{$vqGZb5@Lw|8KpU`y-Ma^>rr!;@nPQD zta-frfD&82o;?m(_@i1fb8OhSZ?*JFjzaZj_+<>%N*}JmIo1jT!R|g1jD2Qk&swt* z%{LiWnFVXwC9$v7QfxTR`8}F{-8c#je^LI{C6%;(rd$otJqhcCkgI+M-{fZ(5gy-g zTeN28Sm=uom=}KI8=!q#o?p=Ix#QYta7OTjya9qvy;_3gI<_6B-zvC>!A|wf*z)S{ zB+mL73}1jx_7OI!Kbddx-}Ei7ZiZi~jBT(RX4=ePdd6T{vu;c0)JpQ&v$y|hvYhjJ z2ya7O^G0px=`-GNx0h#R7B_GWy~~uH^^i;~>6hBOx*7DC+3~G@{4Gvh!}*PZzD0I$ zBqSl@t=-^iQCpkuScAP9_$_jff0%#G6;t^Y0w*4>qNUzm!D7^cBjB#`?BMFJ?+`}5 z!IRkAkk37q`|+FzEm#W*@)$wDi*ssnPD%kJ!`~5ggB!Y1};P?{wtTyqf9P>+2_!?QdDgoBZ&^ zb*QV^Z@B=2KkZkM0$@OWOvQp4R+`No`mo;qgb_aafqg!oor1eb_%x_=s%B!@N@1rj zN{8u?yvNz)a!OuAo9-FiI!_PS{IZ(Ci#l7$yJ;wpvdOTcLEV?X5ongo>BXUEcfVN^eFI%n@!pp$GU1=oa>D1y|eBGZ}TvEVskWxXO zDlx8d0due)$Knsaj8J1_e90;WjOdkrmlBczLxV3H$I6PUtTo9t=SrNv_{5GS0}YT( z0Z_{m=wd##S}Dz$0AC8(3JH*Z=LA0e>N0lunU%;^RlD_R{lBq7+g)B`T%V<*DAS>! 
zw1~Roky#C+7`JCEY>VaBwX4ap9*CcRj2iA;LgpvST3oF>rlbYsHA~yzo;(UQ{`uir zVEd@&7h5cP-r!YKEADS0q)=%z-{*@j_x0jDe!L*9Lry}eBG`Gg{CvQK~jn`Y5RI~ z-P~KsI^-RGuwD`ft-DobHqK1&rpiUl@h?9omZ3(zI`5i)9p-J3r<2kXZaYfX+W;Be zh?EI*5)6?Se?GIjpZyu-FlWi2p)um$q`macgp8CxdblF-a+-OUx>EnEu$!+P1Gnf9 zThfsXjI27+FTP&&4`%VDs_Z(`;$O zuV&>k-P?8nm(*z0-8tR4XY)Ejzjq=SPfDhSTyi?gm#TNv`#PU%kqU6~z#VXFdGC+e)>^hI2&Bwtb~<*RNM^LJk-A~&(8NT7lc}*1UX3g-eFLHR*N6sP%H7OEQM!5 z@2*(nt_3Y|m!%_#DYQ%un(a8XAEi-Q4KbpWo=yHQnAK3_m`o2C&qR8)gbpwtV-8g|-SSH$qe&jELH2Fp0>3W{W3JhE?9a7*jO^4s z!MmU9d*5L52iACa-^TUZ=F*;kIPTr_5@hz!kO1sZZ*%AD4bz{C&jjm_b5Op0x^_86 z;BuAz$cihq#v~ozPOE)o^JYKii)*|NW#m=#@PCE`d;_GdX|wlS%aaF@_s<}&4P}Ko zam6=01ZYDq)0ME7=^}!Zh|A7(nw|ybZtv%_o!Dj${n~B?pX7{_@ul2Y7tT=#iM|4+?#c zJ^sS0*DcrUe$Pf)Z$4lfSnyxVrm7^nrQJ73Rbd!Y+G;uDM9z94q6AqC&A`H*UDkC3 zB=7TGY)+|n4g6@=DnB=LhU@uacp+v_B1fqGxxPi0N2t>=A}5W$+C6(8rg>bq6E5eW z`@$=%4F%O*^_`2Ou|-JZBUT zQ$_>4-jn5O60hCQ+G!FV{uT(3KQK4X=nNWT{C<0%dA@QmHB;65uKxqKiXA$=?thvx z(X0E9^p6r8^m-!a7{T+*^3hxd-B_|TaAU&CKfPJUx#-J%?}pN{r5~)p|MHn z%2APCA(3&fYbTy=;k!`#jmpnES68*TKE0n?BeM25x^%*-+8vr*qiS-Eq0sLg84FRht zu(3t#^|;%O)%xShhh{m94QbGmg_EM54=ack+Fee6gD%CFg_kC=O@e@lb7g~7g{xhK zXH|?JT^|8Pgh8q|3_6v!>UiS7X^#XbA0oq9R^EtRj0>f@cubt5_8WdG{OEPvxN32s zf#=iu=5?J1My0ONi9Yd5v|6%8dBbrIQd4RVo*kC2b)5Ii^t--bzErN=`d%>>4Suuk zQM=aqwg-hYrA@oF#+2j@ve|6RYaN`Ce!+Xe&s%fcxh+*0xr4ExZl`|Wr|n+n<{bCZ z$Awpv9QK>&WVW>_+E1-L9qU*2R+7DZKfJlu$MrS6Yl^cZSWQjJO_)}cp_uvVEo}0@ z&8qOu4}ms7d~XCv(&P6WAA#5cp$w{sW3A;Votg>(KYm3vF|^8Q=NuE{Xnnir^xfoP)^f_8CbE;%;G@O zkLlQSsM>m&LD?34D@PV{MjlP&4~?##@jzf}OhV18rasJkEJZZ_K6-g5Wk^?2q44DK z8e65Z`R=yYT1DE@Hm^0UnjvB#qeA-?*CW*gaCcuac^Bi{{&&98*DzE<#CLM6xdGzb>MESEHE9Rcf zrV*7vru1Ru<7<%>#)p`R22c75M1<`|p?Zij2ELujqpfvb9+=gEmCpfP(|+kMpc4DRdzT5+j__w+3i7==`vM{1;}71y${oHD zI9K)Gy~X47;fsYO+Jd4W-U7YE0_q&@>G{F*<_7X>^%%$Rdfn)3YVr~{wCy|tMkz+% zDkdIGE~n=o*M3|6%n1iF1iGm6P`z$$HRCR!V+kH{B0eXe`Znv$%%o#Nzuk?|tJh+j zB@B_J|4k)K=5r>RO`#oZDa^RCA5xS!7P?d2_ex3Ku-4>hlG%*C%_3&<{t?BJCb- 
z5wt$_8eBCONuI^UokX^IPa%(5^<`q+3UUiLwnW~I7vXl^0_k5@a%5F+odO)y;^)-t z34v|vkyi#F<+g*BF5g1-F^pZMU_-7Xu+EIZaUB131~s;>j8 zq=~e17ffkd#2mFWj~`q%vt~f!e*~Wd&2-oPw*kPk4gRYDo%gmMK1W+Rue@HZGXPl% z-#s>P4EfXv;p4t2AIwG{t|%oZ)$BM;T*SREGAeU-vhwu&({~DMx0)Dl`dCn1=JVEL zpVPG#f8WTKH6QO5{{>$LOtz_c+87ac&nARE`go?jz3AZC!1i_tQ*YNcIeO9|(@Q^s zCEM7iuB~huJ!<|E@Yq(#>>O;S;N9+)CD1LegOrhJ&9?A1$2-X>3HrH?o?#QLeGf2j j`71r*C`IKP@7%}*-w&DGSm{Fws(|rEr}>9fcbdZc?~Kp`OAo z6n>>}o5CFmcPTVbXr$0Y;U0zGDBP#;fWkux%@kTFv{HCPp^ZX2g~t>+{=<+M_hBzV zX1eb6;`@C__X<+h?y3XnI`sa|^I|Zc<(Y~a%b;{7OtVx^asJ-#YJcq}q;KrtOgZ-k zk^?wB$uXT6-61z2~zdYE?*F(Q`4u$_9*TKWF82Unpy|tDUdEgWSM0{D&{+ zBYnndzwk{bKr;J{OTV%N^Wxr_>(m)gO;Zz&Gue>)ceZ$JpMkW_A&WfbW+d89@AP0U zz?SdIJ@fAJpc2vy3>bruzkMp$v?~W`9~6MUhiC}PgYy4Quv}$yYBP$g$(R<>8M#9I2B}2?3rWyaIi1uIR547 z3w2Z8pPP>?gCgGARwu4>CGzXM2VZSWC}8%5F`ChPLCxk(EVIEIjp|cl0MnQ<#UR2Az-&6x}`gkD(J5D3;6E7K0TNuXg&)jsKd z3q|vZ6&$xjNa>e41?#&&cF(?4Jarz}-oe&JTb4l6RO;Sp6$C}!AGwp9zk$>@$#vFr zK1g#1Wr#%=*y7fU5zULBX}cDEt)dZ%Ld)@2m$*P0ULeY>dI!nP*u%-gVPH228hpgv z&>W1+WY8W$@nY*xcfd(Vw}yFV=gt7Rt7m4VXgN6aR61D_0L_4iX*-w%h1-6Y2_5E; z9^V%-aG?bf$;QivcSeGfKDO+(wiOz;8IyWD8z2{&IY;g8fi&Qd18rg!BvEYV*60Fo z#$5J_qZL8L>Ubvu1@tDJ!d1CbUDo-z4dtjU`fTPMfwxmxC14*F9f+1ze3FG}+G^ zs&5ZW@NGH`xw2|{#>0t78L+SJKYv!Y@BLl+3|@g3QBL1%{S2ydTA=joa>$+Hi`F{z zBBigTF{+^vA_IhhTGlWoDW&q@?Blg*+|**I;6u$xBm)kt()6j z!K*MXWZ!auGST+Pft_)XN%Q7z8dU|?WoU;#pARv?psH8jLc*8<2824bKjGn8iH1JaBR zA+FW@tw9A03@l6_K2RGcNK;;7ZfY@*A%G;vSOO&e!FAe(t=rgn>ls8R<9{^8Oetv) z)eKIBedtSWsi}7c~31lJoP@^m0-iQj1dal2bv(fQ&0f4gm+h+|LJT90$#U+U)nZ+fU$uNO}#3EoMm!uX!1=tJp$`W%*VFAJcj)Kg*5*Qz7cV=#4 z2~3_VuQVqIMF(4PNlI!F%sB4E9H7~GKugQu3V_ySoBAu@4R2P}-~6xa$+Xj@+Q70cBJI&6JZp?KJ>lcdC% zlwH%fRJ@FW#3_%e#x18bzbF$JsHrKSxK1smY68_z5jn&CZ#02Yom?)%k3e25|}0dU^bWRuacl3 zfeNFl3CS->%%NtY40Tq1&#iWNG*MEa)U?htu$%pajkrW<|4jQ(|MHncCQ5bZ`MMGH z*Wn2)ukyQ+&XFvK!u5%BKPO5Pl_;UjO#)RqjwU62>vCoD?Pc$6r;nyeDioETy|PYi zNB$C*D%}?UzxR5Q0CB0(JTL6-mA3m+;6-KQqX{2Z1%(`FXit65>5xEFs)W|z1Uf?m 
z>uGxRhg5s^d|=?R9!;54NSTq9XNsrz3lf(yZ9~~FE1Mo6GG+enl8)WPmIhCmJNDc& zpEITP!1lu@_U-(AkEHSve13S)mWu18AOjP=W zrb_~)CEkQbr!rHq+U@sO^rSbMCdo;Y(Vy-=C|6}7GEJ7=x)3D(h{ziJbN;i>f0O#) zX>!-2%}1x*TXJCMYa^a(DxxHnnfTKrk!2>$)1}qA@4L6&9Zi>{r%M1-c^j=i8UO%A Cs3WTY literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/plants_arry.rds b/pandas/tests/io/data/rdata/plants_arry.rds new file mode 100644 index 0000000000000000000000000000000000000000..e1d7032acebeb5dcea077192b55d847154bf3256 GIT binary patch literal 584 zcmV-O0=NAiiwFP!000001D%q+YZO5khu^#1%O-bRJUldl8c!5I3JZfs1e=6Jv4~y~ zHHZ*4*-5hC?STP_nqIo zA2Tyo77<6B0Vpw*(&^b_2gXH;RgCBGOr+4lt}(w0`{5jXg=_Ex4niF!K^{penO8&}8I z_f=ZoZ?~qgoL%UzgRdgyTrXPNvG!3hXI{u@N8zW9o8_+AaV)2EEty7-`n48wng*<~ ze1dVr_Am(#V2}KJ1A70A?OE6A*;t3{p6t$bO~2r_)sEQlo0xkUHNCl}CRc{E9fUl+ z?vrw|V>G4r$Oe%`eoN^mQ@7ghf7>&f4S`xxajb&*B#pypxo_EB@DpWy)!fPb&iP31 zy@J>92HwIucn=?d*XDhK&$$?4W7N8(n_R;e6rz%o`_12s1BncNTbk;AGa~_k=wJVj0XgO>rt150KnRqhpv{zR;smeT+8dU|?WoU}ghiRv?psH8jLc*8(Ue43=PE5CCEk1{MZR zAkFQVm!6qlT##CnTTBBXE~lc({DO?il2mGoaJyIL=Aqe7MIqjRlGLKi6jVE@D~K8q zd@hM4i3%>MX_?8Hsd*((QFhmglFYnh7?;m4zeFJ$OmN40IK~B#7sajMyQRfd5O93M8OP8m<2hBc_s1b zMfs%#P$9PBlEjiySdwB*&MyUe8qKKWoWx?7Q7}QS6ks^%r4<2fK~v6IlwYm~vmNAB XrUoGK|Ns9tKxLsFi4{uHu~U-7ta3^cJ12EiE~^}^5<-n_?AwVFVj}6nR*5BP zAt7Tf<&xTn+-)vn?wc9AeYfw={Jzh}zcOWHa&2+bEv+Qn0IPH8Izwfs+ldCBwu2Qfk zcJ1X!7*lMCBWePqG|6VeeQ59V0?JVmP5`}lzPt4^l~wlgrn|d)OUA41dw~S3%9<7t z8-O@ie$Jhx&BYqW|En^7=&_r2I%l`ueB(r4E}G;UKKrV33O^Qkw#&)EoaQ|uI9$Il zO!34e_YQxf!<>XnLF+kSqNIZd^R4~4=Z5)h-yW?q|L60?sJ9oSPYt`Fk6S%B<)b-= zqpgQu8Q}F!<=z3>JLd+8SnM|y?4OLvKRm47`d*cr=ibr32FDD)-Rj}^*Q@TgVSBfW zFn_8N8UStC$_B(C(o&K0R$#?KK-uS+L1U_joGpKZ9Ll0R4IQYl^kTW{F}%YX9>4Ol z9(N1b<{$wLTDd|G6LM| zq1G8bO$vE{kcEnRSZN>yR?T5wK_|u3I2H9JgdK$p&ng#JkQR|f0m(Xr8v`mKC z3a#U@kEyP!BfSCIAj6mdnj-#Ib(jEF>M$1Zx2eMva8$=qk^a_&VAepG0f;Vyj(}yq zL5MF@%n(GMfjXps_MH9zbhHigK$Xw}?2#i5L)%B0E0oN`P;fJ3D8|IATw4GOIl@V` zm$wil4G`|o04~)c-CN~NGxb>66u<=PU158-N7_ZMT(mt>O>dD+q?*Q}-y_wG79k_m 
zmM=1mRMS~x9jUf%k!7Ts;UfD;HP|Ba$OWLiG+Zd?=hiY~0WI#zMH<`%&{EZM5GFNV zL>9saP&JbOtCpzMssVSP70?=$HHek6h07OV`k7V0f6!V;Ph|<(N`r;mer7EY3u&TO zt3k_EFi@-BnRmd^q<%)*PH5WiTt&XE|KFm!`fAaOV)YlK-&TXKL2V$JG5$}Np|(#A z+$sHAnAPw9e*^vthWpi=YsvTE8VD^l)GE??aFet^=r>SbV_CJR#&Q8U8V*Bgk{rMY z>5TBiBAtHoD)epYYLqs~1T>Sn2vhsbYti?p>rpF7YeC%z|3!KcYG>4>xk8TtS`9jy zx(cO1ItbpBjtQ;%u~qEbp3!R5H7Fg@4#vSns~BoK7Hwpx?Nk(Gl! zWgWWOL#PXSOUr~|1N>@sG!=$|lN`ZF=}O_ce*G#$3}!X+Hi~!)Otbu>d?TEfxk081<`lRMVp|Fs3mH2@<3lLc@OBs^A#zYNj?x2izfb6FwiX ztqr~x9;3!x##~8S2mT|?7kUk3_N$dt2S;;Pv7`Cbv;8Ht!LeLT!TLqZ1Pj=){My+8 zpGDzSg4^7sOdXOr=pJFKCiM}P_lH*t?r}Akh8&|s%QI0 zK?7ZjEQLw>9eZOH!X?I4%Oh564Qz9`<1AclT;Sn6{vT=?e7zc= ztJ(?vE%g)D_mis^TCpau#c_q2Y6ZAPsv~^4$b0}@LybYL1lFo_BR8pmSEa2&+5o$X z`X6dJuu=7HXpFphqsk>a{ghQ+nV{9&KIdn`;JZ^_zUxpRmzYO7!w4_L?TD-j<3)^wP&l=qING8aqM5-N|E z7ud11toM#2c|SYBM(jR;6@O*rHz*qC_-U{Df(`bZ*^<=dJGAliB=J0v~X{aRm)U zm03Hasn#;zzoh1ODiT`RmczV=;Y@G(>5DB^elQiZN{M|XSTXq|+F`6bS_K@UbA5;c zY5ZNy6Zl`=MEVAmB28{tfIU`!Fv3aPKpIVuKhKyUC&s$_T*yW)X> zBt95lnm0ZvzBT}@vNC4uZ!3RRTJ}(4v`I1>0c>V}ZJ}=>Y;jh8ZgZ{97Pxf+o;k8g z0}DjIEJ$3I#^1 z+(%|6E__>w7F#mNe<^e>=y7IF^e5tj?tx7{Tks_<=y0$%T{Zm`1Kpf>F4-Fz`%u z^5?ami(&}xh#XB`{$cXnQJYnOL!2+!L!bv*8^|!H1Oein;c>;$38m;?>KYMKatRvOi~9979oQvs9VGt{OAO{&H5f|+f#9Jjl6=J1 zhL<^2*AhGm@@$W&){H9hW_nhEWn`RxMifa;AEtV0m81;A1KUYuoU!2|mDCOI*f=IA znl>Z+b8}|OGmIcj+}Tre4giB#04bQmb-}MxcHAH+oLe#Y?vBtQK}j|0XOZcAc=!;H z%RZo0jZu!3SQ4mY?pZE4-VhK^ML26t(78s+NPSNgcc#a(Q@Kp3^-7iLfO%jX_^?CU z?&>7rzQm~|;J`d1w%6h#QA_FjA_^sDJ`5?pJ2SJXgSIIh=&$d%3auGVf0Xc$O&DM< zl)KVN-?@?$tv89s;o`TW~a`}#9< zykjam{`Tb!TJ(GW_9z`52Vusbvm9&qozJ6vo9z$2Kq)RK| z*=3kU-wVpPvkuRt@ZCSUddx?|J_gs9QinkJqwpd9E#%5y1|=KYn5&rHFu^fkDPjGe z+)4)w8puy>va>m`T=A@|bj|ym@Ac3-zv|@c0}to+5mn(VA#R zyYlOWr{v6~il8jjtv^9RiIvPNDhH}>HUqmSUN^6nMM>CPyAsaK}d#^dz;)ZlXG!sHI8E-bqb+M6>p$>ST)OmYv z*vKWgi>fjXPdDXPz>LLDC% zXK}I36Bozxe7$%bwUs-97+IEAxNC@}VuPv(&6gldHRP=y!i_J59hE((FK!4%EA5+s zC8`mMK}C#XaKD7zk{y4*&jjuyqyBr=lhTAn+LwZa*;15<*rM7X 
z&mRD{c)mc+7>B)ofcVP%5`MDMJn4;O{oL9&w%@3TL)_yUv+A63xv;39%aI2?$cn$f-J-hBAxX3vl)uB{i?9hFhB2N8AM|`< z8$)nv4%byZJ0bgh%J!u#KDN^Npi=SA&H4H5>`lR0DsS<#7)t7WTan4kF}b3vLg01J zp?`wsT*l6B?0H=#$mR|qaXDr5*XDmgQKEdE2vi*7jYzKJ9ceyio+8N{5xb4MnbN;0 zcPR~&$q1Jst=A}Xv>V0bhwP6=h1H;a)u|IKyQ?Dit0J2*{G-{H;H5aW)BEw?chFTf z@>DGs1g-Ddx8hOeX662e9llwhtxWN&;hdWz&YE-?I8kYS{V*iREitN$<^SGUu7Ny5 zRqn1>Nt{z&upO#F+ElP_q!F93wxpiBy~s|PHEz;_V@M>u!ZbpO(n0%s>9weh_t)?RW&3XfJAPV^roM}GFa=Wy=t@pKauVIRK8uYqd>qaUi! zX>_IhSvhUSv!nyNWreA^z*?Ez{8cjXL-=0V`-Ojzhp=}?eE;IwunxiJ!|1DIh27#N zJBLPo$Cv46yx@RsCHv+3&9pjo%iJCTb?@ss`&O4_)K_@R1Mk8SRqxkDRBlps??NQO z-@h{A2MJDy9;as^_#4Q?xAk_N<=Uvela3#Z=1L_yxUd621YQc^CdXo+;Nhpl?0RHP z`CU2Bh|%*QJ*uI69mGilBgITZ;3jS1%#5?0m%N%h0T|=v2g=Nq77$&aA=yR^E0e62 zTf&i>n3`X;zL^)n3G5av?w$(IW@=c4?X!pojVR22P&7%kiccDNh!8h~ZQqVZ2Ae6v zSdwD{=`CMsuYk37--(75oH_yhqTov1kPnail@-F^GX!oqegA@c45GfGzL2SxI&{T@ zoJqIC@ZlM3Gqn=?rY(HwydGH*YpY*=*@G;2WvnQ#CrlSmhyi2dTil))j@LkBy7<#T zdS7Rz!@_`Em&VV0<(*#cOe_1Nr=wC!Y~JTl zLk!gf(REMGGX-ZG=qVU6qLyRX%-_s=STfu5kb4;^3Lbn4X0YWvZVbcl1I-lAGn`o` zv0<7La5N%qq`nUpLskTUNiaJJCT z`~qp0Kho!WZx9mi-Bx&2$&g!78y}egxP3C05&ys9881Zh8Q4b%EotA;6gEn;>^t}N ztZS8>=~kw8h6}QiyOQ>TD*DEoyJOuP8+P=@Mna}j*Af<}F@7mHN|cn};75t2@Gqea zDR$v2ehftN(FbM^9)HPR*--ujd|zpK!+1^iDZ%DRW(;FCoA)>3PkPf8Hh#wySKp_K zphJ#k{abXrj~SO5$@=D{cfsQ~kaEt8$b-tA0LfAr{*uaSFojYgVaHF11&ze^gsN7_ zQt{_>Y$alqG8fu9w0E`g-V1$hQwliu2gnj^yaSlY{5%} zksH(R`8S5;`L5&@xSO%l8tNZsQ&St9jGYzVM#`6PvuifVbPrtS zA=qA-_27sj{zdX^=mAa0?Mo7Q1};lY4!eHbci+F5^)l8L(tnYa&+kqW2R4AY4oH0c z4Wp?>znHv&bit`@vhT^)XZw>cQ^ z5MU>IHwkDn&EpQxcg!W3up0&5Ci4~diQ%Wy9IO>*PdmwoqPpV7auKocjV58A2gew> z`DapCz`{vFW_-g@{(*UP(pH=M@Re%=4YTmIXnz|!XhoIsk7a&{RhkScuKvkz{u17n z=@7D+*&FRZX6)lWH5obFrcX2;#4njM&c=vnEq~i63%^g=w_R~qylzVCEzf?P>}}~ze4lCKavjY+Q8=WPvpYz!|A~BjV z_-LPya_eBw5yj<5SbzEH?}A;a3{WO!N9`Pu{1-yx^?fIjoVB_47+wwZB37MGi}QBb zLo)$OX4aTRYiXsSNVeKx?6^3lbSY;-M&gB%<75?=9M(rszOwj@0c11s!Yyi0C9`hm z3R@t$G-#!|&5-Gi&z3VH4$hT1f1wUKx3@%Ss?OI$Jj5i$6+KZneHZ*UsKMYKNAZ?k 
zs>IasZuyDw_uy|o;ZA+EqU0RR2rG5)Hn!4Ak8Ogc_nvuPXyl(Obh3G7XJyD%agU6xklLs=PLtGUX`qwU|*j_Plf#;+Ot)yw`#_> zm9=YH(h~8(%Io~Hsrsa7Bkukjinwqh?^o+5c02P|ZIYk02V34c{`_~}!bH#g5Y`Ze zwB0{9Deal_734d%qf&zBIutqQA`E9a^8u2mg*PaQe>Jhp)6OFT8UK1YUq^Kfp3k0Q zvpmbEJ5_#(10HNQ{oV(od`(e|-qwjks&u1g$vsoTE?2z4VWIOCo_gv|4_sZuPF z&fk$k@qwS-|4nqg<99=-9kZWPl%Mq}>J$GuU$>ldjuMEGqI>*%sFRG6p@8A;14Sa} z8tU4m?k>`#-A{_`cJt>-oUs?+UoQD~*6EY7hE;!EqHV90uWBcEO{aI&?eRhy(m!R+ z<^3}6aqRo<(4#8K41-!{8!%(R$_s3I*Sw)U2T_dbp%mkLt0EzuvQLOO!Tf`V|=8qCX``BS`M~vmz z!%-c>pQ%~lKYS8<)26x0%|o@6k+cz2FCrJsj3dn^Pmnm>d&2V-BuaT&Xr_W&;?nI3 zNZ7$W!$CRJxmZW964tFjSH9Jd@+D;Oj33Ll3SrixyYp`;{0CvbQXTV7URAy%y{7XT z5tcKw?mW1ph)HrOOT+yXSR#k`w?>CmPjA%rCN}iNc+cg9`~?}Frp}zTf@=L~;0~z# zTA&Vbj`1MmQEy*E8pN#8a*6Z3W{%V~7d7cWh?VPjBD)ZFqM;*$N9K=1!=0u?J;|UOW(+6GVhS|TzV|B2bJ!Q@zw1`9Qdr^-eJjZOxhW+Wc*m&nI zI-Z8#s?@_4V&n;~Nt1tB5VHlV#1DOcVy#tYm;?lI7@bHT?CPXGNFBV0R3-$cRCZ?1 z9VT)A+tOvhYygT1oTmQe{{!w&E>z9a$#i6)N9eG#?5L`_^o`GklF@C*U(AtoBY1^A z_w}u(^SHj|(y1@(pO~2}!cWvWbBkil2Ie?=g4!Lx*hl7}5`n@>3(c#vTfXuJwAU2` z%EI?GWR>srG2Q!X>fo`d{K!P|;E5rsG!_>RTuC(%TQqe7s4g!3OfP^v&3>W#VM6ZhE{^v8Th;ErAXJTRuG@ zP2oLe537=aZ2CxayWv+Q!7shfHXAyTRgcSreQ+hYj=X%@AND8haywY#R3;1_px!$iP@8!tKcqFfcMQxZBIs%puELdZq7eC7#v;DL;>_l z<}-3bJQ#x4r%^blh4d!m6Dg&SSm39nRK8H247oBNFa@0#=%MB*^i#7t?Xa!xxZ}ZN z7Ujic*Y|^04MS5S?+@0cd4$^m|KH5i-Go_wl4aA{)v=?H995%w8CFy}j=oH7sW`;9 zM@~@oCr05nP6g&hxhh{ezoN?j_C3ybn93vkgKTsWU1(?_S=jr>o)}XO9}FjXSSa)R z2&f*hWqBKUypg=n)t-0fKFTDpF7tK4E3^F~I&b0}sWs}zyh~D@muM;?mooX#h(Hf} zIPLti=W(IykSWTAbAXeFGDQ7E&o?4={c;bTLN?32hJJ_6RVX$pp>EcIsdt)4DsPc<2KEAul(R!`?Tv4S3TBq6B$y zMlOz>-?b)yU+JjAuYZ5SUr<_iwWa&wG7Bb(#f6GyaQ8$7@B1G^#m!=x%v2_d?W0bX zc88?E_W`fdMe^VqjB61 z3R%RE7SZ@rR1fRDX}i{Z#&x#qX6p9K6+amrMQNtM2`wV4XvVtZUB|a_mW}EZ$z$kx z$KC4_$v)7OW3XWPCyb@1H~;Xk9@A#x4d^HM8|Op4Qg>$VItgj#`HCirYZvTybQd;d{uJWz zdU~F6L1JW&no^#WzuT&{8~F%U!gres=m;aCKJp{S{N_p&k&hJw6HJJIhrxt-W|J;8 zERm`kJ0JP)M84zA3YkQ39|t_U+#X!6bzYfD%$;t%j9j=`CVUGjue4OzkZ=_H;uCzH zu4T&D$o-57Zqre!I?4rUEB!QW1d 
zKN?HnmU)pG)0v-nc-e<6qIi6)uM0W-NS`NZEH|z85!q+Y&i8`|uDX>h!EhYEk18z|m?Ln_QQZ!3=Rlltd#v4VyoIVOknSs0J>mih(%3+J| z=kSZQHPA^t2iqO9{2y)ivG%&E7Uo~JZt>yMlyrB$gbQ)TcwcJJU5LF62Ei>}=L`$vJ)LM#^CE`YrN@OzxvLJ{^St6nZ zL=K;Iu=jL(@3zg87*SC=K9s@y#DXw4wFrUV{xmb zm&$@BjCUfntOFRGE{xRmH1#~_D1WJZ-h>HAq!Kz9msdb9l=+dm-&n>k6A8o9f|p8K zr0y8Y_(dXoWSVewh4Dfuh}0D}8LqG>4?3uCr{VD{h!;vyq%MVJ1Si7ZOat*NZZDMI zv5a1c@X=}Ll@)?rGu+`9yXu0mesO*Ne*5cF@BRRvjG}yTeer&~sQlU7tK%zYCG!bXm<}^7d5?I76t}>p(zX}9Q{AJ1Np9j)SK$j$gT_!P>7^_Y7x{9rp zUBKkr<=fA7bn=^}AN?X@mSXdgUUW^I{<*v>qOPh5oL=_fsW@D?ukqB}<@D!S8_D>m zwtsz`3SZi1_x)VhZ>jjFmVb%;GW;ClmV#yf*QI^e|9{XYzLaqLdB)9$U5~g?)PWvl zNm^ObyVG4K{XXrRds6j3o74mZr}}^0mL-9K(eLN^Q<~~e+(MJGl!xz0oHk9`#rwSv zHuPovLZm44S$`QK#NDlHcEs; zE+eLK#+|+G`)31-OtVvfpuylaTuU-$O}kyjOS->axkW4w8UDfCRiIjiRu4!F%Tuin z6F>o(q!jer{uFH>bLI-VJUK=!s)?J^zDW=?=?1AVGSiMnJ=pLvOQ-i7N*D8G!pCJZ zH8&gM%}u`)NxB~mIQFscp+dP^>Jks7?yD_t$b|>#bWcBJ3qlusyGjR~bK=h*F%=rQ z11Q`sw9v_wV7YubAe*p0&6>$$qNx2x6yECjonzd*$Kv(U_I%EMhW*zom_q^*6jvfV zjWHvFFm)=!V=8mMb|ucDjzJPg$@GvOWrvlvUh+nO|2f3~q8=`pHDlb-`(QH4h{Vfi!b6(k&|;W5Wcv!~T@@nGcEUWhDY(dTn)rAH81X+&Qb8 zj8WUG9cvv7VIX~rB>08_GNg;f?Se8toT!1YVwzGHIK1h2m$g;99#V+g%`axN>w#<| zSQG{w4@sJb))EV6$-VDe0-00k)wTGsQ~p@Pj}F+t$xsWmw!=oYR4W0?5V}AG@<|l7 zmRgSs#d?9VuIEyVcg_UUk}d`*XYQ2vssnmgc1e6?*Sv@YJ*3hsE8{P{iS9ICs~ zNt7A6aR6tQ>D-&hB`gUWODIeIVPNZpA8AR~gwIRdo`pvA-l}mjHf@SItQ>bjv)gE2 z$r}3+)|!d)K=^zub#vO!<5||hU-gF|t|-oi+Sm(QcHuWutr|8!?1fFc8h)47Hr}3v z-3xOJ{7)QWFL!4>Ibz!w$zL^;x;9^|J9!5534kx z$Q)D6gdT^6F;O`j{Jj}39_Sw8WEWHJrHnF%*&Yd_YqHnUKEvfJRDD^!U2~hnVC}4# z1bnk?;Fa}4mjwqRoLw*W<&o;*Ey%#JlBgoj9Jt~0KID@iq1#7^&7`2D7S^-#7V%5S z$V~p&o)(=jH0T{>WNn>h=PovcD;`j-*Drf>0OVp>GFeJ4artc>(pl?UNm>v0tBtq9 zTn~}l;c-*#0%vYh(xtP<5LZ}NUI*X&TvWUky(zX*-Jc%S#t1mzLm8A@Q9s{h^47&V zs`s~~w(MoQ7+Z^CV~~3lXD{k4m3B&)={)Afn#NrUDm4YCEnf?pEEIkGfxHoL*Dtr( zAa{_8(T~(s0dgm149)FuX_@Y*OQw@}zt~KRS(rL1u~v^0tpH44vR31ypBP72cP-J+ zLZkDkP6{EZ9s~&=3eRIm!u4=_Xae;Z`lpSEQJ{eIb_q@uQ@E0$&}D>+)RB1HEl|=M 
zQH63@F3b3m6H1n{Z&7U0I!bZ*mzsr}~>7eRpz%EH4!JDA#&kLWAUxLJn~Y^XUb{q3kA*x)jsGMCM8S!m4- zR>er@a2(pszuC*QFYU81^e{bfy>a^v(#Wg5HH?=n+nEgSo9!Fi3VS2Xfo*9XdmIF# zGMzVi_Grym)rq9S*!g(dBaP~*6GEzGjWd08RbIl&Jf+Ph-;NdB<5@2t?E-jj14`wzn*!PAv^N5M%+^7N9q@%R2(ulju538UxKo^r*Fgpv6Xlt|1I`_(2OP=X4f$o&1sZCvgi@b05rl{kw7#B&r3be&51oin^7%T!r_?_HaL0+|fVjBIjCzf0rh9-=s?{hRzK|4)sV6qSl*LrJ)l$`?K(hq^1jr+&?VcOV)u$=Jv+GE@o9qQdh~6#O3|zfFQ&3Z(Cw`EAK0X?MZy5L)D6V${~;wDqQg|W%sm_Ug#m}?>$9K z!M8{FC+1IeeFNjCIM>n6ig$2sht?o38`(ie(oQjiR-Lg!BW{&pzJxbg=SQ#(hTcn2 zoR_LIvh17{@YZa_aJfndBhEg7z^Nv=}cl6otWV z5!BIWJ9Vjzrs{%&-6EZWan5?oC2XK-<7UkvoNwSHplYC}7+TrtS`GbqtUoU1`{3#s z$$9^aurkRYxC17f;5;zx!1)_bt9$V!bvQk|C^c9KIk3l16EhfGhtPj5@LWFLZQV3%Po2Ppv7~WBQ&M}*=tUF)l8FUriuCnv>VE$W!|^Z&mYJAYS@(J9D-E4 zeRLEdR|AdX0R?LfwwXI~pi3uM4ZRLnr@+>hr?{I~b)=!%=<0vl?E`Sqb9F)(l|oqq z_OQ;WlUXlsnAvEZ;)pE2=9w*n_~2&50XO;Q_B@cB%RK9qREt2r`#P4cDN!*ui73~w zIcrgufTnreNT4%*VhSiuEy69)sY$Mb8N=$aWNvdE{wvWCIzK(1pO?{i%kA2Qc-d51 zP4XWoZ)G8Ups!q$A`3c8^8>ezjX!Z)`&^iC3{fUHyXht%lfRz(oge#OS8SC8w ztpgqgvj0hbp-;KagYTB?jb5-{3z?5BKj*nLrx|K?SgnA{J8%Alu|>8Qh@RA6pDl{L z9S`wQhb6roJ~E?@?s*hZB;Pq1Zb2^4*WbWEOR+4}#K~e+Tlq7)L@3h@^{wJptoKq= ziyx7qdPR|z#g$L@C|dD@UCf$!##%k(S4k-WZOrM>#qJ343KK)>Hhjn8~$khM%yv<~7#6 zE^PuW#?rco1MpFU75TZeJI7FgysHIcn?_PaW@qDXQ1kGcWhGsCj8WQoRlg~`%}fyK z(AXO`L9X4Sj9KneKixvutBAUYsRXvxX!eROvpU`~BkzoPE6jJYn%CO5iT=L5^G|R> zY_WmARco_a#!5ww53HxUNoTds(ihEyLG9Szuv>#zY-O$B+W@90_@+Brez){rd$~h} ztYfrjP7a+#R*3t`(?7WATQwsdqW48JuQ|w(mjN&}*Uq?I-ko#|2vCwX!dX7P(n4op z;=z1}&5N;^(Jsb8M6>gCxo#hu{yvoP zkF>WT_Vr6M<7sZQ50Kd(kU4vllgM9jp#{lT61g4XmpQYDl$f@$T>aI`_?QIyU}7#q zJMy~**TP?QXg3v#wxBIoGmJ8$=2^->Az7={S_TZb-6+# z6MAuv6LUbSj%~`w-Qq7VF-l&>dZHPqSFi`Y5;Vgr0Y0=bT}tymq%KkC06e{;tXUxY z3^#q;cv=bENq#6HsyFh=?<70vNP>l;vL)f04DiQ5o|A&FDd<9LI52(0KueK)iFE>v zFNg9xcqe$?(w~}vTEPx{H>4rirSlWK*(xfG1qi9ljj;`Qz@~Ev-bCaaC2Cjw^?$C> z!{hc+8yJe|YEn&n#X!r|?4eK?J*}zj&4rdO*?NpefFM=dU#447UOdB58~o!T$y@AC z3d0?dCHnJHp4W{c{C7;AX4+1g#p#vzPyiHoQa-%@<0^!KiOIYLPcvO5+?BuPdh8D4 
zwI6{WsO~mpw!APm-j_^K%0n9zI{^FA8F$%{Z-_PVO*wut9diZMHwk~^kRf)LtqfTW z#_@qhW`qxsi~UZ&h%#4U7(SWU@lMG4Os;4aau3CS_o8t4aQzPPR z^jG(sXNGhOk>OYZ`_g0Jto4rg9`SA*7B+FIdPo)C^xjprd?_CbW965qlR#ZMmOm4R zrWQnJzO~Pi&lc(xj%->|rtpN(P|}Wf)iI$*O{*@?^}7WX`Gi$6`BvJEwZL}Q%(cJ( zQL~v^r7w`-C4EwcZ028+dlIv-Cwld^u~HHs4TLg0k*n+_nuPOF=R*u)pD`H!v*bDm z#o`^6dt>!KJ|5lItadCsD&%Acc6ic3Aral_}TM3?$V?h(5|xiGrd zSlJ637iup2aLd|G-K~h&T~?_+Q&6`0D9%_(mXKAq`ZcD@^{u@FouD~#7C~33f(*~e z&LIipzzlJygi6cJ)2XoB{2I&r`e2jwRV*aZ1<7Bj^ il|EA$>2siOAAlmp#-SK!6=5yb1aDye?S!YUTlZgkDF9di literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/species_mtx.rds b/pandas/tests/io/data/rdata/species_mtx.rds new file mode 100644 index 0000000000000000000000000000000000000000..aa9ebe379e50a81a2dc3ad5a7b3bef3d04154e0b GIT binary patch literal 1907 zcmV-(2aNb1iwFP!000001MOEyY#c=tt?n7yGoHmx2ni%1nUEkQI0UgNVG*R=)0t`9 z(>-+e*q#6)PHe&wJH|0W9FQUrApzoo5C|c$h)saRc0%ET1RTN^Aub3B5JwQoB^*)U z{jRE>>Yg#d1a6e{yQ+Tu|Lgx(zm}dHWQ=84rXSRYChKd3doJ6B#wm~v(pZ+|Q5PRr zn3})i?zzR8gEI@MvF!e-BQt_miuk-f!+AQ|nIF*hK${sac~QRrs))A6l0}c4ON?m+ zW&9@40H`Ho)4EhPEMa@OC7bG}?Uh7*XAd8gk~xOYV2`oBW%^T2`J(ln@2`A0=aR;R z%7CahD7;?SQ!cbYmx#X1hgaD0^5H^dd^acunqW)k!&0`FTe7Kk+TJQrA5(Z4)>Sg! z;PW8pNnzuBF|ZmmB-+xpj6bW)FW3DMr9TE*30nUb{%=k7v+Uo=k+cn$^R#Cb|8m`G zS;C;7o%_0^=Ra3_|D^QV7e!HDbQay6pl?ARFz&nDN5?^zi?*)Rln;d4fWjlZUlM(? 
z@i=H5=m=YSU(^+O8N)xx2Q~H_>(2Q(ZdbofnXkaQO73CEz+e3t#^b9B*&i9tsT`Ac z5TC;la`Ju=`$E61)7NHEKLPrQb?{PTBN>NvFpzt#sL08p`rPa6y+CPSBYcwu_Vf=~ zbcUCY3th^K-QDwS{2gw54>z`;vlcWdM83esWc=QdYejd+yx>)mwse^zgvocPkso^c zZd78>ofMzi*Y=3>LPbu+M`E%?;Tg&p(Hgi+=NzzHcLU>i?_+#@Q~vBk?AedOI*HC_ zpc^?qLorY;t!8oVN48&QJV#b0{mGDiUbOLk(6Bc8cJynC|NSh!kMy-HxhJT7NQp_# zDc?OZAC;Vn$|gnUX4cskWD|EqmiFf`aClGXH9>(ghp=>3+OvO)^`H+ z=b#Z0cOQ>IJHYtfZwXr>r*l{E`ABX6cZUwSKGJm++zEq%u>Zc|b6#L1eU-5iU7_&$ zgpRJz?FtSb30qPYXJ6r0)Y}-dX#FT(zhr#BG9e>yc=9fm%r~|4x{v=(QO~L#cT`mV zi+5D7BHu$#oki|lxU`6HUq$;J7QZ_(KjTC1c$tsVZ+g4ad3GsFY_XL%z zL2t4+AGDVR|0~eF+_vgl|2*p7ShVH^$Y<|?T7z}YSt`o?k^LkUGv%-P){$qT>Ti$t zJX@j2QoUX9#XnPq-pN?g!rKBP+NVvHEYP)ggN?IeJ@hlvU_HwDBTu7GcM`p&u)g{! zZae!fYWhb#A?6l$!20LWrFYXXw#gzp-UC(_BV&R%_s@qav8>#G>Q<4ATti_Y>2#kmh+vwUtHe0`5H2VY0Mg!?(G z{;OxE@1E_r%SO4236UH`=B8$6DD3lu+!+o5WVwuPO~$PW(Hai!-@kZxaq3WIt2H@d zO>7hVe$(IDv}A7}>cxYVl+s9rOOFUKDW{u$JfSOF<1Nuz+p)5E*TT%y;>_I4bmhqW z{0^z#=lGW&m=>La8&(c4RKS{Blq|;-tV0K+`D#@vhh$s85J7h|PBwxr^G9z;vwtWM z4o73$VA(K3+x42E<+p3LErB;0&AL5m8*$$a>)u{Z_Vb2sn042#O2(iOz=CI2Ey>9m z<5nQmN9}RL8IR2sM}5OJ>$cbQVhbxxt64W3BWT)Hui^QaUjo8MRN*|76O6K1_uQ&) zoAt2Xj^`aT9Xqrf#|UG^s@+T|4q7nfs*d+tfWzsGyP;Qejj$ebYKYZWZItSsxLDZ$@x|@v^$fR0>d}cc~J;k{wM+J$8zOX)$&}} zN=#qGido>aP0O(2RI5SH^crA>@fr;oK2poI)9fk`1)M2%*#gR8T6nl-c{Rrig0vCG zG7Z=BQ|yvsb=xSqURW`Lz^=KL+-;?%wb$O8HX3+-6@jGKD_Q~IGipet)NBS4(;_s2 z&VCt=Hk66Q3={nPsNYU7N?0VHfHV{&!E`)71#iJ@n?^OEJhTU51Y%OM2q(vCUP5#b zVJBT#adXS9S$@NCu{a4)2WgVvl#xr6rAcn7VbokNB|6kVTGbJKlAUkTA*7n51nn|8 tMBI?Jd4gRGkZ!f4fnsQk8DmNHLdzLXVG$pw|KBO>%%7I+L+6DT007bjz$gF! 
literal 0 HcmV?d00001 diff --git a/pandas/tests/io/rdata/test_pyreadr.py b/pandas/tests/io/rdata/test_pyreadr.py new file mode 100644 index 0000000000000..6aa6840fc0499 --- /dev/null +++ b/pandas/tests/io/rdata/test_pyreadr.py @@ -0,0 +1,596 @@ +from io import BytesIO +import os +from urllib.error import HTTPError + +import pytest + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.rdata import read_rdata + +pyreadr = pytest.importorskip("pyreadr") + + +ghg_df = DataFrame( + { + "gas": { + 141: "Carbon dioxide", + 142: "Methane", + 143: "Nitrous oxide", + 144: "Fluorinated gases", + 145: "Total", + }, + "year": {141: 2018, 142: 2018, 143: 2018, 144: 2018, 145: 2018}, + "emissions": { + 141: 5424.881502132882, + 142: 634.4571270782675, + 143: 434.52855537666636, + 144: 182.78243246177678, + 145: 6676.649617049592, + }, + } +).rename_axis("rownames") + +plants_df = DataFrame( + { + "plant_group": { + 16: "Pteridophytes", + 17: "Pteridophytes", + 18: "Pteridophytes", + 19: "Pteridophytes", + 20: "Pteridophytes", + }, + "status": { + 16: "Data Deficient", + 17: "Extinct", + 18: "Not Threatened", + 19: "Possibly Threatened", + 20: "Threatened", + }, + "count": {16: 398, 17: 65, 18: 1294, 19: 408, 20: 1275}, + } +).rename_axis("rownames") + +sea_ice_df = DataFrame( + { + "year": {1012: 2016, 1013: 2017, 1014: 2018, 1015: 2019, 1016: 2020}, + "mo": {1012: 12, 1013: 12, 1014: 12, 1015: 12, 1016: 12}, + "data.type": { + 1012: "Goddard", + 1013: "Goddard", + 1014: "Goddard", + 1015: "Goddard", + 1016: "NRTSI-G", + }, + "region": {1012: "S", 1013: "S", 1014: "S", 1015: "S", 1016: "S"}, + "extent": {1012: 8.28, 1013: 9.48, 1014: 9.19, 1015: 9.41, 1016: 10.44}, + "area": {1012: 5.51, 1013: 6.23, 1014: 5.59, 1015: 6.59, 1016: 6.5}, + } +).rename_axis("rownames") + + +@pytest.fixture(params=["rda", "rds"]) +def rtype(request): + return request.param + + +@pytest.fixture(params=[None, False, "gzip"]) +def ok_comp(request): + return request.param + + 
+@pytest.fixture(params=[True, "bzip2", "xz"]) +def bad_comp(request): + return request.param + + +def adj_int(df): + """ + Convert int32 columns to int64. + + Since pyreadr engine reads ints int int32 and writes ints + to floats this method converts such columns for testing. + """ + int_cols = df.select_dtypes("int32").columns + df[int_cols] = df[int_cols].astype("int64") + + if "index" in df.columns: + df["index"] = df["index"].astype("int64") + + if "year" in df.columns: + df["year"] = df["year"].astype("int64") + if "mo" in df.columns: + df["mo"] = df["mo"].astype("int64") + + return df + + +# RDA READER + +# PATH_OR_BUFFER + + +def test_read_rds_file(datapath): + filename = datapath("io", "data", "rdata", "ghg_df.rds") + r_df = read_rdata(filename, engine="pyreadr") + r_df = adj_int(r_df) + + tm.assert_frame_equal(ghg_df, r_df.tail()) + + +def test_read_rda_file(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="pyreadr") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_bytes_read_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + r_df = read_rdata(f.read(), file_format="rds", engine="pyreadr") + + r_df = adj_int(r_df) + + tm.assert_frame_equal(sea_ice_df, r_df.tail()) + + +def test_bytes_read_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + r_dfs = read_rdata(f.read(), file_format="rda", engine="pyreadr") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + 
tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_bytesio_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_df = read_rdata(b_io, file_format="rds", engine="pyreadr") + + r_df = adj_int(r_df) + + tm.assert_frame_equal(sea_ice_df, r_df.tail()) + + +def test_bytesio_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_dfs = read_rdata(b_io, file_format="rda", engine="pyreadr") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +# FILE FORMAT + + +def test_read_wrong_format(datapath): + with pytest.raises(ValueError, match="not a valid value for file_format"): + filename = datapath("io", "data", "rdata", "plants_df.rds") + read_rdata(filename, engine="pyreadr", file_format="r") + + +def test_read_wrong_file(): + with pytest.raises(FileNotFoundError, match="file cannot be found"): + filename = os.path.join("data", "rdata", "plants_df.rda") + read_rdata(filename, engine="pyreadr") + + +def test_read_rds_non_df(datapath): + from pyreadr import custom_errors + + with pytest.raises( + custom_errors.LibrdataError, + match="Invalid file, or file has unsupported features", + ): + filename = datapath("io", "data", "rdata", "ppm_ts.rds") + read_rdata(filename, engine="pyreadr") + + +def test_read_rda_non_dfs(datapath): + from pyreadr import custom_errors + + with pytest.raises( + custom_errors.LibrdataError, + match="Invalid file, or file has unsupported features", + ): + filename = datapath("io", "data", "rdata", 
"env_data_non_dfs.rda") + read_rdata(filename, engine="pyreadr") + + +def test_read_not_rda_file(datapath): + from pyreadr import custom_errors + + with pytest.raises( + custom_errors.LibrdataError, match="The file contains an unrecognized object" + ): + filename = datapath("io", "data", "rdata", "ppm_df.csv") + read_rdata(filename, file_format="rda", engine="pyreadr") + + +def test_bytes_read_infer_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f.read(), engine="pyreadr") + + +def test_bytes_read_infer_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f.read(), engine="pyreadr") + + +# URL + + +@tm.network +def test_read_rda_url(): + url_df = DataFrame( + { + "carrier": {1: "9E", 2: "AA", 3: "AS", 4: "B6", 5: "DL"}, + "name": { + 1: "Endeavor Air Inc.", + 2: "American Airlines Inc.", + 3: "Alaska Airlines Inc.", + 4: "JetBlue Airways", + 5: "Delta Air Lines Inc.", + }, + } + ).rename_axis("rownames") + + url = ( + "https://github.com/hadley/nycflights13/blob/master/data/airlines.rda?raw=true" + ) + r_dfs = read_rdata(url, file_format="rda", engine="pyreadr") + + tm.assert_frame_equal(url_df, r_dfs["airlines"].head()) + + +@tm.network +def test_read_unable_infer_format(): + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + url = ( + "https://github.com/hadley/nycflights13/" + "blob/master/data/airlines.rda?raw=true" + ) + read_rdata(url, engine="pyreadr") + + +@tm.network +def test_read_wrong_url(): + with pytest.raises(HTTPError, match="HTTP Error 404: Not Found"): + url = "https://example.com/data.rdata" + read_rdata(url, engine="pyreadr") + + +# S3 + + +@tm.network +@pytest.mark.slow +def 
test_read_rda_s3(): + s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" + s3_df = DataFrame( + { + "Alcohol": {1: 13.2, 2: 13.16, 3: 14.37, 4: 13.24, 5: 14.2}, + "Malic.acid": {1: 1.78, 2: 2.36, 3: 1.95, 4: 2.59, 5: 1.76}, + "Ash": {1: 2.14, 2: 2.67, 3: 2.5, 4: 2.87, 5: 2.45}, + "Alcalinity.of.ash": {1: 11.2, 2: 18.6, 3: 16.8, 4: 21.0, 5: 15.2}, + "Magnesium": {1: 100, 2: 101, 3: 113, 4: 118, 5: 112}, + "Total.phenols": {1: 2.65, 2: 2.8, 3: 3.85, 4: 2.8, 5: 3.27}, + "Flavanoids": {1: 2.76, 2: 3.24, 3: 3.49, 4: 2.69, 5: 3.39}, + "Nonflavanoid.phenols": {1: 0.26, 2: 0.3, 3: 0.24, 4: 0.39, 5: 0.34}, + "Proanthocyanins": {1: 1.28, 2: 2.81, 3: 2.18, 4: 1.82, 5: 1.97}, + "Color.intensity": {1: 4.38, 2: 5.68, 3: 7.8, 4: 4.32, 5: 6.75}, + "Hue": {1: 3.4, 2: 3.17, 3: 3.45, 4: 2.93, 5: 2.85}, + "Proline": {1: 1050, 2: 1185, 3: 1480, 4: 735, 5: 1450}, + } + ).rename_axis("rownames") + r_dfs = read_rdata(s3, engine="pyreadr") + r_dfs["wine"] = adj_int(r_dfs["wine"]) + + # pyreadr remove dots in colnames + r_dfs["wine"].columns = r_dfs["wine"].columns.str.replace(" ", ".") + + tm.assert_frame_equal(s3_df, r_dfs["wine"].head()) + + +# ENGINE + + +def test_read_rds_df_output(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="pyreadr") + + assert isinstance(r_df, DataFrame) + + +def test_read_rda_dict_output(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="pyreadr") + + assert isinstance(r_dfs, dict) + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + +def test_read_wrong_engine(datapath): + with pytest.raises(ValueError, match="not a supported engine"): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + read_rdata(filename, engine="rpy2") + + +# MODE + +# IGNORED OPTION FOR pyreadr ENGINE + + +# USE_OBJECTS + + +def test_read_select_frames_rda_dfs(datapath): + filename = datapath("io", "data", 
"rdata", "env_data_dfs.rda") + r_dfs = read_rdata( + filename, engine="pyreadr", select_frames=["ghg_df", "sea_ice_df"] + ) + + assert "plants_df" not in list(r_dfs.keys()) + assert "ghg_df" in list(r_dfs.keys()) + assert "sea_ice_df" in list(r_dfs.keys()) + + +def test_read_wrong_select_frames(datapath): + with pytest.raises(TypeError, match="not a valid type for select_frames"): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + read_rdata(filename, engine="pyreadr", select_frames="plants_df") + + +# ROWNAMES + + +def test_read_rownames_true_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="pyreadr", rownames=True) + + assert r_df.index.name == "rownames" + + +def test_read_rownames_false_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="pyreadr", rownames=False) + + assert r_df.index.name != "rownames" + + +def test_read_rownames_true_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="pyreadr", rownames=True) + + assert r_dfs["ghg_df"].index.name == "rownames" + assert r_dfs["plants_df"].index.name == "rownames" + assert r_dfs["sea_ice_df"].index.name == "rownames" + + +def test_read_rownames_false_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="pyreadr", rownames=False) + + assert r_dfs["ghg_df"].index.name != "rownames" + assert r_dfs["plants_df"].index.name != "rownames" + assert r_dfs["sea_ice_df"].index.name != "rownames" + + +# ENCODING + + +def test_non_utf8_data(datapath, rtype): + filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") + with pytest.raises(UnicodeDecodeError, match=("'utf-8' codec can't decode byte")): + read_rdata(filename, engine="pyreadr") + + +# RDA WRITER + +# PATH_OR_BUFFER + + +def test_write_read_file(rtype): + with 
tm.ensure_clean("test.out") as path: + ghg_df.to_rdata(path, file_format=rtype, engine="pyreadr", index=False) + r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr", rownames=False) + + expected = ghg_df.reset_index(drop=True) + output = ( + adj_int(r_dfs["pandas_dataframe"]) if rtype == "rda" else adj_int(r_dfs) + ) + + tm.assert_frame_equal(output, expected) + + +def test_write_read_pathlib(rtype): + from pathlib import Path + + with tm.ensure_clean_dir() as tmp_dir: + tmp_file = Path(tmp_dir).joinpath("test.out") + sea_ice_df.to_rdata(tmp_file, file_format=rtype, engine="pyreadr", index=False) + r_dfs = read_rdata( + tmp_file, file_format=rtype, engine="pyreadr", rownames=False + ) + + expected = sea_ice_df.reset_index(drop=True) + output = ( + adj_int(r_dfs["pandas_dataframe"]) if rtype == "rda" else adj_int(r_dfs) + ) + + tm.assert_frame_equal(output, expected) + + +def test_write_read_filelike(rtype): + with BytesIO() as b_io: + sea_ice_df.to_rdata(b_io, file_format=rtype, engine="pyreadr", index=False) + r_dfs = read_rdata( + b_io.getvalue(), file_format=rtype, engine="pyreadr", rownames=False + ) + + expected = sea_ice_df.reset_index(drop=True) + output = ( + adj_int(r_dfs["pandas_dataframe"]) if rtype == "rda" else adj_int(r_dfs) + ) + + tm.assert_frame_equal(output, expected) + + +# FILE FORMAT + + +def test_write_wrong_format(): + with tm.ensure_clean("test.rda") as path: + with pytest.raises(ValueError, match=("not a valid value for file_format")): + ghg_df.to_rdata(path, engine="pyreadr", file_format="csv") + + +def test_write_unable_to_infer(): + with tm.ensure_clean("test") as path: + with pytest.raises( + ValueError, match=("Unable to infer file format from file name") + ): + ghg_df.to_rdata(path, engine="pyreadr") + + +# ENGINE + + +def test_write_wrong_engine(): + with tm.ensure_clean("test.rda") as path: + with pytest.raises(ValueError, match=("not a supported engine")): + ghg_df.to_rdata(path, engine="rpy2") + + +# MODE + +# IGNORED 
OPTION FOR pyreadr ENGINE + + +# INDEX + + +def test_index_true(rtype): + with tm.ensure_clean("test.out") as path: + plants_df.rename_axis(None).to_rdata( + path, file_format=rtype, engine="pyreadr", index=True + ) + r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr") + + r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + + assert "index" in r_df.columns + + +def test_index_false(rtype): + with tm.ensure_clean("test.out") as path: + plants_df.rename_axis(None).to_rdata( + path, file_format=rtype, engine="pyreadr", index=False + ) + r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr") + + r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + + assert "index" not in r_df.columns + + +# ASCII + +# IGNORED OPTION FOR pyreadr ENGINE + + +# COMPRESS + + +def test_compress_ok_comp(rtype, ok_comp): + with tm.ensure_clean("test.out") as path: + ghg_df.to_rdata( + path, file_format=rtype, engine="pyreadr", compress=ok_comp, index=False + ) + r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr", rownames=False) + + expected = ghg_df.reset_index(drop=True) + output = ( + adj_int(r_dfs["pandas_dataframe"]) if rtype == "rda" else adj_int(r_dfs) + ) + + tm.assert_frame_equal(output, expected) + + +def test_compress_bad_comp(rtype, bad_comp): + from pyreadr import custom_errors + + with tm.ensure_clean("test.out") as path: + with pytest.raises( + custom_errors.PyreadrError, + match=(f"compression {bad_comp} not implemented!"), + ): + ghg_df.to_rdata( + path, + file_format=rtype, + engine="pyreadr", + index=False, + compress=bad_comp, + ) + + +def test_compress_zip(rtype): + with tm.ensure_clean("test.out") as path: + with pytest.raises(ValueError, match=("not a supported value for compress")): + ghg_df.to_rdata( + path, file_format=rtype, engine="pyreadr", index=False, compress="zip" + ) + + +# OTHER_FRAMES + +# IGNORED OPTION FOR pyreadr ENGINE + + +# RDA_NAMES + + +def test_new_rda_name(): + with tm.ensure_clean("test.rda") as 
path: + ghg_df.to_rdata(path, engine="pyreadr", rda_names=["py_df"]) + r_dfs = read_rdata(path, engine="pyreadr") + + assert "py_df" in list(r_dfs.keys()) + + +def test_type_rda_name(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises(TypeError, match=("not a valid type for rda_names")): + ghg_df.to_rdata(path, engine="rscript", rda_names="py)df") diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py new file mode 100644 index 0000000000000..660187d3c2a1b --- /dev/null +++ b/pandas/tests/io/rdata/test_rscript.py @@ -0,0 +1,972 @@ +from io import BytesIO +import os +import subprocess +from urllib.error import HTTPError + +import pytest + +from pandas.compat._optional import import_optional_dependency +import pandas.util._test_decorators as td + +from pandas import DataFrame +import pandas._testing as tm + +from pandas.io.rdata import ( + RSCRIPT_EXISTS, + RScriptError, + read_rdata, +) + +pytestmark = pytest.mark.skipif(not RSCRIPT_EXISTS, reason="R is not installed.") + +ghg_df = DataFrame( + { + "gas": { + "141": "Carbon dioxide", + "142": "Methane", + "143": "Nitrous oxide", + "144": "Fluorinated gases", + "145": "Total", + }, + "year": {"141": 2018, "142": 2018, "143": 2018, "144": 2018, "145": 2018}, + "emissions": { + "141": 5424.88150213288, + "142": 634.457127078267, + "143": 434.528555376666, + "144": 182.782432461777, + "145": 6676.64961704959, + }, + } +).rename_axis("rownames") + +plants_df = DataFrame( + { + "plant_group": { + "16": "Pteridophytes", + "17": "Pteridophytes", + "18": "Pteridophytes", + "19": "Pteridophytes", + "20": "Pteridophytes", + }, + "status": { + "16": "Data Deficient", + "17": "Extinct", + "18": "Not Threatened", + "19": "Possibly Threatened", + "20": "Threatened", + }, + "count": {"16": 398, "17": 65, "18": 1294, "19": 408, "20": 1275}, + } +).rename_axis("rownames") + +sea_ice_df = DataFrame( + { + "year": {"1012": 2016, "1013": 2017, "1014": 2018, "1015": 2019, "1016": 2020}, 
+ "mo": {"1012": 12, "1013": 12, "1014": 12, "1015": 12, "1016": 12}, + "data.type": { + "1012": "Goddard", + "1013": "Goddard", + "1014": "Goddard", + "1015": "Goddard", + "1016": "NRTSI-G", + }, + "region": {"1012": "S", "1013": "S", "1014": "S", "1015": "S", "1016": "S"}, + "extent": { + "1012": 8.28, + "1013": 9.48, + "1014": 9.19, + "1015": 9.41, + "1016": 10.44, + }, + "area": {"1012": 5.51, "1013": 6.23, "1014": 5.59, "1015": 6.59, "1016": 6.5}, + } +).rename_axis("rownames") + + +def r_package_installed(name): + """ + Check if R package is installed. + + Method runs a quick command line call to Rscript to + check if library call succeeds on named package. + """ + + p = subprocess.Popen( + ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, err = p.communicate() + + return len(err) == 0 + + +def run_rscript(cmds) -> str: + """ + Run R script at command line. + + This method will read write_rdata output and check + console output. + """ + + r_batch = """ + args <- commandArgs(trailingOnly=TRUE) + + switch(args[2], + "rda" = load(args[1]), + "rds" = { + pandas_dataframe <- readRDS(args[1]) + } + ) + + rm(args) + mget(ls()) + """ + with open(cmds[1], "w") as f: + f.write(r_batch) + + p = subprocess.Popen( + cmds, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + output, error = p.communicate() + if len(error) != 0: + raise ValueError(error.decode("UTF-8")) + + return output.decode("UTF-8") + + +def adj_int(df): + """ + Convert int32 columns to int64. + + Since parquet and feather modes parses ints int int32, + this method converts for testing. 
+ """ + for col in df.select_dtypes("int32").columns: + df[col] = df[col].astype("int64") + + return df + + +def handle_index_rownames(df): + df = df.drop(["rownames"], axis=1).set_index("index").rename_axis(None) + + return df + + +R_ARROW = r_package_installed("arrow") +R_RSQLITE = r_package_installed("RSQLite") +PYARROW = import_optional_dependency("pyarrow") + + +@pytest.fixture(params=["rda", "rds"]) +def rtype(request): + return request.param + + +@pytest.fixture( + params=[ + "csv", + pytest.param( + "parquet", + marks=pytest.mark.skipif( + R_ARROW is None or PYARROW is None, + reason="R arrow or pyarrow not installed", + ), + ), + pytest.param( + "feather", + marks=pytest.mark.skipif( + R_ARROW is None or PYARROW is None, + reason="R arrow or pyarrow not installed", + ), + ), + pytest.param( + "sqlite", + marks=pytest.mark.skipif( + R_RSQLITE is None, reason="R RSQLite not installed" + ), + ), + ] +) +def mode(request): + return request.param + + +@pytest.fixture(params=[True, False, None]) +def ascii(request): + return request.param + + +@pytest.fixture(params=[False, "gzip", "bzip2", "xz"]) +def comp(request): + return request.param + + +# RDA READER + +# PATH_OR_BUFFER + + +def test_read_rds_file(datapath): + filename = datapath("io", "data", "rdata", "ghg_df.rds") + r_df = read_rdata(filename, engine="rscript") + + tm.assert_frame_equal(ghg_df, r_df.tail()) + + +def test_read_rda_file(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript") + + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_buffer_read_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + r_df = read_rdata(f, file_format="rds", engine="rscript") 
+ + r_df = adj_int(r_df) + + tm.assert_frame_equal(sea_ice_df, r_df.tail()) + + +def test_bytes_read_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + r_dfs = read_rdata(f.read(), file_format="rda", engine="rscript") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_bytesio_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_df = read_rdata(b_io, file_format="rds", engine="rscript") + + r_df = adj_int(r_df) + + tm.assert_frame_equal(sea_ice_df, r_df.tail()) + + +def test_bytesio_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_dfs = read_rdata(b_io, file_format="rda", engine="rscript") + + r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +# FILE FORMAT + + +def test_read_wrong_format(datapath): + with pytest.raises(ValueError, match="not a valid value for file_format"): + filename = datapath("io", "data", "rdata", "plants_df.rds") + read_rdata(filename, engine="rscript", file_format="r") + + +def test_read_wrong_file(): + with pytest.raises(FileNotFoundError, match="file cannot be found"): + filename = os.path.join("data", "rdata", "plants_df.rda") + read_rdata(filename, engine="rscript") + + +@pytest.mark.slow +def test_read_rds_non_dfs(datapath, mode): + with 
pytest.raises( + ValueError, match="No actual data frame or coercible data frames" + ): + filename = datapath("io", "data", "rdata", "ghg_t_tests.rds") + read_rdata(filename, engine="rscript", mode=mode) + + +@pytest.mark.slow +def test_read_rda_non_dfs(datapath, mode): + with pytest.raises( + ValueError, match="No actual data frame or coercible data frames" + ): + filename = datapath("io", "data", "rdata", "env_data_non_dfs.rda") + read_rdata(filename, engine="rscript", mode=mode) + + +def test_read_not_rda_file(datapath, mode): + with pytest.raises(RScriptError, match="bad restore file magic number"): + read_rdata( + datapath("io", "data", "rdata", "ppm_df.csv"), + file_format="rda", + engine="rscript", + mode=mode, + ) + + +def test_read_not_rds_file(datapath, mode): + with pytest.raises(RScriptError, match="unknown input format"): + read_rdata( + datapath("io", "data", "rdata", "ppm_df.csv"), + file_format="rds", + engine="rscript", + mode=mode, + ) + + +def test_bytes_read_infer_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f.read(), engine="rscript") + + +def test_bytes_read_infer_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f.read(), engine="rscript") + + +# URL + + +@tm.network +def test_read_rda_url(): + url_df = DataFrame( + { + "carrier": {"1": "9E", "2": "AA", "3": "AS", "4": "B6", "5": "DL"}, + "name": { + "1": "Endeavor Air Inc.", + "2": "American Airlines Inc.", + "3": "Alaska Airlines Inc.", + "4": "JetBlue Airways", + "5": "Delta Air Lines Inc.", + }, + } + ).rename_axis("rownames") + + url = ( + "https://github.com/hadley/nycflights13/blob/master/data/airlines.rda?raw=true" + ) + r_df = read_rdata(url, 
file_format="rda", engine="rscript")["airlines"] + + tm.assert_frame_equal(url_df, r_df.head()) + + +@tm.network +def test_read_unable_infer_format(): + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + url = ( + "https://github.com/hadley/nycflights13/" + "blob/master/data/airlines.rda?raw=true" + ) + read_rdata(url, engine="rscript") + + +@tm.network +def test_read_wrong_url(): + with pytest.raises(HTTPError, match="HTTP Error 404: Not Found"): + url = "https://example.com/data.rdata" + read_rdata(url, engine="rscript") + + +# S3 + + +@tm.network +@pytest.mark.slow +def test_read_rda_s3(): + s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" + s3_df = DataFrame( + { + "Alcohol": {"1": 13.2, "2": 13.16, "3": 14.37, "4": 13.24, "5": 14.2}, + "Malic acid": {"1": 1.78, "2": 2.36, "3": 1.95, "4": 2.59, "5": 1.76}, + "Ash": {"1": 2.14, "2": 2.67, "3": 2.5, "4": 2.87, "5": 2.45}, + "Alcalinity of ash": { + "1": 11.2, + "2": 18.6, + "3": 16.8, + "4": 21.0, + "5": 15.2, + }, + "Magnesium": {"1": 100, "2": 101, "3": 113, "4": 118, "5": 112}, + "Total phenols": {"1": 2.65, "2": 2.8, "3": 3.85, "4": 2.8, "5": 3.27}, + "Flavanoids": {"1": 2.76, "2": 3.24, "3": 3.49, "4": 2.69, "5": 3.39}, + "Nonflavanoid phenols": { + "1": 0.26, + "2": 0.3, + "3": 0.24, + "4": 0.39, + "5": 0.34, + }, + "Proanthocyanins": {"1": 1.28, "2": 2.81, "3": 2.18, "4": 1.82, "5": 1.97}, + "Color intensity": {"1": 4.38, "2": 5.68, "3": 7.8, "4": 4.32, "5": 6.75}, + "Hue": {"1": 3.4, "2": 3.17, "3": 3.45, "4": 2.93, "5": 2.85}, + "Proline": {"1": 1050, "2": 1185, "3": 1480, "4": 735, "5": 1450}, + } + ).rename_axis("rownames") + r_dfs = read_rdata(s3, engine="rscript") + + tm.assert_frame_equal(s3_df, r_dfs["wine"].head()) + + +# ENGINE + + +def test_read_rds_df_output(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_dfs = read_rdata(filename, engine="rscript") + + assert isinstance(r_dfs, DataFrame) + + +def 
test_read_rda_dict_output(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript") + + assert isinstance(r_dfs, dict) + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + +def test_read_wrong_engine(datapath): + with pytest.raises(ValueError, match="not a supported engine"): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + read_rdata(filename, engine="rpy2") + + +# MODE + + +@pytest.mark.slow +def test_read_rds_mode_file(datapath, mode): + filename = datapath("io", "data", "rdata", "ghg_df.rds") + r_df = read_rdata(filename, engine="rscript", mode=mode) + + r_df = adj_int(r_df) + + tm.assert_frame_equal(ghg_df, r_df.tail()) + + +@pytest.mark.slow +def test_read_rda_mode_file(datapath, mode): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript", mode=mode) + + if mode in ["parquet", "feather"]: + (r_dfs["ghg_df"], r_dfs["plants_df"], r_dfs["sea_ice_df"]) = ( + adj_int(r_dfs["ghg_df"]), + adj_int(r_dfs["plants_df"]), + adj_int(r_dfs["sea_ice_df"]), + ) + + assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_read_wrong_mode(datapath): + with pytest.raises(ValueError, match="not supported value for mode"): + filename = datapath("io", "data", "rdata", "plants_df.rds") + read_rdata(filename, engine="rscript", mode="pickle") + + +# USE_OBJECTS + + +def test_read_select_frames_rda_dfs(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata( + filename, engine="rscript", select_frames=["ghg_df", "sea_ice_df"] + ) + + assert "plants_df" not in list(r_dfs.keys()) + assert "ghg_df" in list(r_dfs.keys()) + assert "sea_ice_df" in list(r_dfs.keys()) + + +def 
test_read_select_frames_rda_objs(datapath): + filename = datapath("io", "data", "rdata", "env_data_objs.rda") + r_dfs = read_rdata( + filename, + engine="rscript", + select_frames=["ppm_ts", "species_mtx", "plants_arry"], + ) + + assert "species_vec" not in list(r_dfs.keys()) + assert "ghg_df" not in list(r_dfs.keys()) + + assert "ppm_ts" in list(r_dfs.keys()) + assert "species_mtx" in list(r_dfs.keys()) + assert "plants_arry" in list(r_dfs.keys()) + + +def test_read_wrong_select_frames(datapath): + with pytest.raises(TypeError, match="not a valid type for select_frames"): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + read_rdata(filename, engine="rscript", select_frames="plants_df") + + +# ROWNAMES + + +def test_read_rownames_true_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="rscript", rownames=True) + + assert r_df.index.name == "rownames" + + +def test_read_rownames_false_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_df = read_rdata(filename, engine="rscript", rownames=False) + + assert r_df.index.name != "rownames" + + +def test_read_rownames_true_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript", rownames=True) + + assert r_dfs["ghg_df"].index.name == "rownames" + assert r_dfs["plants_df"].index.name == "rownames" + assert r_dfs["sea_ice_df"].index.name == "rownames" + + +def test_read_rownames_false_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename, engine="rscript", rownames=False) + + assert r_dfs["ghg_df"].index.name != "rownames" + assert r_dfs["plants_df"].index.name != "rownames" + assert r_dfs["sea_ice_df"].index.name != "rownames" + + +# ENCODING + + +def test_non_utf8_data(datapath, rtype): + filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") + + expected = DataFrame( 
+ { + "número": { + "1": 1, + "2": 2, + "3": 3, + "4": 4, + "5": 5, + "6": 6, + "7": 7, + "8": 8, + "9": 9, + "10": 10, + }, + "punto central del climatismo": { + "1": "Parada de la circulación de vuelco meridional del Atlántico", + "2": "Desintegración de la capa de hielo de la Antártida occidental", + "3": "Muerte de la selva amazónica", + "4": "Cambio de monzón en África occidental", + "5": "Permafrost e hidratos de metano", + "6": "Muerte de los arrecifes de coral", + "7": "Cambio de monzón de la India", + "8": "Desintegración de la capa de hielo de Groenlandia", + "9": "Desplazamiento del bosque boreal", + "10": "Reducción del hielo marino del Ártico ", + }, + }, + index=[str(i) for i in range(1, 11)], + ).rename_axis("rownames") + + rdfs = read_rdata(filename, engine="rscript", encoding="iso-8859-1", mode="csv") + + output = rdfs["climate_df"] if rtype == "rda" else rdfs + + tm.assert_frame_equal(output, expected) + + +# RDA WRITER + +# PATH_OR_BUFFER + + +@pytest.mark.slow +def test_write_read_file(datapath, rtype, mode): + with tm.ensure_clean("test.out") as path: + ghg_df.to_rdata( + path, file_format=rtype, engine="rscript", mode=mode, index=False + ) + r_dfs = read_rdata( + path, file_format=rtype, engine="rscript", mode=mode, rownames=False + ) + + expected = ghg_df.reset_index(drop=True) + output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + output["year"] = output["year"].astype("int64") + + tm.assert_frame_equal(output, expected) + + +@pytest.mark.slow +def test_write_read_bytes_io(datapath, rtype, mode): + with BytesIO() as b_io: + sea_ice_df.to_rdata( + b_io, file_format=rtype, engine="rscript", mode=mode, index=False + ) + r_dfs = read_rdata( + b_io.getvalue(), + file_format=rtype, + engine="rscript", + mode=mode, + rownames=False, + ) + + expected = sea_ice_df.reset_index(drop=True) + output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + output["year"] = output["year"].astype("int64") + output["mo"] = 
output["mo"].astype("int64") + + tm.assert_frame_equal(output, expected) + + +# FILE_FORMAT + + +def test_write_rda_file(rtype): + expected = """\ +$pandas_dataframe + rownames year mo data.type region extent area +1 1012 2016 12 Goddard S 8.28 5.51 +2 1013 2017 12 Goddard S 9.48 6.23 +3 1014 2018 12 Goddard S 9.19 5.59 +4 1015 2019 12 Goddard S 9.41 6.59 +5 1016 2020 12 NRTSI-G S 10.44 6.50 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + sea_ice_df.to_rdata(out_file, file_format=rtype, engine="rscript") + + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +def test_write_wrong_format(): + with tm.ensure_clean("test.rda") as path: + with pytest.raises(ValueError, match=("not a valid value for file_format")): + ghg_df.to_rdata(path, engine="rscript", file_format="csv") + + +def test_write_unable_to_infer(): + with tm.ensure_clean("test") as path: + with pytest.raises( + ValueError, match=("Unable to infer file format from file name") + ): + ghg_df.to_rdata(path, engine="rscript") + + +# ENGINE + + +@td.skip_if_no("pyreadr") +def test_write_engine_consistency(rtype): + expected = """\ +$pandas_dataframe + rownames plant_group status count +1 16 Pteridophytes Data Deficient 398 +2 17 Pteridophytes Extinct 65 +3 18 Pteridophytes Not Threatened 1294 +4 19 Pteridophytes Possibly Threatened 408 +5 20 Pteridophytes Threatened 1275 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + plants_df.to_rdata(out_file, file_format=rtype, engine="pyreadr") + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + pyr_output = run_rscript(cmds) + + plants_df.to_rdata(out_file, file_format=rtype, engine="rscript") + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + rcomp_output = 
run_rscript(cmds) + + assert pyr_output == expected + assert pyr_output == rcomp_output + + +def test_write_wrong_engine(): + with tm.ensure_clean("test.rda") as path: + with pytest.raises(ValueError, match=("not a supported engine")): + ghg_df.to_rdata(path, engine="rpy2") + + +# MODE + + +@pytest.mark.slow +def test_write_mode(rtype, mode): + expected = """\ +$pandas_dataframe + rownames gas year emissions +1 141 Carbon dioxide 2018 5424.8815 +2 142 Methane 2018 634.4571 +3 143 Nitrous oxide 2018 434.5286 +4 144 Fluorinated gases 2018 182.7824 +5 145 Total 2018 6676.6496 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + ghg_df.to_rdata(out_file, file_format=rtype, engine="rscript", mode=mode) + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +def test_write_wrong_mode(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises(ValueError, match=("not supported value for mode")): + ghg_df.to_rdata(path, engine="rscript", mode="pickle") + + +# INDEX + + +@pytest.mark.slow +def test_write_index_false(rtype, mode): + expected = """\ +$pandas_dataframe + gas year emissions +1 Carbon dioxide 2018 5424.8815 +2 Methane 2018 634.4571 +3 Nitrous oxide 2018 434.5286 +4 Fluorinated gases 2018 182.7824 +5 Total 2018 6676.6496 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + ghg_df.to_rdata( + out_file, file_format=rtype, index=False, engine="rscript", mode=mode + ) + + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +# ASCII + + +@pytest.mark.slow +def test_write_ascii_output(rtype, mode, ascii): + expected = """\ +$pandas_dataframe + rownames gas year emissions +1 141 Carbon dioxide 2018 5424.8815 +2 142 Methane 2018 
634.4571 +3 143 Nitrous oxide 2018 434.5286 +4 144 Fluorinated gases 2018 182.7824 +5 145 Total 2018 6676.6496 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + ghg_df.to_rdata( + out_file, file_format=rtype, engine="rscript", mode=mode, ascii=ascii + ) + + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +def test_write_read_ascii(rtype): + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + + ghg_df.to_rdata( + out_file, + file_format=rtype, + engine="rscript", + index=False, + ascii=True, + compress=False, + ) + + with open(out_file) as f: + r_dfs = read_rdata(f, file_format=rtype, engine="rscript", rownames=False) + + expected = ghg_df.reset_index(drop=True) + output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] + output["year"] = output["year"].astype("int64") + + tm.assert_frame_equal(output, expected) + + +# COMPRESS + + +@pytest.mark.slow +def test_write_compress_types(rtype, mode, comp): + expected = """\ +$pandas_dataframe + rownames year mo data.type region extent area +1 1012 2016 12 Goddard S 8.28 5.51 +2 1013 2017 12 Goddard S 9.48 6.23 +3 1014 2018 12 Goddard S 9.19 5.59 +4 1015 2019 12 Goddard S 9.41 6.59 +5 1016 2020 12 NRTSI-G S 10.44 6.50 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.out") + r_code = os.path.join(tmp_dir, "r_test.R") + + sea_ice_df.to_rdata( + out_file, file_format=rtype, engine="rscript", mode=mode, compress=comp + ) + + cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] + output = run_rscript(cmds) + + assert output == expected + + +def test_write_wrong_comp(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises(ValueError, match=("not a supported value for compress")): + ghg_df.to_rdata(path, engine="rscript", compress="zip") + + +def 
test_write_none_comp(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises(RScriptError, match=("invalid 'compress' argument")): + ghg_df.to_rdata(path, engine="rscript", compress=None) + + +# OTHER_FRAMES + + +@pytest.mark.slow +def test_write_other_frames(mode): + expected = """\ +$ghg_df + rownames gas year emissions +1 141 Carbon dioxide 2018 5424.8815 +2 142 Methane 2018 634.4571 +3 143 Nitrous oxide 2018 434.5286 +4 144 Fluorinated gases 2018 182.7824 +5 145 Total 2018 6676.6496 + +$plants_df + rownames plant_group status count +1 16 Pteridophytes Data Deficient 398 +2 17 Pteridophytes Extinct 65 +3 18 Pteridophytes Not Threatened 1294 +4 19 Pteridophytes Possibly Threatened 408 +5 20 Pteridophytes Threatened 1275 + +$sea_ice_df + rownames year mo data.type region extent area +1 1012 2016 12 Goddard S 8.28 5.51 +2 1013 2017 12 Goddard S 9.48 6.23 +3 1014 2018 12 Goddard S 9.19 5.59 +4 1015 2019 12 Goddard S 9.41 6.59 +5 1016 2020 12 NRTSI-G S 10.44 6.50 + +""" + with tm.ensure_clean_dir() as tmp_dir: + out_file = os.path.join(tmp_dir, "rdata.rda") + r_code = os.path.join(tmp_dir, "r_test.R") + + ghg_df.to_rdata( + out_file, + engine="rscript", + other_frames=[plants_df, sea_ice_df], + rda_names=["ghg_df", "plants_df", "sea_ice_df"], + mode=mode, + ) + + cmds = ["Rscript", r_code, out_file, "rda", ""] + output = run_rscript(cmds) + + assert output == expected + + +def test_write_other_frames_wrong_type(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises( + TypeError, match=("objects in other_frames is not a DataFrame") + ): + ghg_df.to_rdata( + path, engine="rscript", other_frames=plants_df, rda_names=["plants_df"] + ) + + +def test_write_read_other_frames(datapath): + with tm.ensure_clean("test.rda") as path: + ghg_df.to_rdata( + path, + engine="rscript", + other_frames=[plants_df, sea_ice_df], + rda_names=["ghg_df", "plants_df", "sea_ice_df"], + ) + r_dfs = read_rdata(path, engine="rscript") + + assert list(r_dfs.keys()) == 
["plants_df", "sea_ice_df", "ghg_df"] + + +# RDA NAMES + + +def test_write_mismatched_names_frames(): + with tm.ensure_clean("test.rds") as path: + with pytest.raises( + ValueError, + match=("does not match number of current DataFrame and other_frames"), + ): + ghg_df.to_rdata( + path, + engine="rscript", + other_frames=[plants_df, sea_ice_df], + rda_names=["plants_df", "sea_ice_df"], + ) diff --git a/requirements-dev.txt b/requirements-dev.txt index 1817d79f96139..5325ffb3c52b3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -74,6 +74,7 @@ sqlalchemy xarray cftime pyreadstat +pyreadr tabulate>=0.8.3 natsort git+https://github.com/pydata/pydata-sphinx-theme.git@master From 3379fa1e562c0bd17de9d645abb58047513ec29f Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 11 Apr 2021 16:17:33 -0500 Subject: [PATCH 02/35] Fix skipif logic for test params, move package checks, add to test_api --- pandas/io/rdata.py | 22 +++++++++++++++++ pandas/tests/api/test_api.py | 1 + pandas/tests/io/rdata/test_rscript.py | 35 ++++++--------------------- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py index ffd726e8cfbff..72c207b6adb88 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata.py @@ -72,6 +72,28 @@ def _executable_exists(name) -> bool: RSCRIPT_EXISTS = _executable_exists("Rscript") +def r_package_installed(name): + """ + Check if R package is installed. + + Method runs a quick command line call to Rscript to + check if library call succeeds on named package. 
+ """ + + p = subprocess.Popen( + ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, err = p.communicate() + + return len(err) == 0 + + +R_ARROW = r_package_installed("arrow") if RSCRIPT_EXISTS else None +R_RSQLITE = r_package_installed("RSQLite") if RSCRIPT_EXISTS else None + + @doc(storage_options=_shared_docs["storage_options"]) def read_rdata( path_or_buffer: FilePathOrBuffer, diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 11bb554a0dc5a..d9934c89e9cf5 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -162,6 +162,7 @@ class TestPDApi(Base): "read_xml", "read_json", "read_pickle", + "read_rdata", "read_sas", "read_sql", "read_sql_query", diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py index 660187d3c2a1b..8050ab2baaca2 100644 --- a/pandas/tests/io/rdata/test_rscript.py +++ b/pandas/tests/io/rdata/test_rscript.py @@ -12,6 +12,8 @@ import pandas._testing as tm from pandas.io.rdata import ( + R_ARROW, + R_RSQLITE, RSCRIPT_EXISTS, RScriptError, read_rdata, @@ -19,6 +21,8 @@ pytestmark = pytest.mark.skipif(not RSCRIPT_EXISTS, reason="R is not installed.") +PYARROW = import_optional_dependency("pyarrow") + ghg_df = DataFrame( { "gas": { @@ -83,24 +87,6 @@ ).rename_axis("rownames") -def r_package_installed(name): - """ - Check if R package is installed. - - Method runs a quick command line call to Rscript to - check if library call succeeds on named package. - """ - - p = subprocess.Popen( - ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - out, err = p.communicate() - - return len(err) == 0 - - def run_rscript(cmds) -> str: """ Run R script at command line. 
@@ -154,11 +140,6 @@ def handle_index_rownames(df): return df -R_ARROW = r_package_installed("arrow") -R_RSQLITE = r_package_installed("RSQLite") -PYARROW = import_optional_dependency("pyarrow") - - @pytest.fixture(params=["rda", "rds"]) def rtype(request): return request.param @@ -170,22 +151,20 @@ def rtype(request): pytest.param( "parquet", marks=pytest.mark.skipif( - R_ARROW is None or PYARROW is None, + not R_ARROW or not PYARROW, reason="R arrow or pyarrow not installed", ), ), pytest.param( "feather", marks=pytest.mark.skipif( - R_ARROW is None or PYARROW is None, + not R_ARROW or not PYARROW, reason="R arrow or pyarrow not installed", ), ), pytest.param( "sqlite", - marks=pytest.mark.skipif( - R_RSQLITE is None, reason="R RSQLite not installed" - ), + marks=pytest.mark.skipif(not R_RSQLITE, reason="R RSQLite not installed"), ), ] ) From 966cb789ec5abae2ce5cb7555b9347c6a792fb24 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 11 Apr 2021 19:11:56 -0500 Subject: [PATCH 03/35] Refactor from built-in filter, add encoding to subprocess and locale skip --- pandas/io/rdata.py | 43 ++++++++------------------- pandas/tests/io/rdata/test_rscript.py | 38 ++++++++++++++++++----- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py index 72c207b6adb88..f9f800128ef5d 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata.py @@ -72,28 +72,6 @@ def _executable_exists(name) -> bool: RSCRIPT_EXISTS = _executable_exists("Rscript") -def r_package_installed(name): - """ - Check if R package is installed. - - Method runs a quick command line call to Rscript to - check if library call succeeds on named package. 
- """ - - p = subprocess.Popen( - ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - out, err = p.communicate() - - return len(err) == 0 - - -R_ARROW = r_package_installed("arrow") if RSCRIPT_EXISTS else None -R_RSQLITE = r_package_installed("RSQLite") if RSCRIPT_EXISTS else None - - @doc(storage_options=_shared_docs["storage_options"]) def read_rdata( path_or_buffer: FilePathOrBuffer, @@ -640,13 +618,14 @@ def run_rscript(self, tmp_dir, r_batch, cmds) -> str: stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + encoding=self.encoding, cwd=tmp_dir, ) output, error = p.communicate() if len(error) != 0: - raise RScriptError(error.decode(self.encoding)) + raise RScriptError(error) - return output.decode(self.encoding) + return output def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: self.r_to_py_types = { @@ -750,10 +729,11 @@ def read_rdata_csv(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + output = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = {} - for oline in filter(None, output.strip().split("\n")): + for oline in output: with open( os.path.join(tmp_dir, f"meta_{oline}.txt"), encoding=self.encoding, @@ -821,11 +801,12 @@ def read_rdata_feather(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + output = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = { oline: read_feather(os.path.join(tmp_dir, f"data_{oline}.feather")) - for oline in filter(None, output.strip().split("\n")) + for oline in output } return dfs @@ -870,11 +851,12 @@ def read_rdata_parquet(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, 
["Rscript", r_file, rda_file]) + output = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = { oline: read_parquet(os.path.join(tmp_dir, f"data_{oline}.parquet")) - for oline in filter(None, output.strip().split("\n")) + for oline in output } return dfs @@ -923,12 +905,12 @@ def read_rdata_sqlite(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) + output = [i for i in output.strip().split("\n") if i != ""] oline: str conn = sqlite3.connect(r_db) dfs: Dict[str, DataFrame] = { - oline: read_sql(f"SELECT * FROM data_{oline}", conn) - for oline in filter(None, output.strip().split("\n")) + oline: read_sql(f"SELECT * FROM data_{oline}", conn) for oline in output } conn.close() @@ -1375,11 +1357,12 @@ def run_rscript(self, tmp_dir, r_batch, cmds) -> None: stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + encoding=self.encoding, cwd=tmp_dir, ) output, error = a.communicate() if len(error) != 0: - raise RScriptError(error.decode(self.encoding)) + raise RScriptError(error) return None diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py index 8050ab2baaca2..b003c16f3abb3 100644 --- a/pandas/tests/io/rdata/test_rscript.py +++ b/pandas/tests/io/rdata/test_rscript.py @@ -12,8 +12,6 @@ import pandas._testing as tm from pandas.io.rdata import ( - R_ARROW, - R_RSQLITE, RSCRIPT_EXISTS, RScriptError, read_rdata, @@ -21,8 +19,6 @@ pytestmark = pytest.mark.skipif(not RSCRIPT_EXISTS, reason="R is not installed.") -PYARROW = import_optional_dependency("pyarrow") - ghg_df = DataFrame( { "gas": { @@ -112,13 +108,40 @@ def run_rscript(cmds) -> str: f.write(r_batch) p = subprocess.Popen( - cmds, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE + cmds, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + encoding="UTF-8", ) output, error = 
p.communicate() if len(error) != 0: - raise ValueError(error.decode("UTF-8")) + raise ValueError(error) + + return output + + +def r_package_installed(name): + """ + Check if R package is installed. + + Method runs a quick command line call to Rscript to + check if library call succeeds on named package. + """ + + p = subprocess.Popen( + ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + out, err = p.communicate() + + return len(err) == 0 + - return output.decode("UTF-8") +R_ARROW = r_package_installed("arrow") if RSCRIPT_EXISTS else None +R_RSQLITE = r_package_installed("RSQLite") if RSCRIPT_EXISTS else None +PYARROW = import_optional_dependency("pyarrow", errors="ignore") def adj_int(df): @@ -547,6 +570,7 @@ def test_read_rownames_false_rda(datapath): # ENCODING +@td.skip_if_not_us_locale def test_non_utf8_data(datapath, rtype): filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") From 22c7ade5fa905687d2d6367ff73cab199754deb5 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 11 Apr 2021 23:03:04 -0500 Subject: [PATCH 04/35] Fix tests for OS newline and mypy, mark xfail, use default mode in io.rst --- doc/source/user_guide/io.rst | 2 +- pandas/io/rdata.py | 17 +++++----- pandas/tests/io/rdata/test_pyreadr.py | 49 ++++++++++++++++----------- pandas/tests/io/rdata/test_rscript.py | 45 ++++++++++++++---------- 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 75a3626ef80b5..f264ec0aba0f6 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -6058,7 +6058,7 @@ to read data natively in R and transfer content with several options of ``mode`` .. 
ipython:: python rds_file = os.path.join(file_path, "plants_df.rds") - plants_df = pd.read_rdata(rds_file, engine="rscript", mode="sqlite").tail() + plants_df = pd.read_rdata(rds_file, engine="rscript", mode="csv").tail() plants_df .. note:: diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py index f9f800128ef5d..2595149c03444 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata.py @@ -729,11 +729,11 @@ def read_rdata_csv(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output = [i for i in output.strip().split("\n") if i != ""] + output_list = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = {} - for oline in output: + for oline in output_list: with open( os.path.join(tmp_dir, f"meta_{oline}.txt"), encoding=self.encoding, @@ -801,12 +801,12 @@ def read_rdata_feather(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output = [i for i in output.strip().split("\n") if i != ""] + output_list = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = { oline: read_feather(os.path.join(tmp_dir, f"data_{oline}.feather")) - for oline in output + for oline in output_list } return dfs @@ -851,12 +851,12 @@ def read_rdata_parquet(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output = [i for i in output.strip().split("\n") if i != ""] + output_list = [i for i in output.strip().split("\n") if i != ""] oline: str dfs: Dict[str, DataFrame] = { oline: read_parquet(os.path.join(tmp_dir, f"data_{oline}.parquet")) - for oline in output + for oline in output_list } return dfs @@ -905,12 +905,13 @@ def read_rdata_sqlite(self) -> Dict[str, DataFrame]: rda_file = self.buffer_to_disk(tmp_dir) output = 
self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output = [i for i in output.strip().split("\n") if i != ""] + output_list = [i for i in output.strip().split("\n") if i != ""] oline: str conn = sqlite3.connect(r_db) dfs: Dict[str, DataFrame] = { - oline: read_sql(f"SELECT * FROM data_{oline}", conn) for oline in output + oline: read_sql(f"SELECT * FROM data_{oline}", conn) + for oline in output_list } conn.close() diff --git a/pandas/tests/io/rdata/test_pyreadr.py b/pandas/tests/io/rdata/test_pyreadr.py index 6aa6840fc0499..a7565eb729a42 100644 --- a/pandas/tests/io/rdata/test_pyreadr.py +++ b/pandas/tests/io/rdata/test_pyreadr.py @@ -114,16 +114,16 @@ def adj_int(df): def test_read_rds_file(datapath): filename = datapath("io", "data", "rdata", "ghg_df.rds") r_df = read_rdata(filename, engine="pyreadr") - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(ghg_df, r_df.tail()) + tm.assert_frame_equal(ghg_df, output) def test_read_rda_file(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") r_dfs = read_rdata(filename, engine="pyreadr") - r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] @@ -136,20 +136,20 @@ def test_bytes_read_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") with open(filename, "rb") as f: - r_df = read_rdata(f.read(), file_format="rds", engine="pyreadr") + r_df = read_rdata(f, file_format="rds", engine="pyreadr") - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(sea_ice_df, r_df.tail()) + tm.assert_frame_equal(sea_ice_df, output) def test_bytes_read_rda(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") with open(filename, "rb") as f: - r_dfs = read_rdata(f.read(), file_format="rda", engine="pyreadr") + r_dfs = read_rdata(f, file_format="rda", engine="pyreadr") - r_dfs = {k: 
adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] @@ -165,9 +165,9 @@ def test_bytesio_rds(datapath): with BytesIO(f.read()) as b_io: r_df = read_rdata(b_io, file_format="rds", engine="pyreadr") - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(sea_ice_df, r_df.tail()) + tm.assert_frame_equal(sea_ice_df, output) def test_bytesio_rda(datapath): @@ -177,7 +177,7 @@ def test_bytesio_rda(datapath): with BytesIO(f.read()) as b_io: r_dfs = read_rdata(b_io, file_format="rda", engine="pyreadr") - r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] @@ -238,7 +238,7 @@ def test_bytes_read_infer_rds(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f.read(), engine="pyreadr") + read_rdata(f, engine="pyreadr") def test_bytes_read_infer_rda(datapath): @@ -246,7 +246,7 @@ def test_bytes_read_infer_rda(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f.read(), engine="pyreadr") + read_rdata(f, engine="pyreadr") # URL @@ -370,7 +370,11 @@ def test_read_select_frames_rda_dfs(datapath): def test_read_wrong_select_frames(datapath): with pytest.raises(TypeError, match="not a valid type for select_frames"): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - read_rdata(filename, engine="pyreadr", select_frames="plants_df") + read_rdata( + filename, + engine="pyreadr", + select_frames="plants_df", # type: ignore[arg-type] + ) # ROWNAMES @@ -380,14 +384,16 @@ def test_read_rownames_true_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") r_df = read_rdata(filename, engine="pyreadr", rownames=True) - assert 
r_df.index.name == "rownames" + if isinstance(r_df, DataFrame): + assert r_df.index.name == "rownames" def test_read_rownames_false_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") r_df = read_rdata(filename, engine="pyreadr", rownames=False) - assert r_df.index.name != "rownames" + if isinstance(r_df, DataFrame): + assert r_df.index.name != "rownames" def test_read_rownames_true_rda(datapath): @@ -457,7 +463,10 @@ def test_write_read_filelike(rtype): with BytesIO() as b_io: sea_ice_df.to_rdata(b_io, file_format=rtype, engine="pyreadr", index=False) r_dfs = read_rdata( - b_io.getvalue(), file_format=rtype, engine="pyreadr", rownames=False + b_io.getvalue(), # type: ignore[arg-type] + file_format=rtype, + engine="pyreadr", + rownames=False, ) expected = sea_ice_df.reset_index(drop=True) @@ -511,7 +520,8 @@ def test_index_true(rtype): r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - assert "index" in r_df.columns + if isinstance(r_df, DataFrame): + assert "index" in r_df.columns def test_index_false(rtype): @@ -523,7 +533,8 @@ def test_index_false(rtype): r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - assert "index" not in r_df.columns + if isinstance(r_df, DataFrame): + assert "index" not in r_df.columns # ASCII diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py index b003c16f3abb3..95c0a6714c645 100644 --- a/pandas/tests/io/rdata/test_rscript.py +++ b/pandas/tests/io/rdata/test_rscript.py @@ -214,7 +214,8 @@ def test_read_rds_file(datapath): filename = datapath("io", "data", "rdata", "ghg_df.rds") r_df = read_rdata(filename, engine="rscript") - tm.assert_frame_equal(ghg_df, r_df.tail()) + if isinstance(r_df, DataFrame): + tm.assert_frame_equal(ghg_df, r_df.tail()) def test_read_rda_file(datapath): @@ -234,18 +235,18 @@ def test_buffer_read_rds(datapath): with open(filename, "rb") as f: r_df = read_rdata(f, file_format="rds", engine="rscript") - r_df = adj_int(r_df) + 
output = adj_int(r_df).tail() - tm.assert_frame_equal(sea_ice_df, r_df.tail()) + tm.assert_frame_equal(sea_ice_df, output) def test_bytes_read_rda(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") with open(filename, "rb") as f: - r_dfs = read_rdata(f.read(), file_format="rda", engine="rscript") + r_dfs = read_rdata(f, file_format="rda", engine="rscript") - r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] @@ -261,9 +262,9 @@ def test_bytesio_rds(datapath): with BytesIO(f.read()) as b_io: r_df = read_rdata(b_io, file_format="rds", engine="rscript") - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - tm.assert_frame_equal(sea_ice_df, r_df.tail()) + tm.assert_frame_equal(sea_ice_df, output) def test_bytesio_rda(datapath): @@ -273,7 +274,7 @@ def test_bytesio_rda(datapath): with BytesIO(f.read()) as b_io: r_dfs = read_rdata(b_io, file_format="rda", engine="rscript") - r_dfs = {k: adj_int(v) for k, v in r_dfs.items()} + r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] @@ -340,7 +341,7 @@ def test_bytes_read_infer_rds(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f.read(), engine="rscript") + read_rdata(f, engine="rscript") def test_bytes_read_infer_rda(datapath): @@ -348,7 +349,7 @@ def test_bytes_read_infer_rda(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f.read(), engine="rscript") + read_rdata(f, engine="rscript") # URL @@ -466,9 +467,9 @@ def test_read_rds_mode_file(datapath, mode): filename = datapath("io", "data", "rdata", "ghg_df.rds") r_df = read_rdata(filename, engine="rscript", mode=mode) - r_df = adj_int(r_df) + output = adj_int(r_df).tail() - 
tm.assert_frame_equal(ghg_df, r_df.tail()) + tm.assert_frame_equal(ghg_df, output) @pytest.mark.slow @@ -529,7 +530,11 @@ def test_read_select_frames_rda_objs(datapath): def test_read_wrong_select_frames(datapath): with pytest.raises(TypeError, match="not a valid type for select_frames"): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - read_rdata(filename, engine="rscript", select_frames="plants_df") + read_rdata( + filename, + engine="rscript", + select_frames="plants_df", # type: ignore[arg-type] + ) # ROWNAMES @@ -539,14 +544,16 @@ def test_read_rownames_true_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") r_df = read_rdata(filename, engine="rscript", rownames=True) - assert r_df.index.name == "rownames" + if isinstance(r_df, DataFrame): + assert r_df.index.name == "rownames" def test_read_rownames_false_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") r_df = read_rdata(filename, engine="rscript", rownames=False) - assert r_df.index.name != "rownames" + if isinstance(r_df, DataFrame): + assert r_df.index.name != "rownames" def test_read_rownames_true_rda(datapath): @@ -570,7 +577,9 @@ def test_read_rownames_false_rda(datapath): # ENCODING -@td.skip_if_not_us_locale +@pytest.mark.xfail( + reason="R encoding is locale specific. Need to think about workaround." 
+) def test_non_utf8_data(datapath, rtype): filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") @@ -640,7 +649,7 @@ def test_write_read_bytes_io(datapath, rtype, mode): b_io, file_format=rtype, engine="rscript", mode=mode, index=False ) r_dfs = read_rdata( - b_io.getvalue(), + b_io.getvalue(), # type: ignore[arg-type] file_format=rtype, engine="rscript", mode=mode, @@ -835,7 +844,7 @@ def test_write_read_ascii(rtype): compress=False, ) - with open(out_file) as f: + with open(out_file, newline="") as f: r_dfs = read_rdata(f, file_format=rtype, engine="rscript", rownames=False) expected = ghg_df.reset_index(drop=True) From 8b1aa9c4513ec1f5cf3bcd5cac8e949e61e5abfa Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 12 Apr 2021 08:03:35 -0500 Subject: [PATCH 05/35] Added needed test skips and fixed io docs ref in whatsnew --- doc/source/whatsnew/v1.3.0.rst | 4 ++-- pandas/tests/io/rdata/test_pyreadr.py | 4 +++- pandas/tests/io/rdata/test_rscript.py | 5 ++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index a039c2d959f02..6b5e7d3eccd15 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -115,7 +115,7 @@ For more, see :ref:`io.xml` in the user guide on IO tools. Read and write R data files ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We added I/O support to read and write R data files (.rda, .Rdata, .rds) using +We added I/O support to read and write R data files (.RData, .rda, .rds) using :func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Equipped with two engines, `pyreadr`_ and command line caller, `rscript`_, these methods will maintain fast and durable support for open source data migration between R and Python. (:issue:`40287`) @@ -211,7 +211,7 @@ Even exported back out to R data files: ...: rda_names=["ppm_df", "species_mtx"] ...: ) -For more, see :ref:`io.read_rdata` in the user guide on IO tools. 
+For more, see :ref:`io.rdata` in the user guide on IO tools. Styler Upgrades ^^^^^^^^^^^^^^^ diff --git a/pandas/tests/io/rdata/test_pyreadr.py b/pandas/tests/io/rdata/test_pyreadr.py index a7565eb729a42..fbcc9b06523fc 100644 --- a/pandas/tests/io/rdata/test_pyreadr.py +++ b/pandas/tests/io/rdata/test_pyreadr.py @@ -4,6 +4,8 @@ import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame import pandas._testing as tm @@ -296,7 +298,7 @@ def test_read_wrong_url(): @tm.network -@pytest.mark.slow +@td.skip_if_no("s3fs") def test_read_rda_s3(): s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" s3_df = DataFrame( diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py index 95c0a6714c645..5df2b499a66a3 100644 --- a/pandas/tests/io/rdata/test_rscript.py +++ b/pandas/tests/io/rdata/test_rscript.py @@ -316,6 +316,7 @@ def test_read_rda_non_dfs(datapath, mode): read_rdata(filename, engine="rscript", mode=mode) +@td.skip_if_not_us_locale def test_read_not_rda_file(datapath, mode): with pytest.raises(RScriptError, match="bad restore file magic number"): read_rdata( @@ -326,6 +327,7 @@ def test_read_not_rda_file(datapath, mode): ) +@td.skip_if_not_us_locale def test_read_not_rds_file(datapath, mode): with pytest.raises(RScriptError, match="unknown input format"): read_rdata( @@ -399,7 +401,7 @@ def test_read_wrong_url(): @tm.network -@pytest.mark.slow +@td.skip_if_no("s3fs") def test_read_rda_s3(): s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" s3_df = DataFrame( @@ -831,6 +833,7 @@ def test_write_ascii_output(rtype, mode, ascii): assert output == expected +@td.skip_if_windows def test_write_read_ascii(rtype): with tm.ensure_clean_dir() as tmp_dir: out_file = os.path.join(tmp_dir, "rdata.out") From 2341dffb0a20a4307c9ca9d6bf23d468c3b62680 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 13 Apr 2021 22:57:09 -0500 Subject: [PATCH 06/35] Remove rscript 
implementation from code, tests, and docs --- ci/deps/actions-37.yaml | 1 + ci/deps/azure-macos-37.yaml | 1 + ci/deps/azure-windows-37.yaml | 1 + doc/source/getting_started/install.rst | 1 - doc/source/user_guide/io.rst | 165 +-- doc/source/whatsnew/v1.3.0.rst | 28 +- pandas/core/frame.py | 141 +- pandas/io/rdata.py | 1308 +---------------- pandas/tests/io/rdata/test_rscript.py | 987 ------------- .../{rdata/test_pyreadr.py => test_rdata.py} | 176 +-- 10 files changed, 195 insertions(+), 2614 deletions(-) delete mode 100644 pandas/tests/io/rdata/test_rscript.py rename pandas/tests/io/{rdata/test_pyreadr.py => test_rdata.py} (73%) diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml index 61f431256dd4a..9292e2aa7db39 100644 --- a/ci/deps/actions-37.yaml +++ b/ci/deps/actions-37.yaml @@ -25,4 +25,5 @@ dependencies: - flask - tabulate - pyreadstat + - pyreadr - pip diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index d667adddda859..f39f63c66d102 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -33,4 +33,5 @@ dependencies: - pip: - cython>=0.29.21 - pyreadstat + - pyreadr - pyxlsb diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index e7ac4c783b855..e9707030a4def 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -37,6 +37,7 @@ dependencies: - xlsxwriter - xlwt - pyreadstat + - pyreadr + - pip - pip: - pyxlsb diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 8bdd01007516b..14379fc0dd0da 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -361,7 +361,6 @@ fastparquet 0.4.0 Parquet reading / writing pyarrow 0.15.0 Parquet, ORC, and feather reading / writing pyreadstat SPSS files (.sav) reading pyreadr R files (.RData, .rda, .rds) reading / writing -Rscript R files (.RData, .rda, .rds) reading / writing ========================= ==================
============================================================= Access data in the cloud diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f264ec0aba0f6..0023f70e699bd 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5925,13 +5925,12 @@ For .rds types that only contains a single R object, method will return a single .. note:: - Since *any* R object can be saved in these types, this method will only return + Since any R object can be saved in these types, this method will only return data.frame objects or objects coercible to data.frames including matrices, - tibbles, and data.tables even 3D arrays. Depending on engine used, either - an error raises for non-data.frame objects or such objects are ignored. + tibbles, and data.tables and to some extent, arrays. -For example, consider the following generated data.frames in R using samples from -US EPA, UK BGCI, and NOAA pubilc data: +For example, consider the following generated data.frames in R using environment +data samples from US EPA, UK BGCI, and NOAA public data: .. code-block:: r @@ -5974,7 +5973,7 @@ US EPA, UK BGCI, and NOAA pubilc data: save(ghg_df, plants_df, sea_ice_df, file="env_data_dfs.rda") -Then in pandas you can read the .rds or .rda files: +With ``read_rdata``, you can read these above .rds or .rda files: .. ipython:: python :suppress: @@ -5990,7 +5989,7 @@ Then in pandas you can read the .rds or .rda files: rda_file = os.path.join(file_path, "env_data_dfs.rda") env_dfs = pd.read_rdata(rda_file) - env_dfs + {k: df.tail() for k, df in env_dfs.items()} To ignore the rownames of data.frame, use option ``rownames=False``: @@ -6009,16 +6008,6 @@ To select specific objects in .rda, pass a list of names into ``select_frames``: env_dfs = pd.read_rdata(rda_file, select_frames=["sea_ice_df"]) env_dfs -To read from URL, pass link directly into method: - -.. 
ipython:: python - - url = ("https://github.com/hadley/nycflights13/" - "blob/master/data/airlines.rda?raw=true") - - airlines = pd.read_rdata(url, file_format="rda") - airlines - To read from a file-like object, read object in argument, ``path_or_buffer``: .. ipython:: python @@ -6029,42 +6018,34 @@ To read from a file-like object, read object in argument, ``path_or_buffer``: sea_ice_df -With ``rscript`` as ``engine``, a direct command line call to Rscript is run -to read data natively in R and transfer content with several options of ``mode``. - -.. note:: +To read from URL, pass link directly into method: - If you do not have R installed and attempt to use the ``rscript`` ``engine``, - then an ``ImportError`` will raise. Do note: Rscript must be recognized as a - top-level command on machine. Hence, R's bin folder must be in Path environment - variable for the OS. If Rscript is not recognized even if you have R installed, - you will receive same ``ImportError``. +.. ipython:: python -- For the ``csv`` mode (default), no other package in R is required. - Data types are adhered in this data exchange following a text approach. + url = ("https://github.com/hadley/nycflights13/" + "blob/master/data/airlines.rda?raw=true") -- For the ``feather`` mode, the ``arrow`` package in R must be installed. - Additionally, the counterpart ``pyarrow`` package in Python must be - installed. This binary approach allows faster data exchange than text approach. + airlines = pd.read_rdata(url, file_format="rda") + airlines -- For the ``parquet`` mode, again the ``arrow`` package in R must be installed. - and again ``pyarrow`` package in Python must be installed. Similarly, this - binary approach allows faster data exchange than text approach. +To read from an Amazon S3 bucket, point to the storage path. This also raises +another issue. 
Any R data encoded in non utf-8 is currently not supported: -- For the ``sqlite`` mode, the ``RSQLite`` package in R (part of DBI family of - database APIs) must be installed with no additional package needed for Python. - This database approach ensures data type integrity. +.. code-block:: ipython -.. ipython:: python + In [608]: ghcran = pd.read_rdata("s3://public-r-data/ghcran.Rdata") + ... + UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 45: +invalid continuation byte - rds_file = os.path.join(file_path, "plants_df.rds") - plants_df = pd.read_rdata(rds_file, engine="rscript", mode="csv").tail() - plants_df +Also, remember if R data files do not contain any data frame object, a parsing error +will occur: -.. note:: +.. code-block:: ipython - The above selected options for ``mode`` will not generate such formats but - uses them under the hood in disk transfer of data between R and Python. + In [608]: rds_file = os.path.join(file_path, "env_data_non_dfs.rda") + ... + LibrdataError: Invalid file, or file has unsupported features .. _io.rdata_writer: @@ -6075,80 +6056,90 @@ Writing R data .. versionadded:: 1.3.0 The method :func:`~pandas.core.frame.DataFrame.to_rdata` will write a DataFrame -or multiple DataFrames into R data files (.Rdata, .rda, and .rds). +or multiple DataFrames into R data files (.RData, .rda, and .rds). -For single object in rds type: +For a single DataFrame in rds type, pass in a file or buffer in method: .. ipython:: python plants_df.to_rdata("plants_df.rds") -For multiple objects in RData or rda types using the ``rscript`` engine, -use the ``other_frames`` argument and be sure to provide ``rda_names`` for all -DataFrames: +For a single DataFrame in RData or rda types, pass in a file or buffer in method +and optionally give it a name: .. 
ipython:: python - plants_df.to_rdata( - "env_dfs.rda", - engine="rscript", - other_frames=[ghg_df, sea_ice_df], - rda_names=["plants_df", "ghg_df", "sea_ice_df"] - ) + plants_df.to_rdata("plants_df.rda", rda_name="plants_df") -With either engine, pandas index will not map into R rownames. Using the default -``index=True`` will output an index column or multiple columns for MultiIndex. +While RData and rda types can hold multiple R objects, this method currently +only supports writing out a single DataFrame. + +Even write to a buffer and read its content: .. ipython:: python - (ghg_df.rename_axis(None) - .to_rdata("ghg_df.rds", engine="rscript") - ) - pd.read_rdata("ghg_df.rds").tail() + with BytesIO() as b_io: + sea_ice_df.to_rdata(b_io, file_format="rda", index=False) + print( + pd.read_rdata( + b_io.getvalue(), + file_format="rda", + rownames=False, + )["pandas_dataframe"].tail() + ) -Otherwise, use ``index=False``: +While DataFrame index will not map into R rownames, by default ``index=True`` +will output as a named column or multiple columns for MultiIndex. .. ipython:: python - (ghg_df.rename_axis(None) - .to_rdata("ghg_df.rds", engine="rscript", index=False) - ) + ghg_df.rename_axis(None).to_rdata("ghg_df.rds") + pd.read_rdata("ghg_df.rds").tail() -With both engines, the default compression of R data files will be ``gzip``. -Notice the different sizes of compressed and uncompressed files: +To ignore the index, use ``index=False``: .. ipython:: python - plants_df.to_rdata("plants_df_uncomp.rds", compress=False) - - os.stat("plants_df.rds").st_size - os.stat("plants_df_uncomp.rds").st_size + ghg_df.rename_axis(None).to_rdata("ghg_df.rds", index=False) -The ``rscript`` engine supports all listed compression types including: -``gzip``, ``bzip2``, and ``xz``. 
+ pd.read_rdata("ghg_df.rds").tail() -Additionally, with ``rscript`` engine, data files can be written in ascii (text) -rather than default binary with ``ascii`` argument: +By default, these R serialized types are compressed files in either gzip, bzip2, +or xz algorithms. Similarly to R, the default type in this method is "gzip" or +"gz". Notice difference of compressed and uncompressed files .. ipython:: python - sea_ice_df.to_rdata("sea_ice_df_ascii.rda", engine="rscript", - ascii=True, compress=False) + plants_df.to_rdata("plants_df_gz.rds") + plants_df.to_rdata("plants_df_bz2.rds", compression="bz2") + plants_df.to_rdata("plants_df_xz.rds", compression="xz") + plants_df.to_rdata("plants_df_non_comp.rds", compression=None) + + os.stat("plants_df_gz.rds").st_size + os.stat("plants_df_bz2.rds").st_size + os.stat("plants_df_xz.rds").st_size + os.stat("plants_df_non_comp.rds").st_size - with open("sea_ice_df_ascii.rda", "r") as f: - for i in range(10): - line = next(f).strip() - print(line) +Like other IO methods, ``storage_options`` are enabled to write to those platforms: + +.. code-block:: ipython + + ghg_df.to_rdata( + "s3://path/to/my/storage/pandas_df.rda", + storage_options={"user": "xxx", "password": "???"} + ) .. ipython:: python :suppress: os.remove("ghg_df.rds") os.remove("plants_df.rds") - os.remove("env_dfs.rda") - os.remove("plants_df_uncomp.rds") - os.remove("sea_ice_df_ascii.rda") + os.remove("plants_df.rda") + os.remove("plants_df_gz.rds") + os.remove("plants_df_bz2.rds") + os.remove("plants_df_xz.rds") + os.remove("plants_df_non_comp.rds") Once exported, the single DataFrame can be read back in R or multiple DataFrames loaded in R: @@ -6191,9 +6182,9 @@ loaded in R: 144 Fluorinated gases 2018 182.7824 145 Total 2018 6676.6496 -For more information of ``pyreadr`` engine, see main page of `pyreadr`_ package for -further notes on support and limitations. For more information of R serialization -data types, see docs on `rds`_ and `rda`_ data files. 
+For more information of the underlying ``pyreadr`` package, see main page of +`pyreadr`_ for further notes on support and limitations. For more information of R +serialization data types, see docs on `rds`_ and `rda`_ data files. .. _pyreadr: https://github.com/ofajardo/pyreadr diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 22f26f19c229e..b02c456a6ac0b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -116,14 +116,13 @@ Read and write R data files ^^^^^^^^^^^^^^^^^^^^^^^^^^^ We added I/O support to read and write R data files (.RData, .rda, .rds) using -:func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Equipped with two engines, -`pyreadr`_ and command line caller, `rscript`_, these methods will maintain fast and -durable support for open source data migration between R and Python. (:issue:`40287`) +:func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Both methods rely on +the `pyreadr`_ package to support open source data migration between R and +Python pandas. (:issue:`40287`) .. _pyreadr: https://github.com/ofajardo/pyreadr -.. _rscript: https://www.rdocumentation.org/packages/utils/versions/3.6.2/topics/Rscript -In R, the below generated data frame and matrix: +For example, consider the below generated data frame and matrix in R: .. code-block:: r @@ -160,11 +159,12 @@ In R, the below generated data frame and matrix: In [3]: saveRDS(ppm_df, "ppm_df_r.rds") In [4]: save(ppm_df, iucn_species_mtx, "env_objs_r.rda") -Can then be read in pandas with either engine: +Now, both R data files can be read in pandas to return either DataFrame +for .rds types or ``dict`` of DataFrames for .RData and .rda types: .. 
code-block:: ipython - In [1]: ppm_df = pd.read_rdata("ppm_df_r.rds", engine="pyreadr") + In [1]: ppm_df = pd.read_rdata("ppm_df_r.rds") In [2]: ppm_df Out[3]: year month monthly_average num_days st_dev_of_days unc_mon_mean @@ -174,7 +174,7 @@ Can then be read in pandas with either engine: 3 2021 1 415.52 29 0.44 0.16 4 2021 2 416.75 28 1.01 0.36 - In [4]: env_objs = pd.read_rdata("env_objs_r.rda", engine="rscript") + In [4]: env_objs = pd.read_rdata("env_objs_r.rda") Out[5]: {'carbon_ppm_df': year month monthly_average num_days st_dev_of_days unc_mon_mean @@ -185,7 +185,7 @@ Can then be read in pandas with either engine: 4 2021 2 416.75 28 1.01 0.36 [5 rows x 6 columns], - 'species_matrix': + 'iucn_species_mtx': EX EW CR(PE) CR(PEW) CR EN VU DD Total rownames MAGNOLIOPSIDA 102 30 409 29 3770 6972 7089 2990 43885 @@ -199,16 +199,14 @@ Can then be read in pandas with either engine: [8 rows x 9 columns]} -Even exported back out to R data files: +Additionally, pandas data can be written back out into the same R data files: .. code-block:: ipython In [5]: ppm_df.to_rdata("ppm_df_py.rds") - In [6]: ppm_df.to_rdata( - ...: "env_objs_py.rda", - ...: engine="rscript", - ...: other_frames=env_objs["species_matrix"], - ...: rda_names=["ppm_df", "species_mtx"] + In [6]: env_objs['iucn_species_mtx'].to_rdata( + ...: "iucn_species_py.rda", + ...: rda_name="iucn_species_df" ...: ) For more, see :ref:`io.rdata` in the user guide on IO tools. 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b121720744f28..6ea26ee04c307 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2295,18 +2295,13 @@ def to_rdata( self, path_or_buffer: FilePathOrBuffer, file_format: str = "infer", - engine: str = "pyreadr", - mode: str = "csv", - other_frames: list[DataFrame] | None = None, - rda_names: list[str] = ["pandas_dataframe"], + rda_name: str = "pandas_dataframe", index: bool = True, - ascii: bool = False, - compress: bool | str = "gzip", - encoding: str = "utf-8", + compression: CompressionOptions = "gzip", storage_options: StorageOptions = None, ) -> None: """ - Render one or more DataFrames to R data (.rda, .Rdata, .rds). + Render one or more DataFrames to R data (.RData, .rda, .rds). .. versionadded:: 1.3.0 @@ -2321,58 +2316,24 @@ def to_rdata( single object to disk). Default 'infer' will use extension in file name to determine the format type. - engine : {{'pyreadr', 'rscript'}}, default 'pyreadr' - Engine used to write to R data files. Currently, two types are - supported: ``pyreadr`` which requires the pyreadr package to be - installed and ``rscript`` which requires R to be installed on machine. - For ``rscript``, be sure the R bin installation folder is included in - the system Path environment variable. The ``pyreadr`` is the faster - parser to handle most needs but ``rscript`` engine provides fuller - support of rda and rds formats since it calls native R commands. - - mode : {{'csv', 'parquet', 'feather'}}, default 'csv' - Python and R I/O transfer mode that only applies to ``rscript`` - engine (ignored for ``pyreadr``). Using ``csv`` (text approach), no - additional packages are required. Using ``parquet`` or ``feather`` - (binary approach) requires pyarrow installed in Python and arrow - package installed in R. Using ``sqlite`` (database approach) requires - RSQLite package installed in R. Binary will usually be faster to process - than text data. 
Database usually ensures data type integrity. - - other_frames : list, optional - Other DataFrames to be included in rda (not rds) files that can - contain multiple objects. Ignored ``pyreadr`` engine that currently - supports only a single DataFrame written to rda files. - - rda_names : list, default ["pandas_dataframe"] - Names for current and other DataFrames in rda file. The number of names - should equal the number of current DataFrame and ``other_frames``. - For ``pyreadr`` engine that can only write one DataFrame to rda file, - only the first name in list will be used. + rda_name : str, default "pandas_dataframe" + Name for R data.frame in RData/rda file. index : bool, default True Include index or MulitIndex in output as separate columns. Since - DataFrame indexes can include multiple columns and R rownames can only - include one column, neither ``pyreadr`` nor ``rscript`` engines will - map DataFrame index to R data.frame rownames. - - ascii : bool, default False - Write data into ASCII (text) representation. Only supported with - ``rscript`` engine. - - compress : bool or {{'gzip', 'bzip2', 'xz'}}, default 'gzip' - Compression types for R data files. Use False for uncompressed - files. For ``pyreadr`` engine, False and 'gzip' is supported. + DataFrame indexes can include multiple columns and R rownames can + only include one column, DataFrame index will not map to R data.frame + rownames. - encoding : str, optional, default 'utf-8' - Encoding of R data. + compression : {{'gzip', 'bz2', 'xz', None}}, default 'gzip' + Compression type for on-the-fly decompression of on-disk data. {storage_options} Returns ------- None - Either None or ValueError is raised. + Either None for successful output or raises an error. See Also -------- @@ -2406,8 +2367,7 @@ def to_rdata( ... 5 4 Total 2018 6676.65 ... ''' - To save an .rda or .RData file which can contains one or more - DataFrames: + To save an .rda or .RData file: >>> plants_df = pd.DataFrame( ... 
{{'plant_group': ['Pteridophytes', @@ -2422,37 +2382,15 @@ def to_rdata( ... 'Threatened'], ... 'count': [398, 65, 1294, 408, 1275] ... }}) - >>> sea_ice_df = pd.DataFrame( - ... {{'year': [2016, 2017, 2018, 2019, 2020], - ... 'mo': [12, 12, 12, 12, 12], - ... 'data.type': ['Goddard', - ... 'Goddard', - ... 'Goddard', - ... 'Goddard', - ... 'NRTSI-G'], - ... 'region': ['S', 'S', 'S', 'S', 'S'], - ... 'extent': [8.28, 9.48, 9.19, 9.41, 10.44], - ... 'area': [5.51, 6.23, 5.59, 6.59, 6.5] - ... }}) - >>> ghg_df.to_rdata( - ... "env_data_df.rda", - ... engine="rscript", - ... other_frames=[plants_df, sea_ice_df], - ... rda_names=["ghg_df", "plants_df", "sea_ice_df"] + >>> plants_df.to_rdata( + ... "plants_df.rda", + ... rda_name="plants_df", ... ) # doctest: +SKIP >>> R_code = ''' - ... load("env_data_df.rds") + ... load("plants_df.rda") ... ... mget(ls()) - ... $ghg_df - ... index gas year emissions - ... 1 0 Carbon dioxide 2018 5424.88 - ... 2 1 Methane 2018 634.46 - ... 3 2 Nitrous oxide 2018 434.53 - ... 4 3 Fluorinated gases 2018 182.78 - ... 5 4 Total 2018 6676.65 - ... ... $plants_df ... index plant_group status count ... 1 0 Pteridophytes Data Deficient 398 @@ -2460,56 +2398,19 @@ def to_rdata( ... 3 2 Pteridophytes Not Threatened 1294 ... 4 3 Pteridophytes Possibly Threatened 408 ... 5 4 Pteridophytes Threatened 1275 - ... - ... $sea_ice_df - ... index year mo data.type region extent area - ... 1 0 2016 12 Goddard S 8.28 5.51 - ... 2 1 2017 12 Goddard S 9.48 6.23 - ... 3 2 2018 12 Goddard S 9.19 5.59 - ... 4 3 2019 12 Goddard S 9.41 6.59 - ... 5 4 2020 12 NRTSI-G S 10.44 6.50 ... 
''' """ - from pandas.io.rdata import ( - RSCRIPT_EXISTS, - PyReadrWriter, - RscriptWriter, - ) - - pyreadr = import_optional_dependency("pyreadr", errors="ignore") - pyarrow = import_optional_dependency("pyarrow", errors="ignore") - - RDataWriter: type[PyReadrWriter] | type[RscriptWriter] + from pandas.io.rdata import PyReadrWriter - if engine == "pyreadr": - if pyreadr is None: - raise ImportError("pyreadr not found, please install for this engine.") - RDataWriter = PyReadrWriter + import_optional_dependency("pyreadr") - elif engine == "rscript": - if RSCRIPT_EXISTS is None: - raise FileNotFoundError( - "R is either not installed on this system or its " - "bin folder is not in Path environment variable." - ) - if pyarrow is None and mode in ["parquet", "feather"]: - raise ImportError("pyarrow not found, please install for this mode.") - RDataWriter = RscriptWriter - else: - raise ValueError(f"{engine} is not a supported engine.") - - rdata_writer = RDataWriter( + rdata_writer = PyReadrWriter( self, path_or_buffer=path_or_buffer, file_format=file_format, - engine=engine, - mode=mode, - other_frames=other_frames, - rda_names=rda_names, + rda_name=rda_name, index=index, - ascii=ascii, - compress=compress, - encoding=encoding, + compression=compression, storage_options=storage_options, ) diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py index 2595149c03444..91852f5bd281a 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata.py @@ -1,27 +1,22 @@ from datetime import datetime import io import os -import platform -import subprocess from tempfile import TemporaryDirectory from typing import ( Dict, List, Optional, - Type, Union, ) from pandas._typing import ( Buffer, + CompressionOptions, FilePathOrBuffer, StorageOptions, ) from pandas.compat._optional import import_optional_dependency -from pandas.errors import ( - AbstractMethodError, - ParserError, -) +from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from 
pandas.core.dtypes.common import is_list_like @@ -36,51 +31,14 @@ is_url, stringify_path, ) -from pandas.io.feather_format import read_feather -from pandas.io.parquet import read_parquet -from pandas.io.parsers import read_csv -from pandas.io.sql import read_sql - - -class RScriptError(Exception): - """ - Exception raises when command line call to RScript throws a non-empty - error message. Message will capture verbatim R output in console. - """ - - pass - - -def _executable_exists(name) -> bool: - """ - Internal method to check if R exists on system. - - This method will return True if R is installed for Rscript command - line call and if machine recognizes Rscript in Path env variable. - """ - - WHICH_CMD = "where" if platform.system() == "Windows" else "which" - - return ( - subprocess.call( - [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - == 0 - ) - - -RSCRIPT_EXISTS = _executable_exists("Rscript") @doc(storage_options=_shared_docs["storage_options"]) def read_rdata( path_or_buffer: FilePathOrBuffer, file_format: str = "infer", - engine: str = "pyreadr", - mode: str = "csv", select_frames: Optional[List[str]] = None, rownames: bool = True, - encoding: str = "utf-8", storage_options: StorageOptions = None, ) -> Union[DataFrame, Dict[str, DataFrame]]: r""" @@ -99,24 +57,6 @@ def read_rdata( commands. Default 'infer' will use extension in file name to to determine the format type. - engine : {{'pyreadr'. 'rscript'}}, default 'pyreadr' - Engine used to parse or read R data. Currently, two types are - supported: ``pyreadr`` which requires the pyreadr package to be - installed and ``rscript`` which requires R to be installed on machine. - For ``rscript``, be sure the R bin installation folder is included in - the system Path environment variable. The ``pyreadr`` is the faster - parser to handle most needs but ``rscript`` engine provides fuller - support of rda and rds formats since it calls native R commands. 
- - mode : {{'csv', 'parquet', 'feather', 'sqlite'}}, default 'csv' - Python and R I/O transfer mode that only applies to ``rscript`` - engine (ignored for ``pyreadr``). Using ``csv`` (text approach), no - additional packages are required. Using ``parquet`` or ``feather`` - (binary approach) requires pyarrow installed in Python and arrow - package installed in R. Using ``sqlite`` (database approach) requires - RSQLite package installed in R. Binary will usually be faster to process - than text data. Database usually ensures data type integrity. - select_frames : list, default None Selected names of DataFrames to return from R rda and RData types that can contain multiple objects. @@ -124,10 +64,6 @@ def read_rdata( rownames : bool, default True Include original rownames in R data frames to map into a DataFrame index. - encoding : str, optional, default 'utf-8' - Encoding of R data. Currently, ``pyreadr`` engine only supports utf-8 - encoded data. - {storage_options} Returns @@ -144,16 +80,9 @@ def read_rdata( Notes ----- - For ``pyreadr`` engine, any R data file that contains a non-data.frame object - may raise parsing errors. For ``rscript`` engine, such objects will be - ignored. Both methods will or attempt to return data.frame objects or any - object that is coercible to R's data.frame such as matrix, tibble, - and data.table. For arrays, method will attempt to convert to 2D - structure and may not reproduce original R object representation. - - If object in rds types or all objects in rda or RData types are not data - frames, this method will raise an error and will not return None or an empty - dictionary. + Any R data file that contains a non-data.frame object may raise parsing errors. + Method will return data.frame, matrix, and data.frame like object such as + tibbles and data.tables. For ``pyreadr`` engine, ``select_frames`` above is synonymous to ``use_objects`` in package's `read_r` method. 
Also, ``timezone`` argument defaults to current @@ -161,7 +90,7 @@ def read_rdata( Examples -------- - To read an .rds file which only contains a single object, below returns a + For an .rds file which only contains a single R object, method returns a DataFrame: >>> R_code = ''' @@ -195,7 +124,7 @@ def read_rdata( 4 Fluorinated gases 2018 182.78 5 Total 2018 6676.65 - To read an .rda or .RData file which can contain multiple objects, blue + For an .RData or .rda file which can contain multiple R objects, method returns a ``dict`` of DataFrames: >>> R_code = ''' @@ -255,79 +184,13 @@ def read_rdata( 5 2020 12 NRTSI-G S 10.44 6.50}} """ - return _parse( - path_or_buffer=path_or_buffer, - file_format=file_format, - engine=engine, - mode=mode, - select_frames=select_frames, - rownames=rownames, - encoding=encoding, - storage_options=storage_options, - ) - + import_optional_dependency("pyreadr") -def _parse( - path_or_buffer, - file_format, - engine, - mode, - select_frames, - rownames, - encoding, - storage_options, - **kwargs, -) -> Union[DataFrame, Dict[str, DataFrame]]: - """ - Call internal parser classes. - - This method will conditionally call internal parsers: - _PyReadrParser or _RscriptParser. - - Raises - ------ - FileNotFoundError - * If Rscript bin executable is not installed or found on machine. - - ImportError - * If pyreadr for engine and pyarrow for mode is not installed. - - ValueError - * If engine is neither pyreadr or rscript. 
- """ - pyreadr = import_optional_dependency("pyreadr", errors="ignore") - pyarrow = import_optional_dependency("pyarrow", errors="ignore") - - RDataReader: Union[Type[_PyReadrParser], Type[_RscriptParser]] - - if engine == "pyreadr": - if pyreadr is None: - raise ImportError("pyreadr not found, please install for this engine.") - - RDataReader = _PyReadrParser - - elif engine == "rscript": - if RSCRIPT_EXISTS is None: - raise FileNotFoundError( - "R is either not installed on this system or its " - "bin folder is not in Path environment variable." - ) - - if pyarrow is None and mode in ["parquet", "feather"]: - raise ImportError("pyarrow not found, please install for this mode.") - - RDataReader = _RscriptParser - else: - raise ValueError(f"{engine} is not a supported engine.") - - rdr = RDataReader( + rdr = _PyReadrParser( path_or_buffer, file_format, - engine, - mode, select_frames, rownames, - encoding, storage_options, ) @@ -409,21 +272,12 @@ class _RDataReader: file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' R serialization type. - engine : {{'pyreadr', 'rscript'}}, default 'pyreadr' - Engine used to parse or read R data. - - mode : {{'csv', 'parquet', 'feather', 'sqlite'}}, default 'csv' - Python and R i/o transfer mode. - select_frames : list, default None Selected names of DataFrames to return from R data. rownames : bool, default True Include original rownames in R data frames. - encoding : str, optional, default 'utf-8' - Encoding of R data. - storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. 
host, port, username, password, etc., @@ -431,7 +285,6 @@ class _RDataReader: See also -------- pandas.io.rdata._PyReadrParser - pandas.io.rdata._RscriptParser Notes ----- @@ -447,20 +300,14 @@ def __init__( self, path_or_buffer, file_format, - engine, - mode, select_frames, rownames, - encoding, storage_options, ) -> None: self.path_or_buffer = path_or_buffer self.file_format = file_format.lower() - self.engine = engine - self.mode = mode self.select_frames = select_frames self.rownames = rownames - self.encoding = encoding self.storage_options = storage_options def verify_params(self) -> None: @@ -489,14 +336,6 @@ def verify_params(self) -> None: if self.file_format == "infer": self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:] - if self.mode is not None and self.mode not in [ - "csv", - "feather", - "parquet", - "sqlite", - ]: - raise ValueError(f"'{self.mode}' is not supported value for mode.") - if self.select_frames is not None and not is_list_like(self.select_frames): raise TypeError( f"{type(self.select_frames).__name__} is " @@ -508,14 +347,14 @@ def buffer_to_disk(self, tmp_dir: str) -> str: Convert path or buffer to disk file. This method will convert path_or_buffer to temp file - for pyreadr to parse and rscript to import. + for pyreadr to parse from disk. """ r_temp = os.path.join(tmp_dir, "rdata.rda") handle_data = _get_data_from_filepath( filepath_or_buffer=self.path_or_buffer, - encoding=self.encoding, + encoding="utf-8", compression=None, storage_options=self.storage_options, ) @@ -584,524 +423,6 @@ def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: return rdata -class _RscriptParser(_RDataReader): - """ - Internal class to parse R data types using temp script and data - files and command line call to installed Rscript executable. 
- """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.verify_params() - - def handle_rownames(self, df) -> DataFrame: - if self.rownames: - df = df.set_index("rownames") - else: - df = df.drop(["rownames"], axis=1) - - return df - - def run_rscript(self, tmp_dir, r_batch, cmds) -> str: - """ - Run R script at command line. - - This method will call subprocess.Popen to run R script that - saves temp data and meta files and returns R's console output. - """ - - with open(cmds[1], "w") as f: - f.write(r_batch) - - p = subprocess.Popen( - cmds, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding=self.encoding, - cwd=tmp_dir, - ) - output, error = p.communicate() - if len(error) != 0: - raise RScriptError(error) - - return output - - def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: - self.r_to_py_types = { - "logical": "bool", - "integer": "int64", - "numeric": "float64", - "factor": "category", - "character": "str", - "Date": "date", - "POSIXct": "date", - } - - switch_board = { - "rda": { - "csv": self.read_rdata_csv, - "feather": self.read_rdata_feather, - "parquet": self.read_rdata_parquet, - "sqlite": self.read_rdata_sqlite, - }, - "rdata": { - "csv": self.read_rdata_csv, - "feather": self.read_rdata_feather, - "parquet": self.read_rdata_parquet, - "sqlite": self.read_rdata_sqlite, - }, - "rds": { - "csv": self.read_rds_csv, - "feather": self.read_rds_feather, - "parquet": self.read_rds_parquet, - "sqlite": self.read_rds_sqlite, - }, - } - - rdata: Union[DataFrame, Dict[str, DataFrame], None] - rdata = switch_board[self.file_format][self.mode]() - - rdata = ( - {k: v for k, v in rdata.items() if k in self.select_frames} - if self.select_frames - else rdata - ) - rdata = {k: self.handle_rownames(df) for k, df in rdata.items()} - - rdata = rdata or None - rdata = ( - rdata["r_df"] - if (self.file_format == "rds" and rdata is not None) - else rdata - ) - - if rdata is None: 
- raise ValueError( - "No actual data frame or coercible data frames found in R data file." - ) - return rdata - - def read_rdata_csv(self) -> Dict[str, DataFrame]: - """ - Read R rda data via IO csv. - - This method will call `load` and `write.csv` in R to export all - data frames and metadata into temp csv files for pandas `read_csv`. . - """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - load(args[1], temp_env <- new.env()) - - env_list <- as.list.environment(temp_env) - rm(temp_env) - - output_data_meta <- function(obj, nm) { - df <- tryCatch(data.frame(obj, - check.names=FALSE, - stringsAsFactors=FALSE - ), error=function(e) NULL) - - if (!is.null(df)) { - cat(nm, "\n", sep="") - - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - writeLines( - c(paste0(colnames(df), collapse=","), - paste0(sapply(df, - function(x) class(x)[1]), collapse=",")), - con=paste0("meta_", nm, ".txt") - ) - - write.csv(df, paste0("data_", nm, ".csv"), - row.names=FALSE, na="") - } - } - - output <- mapply(output_data_meta, env_list, names(env_list)) - """ - - with TemporaryDirectory() as tmp_dir: - r_file = os.path.join(tmp_dir, "r_batch.R") - rda_file = self.buffer_to_disk(tmp_dir) - - output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output_list = [i for i in output.strip().split("\n") if i != ""] - - oline: str - dfs: Dict[str, DataFrame] = {} - for oline in output_list: - with open( - os.path.join(tmp_dir, f"meta_{oline}.txt"), - encoding=self.encoding, - ) as f: - flines = [fline.strip() for fline in f] - - r_hdrs: List[List[str]] = [h.split(",") for h in flines] - py_types = {n: self.r_to_py_types[d] for n, d in zip(*r_hdrs)} - - dt_cols = [col for col, d in py_types.items() if d == "date"] - py_types = {k: v for k, v in py_types.items() if v != "date"} - - try: - dfs[oline] = read_csv( - os.path.join(tmp_dir, f"data_{oline}.csv"), - dtype=py_types, # type: ignore[arg-type] - 
parse_dates=dt_cols, - encoding=self.encoding, - ) - except (ParserError, ValueError): - dfs[oline] = read_csv( - os.path.join(tmp_dir, f"data_{oline}.csv"), - encoding=self.encoding, - ) - - return dfs - - def read_rdata_feather(self) -> Dict[str, DataFrame]: - """ - Read R rda data via IO feather. - - This method will call `readRDS` and `write_feather` in R to export all - data frames into temp feather files for pandas `read_feather`. - """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - - args <- commandArgs(trailingOnly=TRUE) - - load(args[1], temp_env <- new.env()) - env_list <- as.list.environment(temp_env) - rm(temp_env) - - output_data_meta <- function(obj, nm) { - df <- tryCatch(data.frame(obj, - check.names=FALSE, - stringsAsFactors=FALSE - ), error=function(e) NULL) - - if (!is.null(df)) { - cat(nm, "\n", sep="") - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - arrow::write_feather(df, paste0("data_", nm, ".feather")) - } - } - - output <- mapply(output_data_meta, env_list, names(env_list)) - """ - - with TemporaryDirectory() as tmp_dir: - r_file = os.path.join(tmp_dir, "r_batch.R") - rda_file = self.buffer_to_disk(tmp_dir) - - output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output_list = [i for i in output.strip().split("\n") if i != ""] - - oline: str - dfs: Dict[str, DataFrame] = { - oline: read_feather(os.path.join(tmp_dir, f"data_{oline}.feather")) - for oline in output_list - } - - return dfs - - def read_rdata_parquet(self) -> Dict[str, DataFrame]: - """ - Read R rda data via IO parquet. - - This method will call `load` and `write_parquet` in R to export all - data frames into temp parquet files for pandas `read_parquet`. 
- """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - - args <- commandArgs(trailingOnly=TRUE) - - load(args[1], temp_env <- new.env()) - env_list <- as.list.environment(temp_env) - rm(temp_env) - - output_data_meta <- function(obj, nm) { - df <- tryCatch(data.frame(obj, - check.names=FALSE, - stringsAsFactors=FALSE - ), error=function(e) NULL) - - if (!is.null(df)) { - cat(nm, "\n", sep="") - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - arrow::write_parquet(df, paste0("data_", nm, ".parquet")) - } - } - - output <- mapply(output_data_meta, env_list, names(env_list)) - """ - - with TemporaryDirectory() as tmp_dir: - r_file = os.path.join(tmp_dir, "r_batch.R") - rda_file = self.buffer_to_disk(tmp_dir) - - output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output_list = [i for i in output.strip().split("\n") if i != ""] - - oline: str - dfs: Dict[str, DataFrame] = { - oline: read_parquet(os.path.join(tmp_dir, f"data_{oline}.parquet")) - for oline in output_list - } - - return dfs - - def read_rdata_sqlite(self) -> Dict[str, DataFrame]: - """ - Read R rda data via IO sql. - - This method will call `load` and `dbWriteTable` in R to export all - data frames into a temp SQLite database for pandas `read_sql`. 
- """ - import sqlite3 - - r_batch = """ - suppressPackageStartupMessages(library(RSQLite)) - - args <- commandArgs(trailingOnly=TRUE) - - load(args[1], temp_env <- new.env()) - env_list <- as.list.environment(temp_env) - rm(temp_env) - - conn <- dbConnect(RSQLite::SQLite(), "r_data.db") - output_data_meta <- function(obj, nm) { - df <- tryCatch(data.frame(obj, - check.names=FALSE, - stringsAsFactors=FALSE - ), error=function(e) NULL) - - if (!is.null(df)) { - cat(nm, "\n", sep="") - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - dbWriteTable(conn, paste0("data_", nm), df, row.names=FALSE) - } - } - - output <- mapply(output_data_meta, env_list, names(env_list)) - dbDisconnect(conn) - """ - - with TemporaryDirectory() as tmp_dir: - r_db = os.path.join(tmp_dir, "r_data.db") - r_file = os.path.join(tmp_dir, "r_batch.R") - rda_file = self.buffer_to_disk(tmp_dir) - - output = self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rda_file]) - output_list = [i for i in output.strip().split("\n") if i != ""] - - oline: str - conn = sqlite3.connect(r_db) - dfs: Dict[str, DataFrame] = { - oline: read_sql(f"SELECT * FROM data_{oline}", conn) - for oline in output_list - } - conn.close() - - return dfs - - def read_rds_csv(self) -> Dict[str, DataFrame]: - """ - Read R rds data via IO csv. - - This method will call `readRDS` and `write.csv` in R to export single - data frame and metadata into temp csv files for pandas `read_csv`. 
- """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - - raw <- readRDS(args[1]) - df <- tryCatch(data.frame(raw, - check.names=FALSE, - stringsAsFactors=FALSE - ), error = function(e) NULL) - - if(!is.null(df)) { - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - write.csv(df, file=args[2], row.names=FALSE) - - cat(paste0(colnames(df), collapse=","),"|", - paste0(sapply(df, function(x) - class(x)[1]), collapse=","), - sep="") - } - """ - - dfs: Dict[str, DataFrame] = {} - with TemporaryDirectory() as tmp_dir: - r_data = os.path.join(tmp_dir, "r_data.csv") - r_file = os.path.join(tmp_dir, "r_batch.R") - - rds_file = self.buffer_to_disk(tmp_dir) - output = self.run_rscript( - tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data] - ) - - if os.path.isfile(r_data): - r_hdrs = [h.split(",") for h in output.split("|")] - n: str - py_types = {n: self.r_to_py_types[d] for n, d in zip(*r_hdrs)} - - dt_cols = [col for col, d in py_types.items() if d == "date"] - py_types = {k: v for k, v in py_types.items() if v != "date"} - - try: - dfs["r_df"] = read_csv( - r_data, - dtype=py_types, # type: ignore[arg-type] - parse_dates=dt_cols, - encoding=self.encoding, - ) - except (ParserError, ValueError): - dfs["r_df"] = read_csv(r_data) - - return dfs - - def read_rds_feather(self) -> Dict[str, DataFrame]: - """ - Read R rds data via IO feather. - - This method will call `readRDS` and `write_feather` in R to export single - data frame into a temp feather file for pandas `read_feather`. 
- """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - raw <- readRDS(args[1]) - df <- tryCatch(data.frame(raw, - check.names=FALSE, - stringsAsFactors=FALSE - ), error = function(e) NULL) - - if(!is.null(df)) { - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - arrow::write_feather(df, args[2]) - } - """ - - with TemporaryDirectory() as tmp_dir: - r_data = os.path.join(tmp_dir, "r_data.feather") - r_file = os.path.join(tmp_dir, "r_batch.R") - - rds_file = self.buffer_to_disk(tmp_dir) - self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) - - dfs: Dict[str, DataFrame] = ( - {"r_df": read_feather(r_data)} if os.path.isfile(r_data) else {} - ) - - return dfs - - def read_rds_parquet(self) -> Dict[str, DataFrame]: - """ - Read R rds data via IO parquet. - - This method will call `readRDS` and `write_parquet` in R to export - single data frame into a temp parquet file for pandas `read_parquet`. - """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - raw <- readRDS(args[1]) - df <- tryCatch(data.frame(raw, - check.names=FALSE, - stringsAsFactors=FALSE - ), error = function(e) NULL) - - if(!is.null(df)) { - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - arrow::write_parquet(df, args[2]) - } - """ - - with TemporaryDirectory() as tmp_dir: - r_data = os.path.join(tmp_dir, "r_data.parquet") - r_file = os.path.join(tmp_dir, "r_batch.R") - - rds_file = self.buffer_to_disk(tmp_dir) - self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) - - dfs: Dict[str, DataFrame] = ( - {"r_df": read_parquet(r_data, engine="pyarrow")} - if os.path.isfile(r_data) - else {} - ) - - return dfs - - def read_rds_sqlite(self) -> Dict[str, DataFrame]: - """ - Read R rds data via IO sql. 
- - This method will call `readRDS` and `dbWriteTable` in R to export - single data frame into a temp SQLite database for pandas `read_sql`. - """ - import sqlite3 - - r_batch = """ - suppressPackageStartupMessages(library(RSQLite)) - args <- commandArgs(trailingOnly=TRUE) - - raw <- readRDS(args[1]) - df <- tryCatch(data.frame(raw, - check.names=FALSE, - stringsAsFactors=FALSE - ), error = function(e) NULL) - - if(!is.null(df)) { - conn <- dbConnect(RSQLite::SQLite(), args[2]) - df <- data.frame(rownames = row.names(df), df, - check.names=FALSE, - stringsAsFactors=FALSE) - dbWriteTable(conn, "rdata", df, row.names=FALSE) - dbDisconnect(conn) - } - """ - - dfs: Dict[str, DataFrame] = {} - with TemporaryDirectory() as tmp_dir: - r_data = os.path.join(tmp_dir, "r_data.db") - r_file = os.path.join(tmp_dir, "r_batch.R") - - rds_file = self.buffer_to_disk(tmp_dir) - self.run_rscript(tmp_dir, r_batch, ["Rscript", r_file, rds_file, r_data]) - - if os.path.isfile(r_data): - conn = sqlite3.connect(r_data) - dfs["r_df"] = read_sql("SELECT * FROM rdata", conn) - conn.close() - - return dfs - - class RDataWriter: """ Subclass to write pandas DataFrames into R data files. @@ -1114,31 +435,14 @@ class RDataWriter: file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' R serialization type. - engine : {{'rscript','pyreadr'}}, default 'utf-8' - Engine used to write R data. - - mode : {{'csv', 'parquet', 'feather'}}, default 'csv' - Python and R i/o transfer mode. - - other_frames : list, optional - Other DataFrames to be included in rda (not rds) files - that can contain multiple objects. - - rda_names : list, default ["pandas_dataframe"] - Names for all exported objects in rda file. + rda_name : str, default "pandas_dataframe" + Name for exported DataFrame in rda file. index : bool, default True Include index or MultiIndex in output as separate columns. - ascii : bool, default False - Write data in ASCII representation. 
- - compress : bool or {{'gzip', 'bzip2', 'xz'}}, default 'gzip' - Compression types for R data. For pyreadr engine, only gzip - is supported. Use False for uncompressed files. - - encoding : str, optional, default 'utf-8' - Encoding of R data. + compression : {'gzip', 'bz2', 'xz', None}, default 'gzip' + Compression type for on-the-fly decompression of on-disk data. storage_options : dict, optional Extra options that make sense for a particular storage connection, @@ -1147,7 +451,6 @@ class RDataWriter: See also -------- pandas.io.rdata.PyReadrWriter - pandas.io.rdata.RscriptWriter Notes ----- @@ -1163,27 +466,17 @@ def __init__( frame: DataFrame, path_or_buffer: FilePathOrBuffer, file_format: str = "infer", - engine: str = "rscript", - mode: str = "csv", - other_frames: Optional[List[DataFrame]] = None, - rda_names: List[str] = ["pandas_dataframe"], + rda_name: str = "pandas_dataframe", index: bool = True, - ascii: bool = False, - compress: Union[bool, str] = "gzip", - encoding: str = "utf-8", + compression: CompressionOptions = "gzip", storage_options: StorageOptions = None, ) -> None: self.frame = frame self.path_or_buffer = path_or_buffer self.file_format = file_format.lower() - self.engine = engine - self.mode = mode - self.other_frames = other_frames - self.rda_names = rda_names + self.rda_name = rda_name self.index = index - self.ascii = ascii - self.compress = compress - self.encoding = encoding + self.compression = compression self.storage_options = storage_options def verify_params(self) -> None: @@ -1212,40 +505,14 @@ def verify_params(self) -> None: if self.file_format == "infer" and isinstance(self.path_or_buffer, str): self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:] - if self.mode is not None and self.mode not in [ - "csv", - "feather", - "parquet", - "sqlite", - ]: - raise ValueError(f"{self.mode} is not supported value for mode.") - - if self.other_frames is not None and not is_list_like(self.other_frames): - raise 
TypeError( - f"{type(self.other_frames).__name__} is not " - " a valid type for other_frames." - ) - elif self.other_frames is not None: - for df in self.other_frames: - if not isinstance(df, DataFrame): - raise TypeError( - "One or more of the objects in " - "other_frames is not a DataFrame." - ) - - if self.rda_names is not None and not is_list_like(self.rda_names): - raise TypeError( - f"{type(self.rda_names).__name__} is not a valid type for rda_names." - ) - - if self.compress is not None and self.compress not in [ - True, - False, + if self.compression is not None and self.compression not in [ "gzip", - "bzip2", + "bz2", "xz", ]: - raise ValueError(f"{self.compress} is not a supported value for compress.") + raise ValueError( + f"{self.compression} is not a supported value for compression." + ) def disk_to_buffer(self, r_file: str) -> None: """ @@ -1258,7 +525,7 @@ def disk_to_buffer(self, r_file: str) -> None: with get_handle( self.path_or_buffer, "wb", - compression=None, + compression=self.compression, storage_options=self.storage_options, is_text=False, ) as handles: @@ -1306,527 +573,16 @@ def write_data(self) -> None: write_rdata( path=r_temp, df=self.frame, - df_name=self.rda_names[0], - compress=self.compress, + df_name=self.rda_name, + compress=None, ) elif self.file_format == "rds": - write_rds(path=r_temp, df=self.frame, compress=self.compress) - - self.disk_to_buffer(r_temp) - - return None - - -class RscriptWriter(RDataWriter): - """ - Main class called in `pandas.core.frame` to write DataFrame(s) to R - data types using command line to Rscript. 
- """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.verify_params() - self.handle_objects() - - def handle_objects(self) -> None: - - self.all_frames = ( - [self.frame] + self.other_frames if self.other_frames else [self.frame] - ) - - if len(self.rda_names) != len(self.all_frames): - raise ValueError( - f"Length of {self.rda_names} does not match number " - "of current DataFrame and other_frames" - ) - - return None - - def run_rscript(self, tmp_dir, r_batch, cmds) -> None: - """ - Run R script at command line. - - This method will call subprocess.Popen to run R script - and return only non-empty error R output in console. - """ - - with open(cmds[1], "w") as f: - f.write(r_batch) - - a = subprocess.Popen( - cmds, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding=self.encoding, - cwd=tmp_dir, - ) - output, error = a.communicate() - if len(error) != 0: - raise RScriptError(error) - - return None - - def write_data(self) -> None: - self.py_to_r_types = { - "int32": "integer", - "int64": "integer", - "float64": "numeric", - "category": "factor", - "object": "character", - "bool": "logical", - "datetime64[ns]": "POSIXct", - } - - switch_board = { - "rda": { - "csv": self.write_rdata_csv, - "feather": self.write_rdata_feather, - "parquet": self.write_rdata_parquet, - "sqlite": self.write_rdata_sqlite, - }, - "rdata": { - "csv": self.write_rdata_csv, - "feather": self.write_rdata_feather, - "parquet": self.write_rdata_parquet, - "sqlite": self.write_rdata_sqlite, - }, - "rds": { - "csv": self.write_rds_csv, - "feather": self.write_rds_feather, - "parquet": self.write_rds_parquet, - "sqlite": self.write_rds_sqlite, - }, - } - - switch_board[self.file_format][self.mode]() - - return None - - def write_rdata_csv(self) -> None: - """ - Write R rda data via IO csv. - - This method will export one or more DataFrames into temp data - and metadata csv files and call `read.csv` and `save` in R. 
- """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - - py_names <- strsplit(args[1], ",")[[1]] - - for(obj in py_names) { - meta <- paste0("meta_", obj, ".txt") - r_types <- strsplit(readLines(meta, n=-1, - warn=FALSE), ",")[[1]] - - data <- paste0("data_", obj, ".csv") - df <- tryCatch( - read.csv(data, colClasses=r_types), - error = function(e) read.csv(data) - ) - assign(obj, df) - rm(df) - } - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - dfs <- names(Filter(is.data.frame, mget(ls()))) - save(list=dfs, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - for nm, df in zip(self.rda_names, self.all_frames): - - data_file = os.path.join(tmp_dir, f"data_{nm}.csv") - meta_file = os.path.join(tmp_dir, f"meta_{nm}.txt") - r_code = os.path.join(tmp_dir, "rbatch.R") - r_temp = os.path.join(tmp_dir, "rdata.rda") - - df = df.reset_index() if self.index else df - df.to_csv(data_file, index=False) - - with open(meta_file, "w") as f: - f.write( - ",".join( - self.py_to_r_types[p] - for p in df.dtypes.astype(str).tolist() - ) - ) - - cmds = [ - "Rscript", - r_code, - ",".join(self.rda_names), - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - return None - - def write_rdata_feather(self) -> None: - """ - Write R rda data via IO feather. - - This method will export one or more DataFrames into temp - feather files and call `read_feather` and `save` in R. 
- """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - py_names <- strsplit(args[1], ",")[[1]] - - for(obj in py_names) { - data <- paste0("data_", obj, ".feather") - df <- arrow::read_feather(data) - assign(obj, df) - rm(df) - } - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - dfs <- names(Filter(is.data.frame, mget(ls()))) - save(list=dfs, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - for nm, df in zip(self.rda_names, self.all_frames): - - data_file = os.path.join(tmp_dir, f"data_{nm}.feather") - r_code = os.path.join(tmp_dir, "rbatch.R") - r_temp = os.path.join(tmp_dir, "rdata.rda") - - df = df.reset_index() if self.index else df.reset_index(drop=True) - df.to_feather(data_file) - - cmds = [ - "Rscript", - r_code, - ",".join(self.rda_names), - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rdata_parquet(self) -> None: - """ - Write R rda data via IO parquet. - - This method will export one or more DataFrames into temp - parquet files and call `read_parquet` and `save` in R. 
- """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - py_names <- strsplit(args[1], ",")[[1]] - - for(obj in py_names) { - data <- paste0("data_", obj, ".parquet") - df <- arrow::read_parquet(data) - assign(obj, df) - rm(df) - } - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - dfs <- names(Filter(is.data.frame, mget(ls()))) - save(list=dfs, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - for nm, df in zip(self.rda_names, self.all_frames): - - data_file = os.path.join(tmp_dir, f"data_{nm}.parquet") - r_code = os.path.join(tmp_dir, "rbatch.R") - r_temp = os.path.join(tmp_dir, "rdata.rda") - - df = df.reset_index() if self.index else df - df.to_parquet(data_file, index=False) - - cmds = [ - "Rscript", - r_code, - ",".join(self.rda_names), - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rdata_sqlite(self) -> None: - """ - Write R rda data via IO sql. - - This method will export one or more DataFrames into a temp - SQLite database and call `dbReadTable` and `save` in R. 
- """ - import sqlite3 - - r_batch = """ - suppressPackageStartupMessages(library(RSQLite)) - args <- commandArgs(trailingOnly=TRUE) - - conn <- dbConnect(RSQLite::SQLite(), args[1]) - py_names <- strsplit(args[2], ",")[[1]] - - for(obj in py_names) { - data <- paste0("data_", obj) - df <- dbReadTable(conn, data) - assign(obj, df) - rm(df) - } - dbDisconnect(conn) - - r_ascii <- as.logical(args[4]) - r_compress <- ifelse(args[5] %in% c("True", "False"), - as.logical(args[5]), - args[5]) - - dfs <- names(Filter(is.data.frame, mget(ls()))) - save(list=dfs, file=args[3], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - r_db = os.path.join(tmp_dir, "rdata.db") - conn = sqlite3.connect(r_db) - - for nm, df in zip(self.rda_names, self.all_frames): - r_code = os.path.join(tmp_dir, "rbatch.R") - r_temp = os.path.join(tmp_dir, "rdata.rda") - - df = df.reset_index() if self.index else df - df.to_sql(f"data_{nm}", conn, index=False) - - conn.close() - cmds = [ - "Rscript", - r_code, - r_db, - ",".join(self.rda_names), - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rds_csv(self) -> None: - """ - Write R rds data via IO csv. - - This method will export a single DataFrame into temp csv - data and call `read.csv` and `saveRDS` in R. 
- """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - py_data <- args[1] - r_types <- strsplit(args[2], ",")[[1]] - - df <- tryCatch( - read.csv(py_data, colClasses=r_types), - error = function(e) read.csv(py_data) - ) - - r_ascii <- as.logical(args[4]) - r_compress <- ifelse(args[5] %in% c("True", "False"), - as.logical(args[5]), - args[5]) - - saveRDS(df, file=args[3], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - r_code = os.path.join(tmp_dir, "rbatch.R") - py_data = os.path.join(tmp_dir, "pydata.csv") - r_temp = os.path.join(tmp_dir, "rdata.rds") - - py_df = self.frame.reset_index() if self.index else self.frame - r_types = ",".join(py_df.dtypes.astype(str).replace(self.py_to_r_types)) - - py_df.to_csv(py_data, index=False) - - cmds = [ - "Rscript", - r_code, - py_data, - r_types, - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) + write_rds( + path=r_temp, + df=self.frame, + compress=None, + ) self.disk_to_buffer(r_temp) return None - - def write_rds_feather(self) -> None: - """ - Write R rds data via IO feather. - - This method will export a single DataFrame into a temp - feather file to call `read_feather` and `saveRDS` in R. 
- """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - df <- arrow::read_feather(args[1]) - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - saveRDS(df, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - r_code = os.path.join(tmp_dir, "rbatch.R") - py_data = os.path.join(tmp_dir, "pydata.feather") - r_temp = os.path.join(tmp_dir, "rdata.rds") - - py_df = ( - self.frame.reset_index() - if self.index - else self.frame.reset_index(drop=True) - ) - - py_df.to_feather(py_data) - - cmds = [ - "Rscript", - r_code, - py_data, - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rds_parquet(self) -> None: - """ - Write R rds data via IO parquet. - - This method will export a single DataFrame into a temp - parquet file for `read_parquet` and `saveRDS` in R. - """ - - r_batch = """ - suppressPackageStartupMessages(library(arrow)) - args <- commandArgs(trailingOnly=TRUE) - - df <- arrow::read_parquet(args[1]) - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - saveRDS(df, file=args[2], - ascii=r_ascii, compress=r_compress) - """ - - with TemporaryDirectory() as tmp_dir: - r_code = os.path.join(tmp_dir, "rbatch.R") - py_data = os.path.join(tmp_dir, "pydata.parquet") - r_temp = os.path.join(tmp_dir, "rdata.rds") - - py_df = self.frame.reset_index() if self.index else self.frame - - py_df.to_parquet(py_data, index=False) - - cmds = [ - "Rscript", - r_code, - py_data, - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) - - def write_rds_sqlite(self) -> None: - """ - Write R rds data via IO sql. 
- - This method will export a single DataFrame into a temp - parquet file for `dbReadTable` and `saveRDS` in R. - """ - import sqlite3 - - r_batch = """ - suppressPackageStartupMessages(library(RSQLite)) - args <- commandArgs(trailingOnly=TRUE) - - conn <- dbConnect(RSQLite::SQLite(), args[1]) - df <- dbReadTable(conn, "pydata") - - r_ascii <- as.logical(args[3]) - r_compress <- ifelse(args[4] %in% c("True", "False"), - as.logical(args[4]), - args[4]) - - saveRDS(df, file=args[2], - ascii=r_ascii, compress=r_compress) - dbDisconnect(conn) - """ - - with TemporaryDirectory() as tmp_dir: - r_code = os.path.join(tmp_dir, "rbatch.R") - py_data = os.path.join(tmp_dir, "pydata.db") - r_temp = os.path.join(tmp_dir, "rdata.rds") - - py_df = self.frame.reset_index() if self.index else self.frame - - conn = sqlite3.connect(py_data) - py_df.to_sql("pydata", conn, index=False) - conn.close() - - cmds = [ - "Rscript", - r_code, - py_data, - r_temp, - str(self.ascii), - str(self.compress), - ] - self.run_rscript(tmp_dir, r_batch, cmds) - - self.disk_to_buffer(r_temp) diff --git a/pandas/tests/io/rdata/test_rscript.py b/pandas/tests/io/rdata/test_rscript.py deleted file mode 100644 index 5df2b499a66a3..0000000000000 --- a/pandas/tests/io/rdata/test_rscript.py +++ /dev/null @@ -1,987 +0,0 @@ -from io import BytesIO -import os -import subprocess -from urllib.error import HTTPError - -import pytest - -from pandas.compat._optional import import_optional_dependency -import pandas.util._test_decorators as td - -from pandas import DataFrame -import pandas._testing as tm - -from pandas.io.rdata import ( - RSCRIPT_EXISTS, - RScriptError, - read_rdata, -) - -pytestmark = pytest.mark.skipif(not RSCRIPT_EXISTS, reason="R is not installed.") - -ghg_df = DataFrame( - { - "gas": { - "141": "Carbon dioxide", - "142": "Methane", - "143": "Nitrous oxide", - "144": "Fluorinated gases", - "145": "Total", - }, - "year": {"141": 2018, "142": 2018, "143": 2018, "144": 2018, "145": 2018}, - "emissions": 
{ - "141": 5424.88150213288, - "142": 634.457127078267, - "143": 434.528555376666, - "144": 182.782432461777, - "145": 6676.64961704959, - }, - } -).rename_axis("rownames") - -plants_df = DataFrame( - { - "plant_group": { - "16": "Pteridophytes", - "17": "Pteridophytes", - "18": "Pteridophytes", - "19": "Pteridophytes", - "20": "Pteridophytes", - }, - "status": { - "16": "Data Deficient", - "17": "Extinct", - "18": "Not Threatened", - "19": "Possibly Threatened", - "20": "Threatened", - }, - "count": {"16": 398, "17": 65, "18": 1294, "19": 408, "20": 1275}, - } -).rename_axis("rownames") - -sea_ice_df = DataFrame( - { - "year": {"1012": 2016, "1013": 2017, "1014": 2018, "1015": 2019, "1016": 2020}, - "mo": {"1012": 12, "1013": 12, "1014": 12, "1015": 12, "1016": 12}, - "data.type": { - "1012": "Goddard", - "1013": "Goddard", - "1014": "Goddard", - "1015": "Goddard", - "1016": "NRTSI-G", - }, - "region": {"1012": "S", "1013": "S", "1014": "S", "1015": "S", "1016": "S"}, - "extent": { - "1012": 8.28, - "1013": 9.48, - "1014": 9.19, - "1015": 9.41, - "1016": 10.44, - }, - "area": {"1012": 5.51, "1013": 6.23, "1014": 5.59, "1015": 6.59, "1016": 6.5}, - } -).rename_axis("rownames") - - -def run_rscript(cmds) -> str: - """ - Run R script at command line. - - This method will read write_rdata output and check - console output. - """ - - r_batch = """ - args <- commandArgs(trailingOnly=TRUE) - - switch(args[2], - "rda" = load(args[1]), - "rds" = { - pandas_dataframe <- readRDS(args[1]) - } - ) - - rm(args) - mget(ls()) - """ - with open(cmds[1], "w") as f: - f.write(r_batch) - - p = subprocess.Popen( - cmds, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding="UTF-8", - ) - output, error = p.communicate() - if len(error) != 0: - raise ValueError(error) - - return output - - -def r_package_installed(name): - """ - Check if R package is installed. 
- - Method runs a quick command line call to Rscript to - check if library call succeeds on named package. - """ - - p = subprocess.Popen( - ["Rscript", "-e", f"suppressPackageStartupMessages(library({name}))"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - out, err = p.communicate() - - return len(err) == 0 - - -R_ARROW = r_package_installed("arrow") if RSCRIPT_EXISTS else None -R_RSQLITE = r_package_installed("RSQLite") if RSCRIPT_EXISTS else None -PYARROW = import_optional_dependency("pyarrow", errors="ignore") - - -def adj_int(df): - """ - Convert int32 columns to int64. - - Since parquet and feather modes parses ints int int32, - this method converts for testing. - """ - for col in df.select_dtypes("int32").columns: - df[col] = df[col].astype("int64") - - return df - - -def handle_index_rownames(df): - df = df.drop(["rownames"], axis=1).set_index("index").rename_axis(None) - - return df - - -@pytest.fixture(params=["rda", "rds"]) -def rtype(request): - return request.param - - -@pytest.fixture( - params=[ - "csv", - pytest.param( - "parquet", - marks=pytest.mark.skipif( - not R_ARROW or not PYARROW, - reason="R arrow or pyarrow not installed", - ), - ), - pytest.param( - "feather", - marks=pytest.mark.skipif( - not R_ARROW or not PYARROW, - reason="R arrow or pyarrow not installed", - ), - ), - pytest.param( - "sqlite", - marks=pytest.mark.skipif(not R_RSQLITE, reason="R RSQLite not installed"), - ), - ] -) -def mode(request): - return request.param - - -@pytest.fixture(params=[True, False, None]) -def ascii(request): - return request.param - - -@pytest.fixture(params=[False, "gzip", "bzip2", "xz"]) -def comp(request): - return request.param - - -# RDA READER - -# PATH_OR_BUFFER - - -def test_read_rds_file(datapath): - filename = datapath("io", "data", "rdata", "ghg_df.rds") - r_df = read_rdata(filename, engine="rscript") - - if isinstance(r_df, DataFrame): - tm.assert_frame_equal(ghg_df, r_df.tail()) - - -def test_read_rda_file(datapath): - 
filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="rscript") - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) - tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) - tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) - - -def test_buffer_read_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - - with open(filename, "rb") as f: - r_df = read_rdata(f, file_format="rds", engine="rscript") - - output = adj_int(r_df).tail() - - tm.assert_frame_equal(sea_ice_df, output) - - -def test_bytes_read_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - - with open(filename, "rb") as f: - r_dfs = read_rdata(f, file_format="rda", engine="rscript") - - r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) - tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) - tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) - - -def test_bytesio_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - - with open(filename, "rb") as f: - with BytesIO(f.read()) as b_io: - r_df = read_rdata(b_io, file_format="rds", engine="rscript") - - output = adj_int(r_df).tail() - - tm.assert_frame_equal(sea_ice_df, output) - - -def test_bytesio_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - - with open(filename, "rb") as f: - with BytesIO(f.read()) as b_io: - r_dfs = read_rdata(b_io, file_format="rda", engine="rscript") - - r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) - tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) - tm.assert_frame_equal(sea_ice_df, 
r_dfs["sea_ice_df"].tail()) - - -# FILE FORMAT - - -def test_read_wrong_format(datapath): - with pytest.raises(ValueError, match="not a valid value for file_format"): - filename = datapath("io", "data", "rdata", "plants_df.rds") - read_rdata(filename, engine="rscript", file_format="r") - - -def test_read_wrong_file(): - with pytest.raises(FileNotFoundError, match="file cannot be found"): - filename = os.path.join("data", "rdata", "plants_df.rda") - read_rdata(filename, engine="rscript") - - -@pytest.mark.slow -def test_read_rds_non_dfs(datapath, mode): - with pytest.raises( - ValueError, match="No actual data frame or coercible data frames" - ): - filename = datapath("io", "data", "rdata", "ghg_t_tests.rds") - read_rdata(filename, engine="rscript", mode=mode) - - -@pytest.mark.slow -def test_read_rda_non_dfs(datapath, mode): - with pytest.raises( - ValueError, match="No actual data frame or coercible data frames" - ): - filename = datapath("io", "data", "rdata", "env_data_non_dfs.rda") - read_rdata(filename, engine="rscript", mode=mode) - - -@td.skip_if_not_us_locale -def test_read_not_rda_file(datapath, mode): - with pytest.raises(RScriptError, match="bad restore file magic number"): - read_rdata( - datapath("io", "data", "rdata", "ppm_df.csv"), - file_format="rda", - engine="rscript", - mode=mode, - ) - - -@td.skip_if_not_us_locale -def test_read_not_rds_file(datapath, mode): - with pytest.raises(RScriptError, match="unknown input format"): - read_rdata( - datapath("io", "data", "rdata", "ppm_df.csv"), - file_format="rds", - engine="rscript", - mode=mode, - ) - - -def test_bytes_read_infer_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - - with pytest.raises(ValueError, match="Unable to infer file format from file name"): - with open(filename, "rb") as f: - read_rdata(f, engine="rscript") - - -def test_bytes_read_infer_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - - with pytest.raises(ValueError, 
match="Unable to infer file format from file name"): - with open(filename, "rb") as f: - read_rdata(f, engine="rscript") - - -# URL - - -@tm.network -def test_read_rda_url(): - url_df = DataFrame( - { - "carrier": {"1": "9E", "2": "AA", "3": "AS", "4": "B6", "5": "DL"}, - "name": { - "1": "Endeavor Air Inc.", - "2": "American Airlines Inc.", - "3": "Alaska Airlines Inc.", - "4": "JetBlue Airways", - "5": "Delta Air Lines Inc.", - }, - } - ).rename_axis("rownames") - - url = ( - "https://github.com/hadley/nycflights13/blob/master/data/airlines.rda?raw=true" - ) - r_df = read_rdata(url, file_format="rda", engine="rscript")["airlines"] - - tm.assert_frame_equal(url_df, r_df.head()) - - -@tm.network -def test_read_unable_infer_format(): - with pytest.raises(ValueError, match="Unable to infer file format from file name"): - url = ( - "https://github.com/hadley/nycflights13/" - "blob/master/data/airlines.rda?raw=true" - ) - read_rdata(url, engine="rscript") - - -@tm.network -def test_read_wrong_url(): - with pytest.raises(HTTPError, match="HTTP Error 404: Not Found"): - url = "https://example.com/data.rdata" - read_rdata(url, engine="rscript") - - -# S3 - - -@tm.network -@td.skip_if_no("s3fs") -def test_read_rda_s3(): - s3 = "s3://assets.datacamp.com/production/course_1478/datasets/wine.RData" - s3_df = DataFrame( - { - "Alcohol": {"1": 13.2, "2": 13.16, "3": 14.37, "4": 13.24, "5": 14.2}, - "Malic acid": {"1": 1.78, "2": 2.36, "3": 1.95, "4": 2.59, "5": 1.76}, - "Ash": {"1": 2.14, "2": 2.67, "3": 2.5, "4": 2.87, "5": 2.45}, - "Alcalinity of ash": { - "1": 11.2, - "2": 18.6, - "3": 16.8, - "4": 21.0, - "5": 15.2, - }, - "Magnesium": {"1": 100, "2": 101, "3": 113, "4": 118, "5": 112}, - "Total phenols": {"1": 2.65, "2": 2.8, "3": 3.85, "4": 2.8, "5": 3.27}, - "Flavanoids": {"1": 2.76, "2": 3.24, "3": 3.49, "4": 2.69, "5": 3.39}, - "Nonflavanoid phenols": { - "1": 0.26, - "2": 0.3, - "3": 0.24, - "4": 0.39, - "5": 0.34, - }, - "Proanthocyanins": {"1": 1.28, "2": 2.81, "3": 
2.18, "4": 1.82, "5": 1.97}, - "Color intensity": {"1": 4.38, "2": 5.68, "3": 7.8, "4": 4.32, "5": 6.75}, - "Hue": {"1": 3.4, "2": 3.17, "3": 3.45, "4": 2.93, "5": 2.85}, - "Proline": {"1": 1050, "2": 1185, "3": 1480, "4": 735, "5": 1450}, - } - ).rename_axis("rownames") - r_dfs = read_rdata(s3, engine="rscript") - - tm.assert_frame_equal(s3_df, r_dfs["wine"].head()) - - -# ENGINE - - -def test_read_rds_df_output(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_dfs = read_rdata(filename, engine="rscript") - - assert isinstance(r_dfs, DataFrame) - - -def test_read_rda_dict_output(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="rscript") - - assert isinstance(r_dfs, dict) - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - -def test_read_wrong_engine(datapath): - with pytest.raises(ValueError, match="not a supported engine"): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - read_rdata(filename, engine="rpy2") - - -# MODE - - -@pytest.mark.slow -def test_read_rds_mode_file(datapath, mode): - filename = datapath("io", "data", "rdata", "ghg_df.rds") - r_df = read_rdata(filename, engine="rscript", mode=mode) - - output = adj_int(r_df).tail() - - tm.assert_frame_equal(ghg_df, output) - - -@pytest.mark.slow -def test_read_rda_mode_file(datapath, mode): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="rscript", mode=mode) - - if mode in ["parquet", "feather"]: - (r_dfs["ghg_df"], r_dfs["plants_df"], r_dfs["sea_ice_df"]) = ( - adj_int(r_dfs["ghg_df"]), - adj_int(r_dfs["plants_df"]), - adj_int(r_dfs["sea_ice_df"]), - ) - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) - tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) - tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) - - -def 
test_read_wrong_mode(datapath): - with pytest.raises(ValueError, match="not supported value for mode"): - filename = datapath("io", "data", "rdata", "plants_df.rds") - read_rdata(filename, engine="rscript", mode="pickle") - - -# USE_OBJECTS - - -def test_read_select_frames_rda_dfs(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata( - filename, engine="rscript", select_frames=["ghg_df", "sea_ice_df"] - ) - - assert "plants_df" not in list(r_dfs.keys()) - assert "ghg_df" in list(r_dfs.keys()) - assert "sea_ice_df" in list(r_dfs.keys()) - - -def test_read_select_frames_rda_objs(datapath): - filename = datapath("io", "data", "rdata", "env_data_objs.rda") - r_dfs = read_rdata( - filename, - engine="rscript", - select_frames=["ppm_ts", "species_mtx", "plants_arry"], - ) - - assert "species_vec" not in list(r_dfs.keys()) - assert "ghg_df" not in list(r_dfs.keys()) - - assert "ppm_ts" in list(r_dfs.keys()) - assert "species_mtx" in list(r_dfs.keys()) - assert "plants_arry" in list(r_dfs.keys()) - - -def test_read_wrong_select_frames(datapath): - with pytest.raises(TypeError, match="not a valid type for select_frames"): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - read_rdata( - filename, - engine="rscript", - select_frames="plants_df", # type: ignore[arg-type] - ) - - -# ROWNAMES - - -def test_read_rownames_true_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="rscript", rownames=True) - - if isinstance(r_df, DataFrame): - assert r_df.index.name == "rownames" - - -def test_read_rownames_false_rds(datapath): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="rscript", rownames=False) - - if isinstance(r_df, DataFrame): - assert r_df.index.name != "rownames" - - -def test_read_rownames_true_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = 
read_rdata(filename, engine="rscript", rownames=True) - - assert r_dfs["ghg_df"].index.name == "rownames" - assert r_dfs["plants_df"].index.name == "rownames" - assert r_dfs["sea_ice_df"].index.name == "rownames" - - -def test_read_rownames_false_rda(datapath): - filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="rscript", rownames=False) - - assert r_dfs["ghg_df"].index.name != "rownames" - assert r_dfs["plants_df"].index.name != "rownames" - assert r_dfs["sea_ice_df"].index.name != "rownames" - - -# ENCODING - - -@pytest.mark.xfail( - reason="R encoding is locale specific. Need to think about workaround." -) -def test_non_utf8_data(datapath, rtype): - filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") - - expected = DataFrame( - { - "número": { - "1": 1, - "2": 2, - "3": 3, - "4": 4, - "5": 5, - "6": 6, - "7": 7, - "8": 8, - "9": 9, - "10": 10, - }, - "punto central del climatismo": { - "1": "Parada de la circulación de vuelco meridional del Atlántico", - "2": "Desintegración de la capa de hielo de la Antártida occidental", - "3": "Muerte de la selva amazónica", - "4": "Cambio de monzón en África occidental", - "5": "Permafrost e hidratos de metano", - "6": "Muerte de los arrecifes de coral", - "7": "Cambio de monzón de la India", - "8": "Desintegración de la capa de hielo de Groenlandia", - "9": "Desplazamiento del bosque boreal", - "10": "Reducción del hielo marino del Ártico ", - }, - }, - index=[str(i) for i in range(1, 11)], - ).rename_axis("rownames") - - rdfs = read_rdata(filename, engine="rscript", encoding="iso-8859-1", mode="csv") - - output = rdfs["climate_df"] if rtype == "rda" else rdfs - - tm.assert_frame_equal(output, expected) - - -# RDA WRITER - -# PATH_OR_BUFFER - - -@pytest.mark.slow -def test_write_read_file(datapath, rtype, mode): - with tm.ensure_clean("test.out") as path: - ghg_df.to_rdata( - path, file_format=rtype, engine="rscript", mode=mode, index=False - ) - 
r_dfs = read_rdata( - path, file_format=rtype, engine="rscript", mode=mode, rownames=False - ) - - expected = ghg_df.reset_index(drop=True) - output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - output["year"] = output["year"].astype("int64") - - tm.assert_frame_equal(output, expected) - - -@pytest.mark.slow -def test_write_read_bytes_io(datapath, rtype, mode): - with BytesIO() as b_io: - sea_ice_df.to_rdata( - b_io, file_format=rtype, engine="rscript", mode=mode, index=False - ) - r_dfs = read_rdata( - b_io.getvalue(), # type: ignore[arg-type] - file_format=rtype, - engine="rscript", - mode=mode, - rownames=False, - ) - - expected = sea_ice_df.reset_index(drop=True) - output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - output["year"] = output["year"].astype("int64") - output["mo"] = output["mo"].astype("int64") - - tm.assert_frame_equal(output, expected) - - -# FILE_FORMAT - - -def test_write_rda_file(rtype): - expected = """\ -$pandas_dataframe - rownames year mo data.type region extent area -1 1012 2016 12 Goddard S 8.28 5.51 -2 1013 2017 12 Goddard S 9.48 6.23 -3 1014 2018 12 Goddard S 9.19 5.59 -4 1015 2019 12 Goddard S 9.41 6.59 -5 1016 2020 12 NRTSI-G S 10.44 6.50 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - sea_ice_df.to_rdata(out_file, file_format=rtype, engine="rscript") - - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -def test_write_wrong_format(): - with tm.ensure_clean("test.rda") as path: - with pytest.raises(ValueError, match=("not a valid value for file_format")): - ghg_df.to_rdata(path, engine="rscript", file_format="csv") - - -def test_write_unable_to_infer(): - with tm.ensure_clean("test") as path: - with pytest.raises( - ValueError, match=("Unable to infer file format from file name") - ): - ghg_df.to_rdata(path, engine="rscript") - - -# 
ENGINE - - -@td.skip_if_no("pyreadr") -def test_write_engine_consistency(rtype): - expected = """\ -$pandas_dataframe - rownames plant_group status count -1 16 Pteridophytes Data Deficient 398 -2 17 Pteridophytes Extinct 65 -3 18 Pteridophytes Not Threatened 1294 -4 19 Pteridophytes Possibly Threatened 408 -5 20 Pteridophytes Threatened 1275 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - plants_df.to_rdata(out_file, file_format=rtype, engine="pyreadr") - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - pyr_output = run_rscript(cmds) - - plants_df.to_rdata(out_file, file_format=rtype, engine="rscript") - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - rcomp_output = run_rscript(cmds) - - assert pyr_output == expected - assert pyr_output == rcomp_output - - -def test_write_wrong_engine(): - with tm.ensure_clean("test.rda") as path: - with pytest.raises(ValueError, match=("not a supported engine")): - ghg_df.to_rdata(path, engine="rpy2") - - -# MODE - - -@pytest.mark.slow -def test_write_mode(rtype, mode): - expected = """\ -$pandas_dataframe - rownames gas year emissions -1 141 Carbon dioxide 2018 5424.8815 -2 142 Methane 2018 634.4571 -3 143 Nitrous oxide 2018 434.5286 -4 144 Fluorinated gases 2018 182.7824 -5 145 Total 2018 6676.6496 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - ghg_df.to_rdata(out_file, file_format=rtype, engine="rscript", mode=mode) - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -def test_write_wrong_mode(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises(ValueError, match=("not supported value for mode")): - ghg_df.to_rdata(path, engine="rscript", mode="pickle") - - -# INDEX - - -@pytest.mark.slow -def 
test_write_index_false(rtype, mode): - expected = """\ -$pandas_dataframe - gas year emissions -1 Carbon dioxide 2018 5424.8815 -2 Methane 2018 634.4571 -3 Nitrous oxide 2018 434.5286 -4 Fluorinated gases 2018 182.7824 -5 Total 2018 6676.6496 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - ghg_df.to_rdata( - out_file, file_format=rtype, index=False, engine="rscript", mode=mode - ) - - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -# ASCII - - -@pytest.mark.slow -def test_write_ascii_output(rtype, mode, ascii): - expected = """\ -$pandas_dataframe - rownames gas year emissions -1 141 Carbon dioxide 2018 5424.8815 -2 142 Methane 2018 634.4571 -3 143 Nitrous oxide 2018 434.5286 -4 144 Fluorinated gases 2018 182.7824 -5 145 Total 2018 6676.6496 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - ghg_df.to_rdata( - out_file, file_format=rtype, engine="rscript", mode=mode, ascii=ascii - ) - - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -@td.skip_if_windows -def test_write_read_ascii(rtype): - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - - ghg_df.to_rdata( - out_file, - file_format=rtype, - engine="rscript", - index=False, - ascii=True, - compress=False, - ) - - with open(out_file, newline="") as f: - r_dfs = read_rdata(f, file_format=rtype, engine="rscript", rownames=False) - - expected = ghg_df.reset_index(drop=True) - output = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] - output["year"] = output["year"].astype("int64") - - tm.assert_frame_equal(output, expected) - - -# COMPRESS - - -@pytest.mark.slow -def test_write_compress_types(rtype, mode, comp): - expected = """\ 
-$pandas_dataframe - rownames year mo data.type region extent area -1 1012 2016 12 Goddard S 8.28 5.51 -2 1013 2017 12 Goddard S 9.48 6.23 -3 1014 2018 12 Goddard S 9.19 5.59 -4 1015 2019 12 Goddard S 9.41 6.59 -5 1016 2020 12 NRTSI-G S 10.44 6.50 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.out") - r_code = os.path.join(tmp_dir, "r_test.R") - - sea_ice_df.to_rdata( - out_file, file_format=rtype, engine="rscript", mode=mode, compress=comp - ) - - cmds = ["Rscript", r_code, out_file, rtype, "pandas_dataframe"] - output = run_rscript(cmds) - - assert output == expected - - -def test_write_wrong_comp(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises(ValueError, match=("not a supported value for compress")): - ghg_df.to_rdata(path, engine="rscript", compress="zip") - - -def test_write_none_comp(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises(RScriptError, match=("invalid 'compress' argument")): - ghg_df.to_rdata(path, engine="rscript", compress=None) - - -# OTHER_FRAMES - - -@pytest.mark.slow -def test_write_other_frames(mode): - expected = """\ -$ghg_df - rownames gas year emissions -1 141 Carbon dioxide 2018 5424.8815 -2 142 Methane 2018 634.4571 -3 143 Nitrous oxide 2018 434.5286 -4 144 Fluorinated gases 2018 182.7824 -5 145 Total 2018 6676.6496 - -$plants_df - rownames plant_group status count -1 16 Pteridophytes Data Deficient 398 -2 17 Pteridophytes Extinct 65 -3 18 Pteridophytes Not Threatened 1294 -4 19 Pteridophytes Possibly Threatened 408 -5 20 Pteridophytes Threatened 1275 - -$sea_ice_df - rownames year mo data.type region extent area -1 1012 2016 12 Goddard S 8.28 5.51 -2 1013 2017 12 Goddard S 9.48 6.23 -3 1014 2018 12 Goddard S 9.19 5.59 -4 1015 2019 12 Goddard S 9.41 6.59 -5 1016 2020 12 NRTSI-G S 10.44 6.50 - -""" - with tm.ensure_clean_dir() as tmp_dir: - out_file = os.path.join(tmp_dir, "rdata.rda") - r_code = os.path.join(tmp_dir, "r_test.R") - - ghg_df.to_rdata( - 
out_file, - engine="rscript", - other_frames=[plants_df, sea_ice_df], - rda_names=["ghg_df", "plants_df", "sea_ice_df"], - mode=mode, - ) - - cmds = ["Rscript", r_code, out_file, "rda", ""] - output = run_rscript(cmds) - - assert output == expected - - -def test_write_other_frames_wrong_type(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises( - TypeError, match=("objects in other_frames is not a DataFrame") - ): - ghg_df.to_rdata( - path, engine="rscript", other_frames=plants_df, rda_names=["plants_df"] - ) - - -def test_write_read_other_frames(datapath): - with tm.ensure_clean("test.rda") as path: - ghg_df.to_rdata( - path, - engine="rscript", - other_frames=[plants_df, sea_ice_df], - rda_names=["ghg_df", "plants_df", "sea_ice_df"], - ) - r_dfs = read_rdata(path, engine="rscript") - - assert list(r_dfs.keys()) == ["plants_df", "sea_ice_df", "ghg_df"] - - -# RDA NAMES - - -def test_write_mismatched_names_frames(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises( - ValueError, - match=("does not match number of current DataFrame and other_frames"), - ): - ghg_df.to_rdata( - path, - engine="rscript", - other_frames=[plants_df, sea_ice_df], - rda_names=["plants_df", "sea_ice_df"], - ) diff --git a/pandas/tests/io/rdata/test_pyreadr.py b/pandas/tests/io/test_rdata.py similarity index 73% rename from pandas/tests/io/rdata/test_pyreadr.py rename to pandas/tests/io/test_rdata.py index fbcc9b06523fc..129764e69596a 100644 --- a/pandas/tests/io/rdata/test_pyreadr.py +++ b/pandas/tests/io/test_rdata.py @@ -77,13 +77,8 @@ def rtype(request): return request.param -@pytest.fixture(params=[None, False, "gzip"]) -def ok_comp(request): - return request.param - - -@pytest.fixture(params=[True, "bzip2", "xz"]) -def bad_comp(request): +@pytest.fixture(params=[None, "gzip", "bz2", "xz"]) +def comp(request): return request.param @@ -115,7 +110,7 @@ def adj_int(df): def test_read_rds_file(datapath): filename = datapath("io", "data", "rdata", 
"ghg_df.rds") - r_df = read_rdata(filename, engine="pyreadr") + r_df = read_rdata(filename) output = adj_int(r_df).tail() tm.assert_frame_equal(ghg_df, output) @@ -123,7 +118,7 @@ def test_read_rds_file(datapath): def test_read_rda_file(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="pyreadr") + r_dfs = read_rdata(filename) r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} @@ -138,7 +133,7 @@ def test_bytes_read_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") with open(filename, "rb") as f: - r_df = read_rdata(f, file_format="rds", engine="pyreadr") + r_df = read_rdata(f, file_format="rds") output = adj_int(r_df).tail() @@ -149,7 +144,7 @@ def test_bytes_read_rda(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") with open(filename, "rb") as f: - r_dfs = read_rdata(f, file_format="rda", engine="pyreadr") + r_dfs = read_rdata(f, file_format="rda") r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} @@ -165,7 +160,7 @@ def test_bytesio_rds(datapath): with open(filename, "rb") as f: with BytesIO(f.read()) as b_io: - r_df = read_rdata(b_io, file_format="rds", engine="pyreadr") + r_df = read_rdata(b_io, file_format="rds") output = adj_int(r_df).tail() @@ -177,7 +172,7 @@ def test_bytesio_rda(datapath): with open(filename, "rb") as f: with BytesIO(f.read()) as b_io: - r_dfs = read_rdata(b_io, file_format="rda", engine="pyreadr") + r_dfs = read_rdata(b_io, file_format="rda") r_dfs = {str(k): adj_int(v) for k, v in r_dfs.items()} @@ -194,13 +189,13 @@ def test_bytesio_rda(datapath): def test_read_wrong_format(datapath): with pytest.raises(ValueError, match="not a valid value for file_format"): filename = datapath("io", "data", "rdata", "plants_df.rds") - read_rdata(filename, engine="pyreadr", file_format="r") + read_rdata(filename, file_format="r") def test_read_wrong_file(): with pytest.raises(FileNotFoundError, match="file cannot be found"): 
filename = os.path.join("data", "rdata", "plants_df.rda") - read_rdata(filename, engine="pyreadr") + read_rdata(filename) def test_read_rds_non_df(datapath): @@ -211,7 +206,7 @@ def test_read_rds_non_df(datapath): match="Invalid file, or file has unsupported features", ): filename = datapath("io", "data", "rdata", "ppm_ts.rds") - read_rdata(filename, engine="pyreadr") + read_rdata(filename) def test_read_rda_non_dfs(datapath): @@ -222,7 +217,7 @@ def test_read_rda_non_dfs(datapath): match="Invalid file, or file has unsupported features", ): filename = datapath("io", "data", "rdata", "env_data_non_dfs.rda") - read_rdata(filename, engine="pyreadr") + read_rdata(filename) def test_read_not_rda_file(datapath): @@ -232,7 +227,7 @@ def test_read_not_rda_file(datapath): custom_errors.LibrdataError, match="The file contains an unrecognized object" ): filename = datapath("io", "data", "rdata", "ppm_df.csv") - read_rdata(filename, file_format="rda", engine="pyreadr") + read_rdata(filename, file_format="rda") def test_bytes_read_infer_rds(datapath): @@ -240,7 +235,7 @@ def test_bytes_read_infer_rds(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f, engine="pyreadr") + read_rdata(f) def test_bytes_read_infer_rda(datapath): @@ -248,7 +243,7 @@ def test_bytes_read_infer_rda(datapath): with pytest.raises(ValueError, match="Unable to infer file format from file name"): with open(filename, "rb") as f: - read_rdata(f, engine="pyreadr") + read_rdata(f) # URL @@ -272,7 +267,7 @@ def test_read_rda_url(): url = ( "https://github.com/hadley/nycflights13/blob/master/data/airlines.rda?raw=true" ) - r_dfs = read_rdata(url, file_format="rda", engine="pyreadr") + r_dfs = read_rdata(url, file_format="rda") tm.assert_frame_equal(url_df, r_dfs["airlines"].head()) @@ -284,14 +279,14 @@ def test_read_unable_infer_format(): "https://github.com/hadley/nycflights13/" "blob/master/data/airlines.rda?raw=true" ) 
- read_rdata(url, engine="pyreadr") + read_rdata(url) @tm.network def test_read_wrong_url(): with pytest.raises(HTTPError, match="HTTP Error 404: Not Found"): url = "https://example.com/data.rdata" - read_rdata(url, engine="pyreadr") + read_rdata(url) # S3 @@ -317,7 +312,7 @@ def test_read_rda_s3(): "Proline": {1: 1050, 2: 1185, 3: 1480, 4: 735, 5: 1450}, } ).rename_axis("rownames") - r_dfs = read_rdata(s3, engine="pyreadr") + r_dfs = read_rdata(s3) r_dfs["wine"] = adj_int(r_dfs["wine"]) # pyreadr remove dots in colnames @@ -331,38 +326,25 @@ def test_read_rda_s3(): def test_read_rds_df_output(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="pyreadr") + r_df = read_rdata(filename) assert isinstance(r_df, DataFrame) def test_read_rda_dict_output(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="pyreadr") + r_dfs = read_rdata(filename) assert isinstance(r_dfs, dict) assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] -def test_read_wrong_engine(datapath): - with pytest.raises(ValueError, match="not a supported engine"): - filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - read_rdata(filename, engine="rpy2") - - -# MODE - -# IGNORED OPTION FOR pyreadr ENGINE - - -# USE_OBJECTS +# SELECT_FRAMES def test_read_select_frames_rda_dfs(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata( - filename, engine="pyreadr", select_frames=["ghg_df", "sea_ice_df"] - ) + r_dfs = read_rdata(filename, select_frames=["ghg_df", "sea_ice_df"]) assert "plants_df" not in list(r_dfs.keys()) assert "ghg_df" in list(r_dfs.keys()) @@ -372,11 +354,7 @@ def test_read_select_frames_rda_dfs(datapath): def test_read_wrong_select_frames(datapath): with pytest.raises(TypeError, match="not a valid type for select_frames"): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - read_rdata( - filename, - 
engine="pyreadr", - select_frames="plants_df", # type: ignore[arg-type] - ) + read_rdata(filename, select_frames="plants_df") # ROWNAMES @@ -384,7 +362,7 @@ def test_read_wrong_select_frames(datapath): def test_read_rownames_true_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="pyreadr", rownames=True) + r_df = read_rdata(filename, rownames=True) if isinstance(r_df, DataFrame): assert r_df.index.name == "rownames" @@ -392,7 +370,7 @@ def test_read_rownames_true_rds(datapath): def test_read_rownames_false_rds(datapath): filename = datapath("io", "data", "rdata", "sea_ice_df.rds") - r_df = read_rdata(filename, engine="pyreadr", rownames=False) + r_df = read_rdata(filename, rownames=False) if isinstance(r_df, DataFrame): assert r_df.index.name != "rownames" @@ -400,7 +378,7 @@ def test_read_rownames_false_rds(datapath): def test_read_rownames_true_rda(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="pyreadr", rownames=True) + r_dfs = read_rdata(filename, rownames=True) assert r_dfs["ghg_df"].index.name == "rownames" assert r_dfs["plants_df"].index.name == "rownames" @@ -409,7 +387,7 @@ def test_read_rownames_true_rda(datapath): def test_read_rownames_false_rda(datapath): filename = datapath("io", "data", "rdata", "env_data_dfs.rda") - r_dfs = read_rdata(filename, engine="pyreadr", rownames=False) + r_dfs = read_rdata(filename, rownames=False) assert r_dfs["ghg_df"].index.name != "rownames" assert r_dfs["plants_df"].index.name != "rownames" @@ -422,7 +400,7 @@ def test_read_rownames_false_rda(datapath): def test_non_utf8_data(datapath, rtype): filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") with pytest.raises(UnicodeDecodeError, match=("'utf-8' codec can't decode byte")): - read_rdata(filename, engine="pyreadr") + read_rdata(filename) # RDA WRITER @@ -432,8 +410,8 @@ def test_non_utf8_data(datapath, rtype): def 
test_write_read_file(rtype): with tm.ensure_clean("test.out") as path: - ghg_df.to_rdata(path, file_format=rtype, engine="pyreadr", index=False) - r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr", rownames=False) + ghg_df.to_rdata(path, file_format=rtype, index=False) + r_dfs = read_rdata(path, file_format=rtype, rownames=False) expected = ghg_df.reset_index(drop=True) output = ( @@ -448,10 +426,8 @@ def test_write_read_pathlib(rtype): with tm.ensure_clean_dir() as tmp_dir: tmp_file = Path(tmp_dir).joinpath("test.out") - sea_ice_df.to_rdata(tmp_file, file_format=rtype, engine="pyreadr", index=False) - r_dfs = read_rdata( - tmp_file, file_format=rtype, engine="pyreadr", rownames=False - ) + sea_ice_df.to_rdata(tmp_file, file_format=rtype, index=False) + r_dfs = read_rdata(tmp_file, file_format=rtype, rownames=False) expected = sea_ice_df.reset_index(drop=True) output = ( @@ -463,11 +439,10 @@ def test_write_read_pathlib(rtype): def test_write_read_filelike(rtype): with BytesIO() as b_io: - sea_ice_df.to_rdata(b_io, file_format=rtype, engine="pyreadr", index=False) + sea_ice_df.to_rdata(b_io, file_format=rtype, index=False) r_dfs = read_rdata( - b_io.getvalue(), # type: ignore[arg-type] + b_io.getvalue(), file_format=rtype, - engine="pyreadr", rownames=False, ) @@ -485,7 +460,7 @@ def test_write_read_filelike(rtype): def test_write_wrong_format(): with tm.ensure_clean("test.rda") as path: with pytest.raises(ValueError, match=("not a valid value for file_format")): - ghg_df.to_rdata(path, engine="pyreadr", file_format="csv") + ghg_df.to_rdata(path, file_format="csv") def test_write_unable_to_infer(): @@ -493,21 +468,7 @@ def test_write_unable_to_infer(): with pytest.raises( ValueError, match=("Unable to infer file format from file name") ): - ghg_df.to_rdata(path, engine="pyreadr") - - -# ENGINE - - -def test_write_wrong_engine(): - with tm.ensure_clean("test.rda") as path: - with pytest.raises(ValueError, match=("not a supported engine")): - 
ghg_df.to_rdata(path, engine="rpy2") - - -# MODE - -# IGNORED OPTION FOR pyreadr ENGINE + ghg_df.to_rdata(path) # INDEX @@ -515,10 +476,8 @@ def test_write_wrong_engine(): def test_index_true(rtype): with tm.ensure_clean("test.out") as path: - plants_df.rename_axis(None).to_rdata( - path, file_format=rtype, engine="pyreadr", index=True - ) - r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr") + plants_df.rename_axis(None).to_rdata(path, file_format=rtype, index=True) + r_dfs = read_rdata(path, file_format=rtype) r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] @@ -528,10 +487,8 @@ def test_index_true(rtype): def test_index_false(rtype): with tm.ensure_clean("test.out") as path: - plants_df.rename_axis(None).to_rdata( - path, file_format=rtype, engine="pyreadr", index=False - ) - r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr") + plants_df.rename_axis(None).to_rdata(path, file_format=rtype, index=False) + r_dfs = read_rdata(path, file_format=rtype) r_df = r_dfs if rtype == "rds" else r_dfs["pandas_dataframe"] @@ -539,20 +496,13 @@ def test_index_false(rtype): assert "index" not in r_df.columns -# ASCII - -# IGNORED OPTION FOR pyreadr ENGINE - - # COMPRESS -def test_compress_ok_comp(rtype, ok_comp): +def test_compress_ok_comp(rtype, comp): with tm.ensure_clean("test.out") as path: - ghg_df.to_rdata( - path, file_format=rtype, engine="pyreadr", compress=ok_comp, index=False - ) - r_dfs = read_rdata(path, file_format=rtype, engine="pyreadr", rownames=False) + ghg_df.to_rdata(path, file_format=rtype, compression=comp, index=False) + r_dfs = read_rdata(path, file_format=rtype, rownames=False) expected = ghg_df.reset_index(drop=True) output = ( @@ -562,34 +512,10 @@ def test_compress_ok_comp(rtype, ok_comp): tm.assert_frame_equal(output, expected) -def test_compress_bad_comp(rtype, bad_comp): - from pyreadr import custom_errors - - with tm.ensure_clean("test.out") as path: - with pytest.raises( - custom_errors.PyreadrError, - 
match=(f"compression {bad_comp} not implemented!"), - ): - ghg_df.to_rdata( - path, - file_format=rtype, - engine="pyreadr", - index=False, - compress=bad_comp, - ) - - def test_compress_zip(rtype): with tm.ensure_clean("test.out") as path: - with pytest.raises(ValueError, match=("not a supported value for compress")): - ghg_df.to_rdata( - path, file_format=rtype, engine="pyreadr", index=False, compress="zip" - ) - - -# OTHER_FRAMES - -# IGNORED OPTION FOR pyreadr ENGINE + with pytest.raises(ValueError, match=("not a supported value for compression")): + ghg_df.to_rdata(path, file_format=rtype, index=False, compression="zip") # RDA_NAMES @@ -597,13 +523,7 @@ def test_compress_zip(rtype): def test_new_rda_name(): with tm.ensure_clean("test.rda") as path: - ghg_df.to_rdata(path, engine="pyreadr", rda_names=["py_df"]) - r_dfs = read_rdata(path, engine="pyreadr") + ghg_df.to_rdata(path, rda_name="py_df") + r_dfs = read_rdata(path) assert "py_df" in list(r_dfs.keys()) - - -def test_type_rda_name(): - with tm.ensure_clean("test.rds") as path: - with pytest.raises(TypeError, match=("not a valid type for rda_names")): - ghg_df.to_rdata(path, engine="rscript", rda_names="py)df") From a5983e006022a4afa17b02f3c604138689c688cc Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Wed, 14 Apr 2021 07:08:05 -0500 Subject: [PATCH 07/35] Fix duplicate entry in ci dep yaml --- ci/deps/azure-windows-37.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 2dde2e2892f99..6e7be62cdc56f 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -38,7 +38,6 @@ dependencies: - xlwt - pyreadstat - pyreadr - - pyreadr - pip - pip: - pyxlsb From e78bf6ed22dfa5804b2ee9fc85b620ffc97103ba Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Thu, 15 Apr 2021 22:29:00 -0500 Subject: [PATCH 08/35] Refactor to handle binary content, add datetime notes in docs --- doc/source/user_guide/io.rst | 48 
+++++++++++--------------- pandas/io/rdata.py | 48 ++++++++++++++++---------- pandas/tests/io/data/rdata/ppm_df.rds | Bin 0 -> 14315 bytes pandas/tests/io/test_rdata.py | 4 +-- 4 files changed, 52 insertions(+), 48 deletions(-) create mode 100644 pandas/tests/io/data/rdata/ppm_df.rds diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 0023f70e699bd..f4bbde8efcd92 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -6012,11 +6012,11 @@ To read from a file-like object, read object in argument, ``path_or_buffer``: .. ipython:: python - rds_file = os.path.join(file_path, "sea_ice_df.rds") + rds_file = os.path.join(file_path, "plants_df.rds") with open(rds_file, "rb") as f: - sea_ice_df = pd.read_rdata(f.read(), file_format="rds") + plants_df = pd.read_rdata(f.read(), file_format="rds") - sea_ice_df + plants_df To read from URL, pass link directly into method: @@ -6035,8 +6035,7 @@ another issue. Any R data encoded in non utf-8 is currently not supported: In [608]: ghcran = pd.read_rdata("s3://public-r-data/ghcran.Rdata") ... - UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 45: -invalid continuation byte + UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 45: invalid continuation byte Also, remember if R data files do not contain any data frame object, a parsing error will occur: @@ -6050,6 +6049,17 @@ will occur: .. _io.rdata_writer: +Please note R's ``Date`` (without time component) will translate to ``object`` type +in pandas. Also, R's date/time field type, ``POSIXct``, will translate to UTC time +in pandas. + +.. ipython:: python + + ppm_df = pd.read_rdata(os.path.join(file_path, "ppm_df.rds")) + ppm_df.head() + ppm_df.tail() + ppm_df.dtypes + Writing R data '''''''''''''' @@ -6069,7 +6079,7 @@ and optionally give it a name: .. 
ipython:: python - plants_df.to_rdata("plants_df.rda", rda_name="plants_df") + ghg_df.to_rdata("ghg_df.rda", rda_name="ghg_df") While RData and rda types can hold multiple R objects, this method currently only supports writing out a single DataFrame. @@ -6079,7 +6089,7 @@ Even write to a buffer and read its content: .. ipython:: python with BytesIO() as b_io: - sea_ice_df.to_rdata(b_io, file_format="rda", index=False) + env_dfs["sea_ice_df"].to_rdata(b_io, file_format="rda", index=False) print( pd.read_rdata( b_io.getvalue(), @@ -6134,8 +6144,8 @@ Like other IO methods, ``storage_options`` are enabled to write to those platfor :suppress: os.remove("ghg_df.rds") + os.remove("ghg_df.rda") os.remove("plants_df.rds") - os.remove("plants_df.rda") os.remove("plants_df_gz.rds") os.remove("plants_df_bz2.rds") os.remove("plants_df_xz.rds") @@ -6147,18 +6157,7 @@ loaded in R: .. code-block:: r plants_df <- readRDS("plants_df.rds") - tail(plants_df, 5) - plant_group status count - 16 Pteridophytes Data Deficient 398 - 17 Pteridophytes Extinct 65 - 18 Pteridophytes Not Threatened 1294 - 19 Pteridophytes Possibly Threatened 408 - 20 Pteridophytes Threatened 1275 - - - load("env_dfs.rda") - eapply(.GlobalEnv, tail, 5) - $plants_df + plants_df plant_group status count 16 Pteridophytes Data Deficient 398 17 Pteridophytes Extinct 65 @@ -6166,14 +6165,9 @@ loaded in R: 19 Pteridophytes Possibly Threatened 408 20 Pteridophytes Threatened 1275 - $sea_ice_df - year mo data.type region extent area - 1012 2016 12 Goddard S 8.28 5.51 - 1013 2017 12 Goddard S 9.48 6.23 - 1014 2018 12 Goddard S 9.19 5.59 - 1015 2019 12 Goddard S 9.41 6.59 - 1016 2020 12 NRTSI-G S 10.44 6.50 + load("ghg_df.rda") + mget(list=ls()) $ghg_df gas year emissions 141 Carbon dioxide 2018 5424.8815 diff --git a/pandas/io/rdata.py b/pandas/io/rdata.py index 91852f5bd281a..4114b6d1f8349 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata.py @@ -1,4 +1,3 @@ -from datetime import datetime import io import os from 
tempfile import TemporaryDirectory @@ -52,7 +51,7 @@ def read_rdata( Any valid file path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. - file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer' R serialization type as output from R's base::save or base::saveRDS commands. Default 'infer' will use extension in file name to to determine the format type. @@ -269,7 +268,7 @@ class _RDataReader: Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. - file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer' R serialization type. select_frames : list, default None @@ -318,7 +317,13 @@ def verify_params(self) -> None: and raise appropriate errors. """ - if self.file_format not in ["infer", "rda", "rdata", "rds"]: + path_ext: Optional[str] = ( + os.path.splitext(self.path_or_buffer.lower())[1][1:] + if isinstance(self.path_or_buffer, str) + else None + ) + + if self.file_format not in ["infer", "rdata", "rda", "rds"]: raise ValueError( f"'{self.file_format}' is not a valid value for file_format" ) @@ -326,15 +331,15 @@ def verify_params(self) -> None: if ( self.file_format == "infer" and isinstance(self.path_or_buffer, str) - and not self.path_or_buffer.lower().endswith((".rda", ".rdata", ".rds")) + and path_ext not in ["rdata", "rda", "rds"] ) or (self.file_format == "infer" and not isinstance(self.path_or_buffer, str)): raise ValueError( f"Unable to infer file format from file name: {self.path_or_buffer}. " - "Please use known R data type (.rda, .rdata, .rds)." + "Please use known R data type (rdata, rda, rds)." 
) - if self.file_format == "infer": - self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:] + if self.file_format == "infer" and isinstance(path_ext, str): + self.file_format = path_ext if self.select_frames is not None and not is_list_like(self.select_frames): raise TypeError( @@ -360,9 +365,9 @@ def buffer_to_disk(self, tmp_dir: str) -> str: ) with _preprocess_data(handle_data) as r_data: - mode = "wb" if isinstance(r_data, io.BytesIO) else "w" - with open(r_temp, mode) as f: - f.write(r_data.read()) + if isinstance(r_data, io.BytesIO): + with open(r_temp, "wb") as f: + f.write(r_data.read()) return r_temp @@ -412,10 +417,9 @@ def handle_rownames(self, df) -> DataFrame: def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: from pyreadr import read_r - tz = datetime.now().astimezone().tzinfo with TemporaryDirectory() as tmp_dir: r_temp = self.buffer_to_disk(tmp_dir) - rdata = read_r(r_temp, use_objects=self.select_frames, timezone=tz) + rdata = read_r(r_temp, use_objects=self.select_frames) rdata = {k: self.handle_rownames(df) for k, df in rdata.items()} rdata = rdata[None] if self.file_format == "rds" else dict(rdata) @@ -432,7 +436,7 @@ class RDataWriter: path_or_buffer : a valid str, path object or file-like object Any valid string path is acceptable. - file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer' + file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer' R serialization type. rda_name : str, default "pandas_dataframe" @@ -487,7 +491,13 @@ def verify_params(self) -> None: and raise appropriate errors. """ - if self.file_format not in ["infer", "rda", "rdata", "rds"]: + path_ext: Optional[str] = ( + os.path.splitext(self.path_or_buffer.lower())[1][1:] + if isinstance(self.path_or_buffer, str) + else None + ) + + if self.file_format not in ["infer", "rdata", "rda", "rds"]: raise ValueError( f"{self.file_format} is not a valid value for file_format." 
) @@ -495,15 +505,15 @@ def verify_params(self) -> None: if ( self.file_format == "infer" and isinstance(self.path_or_buffer, str) - and not self.path_or_buffer.lower().endswith((".rda", ".rdata", ".rds")) + and path_ext not in ["rdata", "rda", "rds"] ): raise ValueError( f"Unable to infer file format from file name: {self.path_or_buffer}" - "Please use known R data type (.rda, .rdata, .rds)." + "Please use known R data type (rdata, rda, rds)." ) - if self.file_format == "infer" and isinstance(self.path_or_buffer, str): - self.file_format = os.path.splitext(self.path_or_buffer.lower())[1][1:] + if self.file_format == "infer" and isinstance(path_ext, str): + self.file_format = path_ext if self.compression is not None and self.compression not in [ "gzip", diff --git a/pandas/tests/io/data/rdata/ppm_df.rds b/pandas/tests/io/data/rdata/ppm_df.rds new file mode 100644 index 0000000000000000000000000000000000000000..242a3e2b112367d0b7adfaa29679057819489cfb GIT binary patch literal 14315 zcmdsec~lcy(5Hxi$|i0oTNG3tBJf<<$x}fTggik-w#4Nr2to`90TL1c6%iq#D9Vzk zsCkGG5!pimE})=9WC?2$)&La)t1_SrQiFvcZF z*CyN7#@ENj*Vm@g$EMTQ#@5G%@HpV?lYq0!SP;;kAJG3Kp#O2eZyy_uug&c1{QChu zS8OgHwb@yo|1jKV@SM-!X`k;luP2`bEc)0;U+1R>bW8{MT(EilZ+<|S&tQO$jm_(8 zcCWA5zP{%08V&W4zRdqnn*X6X|3i8H;$@qu@_>$ofDU{>2P2?^63{_^oewXbk?vt- zZ9zTN{f6*2h_jsX?8q+O9cRLBxMxXJhpL#1>L=4DNV0 zuN!@!-hEFBy5U%p{ig6gxvw9S-uxIENSc0<9hzgX+03WQ2T}8W&uV5Bsp0Ct zZ7C-aw5&&-#zaY3;lq8l?DSIjo~T_Z^A4_Eb{oS->aPqH<_9+n(7s~q#lrW5Or_gA z`w|!=mkv0zU^K+K@2L&5a2;*G&F-;-^_`h_?e~zmL*)G6+kSZW>m4h*FX7L9h|icA z&mU$;w_QHltyM7xZalkG3bt<+*mZ|`G&KL-U{ih12BsMB!%h8I=O?uTs9Nee7IuxQ z;6DKVRb2s4H>vB!)Q1$zJL=&os=0i)O6@ELgRC!YS{EMIY1`aE{2?3v1OXKbvm^Nq zEy$a^6-*7v2DCNc&ktucwII`$E$svfSgzUm-AXEASLHC6fmhOcDFTMci5R)AMv)+x3DhxzcX!7q-%L z;ChTL z-xdcA>MBJ5-?!C43vR$LBRd0^Sp{h@ZN^HnR+29N3CpfEG#zGuaR#PXK{sI<7%kv2 zD<~bd2D1q`!wO2PTM5@RBeR`Ke~maMcu3j;d5F@np6Yw!K7$@KZ~9L zGsL(jneq>@Vp;OkE5JNM%XVn1&cIpky_-pwc8OdHM`f zvSSj6@4-rK0jlkWPAh07cC}(V 
zKeH92jopIY2K>cOW8rRsSE^P6+xUlB87(>)hT0%Kk5%Z^z&id5*3s6?bVE&$wuc_t z3AoN5V_CI$-^8r|t?^io-jTGikCW|g*jXW%qInjYGUQ+)4MkQPYSV>5aWaEXOW18aL2pv?d$zJN8++MWS6 zP&ok;tPL&JHz6yqD-~<_)-BfQkX3}$svW>179<_4rP>GtvL3Zqr$JU?O%zx8#8!d^ zR$H-^|DE-y)j9*LtJ(upvuJ5x9hDidk#EBaY$2vYvTjv0L^SnYRE0Y}ZTO%%n#q{Ef0@9C)Z#;#?-0&0f>H zqefx4tfH;>^~EbyH!^^n{PV}79G&yl&l_O0=oRm=zLkoCg7x!$t{Jb+W=xHPxH(lN zzH2DEX3V_<(%~18E*M$q%GcVIr%3`z9$ z;cInx3*v9JooN)tr}*%nB>}r%t%B{q2p4jUPp+~5%y^?APzR)c{o-t{-u$~uZy|FF z;XGO17myD5wNTCyzOU1hM^^g+f6_CnB%3fB$LX10F73wny%`Dp5}_q_JXN_`o!hv$ePZ1B?fgQfo`l*?y`~8H zk>s$;Blx4l83Fcj!^BN&e9Ia#>`O?Rx#mbc5)rRHJzhl+@0*2h77Tq)!s5(D)s|wD z3#4bJ?SK0k6Msz&kZx0{((zhclImo0Pq_Ms{)pS^65cf8kH_*lqk^Vl*o)XM+OIQV zbc8Ciqh7T`@ost=`EQ=k>V~sy7awm=d65~eIP$q(y-s#^h}t4GcgYaH{hL~5s{SSV z;;keXs0^3AO1Y1~JW0xVJ*okuY9gYSS2$FV5j*sKyrW9*6ZJt+(2Ih*!=X#^gdxIl zX=eSxO_1XS9(4ZxE7*p5uom^oZ-UoAxL30mAaN0SPpC)l52xOTJV2DR)ss(}F-uh@ zOS_gf&sRflO=nPVs*N0RN4B>`?-Mh27=TU@60GwNH#PCu+XU(kq}onau5$MtVZ3p# zDahy|D*R>0Pg2uEfKpd@5hVDhSvU1KcKU*2PTUYLV@c>3-t{Da4rRx-!5WAV2j z5f&EPR9XXe+b#GmOW92Zgjw09%CWYC=-g;9x}w}T_Gr_OCjQ`nIs-A9q0**88s%dh zm9R0Heb)tXH$MSjT40+>hnu7uDXYT0CcY`uIbzzl2YNLuomnyK?Wl-g>fvroY*@&S z&BW@@!h+e@db326^VHX-M1IZ)uOq=>F2@t6Z5T+^oxqf3W9#pkQm^YPVt&+zACv|wv#(Dd?)W`jh;D7>|aZ{4_kQ3O7@BF(k(Fx+&ah~TS za9Wr~O`aC@wqcg2T8fUlUk6N`bLtO$>3VGMpXU!S_0_Tq z$Q@H{7jen!lUu&JK1@&TFZkxQROe22gLvL)N;XN}__M)rU$%G+ASB8UXOOMij7O(F}qRq%r1dlhxA~Fbbm-| z&e8nTZ7DgX^qH8Cjw^mj|CH%*J#a8c0JL-TmU4QD=i|ue4lQG5tBwgG%qFCJWS7U= z8|*uTXY<+5Be880{`1J!iVOgpn*g*=VxA5;}i6vwf}+k+m0mDaHm_CftAakwB#ZMs^#pzi#0q z-j`Zrc<-GKzmN9WGd-PrZI3uJfNI5svc}EIeNv}U_v0_mUm5RIUrE&7g`s3TKuyX@ypFncmP<-B_CEWK4HbfEcxUh z<14eS1@@-IxOGsH;nuE`tU4KqugieiuhA!e(%avSXW&Z$aE_={w>F8Yf@OIDA?fis3QofjJywdjF(_N~7|7cJZ* znJ(STPyyRuUf^avcZ888`lI(zG`{HQJID+C0Ov4vVJ9`-5iw@AsnHlL|6MsvJOFmg z$5cUYy&N#jR&K$#`7~Dnd-1BP_?-ANGi$~Vtw9w-judAe>EXw_hzAUh6qj$1l7j~> zfpmXb<`zdBfv*SmKk}r^`N>FA9>1bHuzFp|u85y5>tw6IPi{KArt;lC<_pdf*2KI5 zao)Gbqwb>PEU5)uZi9b?3F~3U)dvjQ8f=%e5CdLKu)PkF09eg5HCMK@cV2m%dI08y 
zw>M)ol*FHTntJd@d^B(xA5Gf^U*YhXC?Ho#Kla!_ENsIv*r=b?XWH!*`g9Gp)cr@i zH-PTQ3o@jC2LF`s$>tu0So<$sUO<9Pqn49GA9e)mUx}xu|hI7 zY0nfWB5c<+z{QR|$X@xJu4*RMN z#%QM&gTSZ!gazINh~c>N`l4Q*@2oM8Xf;`4#dkS^*&er5 z`pEV>J?y8lf8YW(_dU*=75Yk3>=Ix>@rq@8u+^*O(YKTbf8*ks1)S}R2XXb%En~$N zTYZx8AS?({WzNKsbWY%fOYYWBQ>lTp7*8^ZH(thavEO|Mw*IZtWo|=BZ zEY42F(=3Y%taP)O`(#yxfL*B@dtkNn-fH>9TM}^IwfRT1-1j^$pS#3q+BY3f@!wcS zgz};4p+oM2h`Z>|qV2NpsLF510lJ%m{QRWTgxVNWvV&>#m(b!2&)b=F%=V%B-;=5g zr9%gb@dvxlv*F$(C`oO0nVF6Elir!c4z#~Cj)dl*yKYY&EzG0VZvlr~_0R{01mNPM zms}Pd3W-e`*a7CtdB;L` zQ%2oz5~ojQe^VbDJA7lK@>z?>sl(l`Y4YGp+Akb?Mu+wyOJTu_csZF_N-|4_Jf=w&Vf)ZT>XrNK;w?glLwI?La!X=zZkE9H<-iR^avFx>Lz#_PjE?_y zQ~=#k`#`3E?rI}v+ifS*+*@3}esU}sZ*`HAWyOiylbQ7RPv~&{75vvE%v}NAqZTKJ z^%x}I+``**k$Fk&B-m!SZbEWkuOnfl!aRqn0?TP4+rL9!JvnZkXL3SaiTk2XeRJF| zFU2y!CNO(ZpoVaQ2g=tbZVm{a)O)pyd*=1esPo?&O+@9VsJkU^1Bu#=7LvZYsC@TF zABHj;^Mg)y_Kk(4mSx9q;Xhw_oYgEp0`85qQ~aCrMWB5`aJnglWHJ>!7LmL__xlj& z{=LrS#1v=OSJa)S{Hi~5l#rziIdQJ!73J)U zoa%?;B(;AolH0w&n6N77M&)= z1}J6sLvwbJ&;Cv|r)#IFkFjloAg9P7xoK_y1yh zkG`tq=%`$2eLBsgy{9RW>he`)5g)_NjYSRq0ej}G2j3NMq)j^i8^13(r_66~67g=q z6jc*yb@j(%u)e0c#P{IC`7^5(y!a3~9vx0-{2uRQ-}mriSA{H%DhZmvyLoez4lc=l zS0CIuxLQ%7s~&DnNn75b9D><}y7MJ_mxTWwV^QTXi$f&Nm2jBcrEM-~k@0B$=st%V zW{)IB=ARqWsI=-y<-P{o_or=Vcx9w0>t{VRE3pK+2t7p21-SGRyE$yXj233YdWQ&R zbfqgV2-T$sQd7%@2QJBmhlj2e!D)Rli=%SRD%ERwNoY}+g!1;Mfdv2BFRxRwqY^pT zerqfyZ0Oa6qGr^5e4weM@cLgx4pmJ(O3dyH$U1o{w8$@?IC=}4Jdi*#Ux=w5cBg-I zzRQ?UsaIsl=BV<9fPzEvtL@EiB1hYvk$zoqm2hcdSjpgSf8aJQr-UCD9M(@gQ}veqemtR`UAl?@R4C3wi?Dj5OzdmkgG}i zk?GrFZ>2h26^Tt?l=nMvFPR;LHl?cGhL^O5s8}O(_GAN822|gW-?PI0_rI<3XJd0FYE~%lK zK+IvwR#et)C)l5%LqQ5tQ2}OF_8bAeZ_XvQc}>*iVItkR?8D=4RAs4Zw;okA_9G!G zXyRW)chXFKWb+sDS3^;ZqBe@Ten(w8bwUqZGlDv154%d~=5N17-V`m}?M zKy$MSy2y_XDuHwek9L47`zUSR?LiVF5rlh{yn*w4p|sIK)*on|`NhRsEtV&}#Bfxz zOzQ;74CMp8UDu9?=u4VYy{>ON!Ijf>oBLI>PLonuIah@fX}GGK-YSn$Qgl+4sv$>< zIUJVh!#_#4U8=kP2<+d;jBYYy7D#v&|?nVOVDCEIT*qwVNKxoS7)Mss&E zl}>k1Q_57lB`M-KO>@b2;tA^oj&9=Z-X00QflgpVx#8q(XSf`XNv8Ej^=vO>-kKu( 
z6$`>8jC>@wol$nHX54-!{TnWt*cQT0O%Gzha5A~`&Q{IqF|dh8PVMjQuPSb!5Bh^; zUE7BUyk}S!W^Zm6Hm3<14gN@}%uMn4Re!Z<4tCLkPdwZjSqMGtn5^O3tfXq6crTX5o?xEWg0T!=&KS2v}i= z_9<4+7BD099F=k_oR8ws;+OI&)Rnv+5jwyTRtA<6kp5rGi- z3rsm-2tL~0SJ_2`R~3EBoFOAZgjb?*jrFQ>HM2KMv@1tidtX{}2hpcIiLj%l3+T!a zdU$icu*U^>=#@(Cm%ieTd4B@OcoXP1lzq}ldWb)F2Og42yF3KpW=4rZY1VK1uvn@{ zbqv-tuE3t|jsS;s3 zJg9wnxTIDP1>J~l`OGTt0fQ8n^0*uGl>fvl)r|8=dCAm7T|ZsW8#e^2AW3Rcn@P%E z)gP6@!=xVcL^)=T{9D@4R)+_a0$MyaEWsmMo`Ua`NQqO%8+7`HK<*%5Vu3&i@#w6> z+lzCE*kw7ay8e}Xh>qa@#Hq&!;vTB%-C_=D(K4@nQ3O#2ft7B(DMMI0HV%hv?hy!h zV3p_`EpQ0>0&WorKErXQQ~l+12a&Im2^cDR1QHTKmC&=y6RK4R5iWz^9R=acD)pF2 zk!~NMDyF2*6TCjz1O!$H=!+&eY@xUMizXI*7U(F4sS?_#(B&-k4>hPCRr6@Ulyn4! z>HyV{1biOkKg>O9J1i~5Ty@<_N$})E%cFvZ1?{IMDiRg!wd#812CC|6Sf2wHt6(3+ zQa_+oQLZY)k}v?Fc0SJCxprdohoW~RbK!t@k~`pbd`gJO8DT3gf1jmE$_b5zN>@}b0Jyq<^Tq#hMC(IsN@rb z3ZXT<4jS#TgKq5b({;v4PQQ%}$y9KtSb_v>2n>sc&y1+a>IS%Z9{>C4>fw)ACFs;1p9T8!C@U=$H1x)gChvupsO$t%oA| zLW79{2$qOTI*R3uNc*ctwQU?q@A0?bJe09T{X{fr1`sKg%&-U505zz19 z*|9Ul7AKOGDD4aM7x^RgpO z#DsA*Z12^y9*9t~1E2hL_Ipn;=82dv!CUFqiYr^XWP$}>8*=rBw3B8P>Y#|?+-cqi z4;+>zM8XN`@3Y45<)4g*ut_yZZiz@P(d}Jpycz&t;XZKR{6{{sP zBCPy2rgfGs5GE+l8EWi^>EWfl#0Y^Tarr!ZMTpe7vVrMr{8Hz6R;QUva7FWgh=E?? 
zM`gTCr|2v73I!=mW{q|!W`$I$Kuq$d_N;Z+a|fkmoZfQ>*?Y_?@ z|MbV6-kL)Ijj_E=i&aiH+h{oN7gE%B(0(wx?ym{90{82S@KIajxO9y`F{w*5{*$`- zPw?g&p5VKf^-6me^Je<&ZJ^=(b`z}eJ`deXGX>qg-S_FAKqb&G^>$IY6m$12Kbd9c z4j7i)X7HAX{P7;)ud*4&O#3hR^JfKw^_dVXB4zJHEwFZ}WX5P_b_ViGI>Rf51{nFd z74VU(o~)zo{1U%r;{6VNQ$sTOHY-9s%iho5KJ+mpYzi5gd>6LH*9n!7kZsxv8pL%xk#WO0Q ze|Y}BuuS=pBq^IdG_&__??2VWMh~6cL=s%FXNXu=jMA!l~H$g;mfsO zX0dmVH{26l>GL24p=PSP`TvMgBgY$3b`>U7GL}W6@3IR!f0c9}{Y85*_Px+=23`UU zwfNci{t30Nw$?$0Sh}`ka)2{5#4LukOPczP=m}_cq&agexV^Df$TwYyS^&#p88|7H!QOz=hMpleEJ*3WJv% z`d}nbqgqkOn|ocYezPC^Tk3Jvg$eheYLIp5Xu{6aEl!+EF3jNLSWV?#V0UHwFV~i_ zt1ir`&YPsI2fNWo+YH{3%Ok718Vg$`;rouo{Zw?aT=loc|I%%t$CYF(B?7Qycz!SCXOJ6jx|*QI}cw1=|c*-y;F z@J*@qPE4!LE7dyO*jEcN%8sgyBx zjM7slKwlRVbI|bUOU4J-O*v@C2rFj7pp$-aLfi!UdVgn5LnMXTn8(YAoirn_!CxD9 z#TGe9VjI`~lr$a^xR?1UTcrzKZ`!-}w=g{&R%BWit!<%B7V@%VFT#F|DGEL#SH|0q z7xLasUAOo#=4_qG+d0K|Cz$mM{%0y)9wCC_OOX`9v(T+S7hi!NuF#$^GzOZk=n;t z$Wb0l-VajSK_>VIJZ)OKTWxTFlcf5yGPqs%Ka=ANXZ3Wv$2WFUxfcJ`%>?rQO6T1Q zdHjt(j>X5?-?Y|xH}OBHsPm0e`1i4*@{0II`$FK_6-I2V21Z|Wmp+}l{9Zrae(~=A7?RVlvyau$V&tXy#IIxAhwNSA8+D~O7tL|@{z?Q5-c5tGtxo!0?cx|z=VI5_Y ze<{OugrM2w{FcArdRO4>=oZJD>{A5cnKAq86kb@;Ee^=0o&1N(*FoF2EdI;tmO);2 zmj--p&KLo;kGd@Q>-kAHBbHXj0PjTs*Y#i8H{eIb!&OhN(yh0^64bprLVdDOPA6j| zbAOgMd0ejES|+KeB7W$lXo-7^WWO$OyyjRXhsqzgpzmMyje4IR-WTv0H{9oVF!7Ch z_?uiz+m*cm?foXvqWVg^3JY+D$LK2${N}2ZPoPA{lO+QS8s!ADh~Q$`=_ff+b3KoZ zgOw+(+CxVlE=8T($NLFtsK!*G4SCplf%0|RoXT>qslIX!_vW~wyANB^> zHb-%q8{fa}+YjflmIMdNVW`=Vau^l;Z_hB7eld`_+7%qpmx$>KNqy4{ZW68~Vxz8# zigWGQ=iG%)dop*QQ%=_e`-3I7dR!xdmHj*O63&8MMrYH9-(lp%_xue{)=e>>2fOBw zySr}4C{4wx$od{E^rEP#4wRUZ7)->aUgeVM)317Z3Y?nZxBB88l>M^;o^!L*Jd~%5 zmpi$mKlaR4yi+|NqWMc)KhN&eiD@?MG$A&T5@gD~{vh5p5jpX3F^2N8!jZ?hRt}D8 zpwDJimoO4ZeIQ2zSM%o4ez_CpLOGirJHOPD>hA!(BC?)J_eYvu;H2)81P~!NKBgQl z4Ri@j?Zp2)tCQ)ZY;$j}=yIA_LYgMFi$0m~hB&wB{t1oD^O+EjiyXVK;!9k+k-^V% zUYAK9ftS8-n>ev>XZOT=6?HSsjs9$P6xU8hOGGKJW<-f)J>vFJ(Sx!6E}{dnRs;^O z?wr-MwWHK`oJ2L(_i6b{%Az=S_yg^;w{gL!Y-&9%T;10?x6{ug6#bpc8%ptgx#3;& 
z&iJ=ZC8@n*;K@PG>l&2HXl+JC-O`wyKkeKHH73 zB5B+B(&T~x4BgPg$h#tnxI8_aLI_7!^o&M@q6qFOzr(3Lzr)!jQ%PFFI;dFe*C0xSpf2zXWd*5Bz>q%P1t(hM=N5B?*nm zq1mKn`8A~b!X+a3>2fu8#o50fTB zDO(E6Rp*>DY2Oln5z}|gB3nqjNcUxJiAZkXqWF?}mKgM{hWpC#={3&lI^gG!FPv~i z=|?3te9r7b0Jk&@u*t-l0uNqH5Oy@Uwq(g}!knVM)kD>P&;6$uL8&M}Fm(H&ZD%Cs zu=`zN;*xmf<`#NR=zxKt&ZOp=aw^Me*#Fp-@Ju*$C>)f`ghpsvd{A9JJA8!`aVYuo zQ0KMf1si!Ek)*CQz*W3Y+)qIL=)($89QgEaxj8<3eyOsyw&VxMEfjuqaIBwkXDBF` z>UaP}8;gl{A`km#8qQzj#%u{4?q~k_nexPUl+VDLQ7%5{Np_8KH5HSm=Su!P&%Io7 zHd6UB>cK_9yUheAa_YdCJMJ7R_^3yXsCrj%^ziqT0T)xXIt{HsCG2swo)dYz_sZCU+0}i(h4%sY1Mw%I|T=5{oL;Ww~}$L`GWqA2q!;LR1obEu+#q9IvPWAnw~!2|EfpWE*vQid)JcfCPQ zH>B9kIiVU*@2ExM8;Jld+72H-^k_4!Ox;vd5I>j_3MQuN4*UDl9L+uI@AOPq4<*Jy z$=!YO^Hntk)hYQRM|d8h(Fy$p|V&=@vLpa zEZT5j3qLkgsuX?w&{)}51E#p}vmuzpr`CoxC7+2HmL>b`4#f&9QN3ez=Ez1kXxJY* zX%^}LUpx@)02hb%^T-fKCpY=pOdc>g)_&|9N_shDP$sQm)Nv;r%V+tAXfC57$6g}n0WGzA4CRj$V|ITK>H7QDa5whCA_eKwIVG`@modcW z?IpF)*5`WBr1K;D`%oW2?FUG;21b!%k=ufeN}(aCf{!KarcF{j%GLsLqkvEfIDai9E<4U3_bNDO9c9L7R1 zASUvGgvwE7=3t!HRAY$x-rJM4iP>{{RpNUl%@M;6jsFzX*cw*VDqq%Ghf6~#AH??- z-d<4J&BYhxK)PFqF-=~reO-(J=Pt%8ZWrVJ<6VyNOGwyA-PIa z{qJ+qvI3hl2yEoqA-JhQt%I(DlA56TgUcpu@lnHem5jC_+t@A3H5Z9LD*uRe{`?qj zUX9s|Ja>K~5Z!)wm@tm&s^M;u@D}Fvo7)Dn^^xLTH>$at!IF2?lDIq7lIgHv+;}pq zVbOPA1>tM7`aQyiapN`@fll1!@rxdsvX}KR2!39Ynv|5{}BQI zZ5aV?%dQ27LTT|+%xco>z`8w7*CwhjPbDAn2;H|3&z*uG#kHBm&CMWZl=qv4M94qE zsGVulcg+viqGG^4Z|PN2z9!o52s`~k2Z8*LW>@5aaif{F%@4(}5Y)~RdMIkAI~F09 zaX)gO=v7S5e~m?mcZEgCPc4pQR4O{_LcqR^;>d}BeefDCnwms$yeqP;+SgS4hhtLLWzSWBkI; zb7QzST84&)?AmLaYU&VTZRo_j7H3#>&3yRhW5SIJQ9S2<-g0y>Lc~P1XcQiX(v5OO z*{ZIK_t_)Gi_wjVb%L#&OPs8_Ytops6aBQ9P4k@Y-k3iBA=S0$U!Q90 zD!#9XhKFY7_A_cZZK0^)AxSkimZNB_sjFabjvz7j4dY^tQjk-nJC~I=>>jMYhN>*# z(rOFtg*U;UAg4Ust|^CWC&MC;G0qY2OVM-;;@jY8-{%_a`9fRu(SbpE3`K-Q4Jrjd z{f;7I?1r;D*Qu?Z+mp^xgxqjf@r>!@hvH^E+5WiJQ%5tG3>EHH#%dk}+qykfTSD&* z*YX?vDwAw|p`~qTz8-xyyGJhK$@wggj1*BeTWpqi$L`NR`pY5p>bo9>yI2?{DrcS< zG5Kp*+uiLg7R(CFyru2S9XMWTi_;U&(ZM6pGdKTPTKe^X()=Zs)T)&0ogzLQanBZwoq+!qOP%*Zv 
zitYpQDgYinPW8Pcnj-*O!26 z2Ef}l?kv}`6oBM=)@P}YS$b?Ar@CiRVOi|Ni4A*HxmI|=eQI_#yE!QPi%=2m)xYco z=L2+y%Kwj}%diWtk)syx2>qw@xI!78y9~jKx@`g`{Y&-ZBCl~?NVexaukj5ZJ$I-m zdsU`cfTt>KhswhWKXqRX-LCQ<^UVX`hZD2sjk2iNd+J>i(RqMt2f)i5AM6G+cB>4Q zh2`&$E8IO5j_%BCvrW#s*3-U&jI*fw?`@cl$OBMC>bqaotW)csQYfJc$Mw|gFV$oL zQ>fSkf*DCc4qbNxJv;8^Z?drtfKHw^dbR?pmACv)NA@=W_1LRxU72Ovn{UCEXS{Ab zg(l>gSRVBHr*hVNJc8z}&9n8YTyOLV6_g!j6<6^s!Lq`u()g;?L(7M| zf1jPy&P%j(y!X*~c3a$|Gn}@KR@Uk#|wr%m& zH_tq4FZ}vvI~G5F+i&^E3%@DZ|DStz`OKnSmI}!&HJ;DhphA7ZLIBocQlIjZ(FTk?m@oLkU5p^_44D F{1 Date: Sat, 8 May 2021 13:06:26 -0500 Subject: [PATCH 09/35] ENH: IO support for R data files with C extension --- LICENSES/LIBRDATA_LICENSE | 19 + doc/source/user_guide/io.rst | 86 ++-- doc/source/whatsnew/v1.3.0.rst | 36 +- pandas/core/frame.py | 29 +- pandas/io/rdata/__init__.py | 1 + pandas/io/rdata/_rdata.pxd | 267 +++++++++++ pandas/io/rdata/_rdata.pyx | 413 ++++++++++++++++++ pandas/io/rdata/librdata/CKHashTable.h | 55 +++ pandas/io/rdata/librdata/rdata.h | 257 +++++++++++ pandas/io/rdata/librdata/rdata_bits.h | 21 + pandas/io/rdata/librdata/rdata_internal.h | 89 ++++ pandas/io/rdata/librdata/rdata_io_unistd.h | 26 ++ pandas/io/rdata/librdata/win_iconv.h | 40 ++ pandas/io/{rdata.py => rdata/rdata_reader.py} | 373 +++++----------- pandas/io/rdata/rdata_writer.py | 174 ++++++++ pandas/tests/io/data/rdata/ppm_df.rda | Bin 0 -> 11221 bytes pandas/tests/io/test_rdata.py | 198 +++++---- setup.py | 21 + 18 files changed, 1705 insertions(+), 400 deletions(-) create mode 100644 LICENSES/LIBRDATA_LICENSE create mode 100644 pandas/io/rdata/__init__.py create mode 100644 pandas/io/rdata/_rdata.pxd create mode 100644 pandas/io/rdata/_rdata.pyx create mode 100644 pandas/io/rdata/librdata/CKHashTable.h create mode 100644 pandas/io/rdata/librdata/rdata.h create mode 100644 pandas/io/rdata/librdata/rdata_bits.h create mode 100644 pandas/io/rdata/librdata/rdata_internal.h create mode 100644 pandas/io/rdata/librdata/rdata_io_unistd.h create mode 
100644 pandas/io/rdata/librdata/win_iconv.h rename pandas/io/{rdata.py => rdata/rdata_reader.py} (54%) create mode 100644 pandas/io/rdata/rdata_writer.py create mode 100644 pandas/tests/io/data/rdata/ppm_df.rda diff --git a/LICENSES/LIBRDATA_LICENSE b/LICENSES/LIBRDATA_LICENSE new file mode 100644 index 0000000000000..4f24e6b9127ff --- /dev/null +++ b/LICENSES/LIBRDATA_LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2013-2020 Evan Miller (except where otherwise noted) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f4bbde8efcd92..ea91a568e04a3 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5920,14 +5920,21 @@ Reading R data The top-level function ``read_rdata`` will read the native serialization types in the R language and environment. For .RData and its synonymous shorthand, .rda, that can hold multiple R objects, method will return a ``dict`` of ``DataFrames``. 
-For .rds types that only contains a single R object, method will return a single -``DataFrame``. +For .rds types that only contains a single R object, method will return a ``dict`` +of a single ``DataFrame``. .. note:: Since any R object can be saved in these types, this method will only return data.frame objects or objects coercible to data.frames including matrices, - tibbles, and data.tables and to some extent, arrays. + tibbles, and data.tables. + +For more information of R serialization data types, see docs on `rds`_ +and `rda`_ data formats. + +.. _rds: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/readRDS + +.. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/save For example, consider the following generated data.frames in R using environment data samples from US EPA, UK BGCI, and NOAA pubilc data: @@ -5984,7 +5991,7 @@ With ``read_rdata``, you can read these above .rds or .rda files: .. ipython:: python rds_file = os.path.join(file_path, "ghg_df.rds") - ghg_df = pd.read_rdata(rds_file).tail() + ghg_df = pd.read_rdata(rds_file)["r_dataframe"].tail() ghg_df rda_file = os.path.join(file_path, "env_data_dfs.rda") @@ -5996,7 +6003,7 @@ To ignore the rownames of data.frame, use option ``rownames=False``: .. ipython:: python rds_file = os.path.join(file_path, "plants_df.rds") - plants_df = pd.read_rdata(rds_file, rownames=False).tail() + plants_df = pd.read_rdata(rds_file, rownames=False)["r_dataframe"].tail() plants_df @@ -6014,7 +6021,10 @@ To read from a file-like object, read object in argument, ``path_or_buffer``: rds_file = os.path.join(file_path, "plants_df.rds") with open(rds_file, "rb") as f: - plants_df = pd.read_rdata(f.read(), file_format="rds") + plants_df = pd.read_rdata( + f, + file_format="rds", + )["r_dataframe"] plants_df @@ -6044,21 +6054,34 @@ will occur: In [608]: rds_file = os.path.join(file_path, "env_data_non_dfs.rda") ... 
- LibrdataError: Invalid file, or file has unsupported features + LibrdataParserError: Invalid file, or file has unsupported features .. _io.rdata_writer: -Please note R's ``Date`` (without time component) will translate to ``object`` type -in pandas. Also, R's date/time field type, ``POSIXct``, will translate to UTC time -in pandas. +Finally, please note R's ``Date`` (without time component) will translate to +``datetime64`` in pandas. Also, R's date/time field type, ``POSIXct``, that can +carry varying timezones will translate to UTC time in pandas. For example, in R, +the following data sample from an .rda shows date/time in 'America/Chicago' local +timezone: + +.. code-block:: r + + load("ppm_df.rda") + tail(ppm_df, 5) + date decimal_date monthly_average deseasonalized num_days std_dev_of_days unc_of_mon_mean + 612 2020-12-16 17:42:25 2020.958 414.25 414.98 30 0.47 0.17 + 613 2021-01-16 05:17:31 2021.042 415.52 415.26 29 0.44 0.16 + 614 2021-02-15 15:00:00 2021.125 416.75 415.93 28 1.02 0.37 + 615 2021-03-18 01:42:28 2021.208 417.64 416.18 28 0.86 0.31 + 616 2021-04-17 12:17:31 2021.292 419.05 416.23 24 1.12 0.44 + +In pandas, conversion shows adjustment in hours to UTC: .. ipython:: python - ppm_df = pd.read_rdata(os.path.join(file_path, "ppm_df.rds")) - ppm_df.head() - ppm_df.tail() - ppm_df.dtypes + r_dfs = pd.read_rdata(os.path.join(file_path, "ppm_df.rda")) + r_dfs["ppm_df"].tail() Writing R data '''''''''''''' @@ -6066,7 +6089,7 @@ Writing R data .. versionadded:: 1.3.0 The method :func:`~pandas.core.frame.DataFrame.to_rdata` will write a DataFrame -or multiple DataFrames into R data files (.RData, .rda, and .rds). +into R data files (.RData, .rda, and .rds). For a single DataFrame in rds type, pass in a file or buffer in method: @@ -6084,17 +6107,24 @@ and optionally give it a name: While RData and rda types can hold multiple R objects, this method currently only supports writing out a single DataFrame. 
-Even write to a buffer and read its content: +Even write to a buffer and read its content (and be sure to adjust default +``gzip`` compression to ``compression=None``): .. ipython:: python with BytesIO() as b_io: - env_dfs["sea_ice_df"].to_rdata(b_io, file_format="rda", index=False) + env_dfs["sea_ice_df"].to_rdata( + b_io, + file_format="rda", + index=False, + compression=None, + ) print( pd.read_rdata( b_io.getvalue(), file_format="rda", rownames=False, + compression=None, )["pandas_dataframe"].tail() ) @@ -6105,7 +6135,7 @@ will output as a named column or multiple columns for MultiIndex. ghg_df.rename_axis(None).to_rdata("ghg_df.rds") - pd.read_rdata("ghg_df.rds").tail() + pd.read_rdata("ghg_df.rds")["r_dataframe"].tail() To ignore the index, use ``index=False``: @@ -6113,11 +6143,11 @@ To ignore the index, use ``index=False``: ghg_df.rename_axis(None).to_rdata("ghg_df.rds", index=False) - pd.read_rdata("ghg_df.rds").tail() + pd.read_rdata("ghg_df.rds")["r_dataframe"].tail() By default, these R serialized types are compressed files in either gzip, bzip2, -or xz algorithms. Similarly to R, the default type in this method is "gzip" or -"gz". Notice difference of compressed and uncompressed files +or xz algorithms. Similar to R, the default ``compression`` type in this method +is "gzip" or "gz". Notice size difference of compressed and uncompressed files: .. ipython:: python @@ -6151,8 +6181,7 @@ Like other IO methods, ``storage_options`` are enabled to write to those platfor os.remove("plants_df_xz.rds") os.remove("plants_df_non_comp.rds") -Once exported, the single DataFrame can be read back in R or multiple DataFrames -loaded in R: +Once exported, the single DataFrame can be read or loaded in R: .. code-block:: r @@ -6176,17 +6205,6 @@ loaded in R: 144 Fluorinated gases 2018 182.7824 145 Total 2018 6676.6496 -For more information of the underlying ``pyreadr`` package, see main page of -`pyreadr`_ for further notes on support and limitations. 
For more information of R -serialization data types, see docs on `rds`_ and `rda`_ data files. - -.. _pyreadr: https://github.com/ofajardo/pyreadr - -.. _rds: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/readRDS - -.. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/save - - .. _io.stata: Stata format diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index dc5a87f39e1b7..0c95874572551 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -117,10 +117,10 @@ Read and write R data files We added I/O support to read and write R data files (.RData, .rda, .rds) using :func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Both methods rely on -the `pyreadr`_ package to support open source data migration between R and +the `librdata`_ C library to support open source data migration between R and Python pandas. (:issue:`40287`) -.. _pyreadr: https://github.com/ofajardo/pyreadr +.. _librdata: https://github.com/WizardMac/librdata For example, consider the below generated data frame and matrix in R: @@ -164,27 +164,29 @@ for .rds types or ``dict`` of DataFrames for .RData and .rda types: .. 
code-block:: ipython - In [1]: ppm_df = pd.read_rdata("ppm_df_r.rds") + In [1]: ppm_df = pd.read_rdata("ppm_df_r.rda")["r_dataframe"] In [2]: ppm_df Out[3]: - year month monthly_average num_days st_dev_of_days unc_mon_mean - 0 2020 10 411.51 30 0.22 0.08 - 1 2020 11 413.11 27 0.80 0.29 - 2 2020 12 414.25 30 0.48 0.17 - 3 2021 1 415.52 29 0.44 0.16 - 4 2021 2 416.75 28 1.01 0.36 + year month monthly_average deseasonalized num_days std_dev_of_days unc_of_mon_mean + rownames + 1 2020 12 414.25 414.98 30 0.47 0.17 + 2 2021 1 415.52 415.26 29 0.44 0.16 + 3 2021 2 416.75 415.93 28 1.02 0.37 + 4 2021 3 417.64 416.18 28 0.86 0.31 + 5 2021 4 419.05 416.23 24 1.12 0.44 In [4]: env_objs = pd.read_rdata("env_objs_r.rda") Out[5]: {'carbon_ppm_df': - year month monthly_average num_days st_dev_of_days unc_mon_mean - 0 2020 10 411.51 30 0.22 0.08 - 1 2020 11 413.11 27 0.80 0.29 - 2 2020 12 414.25 30 0.48 0.17 - 3 2021 1 415.52 29 0.44 0.16 - 4 2021 2 416.75 28 1.01 0.36 - - [5 rows x 6 columns], + year month monthly_average deseasonalized num_days std_dev_of_days unc_of_mon_mean + rownames + 1 2020 12 414.25 414.98 30 0.47 0.17 + 2 2021 1 415.52 415.26 29 0.44 0.16 + 3 2021 2 416.75 415.93 28 1.02 0.37 + 4 2021 3 417.64 416.18 28 0.86 0.31 + 5 2021 4 419.05 416.23 24 1.12 0.44 + + [5 rows x 7 columns], 'iucn_species_mtx': EX EW CR(PE) CR(PEW) CR EN VU DD Total rownames diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 15220093493e2..d299d16e9c042 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2322,8 +2322,8 @@ def to_rdata( index : bool, default True Include index or MulitIndex in output as separate columns. Since DataFrame indexes can include multiple columns and R rownames can - only include one column, DataFrame index will not map to R data.frame - rownames. + only include one column, DataFrame index will not map to R + data.frame rownames. 
compression : {{'gzip', 'bz2', 'xz', None}}, default 'gzip' Compression type for on-the-fly decompression of on-disk data. @@ -2338,8 +2338,17 @@ def to_rdata( See Also -------- to_stata : Convert DataFrame to a Stata dataset. - to_parquet : Convert DataFrame to parquet format. - to_feather: Convert DataFrame to feather formatt. + + Notes + ----- + For more information of R serialization data types, see docs on + `rda`_ and `rds`_ formats. + + .. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/ + topics/save + + .. _rds: https://www.rdocumentation.org/packages/base/versions/3.6.2/ + topics/readRDS Examples -------- @@ -2354,7 +2363,7 @@ def to_rdata( ... 'emissions': [5424.88, 634.46, 434.53, ... 182.78, 6676.65] ... }}) - >>> ghg_df.to_rdata("ghg_df.rds") + >>> ghg_df.to_rdata("ghg_df.rds") # doctest: +SKIP >>> R_code = ''' ... ghg_df <- readRDS("ghg_df.rds") @@ -2367,7 +2376,7 @@ def to_rdata( ... 5 4 Total 2018 6676.65 ... ''' - To save an .rda or .RData file: + To save an .RData or .rda file: >>> plants_df = pd.DataFrame( ... {{'plant_group': ['Pteridophytes', @@ -2400,11 +2409,9 @@ def to_rdata( ... 5 4 Pteridophytes Threatened 1275 ... 
''' """ - from pandas.io.rdata import PyReadrWriter - - import_optional_dependency("pyreadr") + from pandas.io.rdata.rdata_writer import RDataWriter - rdata_writer = PyReadrWriter( + r = RDataWriter( self, path_or_buffer=path_or_buffer, file_format=file_format, @@ -2414,7 +2421,7 @@ def to_rdata( storage_options=storage_options, ) - return rdata_writer.write_data() + return r.write_data() @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/io/rdata/__init__.py b/pandas/io/rdata/__init__.py new file mode 100644 index 0000000000000..aee8bdaa19c2c --- /dev/null +++ b/pandas/io/rdata/__init__.py @@ -0,0 +1 @@ +from pandas.io.rdata.rdata_reader import read_rdata # noqa diff --git a/pandas/io/rdata/_rdata.pxd b/pandas/io/rdata/_rdata.pxd new file mode 100644 index 0000000000000..fc38e10cfe5f5 --- /dev/null +++ b/pandas/io/rdata/_rdata.pxd @@ -0,0 +1,267 @@ +# cython: c_string_type=str, c_string_encoding=utf8, language_level=3 + +from posix.types cimport off_t + +from libc.stdint cimport ( + int32_t, + int64_t, +) +from libc.time cimport ( + mktime, + time_t, + tm, +) + + +cdef extern from 'librdata/rdata.h': + + ctypedef enum rdata_type_t: + RDATA_TYPE_STRING, + RDATA_TYPE_INT32, + RDATA_TYPE_REAL, + RDATA_TYPE_LOGICAL, + RDATA_TYPE_TIMESTAMP, + RDATA_TYPE_DATE + + ctypedef enum rdata_error_t: + RDATA_OK, + RDATA_ERROR_OPEN = 1, + RDATA_ERROR_SEEK, + RDATA_ERROR_READ, + RDATA_ERROR_MALLOC, + RDATA_ERROR_USER_ABORT, + RDATA_ERROR_PARSE, + RDATA_ERROR_WRITE, + RDATA_ERROR_FACTOR, + RDATA_ERROR_UNSUPPORTED_COMPRESSION, + RDATA_ERROR_UNSUPPORTED_CHARSET, + RDATA_ERROR_CONVERT, + RDATA_ERROR_CONVERT_BAD_STRING, + RDATA_ERROR_CONVERT_LONG_STRING, + RDATA_ERROR_CONVERT_SHORT_STRING, + RDATA_ERROR_UNSUPPORTED_S_EXPRESSION, + RDATA_ERROR_UNSUPPORTED_STORAGE_CLASS + + ctypedef enum rdata_file_format_t: + RDATA_WORKSPACE, + RDATA_SINGLE_OBJECT + + cdef const char 
*rdata_error_message(rdata_error_t error_code) + + ctypedef int (*rdata_column_handler)( + const char *name, rdata_type_t type, + void *data, long count, void *ctx + ) except * + ctypedef int ( + *rdata_table_handler)(const char *name, void *ctx + ) except * + ctypedef int ( + *rdata_text_value_handler)(const char *value, int index, void *ctx + ) except * + ctypedef int ( + *rdata_column_name_handler)(const char *value, int index, void *ctx + ) except * + ctypedef void (*rdata_error_handler)(const char *error_message, void *ctx) + ctypedef int (*rdata_progress_handler)(double progress, void *ctx) + + IF UNAME_SYSNAME == "AIX": + ctypedef off64_t rdata_off_t + ELSE: + ctypedef off_t rdata_off_t + + # Read API + + ctypedef enum rdata_io_flags_t: + RDATA_SEEK_SET, + RDATA_SEEK_CUR, + RDATA_SEEK_END + + ctypedef int (*rdata_open_handler)(const char *path, void *io_ctx) + ctypedef int (*rdata_close_handler)(void *io_ctx) + ctypedef rdata_off_t ( + *rdata_seek_handler + )(rdata_off_t offset, rdata_io_flags_t whence, void *io_ctx) + ctypedef ssize_t ( + *rdata_read_handler + )(void *buf, size_t nbyte, void *io_ctx) + ctypedef rdata_error_t ( + *rdata_update_handler + )( + long file_size, + rdata_progress_handler progress_handler, + void *user_ctx, + void *io_ctx + ) + + ctypedef struct rdata_io_t: + rdata_open_handler open + rdata_close_handler close + rdata_seek_handler seek + rdata_read_handler read + rdata_update_handler update + void *io_ctx + int external_io + + ctypedef struct rdata_parser_t: + rdata_table_handler table_handler + rdata_column_handler column_handler + rdata_column_name_handler column_name_handler + rdata_column_name_handler row_name_handler + rdata_text_value_handler text_value_handler + rdata_text_value_handler value_label_handler + rdata_column_handler dim_handler + rdata_text_value_handler dim_name_handler + rdata_error_handler error_handler + rdata_io_t *io + + cdef rdata_parser_t *rdata_parser_init() + cdef void rdata_parser_free(rdata_parser_t 
*parser) + + cdef rdata_error_t rdata_set_table_handler( + rdata_parser_t *parser, rdata_table_handler table_handler + ) + cdef rdata_error_t rdata_set_column_handler( + rdata_parser_t *parser, rdata_column_handler column_handler + ) + cdef rdata_error_t rdata_set_column_name_handler( + rdata_parser_t *parser, rdata_column_name_handler column_name_handler + ) + cdef rdata_error_t rdata_set_row_name_handler( + rdata_parser_t *parser, rdata_column_name_handler row_name_handler + ) + cdef rdata_error_t rdata_set_text_value_handler( + rdata_parser_t *parser, rdata_text_value_handler text_value_handler + ) + cdef rdata_error_t rdata_set_value_label_handler( + rdata_parser_t *parser, rdata_text_value_handler value_label_handler + ) + cdef rdata_error_t rdata_set_dim_handler( + rdata_parser_t *parser, rdata_column_handler dim_handler + ) + cdef rdata_error_t rdata_set_dim_name_handler( + rdata_parser_t *parser, rdata_text_value_handler dim_name_handler + ) + cdef rdata_error_t rdata_set_error_handler( + rdata_parser_t *parser, rdata_error_handler error_handler + ) + cdef rdata_error_t rdata_set_open_handler( + rdata_parser_t *parser, rdata_open_handler open_handler + ) + cdef rdata_error_t rdata_set_close_handler( + rdata_parser_t *parser, rdata_close_handler close_handler + ) + cdef rdata_error_t rdata_set_seek_handler( + rdata_parser_t *parser, rdata_seek_handler seek_handler + ) + cdef rdata_error_t rdata_set_read_handler( + rdata_parser_t *parser, rdata_read_handler read_handler + ) + cdef rdata_error_t rdata_set_update_handler( + rdata_parser_t *parser, rdata_update_handler update_handler + ) + cdef rdata_error_t rdata_set_io_ctx( + rdata_parser_t *parser, void *io_ctx + ) + cdef rdata_error_t rdata_parse( + rdata_parser_t *parser, const char *filename, void *user_ctx + ) + + # Write API + ctypedef ssize_t ( + *rdata_data_writer)(const void *data, size_t len, void *ctx + ) + + ctypedef struct rdata_column_t: + rdata_type_t type + int index + char name[256] + char 
label[1024] + + int32_t factor_count + char **factor + + ctypedef struct rdata_writer_t: + rdata_file_format_t file_format + rdata_data_writer data_writer + size_t bytes_written + + rdata_error_handler error_handler + void *user_ctx + + void *atom_table + int bswap + + rdata_column_t **columns + int32_t columns_count + int32_t columns_capacity + + cdef rdata_writer_t *rdata_writer_init( + rdata_data_writer write_callback, rdata_file_format_t format + ) + cdef void rdata_writer_free(rdata_writer_t *writer) + + cdef rdata_column_t *rdata_add_column( + rdata_writer_t *writer, const char *name, rdata_type_t type + ) + + cdef rdata_error_t rdata_column_set_label( + rdata_column_t *column, const char *label + ) + cdef rdata_error_t rdata_column_add_factor( + rdata_column_t *column, const char *factor + ) + + cdef rdata_column_t *rdata_get_column(rdata_writer_t *writer, int32_t j) + + cdef rdata_error_t rdata_begin_file(rdata_writer_t *writer, void *ctx) + cdef rdata_error_t rdata_begin_table( + rdata_writer_t *writer, const char *variable_name + ) + cdef rdata_error_t rdata_begin_column( + rdata_writer_t *writer, rdata_column_t *column, int32_t row_count + ) + + cdef rdata_error_t rdata_append_real_value( + rdata_writer_t *writer, double value + ) + cdef rdata_error_t rdata_append_int32_value( + rdata_writer_t *writer, int32_t value + ) + cdef rdata_error_t rdata_append_timestamp_value( + rdata_writer_t *writer, time_t value + ) + cdef rdata_error_t rdata_append_date_value( + rdata_writer_t *writer, tm *value + ) + cdef rdata_error_t rdata_append_logical_value( + rdata_writer_t *writer, int value + ) + cdef rdata_error_t rdata_append_string_value( + rdata_writer_t *writer, const char *value + ) + + cdef rdata_error_t rdata_end_column( + rdata_writer_t *writer, rdata_column_t *column + ) + cdef rdata_error_t rdata_end_table( + rdata_writer_t *writer, int32_t row_count, const char *datalabel + ) + cdef rdata_error_t rdata_end_file( + rdata_writer_t *writer + ) + + cdef 
extern from "": + int open(const char *path, int oflag, int mode) + + IF UNAME_SYSNAME == "Windows": + cdef extern from "": + int close(int fd) + ssize_t write(int fd, const void *buf, size_t nbyte) + ELSE: + cdef extern from "": + int close(int fd) + ssize_t write(int fd, const void *buf, size_t nbyte) + + cdef extern from "" nogil: + enum: O_CREAT + enum: O_WRONLY diff --git a/pandas/io/rdata/_rdata.pyx b/pandas/io/rdata/_rdata.pyx new file mode 100644 index 0000000000000..33a6cd60e6396 --- /dev/null +++ b/pandas/io/rdata/_rdata.pyx @@ -0,0 +1,413 @@ +# cython: c_string_type=str, c_string_encoding=utf8, language_level=3 + +cdef int handle_table(const char *name, void *ctx) except *: + """ + Retrieves original R object name. + + Called once per data frame in RData files, + and zero times on RDS files. + """ + lbr = ctx + + lbr.colidx = 0 + lbr.rows = 0 + lbr.rlevels = {} + lbr.rtext = {} + lbr.is_factor = False + lbr.rownames = {} + lbr.colnames = {} + lbr.dims = 0 + lbr.dim_str = {} + + if name != NULL: + lbr.tblname = name + + if "r_dataframe" in lbr.rvalues.keys(): + lbr.rvalues[lbr.tblname] = lbr.rvalues.pop("r_dataframe") + else: + lbr.rvalues[lbr.tblname] = { + "data": {}, + "dtypes": {}, + "colnames": None, + "rownames": None + } + return 0 # non-zero to abort processing + + +cdef int handle_column( + const char *name, + rdata_type_t dtype, + void *data, + long count, + void *ctx +) except *: + """ + Parses each non-string column in data frame. + + Called once for all columns with the following caveats: + * `name` is NULL for some columns (see handle_column_name below) + * `data` is NULL for text columns (see handle_text_value below) + Special conditon for matrices with dims attribute. 
+ """ + lbr = ctx + + lbr.rows = count + cdef int *rints = data + cdef double *rdoubles = data + + if dtype in [ + rdata_type_t.RDATA_TYPE_REAL, + rdata_type_t.RDATA_TYPE_DATE, + rdata_type_t.RDATA_TYPE_TIMESTAMP + ]: + lbr.rvalues[lbr.tblname]["dtypes"][lbr.colidx] = lbr.rtypes[dtype] + lbr.rvalues[lbr.tblname]["data"][lbr.colidx] = { + i: rdoubles[i] for i in range(count) + } + lbr.colidx += 1 + + elif dtype in [ + rdata_type_t.RDATA_TYPE_INT32, + rdata_type_t.RDATA_TYPE_LOGICAL + ]: + if lbr.is_factor: + lbr.rvalues[lbr.tblname]["dtypes"][lbr.colidx] = "factor" + lbr.rvalues[lbr.tblname]["data"][lbr.colidx] = { + i: float('nan') if rints[i] < 0 else lbr.rlevels[rints[i]-1] + for i in range(count) + } + lbr.is_factor = False + else: + lbr.rvalues[lbr.tblname]["dtypes"][lbr.colidx] = lbr.rtypes[dtype] + lbr.rvalues[lbr.tblname]["data"][lbr.colidx] = { + i: rints[i] for i in range(count) + } + lbr.colidx += 1 + + if lbr.dims > 0: + lbr.tblname = "r_matrix" + lbr.rvalues[lbr.tblname] = lbr.rvalues.pop("r_dataframe") + dim_data = list(lbr.rvalues[lbr.tblname]["data"][0].values()) + + n = 0 + rows, cols = lbr.dim_str.values() + for col in range(cols): + lbr.rvalues[lbr.tblname]["dtypes"][col] = lbr.rtypes[dtype] + lbr.rvalues[lbr.tblname]["data"][col] = { + i: d for i, d in enumerate(dim_data[n:n+rows]) + } + n += rows + + return 0 + +cdef int handle_text_value(const char *value, int index, void *ctx) except *: + """ + Parses string data. + + Called once per row for a text column. + """ + lbr = ctx + + lbr.rtext[index] = value if value != NULL else None + + if index == (lbr.rows - 1): + lbr.rvalues[lbr.tblname]["dtypes"][lbr.colidx] = "str" + lbr.rvalues[lbr.tblname]["data"][lbr.colidx] = lbr.rtext + lbr.colidx += 1 + lbr.rtext = {} + + return 0 + +cdef int handle_value_label(const char *value, int index, void *ctx) except *: + """ + Parses factor levels. 
+ + Called for factor variables, once for each level + """ + lbr = ctx + + lbr.is_factor = True + lbr.rlevels[index] = value + + return 0 + +cdef int handle_dim( + const char *name, + rdata_type_t dtype, + void *data, + long count, + void *ctx +) except *: + """ + Parses meta data on non-dataframe objects + + Called once for objects with R dims (matrices, arrays, etc.)). + Special condition for character matrices. + """ + lbr = ctx + + cdef int *rdims = data + + lbr.dims = count + lbr.dim_str = {i: rdims[i] for i in range(count)} + + if lbr.rvalues[lbr.tblname]["dtypes"] == {0: "str"}: + dim_data = list(lbr.rvalues[lbr.tblname]["data"][0].values()) + + n = 0 + rows, cols = lbr.dim_str.values() + + for col in range(cols): + lbr.rvalues[lbr.tblname]["dtypes"][col] = "str" + lbr.rvalues[lbr.tblname]["data"][col] = dim_data[n:n+rows] + n += rows + + return 0 + +cdef int handle_column_name(const char *name, int index, void *ctx) except *: + """ + Retrieves column names of data frame + + Returns only non-NULL column names after parsing data. + """ + lbr = ctx + + lbr.colnames[index] = name + lbr.rvalues[lbr.tblname]["colnames"] = lbr.colnames + + return 0 + +cdef int handle_row_name(const char *name, int index, void *ctx) except *: + """ + Retrieves row names of data frame + + Returns only non-NULL row names appear after parsing data. + """ + lbr = ctx + + lbr.rownames[index] = name + lbr.rvalues[lbr.tblname]["rownames"] = lbr.rownames + + return 0 + +cdef int handle_dim_name(const char *name, int index, void *ctx) except *: + """ + Retrieves dim names of matrices or arrays + + Returns only non-NULL dim names appear after parsing data. 
+ """ + + lbr = ctx + + if (index < lbr.dim_str[0]) and lbr.rownames.get(index) is None: + lbr.rownames[index] = name if name != NULL else str(index) + else: + lbr.rvalues[lbr.tblname]["rownames"] = lbr.rownames + + if index < lbr.dim_str[1]: + lbr.colnames[index] = name if name != NULL else str(index) + else: + lbr.rvalues[lbr.tblname]["colnames"] = lbr.colnames + + return 0 + + +class LibrdataParserError(Exception): + """ + Base error class to capture exceptions in librdata parsing. + """ + pass + + +cdef class LibrdataReader: + """ + Base class to read RData files. + + Class interfaces with librdata C library to builds dictionaries + of each data frame including data content and meta (dtypes, colnames, + and rownames). Callbacks above are used in ``rdata_`` method attributes. + """ + cdef rdata_parser_t *rparser + cdef public: + int colidx + int rows + dict rlevels + dict rtext + bint is_factor + dict rownames + dict colnames + dict rtypes + str tblname + dict rvalues + int dims + dict dim_str + + cpdef read_rdata(self, rfile): + self.rparser = rdata_parser_init() + + self.colidx = 0 + self.rows = 0 + self.rlevels = {} + self.rtext = {} + self.is_factor = False + self.rownames = {} + self.colnames = {} + self.dims = 0 + self.dim_str = {} + self.rtypes = { + rdata_type_t.RDATA_TYPE_LOGICAL: "bool", + rdata_type_t.RDATA_TYPE_INT32: "int", + rdata_type_t.RDATA_TYPE_REAL: "float", + rdata_type_t.RDATA_TYPE_DATE: "date", + rdata_type_t.RDATA_TYPE_TIMESTAMP: "datetime", + rdata_type_t.RDATA_TYPE_STRING: "str" + } + self.tblname = "r_dataframe" + self.rvalues = { + "r_dataframe": { + "data": {}, + "dtypes": {}, + "colnames": None, + "rownames": None + } + } + + err = RDATA_OK + while err == RDATA_OK: + err = rdata_set_table_handler(self.rparser, handle_table) + err = rdata_set_dim_handler(self.rparser, handle_dim) + err = rdata_set_column_handler(self.rparser, handle_column) + err = rdata_set_text_value_handler(self.rparser, handle_text_value) + err = 
rdata_set_value_label_handler(self.rparser, handle_value_label) + err = rdata_set_column_name_handler(self.rparser, handle_column_name) + err = rdata_set_row_name_handler(self.rparser, handle_row_name) + err = rdata_set_dim_name_handler(self.rparser, handle_dim_name) + + err = rdata_parse(self.rparser, rfile, self) + rdata_parser_free(self.rparser) + break + + if err != RDATA_OK: + msg = rdata_error_message(err) + raise LibrdataParserError(msg) + + return self.rvalues + + +class LibrdataWriterError(Exception): + """ + Base error class to capture exceptions in librdata writing. + """ + pass + + +cdef ssize_t write_data(const void *bytes, size_t len, void *ctx): + cdef int fd = (ctx)[0] + return write(fd, bytes, len) + +cdef class LibrdataWriter(): + """ + Base class to write RData files. + + Class interfaces with librdata C library to iterate through dictionaries + of each DataFrame column according to correspoinding dtype. + Single callback above is usedd in exposed `init`` method. + """ + cdef: + int fd + int row_count + dict rdict + dict rformats + dict rtypes + str tbl_name + rdata_writer_t *writer + rdata_column_t *py_col + + cdef write_col_data(self, i, kdata, vdata, ktype, vtype): + py_col = rdata_get_column(self.writer, i) + rdata_begin_column(self.writer, py_col, self.row_count) + + if vtype == "bool": + for k, v in vdata.items(): + rdata_append_logical_value(self.writer, v) + + if vtype.startswith(("int", "uint")): + for k, v in vdata.items(): + rdata_append_int32_value(self.writer, v) + + if vtype.startswith("float"): + for k, v in vdata.items(): + rdata_append_real_value(self.writer, v) + + if vtype.startswith("datetime64"): + for k, v in vdata.items(): + rdata_append_timestamp_value(self.writer, v) + + if vtype == "object": + for k, v in vdata.items(): + if v == v: + rdata_append_string_value(self.writer, v) + else: + rdata_append_string_value(self.writer, NULL) + + rdata_end_column(self.writer, py_col) + + cpdef write_rdata(self, rfile, rdict, rformat, 
tbl_name=None): + + self.rdict = rdict + self.tbl_name = tbl_name + self.row_count = len(next(iter(rdict["data"].items()))[1]) + + self.rformats = { + "rdata": RDATA_WORKSPACE, + "rda": RDATA_WORKSPACE, + "rds": RDATA_SINGLE_OBJECT + } + + self.rtypes = { + "bool": RDATA_TYPE_LOGICAL, + "int8": RDATA_TYPE_INT32, + "int16": RDATA_TYPE_INT32, + "int32": RDATA_TYPE_INT32, + "int64": RDATA_TYPE_INT32, + "uint8": RDATA_TYPE_INT32, + "uint16": RDATA_TYPE_INT32, + "uint32": RDATA_TYPE_INT32, + "uint64": RDATA_TYPE_INT32, + "float8": RDATA_TYPE_REAL, + "float16": RDATA_TYPE_REAL, + "float32": RDATA_TYPE_REAL, + "float64": RDATA_TYPE_REAL, + "datetime64[ns]": RDATA_TYPE_TIMESTAMP, + "object": RDATA_TYPE_STRING + } + + self.fd = open(rfile, O_CREAT | O_WRONLY, 0644); + self.writer = rdata_writer_init(write_data, self.rformats[rformat]) + + for k, v in self.rdict["dtypes"].items(): + rdata_add_column(self.writer, k, self.rtypes[v]) + + rdata_begin_file(self.writer, &self.fd) + rdata_begin_table(self.writer, self.tbl_name) + + try: + for n, ((kd, vd), (kt, vt)) in enumerate( + zip( + self.rdict["data"].items(), + self.rdict["dtypes"].items() + ) + ): + self.write_col_data(n, kd, vd, kt, vt) + + except (TypeError, ValueError, UnicodeDecodeError) as e: + raise LibrdataWriterError( + "DataFrame contains one more invalid types or data values. " + "that does not conform to R data types." 
+ ) + + rdata_end_table(self.writer, self.row_count, "pandas_dataframe") + rdata_end_file(self.writer) + + close(self.fd) + rdata_writer_free(self.writer) diff --git a/pandas/io/rdata/librdata/CKHashTable.h b/pandas/io/rdata/librdata/CKHashTable.h new file mode 100644 index 0000000000000..021a04025079d --- /dev/null +++ b/pandas/io/rdata/librdata/CKHashTable.h @@ -0,0 +1,55 @@ +// CKHashTable - A simple hash table +// Copyright 2010-2020 Evan Miller (see LICENSE) + +#ifndef PANDAS_IO_RDATA_LIBRDATA_CKHASHTABLE_H_ +#define PANDAS_IO_RDATA_LIBRDATA_CKHASHTABLE_H_ + +#include +#include + +typedef struct ck_hash_entry_s { + off_t key_offset; + size_t key_length; + const void *value; +} ck_hash_entry_t; + +typedef struct ck_hash_table_s { + size_t capacity; + size_t count; + ck_hash_entry_t *entries; + char *keys; + size_t keys_used; + size_t keys_capacity; +} ck_hash_table_t; + +int ck_str_hash_insert( + const char *key, const void *value, ck_hash_table_t *table +); +const void *ck_str_hash_lookup(const char *key, ck_hash_table_t *table); + +int ck_str_n_hash_insert( + const char *key, size_t keylen, const void *value, ck_hash_table_t *table +); +const void *ck_str_n_hash_lookup( + const char *key, size_t keylen, ck_hash_table_t *table +); + +int ck_float_hash_insert( + float key, const void *value, ck_hash_table_t *table +); +const void *ck_float_hash_lookup(float key, ck_hash_table_t *table); + +int ck_double_hash_insert( + double key, const void *value, ck_hash_table_t *table +); +const void *ck_double_hash_lookup(double key, ck_hash_table_t *table); + +ck_hash_table_t *ck_hash_table_init( + size_t num_entries, size_t mean_key_length +); +void ck_hash_table_wipe(ck_hash_table_t *table); +int ck_hash_table_grow(ck_hash_table_t *table); +void ck_hash_table_free(ck_hash_table_t *table); +uint64_t ck_hash_str(const char *str, size_t keylen); + +#endif // PANDAS_IO_RDATA_LIBRDATA_CKHASHTABLE_H_ diff --git a/pandas/io/rdata/librdata/rdata.h 
b/pandas/io/rdata/librdata/rdata.h new file mode 100644 index 0000000000000..9571f5da4c357 --- /dev/null +++ b/pandas/io/rdata/librdata/rdata.h @@ -0,0 +1,257 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#ifndef PANDAS_IO_RDATA_LIBRDATA_RDATA_H_ +#define PANDAS_IO_RDATA_LIBRDATA_RDATA_H_ + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum rdata_type_e { + RDATA_TYPE_STRING, + RDATA_TYPE_INT32, + RDATA_TYPE_REAL, + RDATA_TYPE_LOGICAL, + RDATA_TYPE_TIMESTAMP, + RDATA_TYPE_DATE +} rdata_type_t; + +typedef enum rdata_error_e { + RDATA_OK, + RDATA_ERROR_OPEN = 1, + RDATA_ERROR_SEEK, + RDATA_ERROR_READ, + RDATA_ERROR_MALLOC, + RDATA_ERROR_USER_ABORT, + RDATA_ERROR_PARSE, + RDATA_ERROR_WRITE, + RDATA_ERROR_FACTOR, + RDATA_ERROR_UNSUPPORTED_COMPRESSION, + RDATA_ERROR_UNSUPPORTED_CHARSET, + RDATA_ERROR_CONVERT, + RDATA_ERROR_CONVERT_BAD_STRING, + RDATA_ERROR_CONVERT_LONG_STRING, + RDATA_ERROR_CONVERT_SHORT_STRING, + RDATA_ERROR_UNSUPPORTED_S_EXPRESSION, + RDATA_ERROR_UNSUPPORTED_STORAGE_CLASS +} rdata_error_t; + +typedef enum rdata_file_format_e { + RDATA_WORKSPACE, + RDATA_SINGLE_OBJECT +} rdata_file_format_t; + +const char *rdata_error_message(rdata_error_t error_code); + +typedef int (*rdata_column_handler)(const char *name, rdata_type_t type, + void *data, long count, void *ctx); +typedef int (*rdata_table_handler)(const char *name, void *ctx); +typedef int (*rdata_text_value_handler)( + const char *value, int index, void *ctx +); +typedef int (*rdata_column_name_handler)( + const char *value, int index, void *ctx +); +typedef void (*rdata_error_handler)(const char *error_message, void *ctx); +typedef int (*rdata_progress_handler)(double progress, void *ctx); + +#if defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +typedef __int64 rdata_off_t; +#elif defined _WIN32 || defined __CYGWIN__ +typedef _off64_t rdata_off_t; +#elif defined _AIX +typedef off64_t rdata_off_t; +#else +typedef off_t rdata_off_t; +#endif + 
+typedef enum rdata_io_flags_e { + RDATA_SEEK_SET, + RDATA_SEEK_CUR, + RDATA_SEEK_END +} rdata_io_flags_t; + +typedef int (*rdata_open_handler)(const char *path, void *io_ctx); +typedef int (*rdata_close_handler)(void *io_ctx); +typedef rdata_off_t (*rdata_seek_handler)( + rdata_off_t offset, rdata_io_flags_t whence, void *io_ctx +); +typedef ssize_t (*rdata_read_handler)(void *buf, size_t nbyte, void *io_ctx); +typedef rdata_error_t (*rdata_update_handler)( + long file_size, + rdata_progress_handler progress_handler, + void *user_ctx, + void *io_ctx +); + +typedef struct rdata_io_s { + rdata_open_handler open; + rdata_close_handler close; + rdata_seek_handler seek; + rdata_read_handler read; + rdata_update_handler update; + void *io_ctx; + int external_io; +} rdata_io_t; + +typedef struct rdata_parser_s { + rdata_table_handler table_handler; + rdata_column_handler column_handler; + rdata_column_name_handler column_name_handler; + rdata_column_name_handler row_name_handler; + rdata_text_value_handler text_value_handler; + rdata_text_value_handler value_label_handler; + rdata_column_handler dim_handler; + rdata_text_value_handler dim_name_handler; + rdata_error_handler error_handler; + rdata_io_t *io; +} rdata_parser_t; + +rdata_parser_t *rdata_parser_init(void); +void rdata_parser_free(rdata_parser_t *parser); + +rdata_error_t rdata_set_table_handler( + rdata_parser_t *parser, rdata_table_handler table_handler +); +rdata_error_t rdata_set_column_handler( + rdata_parser_t *parser, rdata_column_handler column_handler +); +rdata_error_t rdata_set_column_name_handler( + rdata_parser_t *parser, rdata_column_name_handler column_name_handler +); +rdata_error_t rdata_set_row_name_handler( + rdata_parser_t *parser, rdata_column_name_handler row_name_handler +); +rdata_error_t rdata_set_text_value_handler( + rdata_parser_t *parser, rdata_text_value_handler text_value_handler +); +rdata_error_t rdata_set_value_label_handler( + rdata_parser_t *parser, rdata_text_value_handler 
value_label_handler +); +rdata_error_t rdata_set_dim_handler( + rdata_parser_t *parser, rdata_column_handler dim_handler +); +rdata_error_t rdata_set_dim_name_handler( + rdata_parser_t *parser, rdata_text_value_handler dim_name_handler +); +rdata_error_t rdata_set_error_handler( + rdata_parser_t *parser, rdata_error_handler error_handler +); +rdata_error_t rdata_set_open_handler( + rdata_parser_t *parser, rdata_open_handler open_handler +); +rdata_error_t rdata_set_close_handler( + rdata_parser_t *parser, rdata_close_handler close_handler +); +rdata_error_t rdata_set_seek_handler( + rdata_parser_t *parser, rdata_seek_handler seek_handler +); +rdata_error_t rdata_set_read_handler( + rdata_parser_t *parser, rdata_read_handler read_handler +); +rdata_error_t rdata_set_update_handler( + rdata_parser_t *parser, rdata_update_handler update_handler); +rdata_error_t rdata_set_io_ctx(rdata_parser_t *parser, void *io_ctx); + +/* rdata_parse works on RData and RDS. The table handler will be called once + * per data frame in RData files, and zero times on RDS files. 
*/ + +rdata_error_t rdata_parse( + rdata_parser_t *parser, const char *filename, void *user_ctx +); + + +// Write API +typedef ssize_t (*rdata_data_writer)(const void *data, size_t len, void *ctx); + +typedef struct rdata_column_s { + rdata_type_t type; + int index; + char name[256]; + char label[1024]; + + int32_t factor_count; + char **factor; +} rdata_column_t; + +typedef struct rdata_writer_s { + rdata_file_format_t file_format; + rdata_data_writer data_writer; + size_t bytes_written; + + rdata_error_handler error_handler; + void *user_ctx; + + void *atom_table; + int bswap; + + rdata_column_t **columns; + int32_t columns_count; + int32_t columns_capacity; +} rdata_writer_t; + +rdata_writer_t *rdata_writer_init( + rdata_data_writer write_callback, rdata_file_format_t format +); +void rdata_writer_free(rdata_writer_t *writer); + +rdata_column_t *rdata_add_column( + rdata_writer_t *writer, const char *name, rdata_type_t type +); + +rdata_error_t rdata_column_set_label( + rdata_column_t *column, const char *label +); +rdata_error_t rdata_column_add_factor( + rdata_column_t *column, const char *factor); + +rdata_column_t *rdata_get_column(rdata_writer_t *writer, int32_t j +); + +rdata_error_t rdata_begin_file(rdata_writer_t *writer, void *ctx); +rdata_error_t rdata_begin_table( + rdata_writer_t *writer, const char *variable_name); +rdata_error_t rdata_begin_column( + rdata_writer_t *writer, rdata_column_t *column, int32_t row_count +); + +rdata_error_t rdata_append_real_value( + rdata_writer_t *writer, double value +); +rdata_error_t rdata_append_int32_value( + rdata_writer_t *writer, int32_t value +); +rdata_error_t rdata_append_timestamp_value( + rdata_writer_t *writer, time_t value +); +rdata_error_t rdata_append_date_value( + rdata_writer_t *writer, struct tm *value +); +rdata_error_t rdata_append_logical_value( + rdata_writer_t *writer, int value); +rdata_error_t rdata_append_string_value( + rdata_writer_t *writer, const char *value +); + +rdata_error_t 
rdata_end_column( + rdata_writer_t *writer, rdata_column_t *column +); +rdata_error_t rdata_end_table( + rdata_writer_t *writer, int32_t row_count, const char *datalabel +); +rdata_error_t rdata_end_file(rdata_writer_t *writer); + +#ifdef __cplusplus +} // extern c block +#endif + +#endif // PANDAS_IO_RDATA_LIBRDATA_RDATA_H_ diff --git a/pandas/io/rdata/librdata/rdata_bits.h b/pandas/io/rdata/librdata/rdata_bits.h new file mode 100644 index 0000000000000..e53128171fd0e --- /dev/null +++ b/pandas/io/rdata/librdata/rdata_bits.h @@ -0,0 +1,21 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +// +// rdata_bit.h - Bit-twiddling utility functions +// + +#ifndef PANDAS_IO_RDATA_LIBRDATA_RDATA_BITS_H_ +#define PANDAS_IO_RDATA_LIBRDATA_RDATA_BITS_H_ + +int machine_is_little_endian(void); + +uint16_t byteswap2(uint16_t num); +uint32_t byteswap4(uint32_t num); +uint64_t byteswap8(uint64_t num); + +float byteswap_float(float num); +double byteswap_double(double num); + +#endif // PANDAS_IO_RDATA_LIBRDATA_RDATA_BITS_H_ diff --git a/pandas/io/rdata/librdata/rdata_internal.h b/pandas/io/rdata/librdata/rdata_internal.h new file mode 100644 index 0000000000000..56b2108739560 --- /dev/null +++ b/pandas/io/rdata/librdata/rdata_internal.h @@ -0,0 +1,89 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +// +// rdata_internal.h +// + +#ifndef PANDAS_IO_RDATA_LIBRDATA_RDATA_INTERNAL_H_ +#define PANDAS_IO_RDATA_LIBRDATA_RDATA_INTERNAL_H_ + +#include "rdata_bits.h" + +#pragma pack(push, 1) + +typedef struct rdata_v2_header_s { + char header[2]; + uint32_t format_version; + uint32_t writer_version; + uint32_t reader_version; +} rdata_v2_header_t; + +typedef struct rdata_sexptype_header_s { + unsigned int type:8; + unsigned int object:1; + unsigned int attributes:1; + unsigned int tag:1; + unsigned int unused:1; + unsigned int gp:16; + unsigned int padding:4; +} rdata_sexptype_header_t; + +typedef struct rdata_sexptype_info_s { + rdata_sexptype_header_t header; + int32_t attributes; + int32_t tag; + 
int32_t ref; +} rdata_sexptype_info_t; + +#pragma pack(pop) + +#define RDATA_SEXPTYPE_NIL 0 +#define RDATA_SEXPTYPE_SYMBOL 1 +#define RDATA_SEXPTYPE_PAIRLIST 2 +#define RDATA_SEXPTYPE_CLOSURE 3 +#define RDATA_SEXPTYPE_ENVIRONMENT 4 +#define RDATA_SEXPTYPE_PROMISE 5 +#define RDATA_SEXPTYPE_LANGUAGE_OBJECT 6 +#define RDATA_SEXPTYPE_SPECIAL_FUNCTION 7 +#define RDATA_SEXPTYPE_BUILTIN_FUNCTION 8 +#define RDATA_SEXPTYPE_CHARACTER_STRING 9 +#define RDATA_SEXPTYPE_LOGICAL_VECTOR 10 +#define RDATA_SEXPTYPE_INTEGER_VECTOR 13 +#define RDATA_SEXPTYPE_REAL_VECTOR 14 +#define RDATA_SEXPTYPE_COMPLEX_VECTOR 15 +#define RDATA_SEXPTYPE_CHARACTER_VECTOR 16 +#define RDATA_SEXPTYPE_DOT_DOT_DOT 17 +#define RDATA_SEXPTYPE_ANY 18 +#define RDATA_SEXPTYPE_GENERIC_VECTOR 19 +#define RDATA_SEXPTYPE_EXPRESSION_VECTOR 20 +#define RDATA_SEXPTYPE_BYTE_CODE 21 +#define RDATA_SEXPTYPE_EXTERNAL_POINTER 22 +#define RDATA_SEXPTYPE_WEAK_REFERENCE 23 +#define RDATA_SEXPTYPE_RAW_VECTOR 24 +#define RDATA_SEXPTYPE_S4_CLASS 25 + +#define RDATA_SEXPTYPE_FUN 99 + +#define RDATA_PSEUDO_SXP_REF 255 +#define RDATA_PSEUDO_SXP_NIL 254 +#define RDATA_PSEUDO_SXP_GLOBAL_ENVIRONMENT 253 +#define RDATA_PSEUDO_SXP_UNBOUND_VALUE 252 +#define RDATA_PSEUDO_SXP_MISSING_ARGUMENT 251 +#define RDATA_PSEUDO_SXP_BASE_NAMESPACE 250 +#define RDATA_PSEUDO_SXP_NAMESPACE 249 +#define RDATA_PSEUDO_SXP_PACKAGE 248 +#define RDATA_PSEUDO_SXP_PERSIST 247 +#define RDATA_PSEUDO_SXP_CLASS_REF 246 +#define RDATA_PSEUDO_SXP_GENERIC_REF 245 +#define RDATA_PSEUDO_SXP_BYTE_CODE_REP_DEF 244 +#define RDATA_PSEUDO_SXP_BYTE_CODE_REP_REF 243 +#define RDATA_PSEUDO_SXP_EMPTY_ENVIRONMENT 242 +#define RDATA_PSEUDO_SXP_BASE_ENVIRONMENT 241 + +#define RDATA_SEXPTYPE_LANGUAGE_OBJECT_ATTR 240 +#define RDATA_SEXPTYPE_PAIRLIST_ATTR 239 +#define RDATA_PSEUDO_SXP_ALTREP 238 + +#endif // PANDAS_IO_RDATA_LIBRDATA_RDATA_INTERNAL_H_ diff --git a/pandas/io/rdata/librdata/rdata_io_unistd.h b/pandas/io/rdata/librdata/rdata_io_unistd.h new file mode 100644 index 
0000000000000..02cfba60b5720 --- /dev/null +++ b/pandas/io/rdata/librdata/rdata_io_unistd.h @@ -0,0 +1,26 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#ifndef PANDAS_IO_RDATA_LIBRDATA_RDATA_IO_UNISTD_H_ +#define PANDAS_IO_RDATA_LIBRDATA_RDATA_IO_UNISTD_H_ + +typedef struct rdata_unistd_io_ctx_s { + int fd; +} rdata_unistd_io_ctx_t; + +int rdata_unistd_open_handler(const char *path, void *io_ctx); +int rdata_unistd_close_handler(void *io_ctx); +rdata_off_t rdata_unistd_seek_handler( + rdata_off_t offset, rdata_io_flags_t whence, void *io_ctx +); +ssize_t rdata_unistd_read_handler(void *buf, size_t nbytes, void *io_ctx); +rdata_error_t rdata_unistd_update_handler( + long file_size, + rdata_progress_handler progress_handler, + void *user_ctx, + void *io_ctx +); +void rdata_unistd_io_init(rdata_parser_t *parser); + +#endif // PANDAS_IO_RDATA_LIBRDATA_RDATA_IO_UNISTD_H_ diff --git a/pandas/io/rdata/librdata/win_iconv.h b/pandas/io/rdata/librdata/win_iconv.h new file mode 100644 index 0000000000000..fcdfbdd571e83 --- /dev/null +++ b/pandas/io/rdata/librdata/win_iconv.h @@ -0,0 +1,40 @@ +/* + * No Copyright. + * + * iconv implementation using Win32 API to convert. + * This file is placed in the public domain. 
+ */ + +#ifndef PANDAS_IO_RDATA_LIBRDATA_WIN_ICONV_H_ +#define PANDAS_IO_RDATA_LIBRDATA_WIN_ICONV_H_ + +// #ifndef _LIBICONV_H + #define _LIBICONV_H + #include + #ifndef WINICONV_CONST + # ifdef ICONV_CONST + # define WINICONV_CONST ICONV_CONST + # else + # define WINICONV_CONST const + # endif + #endif + #ifdef __cplusplus + extern "C" { + #endif + + typedef void* iconv_t; + iconv_t iconv_open(const char *tocode, const char *fromcode); + int iconv_close(iconv_t cd); + size_t iconv( + iconv_t cd, + WINICONV_CONST char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft); + + #ifdef __cplusplus + } + #endif +// #endif + +#endif // PANDAS_IO_RDATA_LIBRDATA_WIN_ICONV_H_ diff --git a/pandas/io/rdata.py b/pandas/io/rdata/rdata_reader.py similarity index 54% rename from pandas/io/rdata.py rename to pandas/io/rdata/rdata_reader.py index 4114b6d1f8349..59f633119537c 100644 --- a/pandas/io/rdata.py +++ b/pandas/io/rdata/rdata_reader.py @@ -1,12 +1,14 @@ +""" +Read R data files (RData, rda, rds). 
+ +This IO module interfaces with the librdata C library by Evan Miller: + https://github.com/WizardMac/librdata +""" +from __future__ import annotations + import io import os from tempfile import TemporaryDirectory -from typing import ( - Dict, - List, - Optional, - Union, -) from pandas._typing import ( Buffer, @@ -14,13 +16,17 @@ FilePathOrBuffer, StorageOptions, ) -from pandas.compat._optional import import_optional_dependency -from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.core.dtypes.common import is_list_like -from pandas.core.frame import DataFrame +from pandas.core.api import to_datetime +from pandas.core.arrays import Categorical +from pandas.core.frame import ( + DataFrame, + Index, + Series, +) from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( @@ -30,16 +36,18 @@ is_url, stringify_path, ) +from pandas.io.rdata._rdata import LibrdataReader @doc(storage_options=_shared_docs["storage_options"]) def read_rdata( path_or_buffer: FilePathOrBuffer, file_format: str = "infer", - select_frames: Optional[List[str]] = None, + select_frames: list[str] | None = None, rownames: bool = True, + compression: CompressionOptions = "gzip", storage_options: StorageOptions = None, -) -> Union[DataFrame, Dict[str, DataFrame]]: +) -> dict[str, DataFrame]: r""" Read R data (.RData, .rda, .rds) into DataFrame or ``dict`` of DataFrames. @@ -56,20 +64,30 @@ def read_rdata( commands. Default 'infer' will use extension in file name to to determine the format type. - select_frames : list, default None - Selected names of DataFrames to return from R rda and RData types that + select_frames : list, default returns all DataFrames + Selected names of DataFrames to return from R RData and rdata types that can contain multiple objects. rownames : bool, default True Include original rownames in R data frames to map into a DataFrame index. 
+ compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'gzip' + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2, zip or xz if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression + otherwise. If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. This method will + default to 'gzip' since 'gzip2` is the default compression in R for + RData and rds types. + {storage_options} Returns ------- - DataFrame or dict of DataFrames - Depends on R data type where rds formats returns a single DataFrame and - rda or RData formats return ``dict`` of DataFrames. + Dict of DataFrames + Depends on R data type where rds formats returns a ``dict`` of a single + DataFrame and RData or rda formats can return ``dict`` of one or more + DataFrames. See Also -------- @@ -80,12 +98,11 @@ def read_rdata( Notes ----- Any R data file that contains a non-data.frame object may raise parsing errors. - Method will return data.frame, matrix, and data.frame like object such as - tibbles and data.tables. - - For ``pyreadr`` engine, ``select_frames`` above is synonymous to ``use_objects`` - in package's `read_r` method. Also, ``timezone`` argument defaults to current - system regional timezone in order to correspond to original date/times in R. + Method will return data.frame and data.frame like objects such as tibbles and + data.tables. For more information of R serialization data types, see docs on + `rds`__ + and `rda`__ + formats. 
Examples -------- @@ -115,13 +132,14 @@ def read_rdata( >>> ghg_df = pd.read_rdata("ghg_df.rds") # doctest: +SKIP >>> ghg_df # doctest: +SKIP - gas year emissions + {{'r_dataframe': + gas year emissions rownames - 1 Carbon dioxide 2018 5424.88 - 2 Methane 2018 634.46 - 3 Nitrous oxide 2018 434.53 - 4 Fluorinated gases 2018 182.78 - 5 Total 2018 6676.65 + 1 Carbon dioxide 2018 5424.88 + 2 Methane 2018 634.46 + 3 Nitrous oxide 2018 434.53 + 4 Fluorinated gases 2018 182.79 + 5 Total 2018 6676.65}} For an .RData or .rda file which can contain multiple R objects, method returns a ``dict`` of DataFrames: @@ -183,25 +201,26 @@ def read_rdata( 5 2020 12 NRTSI-G S 10.44 6.50}} """ - import_optional_dependency("pyreadr") - - rdr = _PyReadrParser( + rdr = _RDataReader( path_or_buffer, file_format, select_frames, rownames, + compression, storage_options, ) - return rdr.parse_data() + r_dfs = rdr.parse_data() + + return r_dfs -def _get_data_from_filepath( +def get_data_from_filepath( filepath_or_buffer, encoding, compression, storage_options, -) -> Union[str, bytes, Buffer]: +) -> str | bytes | Buffer: """ Extract raw R data. @@ -240,7 +259,7 @@ def _get_data_from_filepath( return filepath_or_buffer -def _preprocess_data(data) -> Union[io.StringIO, io.BytesIO]: +def preprocess_data(data) -> io.StringIO | io.BytesIO: """ Convert extracted raw data. @@ -277,22 +296,13 @@ class _RDataReader: rownames : bool, default True Include original rownames in R data frames. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + Compression type for on-the-fly decompression of on-disk data. + If 'infer', then use extension for gzip, bz2, zip or xz. + storage_options : dict, optional Extra options that make sense for a particular storage connection, - e.g. 
host, port, username, password, etc., - - See also - -------- - pandas.io.rdata._PyReadrParser - - Notes - ----- - To subclass this class effectively you must override the following methods:` - * :func:`handle_rownames` - * :func:`parse_data` - - See each method's respective documentation for details on their - functionality. + e.g. host, port, username, password, etc. """ def __init__( @@ -301,13 +311,16 @@ def __init__( file_format, select_frames, rownames, + compression, storage_options, ) -> None: self.path_or_buffer = path_or_buffer self.file_format = file_format.lower() self.select_frames = select_frames self.rownames = rownames + self.compression = compression self.storage_options = storage_options + self.verify_params() def verify_params(self) -> None: """ @@ -317,7 +330,7 @@ def verify_params(self) -> None: and raise appropriate errors. """ - path_ext: Optional[str] = ( + path_ext: str | None = ( os.path.splitext(self.path_or_buffer.lower())[1][1:] if isinstance(self.path_or_buffer, str) else None @@ -352,247 +365,93 @@ def buffer_to_disk(self, tmp_dir: str) -> str: Convert path or buffer to disk file. This method will convert path_or_buffer to temp file - for pyreadr to parse from disk. + to parse RData from disk. """ r_temp = os.path.join(tmp_dir, "rdata.rda") - handle_data = _get_data_from_filepath( + handle_data = get_data_from_filepath( filepath_or_buffer=self.path_or_buffer, encoding="utf-8", - compression=None, + compression=self.compression, storage_options=self.storage_options, ) - with _preprocess_data(handle_data) as r_data: + with preprocess_data(handle_data) as r_data: if isinstance(r_data, io.BytesIO): with open(r_temp, "wb") as f: f.write(r_data.read()) return r_temp - def handle_row_names(self) -> DataFrame: + def build_frame(self, data_dict: dict) -> DataFrame: """ - Migrate R rownames to DataFrame index. + Builds DataFrame from raw, nested parsed RData dict. - This method will conditionally adjust index to reflect - original R rownames. 
+ Converts special class variables (bools, factors, dates, datetimes), + then binds all columns together with DataFrame constructor. """ - raise AbstractMethodError(self) + final_dict = { + k: Series(v) + for k, v in data_dict["data"].items() + if k not in ["dtypes", "colnames", "rownames"] + } - def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: - """ - Parse R data files. + rdf = DataFrame(data=final_dict) - This method will run engine methods to return a single DataFrame - for rds type or dictionary of DataFrames for RData or rda types. - """ + for col, dtype in data_dict["dtypes"].items(): + if dtype == "bool": + rdf[col] = rdf[col].astype(bool) - raise AbstractMethodError(self) + if dtype == "factor": + rdf[col] = Categorical(rdf[col]) + if dtype == "date": + rdf[col] = to_datetime(rdf[col], unit="d") -class _PyReadrParser(_RDataReader): - """ - Internal class to parse R data types using third-party - package, pyreadr. - """ + if dtype == "datetime": + rdf[col] = to_datetime(rdf[col], unit="s") - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.verify_params() - - def handle_rownames(self, df) -> DataFrame: - if not self.rownames: - df = df.reset_index(drop=True) - df.index.name = None - - if self.rownames and df.index.name != "rownames": - df.index.name = "rownames" - if df.index[0] == 0: - df.index += 1 - - return df - - def parse_data(self) -> Union[DataFrame, Dict[str, DataFrame]]: - from pyreadr import read_r - - with TemporaryDirectory() as tmp_dir: - r_temp = self.buffer_to_disk(tmp_dir) - rdata = read_r(r_temp, use_objects=self.select_frames) - - rdata = {k: self.handle_rownames(df) for k, df in rdata.items()} - rdata = rdata[None] if self.file_format == "rds" else dict(rdata) - - return rdata - - -class RDataWriter: - """ - Subclass to write pandas DataFrames into R data files. 
- - Parameters - ---------- - path_or_buffer : a valid str, path object or file-like object - Any valid string path is acceptable. - - file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer' - R serialization type. - - rda_name : str, default "pandas_dataframe" - Name for exported DataFrame in rda file. - - index : bool, default True - Include index or MultiIndex in output as separate columns. - - compression : {'gzip', 'bz2', 'xz', None}, default 'gzip' - Compression type for on-the-fly decompression of on-disk data. - - storage_options : dict, optional - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. - - See also - -------- - pandas.io.rdata.PyReadrWriter - - Notes - ----- - To subclass this class effectively you must override the following methods:` - * :func:`write_data` - - See each method's respective documentation for details on their - functionality. - """ - - def __init__( - self, - frame: DataFrame, - path_or_buffer: FilePathOrBuffer, - file_format: str = "infer", - rda_name: str = "pandas_dataframe", - index: bool = True, - compression: CompressionOptions = "gzip", - storage_options: StorageOptions = None, - ) -> None: - self.frame = frame - self.path_or_buffer = path_or_buffer - self.file_format = file_format.lower() - self.rda_name = rda_name - self.index = index - self.compression = compression - self.storage_options = storage_options - - def verify_params(self) -> None: - """ - Verify user entries of parameters. - - This method will check the values and types of select parameters - and raise appropriate errors. 
- """ - - path_ext: Optional[str] = ( - os.path.splitext(self.path_or_buffer.lower())[1][1:] - if isinstance(self.path_or_buffer, str) - else None + colnames = ( + None + if data_dict["colnames"] is None + else list(data_dict["colnames"].values()) ) + if colnames is not None: + rdf.columns = Index(colnames) - if self.file_format not in ["infer", "rdata", "rda", "rds"]: - raise ValueError( - f"{self.file_format} is not a valid value for file_format." - ) - - if ( - self.file_format == "infer" - and isinstance(self.path_or_buffer, str) - and path_ext not in ["rdata", "rda", "rds"] - ): - raise ValueError( - f"Unable to infer file format from file name: {self.path_or_buffer}" - "Please use known R data type (rdata, rda, rds)." - ) - - if self.file_format == "infer" and isinstance(path_ext, str): - self.file_format = path_ext - - if self.compression is not None and self.compression not in [ - "gzip", - "bz2", - "xz", - ]: - raise ValueError( - f"{self.compression} is not a supported value for compression." - ) - - def disk_to_buffer(self, r_file: str) -> None: - """ - Save temp file to path or buffer. - - This method will convert written R data to path_or_buffer. - """ - - with open(r_file, "rb") as rdata: - with get_handle( - self.path_or_buffer, - "wb", - compression=self.compression, - storage_options=self.storage_options, - is_text=False, - ) as handles: - handles.handle.write(rdata.read()) # type: ignore[arg-type] + rownames = ( + None + if data_dict["rownames"] is None + else list(data_dict["rownames"].values()) + ) + if self.rownames: + if rownames is not None: + rdf.index = Index(rownames) + else: + rdf.index += 1 + rdf.index.name = "rownames" - return None + return rdf - def write_data(self) -> None: + def parse_data(self) -> dict[str, DataFrame]: """ - Write DataFrames to R data files. + Parse R data files into DataFrames - This method will run engine methods to export DataFrames - to R data files. 
+ This method will retrieve dictionary of R data and build + DataFrame for each item in data file """ - raise AbstractMethodError(self) - - -class PyReadrWriter(RDataWriter): - """ - Main class called in `pandas.core.frame` to write DataFrame to R - data types using third-party package, pyreadr. - """ + lbr = LibrdataReader() - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - self.verify_params() + with TemporaryDirectory() as tmp_dir: + r_temp = self.buffer_to_disk(tmp_dir) + rdict = lbr.read_rdata(r_temp) - def write_data(self) -> None: - from pyreadr import ( - write_rdata, - write_rds, - ) + r_dfs = {k: self.build_frame(v) for k, v in rdict.items()} - self.frame = ( - self.frame.reset_index() - if self.index - else self.frame.reset_index(drop=True) - ) + if self.select_frames: + r_dfs = {k: v for k, v in r_dfs.items() if k in self.select_frames} - with TemporaryDirectory() as tmp_dir: - r_temp = os.path.join(tmp_dir, "rdata.rda") - - if self.file_format in ["rda", "rdata"]: - write_rdata( - path=r_temp, - df=self.frame, - df_name=self.rda_name, - compress=None, - ) - elif self.file_format == "rds": - write_rds( - path=r_temp, - df=self.frame, - compress=None, - ) - - self.disk_to_buffer(r_temp) - - return None + return r_dfs diff --git a/pandas/io/rdata/rdata_writer.py b/pandas/io/rdata/rdata_writer.py new file mode 100644 index 0000000000000..dddfe9684353f --- /dev/null +++ b/pandas/io/rdata/rdata_writer.py @@ -0,0 +1,174 @@ +""" +write R data files (RData, rda, rds). 
+ +This IO module interfaces with the librdata C library by Evan Miller: + https://github.com/WizardMac/librdata +""" +from __future__ import annotations + +import os +from tempfile import TemporaryDirectory + +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + StorageOptions, +) + +from pandas.core.frame import DataFrame + +from pandas.io.common import get_handle +from pandas.io.rdata._rdata import LibrdataWriter + + +class RDataWriter: + """ + Subclass to write pandas DataFrames into R data files. + + Parameters + ---------- + path_or_buffer : a valid str, path object or file-like object + Any valid string path is acceptable. + + file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer' + R serialization type. + + rda_name : str, default "pandas_dataframe" + Name for exported DataFrame in rda file. + + index : bool, default True + Include index or MultiIndex in output as separate columns. + + compression : {'gzip', 'bz2', 'xz', None}, default 'gzip' + Compression type for on-the-fly decompression of on-disk data. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc. + """ + + def __init__( + self, + frame: DataFrame, + path_or_buffer: FilePathOrBuffer, + file_format: str = "infer", + rda_name: str = "pandas_dataframe", + index: bool = True, + compression: CompressionOptions = "gzip", + storage_options: StorageOptions = None, + ) -> None: + self.frame = frame + self.path_or_buffer = path_or_buffer + self.file_format = file_format.lower() + self.rda_name = rda_name + self.index = index + self.compression = compression + self.storage_options = storage_options + self.verify_params() + + def verify_params(self) -> None: + """ + Verify user entries of parameters. + + This method will check the values and types of select parameters + and raise appropriate errors. 
+ """ + + path_ext: str | None = ( + os.path.splitext(self.path_or_buffer.lower())[1][1:] + if isinstance(self.path_or_buffer, str) + else None + ) + + if self.file_format not in ["infer", "rdata", "rda", "rds"]: + raise ValueError( + f"{self.file_format} is not a valid value for file_format." + ) + + if ( + self.file_format == "infer" + and isinstance(self.path_or_buffer, str) + and path_ext not in ["rdata", "rda", "rds"] + ): + raise ValueError( + f"Unable to infer file format from file name: {self.path_or_buffer}" + "Please use known R data type (rdata, rda, rds)." + ) + + if self.file_format == "infer" and isinstance(path_ext, str): + self.file_format = path_ext + + if self.compression is not None and self.compression not in [ + "gzip", + "bz2", + "xz", + ]: + raise ValueError( + f"{self.compression} is not a supported value for compression." + ) + + def disk_to_buffer(self, r_file: str) -> None: + """ + Save temp file to path or buffer. + + This method will convert written R data to path_or_buffer. + """ + + with open(r_file, "rb") as rdata: + with get_handle( + self.path_or_buffer, + "wb", + compression=self.compression, + storage_options=self.storage_options, + is_text=False, + ) as handles: + handles.handle.write(rdata.read()) # type: ignore[arg-type] + + return None + + def write_data(self) -> None: + """ + Write DataFrames to R data files. + + Converts special class variables (Categorical, datetimes, etc.), + then exports dictionaries of each column to write to disk. For + datetimes, data is converted to epoch seconds with timezone + handling for midnight time-stamped datetimes to align to R types. 
+ """ + + self.frame = ( + self.frame.reset_index() + if self.index + else self.frame.reset_index(drop=True) + ) + + for col in self.frame.select_dtypes(include="category").columns: + self.frame[col] = self.frame[col].astype(str) + + for col in self.frame.select_dtypes(include=["datetimetz"]).columns: + self.frame[col] = self.frame[col].dt.tz_localize(None) + + for col in self.frame.select_dtypes(include=["timedelta"]).columns: + self.frame[col] = self.frame[col].dt.total_seconds() + + rdict = {"dtypes": {k: str(v) for k, v in self.frame.dtypes.to_dict().items()}} + + for col in self.frame.select_dtypes(include=["datetime"]).columns: + self.frame[col] = self.frame[col].values.view("int64") / (10 ** 9) + + rdict["data"] = self.frame.to_dict() + + lbw = LibrdataWriter() + + with TemporaryDirectory() as tmp_dir: + r_temp = os.path.join(tmp_dir, "rdata.rda") + lbw.write_rdata( + rfile=r_temp, + rdict=rdict, + rformat=self.file_format, + tbl_name=self.rda_name, + ) + + self.disk_to_buffer(r_temp) + + return None diff --git a/pandas/tests/io/data/rdata/ppm_df.rda b/pandas/tests/io/data/rdata/ppm_df.rda new file mode 100644 index 0000000000000000000000000000000000000000..5b967767c7216f84425fb3255f058f461ca54c2f GIT binary patch literal 11221 zcmXBY3p~^N|G@Ec>XcJrNfO0QNs>@W%4Iv9$gi7KawOS`N=VCf*zE`@LQ%O}Ny&AU z5N30WB<6m%88OXeWB2?2Yk$wjo2Q#so2Qt&8`2Sh zk)u_9m*3=dzO*Z3?}}%C)g~WiH$=;8d5@L{4$PvzGtJaT?jA>v2Ba?@Mct#D3Q*o) z2u8%TK&uHK6cq3b1@{d8{o2Aa_et>^v^4co&I=$t=-IAZ*DeT0>;QT5Vj-rA2Lbm5 zJe(LgcEr>)C}X02zKYs#4!vOKfDwR(p(D!Mj*hvAh2}9CJZh=*59y;Fome(=%=u+j zaOe_=EV$?;pg@HkBtZ{%UNbr3*Hi3|kq3tk9TG42G(+`nRGhC)N%XTmmLHyvl@&9+ zz4MJ0wQGS8ZRP5~`BPYw0 zyt@SPWNN?CJBVkM-sFF&V@1dUgg^IRhzL?-maIIp!DEOJa7!uw}Ja@+M z7GX?GFb`o~`%JCiKY8@%TOgtO`i^r!k5>lP9k3hT!{)zr};7Bb4It zekR^r>b^|acJ7Q|SeHjSAfQ+b|GR*f^Bq1?IGM4f85x8H^;}agtj2=+v~s~3wd5_$ z-=cA*r3G^bv9*uu2St1iI^_mMKbhYzpd4RG8dg@FDIQi1a?^*EM_)$|Bd*pK4kJbb9u6aFX9flP zQB|Yqai4`$2kEC|Px?1aznh4vsToO=GS9Awnr=e@GnpMsbb(^UtnIA=rLh}GTGeSe 
z+vOpU0Nj2)1sr}=ri4_!5>a-O%0(3KhRw_PujOy2l;4};zbapNB_=A(h8!O&OYUM@ zly2tkiOP~~hB`|3iEri#VwH`Sw4_jDo3sRGxmgM&g`Qea9$Y#7QF-zzaD3j)&9B?N z8~%Y9Ia6PRBfs;sq^~)47h#2Ky2S|4)!d2;Z4QJYvW=v#2*k3e%C`h!V6N54h`V2P znsh0Wd`=5$NSz{HX5z@@KZbV$Wymst3cDapg@p)RGT$biQ}GtMDaql6^5fqjJ>0fO zux(z3u8j(x!ZP*Pl-e!Em|Nn$CM(agROM-!}TV4Las)S&Cs8O97+`@X~2=7VPfP&NQxXwfb0mUbtEJ$ z?ggfJ^DWq;7D?LJqwjX=V2{$|Uc8a5a7W%qzvy_6Z{L4Ij)e0S(9PJRi~0v7cWV#2 z@J6nQ&^^8(6JGh$S&K{{AB8Q-&VY>x%Ip+-GX?Tvy{Q6Ov_ck9-p>5Cj8F8<5>ak3 zE-m9bq8F!>FXv5Klx1)4Jyv=yioYt$>RFHTJPY~T9IEeST1MS-z@FocZ@ct2Z~Whq z=((D*>CJk?KqL3|h(M{L;K@)GFZeKgOo4N54M4{+fH7P7m*g&Zr={esawTJ~!eNO# zS0VrInomvc{tiTP-ict~Q2PjCx7XH@U@-6d7-E-9+>8yMr5Q;UN^$@4Mn+~qJ-*ED zfAXnaNW@lb_+E;!WP#Lonm6Kxdp}n(U-m1XT8`^*B6uP8Jc87P%XUi^Y*C)Pkz69M zmo*iObsaC?PciEd5t2*i&RI&AY~bIw(#VRP46HtCD}>bS+2k8awZwQ(92sK|12^Z{ z(u`yaKiuQ!p5lN07o!k3tebM8IxtK$C&YfWSXhhvGUI$gM=hNlWD!E$mc&N&+N^R*?@IFBk)o9}lE;bB~F|$GgT16Em-^IPO=r9D{^Qb#dhC zbBUTjZ%Yq1?GXGhB2Bzn+k^(?{)HL7S;+YnGQR;)`_q!)S?$H0ud*>dIV+}Fxfe>p z#?ZvctU)B&+O|8hA2{NYTi8Ll6qM-+`vzD0Z`egSrp z-DN6i@!h%=J7xJ&7dypv*TM>P9&g49+%T@Z5qGv%k1rxo=tRJJp%Wp}oi07T^e#Z^a50z4l8M>^=eK88u_l<4ZC3;EjNzF7ZYXGx>8B6oq+@FTP|vpQ@N^ z%BSLxem%a*0)J+#WpTwd(Zlz&Che=0I=K80Q-+OqvRE8#zZ3z5UWod645-f~D~4cQ zSZuYIBAx?>-qV5bCPBCf&2v&W0bwm&(br2Hi!Pg?>EvBj zhHto`ev`DtfAQ(^nLE#or(bk0XFYLm`VW&2YkHp#WFZ(xfEjMyvJ@sl9zzR&`vtWDn+qlRu&EiS zBO#{hT0XV0`$7-%`ip}=XZZXeyD<)XQ*L;ToytdZ z8m<6KDX?^4l(>Jwr3DjH`PrBKD@y4FtRP@JANNwer>HY<1eO}*U@AS`+s*n!*mEB3 zDo86pLZ8a__;r{ZhM#Une2Xy7g_}>m2sQC&H4{JP9)`EifPou__V1GHv57jxJH2$p z3kdyh-f6=TN8ahYfvddJ3Ub|C8mWRZmzFcDgH5HvHe*wjS*DVLtrB20GCXN6ZDr+X zkICB+NRP=(^W7ekTit%ZRAH76`%c0;5}t!s5+SBRUx8_1BXHYH7LaClM?$}M=S}&$ zfA5*i+wdzbKH*&H2K(l1Dy5q}C`C(DUVE3dYLr(Phdu^F4E2vy#c+o8%Fjb%5l`Z9DYP=8V)i(hi2%-a$%gOt7`5;JyJM@hwu!}vWL zh@w{yIl3zr+Z1OS$D8Q;^vz|9y=}48AfhG7^D8k{ShW27zOQ*ahY@BKmy+VY7t`u# z7Ka~@on~~!U;}LO4DaV|#e8@AdP+)E%n`=86M=)Rs=EFs-h)iZYyVHMa$lScG zS}M0MYp|^$6zI^8J4ZyEIn8iOX_vtgkKr#y#MCLZXnP9BPxfBbEjJ;Bc=VW<&jP0> 
z@VYTDRlS=_J8Jt6?{u~-0hrn>Bn9U84oC{h?(UKl1l~206f7?7lN1<-Yhim2Md(FCZu)P5mTd=+J=D@;^@~zlj z&r*shX=3nbQ{B?&UnF6)f0IGGb{yzz-G0nC6}*9f^Cxn|OtTFLiPWAEaBl+m*rtMF z+n9drA#;by_Y2Hl&dtT5u(ME4pG-~(ahn#7_^gd;75urL$<$j1NyS3mL@3ubHOE_T zgNHTgGqftsZa{u;&`*iZRm zk0BmwZGi=vVLoi|Jse2E??CQW&!tt~c@M<900<+b$0T6ra*v4_Lmdbs27^qe)R9O{=A6zc71BH6iN-bl_(L(#o>X)*-s+NGlbONO9OFy zfl44aESU8X%xZjn`ycFCFu4KUAZ;Ipwc-v7Tt5&V$hR&OdP8-jMZ?g`e6~6kDtuvd zH0k{W-Q5r(>k4SUsIKXr8L?|fA0cj^(`XZlzw_Rs$YKls6IW-VX5yDtUYtY*Vgf!A ze~1_pCeF_0|0~2vID4BxF2Wc6)CT1y8IW@_YRhVxlm}LGS4M3G0(~Gi8V2%a7|J4C zJBr90@pehaTvShXlP*KmmAn@m9}a;5ZN@(Y$FCy%rxHs5DAy?0Db!_KRgTD3ZBjWP zTjeeyau5vWqzZ!x(4ACa=mIE>_!#AS${#A5D67_}Y@`5`1`0bFN@T4PCR(MTa$dB` zTqRJn%0R_Sw8}vR%z;kg8;D(iF0)QKdNqz->@z;;w2`Y32e5|3QDpuQ6N$ijHqk|9?&D075Ku z1AYtqPnGTPRez{#g##2@Wu_umjli1CcGFJi0x#(VK& zjPQt)t3*3EHj~H(dMsrFUR$wEW-jvNoKk7!tlF#cjNqHdxk}lj3Zo3bZluRh)R1dc)RC*yR6s}o`?pniumU2(MmVdOM6B{%B~s&n zCczERSp1*L&9rqYIQ2NFdws*TDbZ~Wz{?OMgL3jz;G9)7 z6$ob)Sp~;grBL~uvnpC8g#m7cJ_J2bc?bg7O=^BwyA*53aQULX^q_9?-j^Lq57Co` zU(}Z3&fkl2lz%8T$h1WlR1S((-Bz8 z>o$o&BXRc-&23uD?JVj)O!lJ?E4cx$=olw^pYvY61s9`-rcAcokVy<3-c1fM`#qmP zJUOJGu(RSS#mF*wy# zSSFsqWC_JR)AsVO+H4%pwktxJY{0MhO|aja_|Q7J>(8?$zJUwrO>~OJL{~o3U{vSx zRLDaQZi7dAR-20*n|>l)@DkBg^^(w4YoGp6hMKLK+91;N??n1Kb%1;WzwhNFza)3T zU*fu=N$k_&qjL8uS@^bhl--|+{z(*W81Ev~P8*L1$jNEeV?Lg30x1RRKjhbG#yk+}d2XEh3%`kvUK!sG&FUsw1!?XrzeplHL2w`W8iZF*D{W=j zFzGIl5;86LeR}%(wZg*|6+JOO`FWsKhmBxiwWA-ZQ$|8S;9)=TqkXp8-1rE~W z5KDPp10fMOyUVyk%f=@&WPmbn zU6N~-ipxZm7k*TNrlYb}cH)#b>QOqMfcsR3h6g3rvZnDyvOwpNZ&|SNtmjUbooAJF zo^mVO15}Ghbb{3yeyGg6qFZg%L~t)HOao zIcYIkeR~}7frrxh*h+OzQ_}5}8swKN)N|U+%UKDOv%lERt)|JMVc%<| z;31va9%{>1G3)A?`j5keCMlbt)_o`u=kjLAz)TZ$q6367?LZ1$OJm}PdnC|9NX<#% zq1pC8GYHDCB+stD_p298-v(vg0TP}av_-AHs?txj&Ck6^AUi)wlg8Oq&rs?L_%yGV zf&ugTiDxefOG5{8$2I2SrLlH4f_wRG=L5)f+!Df|sIH`s8t^iC={AoZEvmglFS_zg?fSsKlT+!pz>x7BtVr3;oz%*WMQR1EA3v}(2p zz0X((7p{>oWkHrFDeWyoiITu&iU#);m}=w)%?;j#Bfh=~`H~hvJAvVLpB`(QL?R^y 
zF-tkqOKe6?-M!fu>6&G)-Fe*M^RFlTzqgYJ(YTJ~0L$f_Z=r$s0euq?^Lb%LrG-OH zhI9J+^ac_<#z6p5QsQ)Q36bH=QG%ZzriAyAyo2yO)=Nd+nSar+->0U?zg$eGSn1pA>zL*YSK!7Zw*4u0hz8P&soX|2d34OSH9=%g3Ew06lGnyWU> zk>~aMoMoA(Ct5bZ*vY~9_74qa>xWL&nGHKMeYxw1BsV*#mqyMr zayAaz#nCC=%;;Nm&+nh6HL7{SVCBkv`tr>w`oEHq#9X7f4l=9LuaY}k{iV9p+Ja^~ zXwN%ZKvRE7prt+$YRK$QKyy%VL7*jfqQiFOFq`z#l>}b^H7kA_wW$mh{#z9)*!IQ& zSgh|I+l0-e(`a*3A49oauV`yn=PPCt)6p^2Wi!=6oij^#xZooDZ<)bk{Pd#$3x#VrN6ceU-KP5R zi-vFI$7Orjo3p7a(4+kcHi8WSUQzfr=bDI}qMA(Hx5NniTMD#P_;FnsbNKK}+DFUp zn2q1xJVQ*9f=Mv=D{cB>S$n{JXc)&}@jBsts1bNsOPp-+QycO#Q%Z>8(w2Yciy`+z zjV&87LA<@tel=;&PgluP_S{)H?q{0hsqgl`^-kh7SF~axmSFg-!amaEg@q8DZR-?e zHmUWQZsrrsEa45B?QA#IeXe@D@;Kh0qrDEBNJ(T={9>vf6aJ<~=WCLk*BaC5$;Lct zrVhEi?w|Ei_ zuX1hFWM-7j9IOKxsniWuo+yOgAx0A8Mcn`F&sdQA0)K*6xbo~6`*lJiW*WvffLyjc zGS=HZeJ|`{7CU%pq{|mLP_Vp$jgfH)JnqiO1bvC2VvB(F6dI8k71r)^2c9CWvPwO+ zzymwkV=Rxo`35GrV97gjG}>dHs@*?oLaSab)Deit;Z=JzMX9@*P97jnAmO6iFj5WG9pT5=FLpyge@D)f2X4E-B?*ydUi+~LkYl@{l#w& zs5!%_Cq`CJ4d0U2es2)xp_UD%iXH0{L~hBKxKIqfZvQ0Adci&gVR>QjK?Tms`mxY> zWi7wm=12?7Zkg0&MV~JR{z6DCywFt}?!=&G4+oqxJjKPIpQCKxU)939uAJ=rh9bow zI{oMw5!p|b3^;45?kitYkex>KJ8wAa5X!J?9Wcb(-JvDoj!^l#DO1h;$YoAq!>MxK*wu?y)Af+)J0@tqy! 
zC!w9L*d)WV5mZ~0yn$;gGp4hr#!+Q&=-{F@hv7=Cx&V%kn&2Dk%h72#QI_)mg?)vCE_HbUr#dxP=lBjJ0W+Vlo*T)YClw=^AXuR}?x zVOqht-B=6Uxn{+hH?)6N2pu1&!mMoRC$zaFaCfr7T=iqgxX;8-%kiAVdpJ-qxB(&# zv|;AiX**aGPT?A`S%SiD{+{JjXm}dxK4DtCC1|BCUAkXj&F(lpM{XBETZqta_~CtG zFE={+3o&ZrQ;?}6B$i8UW|IwE4BJF;shQ;j1o<>ROBhDnx-dZ$-hQiFtYNg5g0W7BjE(ZJU9I6 z?JdHfDXCswl9Kr=#Xgs5!y6!NWB$i{c`?{9LXj_DYu+ADGeM>%%FaSotbe##DsfGo z&t>jftdJD$id|{-n#H`rn*;&*oHXj^{5LRqhlgBk#TVI)H(I%fQH!KE!%b*hbYZob z0P+{86*8aH2sUp|`D4Xr9JWSW-YELj0>aTZ{S-fZYBOEe+QCkRhyWaFP>!?1dp%hXIBV}`CCDHb}wSBhzi{U+4Oi5S8%?<}OT1HzgyWONolC?Szn z`%+$a1o5VY;=gfE?Ee$4mvh)~lFc$#U$M*%Ab}1Tc#GkWDrcWj`_mNmqs>aHqGIaP z3xRxDiMPGjDlByDrgw1Ll)`*6)EJAGDcfkbid+*U>oEAdCitOYSDERO)HMTY{EN69 z`>^Rj$A!`OUh;N!Lf^YTZXKtUAYb>JpYck1eSrVMul}%oiRG=2Wy>!&B^=uNZ|t%D zd#18|onR2jChK?^?;SG(TlLYg#RZJkz;H8*v$9oJ6xIU z^wdAkVavka;R_T0m?nBtE^ju|-Ws4KiP--?yd3?3_BJr3)lHM0II76iHs0sfmwICK zg?ew@xo&Xo+G5YhF-nNl7Z=X4=NkU*N8e*3Tgx0bTk7!OZPq^o?5qpWm^P{sV!to> zQ)U+ivvF#m0W{-Be%Z(7`FrWT!U3dTBJvoL*~!}7lsmB5iPgN^cLOQoUj(! 
z1y16>vg01`Hs!_Gn#2e*diymD+ooEFOD49Q@F|}+85gjx?bo_#*&?7bKermnuH^y$ zCmU9p9C_bg<3DdJJ>#)wcUS1 zDkN#pq+#>}0X(8lDQ+FXp%L;2QQe?zGQ_d%L+N5q+hu&klT5n@WkljqbH1dz{en78l?pIy2p z9Oj`+wmkey!rQ^I)TFIO1Fy+FTA^g_*<>4P?2-+j4No-S?-1|2CkKU!Dz~hcLrPbS@ycf?Fy)K z7=2199;c%vHVdcTaF-4Qbt|a;*w*O`7HH_0o6P{dcyP~VA_RP6=fG0fLa`#gd)I}j z^eR_qXx@7Zwe6rqSj}LcXdsytkrDZN`Mt93k@NIGB=IdP^kvZHdKW!Ku~%-}sA8j} zgknqhOlC!%U{)NWhF)_6=b~)EAGEK4+YG&Iq6M{G76(CL5PwuAngD$q zL(A*MBVCDe?-WJYkUT$`eoAXmxSR8U6TdsCCpO8{i@glAh^@UYyZFf|039E>i9z1Y*Fz4+-%e1bz4G*M7MRPYig~u-!}`ODrc1fkrn1n=U{8P-&s?JeK^>BFe(cIIlsjj{Sl5Aq8pvF$kg- zS0@uAEeCxk5TV#R&|17&mzzUj68fX0P|y#`CHN27rwjGx|0nEdKp9O(v~xUbmeOqV zB=sfKsN`Rm9^f#&slCRO&$N{mHpmHIc=gArR6Qw2qt$t6By76w(lpt7uGn_|%bfgF zO&4lFZmW2&jF#@^vP-qnBb5V@0W_Gzm=2;b<79wR`;-k>%5X6Kj>9h*pac!T0c8;YZol;TFzKc&ky$^ zvOj|0X5%2VhE%?pE)#Sv_eKF1)oXT`vKReC68KU-JvoI{AQ)VMORbnGlEUR8i>v;2 zl>xM&bM;{)H*lZjqVrZgZl$t{;qNy|6RmoDVhtZ2&7PT=qv}iSl`LlI!%va_$L&aQk!#jz#Jf*Ypj;&f$MV-eVPWJ)}cY zuLK5R5RsyJ)!!aCWMMYU!a+K6p>RHF`G^o4qR5`!;>#mWU;mM!WDr{YmY`!v%L8Fr z-vacOikAJj zos}QvN9K@cYpxGkQB5nrxw^Jw7QWYKNp}`kIhcX6rHv&;Vq}MyuOjbdFvU?PL0m`y zzcYLm;?25FOY4-TCd1x+MEt59^6g7z72z_3_92QrdeX{S=hRL0M?kNq?=GpQjK%+u z)9sm^?B6R8edWPisq+2Dlw!y%>Uj+jdAQYWJ~Q$|L;V&gzmDENkpnJCLKoKHj=-)b z#|TQOv*fq(EaCbYf7sapx(s(@KAZVTf|R^{E6Db<&z>%(Wq*W&$La*;esS|i5Zp>YSmeRysT{)aHCf!t*lLO2xGi_Yw# z?m20a8$T=mRx130p+rI!j|!07ok!##Ogc;W`6D4?e&`x;R#DDHv{A1yqcM9FL7ExL zfJ^c^e&V3gx(L%S?ofL?UyN=v?3qCG=Lf6LZZ;c7{*8C0l4%wb*Lq@V<<3E`4SQnV z)M*%YQ#|SLk39~r#8Nw{SRbtxFx{AJ=oRHySG(y^ Date: Sat, 8 May 2021 15:16:24 -0500 Subject: [PATCH 10/35] Move C src files to _libs directory --- .../rdata => _libs/src}/librdata/CKHashTable.h | 6 +++--- .../rdata => _libs/src}/librdata/rdata_bits.h | 6 +++--- .../src}/librdata/rdata_internal.h | 6 +++--- .../src}/librdata/rdata_io_unistd.h | 6 +++--- .../rdata => _libs/src}/librdata/win_iconv.h | 6 +++--- pandas/io/rdata/_rdata.pxd | 2 +- pandas/io/rdata/{librdata => }/rdata.h | 6 +++--- setup.py | 18 +++++++++--------- 8 files 
changed, 28 insertions(+), 28 deletions(-) rename pandas/{io/rdata => _libs/src}/librdata/CKHashTable.h (90%) rename pandas/{io/rdata => _libs/src}/librdata/rdata_bits.h (67%) rename pandas/{io/rdata => _libs/src}/librdata/rdata_internal.h (94%) rename pandas/{io/rdata => _libs/src}/librdata/rdata_io_unistd.h (79%) rename pandas/{io/rdata => _libs/src}/librdata/win_iconv.h (84%) rename pandas/io/rdata/{librdata => }/rdata.h (98%) diff --git a/pandas/io/rdata/librdata/CKHashTable.h b/pandas/_libs/src/librdata/CKHashTable.h similarity index 90% rename from pandas/io/rdata/librdata/CKHashTable.h rename to pandas/_libs/src/librdata/CKHashTable.h index 021a04025079d..17190e02e3521 100644 --- a/pandas/io/rdata/librdata/CKHashTable.h +++ b/pandas/_libs/src/librdata/CKHashTable.h @@ -1,8 +1,8 @@ // CKHashTable - A simple hash table // Copyright 2010-2020 Evan Miller (see LICENSE) -#ifndef PANDAS_IO_RDATA_LIBRDATA_CKHASHTABLE_H_ -#define PANDAS_IO_RDATA_LIBRDATA_CKHASHTABLE_H_ +#ifndef PANDAS__LIBS_SRC_LIBRDATA_CKHASHTABLE_H_ +#define PANDAS__LIBS_SRC_LIBRDATA_CKHASHTABLE_H_ #include #include @@ -52,4 +52,4 @@ int ck_hash_table_grow(ck_hash_table_t *table); void ck_hash_table_free(ck_hash_table_t *table); uint64_t ck_hash_str(const char *str, size_t keylen); -#endif // PANDAS_IO_RDATA_LIBRDATA_CKHASHTABLE_H_ +#endif // PANDAS__LIBS_SRC_LIBRDATA_CKHASHTABLE_H_ diff --git a/pandas/io/rdata/librdata/rdata_bits.h b/pandas/_libs/src/librdata/rdata_bits.h similarity index 67% rename from pandas/io/rdata/librdata/rdata_bits.h rename to pandas/_libs/src/librdata/rdata_bits.h index e53128171fd0e..1bd6493dfb230 100644 --- a/pandas/io/rdata/librdata/rdata_bits.h +++ b/pandas/_libs/src/librdata/rdata_bits.h @@ -6,8 +6,8 @@ Copyright (c) 2020 Evan Miller // rdata_bit.h - Bit-twiddling utility functions // -#ifndef PANDAS_IO_RDATA_LIBRDATA_RDATA_BITS_H_ -#define PANDAS_IO_RDATA_LIBRDATA_RDATA_BITS_H_ +#ifndef PANDAS__LIBS_SRC_LIBRDATA_RDATA_BITS_H_ +#define 
PANDAS__LIBS_SRC_LIBRDATA_RDATA_BITS_H_ int machine_is_little_endian(void); @@ -18,4 +18,4 @@ uint64_t byteswap8(uint64_t num); float byteswap_float(float num); double byteswap_double(double num); -#endif // PANDAS_IO_RDATA_LIBRDATA_RDATA_BITS_H_ +#endif // PANDAS__LIBS_SRC_LIBRDATA_RDATA_BITS_H_ diff --git a/pandas/io/rdata/librdata/rdata_internal.h b/pandas/_libs/src/librdata/rdata_internal.h similarity index 94% rename from pandas/io/rdata/librdata/rdata_internal.h rename to pandas/_libs/src/librdata/rdata_internal.h index 56b2108739560..ba1ba11c91f78 100644 --- a/pandas/io/rdata/librdata/rdata_internal.h +++ b/pandas/_libs/src/librdata/rdata_internal.h @@ -6,8 +6,8 @@ Copyright (c) 2020 Evan Miller // rdata_internal.h // -#ifndef PANDAS_IO_RDATA_LIBRDATA_RDATA_INTERNAL_H_ -#define PANDAS_IO_RDATA_LIBRDATA_RDATA_INTERNAL_H_ +#ifndef PANDAS__LIBS_SRC_LIBRDATA_RDATA_INTERNAL_H_ +#define PANDAS__LIBS_SRC_LIBRDATA_RDATA_INTERNAL_H_ #include "rdata_bits.h" @@ -86,4 +86,4 @@ typedef struct rdata_sexptype_info_s { #define RDATA_SEXPTYPE_PAIRLIST_ATTR 239 #define RDATA_PSEUDO_SXP_ALTREP 238 -#endif // PANDAS_IO_RDATA_LIBRDATA_RDATA_INTERNAL_H_ +#endif // PANDAS__LIBS_SRC_LIBRDATA_RDATA_INTERNAL_H_ diff --git a/pandas/io/rdata/librdata/rdata_io_unistd.h b/pandas/_libs/src/librdata/rdata_io_unistd.h similarity index 79% rename from pandas/io/rdata/librdata/rdata_io_unistd.h rename to pandas/_libs/src/librdata/rdata_io_unistd.h index 02cfba60b5720..661010c76c4aa 100644 --- a/pandas/io/rdata/librdata/rdata_io_unistd.h +++ b/pandas/_libs/src/librdata/rdata_io_unistd.h @@ -2,8 +2,8 @@ Copyright (c) 2020 Evan Miller */ -#ifndef PANDAS_IO_RDATA_LIBRDATA_RDATA_IO_UNISTD_H_ -#define PANDAS_IO_RDATA_LIBRDATA_RDATA_IO_UNISTD_H_ +#ifndef PANDAS__LIBS_SRC_LIBRDATA_RDATA_IO_UNISTD_H_ +#define PANDAS__LIBS_SRC_LIBRDATA_RDATA_IO_UNISTD_H_ typedef struct rdata_unistd_io_ctx_s { int fd; @@ -23,4 +23,4 @@ rdata_error_t rdata_unistd_update_handler( ); void 
rdata_unistd_io_init(rdata_parser_t *parser); -#endif // PANDAS_IO_RDATA_LIBRDATA_RDATA_IO_UNISTD_H_ +#endif // PANDAS__LIBS_SRC_LIBRDATA_RDATA_IO_UNISTD_H_ diff --git a/pandas/io/rdata/librdata/win_iconv.h b/pandas/_libs/src/librdata/win_iconv.h similarity index 84% rename from pandas/io/rdata/librdata/win_iconv.h rename to pandas/_libs/src/librdata/win_iconv.h index fcdfbdd571e83..ac30123596971 100644 --- a/pandas/io/rdata/librdata/win_iconv.h +++ b/pandas/_libs/src/librdata/win_iconv.h @@ -5,8 +5,8 @@ * This file is placed in the public domain. */ -#ifndef PANDAS_IO_RDATA_LIBRDATA_WIN_ICONV_H_ -#define PANDAS_IO_RDATA_LIBRDATA_WIN_ICONV_H_ +#ifndef PANDAS__LIBS_SRC_LIBRDATA_WIN_ICONV_H_ +#define PANDAS__LIBS_SRC_LIBRDATA_WIN_ICONV_H_ // #ifndef _LIBICONV_H #define _LIBICONV_H @@ -37,4 +37,4 @@ #endif // #endif -#endif // PANDAS_IO_RDATA_LIBRDATA_WIN_ICONV_H_ +#endif // PANDAS__LIBS_SRC_LIBRDATA_WIN_ICONV_H_ diff --git a/pandas/io/rdata/_rdata.pxd b/pandas/io/rdata/_rdata.pxd index fc38e10cfe5f5..66b829a1e7eec 100644 --- a/pandas/io/rdata/_rdata.pxd +++ b/pandas/io/rdata/_rdata.pxd @@ -13,7 +13,7 @@ from libc.time cimport ( ) -cdef extern from 'librdata/rdata.h': +cdef extern from 'rdata.h': ctypedef enum rdata_type_t: RDATA_TYPE_STRING, diff --git a/pandas/io/rdata/librdata/rdata.h b/pandas/io/rdata/rdata.h similarity index 98% rename from pandas/io/rdata/librdata/rdata.h rename to pandas/io/rdata/rdata.h index 9571f5da4c357..7d49be71f16c4 100644 --- a/pandas/io/rdata/librdata/rdata.h +++ b/pandas/io/rdata/rdata.h @@ -2,8 +2,8 @@ Copyright (c) 2020 Evan Miller */ -#ifndef PANDAS_IO_RDATA_LIBRDATA_RDATA_H_ -#define PANDAS_IO_RDATA_LIBRDATA_RDATA_H_ +#ifndef PANDAS_IO_RDATA_RDATA_H_ +#define PANDAS_IO_RDATA_RDATA_H_ #include #include @@ -254,4 +254,4 @@ rdata_error_t rdata_end_file(rdata_writer_t *writer); } // extern c block #endif -#endif // PANDAS_IO_RDATA_LIBRDATA_RDATA_H_ +#endif // PANDAS_IO_RDATA_RDATA_H_ diff --git a/setup.py b/setup.py index 
09fdd5194224e..7b316e1fc0de1 100755 --- a/setup.py +++ b/setup.py @@ -226,7 +226,7 @@ class CheckSDist(sdist_class): "pandas/_libs/window/indexers.pyx", "pandas/_libs/writers.pyx", "pandas/io/sas/sas.pyx", - "pandas/io/rdata/rdata.pyx", + "pandas/io/rdata/_rdata.pyx", ] _cpp_pyxfiles = [ @@ -437,17 +437,17 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ] rdata_srcs = [ - "pandas/io/rdata/librdata/rdata_parser.c", - "pandas/io/rdata/librdata/rdata_read.c", - "pandas/io/rdata/librdata/rdata_write.c", - "pandas/io/rdata/librdata/rdata_io_unistd.c", - "pandas/io/rdata/librdata/rdata_error.c", - "pandas/io/rdata/librdata/rdata_bits.c", - "pandas/io/rdata/librdata/CKHashTable.c", + "pandas/_libs/src/librdata/rdata_parser.c", + "pandas/_libs/src/librdata/rdata_read.c", + "pandas/_libs/src/librdata/rdata_write.c", + "pandas/_libs/src/librdata/rdata_io_unistd.c", + "pandas/_libs/src/librdata/rdata_error.c", + "pandas/_libs/src/librdata/rdata_bits.c", + "pandas/_libs/src/librdata/CKHashTable.c", ] if is_platform_windows(): - rdata_srcs.append("pandas/io/rdata/librdata/win_iconv.c") + rdata_srcs.append("pandas/_libs/src/librdata/win_iconv.c") ext_data = { "_libs.algos": { From 1ef9e9acc67e4e4a12ccff4658033487da3be339 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sat, 8 May 2021 18:16:04 -0500 Subject: [PATCH 11/35] Adjust C src files to conform to cpplint --- pandas/_libs/src/librdata/CKHashTable.c | 350 +++ pandas/_libs/src/librdata/rdata_bits.c | 52 + pandas/_libs/src/librdata/rdata_error.c | 64 + pandas/_libs/src/librdata/rdata_io_unistd.c | 101 + pandas/_libs/src/librdata/rdata_parser.c | 147 ++ pandas/_libs/src/librdata/rdata_read.c | 2131 ++++++++++++++++++ pandas/_libs/src/librdata/rdata_write.c | 704 ++++++ pandas/_libs/src/librdata/win_iconv.c | 2228 +++++++++++++++++++ 8 files changed, 5777 insertions(+) create mode 100644 pandas/_libs/src/librdata/CKHashTable.c create mode 100644 pandas/_libs/src/librdata/rdata_bits.c create mode 100644 
pandas/_libs/src/librdata/rdata_error.c create mode 100644 pandas/_libs/src/librdata/rdata_io_unistd.c create mode 100644 pandas/_libs/src/librdata/rdata_parser.c create mode 100644 pandas/_libs/src/librdata/rdata_read.c create mode 100644 pandas/_libs/src/librdata/rdata_write.c create mode 100644 pandas/_libs/src/librdata/win_iconv.c diff --git a/pandas/_libs/src/librdata/CKHashTable.c b/pandas/_libs/src/librdata/CKHashTable.c new file mode 100644 index 0000000000000..6178f0360a7d9 --- /dev/null +++ b/pandas/_libs/src/librdata/CKHashTable.c @@ -0,0 +1,350 @@ +// CKHashTable - A simple hash table +// Copyright 2010-2020 Evan Miller (see LICENSE) + +#include "CKHashTable.h" + +/* + SipHash reference C implementation + + Copyright (c) 2012 Jean-Philippe Aumasson + Copyright (c) 2012 Daniel J. Bernstein + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+ */ +#include +#include +#include +typedef uint64_t u64; +typedef uint32_t u32; +typedef uint8_t u8; + + +#define ROTL(x, b) (u64)( ((x) << (b)) | ( (x) >> (64 - (b))) ) + +#define U32TO8_LE(p, v) \ +(p)[0] = (u8)((v) ); (p)[1] = (u8)((v) >> 8); \ +(p)[2] = (u8)((v) >> 16); (p)[3] = (u8)((v) >> 24); + +#define U64TO8_LE(p, v) \ +U32TO8_LE((p), (u32)((v) )); \ +U32TO8_LE((p) + 4, (u32)((v) >> 32)); + +#define U8TO64_LE(p) \ +(((u64)((p)[0])) | \ +((u64)((p)[1]) << 8) | \ +((u64)((p)[2]) << 16) | \ +((u64)((p)[3]) << 24) | \ +((u64)((p)[4]) << 32) | \ +((u64)((p)[5]) << 40) | \ +((u64)((p)[6]) << 48) | \ +((u64)((p)[7]) << 56)) + +#define SIPROUND \ +do { \ +v0 += v1; v1=ROTL(v1, 13); v1 ^= v0; v0=ROTL(v0, 32); \ +v2 += v3; v3=ROTL(v3, 16); v3 ^= v2; \ +v0 += v3; v3=ROTL(v3, 21); v3 ^= v0; \ +v2 += v1; v1=ROTL(v1, 17); v1 ^= v2; v2=ROTL(v2, 32); \ +} while (0) + +/* SipHash-1-2 */ +static int siphash( + unsigned char *out, + const unsigned char *in, + unsigned long long inlen, + const unsigned char *k) { + /* "somepseudorandomlygeneratedbytes" */ + u64 v0 = 0x736f6d6570736575ULL; + u64 v1 = 0x646f72616e646f6dULL; + u64 v2 = 0x6c7967656e657261ULL; + u64 v3 = 0x7465646279746573ULL; + u64 b; + u64 k0 = U8TO64_LE(k); + u64 k1 = U8TO64_LE(k + 8); + u64 m; + const u8 *end = in + inlen - ( inlen % sizeof( u64 ) ); + const int left = inlen & 7; + b = ((u64)inlen) << 56; + v3 ^= k1; + v2 ^= k0; + v1 ^= k1; + v0 ^= k0; + + for ( ; in != end; in += 8 ) { + m = U8TO64_LE(in); + + v3 ^= m; + + SIPROUND; + + v0 ^= m; + } + + switch ( left ) { + case 7: b |= ((u64)in[ 6]) << 48; + + case 6: b |= ((u64)in[ 5]) << 40; + + case 5: b |= ((u64)in[ 4]) << 32; + + case 4: b |= ((u64)in[ 3]) << 24; + + case 3: b |= ((u64)in[ 2]) << 16; + + case 2: b |= ((u64)in[ 1]) << 8; + + case 1: b |= ((u64)in[ 0]); break; + + case 0: break; + } + + v3 ^= b; + + SIPROUND; + + v0 ^= b; + v2 ^= 0xff; + + SIPROUND; + SIPROUND; + + b = v0 ^ v1 ^ v2 ^ v3; + U64TO8_LE(out, b); + return 0; +} + +inline 
uint64_t ck_hash_str(const char *str, size_t keylen) { + uint64_t hash; + unsigned char k[16] = { 0 }; + siphash((unsigned char *)&hash, (const unsigned char *)str, keylen, k); + return hash; +} + +const void *ck_float_hash_lookup(float key, ck_hash_table_t *table) { + return ck_str_n_hash_lookup((const char *)&key, sizeof(float), table); +} + +int ck_float_hash_insert( + float key, + const void *value, + ck_hash_table_t *table +) { + return ck_str_n_hash_insert( + (const char *)&key, + sizeof(float), + value, + table); +} + +const void *ck_double_hash_lookup(double key, ck_hash_table_t *table) { + return ck_str_n_hash_lookup((const char *)&key, sizeof(double), table); +} + +int ck_double_hash_insert( + double key, + const void *value, + ck_hash_table_t *table +) { + return ck_str_n_hash_insert( + (const char *)&key, + sizeof(double), + value, + table); +} + +const void *ck_str_hash_lookup(const char *key, ck_hash_table_t *table) { + size_t keylen = strlen(key); + return ck_str_n_hash_lookup(key, keylen, table); +} + +const void *ck_str_n_hash_lookup( + const char *key, + size_t keylen, + ck_hash_table_t *table +) { + if (table->count == 0) + return NULL; + + if (keylen == 0) + return NULL; + + uint64_t hash_key = ck_hash_str(key, keylen); + hash_key %= table->capacity; + uint64_t end = hash_key; + do { + char *this_key = &table->keys[table->entries[hash_key].key_offset]; + size_t this_keylen = table->entries[hash_key].key_length; + if (this_keylen == 0) + return NULL; + if (this_keylen == keylen && memcmp(this_key, key, keylen) == 0) { + return table->entries[hash_key].value; + } + hash_key++; + hash_key %= table->capacity; + } while (hash_key != end); + return NULL; +} + +int ck_str_hash_insert( + const char *key, + const void *value, + ck_hash_table_t *table +) { + size_t keylen = strlen(key); + return ck_str_n_hash_insert(key, keylen, value, table); +} + +static int ck_hash_insert_nocopy( + off_t key_offset, + size_t keylen, + uint64_t hash_key, + const void 
*value, + ck_hash_table_t *table +) { + if (table->capacity == 0) + return 0; + + hash_key %= table->capacity; + uint64_t end = (hash_key + table->capacity - 1) % table->capacity; + while (hash_key != end) { + ck_hash_entry_t *entry = &table->entries[hash_key]; + if (table->entries[hash_key].key_length == 0) { + table->count++; + entry->key_offset = key_offset; + entry->key_length = keylen; + entry->value = value; + return 1; + } else if (entry->key_length == keylen && + entry->key_offset == key_offset) { + entry->value = value; + return 1; + } + hash_key++; + hash_key %= table->capacity; + } + return 0; +} + +int ck_str_n_hash_insert( + const char *key, + size_t keylen, + const void *value, + ck_hash_table_t *table +) { + if (table->capacity == 0) + return 0; + + if (keylen == 0) + return 0; + + if (table->count >= 0.75 * table->capacity) { + if (ck_hash_table_grow(table) == -1) { + return 0; + } + } + + uint64_t hash_key = ck_hash_str(key, keylen); + hash_key %= table->capacity; + uint64_t end = hash_key; + do { + ck_hash_entry_t *entry = &table->entries[hash_key]; + char *this_key = &table->keys[entry->key_offset]; + if (entry->key_length == 0) { + table->count++; + while (table->keys_used + keylen > table->keys_capacity) { + table->keys_capacity *= 2; + table->keys = realloc(table->keys, table->keys_capacity); + } + memcpy(table->keys + table->keys_used, key, keylen); + entry->key_offset = table->keys_used; + entry->key_length = keylen; + table->keys_used += keylen; + entry->value = value; + return 1; + } else if (entry->key_length == keylen && + memcmp(this_key, key, keylen) == 0) { + table->entries[hash_key].value = value; + return 1; + } + hash_key++; + hash_key %= table->capacity; + } while (hash_key != end); + return 0; +} + +ck_hash_table_t *ck_hash_table_init( + size_t num_entries, + size_t mean_key_length +) { + ck_hash_table_t *table; + if ((table = malloc(sizeof(ck_hash_table_t))) == NULL) + return NULL; + + if ((table->keys = malloc(num_entries * 
mean_key_length)) == NULL) { + free(table); + return NULL; + } + table->keys_capacity = num_entries * mean_key_length; + + num_entries *= 2; + + if ((table->entries = malloc( + num_entries * sizeof(ck_hash_entry_t))) == NULL + ) { + free(table->keys); + free(table); + return NULL; + } + table->capacity = num_entries; + ck_hash_table_wipe(table); + return table; +} + +void ck_hash_table_free(ck_hash_table_t *table) { + free(table->entries); + if (table->keys) + free(table->keys); + free(table); +} + +void ck_hash_table_wipe(ck_hash_table_t *table) { + table->keys_used = 0; + table->count = 0; + memset(table->entries, 0, table->capacity * sizeof(ck_hash_entry_t)); +} + +int ck_hash_table_grow(ck_hash_table_t *table) { + ck_hash_entry_t *old_entries = table->entries; + uint64_t old_capacity = table->capacity; + uint64_t new_capacity = 2 * table->capacity; + if ((table->entries = calloc( + new_capacity, + sizeof(ck_hash_entry_t))) == NULL + ) { + return -1; + } + table->capacity = new_capacity; + table->count = 0; + for (int i = 0; i < old_capacity; i++) { + if (old_entries[i].key_length != 0) { + char *this_key = &table->keys[old_entries[i].key_offset]; + uint64_t hash_key = ck_hash_str( + this_key, + old_entries[i].key_length); + if (!ck_hash_insert_nocopy( + old_entries[i].key_offset, + old_entries[i].key_length, + hash_key, + old_entries[i].value, table) + ) + return -1; + } + } + free(old_entries); + return 0; +} diff --git a/pandas/_libs/src/librdata/rdata_bits.c b/pandas/_libs/src/librdata/rdata_bits.c new file mode 100644 index 0000000000000..dd308d0e5002f --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_bits.c @@ -0,0 +1,52 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +// +// readstat_bits.c - Bit-twiddling utility functions +// + +#include +#include +#include + +#include "rdata_bits.h" + +int machine_is_little_endian() { + int test_byte_order = 1; + return ((char *)&test_byte_order)[0]; +} + +uint16_t byteswap2(uint16_t num) { + return ((num & 0xFF00) >> 
8) | ((num & 0x00FF) << 8); +} + +uint32_t byteswap4(uint32_t num) { + num = ((num & 0xFFFF0000) >> 16) | ((num & 0x0000FFFF) << 16); + return ((num & 0xFF00FF00) >> 8) | ((num & 0x00FF00FF) << 8); +} + +uint64_t byteswap8(uint64_t num) { + num = ((num & 0xFFFFFFFF00000000) >> 32) | + ((num & 0x00000000FFFFFFFF) << 32); + num = ((num & 0xFFFF0000FFFF0000) >> 16) | + ((num & 0x0000FFFF0000FFFF) << 16); + return ((num & 0xFF00FF00FF00FF00) >> 8) | + ((num & 0x00FF00FF00FF00FF) << 8); +} + +float byteswap_float(float num) { + uint32_t answer = 0; + memcpy(&answer, &num, 4); + answer = byteswap4(answer); + memcpy(&num, &answer, 4); + return num; +} + +double byteswap_double(double num) { + uint64_t answer = 0; + memcpy(&answer, &num, 8); + answer = byteswap8(answer); + memcpy(&num, &answer, 8); + return num; +} diff --git a/pandas/_libs/src/librdata/rdata_error.c b/pandas/_libs/src/librdata/rdata_error.c new file mode 100644 index 0000000000000..5a5cabc1f55b7 --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_error.c @@ -0,0 +1,64 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#include "rdata.h" + +const char *rdata_error_message(rdata_error_t error_code) { + if (error_code == RDATA_OK) + return NULL; + + if (error_code == RDATA_ERROR_OPEN) + return "Unable to open file"; + + if (error_code == RDATA_ERROR_SEEK) + return "Unable to seek within file"; + + if (error_code == RDATA_ERROR_READ) + return "Unable to read from file"; + + if (error_code == RDATA_ERROR_MALLOC) + return "Unable to allocate memory"; + + if (error_code == RDATA_ERROR_USER_ABORT) + return "The parsing was aborted (callback returned non-zero value)"; + + if (error_code == RDATA_ERROR_PARSE) + return "Invalid file, or file has unsupported features"; + + if (error_code == RDATA_ERROR_WRITE) + return "Unable to write to file"; + + if (error_code == RDATA_ERROR_FACTOR) + return "The provided column does not support factors"; + + if (error_code == RDATA_ERROR_UNSUPPORTED_COMPRESSION) + return "The file is 
compressed using an unsupported " + "compression scheme"; + + if (error_code == RDATA_ERROR_UNSUPPORTED_CHARSET) + return "File has an unsupported character set"; + + if (error_code == RDATA_ERROR_CONVERT) + return "Unable to convert string to the requested encoding"; + + if (error_code == RDATA_ERROR_CONVERT_BAD_STRING) + return "Unable to convert string to the requested " + "encoding (invalid byte sequence)"; + + if (error_code == RDATA_ERROR_CONVERT_SHORT_STRING) + return "Unable to convert string to the requested " + "encoding (incomplete byte sequence)"; + + if (error_code == RDATA_ERROR_CONVERT_LONG_STRING) + return "Unable to convert string to the requested " + "encoding (output buffer too small)"; + + if (error_code == RDATA_ERROR_UNSUPPORTED_S_EXPRESSION) + return "The file contains an unrecognized object"; + + if (error_code == RDATA_ERROR_UNSUPPORTED_STORAGE_CLASS) + return "The file contains an unrecognized object"; + + return "Unknown error"; +} diff --git a/pandas/_libs/src/librdata/rdata_io_unistd.c b/pandas/_libs/src/librdata/rdata_io_unistd.c new file mode 100644 index 0000000000000..118eb4a64a968 --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_io_unistd.c @@ -0,0 +1,101 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#include +#include +#if defined _WIN32 || defined __CYGWIN__ + #include +#else + #include +#endif + + +#include "rdata.h" +#include "rdata_io_unistd.h" + +#if defined _WIN32 || defined __CYGWIN__ +#define UNISTD_OPEN_OPTIONS O_RDONLY | O_BINARY +#elif defined _AIX +#define UNISTD_OPEN_OPTIONS O_RDONLY | O_LARGEFILE +#else +#define UNISTD_OPEN_OPTIONS O_RDONLY +#endif + +#if defined _WIN32 || defined _AIX +#define lseek lseek +#endif + + +int rdata_unistd_open_handler(const char *path, void *io_ctx) { + int fd = open(path, UNISTD_OPEN_OPTIONS); + ((rdata_unistd_io_ctx_t*) io_ctx)->fd = fd; + return fd; +} + +int rdata_unistd_close_handler(void *io_ctx) { + int fd = ((rdata_unistd_io_ctx_t*) io_ctx)->fd; + if (fd != -1) + return 
close(fd); + else + return 0; +} + +rdata_off_t rdata_unistd_seek_handler( + rdata_off_t offset, + rdata_io_flags_t whence, + void *io_ctx +) { + int flag = 0; + switch (whence) { + case RDATA_SEEK_SET: + flag = SEEK_SET; + break; + case RDATA_SEEK_CUR: + flag = SEEK_CUR; + break; + case RDATA_SEEK_END: + flag = SEEK_END; + break; + default: + return -1; + } + int fd = ((rdata_unistd_io_ctx_t*) io_ctx)->fd; + return lseek(fd, offset, flag); +} + +ssize_t rdata_unistd_read_handler(void *buf, size_t nbyte, void *io_ctx) { + int fd = ((rdata_unistd_io_ctx_t*) io_ctx)->fd; + ssize_t out = read(fd, buf, nbyte); + return out; +} + +rdata_error_t rdata_unistd_update_handler(long file_size, + rdata_progress_handler progress_handler, void *user_ctx, + void *io_ctx) { + if (!progress_handler) + return RDATA_OK; + + int fd = ((rdata_unistd_io_ctx_t*) io_ctx)->fd; + long current_offset = lseek(fd, 0, SEEK_CUR); + + if (current_offset == -1) + return RDATA_ERROR_SEEK; + + if (progress_handler(1.0 * current_offset / file_size, user_ctx)) + return RDATA_ERROR_USER_ABORT; + + return RDATA_OK; +} + +void rdata_unistd_io_init(rdata_parser_t *parser) { + rdata_set_open_handler(parser, rdata_unistd_open_handler); + rdata_set_close_handler(parser, rdata_unistd_close_handler); + rdata_set_seek_handler(parser, rdata_unistd_seek_handler); + rdata_set_read_handler(parser, rdata_unistd_read_handler); + rdata_set_update_handler(parser, rdata_unistd_update_handler); + + rdata_unistd_io_ctx_t *io_ctx = calloc(1, sizeof(rdata_unistd_io_ctx_t)); + io_ctx->fd = -1; + rdata_set_io_ctx(parser, (void*) io_ctx); +} diff --git a/pandas/_libs/src/librdata/rdata_parser.c b/pandas/_libs/src/librdata/rdata_parser.c new file mode 100644 index 0000000000000..5d948a449fba3 --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_parser.c @@ -0,0 +1,147 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#include +#include "rdata.h" +#include "rdata_io_unistd.h" + +rdata_parser_t *rdata_parser_init() { + rdata_parser_t 
*parser = calloc(1, sizeof(rdata_parser_t)); + parser->io = calloc(1, sizeof(rdata_io_t)); + rdata_unistd_io_init(parser); + return parser; +} + +void rdata_parser_free(rdata_parser_t *parser) { + if (parser) { + if (parser->io) + free(parser->io); + free(parser); + } +} + +rdata_error_t rdata_set_table_handler( + rdata_parser_t *parser, + rdata_table_handler table_handler +) { + parser->table_handler = table_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_column_handler( + rdata_parser_t *parser, + rdata_column_handler column_handler +) { + parser->column_handler = column_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_column_name_handler( + rdata_parser_t *parser, + rdata_column_name_handler column_name_handler +) { + parser->column_name_handler = column_name_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_row_name_handler( + rdata_parser_t *parser, + rdata_column_name_handler row_name_handler +) { + parser->row_name_handler = row_name_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_text_value_handler( + rdata_parser_t *parser, + rdata_text_value_handler text_value_handler +) { + parser->text_value_handler = text_value_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_value_label_handler( + rdata_parser_t *parser, + rdata_text_value_handler value_label_handler +) { + parser->value_label_handler = value_label_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_dim_handler( + rdata_parser_t *parser, + rdata_column_handler dim_handler +) { + parser->dim_handler = dim_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_dim_name_handler( + rdata_parser_t *parser, + rdata_text_value_handler dim_name_handler +) { + parser->dim_name_handler = dim_name_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_error_handler( + rdata_parser_t *parser, + rdata_error_handler error_handler +) { + parser->error_handler = error_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_open_handler( + rdata_parser_t 
*parser, + rdata_open_handler open_handler +) { + parser->io->open = open_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_close_handler( + rdata_parser_t *parser, + rdata_close_handler close_handler +) { + parser->io->close = close_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_seek_handler( + rdata_parser_t *parser, + rdata_seek_handler seek_handler +) { + parser->io->seek = seek_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_read_handler( + rdata_parser_t *parser, + rdata_read_handler read_handler +) { + parser->io->read = read_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_update_handler( + rdata_parser_t *parser, + rdata_update_handler update_handler +) { + parser->io->update = update_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_io_ctx( + rdata_parser_t *parser, + void *io_ctx +) { + if (!parser->io->external_io) + free(parser->io->io_ctx); + + parser->io->io_ctx = io_ctx; + parser->io->external_io = 1; + + return RDATA_OK; +} diff --git a/pandas/_libs/src/librdata/rdata_read.c b/pandas/_libs/src/librdata/rdata_read.c new file mode 100644 index 0000000000000..42eda92d61841 --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_read.c @@ -0,0 +1,2131 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +// +// rdata_rdata.c +// + +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include "win_iconv.h" +#else +#include "/usr/include/iconv.h" +#endif + +#include +#include + +#if HAVE_BZIP2 +#include +#endif + +#if HAVE_APPLE_COMPRESSION +#include +#endif + +#if HAVE_ZLIB +#include +#endif + +#if HAVE_LZMA +#include +#endif + +#include "rdata.h" +#include "rdata_internal.h" + +#define RDATA_CLASS_POSIXCT 0x01 +#define RDATA_CLASS_DATE 0x02 + +#define STREAM_BUFFER_SIZE 65536 +#define MAX_ARRAY_DIMENSIONS 3 + +/* ICONV_CONST defined by autotools during configure according + * to the current platform. 
Some people copy-paste the source code, so + * provide some fallback logic */ +#ifndef ICONV_CONST +#define ICONV_CONST +#endif + +typedef struct rdata_atom_table_s { + int count; + char **data; +} rdata_atom_table_t; + +typedef struct rdata_ctx_s { + int machine_needs_byteswap; + rdata_table_handler table_handler; + rdata_column_handler column_handler; + rdata_column_name_handler column_name_handler; + rdata_column_name_handler row_name_handler; + rdata_text_value_handler text_value_handler; + rdata_text_value_handler value_label_handler; + rdata_column_handler dim_handler; + rdata_text_value_handler dim_name_handler; + rdata_error_handler error_handler; + void *user_ctx; +#if HAVE_BZIP2 + bz_stream *bz_strm; +#endif +#if HAVE_APPLE_COMPRESSION + compression_stream *compression_strm; +#endif +#if HAVE_ZLIB + z_stream *z_strm; +#endif +#if HAVE_LZMA + lzma_stream *lzma_strm; +#endif + void *strm_buffer; + rdata_io_t *io; + size_t bytes_read; + + rdata_atom_table_t *atom_table; + unsigned int column_class; + + iconv_t converter; + + int32_t dims[MAX_ARRAY_DIMENSIONS]; + bool is_dimnames; +} rdata_ctx_t; + +static int atom_table_add(rdata_atom_table_t *table, char *key); +static char *atom_table_lookup(rdata_atom_table_t *table, int index); + +static rdata_error_t read_environment( + const char *table_name, + rdata_ctx_t *ctx); +static rdata_error_t read_toplevel_object( + const char *table_name, + const char *key, + rdata_ctx_t *ctx); +static rdata_error_t read_sexptype_header( + rdata_sexptype_info_t *header, + rdata_ctx_t *ctx); +static rdata_error_t read_length( + int32_t *outLength, + rdata_ctx_t *ctx); +static rdata_error_t read_string_vector_n( + int attributes, + int32_t length, + rdata_text_value_handler text_value_handler, + void *callback_ctx, + rdata_ctx_t *ctx); +static rdata_error_t read_string_vector( + int attributes, + rdata_text_value_handler text_value_handler, + void *callback_ctx, + rdata_ctx_t *ctx); +static rdata_error_t read_value_vector( + 
rdata_sexptype_header_t header, + const char *name, + rdata_ctx_t *ctx); +static rdata_error_t read_value_vector_cb( + rdata_sexptype_header_t header, + const char *name, + rdata_column_handler column_handler, + void *user_ctx, + rdata_ctx_t *ctx); +static rdata_error_t read_character_string( + char **key, + rdata_ctx_t *ctx); +static rdata_error_t read_generic_list( + int attributes, + rdata_ctx_t *ctx); +static rdata_error_t read_altrep_vector( + const char *name, + rdata_ctx_t *ctx); +static rdata_error_t read_attributes(int (*handle_attribute)( + char *key, + rdata_sexptype_info_t val_info, + rdata_ctx_t *ctx), + rdata_ctx_t *ctx); +static rdata_error_t recursive_discard( + rdata_sexptype_header_t sexptype_header, + rdata_ctx_t *ctx); + +static void *rdata_malloc(size_t len) { + if (len == 0) + return NULL; + + return malloc(len); +} + +static void *rdata_realloc(void *buf, size_t len) { + if (len == 0) + return NULL; + + return realloc(buf, len); +} + +static int atom_table_add(rdata_atom_table_t *table, char *key) { + table->data = realloc(table->data, sizeof(char *) * (table->count + 1)); + table->data[table->count++] = strdup(key); + return table->count; +} + +static char *atom_table_lookup(rdata_atom_table_t *table, int index) { + if (index <= 0 || index > table->count) { + return NULL; + } + return table->data[(index-1)]; +} + +#if HAVE_BZIP2 +static ssize_t read_st_bzip2(rdata_ctx_t *ctx, void *buffer, size_t len) { + ssize_t bytes_written = 0; + int error = 0; + int result = BZ_OK; + while (1) { + ssize_t start_out = ctx->bz_strm->total_out_lo32 + + ((ssize_t)ctx->bz_strm->total_out_hi32 << 32LL); + + ctx->bz_strm->next_out = (char *)buffer + bytes_written; + ctx->bz_strm->avail_out = len - bytes_written; + + result = BZ2_bzDecompress(ctx->bz_strm); + + if (result != BZ_OK && result != BZ_STREAM_END) { + error = -1; + break; + } + + bytes_written += ctx->bz_strm->total_out_lo32 + + ((ssize_t)ctx->bz_strm->total_out_hi32 << 32LL) - start_out; + + if 
(result == BZ_STREAM_END) + break; + + if (ctx->bz_strm->avail_in == 0) { + int bytes_read = 0; + bytes_read = ctx->io->read( + ctx->strm_buffer, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read < 0) { + error = bytes_read; + break; + } + if (bytes_read == 0) + break; + + ctx->bz_strm->next_in = ctx->strm_buffer; + ctx->bz_strm->avail_in = bytes_read; + } + if (bytes_written == len) + break; + } + + if (error != 0) + return error; + + return bytes_written; +} +#endif /* HAVE_BZIP2 */ + +#if HAVE_APPLE_COMPRESSION +static ssize_t read_st_compression( + rdata_ctx_t *ctx, + void *buffer, + size_t len +) { + ssize_t bytes_written = 0; + int error = 0; + compression_status result = COMPRESSION_STATUS_OK; + size_t start_size = len; + + ctx->compression_strm->dst_ptr = (unsigned char *)buffer; + ctx->compression_strm->dst_size = len; + + while (1) { + start_size = ctx->compression_strm->dst_size; + + result = compression_stream_process(ctx->compression_strm, 0); + + if (result == COMPRESSION_STATUS_OK) { + bytes_written += start_size - ctx->compression_strm->dst_size; + } else { + error = -1; + break; + } + + if (ctx->compression_strm->src_size == 0) { + int bytes_read = 0; + bytes_read = ctx->io->read( + ctx->compression_strm, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read < 0) { + error = bytes_read; + break; + } + if (bytes_read == 0) { + start_size = ctx->compression_strm->dst_size; + result = compression_stream_process( + ctx->compression_strm, + COMPRESSION_STREAM_FINALIZE); + if (result == COMPRESSION_STATUS_END) { + bytes_written += ( + start_size - ctx->compression_strm->dst_size); + } else { + error = -1; + } + break; + } + + ctx->compression_strm->src_ptr = ctx->strm_buffer; + ctx->compression_strm->src_size = bytes_read; + } + if (bytes_written == len) + break; + } + + if (error != 0) + return error; + + return bytes_written; +} +#endif /* HAVE_APPLE_COMPRESSION */ + +#if HAVE_ZLIB +static ssize_t read_st_z(rdata_ctx_t *ctx, void *buffer, 
size_t len) { + ssize_t bytes_written = 0; + int error = 0; + int result = Z_OK; + while (1) { + long start_out = ctx->z_strm->total_out; + + ctx->z_strm->next_out = (unsigned char *)buffer + bytes_written; + ctx->z_strm->avail_out = len - bytes_written; + + result = inflate(ctx->z_strm, Z_SYNC_FLUSH); + + if (result != Z_OK && result != Z_STREAM_END) { + error = -1; + break; + } + + bytes_written += ctx->z_strm->total_out - start_out; + + if (result == Z_STREAM_END) + break; + + if (ctx->z_strm->avail_in == 0) { + int bytes_read = 0; + bytes_read = ctx->io->read( + ctx->strm_buffer, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read < 0) { + error = bytes_read; + break; + } + if (bytes_read == 0) + break; + + ctx->z_strm->next_in = ctx->strm_buffer; + ctx->z_strm->avail_in = bytes_read; + } + if (bytes_written == len) + break; + } + + if (error != 0) + return error; + + return bytes_written; +} +#endif /* HAVE_ZLIB */ + +#if HAVE_LZMA +static ssize_t read_st_lzma(rdata_ctx_t *ctx, void *buffer, size_t len) { + ssize_t bytes_written = 0; + int error = 0; + int result = LZMA_OK; + while (1) { + long start_out = ctx->lzma_strm->total_out; + + ctx->lzma_strm->next_out = (unsigned char *)buffer + bytes_written; + ctx->lzma_strm->avail_out = len - bytes_written; + + result = lzma_code(ctx->lzma_strm, LZMA_RUN); + + if (result != LZMA_OK && result != LZMA_STREAM_END) { + error = -1; + break; + } + + bytes_written += ctx->lzma_strm->total_out - start_out; + + if (result == LZMA_STREAM_END) + break; + + if (ctx->lzma_strm->avail_in == 0) { + int bytes_read = 0; + bytes_read = ctx->io->read( + ctx->strm_buffer, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read < 0) { + error = bytes_read; + break; + } + if (bytes_read == 0) + break; + + ctx->lzma_strm->next_in = ctx->strm_buffer; + ctx->lzma_strm->avail_in = bytes_read; + } + if (bytes_written == len) + break; + } + + if (error != 0) + return error; + + return bytes_written; +} +#endif /* HAVE_LZMA */ + 
+static ssize_t read_st(rdata_ctx_t *ctx, void *buffer, size_t len) { + ssize_t bytes_read = 0; + + if (len == 0) + return 0; + +#if HAVE_BZIP2 + if (ctx->bz_strm) { + bytes_read = read_st_bzip2(ctx, buffer, len); + } else // NOLINT +#endif +#if HAVE_APPLE_COMPRESSION + if (ctx->compression_strm) { + bytes_read = read_st_compression(ctx, buffer, len); + } else // NOLINT +#endif +#if HAVE_ZLIB + if (ctx->z_strm) { + bytes_read = read_st_z(ctx, buffer, len); + } else // NOLINT +#endif +#if HAVE_LZMA + if (ctx->lzma_strm) { + bytes_read = read_st_lzma(ctx, buffer, len); + } else // NOLINT +#endif + { + bytes_read = ctx->io->read(buffer, len, ctx->io->io_ctx); + } + + if (bytes_read > 0) { + ctx->bytes_read += bytes_read; + } + + return bytes_read; +} + +static int lseek_st(rdata_ctx_t *ctx, size_t len) { + if (0 +#if HAVE_BZIP2 + || ctx->bz_strm +#endif +#if HAVE_APPLE_COMPRESSION + || ctx->compression_strm +#endif +#if HAVE_ZLIB + || ctx->z_strm +#endif +#if HAVE_LZMA + || ctx->lzma_strm +#endif + ) { + int retval = 0; + char *buf = rdata_malloc(len); + if (buf == NULL) { + retval = -1; + } else if (read_st(ctx, buf, len) != len) { + retval = -1; + } + if (buf) + free(buf); + + return retval; + } + + return ctx->io->seek(len, SEEK_CUR, ctx->io->io_ctx); +} + +static rdata_error_t init_bz_stream(rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + ctx->strm_buffer = malloc(STREAM_BUFFER_SIZE); + int bytes_read = ctx->io->read( + ctx->strm_buffer, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read <= 0) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + +#if HAVE_BZIP2 + ctx->bz_strm = calloc(1, sizeof(bz_stream)); + ctx->bz_strm->next_in = ctx->strm_buffer; + ctx->bz_strm->avail_in = bytes_read; + + if (BZ2_bzDecompressInit(ctx->bz_strm, 0, 0) != BZ_OK) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } +#else + retval = RDATA_ERROR_UNSUPPORTED_COMPRESSION; + goto cleanup; +#endif + +cleanup: + return retval; +} + +static rdata_error_t 
init_z_stream(rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + ctx->strm_buffer = malloc(STREAM_BUFFER_SIZE); + int bytes_read = ctx->io->read( + ctx->strm_buffer, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read <= 0) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + +#if HAVE_ZLIB + ctx->z_strm = calloc(1, sizeof(z_stream)); + ctx->z_strm->next_in = ctx->strm_buffer; + ctx->z_strm->avail_in = bytes_read; + + if (inflateInit2(ctx->z_strm, (15+32)) != Z_OK) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } +#else + retval = RDATA_ERROR_UNSUPPORTED_COMPRESSION; + goto cleanup; +#endif + +cleanup: + return retval; +} + +static rdata_error_t init_lzma_stream(rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + ctx->strm_buffer = malloc(STREAM_BUFFER_SIZE); + int bytes_read = ctx->io->read( + ctx->strm_buffer, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read <= 0) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + +#if HAVE_APPLE_COMPRESSION + ctx->compression_strm = calloc(1, sizeof(compression_stream)); + + if (compression_stream_init( + ctx->compression_strm, + COMPRESSION_STREAM_DECODE, + COMPRESSION_LZMA) == COMPRESSION_STATUS_ERROR + ) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + + ctx->compression_strm->src_ptr = ctx->strm_buffer; + ctx->compression_strm->src_size = bytes_read; +#elif HAVE_LZMA + ctx->lzma_strm = calloc(1, sizeof(lzma_stream)); + + if (lzma_stream_decoder(ctx->lzma_strm, UINT64_MAX, 0) != LZMA_OK) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + + ctx->lzma_strm->next_in = ctx->strm_buffer; + ctx->lzma_strm->avail_in = bytes_read; +#else + retval = RDATA_ERROR_UNSUPPORTED_COMPRESSION; + goto cleanup; +#endif + +cleanup: + return retval; +} + +static rdata_error_t init_stream(rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + char header[5]; + + if (ctx->io->read( + &header, + sizeof(header), + ctx->io->io_ctx) != sizeof(header) + ) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + 
+ if (ctx->io->seek(0, SEEK_SET, ctx->io->io_ctx) == -1) { + retval = RDATA_ERROR_SEEK; + goto cleanup; + } + + if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h' && + header[3] >= '0' && header[3] <= '9') { + return init_bz_stream(ctx); + } + if (header[0] == '\x1f' && header[1] == '\x8b') { + return init_z_stream(ctx); + } + if (strncmp("\xFD" "7zXZ", header, sizeof(header)) == 0) { + return init_lzma_stream(ctx); + } + +cleanup: + return retval; +} + +static rdata_error_t reset_stream(rdata_ctx_t *ctx) { +#if HAVE_BZIP2 + if (ctx->bz_strm) { + BZ2_bzDecompressEnd(ctx->bz_strm); + free(ctx->bz_strm); + ctx->bz_strm = NULL; + } +#endif +#if HAVE_APPLE_COMPRESSION + if (ctx->compression_strm) { + compression_stream_destroy(ctx->compression_strm); + free(ctx->compression_strm); + ctx->compression_strm = NULL; + } +#endif +#if HAVE_ZLIB + if (ctx->z_strm) { + inflateEnd(ctx->z_strm); + free(ctx->z_strm); + ctx->z_strm = NULL; + } +#endif +#if HAVE_LZMA + if (ctx->lzma_strm) { + lzma_end(ctx->lzma_strm); + free(ctx->lzma_strm); + ctx->lzma_strm = NULL; + } +#endif + + if (ctx->io->seek(0, SEEK_SET, ctx->io->io_ctx) == -1) { + return RDATA_ERROR_SEEK; + } + return init_stream(ctx); +} + +static rdata_error_t rdata_convert( + char *dst, + size_t dst_len, + const char *src, + size_t src_len, + iconv_t converter +) { + if (dst_len == 0) { + return RDATA_ERROR_CONVERT_LONG_STRING; + } else if (converter) { + size_t dst_left = dst_len - 1; + char *dst_end = dst; + size_t status = iconv(converter, ( + ICONV_CONST char **)&src, + &src_len, + &dst_end, + &dst_left); + if (status == (size_t)-1) { + if (errno == E2BIG) { + return RDATA_ERROR_CONVERT_LONG_STRING; + } else if (errno == EILSEQ) { + return RDATA_ERROR_CONVERT_BAD_STRING; + } else if (errno != EINVAL) { + /* EINVAL indicates improper truncation; accept it */ + return RDATA_ERROR_CONVERT; + } + } + dst[dst_len - dst_left - 1] = '\0'; + } else if (src_len + 1 > dst_len) { + return 
RDATA_ERROR_CONVERT_LONG_STRING; + } else { + memcpy(dst, src, src_len); + dst[src_len] = '\0'; + } + return RDATA_OK; +} + +rdata_ctx_t *rdata_ctx_init(rdata_io_t *io, const char *filename) { + int fd = io->open(filename, io->io_ctx); + if (fd == -1) { + return NULL; + } + rdata_ctx_t *ctx = calloc(1, sizeof(rdata_ctx_t)); + rdata_atom_table_t *atom_table = malloc(sizeof(rdata_atom_table_t)); + + atom_table->count = 0; + atom_table->data = NULL; + + ctx->atom_table = atom_table; + + ctx->machine_needs_byteswap = 0; + if (machine_is_little_endian()) { + ctx->machine_needs_byteswap = 1; + } + + ctx->io = io; + + return ctx; +} + +void free_rdata_ctx(rdata_ctx_t *ctx) { + if (ctx->io) { + ctx->io->close(ctx->io->io_ctx); + } + if (ctx->atom_table) { + if (ctx->atom_table->data) { + int i; + for (i=0; i < ctx->atom_table->count; i++) + free(ctx->atom_table->data[i]); + free(ctx->atom_table->data); + } + free(ctx->atom_table); + } +#if HAVE_BZIP2 + if (ctx->bz_strm) { + BZ2_bzDecompressEnd(ctx->bz_strm); + free(ctx->bz_strm); + } +#endif +#if HAVE_APPLE_COMPRESSION + if (ctx->compression_strm) { + compression_stream_destroy(ctx->compression_strm); + free(ctx->compression_strm); + } +#endif +#if HAVE_ZLIB + if (ctx->z_strm) { + inflateEnd(ctx->z_strm); + free(ctx->z_strm); + } +#endif +#if HAVE_LZMA + if (ctx->lzma_strm) { + lzma_end(ctx->lzma_strm); + free(ctx->lzma_strm); + } +#endif + if (ctx->strm_buffer) { + free(ctx->strm_buffer); + } + if (ctx->converter) { + iconv_close(ctx->converter); + } + free(ctx); +} + +rdata_error_t rdata_parse( + rdata_parser_t *parser, + const char *filename, + void *user_ctx +) { + int is_rdata = 0; + rdata_error_t retval = RDATA_OK; + rdata_v2_header_t v2_header; + rdata_ctx_t *ctx = rdata_ctx_init(parser->io, filename); + char *encoding = NULL; + + if (ctx == NULL) { + retval = RDATA_ERROR_OPEN; + goto cleanup; + } + + ctx->user_ctx = user_ctx; + ctx->table_handler = parser->table_handler; + ctx->column_handler = 
parser->column_handler; + ctx->column_name_handler = parser->column_name_handler; + ctx->row_name_handler = parser->row_name_handler; + ctx->text_value_handler = parser->text_value_handler; + ctx->value_label_handler = parser->value_label_handler; + ctx->dim_handler = parser->dim_handler; + ctx->dim_name_handler = parser->dim_name_handler; + ctx->error_handler = parser->error_handler; + + ctx->is_dimnames = false; + + if ((retval = init_stream(ctx)) != RDATA_OK) { + goto cleanup; + } + + char header_line[5]; + if (read_st( + ctx, &header_line, + sizeof(header_line)) != sizeof(header_line) + ) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (memcmp("RDX", header_line, 3) == 0 && header_line[4] == '\n') { + is_rdata = 1; + } else { + reset_stream(ctx); + } + + if (read_st(ctx, &v2_header, sizeof(v2_header)) != sizeof(v2_header)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if (ctx->machine_needs_byteswap) { + v2_header.format_version = byteswap4(v2_header.format_version); + v2_header.writer_version = byteswap4(v2_header.writer_version); + v2_header.reader_version = byteswap4(v2_header.reader_version); + } + + if (is_rdata && v2_header.format_version != header_line[3] - '0') { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + if (v2_header.format_version == 3) { + retval = read_character_string(&encoding, ctx); + if (retval != RDATA_OK) + goto cleanup; + + if (strcmp("UTF-8", encoding) != 0) { + if ((ctx->converter = iconv_open("UTF-8", encoding)) + == (iconv_t)-1 + ) { + ctx->converter = NULL; + retval = RDATA_ERROR_UNSUPPORTED_CHARSET; + goto cleanup; + } + } + } + + if (is_rdata) { + retval = read_environment(NULL, ctx); + } else { + retval = read_toplevel_object(NULL, NULL, ctx); + } + if (retval != RDATA_OK) + goto cleanup; + + char test; + + if (read_st(ctx, &test, 1) == 1) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + +cleanup: + if (encoding) + free(encoding); + if (ctx) { + free_rdata_ctx(ctx); + } + + return retval; +} + + 
+static rdata_error_t read_toplevel_object( + const char *table_name, + const char *key, + rdata_ctx_t *ctx +) { + rdata_sexptype_info_t sexptype_info; + rdata_error_t retval = RDATA_OK; + + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + if (sexptype_info.header.type == RDATA_SEXPTYPE_REAL_VECTOR || + sexptype_info.header.type == RDATA_SEXPTYPE_INTEGER_VECTOR || + sexptype_info.header.type == RDATA_SEXPTYPE_LOGICAL_VECTOR) { + if (table_name == NULL && ctx->table_handler) { + if (ctx->table_handler(key, ctx->user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + + if ((retval = read_value_vector( + sexptype_info.header, + key, + ctx)) != RDATA_OK + ) + goto cleanup; + } else if (sexptype_info.header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR) { + if (table_name == NULL && ctx->table_handler) { + if (ctx->table_handler(key, ctx->user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + int32_t length; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + if (ctx->column_handler) { + if (ctx->column_handler( + key, + RDATA_TYPE_STRING, NULL, + length, ctx->user_ctx) + ) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + + if ((retval = read_string_vector_n( + sexptype_info.header.attributes, + length, + ctx->text_value_handler, + ctx->user_ctx, ctx)) != RDATA_OK) + goto cleanup; + } else if (sexptype_info.header.type == RDATA_PSEUDO_SXP_ALTREP) { + if (table_name == NULL && ctx->table_handler) { + if (ctx->table_handler(key, ctx->user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + if ((retval = read_altrep_vector(key, ctx)) != RDATA_OK) + goto cleanup; + } else if (sexptype_info.header.type == RDATA_SEXPTYPE_GENERIC_VECTOR && + sexptype_info.header.object && sexptype_info.header.attributes) { + if (table_name != NULL) { + retval = recursive_discard(sexptype_info.header, ctx); + } else { + if (ctx->table_handler) { + if 
(ctx->table_handler(key, ctx->user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + retval = read_generic_list(sexptype_info.header.attributes, ctx); + } + if (retval != RDATA_OK) + goto cleanup; + } else { + if ((retval = recursive_discard(sexptype_info.header, ctx)) + != RDATA_OK + ) + goto cleanup; + } + +cleanup: + + return retval; +} + +static rdata_error_t read_environment( + const char *table_name, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + char *key = NULL; + + while (1) { + rdata_sexptype_info_t sexptype_info; + + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + if (sexptype_info.header.type == RDATA_PSEUDO_SXP_NIL) + break; + + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + if ((retval = recursive_discard( + sexptype_info.header, + ctx)) != RDATA_OK) + goto cleanup; + continue; + } + + if ((key = atom_table_lookup( + ctx->atom_table, + sexptype_info.ref)) == NULL) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + if ((retval = read_toplevel_object(table_name, key, ctx)) != RDATA_OK) + goto cleanup; + } + +cleanup: + + return retval; +} + +static rdata_error_t read_sexptype_header( + rdata_sexptype_info_t *header_info, + rdata_ctx_t *ctx +) { + uint32_t sexptype; + rdata_sexptype_header_t header; + rdata_error_t retval = RDATA_OK; + if (read_st(ctx, &sexptype, sizeof(sexptype)) != sizeof(sexptype)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) + sexptype = byteswap4(sexptype); + + memcpy(&header, &sexptype, sizeof(sexptype)); + uint32_t attributes = 0, tag = 0, ref = 0; + + if (header.type == RDATA_SEXPTYPE_PAIRLIST_ATTR) { + header.attributes = 1; + header.type = RDATA_SEXPTYPE_PAIRLIST; + } + if (header.type == RDATA_SEXPTYPE_LANGUAGE_OBJECT_ATTR) { + header.attributes = 1; + header.type = RDATA_SEXPTYPE_LANGUAGE_OBJECT; + } + if (header.type == RDATA_SEXPTYPE_PAIRLIST) { + if (header.attributes) { + if (read_st( + 
ctx, + &attributes, + sizeof(attributes)) != sizeof(attributes) + ) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) + header_info->attributes = byteswap4(header_info->attributes); + } + if (header.tag) { + if (read_st(ctx, &tag, sizeof(tag)) != sizeof(tag)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) + tag = byteswap4(tag); + } + + if (tag == 1) { + rdata_sexptype_info_t key_info; + + if ((retval = read_sexptype_header(&key_info, ctx)) != RDATA_OK) + goto cleanup; + + if (key_info.header.type != RDATA_SEXPTYPE_CHARACTER_STRING) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + char *key = NULL; + if ((retval = read_character_string(&key, ctx)) != RDATA_OK) + goto cleanup; + + ref = atom_table_add(ctx->atom_table, key); + + free(key); + } else if ((tag & 0xFF) == RDATA_PSEUDO_SXP_REF) { + ref = (tag >> 8); + } + } + if (header.type == RDATA_PSEUDO_SXP_REF) { + ref = (sexptype >> 8); + } + + header_info->header = header; + header_info->attributes = attributes; + header_info->tag = tag; + header_info->ref = ref; + +cleanup: + + return retval; +} + +static int handle_class_name(const char *buf, int i, void *ctx) { + unsigned int *column_class = (unsigned int *)ctx; + if (buf) { + if (strcmp(buf, "POSIXct") == 0) { + *column_class |= RDATA_CLASS_POSIXCT; + } + if (strcmp(buf, "Date") == 0) { + *column_class |= RDATA_CLASS_DATE; + } + } + return RDATA_OK; +} + +static int handle_vector_attribute( + char *key, + rdata_sexptype_info_t val_info, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + if (strcmp(key, "levels") == 0) { + retval = read_string_vector( + val_info.header.attributes, + ctx->value_label_handler, + ctx->user_ctx, ctx); + } else if (strcmp(key, "class") == 0) { + ctx->column_class = 0; + retval = read_string_vector( + val_info.header.attributes, + &handle_class_name, + &ctx->column_class, ctx); + } else if (strcmp(key, "dim") == 0) { + if (val_info.header.type 
== RDATA_SEXPTYPE_INTEGER_VECTOR) { + int32_t length; + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + if (length <= sizeof(ctx->dims)/sizeof(ctx->dims[0])) { + int buf_len = length * sizeof(int32_t); + if (read_st(ctx, ctx->dims, buf_len) != buf_len) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) { + int i; + for (i=0; i < length; i++) { + ctx->dims[i] = byteswap4(ctx->dims[i]); + } + } + if (ctx->dim_handler) { + if (ctx->dim_handler( + key, + RDATA_TYPE_INT32, + ctx->dims, length, + ctx->user_ctx) + ) { + retval = RDATA_ERROR_USER_ABORT; + } + } + } + } + } else if (strcmp(key, "dimnames") == 0) { + ctx->is_dimnames = true; + retval = read_generic_list(val_info.header.attributes, ctx); + } else { + retval = recursive_discard(val_info.header, ctx); + } +cleanup: + return retval; +} + +static rdata_error_t read_character_string(char **key, rdata_ctx_t *ctx) { + uint32_t length; + char *string = NULL; + char *utf8_string = NULL; + rdata_error_t retval = RDATA_OK; + + if (read_st(ctx, &length, sizeof(length)) != sizeof(length)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if (ctx->machine_needs_byteswap) + length = byteswap4(length); + + if (length == -1 || length == 0) { + *key = strdup(""); + return RDATA_OK; + } + + if (length < 0) { + return RDATA_ERROR_PARSE; + } + + if ((string = rdata_malloc(length)) == NULL) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + + if (read_st(ctx, string, length) != length) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if ((utf8_string = rdata_malloc(4*length+1)) == NULL) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + + retval = rdata_convert( + utf8_string, + 4 * length + 1, + string, length, + ctx->converter); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + if (string) + free(string); + + if (retval == RDATA_OK) { + *key = utf8_string; + } else if (utf8_string) { + free(utf8_string); + } + + return retval; +} + +static int 
handle_data_frame_attribute( + char *key, + rdata_sexptype_info_t val_info, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + + if (strcmp(key, "names") == 0 && + val_info.header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR + ) { + retval = read_string_vector( + val_info.header.attributes, + ctx->column_name_handler, ctx->user_ctx, ctx); + } else if (strcmp(key, "row.names") == 0 && + val_info.header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR + ) { + retval = read_string_vector( + val_info.header.attributes, + ctx->row_name_handler, + ctx->user_ctx, ctx); + } else if (strcmp(key, "label.table") == 0) { + retval = recursive_discard(val_info.header, ctx); + } else { + retval = recursive_discard(val_info.header, ctx); + } + + return retval; +} + +static rdata_error_t read_attributes(int (*handle_attribute)( + char *key, + rdata_sexptype_info_t val_info, + rdata_ctx_t *ctx), + rdata_ctx_t *ctx +) { + rdata_sexptype_info_t pairlist_info, val_info; + rdata_error_t retval = RDATA_OK; + char *key = NULL; + + retval = read_sexptype_header(&pairlist_info, ctx); + if (retval != RDATA_OK) + goto cleanup; + + while (pairlist_info.header.type == RDATA_SEXPTYPE_PAIRLIST) { + /* value */ + if ((retval = read_sexptype_header(&val_info, ctx)) != RDATA_OK) + goto cleanup; + + if (handle_attribute) { + if ((key = atom_table_lookup( + ctx->atom_table, pairlist_info.ref)) == NULL) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + if ((retval = handle_attribute(key, val_info, ctx)) != RDATA_OK) + goto cleanup; + } else { + if ((retval = recursive_discard( + val_info.header, + ctx)) != RDATA_OK + ) + goto cleanup; + } + + /* next */ + if ((retval = read_sexptype_header(&pairlist_info, ctx)) != RDATA_OK) + goto cleanup; + } + +cleanup: + return retval; +} + +static rdata_error_t read_wrap_real(const char *name, rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t sexptype_info; + /* pairlist */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != 
RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + /* representation */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + if ((retval = read_value_vector( + sexptype_info.header, + name, + ctx)) != RDATA_OK + ) + goto cleanup; + + /* alt representation */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if ((retval = recursive_discard(sexptype_info.header, ctx)) != RDATA_OK) + goto cleanup; + + /* nil */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_PSEUDO_SXP_NIL) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + +cleanup: + return retval; +} + +static rdata_error_t read_compact_intseq( + const char *name, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t sexptype_info; + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + int32_t length; + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + if (length != 3) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + double vals[3]; + if (read_st(ctx, vals, sizeof(vals)) != sizeof(vals)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) { + vals[0] = byteswap_double(vals[0]); + vals[1] = byteswap_double(vals[1]); + vals[2] = byteswap_double(vals[2]); + } + + if (sexptype_info.header.attributes) { + if ((retval = read_attributes( + &handle_vector_attribute, ctx)) != RDATA_OK + ) + goto cleanup; + } + + if (ctx->column_handler) { + int32_t *integers = rdata_malloc(vals[0] * sizeof(int32_t)); + int32_t val = vals[1]; + for (int i=0; i < vals[0]; i++) { + integers[i] = val; + val += vals[2]; + } + int cb_retval = ctx->column_handler( + name, + RDATA_TYPE_INT32, + integers, + vals[0], ctx->user_ctx); + free(integers); + if (cb_retval) { + 
retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + + /* nil */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_PSEUDO_SXP_NIL) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } +cleanup: + return retval; +} + +static int deferred_string_handler( + const char *name, + enum rdata_type_e type, + void *vals, + long length, + void *user_ctx +) { + rdata_ctx_t *ctx = (rdata_ctx_t *)user_ctx; + if (ctx->column_handler) + ctx->column_handler( + name, + RDATA_TYPE_STRING, + NULL, + length, + ctx->user_ctx); + if (ctx->text_value_handler) { + for (int i=0; i < length; i++) { + char buf[128] = { 0 }; + if (type == RDATA_TYPE_REAL) { + snprintf(buf, sizeof(buf), "%.0lf", ((double *)vals)[i]); + } else if (type == RDATA_TYPE_INT32) { + snprintf(buf, sizeof(buf), "%d", ((int32_t *)vals)[i]); + } + ctx->text_value_handler(buf, i, ctx->user_ctx); + } + } + return 0; +} + +static rdata_error_t read_deferred_string( + const char *name, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t sexptype_info; + /* pairlist */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + /* representation */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + if ((retval = read_value_vector_cb( + sexptype_info.header, + name, + &deferred_string_handler, + ctx, + ctx)) != RDATA_OK + ) + goto cleanup; + + /* alt representation */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if ((retval = recursive_discard(sexptype_info.header, ctx)) != RDATA_OK) + goto cleanup; + + /* nil */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_PSEUDO_SXP_NIL) { + retval = RDATA_ERROR_PARSE; + 
goto cleanup; + } + +cleanup: + return retval; +} + +static rdata_error_t read_altrep_vector( + const char *name, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t sexptype_info; + /* pairlist */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + /* class name */ + char *class = NULL; + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type == RDATA_SEXPTYPE_SYMBOL) { + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_CHARACTER_STRING) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + if ((retval = read_character_string(&class, ctx)) != RDATA_OK) + goto cleanup; + + atom_table_add(ctx->atom_table, class); + } else if (sexptype_info.header.type == RDATA_PSEUDO_SXP_REF) { + if ((class = atom_table_lookup( + ctx->atom_table, + sexptype_info.ref)) == NULL + ) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + } else { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + /* package and class ID */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + if ((retval = recursive_discard(sexptype_info.header, ctx)) != RDATA_OK) + goto cleanup; + + if (strcmp(class, "wrap_real") == 0) { + if ((retval = read_wrap_real(name, ctx)) != RDATA_OK) + goto cleanup; + } else if (strcmp(class, "compact_intseq") == 0) { + if ((retval = read_compact_intseq(name, ctx)) != RDATA_OK) + goto cleanup; + } else if (strcmp(class, "deferred_string") == 0) { + if ((retval = read_deferred_string(name, ctx)) != RDATA_OK) + goto cleanup; + } else { + if (ctx->error_handler) { + char error_buf[1024]; + snprintf( + 
error_buf, + sizeof(error_buf), + "Unrecognized ALTREP class: %s\n", + class); + ctx->error_handler(error_buf, ctx->user_ctx); + } + retval = RDATA_ERROR_UNSUPPORTED_STORAGE_CLASS; + } +cleanup: + return retval; +} + +static rdata_error_t read_generic_list(int attributes, rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + int32_t length; + int i; + rdata_sexptype_info_t sexptype_info; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + for (i=0; i < length; i++) { + if ((retval = read_sexptype_header( + &sexptype_info, ctx)) != RDATA_OK + ) + goto cleanup; + + if (sexptype_info.header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR) { + int32_t vec_length; + + if ((retval = read_length(&vec_length, ctx)) != RDATA_OK) + goto cleanup; + if (ctx->is_dimnames) { + retval = read_string_vector_n( + sexptype_info.header.attributes, + vec_length, + ctx->dim_name_handler, + ctx->user_ctx, ctx); + } else { + if (ctx->column_handler) { + if (ctx->column_handler( + NULL, + RDATA_TYPE_STRING, + NULL, + vec_length, + ctx->user_ctx) + ) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + retval = read_string_vector_n( + sexptype_info.header.attributes, + vec_length, + ctx->text_value_handler, + ctx->user_ctx, ctx); + } + } else if (sexptype_info.header.type == RDATA_PSEUDO_SXP_ALTREP) { + retval = read_altrep_vector(NULL, ctx); + } else if (sexptype_info.header.type == RDATA_PSEUDO_SXP_NIL) { + if (ctx->is_dimnames && + ctx->dim_name_handler && + i < sizeof(ctx->dims)/sizeof(ctx->dims[0]) + ) { + int j; + for (j=0; j < ctx->dims[i]; j++) { + ctx->dim_name_handler(NULL, j, ctx->user_ctx); + } + } + } else { + retval = read_value_vector(sexptype_info.header, NULL, ctx); + } + if (retval != RDATA_OK) + goto cleanup; + } + + if (attributes) { + if ((retval = read_attributes( + &handle_data_frame_attribute, + ctx)) != RDATA_OK + ) + goto cleanup; + } + +cleanup: + + if (ctx->is_dimnames) + ctx->is_dimnames = false; + + return retval; +} + +static 
rdata_error_t read_length(int32_t *outLength, rdata_ctx_t *ctx) { + int32_t length; + rdata_error_t retval = RDATA_OK; + + if (read_st(ctx, &length, sizeof(length)) != sizeof(length)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if (ctx->machine_needs_byteswap) + length = byteswap4(length); + + if (outLength) + *outLength = length; + +cleanup: + + return retval; +} + +static rdata_error_t read_string_vector_n( + int attributes, + int32_t length, + rdata_text_value_handler text_value_handler, + void *callback_ctx, + rdata_ctx_t *ctx +) { + int32_t string_length; + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t info; + size_t buffer_size = 4096; + char *buffer = NULL; + size_t utf8_buffer_size = 16384; + char *utf8_buffer = NULL; + int i; + + buffer = rdata_malloc(buffer_size); + if (ctx->converter) + utf8_buffer = rdata_malloc(utf8_buffer_size); + + for (i=0; i < length; i++) { + if ((retval = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if (info.header.type != RDATA_SEXPTYPE_CHARACTER_STRING) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + if ((retval = read_length(&string_length, ctx)) != RDATA_OK) + goto cleanup; + + if (string_length + 1 > buffer_size) { + buffer_size = string_length + 1; + if ((buffer = rdata_realloc(buffer, buffer_size)) == NULL) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + } + + if (string_length >= 0) { + if (read_st(ctx, buffer, string_length) != string_length) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + buffer[string_length] = '\0'; + } + + if (text_value_handler) { + int cb_retval = 0; + if (string_length < 0) { + cb_retval = text_value_handler(NULL, i, callback_ctx); + } else if (!ctx->converter) { + cb_retval = text_value_handler(buffer, i, callback_ctx); + } else { + if (4*string_length + 1 > utf8_buffer_size) { + utf8_buffer_size = 4*string_length + 1; + if ((utf8_buffer = rdata_realloc( + utf8_buffer, utf8_buffer_size)) == NULL + ) { + retval = RDATA_ERROR_MALLOC; + 
goto cleanup; + } + } + retval = rdata_convert( + utf8_buffer, + utf8_buffer_size, + buffer, string_length, + ctx->converter); + if (retval != RDATA_OK) + goto cleanup; + + cb_retval = text_value_handler(utf8_buffer, i, callback_ctx); + } + if (cb_retval) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + } + + if (attributes) { + if ((retval = read_attributes( + &handle_vector_attribute, + ctx)) != RDATA_OK) + goto cleanup; + } + +cleanup: + + if (buffer) + free(buffer); + if (utf8_buffer) + free(utf8_buffer); + + return retval; +} + +static rdata_error_t read_string_vector( + int attributes, + rdata_text_value_handler text_value_handler, + void *callback_ctx, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + int32_t length; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + return retval; + + return read_string_vector_n( + attributes, + length, + text_value_handler, + callback_ctx, + ctx); +} + +static rdata_error_t read_value_vector_cb( + rdata_sexptype_header_t header, + const char *name, + rdata_column_handler column_handler, + void *user_ctx, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + int32_t length; + size_t input_elem_size = 0; + void *vals = NULL; + size_t buf_len = 0; + enum rdata_type_e output_data_type; + int i; + + switch (header.type) { + case RDATA_SEXPTYPE_REAL_VECTOR: + input_elem_size = sizeof(double); + output_data_type = RDATA_TYPE_REAL; + break; + case RDATA_SEXPTYPE_INTEGER_VECTOR: + input_elem_size = sizeof(int32_t); + output_data_type = RDATA_TYPE_INT32; + break; + case RDATA_SEXPTYPE_LOGICAL_VECTOR: + input_elem_size = sizeof(int32_t); + output_data_type = RDATA_TYPE_LOGICAL; + break; + default: + retval = RDATA_ERROR_PARSE; + break; + } + if (retval != RDATA_OK) + goto cleanup; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + buf_len = length * input_elem_size; + + if (buf_len) { + vals = rdata_malloc(buf_len); + if (vals == NULL) { + retval = 
RDATA_ERROR_MALLOC; + goto cleanup; + } + + if (read_st(ctx, vals, buf_len) != buf_len) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if (ctx->machine_needs_byteswap) { + if (input_elem_size == sizeof(double)) { + double *d_vals = (double *)vals; + for (i=0; i < buf_len/sizeof(double); i++) { + d_vals[i] = byteswap_double(d_vals[i]); + } + } else { + uint32_t *i_vals = (uint32_t *)vals; + for (i=0; i < buf_len/sizeof(uint32_t); i++) { + i_vals[i] = byteswap4(i_vals[i]); + } + } + } + } + + ctx->column_class = 0; + if (header.attributes) { + if ((retval = read_attributes( + &handle_vector_attribute, + ctx)) != RDATA_OK) + goto cleanup; + } + if (ctx->column_class == RDATA_CLASS_POSIXCT) + output_data_type = RDATA_TYPE_TIMESTAMP; + if (ctx->column_class == RDATA_CLASS_DATE) + output_data_type = RDATA_TYPE_DATE; + + if (column_handler) { + if (column_handler(name, output_data_type, vals, length, user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + +cleanup: + if (vals) + free(vals); + + return retval; +} + +static rdata_error_t read_value_vector( + rdata_sexptype_header_t header, + const char *name, + rdata_ctx_t *ctx +) { + return read_value_vector_cb( + header, + name, + ctx->column_handler, + ctx->user_ctx, ctx); +} + +static rdata_error_t discard_vector( + rdata_sexptype_header_t sexptype_header, + size_t element_size, + rdata_ctx_t *ctx +) { + int32_t length; + rdata_error_t retval = RDATA_OK; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + if (length > 0) { + if (lseek_st(ctx, length * element_size) == -1) { + return RDATA_ERROR_SEEK; + } + } else if (ctx->error_handler) { + char error_buf[1024]; + snprintf( + error_buf, + sizeof(error_buf), + "Vector with non-positive length: %d\n", + length); + ctx->error_handler(error_buf, ctx->user_ctx); + } + + if (sexptype_header.attributes) { + rdata_sexptype_info_t temp_info; + if ((retval = read_sexptype_header(&temp_info, ctx)) != RDATA_OK) + goto cleanup; + + 
retval = recursive_discard(temp_info.header, ctx); + } + +cleanup: + + return retval; +} + +static rdata_error_t discard_character_string( + int add_to_table, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + char *key = NULL; + + if ((retval = read_character_string(&key, ctx)) != RDATA_OK) + goto cleanup; + + if (strlen(key) > 0 && add_to_table) { + atom_table_add(ctx->atom_table, key); + } + + free(key); + +cleanup: + + return retval; +} + +static rdata_error_t discard_pairlist( + rdata_sexptype_header_t sexptype_header, + rdata_ctx_t *ctx +) { + rdata_sexptype_info_t temp_info; + rdata_error_t error = 0; + while (1) { + switch (sexptype_header.type) { + case RDATA_SEXPTYPE_PAIRLIST: + /* value */ + if ((error = read_sexptype_header( + &temp_info, + ctx)) != RDATA_OK) + return error; + if ((error = recursive_discard( + temp_info.header, + ctx)) != RDATA_OK) + return error; + + /* tail */ + if ((error = read_sexptype_header( + &temp_info, + ctx)) != RDATA_OK) + return error; + sexptype_header = temp_info.header; + break; + case RDATA_PSEUDO_SXP_NIL: + goto done; + default: + return RDATA_ERROR_PARSE; + } + } +done: + + return 0; +} + +static rdata_error_t recursive_discard( + rdata_sexptype_header_t sexptype_header, + rdata_ctx_t *ctx +) { + uint32_t length; + rdata_sexptype_info_t info; + rdata_sexptype_info_t prot, tag; + + rdata_error_t error = 0; + int i; + + switch (sexptype_header.type) { + case RDATA_SEXPTYPE_SYMBOL: + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + break; + case RDATA_PSEUDO_SXP_PERSIST: + case RDATA_PSEUDO_SXP_NAMESPACE: + case RDATA_PSEUDO_SXP_PACKAGE: + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + break; + case RDATA_SEXPTYPE_BUILTIN_FUNCTION: + case RDATA_SEXPTYPE_SPECIAL_FUNCTION: + error = 
discard_character_string(0, ctx); + break; + case RDATA_SEXPTYPE_PAIRLIST: + error = discard_pairlist(sexptype_header, ctx); + break; + case RDATA_SEXPTYPE_CHARACTER_STRING: + error = discard_character_string(1, ctx); + break; + case RDATA_SEXPTYPE_RAW_VECTOR: + error = discard_vector(sexptype_header, 1, ctx); + break; + case RDATA_SEXPTYPE_LOGICAL_VECTOR: + error = discard_vector(sexptype_header, 4, ctx); + break; + case RDATA_SEXPTYPE_INTEGER_VECTOR: + error = discard_vector(sexptype_header, 4, ctx); + break; + case RDATA_SEXPTYPE_REAL_VECTOR: + error = discard_vector(sexptype_header, 8, ctx); + break; + case RDATA_SEXPTYPE_COMPLEX_VECTOR: + error = discard_vector(sexptype_header, 16, ctx); + break; + case RDATA_SEXPTYPE_CHARACTER_VECTOR: + case RDATA_SEXPTYPE_GENERIC_VECTOR: + case RDATA_SEXPTYPE_EXPRESSION_VECTOR: + if (read_st(ctx, &length, sizeof(length)) != sizeof(length)) { + return RDATA_ERROR_READ; + } + if (ctx->machine_needs_byteswap) + length = byteswap4(length); + + for (i=0; i < length; i++) { + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if (sexptype_header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR) { + if (info.header.type != RDATA_SEXPTYPE_CHARACTER_STRING) { + error = RDATA_ERROR_PARSE; + goto cleanup; + } + + if ((error = discard_character_string(0, ctx)) != RDATA_OK) + goto cleanup; + } else if ((error = recursive_discard( + info.header, + ctx)) != RDATA_OK) { + goto cleanup; + } + } + if (sexptype_header.attributes) { + if ((error = read_attributes(NULL, ctx)) != RDATA_OK) + goto cleanup; + } + break; + case RDATA_SEXPTYPE_DOT_DOT_DOT: + case RDATA_SEXPTYPE_PROMISE: + case RDATA_SEXPTYPE_LANGUAGE_OBJECT: + case RDATA_SEXPTYPE_CLOSURE: + if (sexptype_header.attributes) { + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + } + if (sexptype_header.tag) { + if ((error = read_sexptype_header(&info, ctx)) 
!= RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + } + /* CAR */ + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + + /* CDR */ + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + break; + case RDATA_SEXPTYPE_EXTERNAL_POINTER: + read_sexptype_header(&prot, ctx); + recursive_discard(prot.header, ctx); + + read_sexptype_header(&tag, ctx); + recursive_discard(tag.header, ctx); + break; + case RDATA_SEXPTYPE_ENVIRONMENT: + /* locked */ + if (lseek_st(ctx, sizeof(uint32_t)) == -1) { + return RDATA_ERROR_SEEK; + } + + rdata_sexptype_info_t enclosure, frame, hash_table, attributes; + read_sexptype_header(&enclosure, ctx); + recursive_discard(enclosure.header, ctx); + + read_sexptype_header(&frame, ctx); + recursive_discard(frame.header, ctx); + + read_sexptype_header(&hash_table, ctx); + recursive_discard(hash_table.header, ctx); + + read_sexptype_header(&attributes, ctx); + recursive_discard(attributes.header, ctx); + /* + if (sexptype_header.attributes) { + if (lseek(ctx->fd, sizeof(uint32_t), SEEK_CUR) == -1) { + return RDATA_ERROR_SEEK; + } + } */ + break; + case RDATA_PSEUDO_SXP_REF: + case RDATA_PSEUDO_SXP_NIL: + case RDATA_PSEUDO_SXP_GLOBAL_ENVIRONMENT: + case RDATA_PSEUDO_SXP_UNBOUND_VALUE: + case RDATA_PSEUDO_SXP_MISSING_ARGUMENT: + case RDATA_PSEUDO_SXP_BASE_NAMESPACE: + case RDATA_PSEUDO_SXP_EMPTY_ENVIRONMENT: + case RDATA_PSEUDO_SXP_BASE_ENVIRONMENT: + break; + case RDATA_PSEUDO_SXP_ALTREP: + /* class, package, type */ + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + + while (1) { + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + if 
(info.header.type == RDATA_SEXPTYPE_PAIRLIST) + continue; + if (info.header.type == RDATA_PSEUDO_SXP_NIL) + break; + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + } + break; + default: + if (ctx->error_handler) { + char error_buf[1024]; + snprintf( + error_buf, + sizeof(error_buf), + "Unhandled S-Expression: %d", + sexptype_header.type); + ctx->error_handler(error_buf, ctx->user_ctx); + } + return RDATA_ERROR_UNSUPPORTED_S_EXPRESSION; + } +cleanup: + + return error; +} diff --git a/pandas/_libs/src/librdata/rdata_write.c b/pandas/_libs/src/librdata/rdata_write.c new file mode 100644 index 0000000000000..3a53f595e877e --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_write.c @@ -0,0 +1,704 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#include +#include +#include + +#include "CKHashTable.h" +#include "rdata.h" +#include "rdata_internal.h" + +#define R_TAG 0x01 +#define R_OBJECT 0x02 +#define R_ATTRIBUTES 0x04 + +#define INITIAL_COLUMNS_CAPACITY 100 + +#ifdef _WIN32 +#define timegm _mkgmtime +#endif + +rdata_writer_t *rdata_writer_init( + rdata_data_writer write_callback, + rdata_file_format_t format +) { + rdata_writer_t *writer = calloc(1, sizeof(rdata_writer_t)); + writer->file_format = format; + writer->bswap = machine_is_little_endian(); + writer->atom_table = ck_hash_table_init(100, 24); + writer->data_writer = write_callback; + + writer->columns_capacity = INITIAL_COLUMNS_CAPACITY; + writer->columns = malloc( + writer->columns_capacity * sizeof(rdata_column_t *)); + + return writer; +} + +void rdata_writer_free(rdata_writer_t *writer) { + ck_hash_table_free(writer->atom_table); + int i, j; + for (i=0; i < writer->columns_count; i++) { + rdata_column_t *column = writer->columns[i]; + for (j=0; j < column->factor_count; j++) { + free(column->factor[j]); + } + free(column->factor); + free(column); + } + free(writer->columns); + free(writer); +} + +rdata_column_t *rdata_add_column( + rdata_writer_t *writer, + const char *name, 
+ rdata_type_t type +) { + if (writer->columns_count == writer->columns_capacity) { + writer->columns_capacity *= 2; + writer->columns = realloc(writer->columns, + writer->columns_capacity * sizeof(rdata_column_t *)); + } + rdata_column_t *new_column = calloc(1, sizeof(rdata_column_t)); + + new_column->index = writer->columns_count++; + + writer->columns[new_column->index] = new_column; + + new_column->type = type; + + if (name) { + snprintf(new_column->name, sizeof(new_column->name), "%s", name); + } + + return new_column; +} + +rdata_column_t *rdata_get_column(rdata_writer_t *writer, int32_t j) { + return writer->columns[j]; +} + +rdata_error_t rdata_column_set_label( + rdata_column_t *column, + const char *label +) { + snprintf(column->label, sizeof(column->label), "%s", label); + return RDATA_OK; +} + +rdata_error_t rdata_column_add_factor( + rdata_column_t *column, + const char *factor +) { + if (column->type != RDATA_TYPE_INT32) + return RDATA_ERROR_FACTOR; + + char *factor_copy = malloc(strlen(factor)+1); + strcpy(factor_copy, factor); // NOLINT + + column->factor_count++; + column->factor = realloc( + column->factor, + sizeof(char *) * column->factor_count); + column->factor[column->factor_count-1] = factor_copy; + + return RDATA_OK; +} + +static rdata_error_t rdata_write_bytes( + rdata_writer_t *writer, + const void *data, size_t len +) { + size_t bytes_written = writer->data_writer(data, len, writer->user_ctx); + if (bytes_written < len) { + return RDATA_ERROR_WRITE; + } + writer->bytes_written += bytes_written; + return RDATA_OK; +} + +static rdata_error_t rdata_write_integer( + rdata_writer_t *writer, + int32_t val +) { + if (writer->bswap) { + val = byteswap4(val); + } + return rdata_write_bytes(writer, &val, sizeof(val)); +} + +static rdata_error_t rdata_write_double(rdata_writer_t *writer, double val) { + if (writer->bswap) { + val = byteswap_double(val); + } + return rdata_write_bytes(writer, &val, sizeof(val)); +} + +static rdata_error_t 
rdata_write_header( + rdata_writer_t *writer, + int type, + int flags +) { + rdata_sexptype_header_t header; + memset(&header, 0, sizeof(header)); + + header.type = type; + header.object = !!(flags & R_OBJECT); + header.tag = !!(flags & R_TAG); + header.attributes = !!(flags & R_ATTRIBUTES); + + uint32_t sexp_int; + + memcpy(&sexp_int, &header, sizeof(header)); + + return rdata_write_integer(writer, sexp_int); +} + +static rdata_error_t rdata_write_string( + rdata_writer_t *writer, + const char *string +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_header(writer, RDATA_SEXPTYPE_CHARACTER_STRING, 0); + if (retval != RDATA_OK) + goto cleanup; + + ssize_t len = string ? strlen(string) : -1; + + retval = rdata_write_integer(writer, len); + if (retval != RDATA_OK) + goto cleanup; + + if (len > 0) + return rdata_write_bytes(writer, string, len); + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_pairlist_key( + rdata_writer_t *writer, + const char *key +) { + rdata_error_t retval = RDATA_OK; + ck_hash_table_t *atom_table = (ck_hash_table_t *)writer->atom_table; + uint64_t ref = (uint64_t)ck_str_hash_lookup(key, atom_table); + if (ref == 0) { + ck_str_hash_insert(key, (void *)(atom_table->count + 1), atom_table); + + retval = rdata_write_integer(writer, 1); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_string(writer, key); + } else { + retval = rdata_write_integer(writer, (ref << 8) | 0xFF); + } + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_pairlist_header( + rdata_writer_t *writer, + const char *key +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_header(writer, RDATA_SEXPTYPE_PAIRLIST, R_TAG); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_pairlist_key(writer, key); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_attributed_vector_header( + rdata_writer_t *writer, int type, + int32_t size +) { + 
rdata_error_t retval = RDATA_OK; + + retval = rdata_write_header(writer, type, R_OBJECT | R_ATTRIBUTES); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_integer(writer, size); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_simple_vector_header( + rdata_writer_t *writer, + int type, + int32_t size +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_header(writer, type, 0); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_integer(writer, size); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_class_pairlist( + rdata_writer_t *writer, + const char *class +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_pairlist_header(writer, "class"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, + 1); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_string(writer, class); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +rdata_error_t rdata_begin_file( + rdata_writer_t *writer, + void *user_ctx +) { + rdata_error_t retval = RDATA_OK; + + writer->user_ctx = user_ctx; + + if (writer->file_format == RDATA_WORKSPACE) { + retval = rdata_write_bytes(writer, "RDX2\n", 5); + if (retval != RDATA_OK) + goto cleanup; + } + + rdata_v2_header_t v2_header; + memcpy(v2_header.header, "X\n", sizeof("X\n")-1); + v2_header.format_version = 2; + v2_header.reader_version = 131840; + v2_header.writer_version = 131840; + + if (writer->bswap) { + v2_header.format_version = byteswap4(v2_header.format_version); + v2_header.reader_version = byteswap4(v2_header.reader_version); + v2_header.writer_version = byteswap4(v2_header.writer_version); + } + + retval = rdata_write_bytes(writer, &v2_header, sizeof(v2_header)); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + 
+rdata_error_t rdata_begin_table( + rdata_writer_t *writer, + const char *variable_name +) { + rdata_error_t retval = RDATA_OK; + + if (writer->file_format == RDATA_WORKSPACE) { + retval = rdata_write_pairlist_header(writer, variable_name); + if (retval != RDATA_OK) + goto cleanup; + } + + retval = rdata_write_attributed_vector_header( + writer, + RDATA_SEXPTYPE_GENERIC_VECTOR, + writer->columns_count); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_begin_factor_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_attributed_vector_header( + writer, + RDATA_SEXPTYPE_INTEGER_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_factor_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + int i; + + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_pairlist_header(writer, "levels"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header(writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, column->factor_count); + if (retval != RDATA_OK) + goto cleanup; + + for (i=0; i < column->factor_count; i++) { + retval = rdata_write_string(writer, column->factor[i]); + if (retval != RDATA_OK) + goto cleanup; + } + + retval = rdata_write_class_pairlist(writer, "factor"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_begin_real_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_REAL_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_real_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + return RDATA_OK; +} + +static rdata_error_t rdata_begin_timestamp_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + 
return rdata_write_attributed_vector_header( + writer, + RDATA_SEXPTYPE_REAL_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_timestamp_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_class_pairlist(writer, "POSIXct"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_begin_date_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_attributed_vector_header( + writer, + RDATA_SEXPTYPE_REAL_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_date_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_class_pairlist(writer, "Date"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_begin_integer_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_INTEGER_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_integer_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + return RDATA_OK; +} + +static rdata_error_t rdata_begin_logical_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_LOGICAL_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_logical_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + return RDATA_OK; +} + +static rdata_error_t rdata_begin_string_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_simple_vector_header( + writer, + 
RDATA_SEXPTYPE_CHARACTER_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_string_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + return RDATA_OK; +} + +rdata_error_t rdata_begin_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + rdata_type_t type = column->type; + + if (type == RDATA_TYPE_INT32) { + if (column->factor_count) + return rdata_begin_factor_column(writer, column, row_count); + return rdata_begin_integer_column(writer, column, row_count); + } + if (type == RDATA_TYPE_REAL) + return rdata_begin_real_column(writer, column, row_count); + if (type == RDATA_TYPE_TIMESTAMP) + return rdata_begin_timestamp_column(writer, column, row_count); + if (type == RDATA_TYPE_DATE) + return rdata_begin_date_column(writer, column, row_count); + if (type == RDATA_TYPE_LOGICAL) + return rdata_begin_logical_column(writer, column, row_count); + if (type == RDATA_TYPE_STRING) + return rdata_begin_string_column(writer, column, row_count); + + return RDATA_OK; +} + +rdata_error_t rdata_append_real_value( + rdata_writer_t *writer, + double value +) { + return rdata_write_double(writer, value); +} + +rdata_error_t rdata_append_int32_value( + rdata_writer_t *writer, + int32_t value +) { + return rdata_write_integer(writer, value); +} + +rdata_error_t rdata_append_timestamp_value( + rdata_writer_t *writer, + time_t value +) { + return rdata_write_double(writer, value); +} + +rdata_error_t rdata_append_date_value( + rdata_writer_t *writer, + struct tm *value +) { + return rdata_write_double(writer, timegm(value) / 86400); +} + +rdata_error_t rdata_append_logical_value( + rdata_writer_t *writer, + int value +) { + if (value < 0) + return rdata_write_integer(writer, INT32_MIN); + + return rdata_write_integer(writer, (value > 0)); +} + +rdata_error_t rdata_append_string_value( + rdata_writer_t *writer, + const char *value +) { + return rdata_write_string(writer, value); +} + +rdata_error_t rdata_end_column( + rdata_writer_t 
*writer, + rdata_column_t *column +) { + rdata_type_t type = column->type; + + if (type == RDATA_TYPE_INT32) { + if (column->factor_count) + return rdata_end_factor_column(writer, column); + return rdata_end_integer_column(writer, column); + } + if (type == RDATA_TYPE_REAL) + return rdata_end_real_column(writer, column); + if (type == RDATA_TYPE_TIMESTAMP) + return rdata_end_timestamp_column(writer, column); + if (type == RDATA_TYPE_DATE) + return rdata_end_date_column(writer, column); + if (type == RDATA_TYPE_LOGICAL) + return rdata_end_logical_column(writer, column); + if (type == RDATA_TYPE_STRING) + return rdata_end_string_column(writer, column); + + return RDATA_OK; +} + +rdata_error_t rdata_end_table( + rdata_writer_t *writer, + int32_t row_count, + const char *datalabel +) { + int i; + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_pairlist_header(writer, "datalabel"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, + 1); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_string(writer, datalabel); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_pairlist_header(writer, "names"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header(writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, writer->columns_count); + if (retval != RDATA_OK) + goto cleanup; + + for (i=0; i < writer->columns_count; i++) { + retval = rdata_write_string(writer, writer->columns[i]->name); + if (retval != RDATA_OK) + goto cleanup; + } + + retval = rdata_write_pairlist_header(writer, "var.labels"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header(writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, writer->columns_count); + if (retval != RDATA_OK) + goto cleanup; + + for (i=0; i < writer->columns_count; i++) { + retval = rdata_write_string(writer, writer->columns[i]->label); + if (retval != RDATA_OK) + goto 
cleanup; + } + + retval = rdata_write_class_pairlist(writer, "data.frame"); + if (retval != RDATA_OK) + goto cleanup; + + if (row_count > 0) { + retval = rdata_write_pairlist_header(writer, "row.names"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header(writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, row_count); + if (retval != RDATA_OK) + goto cleanup; + + char buf[128]; + for (i=0; i < row_count; i++) { + snprintf(buf, sizeof(buf), "%d", i+1); + retval = rdata_write_string(writer, buf); + if (retval != RDATA_OK) + goto cleanup; + } + } + + retval = rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +rdata_error_t rdata_end_file(rdata_writer_t *writer) { + if (writer->file_format == RDATA_WORKSPACE) + return rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + + return RDATA_OK; +} diff --git a/pandas/_libs/src/librdata/win_iconv.c b/pandas/_libs/src/librdata/win_iconv.c new file mode 100644 index 0000000000000..23d9938b3d795 --- /dev/null +++ b/pandas/_libs/src/librdata/win_iconv.c @@ -0,0 +1,2228 @@ +/* + * No Copyright. + * + * iconv implementation using Win32 API to convert. + * This file is placed in the public domain. 
+ */ + +/* for WC_NO_BEST_FIT_CHARS */ +#ifndef WINVER +# define WINVER 0x0500 +#endif + +#define STRICT +#include "win_iconv.h" +#include +#include +#include +#include + +#ifdef __GNUC__ +#define UNUSED __attribute__((unused)) +#else +#define UNUSED +#endif + +/* WORKAROUND: */ +#ifndef UNDER_CE +#define GetProcAddressA GetProcAddress +#endif + +#if 0 +# define MAKE_EXE +# define MAKE_DLL +# define USE_LIBICONV_DLL +#endif + +#if !defined(DEFAULT_LIBICONV_DLL) +# define DEFAULT_LIBICONV_DLL "" +#endif + +#define MB_CHAR_MAX 16 + +#define UNICODE_MODE_BOM_DONE 1 +#define UNICODE_MODE_SWAPPED 2 + +#define FLAG_USE_BOM 1 +#define FLAG_TRANSLIT 2 +#define FLAG_IGNORE 4 + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; + +typedef void* iconv_t; + +iconv_t iconv_open(const char *tocode, const char *fromcode); +int iconv_close(iconv_t cd); +size_t iconv( + iconv_t cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft); + +/* libiconv interface for vim */ +#if defined(MAKE_DLL) +int iconvctl(iconv_t cd, int request, void* argument) { + /* not supported */ + return 0; +} +#endif + +typedef struct compat_t compat_t; +typedef struct csconv_t csconv_t; +typedef struct rec_iconv_t rec_iconv_t; + +typedef iconv_t (*f_iconv_open)(const char *tocode, const char *fromcode); +typedef int (*f_iconv_close)(iconv_t cd); +typedef size_t (*f_iconv)( + iconv_t cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft); +typedef int* (*f_errno)(void); +typedef int (*f_mbtowc)( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +typedef int (*f_wctomb)( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +typedef int (*f_mblen)(csconv_t *cv, const uchar *buf, int bufsize); +typedef int (*f_flush)(csconv_t *cv, uchar *buf, int bufsize); + +#define COMPAT_IN 1 +#define COMPAT_OUT 2 + +/* unicode mapping for compatibility with 
other conversion table. */ +struct compat_t { + uint in; + uint out; + uint flag; +}; + +struct csconv_t { + int codepage; + int flags; + f_mbtowc mbtowc; + f_wctomb wctomb; + f_mblen mblen; + f_flush flush; + DWORD mode; + compat_t *compat; +}; + +struct rec_iconv_t { + iconv_t cd; + f_iconv_close iconv_close; + f_iconv iconv; + f_errno _errno; + csconv_t from; + csconv_t to; +#if defined(USE_LIBICONV_DLL) + HMODULE hlibiconv; +#endif +}; + +static int win_iconv_open( + rec_iconv_t *cd, + const char *tocode, + const char *fromcode); +static int win_iconv_close(iconv_t cd); +static size_t win_iconv( + iconv_t cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft); + +static int load_mlang(void); +static int make_csconv(const char *name, csconv_t *cv); +static int name_to_codepage(const char *name); +static uint utf16_to_ucs4(const ushort *wbuf); +static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize); +static int mbtowc_flags(int codepage); +static int must_use_null_useddefaultchar(int codepage); +static char *strrstr(const char *str, const char *token); +static char *xstrndup(const char *s, size_t n); +static int seterror(int err); + +#if defined(USE_LIBICONV_DLL) +static int libiconv_iconv_open( + rec_iconv_t *cd, + const char *tocode, + const char *fromcode); +static PVOID MyImageDirectoryEntryToData( + LPVOID Base, + BOOLEAN MappedAsImage, + USHORT DirectoryEntry, + PULONG Size); +static FARPROC find_imported_function( + HMODULE hModule, + const char *funcname); + +static HMODULE hwiniconv; +#endif + +static int sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); +static int dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); +static int mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); +static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize); +static int eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize); + +static int kernel_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, 
+ ushort *wbuf, + int *wbufsize); +static int kernel_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int mlang_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +static int mlang_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int utf16_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +static int utf16_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int utf32_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +static int utf32_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int iso2022jp_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +static int iso2022jp_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int iso2022jp_flush( + csconv_t *cv, + uchar *buf, + int bufsize); + +static struct { + int codepage; + const char *name; +} codepage_alias[] = { + {65001, "CP65001"}, + {65001, "UTF8"}, + {65001, "UTF-8"}, + + {1200, "CP1200"}, + {1200, "UTF16LE"}, + {1200, "UTF-16LE"}, + {1200, "UCS2LE"}, + {1200, "UCS-2LE"}, + {1200, "UCS-2-INTERNAL"}, + + {1201, "CP1201"}, + {1201, "UTF16BE"}, + {1201, "UTF-16BE"}, + {1201, "UCS2BE"}, + {1201, "UCS-2BE"}, + {1201, "unicodeFFFE"}, + + {12000, "CP12000"}, + {12000, "UTF32LE"}, + {12000, "UTF-32LE"}, + {12000, "UCS4LE"}, + {12000, "UCS-4LE"}, + + {12001, "CP12001"}, + {12001, "UTF32BE"}, + {12001, "UTF-32BE"}, + {12001, "UCS4BE"}, + {12001, "UCS-4BE"}, + +#ifndef GLIB_COMPILATION + /* + * Default is big endian. + * See rfc2781 4.3 Interpreting text labelled as UTF-16. 
+ */ + {1201, "UTF16"}, + {1201, "UTF-16"}, + {1201, "UCS2"}, + {1201, "UCS-2"}, + {12001, "UTF32"}, + {12001, "UTF-32"}, + {12001, "UCS-4"}, + {12001, "UCS4"}, +#else + /* Default is little endian, because the platform is */ + {1200, "UTF16"}, + {1200, "UTF-16"}, + {1200, "UCS2"}, + {1200, "UCS-2"}, + {12000, "UTF32"}, + {12000, "UTF-32"}, + {12000, "UCS4"}, + {12000, "UCS-4"}, +#endif + + /* copy from libiconv `iconv -l` */ + /* !IsValidCodePage(367) */ + {20127, "ANSI_X3.4-1968"}, + {20127, "ANSI_X3.4-1986"}, + {20127, "ASCII"}, + {20127, "CP367"}, + {20127, "IBM367"}, + {20127, "ISO-IR-6"}, + {20127, "ISO646-US"}, + {20127, "ISO_646.IRV:1991"}, + {20127, "US"}, + {20127, "US-ASCII"}, + {20127, "CSASCII"}, + + /* !IsValidCodePage(819) */ + {1252, "CP819"}, + {1252, "IBM819"}, + {28591, "ISO-8859-1"}, + {28591, "ISO-IR-100"}, + {28591, "ISO8859-1"}, + {28591, "ISO_8859-1"}, + {28591, "ISO_8859-1:1987"}, + {28591, "L1"}, + {28591, "LATIN1"}, + {28591, "CSISOLATIN1"}, + + {1250, "CP1250"}, + {1250, "MS-EE"}, + {1250, "WINDOWS-1250"}, + + {1251, "CP1251"}, + {1251, "MS-CYRL"}, + {1251, "WINDOWS-1251"}, + + {1252, "CP1252"}, + {1252, "MS-ANSI"}, + {1252, "WINDOWS-1252"}, + + {1253, "CP1253"}, + {1253, "MS-GREEK"}, + {1253, "WINDOWS-1253"}, + + {1254, "CP1254"}, + {1254, "MS-TURK"}, + {1254, "WINDOWS-1254"}, + + {1255, "CP1255"}, + {1255, "MS-HEBR"}, + {1255, "WINDOWS-1255"}, + + {1256, "CP1256"}, + {1256, "MS-ARAB"}, + {1256, "WINDOWS-1256"}, + + {1257, "CP1257"}, + {1257, "WINBALTRIM"}, + {1257, "WINDOWS-1257"}, + + {1258, "CP1258"}, + {1258, "WINDOWS-1258"}, + + {850, "850"}, + {850, "CP850"}, + {850, "IBM850"}, + {850, "CSPC850MULTILINGUAL"}, + + /* !IsValidCodePage(862) */ + {862, "862"}, + {862, "CP862"}, + {862, "IBM862"}, + {862, "CSPC862LATINHEBREW"}, + + {866, "866"}, + {866, "CP866"}, + {866, "IBM866"}, + {866, "CSIBM866"}, + + /* !IsValidCodePage(154) */ + {154, "CP154"}, + {154, "CYRILLIC-ASIAN"}, + {154, "PT154"}, + {154, "PTCP154"}, + {154, 
"CSPTCP154"}, + + /* !IsValidCodePage(1133) */ + {1133, "CP1133"}, + {1133, "IBM-CP1133"}, + + {874, "CP874"}, + {874, "WINDOWS-874"}, + + /* !IsValidCodePage(51932) */ + {51932, "CP51932"}, + {51932, "MS51932"}, + {51932, "WINDOWS-51932"}, + {51932, "EUC-JP"}, + + {932, "CP932"}, + {932, "MS932"}, + {932, "SHIFFT_JIS"}, + {932, "SHIFFT_JIS-MS"}, + {932, "SJIS"}, + {932, "SJIS-MS"}, + {932, "SJIS-OPEN"}, + {932, "SJIS-WIN"}, + {932, "WINDOWS-31J"}, + {932, "WINDOWS-932"}, + {932, "CSWINDOWS31J"}, + + {50221, "CP50221"}, + {50221, "ISO-2022-JP"}, + {50221, "ISO-2022-JP-MS"}, + {50221, "ISO2022-JP"}, + {50221, "ISO2022-JP-MS"}, + {50221, "MS50221"}, + {50221, "WINDOWS-50221"}, + + {936, "CP936"}, + {936, "GBK"}, + {936, "MS936"}, + {936, "WINDOWS-936"}, + + {950, "CP950"}, + {950, "BIG5"}, + {950, "BIG5HKSCS"}, + {950, "BIG5-HKSCS"}, + + {949, "CP949"}, + {949, "UHC"}, + {949, "EUC-KR"}, + + {1361, "CP1361"}, + {1361, "JOHAB"}, + + {437, "437"}, + {437, "CP437"}, + {437, "IBM437"}, + {437, "CSPC8CODEPAGE437"}, + + {737, "CP737"}, + + {775, "CP775"}, + {775, "IBM775"}, + {775, "CSPC775BALTIC"}, + + {852, "852"}, + {852, "CP852"}, + {852, "IBM852"}, + {852, "CSPCP852"}, + + /* !IsValidCodePage(853) */ + {853, "CP853"}, + + {855, "855"}, + {855, "CP855"}, + {855, "IBM855"}, + {855, "CSIBM855"}, + + {857, "857"}, + {857, "CP857"}, + {857, "IBM857"}, + {857, "CSIBM857"}, + + /* !IsValidCodePage(858) */ + {858, "CP858"}, + + {860, "860"}, + {860, "CP860"}, + {860, "IBM860"}, + {860, "CSIBM860"}, + + {861, "861"}, + {861, "CP-IS"}, + {861, "CP861"}, + {861, "IBM861"}, + {861, "CSIBM861"}, + + {863, "863"}, + {863, "CP863"}, + {863, "IBM863"}, + {863, "CSIBM863"}, + + {864, "CP864"}, + {864, "IBM864"}, + {864, "CSIBM864"}, + + {865, "865"}, + {865, "CP865"}, + {865, "IBM865"}, + {865, "CSIBM865"}, + + {869, "869"}, + {869, "CP-GR"}, + {869, "CP869"}, + {869, "IBM869"}, + {869, "CSIBM869"}, + + /* !IsValidCodePage(1152) */ + {1125, "CP1125"}, + + /* + * Code Page Identifiers 
+ * http://msdn2.microsoft.com/en-us/library/ms776446.aspx + */ + {37, "IBM037"}, /* IBM EBCDIC US-Canada */ + {437, "IBM437"}, /* OEM United States */ + {500, "IBM500"}, /* IBM EBCDIC International */ + {708, "ASMO-708"}, /* Arabic (ASMO 708) */ + /* 709 Arabic (ASMO-449+, BCON V4) */ + /* 710 Arabic - Transparent Arabic */ + {720, "DOS-720"}, /* Arabic (Transparent ASMO); Arabic (DOS) */ + {737, "ibm737"}, /* OEM Greek (formerly 437G); Greek (DOS) */ + {775, "ibm775"}, /* OEM Baltic; Baltic (DOS) */ + {850, "ibm850"}, /* OEM Multilingual Latin 1; Western European (DOS) */ + {852, "ibm852"}, /* OEM Latin 2; Central European (DOS) */ + {855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */ + {857, "ibm857"}, /* OEM Turkish; Turkish (DOS) */ + {858, "IBM00858"}, /* OEM Multilingual Latin 1 + Euro symbol */ + {860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */ + {861, "ibm861"}, /* OEM Icelandic; Icelandic (DOS) */ + {862, "DOS-862"}, /* OEM Hebrew; Hebrew (DOS) */ + {863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */ + {864, "IBM864"}, /* OEM Arabic; Arabic (864) */ + {865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */ + {866, "cp866"}, /* OEM Russian; Cyrillic (DOS) */ + {869, "ibm869"}, /* OEM Modern Greek; Greek, Modern (DOS) */ + /* + * IBM EBCDIC Multilingual/ROECE (Latin 2); + * IBM EBCDIC Multilingual Latin 2 + */ + {870, "IBM870"}, + /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */ + {874, "windows-874"}, + {875, "cp875"}, /* IBM EBCDIC Greek Modern */ + {932, "shift_jis"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ + {932, "shift-jis"}, /* alternative name for it */ + /* + * ANSI/OEM Simplified Chinese (PRC, Singapore); + * Chinese Simplified (GB2312) + */ + {936, "gb2312"}, + {949, "ks_c_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */ + /* + * ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); + * Chinese Traditional (Big5) + */ + {950, "big5"}, + /* + * ANSI/OEM Traditional Chinese (Hong Kong SAR); + * 
Chinese Traditional (Big5-HKSCS)
     */
    {950, "big5hkscs"},
    {950, "big5-hkscs"},  /* alternative name for it */
    {1026, "IBM1026"},    /* IBM EBCDIC Turkish (Latin 5) */
    {1047, "IBM01047"},   /* IBM EBCDIC Latin 1/Open System */
    /*
     * IBM EBCDIC US-Canada (037 + Euro symbol);
     * IBM EBCDIC (US-Canada-Euro)
     */
    {1140, "IBM01140"},
    /*
     * IBM EBCDIC Germany (20273 + Euro symbol);
     * IBM EBCDIC (Germany-Euro)
     */
    {1141, "IBM01141"},
    /*
     * IBM EBCDIC Denmark-Norway (20277 + Euro symbol);
     * IBM EBCDIC (Denmark-Norway-Euro)
     */
    {1142, "IBM01142"},
    /*
     * IBM EBCDIC Finland-Sweden (20278 + Euro symbol);
     * IBM EBCDIC (Finland-Sweden-Euro)
     */
    {1143, "IBM01143"},
    /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */
    {1144, "IBM01144"},
    /*
     * IBM EBCDIC Latin America-Spain (20284 + Euro symbol);
     * IBM EBCDIC (Spain-Euro)
     */
    {1145, "IBM01145"},
    /*
     * IBM EBCDIC United Kingdom (20285 + Euro symbol);
     * IBM EBCDIC (UK-Euro)
     */
    {1146, "IBM01146"},
    /*
     * IBM EBCDIC France (20297 + Euro symbol);
     * IBM EBCDIC (France-Euro)
     */
    {1147, "IBM01147"},  /* FIX: trailing comma was missing here, which is a
                            compile error inside an initializer list */
    /*
     * IBM EBCDIC International (500 + Euro symbol);
     * IBM EBCDIC (International-Euro)
     */
    {1148, "IBM01148"},
    /*
     * IBM EBCDIC Icelandic (20871 + Euro symbol);
     * IBM EBCDIC (Icelandic-Euro)
     */
    {1149, "IBM01149"},
    /* ANSI Central European; Central European (Windows) */
    {1250, "windows-1250"},
    {1251, "windows-1251"},  /* ANSI Cyrillic; Cyrillic (Windows) */
    {1252, "windows-1252"},  /* ANSI Latin 1; Western European (Windows) */
    {1253, "windows-1253"},  /* ANSI Greek; Greek (Windows) */
    {1254, "windows-1254"},  /* ANSI Turkish; Turkish (Windows) */
    {1255, "windows-1255"},  /* ANSI Hebrew; Hebrew (Windows) */
    {1256, "windows-1256"},  /* ANSI Arabic; Arabic (Windows) */
    {1257, "windows-1257"},  /* ANSI Baltic; Baltic (Windows) */
    {1258, "windows-1258"},  /* ANSI/OEM Vietnamese; Vietnamese (Windows) */
    {1361, "Johab"},  /* Korean (Johab) */
{10000, "macintosh"}, /* MAC Roman; Western European (Mac) */ + {10001, "x-mac-japanese"}, /* Japanese (Mac) */ + /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ + {10002, "x-mac-chinesetrad"}, + {10003, "x-mac-korean"}, /* Korean (Mac) */ + {10004, "x-mac-arabic"}, /* Arabic (Mac) */ + {10005, "x-mac-hebrew"}, /* Hebrew (Mac) */ + {10006, "x-mac-greek"}, /* Greek (Mac) */ + {10007, "x-mac-cyrillic"}, /* Cyrillic (Mac) */ + /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ + {10008, "x-mac-chinesesimp"}, + {10010, "x-mac-romanian"}, /* Romanian (Mac) */ + {10017, "x-mac-ukrainian"}, /* Ukrainian (Mac) */ + {10021, "x-mac-thai"}, /* Thai (Mac) */ + {10029, "x-mac-ce"}, /* MAC Latin 2; Central European (Mac) */ + {10079, "x-mac-icelandic"}, /* Icelandic (Mac) */ + {10081, "x-mac-turkish"}, /* Turkish (Mac) */ + {10082, "x-mac-croatian"}, /* Croatian (Mac) */ + {20000, "x-Chinese_CNS"}, /* CNS Taiwan; Chinese Traditional (CNS) */ + {20001, "x-cp20001"}, /* TCA Taiwan */ + {20002, "x_Chinese-Eten"}, /* Eten Taiwan; Chinese Traditional (Eten) */ + {20003, "x-cp20003"}, /* IBM5550 Taiwan */ + {20004, "x-cp20004"}, /* TeleText Taiwan */ + {20005, "x-cp20005"}, /* Wang Taiwan */ + /* + * IA5 (IRV International Alphabet No. 
5, 7-bit); + * Western European (IA5) + */ + {20105, "x-IA5"}, + {20106, "x-IA5-German"}, /* IA5 German (7-bit) */ + {20107, "x-IA5-Swedish"}, /* IA5 Swedish (7-bit) */ + {20108, "x-IA5-Norwegian"}, /* IA5 Norwegian (7-bit) */ + {20127, "us-ascii"}, /* US-ASCII (7-bit) */ + {20261, "x-cp20261"}, /* T.61 */ + {20269, "x-cp20269"}, /* ISO 6937 Non-Spacing Accent */ + {20273, "IBM273"}, /* IBM EBCDIC Germany */ + {20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */ + {20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */ + {20280, "IBM280"}, /* IBM EBCDIC Italy */ + {20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */ + {20285, "IBM285"}, /* IBM EBCDIC United Kingdom */ + {20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */ + {20297, "IBM297"}, /* IBM EBCDIC France */ + {20420, "IBM420"}, /* IBM EBCDIC Arabic */ + {20423, "IBM423"}, /* IBM EBCDIC Greek */ + {20424, "IBM424"}, /* IBM EBCDIC Hebrew */ + {20833, "x-EBCDIC-KoreanExtended"}, /* IBM EBCDIC Korean Extended */ + {20838, "IBM-Thai"}, /* IBM EBCDIC Thai */ + {20866, "koi8-r"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */ + {20871, "IBM871"}, /* IBM EBCDIC Icelandic */ + {20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */ + {20905, "IBM905"}, /* IBM EBCDIC Turkish */ + /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ + {20924, "IBM00924"}, + {20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0121-1990) */ + /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ + {20936, "x-cp20936"}, + {20949, "x-cp20949"}, /* Korean Wansung */ + {21025, "cp1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ + /* 21027 (deprecated) */ + {21866, "koi8-u"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ + {28591, "iso-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ + {28591, "iso8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ + {28591, "iso_8859-1"}, + {28591, "iso_8859_1"}, + /* ISO 8859-2 Central European; Central European (ISO) */ + {28592, "iso-8859-2"}, + /* ISO 8859-2 Central 
European; Central European (ISO) */ + {28592, "iso8859-2"}, + {28592, "iso_8859-2"}, + {28592, "iso_8859_2"}, + {28593, "iso-8859-3"}, /* ISO 8859-3 Latin 3 */ + {28593, "iso8859-3"}, /* ISO 8859-3 Latin 3 */ + {28593, "iso_8859-3"}, + {28593, "iso_8859_3"}, + {28594, "iso-8859-4"}, /* ISO 8859-4 Baltic */ + {28594, "iso8859-4"}, /* ISO 8859-4 Baltic */ + {28594, "iso_8859-4"}, + {28594, "iso_8859_4"}, + {28595, "iso-8859-5"}, /* ISO 8859-5 Cyrillic */ + {28595, "iso8859-5"}, /* ISO 8859-5 Cyrillic */ + {28595, "iso_8859-5"}, + {28595, "iso_8859_5"}, + {28596, "iso-8859-6"}, /* ISO 8859-6 Arabic */ + {28596, "iso8859-6"}, /* ISO 8859-6 Arabic */ + {28596, "iso_8859-6"}, + {28596, "iso_8859_6"}, + {28597, "iso-8859-7"}, /* ISO 8859-7 Greek */ + {28597, "iso8859-7"}, /* ISO 8859-7 Greek */ + {28597, "iso_8859-7"}, + {28597, "iso_8859_7"}, + {28598, "iso-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ + {28598, "iso8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ + {28598, "iso_8859-8"}, + {28598, "iso_8859_8"}, + {28599, "iso-8859-9"}, /* ISO 8859-9 Turkish */ + {28599, "iso8859-9"}, /* ISO 8859-9 Turkish */ + {28599, "iso_8859-9"}, + {28599, "iso_8859_9"}, + {28603, "iso-8859-13"}, /* ISO 8859-13 Estonian */ + {28603, "iso8859-13"}, /* ISO 8859-13 Estonian */ + {28603, "iso_8859-13"}, + {28603, "iso_8859_13"}, + {28605, "iso-8859-15"}, /* ISO 8859-15 Latin 9 */ + {28605, "iso8859-15"}, /* ISO 8859-15 Latin 9 */ + {28605, "iso_8859-15"}, + {28605, "iso_8859_15"}, + {29001, "x-Europa"}, /* Europa 3 */ + {38598, "iso-8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ + {38598, "iso8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ + {38598, "iso_8859-8-i"}, + {38598, "iso_8859_8-i"}, + /* + * ISO 2022 Japanese with no halfwidth Katakana; + * Japanese (JIS) + */ + {50220, "iso-2022-jp"}, + /* + * ISO 2022 Japanese with halfwidth Katakana; + * Japanese (JIS-Allow 1 byte Kana) + */ + {50221, "csISO2022JP"}, + /* + * ISO 2022 Japanese JIS X 
0201-1989; + * Japanese (JIS-Allow 1 byte Kana - SO/SI) + */ + {50222, "iso-2022-jp"}, + {50225, "iso-2022-kr"}, /* ISO 2022 Korean */ + {50225, "iso2022-kr"}, /* ISO 2022 Korean */ + /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ + {50227, "x-cp50227"}, + /* 50229 ISO 2022 Traditional Chinese */ + /* 50930 EBCDIC Japanese (Katakana) Extended */ + /* 50931 EBCDIC US-Canada and Japanese */ + /* 50933 EBCDIC Korean Extended and Korean */ + /* 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese */ + /* 50936 EBCDIC Simplified Chinese */ + /* 50937 EBCDIC US-Canada and Traditional Chinese */ + /* 50939 EBCDIC Japanese (Latin) Extended and Japanese */ + {51932, "euc-jp"}, /* EUC Japanese */ + {51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */ + {51949, "euc-kr"}, /* EUC Korean */ + /* 51950 EUC Traditional Chinese */ + /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ + {52936, "hz-gb-2312"}, + /* + * Windows XP and later: GB18030 Simplified Chinese (4 byte); + * Chinese Simplified (GB18030) + */ + {54936, "GB18030"}, + {57002, "x-iscii-de"}, /* ISCII Devanagari */ + {57003, "x-iscii-be"}, /* ISCII Bengali */ + {57004, "x-iscii-ta"}, /* ISCII Tamil */ + {57005, "x-iscii-te"}, /* ISCII Telugu */ + {57006, "x-iscii-as"}, /* ISCII Assamese */ + {57007, "x-iscii-or"}, /* ISCII Oriya */ + {57008, "x-iscii-ka"}, /* ISCII Kannada */ + {57009, "x-iscii-ma"}, /* ISCII Malayalam */ + {57010, "x-iscii-gu"}, /* ISCII Gujarati */ + {57011, "x-iscii-pa"}, /* ISCII Punjabi */ + + {0, NULL} +}; + +/* + * SJIS SHIFTJIS table CP932 table + * ---- --------------------------- -------------------------------- + * 5C U+00A5 YEN SIGN U+005C REVERSE SOLIDUS + * 7E U+203E OVERLINE U+007E TILDE + * 815C U+2014 EM DASH U+2015 HORIZONTAL BAR + * 815F U+005C REVERSE SOLIDUS U+FF3C FULLWIDTH REVERSE SOLIDUS + * 8160 U+301C WAVE DASH U+FF5E FULLWIDTH TILDE + * 8161 U+2016 DOUBLE VERTICAL LINE U+2225 PARALLEL TO + * 817C U+2212 MINUS SIGN 
U+FF0D FULLWIDTH HYPHEN-MINUS + * 8191 U+00A2 CENT SIGN U+FFE0 FULLWIDTH CENT SIGN + * 8192 U+00A3 POUND SIGN U+FFE1 FULLWIDTH POUND SIGN + * 81CA U+00AC NOT SIGN U+FFE2 FULLWIDTH NOT SIGN + * + * EUC-JP and ISO-2022-JP should be compatible with CP932. + * + * Kernel and MLang have different Unicode mapping table. Make sure + * which API is used. + */ +static compat_t cp932_compat[] = { + {0x00A5, 0x005C, COMPAT_OUT}, + {0x203E, 0x007E, COMPAT_OUT}, + {0x2014, 0x2015, COMPAT_OUT}, + {0x301C, 0xFF5E, COMPAT_OUT}, + {0x2016, 0x2225, COMPAT_OUT}, + {0x2212, 0xFF0D, COMPAT_OUT}, + {0x00A2, 0xFFE0, COMPAT_OUT}, + {0x00A3, 0xFFE1, COMPAT_OUT}, + {0x00AC, 0xFFE2, COMPAT_OUT}, + {0, 0, 0} +}; + +static compat_t cp20932_compat[] = { + {0x00A5, 0x005C, COMPAT_OUT}, + {0x203E, 0x007E, COMPAT_OUT}, + {0x2014, 0x2015, COMPAT_OUT}, + {0xFF5E, 0x301C, COMPAT_OUT|COMPAT_IN}, + {0x2225, 0x2016, COMPAT_OUT|COMPAT_IN}, + {0xFF0D, 0x2212, COMPAT_OUT|COMPAT_IN}, + {0xFFE0, 0x00A2, COMPAT_OUT|COMPAT_IN}, + {0xFFE1, 0x00A3, COMPAT_OUT|COMPAT_IN}, + {0xFFE2, 0x00AC, COMPAT_OUT|COMPAT_IN}, + {0, 0, 0} +}; + +static compat_t *cp51932_compat = cp932_compat; + +/* cp20932_compat for kernel. cp932_compat for mlang. 
*/ +static compat_t *cp5022x_compat = cp932_compat; + +typedef HRESULT (WINAPI *CONVERTINETSTRING)( + LPDWORD lpdwMode, + DWORD dwSrcEncoding, + DWORD dwDstEncoding, + LPCSTR lpSrcStr, + LPINT lpnSrcSize, + LPBYTE lpDstStr, + LPINT lpnDstSize +); +typedef HRESULT (WINAPI *CONVERTINETMULTIBYTETOUNICODE)( + LPDWORD lpdwMode, + DWORD dwSrcEncoding, + LPCSTR lpSrcStr, + LPINT lpnMultiCharCount, + LPWSTR lpDstStr, + LPINT lpnWideCharCount +); +typedef HRESULT (WINAPI *CONVERTINETUNICODETOMULTIBYTE)( + LPDWORD lpdwMode, + DWORD dwEncoding, + LPCWSTR lpSrcStr, + LPINT lpnWideCharCount, + LPSTR lpDstStr, + LPINT lpnMultiCharCount +); +typedef HRESULT (WINAPI *ISCONVERTINETSTRINGAVAILABLE)( + DWORD dwSrcEncoding, + DWORD dwDstEncoding +); +typedef HRESULT (WINAPI *LCIDTORFC1766A)( + LCID Locale, + LPSTR pszRfc1766, + int nChar +); +typedef HRESULT (WINAPI *LCIDTORFC1766W)( + LCID Locale, + LPWSTR pszRfc1766, + int nChar +); +typedef HRESULT (WINAPI *RFC1766TOLCIDA)( + LCID *pLocale, + LPSTR pszRfc1766 +); +typedef HRESULT (WINAPI *RFC1766TOLCIDW)( + LCID *pLocale, + LPWSTR pszRfc1766 +); +static CONVERTINETSTRING ConvertINetString; +static CONVERTINETMULTIBYTETOUNICODE ConvertINetMultiByteToUnicode; +static CONVERTINETUNICODETOMULTIBYTE ConvertINetUnicodeToMultiByte; +static ISCONVERTINETSTRINGAVAILABLE IsConvertINetStringAvailable; +static LCIDTORFC1766A LcidToRfc1766A; +static RFC1766TOLCIDA Rfc1766ToLcidA; + +static int load_mlang(void) { + HMODULE h; + if (ConvertINetString != NULL) + return TRUE; + h = LoadLibrary(TEXT("mlang.dll")); + if (!h) + return FALSE; + ConvertINetString = + (CONVERTINETSTRING)GetProcAddressA(h, "ConvertINetString"); + ConvertINetMultiByteToUnicode = + (CONVERTINETMULTIBYTETOUNICODE)GetProcAddressA( + h, "ConvertINetMultiByteToUnicode"); + ConvertINetUnicodeToMultiByte = + (CONVERTINETUNICODETOMULTIBYTE)GetProcAddressA( + h, "ConvertINetUnicodeToMultiByte"); + IsConvertINetStringAvailable = + (ISCONVERTINETSTRINGAVAILABLE)GetProcAddressA( + h, 
"IsConvertINetStringAvailable"); + LcidToRfc1766A = + (LCIDTORFC1766A)GetProcAddressA(h, "LcidToRfc1766A"); + Rfc1766ToLcidA = + (RFC1766TOLCIDA)GetProcAddressA(h, "Rfc1766ToLcidA"); + return TRUE; +} + +iconv_t iconv_open(const char *tocode, const char *fromcode) { + rec_iconv_t *cd; + + cd = (rec_iconv_t *)calloc(1, sizeof(rec_iconv_t)); + if (cd == NULL) + return (iconv_t)(-1); + +#if defined(USE_LIBICONV_DLL) + errno = 0; + if (libiconv_iconv_open(cd, tocode, fromcode)) + return (iconv_t)cd; +#endif + + /* reset the errno to prevent reporting wrong error code. + * 0 for unsorted error. */ + errno = 0; + if (win_iconv_open(cd, tocode, fromcode)) + return (iconv_t)cd; + + free(cd); + + return (iconv_t)(-1); +} + +int iconv_close(iconv_t _cd) { + rec_iconv_t *cd = (rec_iconv_t *)_cd; + int r = cd->iconv_close(cd->cd); + int e = *(cd->_errno()); +#if defined(USE_LIBICONV_DLL) + if (cd->hlibiconv != NULL) + FreeLibrary(cd->hlibiconv); +#endif + free(cd); + errno = e; + return r; +} + +size_t iconv( + iconv_t _cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft +) { + rec_iconv_t *cd = (rec_iconv_t *)_cd; + size_t r = cd->iconv(cd->cd, inbuf, inbytesleft, outbuf, outbytesleft); + errno = *(cd->_errno()); + return r; +} + +static int win_iconv_open( + rec_iconv_t *cd, + const char *tocode, + const char *fromcode +) { + if (!make_csconv(fromcode, &cd->from) || !make_csconv(tocode, &cd->to)) + return FALSE; + cd->iconv_close = win_iconv_close; + cd->iconv = win_iconv; + cd->_errno = _errno; + cd->cd = (iconv_t)cd; + return TRUE; +} + +static int win_iconv_close(iconv_t cd UNUSED) { + return 0; +} + +static size_t win_iconv( + iconv_t _cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft +) { + rec_iconv_t *cd = (rec_iconv_t *)_cd; + ushort wbuf[MB_CHAR_MAX]; /* enough room for one character */ + int insize; + int outsize; + int wsize; + DWORD frommode; + DWORD tomode; + uint wc; + compat_t *cp; + 
int i; + + if (inbuf == NULL || *inbuf == NULL) { + if (outbuf != NULL && *outbuf != NULL && cd->to.flush != NULL) { + tomode = cd->to.mode; + outsize = cd->to.flush( + &cd->to, + (uchar *)*outbuf, + *outbytesleft); + if (outsize == -1) { + if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG) { + outsize = 0; + } else { + cd->to.mode = tomode; + return (size_t)(-1); + } + } + *outbuf += outsize; + *outbytesleft -= outsize; + } + cd->from.mode = 0; + cd->to.mode = 0; + return 0; + } + + while (*inbytesleft != 0) { + frommode = cd->from.mode; + tomode = cd->to.mode; + wsize = MB_CHAR_MAX; + + insize = cd->from.mbtowc( + &cd->from, + (const uchar *)*inbuf, + *inbytesleft, wbuf, &wsize); + if (insize == -1) { + if (cd->to.flags & FLAG_IGNORE) { + cd->from.mode = frommode; + insize = 1; + wsize = 0; + } else { + cd->from.mode = frommode; + return (size_t)(-1); + } + } + + if (wsize == 0) { + *inbuf += insize; + *inbytesleft -= insize; + continue; + } + + if (cd->from.compat != NULL) { + wc = utf16_to_ucs4(wbuf); + cp = cd->from.compat; + for (i = 0; cp[i].in != 0; ++i) { + if ((cp[i].flag & COMPAT_IN) && cp[i].out == wc) { + ucs4_to_utf16(cp[i].in, wbuf, &wsize); + break; + } + } + } + + if (cd->to.compat != NULL) { + wc = utf16_to_ucs4(wbuf); + cp = cd->to.compat; + for (i = 0; cp[i].in != 0; ++i) { + if ((cp[i].flag & COMPAT_OUT) && cp[i].in == wc) { + ucs4_to_utf16(cp[i].out, wbuf, &wsize); + break; + } + } + } + + outsize = cd->to.wctomb( + &cd->to, + wbuf, wsize, + (uchar *)*outbuf, + *outbytesleft); + if (outsize == -1) { + if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG) { + cd->to.mode = tomode; + outsize = 0; + } else { + cd->from.mode = frommode; + cd->to.mode = tomode; + return (size_t)(-1); + } + } + + *inbuf += insize; + *outbuf += outsize; + *inbytesleft -= insize; + *outbytesleft -= outsize; + } + + return 0; +} + +static int make_csconv(const char *_name, csconv_t *cv) { + CPINFO cpinfo; + int use_compat = TRUE; + int flag = 0; + char *name; + char *p; 
+ + name = xstrndup(_name, strlen(_name)); + if (name == NULL) + return FALSE; + + /* check for option "enc_name//opt1//opt2" */ + while ((p = strrstr(name, "//")) != NULL) { + if (_stricmp(p + 2, "nocompat") == 0) + use_compat = FALSE; + else if (_stricmp(p + 2, "translit") == 0) + flag |= FLAG_TRANSLIT; + else if (_stricmp(p + 2, "ignore") == 0) + flag |= FLAG_IGNORE; + *p = 0; + } + + cv->mode = 0; + cv->flags = flag; + cv->mblen = NULL; + cv->flush = NULL; + cv->compat = NULL; + cv->codepage = name_to_codepage(name); + if (cv->codepage == 1200 || cv->codepage == 1201) { + cv->mbtowc = utf16_mbtowc; + cv->wctomb = utf16_wctomb; + if (_stricmp(name, "UTF-16") == 0 || + _stricmp(name, "UTF16") == 0 || + _stricmp(name, "UCS-2") == 0 || + _stricmp(name, "UCS2") == 0 || + _stricmp(name, "UCS-2-INTERNAL") == 0) + cv->flags |= FLAG_USE_BOM; + } else if (cv->codepage == 12000 || cv->codepage == 12001) { + cv->mbtowc = utf32_mbtowc; + cv->wctomb = utf32_wctomb; + if (_stricmp(name, "UTF-32") == 0 || + _stricmp(name, "UTF32") == 0 || + _stricmp(name, "UCS-4") == 0 || + _stricmp(name, "UCS4") == 0) + cv->flags |= FLAG_USE_BOM; + } else if (cv->codepage == 65001) { + cv->mbtowc = kernel_mbtowc; + cv->wctomb = kernel_wctomb; + cv->mblen = utf8_mblen; + } else if ((cv->codepage == 50220 || + cv->codepage == 50221 || + cv->codepage == 50222) && load_mlang()) { + cv->mbtowc = iso2022jp_mbtowc; + cv->wctomb = iso2022jp_wctomb; + cv->flush = iso2022jp_flush; + } else if (cv->codepage == 51932 && load_mlang()) { + cv->mbtowc = mlang_mbtowc; + cv->wctomb = mlang_wctomb; + cv->mblen = eucjp_mblen; + } else if (IsValidCodePage(cv->codepage) + && GetCPInfo(cv->codepage, &cpinfo) != 0) { + cv->mbtowc = kernel_mbtowc; + cv->wctomb = kernel_wctomb; + if (cpinfo.MaxCharSize == 1) + cv->mblen = sbcs_mblen; + else if (cpinfo.MaxCharSize == 2) + cv->mblen = dbcs_mblen; + else + cv->mblen = mbcs_mblen; + } else { + /* not supported */ + free(name); + errno = EINVAL; + return FALSE; + } + + if 
(use_compat) { + switch (cv->codepage) { + case 932: cv->compat = cp932_compat; break; + case 20932: cv->compat = cp20932_compat; break; + case 51932: cv->compat = cp51932_compat; break; + case 50220: + case 50221: + case 50222: cv->compat = cp5022x_compat; break; + } + } + + free(name); + + return TRUE; +} + +static int name_to_codepage(const char *name) { + int i; + + if (*name == '\0' || + strcmp(name, "char") == 0) + return GetACP(); + else if (strcmp(name, "wchar_t") == 0) + return 1200; + else if (_strnicmp(name, "cp", 2) == 0) + return atoi(name + 2); /* CP123 */ + else if ('0' <= name[0] && name[0] <= '9') + return atoi(name); /* 123 */ + else if (_strnicmp(name, "xx", 2) == 0) + return atoi(name + 2); /* XX123 for debug */ + + for (i = 0; codepage_alias[i].name != NULL; ++i) + if (_stricmp(name, codepage_alias[i].name) == 0) + return codepage_alias[i].codepage; + return -1; +} + +/* + * http://www.faqs.org/rfcs/rfc2781.html + */ +static uint utf16_to_ucs4(const ushort *wbuf) { + uint wc = wbuf[0]; + if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) + wc = ((wbuf[0] & 0x3FF) << 10) + (wbuf[1] & 0x3FF) + 0x10000; + return wc; +} + +static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize) { + if (wc < 0x10000) { + wbuf[0] = wc; + *wbufsize = 1; + } else { + wc -= 0x10000; + wbuf[0] = 0xD800 | ((wc >> 10) & 0x3FF); + wbuf[1] = 0xDC00 | (wc & 0x3FF); + *wbufsize = 2; + } +} + +/* + * Check if codepage is one of those for which the dwFlags parameter + * to MultiByteToWideChar() must be zero. Return zero or + * MB_ERR_INVALID_CHARS. The docs in Platform SDK for Windows + * Server 2003 R2 claims that also codepage 65001 is one of these, but + * that doesn't seem to be the case. The MSDN docs for MSVS2008 leave + * out 65001 (UTF-8), and that indeed seems to be the case on XP, it + * works fine to pass MB_ERR_INVALID_CHARS in dwFlags when converting + * from UTF-8. 
+ */ +static int mbtowc_flags(int codepage) { + return (codepage == 50220 || codepage == 50221 || + codepage == 50222 || codepage == 50225 || + codepage == 50227 || codepage == 50229 || + codepage == 52936 || codepage == 54936 || + (codepage >= 57002 && codepage <= 57011) || + codepage == 65000 || codepage == 42) ? 0 : MB_ERR_INVALID_CHARS; +} + +/* + * Check if codepage is one those for which the lpUsedDefaultChar + * parameter to WideCharToMultiByte() must be NULL. The docs in + * Platform SDK for Windows Server 2003 R2 claims that this is the + * list below, while the MSDN docs for MSVS2008 claim that it is only + * for 65000 (UTF-7) and 65001 (UTF-8). This time the earlier Platform + * SDK seems to be correct, at least for XP. + */ +static int must_use_null_useddefaultchar(int codepage) { + return (codepage == 65000 || codepage == 65001 || + codepage == 50220 || codepage == 50221 || + codepage == 50222 || codepage == 50225 || + codepage == 50227 || codepage == 50229 || + codepage == 52936 || codepage == 54936 || + (codepage >= 57002 && codepage <= 57011) || + codepage == 42); +} + +static char * strrstr(const char *str, const char *token) { + int len = strlen(token); + const char *p = str + strlen(str); + + while (str <= --p) + if (p[0] == token[0] && strncmp(p, token, len) == 0) + return (char *)p; + return NULL; +} + +static char * xstrndup(const char *s, size_t n) { + char *p; + + p = (char *)malloc(n + 1); + if (p == NULL) + return NULL; + memcpy(p, s, n); + p[n] = '\0'; + return p; +} + +static int seterror(int err) { + errno = err; + return -1; +} + +#if defined(USE_LIBICONV_DLL) +static int libiconv_iconv_open( + rec_iconv_t *cd, + const char *tocode, + const char *fromcode +) { + HMODULE hlibiconv = NULL; + char *dllname; + const char *p; + const char *e; + f_iconv_open _iconv_open; + + /* + * always try to load dll, so that we can switch dll in runtime. 
+ */ + + /* XXX: getenv() can't get variable set by SetEnvironmentVariable() */ + p = getenv("WINICONV_LIBICONV_DLL"); + if (p == NULL) + p = DEFAULT_LIBICONV_DLL; + /* parse comma separated value */ + for ( ; *p != 0; p = (*e == ',') ? e + 1 : e) { + e = strchr(p, ','); + if (p == e) + continue; + else if (e == NULL) + e = p + strlen(p); + dllname = xstrndup(p, e - p); + if (dllname == NULL) + return FALSE; + hlibiconv = LoadLibraryA(dllname); + free(dllname); + if (hlibiconv != NULL) { + if (hlibiconv == hwiniconv) { + FreeLibrary(hlibiconv); + hlibiconv = NULL; + continue; + } + break; + } + } + + if (hlibiconv == NULL) + goto failed; + + _iconv_open = (f_iconv_open)GetProcAddressA( + hlibiconv, + "libiconv_open"); + if (_iconv_open == NULL) + _iconv_open = (f_iconv_open)GetProcAddressA( + hlibiconv, + "iconv_open"); + cd->iconv_close = (f_iconv_close)GetProcAddressA( + hlibiconv, + "libiconv_close"); + if (cd->iconv_close == NULL) + cd->iconv_close = (f_iconv_close)GetProcAddressA( + hlibiconv, + "iconv_close"); + cd->iconv = (f_iconv)GetProcAddressA( + hlibiconv, + "libiconv"); + if (cd->iconv == NULL) + cd->iconv = (f_iconv)GetProcAddressA( + hlibiconv, + "iconv"); + cd->_errno = (f_errno)find_imported_function( + hlibiconv, + "_errno"); + if (_iconv_open == NULL || cd->iconv_close == NULL + || cd->iconv == NULL || cd->_errno == NULL) + goto failed; + + cd->cd = _iconv_open(tocode, fromcode); + if (cd->cd == (iconv_t)(-1)) + goto failed; + + cd->hlibiconv = hlibiconv; + return TRUE; + +failed: + if (hlibiconv != NULL) + FreeLibrary(hlibiconv); + return FALSE; +} + +/* + * Reference: + * http://forums.belution.com/ja/vc/000/234/78s.shtml + * http://nienie.com/~masapico/api_ImageDirectoryEntryToData.html + * + * The formal way is + * imagehlp.h or dbghelp.h + * imagehlp.lib or dbghelp.lib + * ImageDirectoryEntryToData() + */ +#define TO_DOS_HEADER(base) ((PIMAGE_DOS_HEADER)(base)) +#define TO_NT_HEADERS(base) \ +((PIMAGE_NT_HEADERS)((LPBYTE)(base) + 
TO_DOS_HEADER(base)->e_lfanew)) +static PVOID MyImageDirectoryEntryToData( + LPVOID Base, + BOOLEAN MappedAsImage, + USHORT DirectoryEntry, + PULONG Size +) { + /* TODO: MappedAsImage? */ + PIMAGE_DATA_DIRECTORY p; + p = TO_NT_HEADERS(Base)->OptionalHeader.DataDirectory + DirectoryEntry; + if (p->VirtualAddress == 0) { + *Size = 0; + return NULL; + } + *Size = p->Size; + return (PVOID)((LPBYTE)Base + p->VirtualAddress); +} + +static FARPROC find_imported_function( + HMODULE hModule, + const char *funcname +) { + DWORD_PTR Base; + ULONG Size; + PIMAGE_IMPORT_DESCRIPTOR Imp; + PIMAGE_THUNK_DATA Address; /* Import Address Table */ + PIMAGE_THUNK_DATA Name; /* Import Name Table */ + PIMAGE_IMPORT_BY_NAME ImpName; + + Base = (DWORD_PTR)hModule; + Imp = (PIMAGE_IMPORT_DESCRIPTOR)MyImageDirectoryEntryToData( + (LPVOID)Base, + TRUE, + IMAGE_DIRECTORY_ENTRY_IMPORT, + &Size); + if (Imp == NULL) + return NULL; + for ( ; Imp->OriginalFirstThunk != 0; ++Imp) { + Address = (PIMAGE_THUNK_DATA)(Base + Imp->FirstThunk); + Name = (PIMAGE_THUNK_DATA)(Base + Imp->OriginalFirstThunk); + for ( ; Name->u1.Ordinal != 0; ++Name, ++Address) { + if (!IMAGE_SNAP_BY_ORDINAL(Name->u1.Ordinal)) { + ImpName = (PIMAGE_IMPORT_BY_NAME) + (Base + (DWORD_PTR)Name->u1.AddressOfData); + if (strcmp((char *)ImpName->Name, funcname) == 0) + return (FARPROC)Address->u1.Function; + } + } + } + return NULL; +} +#endif + +static int sbcs_mblen( + csconv_t *cv UNUSED, + const uchar *buf UNUSED, + int bufsize UNUSED +) { + return 1; +} + +static int dbcs_mblen( + csconv_t *cv, + const uchar *buf, + int bufsize +) { + int len = IsDBCSLeadByteEx(cv->codepage, buf[0]) ? 
2 : 1; + if (bufsize < len) + return seterror(EINVAL); + return len; +} + +static int mbcs_mblen( + csconv_t *cv, + const uchar *buf, + int bufsize +) { + int len = 0; + + if (cv->codepage == 54936) { + if (buf[0] <= 0x7F) len = 1; + else if (buf[0] >= 0x81 && buf[0] <= 0xFE && + bufsize >= 2 && + ((buf[1] >= 0x40 && buf[1] <= 0x7E) || + (buf[1] >= 0x80 && buf[1] <= 0xFE))) len = 2; + else if (buf[0] >= 0x81 && buf[0] <= 0xFE && + bufsize >= 4 && + buf[1] >= 0x30 && buf[1] <= 0x39) len = 4; + else + return seterror(EINVAL); + return len; + } else { + return seterror(EINVAL); + } +} + +static int utf8_mblen( + csconv_t *cv UNUSED, + const uchar *buf, + int bufsize +) { + int len = 0; + + if (buf[0] < 0x80) len = 1; + else if ((buf[0] & 0xE0) == 0xC0) len = 2; + else if ((buf[0] & 0xF0) == 0xE0) len = 3; + else if ((buf[0] & 0xF8) == 0xF0) len = 4; + else if ((buf[0] & 0xFC) == 0xF8) len = 5; + else if ((buf[0] & 0xFE) == 0xFC) len = 6; + + if (len == 0) + return seterror(EILSEQ); + else if (bufsize < len) + return seterror(EINVAL); + return len; +} + +static int eucjp_mblen( + csconv_t *cv UNUSED, + const uchar *buf, + int bufsize +) { + if (buf[0] < 0x80) { /* ASCII */ + return 1; + } else if (buf[0] == 0x8E) { /* JIS X 0201 */ + if (bufsize < 2) + return seterror(EINVAL); + else if (!(0xA1 <= buf[1] && buf[1] <= 0xDF)) + return seterror(EILSEQ); + return 2; + } else if (buf[0] == 0x8F) { /* JIS X 0212 */ + if (bufsize < 3) + return seterror(EINVAL); + else if (!(0xA1 <= buf[1] && buf[1] <= 0xFE) + || !(0xA1 <= buf[2] && buf[2] <= 0xFE)) + return seterror(EILSEQ); + return 3; + } else { /* JIS X 0208 */ + if (bufsize < 2) + return seterror(EINVAL); + else if (!(0xA1 <= buf[0] && buf[0] <= 0xFE) + || !(0xA1 <= buf[1] && buf[1] <= 0xFE)) + return seterror(EILSEQ); + return 2; + } +} + +static int kernel_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize +) { + int len; + + len = cv->mblen(cv, buf, bufsize); + if (len == -1) + 
return -1; + /* If converting from ASCII, reject 8bit + * chars. MultiByteToWideChar() doesn't. Note that for ASCII we + * know that the mblen function is sbcs_mblen() so len is 1. + */ + if (cv->codepage == 20127 && buf[0] >= 0x80) + return seterror(EILSEQ); + *wbufsize = MultiByteToWideChar( + cv->codepage, + mbtowc_flags(cv->codepage), + (const char *)buf, + len, + (wchar_t *)wbuf, *wbufsize); + if (*wbufsize == 0) + return seterror(EILSEQ); + return len; +} + +static int kernel_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize +) { + BOOL usedDefaultChar = 0; + BOOL *p = NULL; + int flags = 0; + int len; + + if (bufsize == 0) + return seterror(E2BIG); + if (!must_use_null_useddefaultchar(cv->codepage)) { + p = &usedDefaultChar; +#ifdef WC_NO_BEST_FIT_CHARS + if (!(cv->flags & FLAG_TRANSLIT)) + flags |= WC_NO_BEST_FIT_CHARS; +#endif + } + len = WideCharToMultiByte(cv->codepage, flags, + (const wchar_t *)wbuf, wbufsize, (char *)buf, bufsize, NULL, p); + if (len == 0) { + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) + return seterror(E2BIG); + return seterror(EILSEQ); + } else if (usedDefaultChar && !(cv->flags & FLAG_TRANSLIT)) { + return seterror(EILSEQ); + } else if (cv->mblen(cv, buf, len) != len) { /* validate result */ + return seterror(EILSEQ); + } + return len; +} + +/* + * It seems that the mode (cv->mode) is fixnum. + * For example, when converting iso-2022-jp(cp50221) to unicode: + * in ascii sequence: mode=0xC42C0000 + * in jisx0208 sequence: mode=0xC42C0001 + * "C42C" is same for each convert session. 
+ * It should be: ((codepage-1)<<16)|state + */ +static int mlang_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize +) { + int len; + int insize; + HRESULT hr; + + len = cv->mblen(cv, buf, bufsize); + if (len == -1) + return -1; + insize = len; + hr = ConvertINetMultiByteToUnicode(&cv->mode, cv->codepage, + (const char *)buf, &insize, (wchar_t *)wbuf, wbufsize); + if (hr != S_OK || insize != len) + return seterror(EILSEQ); + return len; +} + +static int mlang_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize) { + char tmpbuf[MB_CHAR_MAX]; /* enough room for one character */ + int tmpsize = MB_CHAR_MAX; + int insize = wbufsize; + HRESULT hr; + + hr = ConvertINetUnicodeToMultiByte(&cv->mode, cv->codepage, + (const wchar_t *)wbuf, &wbufsize, tmpbuf, &tmpsize); + if (hr != S_OK || insize != wbufsize) + return seterror(EILSEQ); + else if (bufsize < tmpsize) + return seterror(E2BIG); + else if (cv->mblen(cv, (uchar *)tmpbuf, tmpsize) != tmpsize) + return seterror(EILSEQ); + memcpy(buf, tmpbuf, tmpsize); + return tmpsize; +} + +static int utf16_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize +) { + int codepage = cv->codepage; + + /* swap endian: 1200 <-> 1201 */ + if (cv->mode & UNICODE_MODE_SWAPPED) + codepage ^= 1; + + if (bufsize < 2) + return seterror(EINVAL); + if (codepage == 1200) /* little endian */ + wbuf[0] = (buf[1] << 8) | buf[0]; + else if (codepage == 1201) /* big endian */ + wbuf[0] = (buf[0] << 8) | buf[1]; + + if ((cv->flags & FLAG_USE_BOM) && + !(cv->mode & UNICODE_MODE_BOM_DONE)) { + cv->mode |= UNICODE_MODE_BOM_DONE; + if (wbuf[0] == 0xFFFE) { + cv->mode |= UNICODE_MODE_SWAPPED; + *wbufsize = 0; + return 2; + } else if (wbuf[0] == 0xFEFF) { + *wbufsize = 0; + return 2; + } + } + + if (0xDC00 <= wbuf[0] && wbuf[0] <= 0xDFFF) + return seterror(EILSEQ); + if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) { + if (bufsize < 4) + return 
seterror(EINVAL); + if (codepage == 1200) /* little endian */ + wbuf[1] = (buf[3] << 8) | buf[2]; + else if (codepage == 1201) /* big endian */ + wbuf[1] = (buf[2] << 8) | buf[3]; + if (!(0xDC00 <= wbuf[1] && wbuf[1] <= 0xDFFF)) + return seterror(EILSEQ); + *wbufsize = 2; + return 4; + } + *wbufsize = 1; + return 2; +} + +static int utf16_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize +) { + if ((cv->flags & FLAG_USE_BOM) && + !(cv->mode & UNICODE_MODE_BOM_DONE)) { + int r; + + cv->mode |= UNICODE_MODE_BOM_DONE; + if (bufsize < 2) + return seterror(E2BIG); + if (cv->codepage == 1200) /* little endian */ + memcpy(buf, "\xFF\xFE", 2); + else if (cv->codepage == 1201) /* big endian */ + memcpy(buf, "\xFE\xFF", 2); + + r = utf16_wctomb(cv, wbuf, wbufsize, buf + 2, bufsize - 2); + if (r == -1) + return -1; + return r + 2; + } + + if (bufsize < 2) + return seterror(E2BIG); + if (cv->codepage == 1200) { /* little endian */ + buf[0] = (wbuf[0] & 0x00FF); + buf[1] = (wbuf[0] & 0xFF00) >> 8; + } else if (cv->codepage == 1201) { /* big endian */ + buf[0] = (wbuf[0] & 0xFF00) >> 8; + buf[1] = (wbuf[0] & 0x00FF); + } + if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) { + if (bufsize < 4) + return seterror(E2BIG); + if (cv->codepage == 1200) { /* little endian */ + buf[2] = (wbuf[1] & 0x00FF); + buf[3] = (wbuf[1] & 0xFF00) >> 8; + } else if (cv->codepage == 1201) { /* big endian */ + buf[2] = (wbuf[1] & 0xFF00) >> 8; + buf[3] = (wbuf[1] & 0x00FF); + } + return 4; + } + return 2; +} + +static int utf32_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize +) { + int codepage = cv->codepage; + uint wc = 0xD800; + + /* swap endian: 12000 <-> 12001 */ + if (cv->mode & UNICODE_MODE_SWAPPED) + codepage ^= 1; + + if (bufsize < 4) + return seterror(EINVAL); + if (codepage == 12000) /* little endian */ + wc = (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | buf[0]; + else if (codepage == 12001) /* big endian */ + wc = 
(buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; + + if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) { + cv->mode |= UNICODE_MODE_BOM_DONE; + if (wc == 0xFFFE0000) { + cv->mode |= UNICODE_MODE_SWAPPED; + *wbufsize = 0; + return 4; + } else if (wc == 0x0000FEFF) { + *wbufsize = 0; + return 4; + } + } + + if ((0xD800 <= wc && wc <= 0xDFFF) || 0x10FFFF < wc) + return seterror(EILSEQ); + ucs4_to_utf16(wc, wbuf, wbufsize); + return 4; +} + +static int utf32_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize +) { + uint wc; + + if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) { + int r; + + cv->mode |= UNICODE_MODE_BOM_DONE; + if (bufsize < 4) + return seterror(E2BIG); + if (cv->codepage == 12000) /* little endian */ + memcpy(buf, "\xFF\xFE\x00\x00", 4); + else if (cv->codepage == 12001) /* big endian */ + memcpy(buf, "\x00\x00\xFE\xFF", 4); + + r = utf32_wctomb(cv, wbuf, wbufsize, buf + 4, bufsize - 4); + if (r == -1) + return -1; + return r + 4; + } + + if (bufsize < 4) + return seterror(E2BIG); + wc = utf16_to_ucs4(wbuf); + if (cv->codepage == 12000) { /* little endian */ + buf[0] = wc & 0x000000FF; + buf[1] = (wc & 0x0000FF00) >> 8; + buf[2] = (wc & 0x00FF0000) >> 16; + buf[3] = (wc & 0xFF000000) >> 24; + } else if (cv->codepage == 12001) { /* big endian */ + buf[0] = (wc & 0xFF000000) >> 24; + buf[1] = (wc & 0x00FF0000) >> 16; + buf[2] = (wc & 0x0000FF00) >> 8; + buf[3] = wc & 0x000000FF; + } + return 4; +} + +/* + * 50220: ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) + * 50221: ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow + * 1 byte Kana) + * 50222: ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte + * Kana - SO/SI) + * + * MultiByteToWideChar() and WideCharToMultiByte() behave differently + * depending on Windows version. On XP, WideCharToMultiByte() doesn't + * terminate result sequence with ascii escape. But Vista does. 
+ * Use MLang instead. + */ + +#define ISO2022_MODE(cs, shift) (((cs) << 8) | (shift)) +#define ISO2022_MODE_CS(mode) (((mode) >> 8) & 0xFF) +#define ISO2022_MODE_SHIFT(mode) ((mode) & 0xFF) + +#define ISO2022_SI 0 +#define ISO2022_SO 1 + +/* shift in */ +static const char iso2022_SI_seq[] = "\x0F"; +/* shift out */ +static const char iso2022_SO_seq[] = "\x0E"; + +typedef struct iso2022_esc_t iso2022_esc_t; +struct iso2022_esc_t { + const char *esc; + int esc_len; + int len; + int cs; +}; + +#define ISO2022JP_CS_ASCII 0 +#define ISO2022JP_CS_JISX0201_ROMAN 1 +#define ISO2022JP_CS_JISX0201_KANA 2 +#define ISO2022JP_CS_JISX0208_1978 3 +#define ISO2022JP_CS_JISX0208_1983 4 +#define ISO2022JP_CS_JISX0212 5 + +static iso2022_esc_t iso2022jp_esc[] = { + {"\x1B\x28\x42", 3, 1, ISO2022JP_CS_ASCII}, + {"\x1B\x28\x4A", 3, 1, ISO2022JP_CS_JISX0201_ROMAN}, + {"\x1B\x28\x49", 3, 1, ISO2022JP_CS_JISX0201_KANA}, + /* unify 1978 with 1983 */ + {"\x1B\x24\x40", 3, 2, ISO2022JP_CS_JISX0208_1983}, + {"\x1B\x24\x42", 3, 2, ISO2022JP_CS_JISX0208_1983}, + {"\x1B\x24\x28\x44", 4, 2, ISO2022JP_CS_JISX0212}, + {NULL, 0, 0, 0} +}; + +static int iso2022jp_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize +) { + iso2022_esc_t *iesc = iso2022jp_esc; + char tmp[MB_CHAR_MAX]; + int insize; + HRESULT hr; + DWORD dummy = 0; + int len; + int esc_len; + int cs; + int shift; + int i; + + if (buf[0] == 0x1B) { + for (i = 0; iesc[i].esc != NULL; ++i) { + esc_len = iesc[i].esc_len; + if (bufsize < esc_len) { + if (strncmp((char *)buf, iesc[i].esc, bufsize) == 0) + return seterror(EINVAL); + } else { + if (strncmp((char *)buf, iesc[i].esc, esc_len) == 0) { + cv->mode = ISO2022_MODE(iesc[i].cs, ISO2022_SI); + *wbufsize = 0; + return esc_len; + } + } + } + /* not supported escape sequence */ + return seterror(EILSEQ); + } else if (buf[0] == iso2022_SO_seq[0]) { + cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SO); + *wbufsize = 0; + return 1; + } else if 
(buf[0] == iso2022_SI_seq[0]) { + cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SI); + *wbufsize = 0; + return 1; + } + + cs = ISO2022_MODE_CS(cv->mode); + shift = ISO2022_MODE_SHIFT(cv->mode); + + /* reset the mode for informal sequence */ + if (buf[0] < 0x20) { + cs = ISO2022JP_CS_ASCII; + shift = ISO2022_SI; + } + + len = iesc[cs].len; + if (bufsize < len) + return seterror(EINVAL); + for (i = 0; i < len; ++i) + if (!(buf[i] < 0x80)) + return seterror(EILSEQ); + esc_len = iesc[cs].esc_len; + memcpy(tmp, iesc[cs].esc, esc_len); + if (shift == ISO2022_SO) { + memcpy(tmp + esc_len, iso2022_SO_seq, 1); + esc_len += 1; + } + memcpy(tmp + esc_len, buf, len); + + if ((cv->codepage == 50220 || cv->codepage == 50221 + || cv->codepage == 50222) && shift == ISO2022_SO) { + /* XXX: shift-out cannot be used for mbtowc (both kernel and + * mlang) */ + esc_len = iesc[ISO2022JP_CS_JISX0201_KANA].esc_len; + memcpy(tmp, iesc[ISO2022JP_CS_JISX0201_KANA].esc, esc_len); + memcpy(tmp + esc_len, buf, len); + } + + insize = len + esc_len; + hr = ConvertINetMultiByteToUnicode(&dummy, cv->codepage, + (const char *)tmp, &insize, (wchar_t *)wbuf, wbufsize); + if (hr != S_OK || insize != len + esc_len) + return seterror(EILSEQ); + + /* Check for conversion error. Assuming defaultChar is 0x3F. 
*/ + /* ascii should be converted from ascii */ + if (wbuf[0] == buf[0] + && cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI)) + return seterror(EILSEQ); + + /* reset the mode for informal sequence */ + if (cv->mode != ISO2022_MODE(cs, shift)) + cv->mode = ISO2022_MODE(cs, shift); + + return len; +} + +static int iso2022jp_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize +) { + iso2022_esc_t *iesc = iso2022jp_esc; + char tmp[MB_CHAR_MAX]; + int tmpsize = MB_CHAR_MAX; + int insize = wbufsize; + HRESULT hr; + DWORD dummy = 0; + int len; + int esc_len; + int cs; + int shift; + int i; + + /* + * MultiByte = [escape sequence] + character + [escape sequence] + * + * Whether trailing escape sequence is added depends on which API is + * used (kernel or MLang, and its version). + */ + hr = ConvertINetUnicodeToMultiByte(&dummy, cv->codepage, + (const wchar_t *)wbuf, &wbufsize, tmp, &tmpsize); + if (hr != S_OK || insize != wbufsize) + return seterror(EILSEQ); + else if (bufsize < tmpsize) + return seterror(E2BIG); + + if (tmpsize == 1) { + cs = ISO2022JP_CS_ASCII; + esc_len = 0; + } else { + for (i = 1; iesc[i].esc != NULL; ++i) { + esc_len = iesc[i].esc_len; + if (strncmp(tmp, iesc[i].esc, esc_len) == 0) { + cs = iesc[i].cs; + break; + } + } + if (iesc[i].esc == NULL) + /* not supported escape sequence */ + return seterror(EILSEQ); + } + + shift = ISO2022_SI; + if (tmp[esc_len] == iso2022_SO_seq[0]) { + shift = ISO2022_SO; + esc_len += 1; + } + + len = iesc[cs].len; + + /* Check for converting error. Assuming defaultChar is 0x3F. 
*/ + /* ascii should be converted from ascii */ + if (cs == ISO2022JP_CS_ASCII && !(wbuf[0] < 0x80)) + return seterror(EILSEQ); + else if (tmpsize < esc_len + len) + return seterror(EILSEQ); + + if (cv->mode == ISO2022_MODE(cs, shift)) { + /* remove escape sequence */ + if (esc_len != 0) + memmove(tmp, tmp + esc_len, len); + esc_len = 0; + } else { + if (cs == ISO2022JP_CS_ASCII) { + esc_len = iesc[ISO2022JP_CS_ASCII].esc_len; + memmove(tmp + esc_len, tmp, len); + memcpy(tmp, iesc[ISO2022JP_CS_ASCII].esc, esc_len); + } + if (ISO2022_MODE_SHIFT(cv->mode) == ISO2022_SO) { + /* shift-in before changing to other mode */ + memmove(tmp + 1, tmp, len + esc_len); + memcpy(tmp, iso2022_SI_seq, 1); + esc_len += 1; + } + } + + if (bufsize < len + esc_len) + return seterror(E2BIG); + memcpy(buf, tmp, len + esc_len); + cv->mode = ISO2022_MODE(cs, shift); + return len + esc_len; +} + +static int iso2022jp_flush( + csconv_t *cv, + uchar *buf, + int bufsize +) { + iso2022_esc_t *iesc = iso2022jp_esc; + int esc_len; + + if (cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI)) { + esc_len = 0; + if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) + esc_len += 1; + if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) + esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; + if (bufsize < esc_len) + return seterror(E2BIG); + + esc_len = 0; + if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) { + memcpy(buf, iso2022_SI_seq, 1); + esc_len += 1; + } + if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) { + memcpy(buf + esc_len, iesc[ISO2022JP_CS_ASCII].esc, + iesc[ISO2022JP_CS_ASCII].esc_len); + esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; + } + return esc_len; + } + return 0; +} + +#if defined(MAKE_DLL) && defined(USE_LIBICONV_DLL) +BOOL WINAPI DllMain( + HINSTANCE hinstDLL, + DWORD fdwReason, + LPVOID lpReserved +) { + switch ( fdwReason ) { + case DLL_PROCESS_ATTACH: + hwiniconv = (HMODULE)hinstDLL; + break; + case DLL_THREAD_ATTACH: + case DLL_THREAD_DETACH: + case DLL_PROCESS_DETACH: + break; 
+ } + return TRUE; +} +#endif + +#if defined(MAKE_EXE) +#include +#include +#include +int main(int argc, char **argv) { + char *fromcode = NULL; + char *tocode = NULL; + int i; + char inbuf[BUFSIZ]; + char outbuf[BUFSIZ]; + const char *pin; + char *pout; + size_t inbytesleft; + size_t outbytesleft; + size_t rest = 0; + iconv_t cd; + size_t r; + FILE *in = stdin; + FILE *out = stdout; + int ignore = 0; + char *p; + + _setmode(_fileno(stdin), _O_BINARY); + _setmode(_fileno(stdout), _O_BINARY); + + for (i = 1; i < argc; ++i) { + if (strcmp(argv[i], "-l") == 0) { + for (i = 0; codepage_alias[i].name != NULL; ++i) + printf("%s\n", codepage_alias[i].name); + return 0; + } + + if (strcmp(argv[i], "-f") == 0) { + fromcode = argv[++i]; + } else if (strcmp(argv[i], "-t") == 0) { + tocode = argv[++i]; + } else if (strcmp(argv[i], "-c") == 0) { + ignore = 1; + } else if (strcmp(argv[i], "--output") == 0) { + out = fopen(argv[++i], "wb"); + if (out == NULL) { + fprintf(stderr, "cannot open %s\n", argv[i]); + return 1; + } + } else { + in = fopen(argv[i], "rb"); + if (in == NULL) { + fprintf(stderr, "cannot open %s\n", argv[i]); + return 1; + } + break; + } + } + + if (fromcode == NULL || tocode == NULL) { + printf("usage: %s [-c] -f from-enc -t to-enc [file]\n", argv[0]); + return 0; + } + + if (ignore) { + p = tocode; + tocode = (char *)malloc(strlen(p) + strlen("//IGNORE") + 1); + if (tocode == NULL) { + perror("fatal error"); + return 1; + } + strcpy(tocode, p); //NOLINT + strcat(tocode, "//IGNORE"); //NOLINT + } + + cd = iconv_open(tocode, fromcode); + if (cd == (iconv_t)(-1)) { + perror("iconv_open error"); + return 1; + } + + while ((inbytesleft = fread( + inbuf + rest, 1, + sizeof(inbuf) - rest, in)) != 0 + || rest != 0) { + inbytesleft += rest; + pin = inbuf; + pout = outbuf; + outbytesleft = sizeof(outbuf); + r = iconv(cd, &pin, &inbytesleft, &pout, &outbytesleft); + fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out); + if (r == (size_t)(-1) && + errno != E2BIG && + 
(errno != EINVAL || feof(in))) { + perror("conversion error"); + return 1; + } + memmove(inbuf, pin, inbytesleft); + rest = inbytesleft; + } + pout = outbuf; + outbytesleft = sizeof(outbuf); + r = iconv(cd, NULL, NULL, &pout, &outbytesleft); + fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out); + if (r == (size_t)(-1)) { + perror("conversion error"); + return 1; + } + + iconv_close(cd); + + return 0; +} +#endif From 770b8104c7a1d9ec14f33ff1b4bc4bf7a96f578a Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 9 May 2021 12:38:53 -0500 Subject: [PATCH 12/35] Fix C src warnings raised as compiled errors --- pandas/_libs/src/librdata/CKHashTable.c | 2 +- pandas/_libs/src/librdata/rdata_read.c | 95 +++++++++++++++++-------- pandas/_libs/src/librdata/rdata_write.c | 2 +- setup.py | 4 +- 4 files changed, 70 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/src/librdata/CKHashTable.c b/pandas/_libs/src/librdata/CKHashTable.c index 6178f0360a7d9..c0312e3f5dc74 100644 --- a/pandas/_libs/src/librdata/CKHashTable.c +++ b/pandas/_libs/src/librdata/CKHashTable.c @@ -330,7 +330,7 @@ int ck_hash_table_grow(ck_hash_table_t *table) { } table->capacity = new_capacity; table->count = 0; - for (int i = 0; i < old_capacity; i++) { + for (unsigned int i = 0; i < old_capacity; i++) { if (old_entries[i].key_length != 0) { char *this_key = &table->keys[old_entries[i].key_offset]; uint64_t hash_key = ck_hash_str( diff --git a/pandas/_libs/src/librdata/rdata_read.c b/pandas/_libs/src/librdata/rdata_read.c index 42eda92d61841..3d08a0ad28470 100644 --- a/pandas/_libs/src/librdata/rdata_read.c +++ b/pandas/_libs/src/librdata/rdata_read.c @@ -17,7 +17,7 @@ Copyright (c) 2020 Evan Miller #ifdef _WIN32 #include "win_iconv.h" #else -#include "/usr/include/iconv.h" +#include #endif #include @@ -449,11 +449,23 @@ static int lseek_st(rdata_ctx_t *ctx, size_t len) { ) { int retval = 0; char *buf = rdata_malloc(len); - if (buf == NULL) { - retval = -1; - } else if (read_st(ctx, buf, len) != 
len) { - retval = -1; + + int result_st = read_st(ctx, buf, len); + + if (result_st > 0) { + if (buf == NULL) { + retval = -1; + } else if ((size_t)result_st != len) { + retval = -1; + } + } else { + if (buf == NULL) { + retval = -1; + } else { + retval = -1; + } } + if (buf) free(buf); @@ -801,9 +813,18 @@ rdata_error_t rdata_parse( v2_header.reader_version = byteswap4(v2_header.reader_version); } - if (is_rdata && v2_header.format_version != header_line[3] - '0') { - retval = RDATA_ERROR_PARSE; - goto cleanup; + int32_t hdr_result = header_line[3] - '0'; + + if (hdr_result > 0) { + if (is_rdata && v2_header.format_version != (uint32_t)hdr_result) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + } else { + if (is_rdata) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } } if (v2_header.format_version == 3) { @@ -1015,8 +1036,10 @@ static rdata_error_t read_sexptype_header( retval = RDATA_ERROR_READ; goto cleanup; } - if (ctx->machine_needs_byteswap) - header_info->attributes = byteswap4(header_info->attributes); + if (ctx->machine_needs_byteswap) { + int32_t hdr_info_attrs = header_info->attributes; + header_info->attributes = byteswap4(hdr_info_attrs); + } } if (header.tag) { if (read_st(ctx, &tag, sizeof(tag)) != sizeof(tag)) { @@ -1099,7 +1122,7 @@ static int handle_vector_attribute( if ((retval = read_length(&length, ctx)) != RDATA_OK) goto cleanup; - if (length <= sizeof(ctx->dims)/sizeof(ctx->dims[0])) { + if ((uint32_t)length <= sizeof(ctx->dims)/sizeof(ctx->dims[0])) { int buf_len = length * sizeof(int32_t); if (read_st(ctx, ctx->dims, buf_len) != buf_len) { retval = RDATA_ERROR_READ; @@ -1147,7 +1170,7 @@ static rdata_error_t read_character_string(char **key, rdata_ctx_t *ctx) { if (ctx->machine_needs_byteswap) length = byteswap4(length); - if (length == -1 || length == 0) { + if ((int32_t)length == -1 || length == 0) { *key = strdup(""); return RDATA_OK; } @@ -1521,13 +1544,13 @@ static rdata_error_t read_altrep_vector( static rdata_error_t 
read_generic_list(int attributes, rdata_ctx_t *ctx) { rdata_error_t retval = RDATA_OK; int32_t length; - int i; + unsigned int i; rdata_sexptype_info_t sexptype_info; if ((retval = read_length(&length, ctx)) != RDATA_OK) goto cleanup; - for (i=0; i < length; i++) { + for (i=0; i < (uint32_t)length; i++) { if ((retval = read_sexptype_header( &sexptype_info, ctx)) != RDATA_OK ) @@ -1650,11 +1673,14 @@ static rdata_error_t read_string_vector_n( if ((retval = read_length(&string_length, ctx)) != RDATA_OK) goto cleanup; - if (string_length + 1 > buffer_size) { - buffer_size = string_length + 1; - if ((buffer = rdata_realloc(buffer, buffer_size)) == NULL) { - retval = RDATA_ERROR_MALLOC; - goto cleanup; + int32_t str_len_calc = string_length + 1; + if (str_len_calc > 0) { + if ((uint32_t)str_len_calc > buffer_size) { + buffer_size = str_len_calc; + if ((buffer = rdata_realloc(buffer, buffer_size)) == NULL) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } } } @@ -1673,15 +1699,19 @@ static rdata_error_t read_string_vector_n( } else if (!ctx->converter) { cb_retval = text_value_handler(buffer, i, callback_ctx); } else { - if (4*string_length + 1 > utf8_buffer_size) { - utf8_buffer_size = 4*string_length + 1; - if ((utf8_buffer = rdata_realloc( - utf8_buffer, utf8_buffer_size)) == NULL - ) { - retval = RDATA_ERROR_MALLOC; - goto cleanup; + int32_t str_len_calc = 4*string_length + 1; + if (str_len_calc >= 0) { + if ((uint32_t)str_len_calc > utf8_buffer_size) { + utf8_buffer_size = str_len_calc; + if ((utf8_buffer = rdata_realloc( + utf8_buffer, utf8_buffer_size)) == NULL + ) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } } } + retval = rdata_convert( utf8_buffer, utf8_buffer_size, @@ -1749,7 +1779,7 @@ static rdata_error_t read_value_vector_cb( void *vals = NULL; size_t buf_len = 0; enum rdata_type_e output_data_type; - int i; + unsigned int i; switch (header.type) { case RDATA_SEXPTYPE_REAL_VECTOR: @@ -1783,7 +1813,14 @@ static rdata_error_t read_value_vector_cb( 
goto cleanup; } - if (read_st(ctx, vals, buf_len) != buf_len) { + ssize_t result_st = read_st(ctx, vals, buf_len); + + if (result_st > 0) { + if ((size_t)result_st != buf_len) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + } else { retval = RDATA_ERROR_READ; goto cleanup; } @@ -1946,7 +1983,7 @@ static rdata_error_t recursive_discard( rdata_sexptype_info_t prot, tag; rdata_error_t error = 0; - int i; + unsigned int i; switch (sexptype_header.type) { case RDATA_SEXPTYPE_SYMBOL: diff --git a/pandas/_libs/src/librdata/rdata_write.c b/pandas/_libs/src/librdata/rdata_write.c index 3a53f595e877e..0383dd85f4ace 100644 --- a/pandas/_libs/src/librdata/rdata_write.c +++ b/pandas/_libs/src/librdata/rdata_write.c @@ -167,7 +167,7 @@ static rdata_error_t rdata_write_string( if (retval != RDATA_OK) goto cleanup; - ssize_t len = string ? strlen(string) : -1; + ssize_t len = string ? (ssize_t)strlen(string) : -1; retval = rdata_write_integer(writer, len); if (retval != RDATA_OK) diff --git a/setup.py b/setup.py index 7b316e1fc0de1..c40fc9a22fb14 100755 --- a/setup.py +++ b/setup.py @@ -587,8 +587,8 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): include = data.get("include", []) include.append(numpy.get_include()) - if name == "io.rdata._rdata" and not is_platform_windows(): - extra_link_args.append("-liconv") + if name == "io.rdata._rdata": + extra_compile_args = ["-D_GNU_SOURCE"] + extra_compile_args obj = Extension( f"pandas.{name}", From 5ce5c05580b5d4aa2b94b336672ac46a209822ba Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 9 May 2021 12:45:49 -0500 Subject: [PATCH 13/35] Remove pyreadr listing in yml files and docs --- ci/deps/actions-37.yaml | 1 - ci/deps/azure-macos-37.yaml | 1 - ci/deps/azure-windows-37.yaml | 1 - doc/source/getting_started/install.rst | 1 - environment.yml | 1 - requirements-dev.txt | 1 - 6 files changed, 6 deletions(-) diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml index 64a1015d64acb..f29830e9b3e79 100644 --- 
a/ci/deps/actions-37.yaml +++ b/ci/deps/actions-37.yaml @@ -25,5 +25,4 @@ dependencies: - flask - tabulate - pyreadstat - - pyreadr - pip diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index 1812fd16e3668..8c8b49ff3df5b 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -33,5 +33,4 @@ dependencies: - pip: - cython>=0.29.21 - pyreadstat - - pyreadr - pyxlsb diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 6e7be62cdc56f..c9d22ffbead45 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -37,7 +37,6 @@ dependencies: - xlsxwriter - xlwt - pyreadstat - - pyreadr - pip - pip: - pyxlsb diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index a1b089098e8b3..16beb00d201b7 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -360,7 +360,6 @@ zlib Compression for HDF5 fastparquet 0.4.0 Parquet reading / writing pyarrow 0.15.0 Parquet, ORC, and feather reading / writing pyreadstat SPSS files (.sav) reading -pyreadr R files (.RData, .rda, .rds) reading / writing ========================= ================== ============================================================= .. 
_install.warn_orc: diff --git a/environment.yml b/environment.yml index 10aff870e224e..30fa7c0dea696 100644 --- a/environment.yml +++ b/environment.yml @@ -112,7 +112,6 @@ dependencies: - xarray # DataFrame.to_xarray - cftime # Needed for downstream xarray.CFTimeIndex test - pyreadstat # pandas.read_spss - - pyreadr # pandas.read_rdata, DataFrame.to_rdata - tabulate>=0.8.3 # DataFrame.to_markdown - natsort # DataFrame.sort_values - pip: diff --git a/requirements-dev.txt b/requirements-dev.txt index c12f237d3a6bf..3e421c7715566 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -76,7 +76,6 @@ sqlalchemy xarray cftime pyreadstat -pyreadr tabulate>=0.8.3 natsort git+https://github.com/pydata/pydata-sphinx-theme.git@master From f9a23cd9ea0bd9d0796b9f6b7573089c4c997792 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 9 May 2021 16:49:07 -0500 Subject: [PATCH 14/35] Fix C src warnings, syntax, and add unix_iconv.h --- pandas/_libs/src/librdata/rdata_read.c | 8 ++-- pandas/_libs/src/librdata/unix_iconv.h | 60 ++++++++++++++++++++++++++ pandas/_libs/src/librdata/win_iconv.c | 2 +- setup.py | 3 -- 4 files changed, 66 insertions(+), 7 deletions(-) create mode 100644 pandas/_libs/src/librdata/unix_iconv.h diff --git a/pandas/_libs/src/librdata/rdata_read.c b/pandas/_libs/src/librdata/rdata_read.c index 3d08a0ad28470..4bf181cb559e0 100644 --- a/pandas/_libs/src/librdata/rdata_read.c +++ b/pandas/_libs/src/librdata/rdata_read.c @@ -17,7 +17,7 @@ Copyright (c) 2020 Evan Miller #ifdef _WIN32 #include "win_iconv.h" #else -#include +#include "unix_iconv.h" #endif #include @@ -877,6 +877,9 @@ static rdata_error_t read_toplevel_object( rdata_sexptype_info_t sexptype_info; rdata_error_t retval = RDATA_OK; + sexptype_info.attributes = 0; + sexptype_info.tag = 0; + sexptype_info.ref = 0; if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) goto cleanup; @@ -1037,8 +1040,7 @@ static rdata_error_t read_sexptype_header( goto cleanup; } if 
(ctx->machine_needs_byteswap) { - int32_t hdr_info_attrs = header_info->attributes; - header_info->attributes = byteswap4(hdr_info_attrs); + header_info->attributes = byteswap4(header_info->attributes); } } if (header.tag) { diff --git a/pandas/_libs/src/librdata/unix_iconv.h b/pandas/_libs/src/librdata/unix_iconv.h new file mode 100644 index 0000000000000..58ee38c36dd9c --- /dev/null +++ b/pandas/_libs/src/librdata/unix_iconv.h @@ -0,0 +1,60 @@ +/* Copyright (C) 1997-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef PANDAS__LIBS_SRC_LIBRDATA_UNIX_ICONV_H_ +#define PANDAS__LIBS_SRC_LIBRDATA_UNIX_ICONV_H_ + +#ifndef _ICONV_H +#define _ICONV_H 1 + +#include +#define __need_size_t +#include + + +__BEGIN_DECLS + +/* Identifier for conversion method from one codeset to another. */ +typedef void *iconv_t; + + +/* Allocate descriptor for code conversion from codeset FROMCODE to + codeset TOCODE. + + This function is a possible cancellation point and therefore not + marked with __THROW. */ +extern iconv_t iconv_open(const char *__tocode, const char *__fromcode); + +/* Convert at most *INBYTESLEFT bytes from *INBUF according to the + code conversion algorithm specified by CD and place up to + *OUTBYTESLEFT bytes in buffer at *OUTBUF. 
*/ +extern size_t iconv(iconv_t __cd, char **__restrict __inbuf, + size_t *__restrict __inbytesleft, + char **__restrict __outbuf, + size_t *__restrict __outbytesleft); + +/* Free resources allocated for descriptor CD for code conversion. + + This function is a possible cancellation point and therefore not + marked with __THROW. */ +extern int iconv_close(iconv_t __cd); + +__END_DECLS + +#endif /* iconv.h */ + +#endif // PANDAS__LIBS_SRC_LIBRDATA_UNIX_ICONV_H_ diff --git a/pandas/_libs/src/librdata/win_iconv.c b/pandas/_libs/src/librdata/win_iconv.c index 23d9938b3d795..b772cd1bb8a22 100644 --- a/pandas/_libs/src/librdata/win_iconv.c +++ b/pandas/_libs/src/librdata/win_iconv.c @@ -592,7 +592,7 @@ static struct { * IBM EBCDIC France (20297 + Euro symbol); * IBM EBCDIC (France-Euro) */ - {1147, "IBM01147"} + {1147, "IBM01147"}, /* * IBM EBCDIC International (500 + Euro symbol); * IBM EBCDIC (International-Euro) diff --git a/setup.py b/setup.py index 45533dd3b2e0f..bf162998b9a07 100755 --- a/setup.py +++ b/setup.py @@ -588,9 +588,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): include = data.get("include", []) include.append(numpy.get_include()) - if name == "io.rdata._rdata": - extra_compile_args = ["-D_GNU_SOURCE"] + extra_compile_args - obj = Extension( f"pandas.{name}", sources=sources, From 952889f9b3af9503769ff6cbecceff3e6a881fe0 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 9 May 2021 22:40:21 -0500 Subject: [PATCH 15/35] Fix docstring issue and add mac_iconv.h --- pandas/_libs/src/librdata/mac_iconv.h | 222 +++++++++++++++++++++++++ pandas/_libs/src/librdata/rdata_read.c | 4 +- pandas/core/frame.py | 11 +- 3 files changed, 230 insertions(+), 7 deletions(-) create mode 100644 pandas/_libs/src/librdata/mac_iconv.h diff --git a/pandas/_libs/src/librdata/mac_iconv.h b/pandas/_libs/src/librdata/mac_iconv.h new file mode 100644 index 0000000000000..ce6fd073d7d02 --- /dev/null +++ b/pandas/_libs/src/librdata/mac_iconv.h @@ -0,0 +1,222 @@ +/* 
Copyright (C) 1999-2003, 2005-2006 Free Software Foundation, Inc. + This file is part of the GNU LIBICONV Library. + + The GNU LIBICONV Library is free software; you can redistribute it + and/or modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + The GNU LIBICONV Library is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU LIBICONV Library; see the file COPYING.LIB. + If not, write to the Free Software Foundation, Inc., 51 Franklin Street, + Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* When installed, this file is called "iconv.h". */ + +#ifndef PANDAS__LIBS_SRC_LIBRDATA_MAC_ICONV_H_ +#define PANDAS__LIBS_SRC_LIBRDATA_MAC_ICONV_H_ + +#ifndef _LIBICONV_H +#define _LIBICONV_H + +#include +#include <_types.h> +#include + +#define _LIBICONV_VERSION 0x010B /* version number: (major<<8) + minor */ + +#if BUILDING_LIBICONV +#define __LIBICONV_DLL_EXPORTED __attribute__((__visibility__("default"))) +#else +#define __LIBICONV_DLL_EXPORTED +#endif +extern __LIBICONV_DLL_EXPORTED int _libiconv_version; /* Likewise */ + +/* We would like to #include any system header file which could define + iconv_t, 1. in order to eliminate the risk that the user gets compilation + errors because some other system header file includes /usr/include/iconv.h + which defines iconv_t or declares iconv after this file, 2. when compiling + for LIBICONV_PLUG, we need the proper iconv_t type in order to produce + binary compatible code. + But gcc's #include_next is not portable. 
Thus, once libiconv's iconv.h + has been installed in /usr/local/include, there is no way any more to + include the original /usr/include/iconv.h. We simply have to get away + without it. + Ad 1. The risk that a system header file does + #include "iconv.h" or #include_next "iconv.h" + is small. They all do #include . + Ad 2. The iconv_t type is a pointer type in all cases I have seen. (It + has to be a scalar type because (iconv_t)(-1) is a possible return value + from iconv_open().) */ + +/* Define iconv_t ourselves. */ +#ifndef _ICONV_T +#define _ICONV_T +typedef void* iconv_t; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Allocates descriptor for code conversion from encoding `fromcode' to + encoding `tocode'. */ +extern __LIBICONV_DLL_EXPORTED iconv_t iconv_open( + const char* __tocode, + const char* __fromcode); + +/* Converts, using conversion descriptor `cd', at most `*inbytesleft' bytes + starting at `*inbuf', writing at most `*outbytesleft' bytes starting at + `*outbuf'. + Decrements `*inbytesleft' and increments `*inbuf' by the same amount. + Decrements `*outbytesleft' and increments `*outbuf' by the same amount. */ +extern __LIBICONV_DLL_EXPORTED size_t iconv( + iconv_t __cd, + char* * __restrict __inbuf, + size_t * __restrict __inbytesleft, + char* * __restrict __outbuf, + size_t * __restrict __outbytesleft); + +/* Frees resources allocated for conversion descriptor `cd'. */ +extern __LIBICONV_DLL_EXPORTED int iconv_close(iconv_t _cd); + +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) + +/* Nonstandard extensions. */ + +#include + +/* Control of attributes. */ +extern __LIBICONV_DLL_EXPORTED int iconvctl( + iconv_t cd, + int request, + void* argument); + +/* Hook performed after every successful conversion of a Unicode character. */ +typedef void (*iconv_unicode_char_hook)(unsigned int uc, void* data); +/* Hook performed after every successful conversion of a wide character. 
*/ +typedef void (*iconv_wide_char_hook)(wchar_t wc, void* data); +/* Set of hooks. */ +struct iconv_hooks { + iconv_unicode_char_hook uc_hook; + iconv_wide_char_hook wc_hook; + void* data; +}; + +/* Fallback function. Invoked when a small number of bytes could not be + converted to a Unicode character. This function should process all + bytes from inbuf and may produce replacement Unicode characters by calling + the write_replacement callback repeatedly. */ +typedef void (*iconv_unicode_mb_to_uc_fallback)( + const char* inbuf, size_t inbufsize, + void (*write_replacement)( + const unsigned int *buf, + size_t buflen, + void* callback_arg), + void* callback_arg, + void* data); +/* Fallback function. Invoked when a Unicode character could not be converted + to the target encoding. This function should process the character and + may produce replacement bytes (in the target encoding) by calling the + write_replacement callback repeatedly. */ +typedef void (*iconv_unicode_uc_to_mb_fallback)( + unsigned int code, + void (*write_replacement)( + const char *buf, + size_t buflen, + void* callback_arg), + void* callback_arg, + void* data); +#if 1 +/* Fallback function. Invoked when a number of bytes could not be converted to + a wide character. This function should process all bytes from inbuf and may + produce replacement wide characters by calling the write_replacement + callback repeatedly. */ +typedef void (*iconv_wchar_mb_to_wc_fallback)( + const char* inbuf, size_t inbufsize, + void (*write_replacement)( + const wchar_t *buf, + size_t buflen, + void* callback_arg), + void* callback_arg, + void* data); +/* Fallback function. Invoked when a wide character could not be converted to + the target encoding. This function should process the character and may + produce replacement bytes (in the target encoding) by calling the + write_replacement callback repeatedly. 
*/ +typedef void (*iconv_wchar_wc_to_mb_fallback)( + wchar_t code, + void (*write_replacement)( + const char *buf, + size_t buflen, + void* callback_arg), + void* callback_arg, + void* data); +#else +/* If the wchar_t type does not exist, these two fallback functions are never + invoked. Their argument list therefore does not matter. */ +typedef void (*iconv_wchar_mb_to_wc_fallback) (); +typedef void (*iconv_wchar_wc_to_mb_fallback) (); +#endif +/* Set of fallbacks. */ +struct iconv_fallbacks { + iconv_unicode_mb_to_uc_fallback mb_to_uc_fallback; + iconv_unicode_uc_to_mb_fallback uc_to_mb_fallback; + iconv_wchar_mb_to_wc_fallback mb_to_wc_fallback; + iconv_wchar_wc_to_mb_fallback wc_to_mb_fallback; + void* data; +}; + +/* Requests for iconvctl. */ +#define ICONV_TRIVIALP 0 /* int *argument */ +#define ICONV_GET_TRANSLITERATE 1 /* int *argument */ +#define ICONV_SET_TRANSLITERATE 2 /* const int *argument */ +#define ICONV_GET_DISCARD_ILSEQ 3 /* int *argument */ +#define ICONV_SET_DISCARD_ILSEQ 4 /* const int *argument */ +/* const struct iconv_hooks *argument */ +#define ICONV_SET_HOOKS 5 +/* const struct iconv_fallbacks *argument */ +#define ICONV_SET_FALLBACKS 6 + +/* Listing of locale independent encodings. */ +extern __LIBICONV_DLL_EXPORTED void iconvlist( + int (*do_one)( + unsigned int namescount, + const char * const * names, + void* data), + void* data); + +/* Canonicalize an encoding name. + The result is either a canonical encoding name, or name itself. */ +extern __LIBICONV_DLL_EXPORTED const char * iconv_canonicalize( + const char * name); + +/* Support for relocatable packages. */ + +/* Sets the original and the current installation prefix of the package. + Relocation simply replaces a pathname starting with the original prefix + by the corresponding pathname with the current prefix instead. Both + prefixes should be directory names without trailing slash (i.e. use "" + instead of "/"). 
*/ +extern __LIBICONV_DLL_EXPORTED void libiconv_set_relocation_prefix( + const char *orig_prefix, + const char *curr_prefix); + +#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ + + +#ifdef __cplusplus +} +#endif + + +#endif /* _LIBICONV_H */ + +#endif // PANDAS__LIBS_SRC_LIBRDATA_MAC_ICONV_H_ diff --git a/pandas/_libs/src/librdata/rdata_read.c b/pandas/_libs/src/librdata/rdata_read.c index 4bf181cb559e0..a1139de1d0fe9 100644 --- a/pandas/_libs/src/librdata/rdata_read.c +++ b/pandas/_libs/src/librdata/rdata_read.c @@ -14,8 +14,10 @@ Copyright (c) 2020 Evan Miller #include #include -#ifdef _WIN32 +#if defined(_WIN32) #include "win_iconv.h" +#elif __APPLE__ +#include "mac_iconv.h" #else #include "unix_iconv.h" #endif diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1a7c7d4ce86f6..8ee62a2ef3487 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2346,13 +2346,12 @@ def to_rdata( Notes ----- For more information of R serialization data types, see docs on - `rda`_ and `rds`_ formats. + rda_ and rds_ formats. - .. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/ - topics/save - - .. _rds: https://www.rdocumentation.org/packages/base/versions/3.6.2/ - topics/readRDS + .. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/\ +topics/save + .. 
_rds: https://www.rdocumentation.org/packages/base/versions/3.6.2/\ +topics/readRDS Examples -------- From 4a0cf892683133dec9265ebd011b12a0370ae8c0 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 10 May 2021 21:05:38 -0500 Subject: [PATCH 16/35] Adjust Cython scripts to fix write rdata for Windows, revert Mac iconv --- pandas/_libs/src/librdata/rdata_read.c | 6 ++--- pandas/io/rdata/_rdata.pxd | 29 +++++++++++++++------ pandas/io/rdata/_rdata.pyx | 36 +++++++++++++++++++++----- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/src/librdata/rdata_read.c b/pandas/_libs/src/librdata/rdata_read.c index a1139de1d0fe9..d311831dcc884 100644 --- a/pandas/_libs/src/librdata/rdata_read.c +++ b/pandas/_libs/src/librdata/rdata_read.c @@ -16,10 +16,10 @@ Copyright (c) 2020 Evan Miller #if defined(_WIN32) #include "win_iconv.h" -#elif __APPLE__ -#include "mac_iconv.h" -#else +#elif __linux__ #include "unix_iconv.h" +#else +#include "" #endif #include diff --git a/pandas/io/rdata/_rdata.pxd b/pandas/io/rdata/_rdata.pxd index 66b829a1e7eec..0189f8e94bcf9 100644 --- a/pandas/io/rdata/_rdata.pxd +++ b/pandas/io/rdata/_rdata.pxd @@ -250,18 +250,31 @@ cdef extern from 'rdata.h': rdata_writer_t *writer ) - cdef extern from "": - int open(const char *path, int oflag, int mode) - IF UNAME_SYSNAME == "Windows": + cdef extern from "": + int _sopen(const char *path, int oflag, int shflag, int pmode) + cdef extern from "": - int close(int fd) - ssize_t write(int fd, const void *buf, size_t nbyte) + int _close(int fd) + ssize_t _write(int fd, const void *buf, size_t nbyte) + + cdef extern from "" nogil: + enum: _O_CREAT + enum: _O_WRONLY + enum: _O_BINARY + enum: _O_U8TEXT + enum: _SH_DENYNO + enum: _S_IREAD + enum: _S_IWRITE + ELSE: + cdef extern from "": + int open(const char *path, int oflag, int mode) + cdef extern from "": int close(int fd) ssize_t write(int fd, const void *buf, size_t nbyte) - cdef extern from "" nogil: - enum: O_CREAT - enum: 
O_WRONLY + cdef extern from "" nogil: + enum: O_CREAT + enum: O_WRONLY diff --git a/pandas/io/rdata/_rdata.pyx b/pandas/io/rdata/_rdata.pyx index 33a6cd60e6396..7f0004dc5428f 100644 --- a/pandas/io/rdata/_rdata.pyx +++ b/pandas/io/rdata/_rdata.pyx @@ -303,7 +303,13 @@ class LibrdataWriterError(Exception): cdef ssize_t write_data(const void *bytes, size_t len, void *ctx): cdef int fd = (ctx)[0] - return write(fd, bytes, len) + + IF UNAME_SYSNAME == "Windows": + result = _write(fd, bytes, len) + ELSE: + result = write(fd, bytes, len) + + return result cdef class LibrdataWriter(): """ @@ -319,7 +325,8 @@ cdef class LibrdataWriter(): dict rdict dict rformats dict rtypes - str tbl_name + bytes file_name + bytes tbl_name rdata_writer_t *writer rdata_column_t *py_col @@ -355,7 +362,8 @@ cdef class LibrdataWriter(): cpdef write_rdata(self, rfile, rdict, rformat, tbl_name=None): self.rdict = rdict - self.tbl_name = tbl_name + self.file_name = rfile.encode("utf-8") + self.tbl_name = tbl_name.encode("utf-8") self.row_count = len(next(iter(rdict["data"].items()))[1]) self.rformats = { @@ -382,7 +390,16 @@ cdef class LibrdataWriter(): "object": RDATA_TYPE_STRING } - self.fd = open(rfile, O_CREAT | O_WRONLY, 0644); + IF UNAME_SYSNAME == "Windows": + self.fd = _sopen( + self.file_name, + _O_CREAT | _O_WRONLY | _O_BINARY | _O_U8TEXT, + _SH_DENYNO, + _S_IREAD | _S_IWRITE + ) + ELSE: + self.fd = open(self.file_name, O_CREAT | O_WRONLY, 0644) + self.writer = rdata_writer_init(write_data, self.rformats[rformat]) for k, v in self.rdict["dtypes"].items(): @@ -400,7 +417,8 @@ cdef class LibrdataWriter(): ): self.write_col_data(n, kd, vd, kt, vt) - except (TypeError, ValueError, UnicodeDecodeError) as e: + except (TypeError, ValueError, UnicodeDecodeError): + self.close_rdata() raise LibrdataWriterError( "DataFrame contains one more invalid types or data values. " "that does not conform to R data types." 
@@ -409,5 +427,11 @@ cdef class LibrdataWriter(): rdata_end_table(self.writer, self.row_count, "pandas_dataframe") rdata_end_file(self.writer) - close(self.fd) + self.close_rdata() rdata_writer_free(self.writer) + + cdef close_rdata(self): + IF UNAME_SYSNAME == "Windows": + _close(self.fd) + ELSE: + close(self.fd) From 09f2005237adfed8d047585d97b72c877a383e7e Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 10 May 2021 23:09:49 -0500 Subject: [PATCH 17/35] Remove quotes in include iconv.h line of C source --- pandas/_libs/src/librdata/mac_iconv.h | 222 ------------------------- pandas/_libs/src/librdata/rdata_read.c | 2 +- 2 files changed, 1 insertion(+), 223 deletions(-) delete mode 100644 pandas/_libs/src/librdata/mac_iconv.h diff --git a/pandas/_libs/src/librdata/mac_iconv.h b/pandas/_libs/src/librdata/mac_iconv.h deleted file mode 100644 index ce6fd073d7d02..0000000000000 --- a/pandas/_libs/src/librdata/mac_iconv.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (C) 1999-2003, 2005-2006 Free Software Foundation, Inc. - This file is part of the GNU LIBICONV Library. - - The GNU LIBICONV Library is free software; you can redistribute it - and/or modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either version 2 - of the License, or (at your option) any later version. - - The GNU LIBICONV Library is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with the GNU LIBICONV Library; see the file COPYING.LIB. - If not, write to the Free Software Foundation, Inc., 51 Franklin Street, - Fifth Floor, Boston, MA 02110-1301, USA. */ - -/* When installed, this file is called "iconv.h". 
*/ - -#ifndef PANDAS__LIBS_SRC_LIBRDATA_MAC_ICONV_H_ -#define PANDAS__LIBS_SRC_LIBRDATA_MAC_ICONV_H_ - -#ifndef _LIBICONV_H -#define _LIBICONV_H - -#include -#include <_types.h> -#include - -#define _LIBICONV_VERSION 0x010B /* version number: (major<<8) + minor */ - -#if BUILDING_LIBICONV -#define __LIBICONV_DLL_EXPORTED __attribute__((__visibility__("default"))) -#else -#define __LIBICONV_DLL_EXPORTED -#endif -extern __LIBICONV_DLL_EXPORTED int _libiconv_version; /* Likewise */ - -/* We would like to #include any system header file which could define - iconv_t, 1. in order to eliminate the risk that the user gets compilation - errors because some other system header file includes /usr/include/iconv.h - which defines iconv_t or declares iconv after this file, 2. when compiling - for LIBICONV_PLUG, we need the proper iconv_t type in order to produce - binary compatible code. - But gcc's #include_next is not portable. Thus, once libiconv's iconv.h - has been installed in /usr/local/include, there is no way any more to - include the original /usr/include/iconv.h. We simply have to get away - without it. - Ad 1. The risk that a system header file does - #include "iconv.h" or #include_next "iconv.h" - is small. They all do #include . - Ad 2. The iconv_t type is a pointer type in all cases I have seen. (It - has to be a scalar type because (iconv_t)(-1) is a possible return value - from iconv_open().) */ - -/* Define iconv_t ourselves. */ -#ifndef _ICONV_T -#define _ICONV_T -typedef void* iconv_t; -#endif - - -#ifdef __cplusplus -extern "C" { -#endif - - -/* Allocates descriptor for code conversion from encoding `fromcode' to - encoding `tocode'. */ -extern __LIBICONV_DLL_EXPORTED iconv_t iconv_open( - const char* __tocode, - const char* __fromcode); - -/* Converts, using conversion descriptor `cd', at most `*inbytesleft' bytes - starting at `*inbuf', writing at most `*outbytesleft' bytes starting at - `*outbuf'. 
- Decrements `*inbytesleft' and increments `*inbuf' by the same amount. - Decrements `*outbytesleft' and increments `*outbuf' by the same amount. */ -extern __LIBICONV_DLL_EXPORTED size_t iconv( - iconv_t __cd, - char* * __restrict __inbuf, - size_t * __restrict __inbytesleft, - char* * __restrict __outbuf, - size_t * __restrict __outbytesleft); - -/* Frees resources allocated for conversion descriptor `cd'. */ -extern __LIBICONV_DLL_EXPORTED int iconv_close(iconv_t _cd); - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) - -/* Nonstandard extensions. */ - -#include - -/* Control of attributes. */ -extern __LIBICONV_DLL_EXPORTED int iconvctl( - iconv_t cd, - int request, - void* argument); - -/* Hook performed after every successful conversion of a Unicode character. */ -typedef void (*iconv_unicode_char_hook)(unsigned int uc, void* data); -/* Hook performed after every successful conversion of a wide character. */ -typedef void (*iconv_wide_char_hook)(wchar_t wc, void* data); -/* Set of hooks. */ -struct iconv_hooks { - iconv_unicode_char_hook uc_hook; - iconv_wide_char_hook wc_hook; - void* data; -}; - -/* Fallback function. Invoked when a small number of bytes could not be - converted to a Unicode character. This function should process all - bytes from inbuf and may produce replacement Unicode characters by calling - the write_replacement callback repeatedly. */ -typedef void (*iconv_unicode_mb_to_uc_fallback)( - const char* inbuf, size_t inbufsize, - void (*write_replacement)( - const unsigned int *buf, - size_t buflen, - void* callback_arg), - void* callback_arg, - void* data); -/* Fallback function. Invoked when a Unicode character could not be converted - to the target encoding. This function should process the character and - may produce replacement bytes (in the target encoding) by calling the - write_replacement callback repeatedly. 
*/ -typedef void (*iconv_unicode_uc_to_mb_fallback)( - unsigned int code, - void (*write_replacement)( - const char *buf, - size_t buflen, - void* callback_arg), - void* callback_arg, - void* data); -#if 1 -/* Fallback function. Invoked when a number of bytes could not be converted to - a wide character. This function should process all bytes from inbuf and may - produce replacement wide characters by calling the write_replacement - callback repeatedly. */ -typedef void (*iconv_wchar_mb_to_wc_fallback)( - const char* inbuf, size_t inbufsize, - void (*write_replacement)( - const wchar_t *buf, - size_t buflen, - void* callback_arg), - void* callback_arg, - void* data); -/* Fallback function. Invoked when a wide character could not be converted to - the target encoding. This function should process the character and may - produce replacement bytes (in the target encoding) by calling the - write_replacement callback repeatedly. */ -typedef void (*iconv_wchar_wc_to_mb_fallback)( - wchar_t code, - void (*write_replacement)( - const char *buf, - size_t buflen, - void* callback_arg), - void* callback_arg, - void* data); -#else -/* If the wchar_t type does not exist, these two fallback functions are never - invoked. Their argument list therefore does not matter. */ -typedef void (*iconv_wchar_mb_to_wc_fallback) (); -typedef void (*iconv_wchar_wc_to_mb_fallback) (); -#endif -/* Set of fallbacks. */ -struct iconv_fallbacks { - iconv_unicode_mb_to_uc_fallback mb_to_uc_fallback; - iconv_unicode_uc_to_mb_fallback uc_to_mb_fallback; - iconv_wchar_mb_to_wc_fallback mb_to_wc_fallback; - iconv_wchar_wc_to_mb_fallback wc_to_mb_fallback; - void* data; -}; - -/* Requests for iconvctl. 
*/ -#define ICONV_TRIVIALP 0 /* int *argument */ -#define ICONV_GET_TRANSLITERATE 1 /* int *argument */ -#define ICONV_SET_TRANSLITERATE 2 /* const int *argument */ -#define ICONV_GET_DISCARD_ILSEQ 3 /* int *argument */ -#define ICONV_SET_DISCARD_ILSEQ 4 /* const int *argument */ -/* const struct iconv_hooks *argument */ -#define ICONV_SET_HOOKS 5 -/* const struct iconv_fallbacks *argument */ -#define ICONV_SET_FALLBACKS 6 - -/* Listing of locale independent encodings. */ -extern __LIBICONV_DLL_EXPORTED void iconvlist( - int (*do_one)( - unsigned int namescount, - const char * const * names, - void* data), - void* data); - -/* Canonicalize an encoding name. - The result is either a canonical encoding name, or name itself. */ -extern __LIBICONV_DLL_EXPORTED const char * iconv_canonicalize( - const char * name); - -/* Support for relocatable packages. */ - -/* Sets the original and the current installation prefix of the package. - Relocation simply replaces a pathname starting with the original prefix - by the corresponding pathname with the current prefix instead. Both - prefixes should be directory names without trailing slash (i.e. use "" - instead of "/"). 
*/ -extern __LIBICONV_DLL_EXPORTED void libiconv_set_relocation_prefix( - const char *orig_prefix, - const char *curr_prefix); - -#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ - - -#ifdef __cplusplus -} -#endif - - -#endif /* _LIBICONV_H */ - -#endif // PANDAS__LIBS_SRC_LIBRDATA_MAC_ICONV_H_ diff --git a/pandas/_libs/src/librdata/rdata_read.c b/pandas/_libs/src/librdata/rdata_read.c index d311831dcc884..dbc165a2273dc 100644 --- a/pandas/_libs/src/librdata/rdata_read.c +++ b/pandas/_libs/src/librdata/rdata_read.c @@ -19,7 +19,7 @@ Copyright (c) 2020 Evan Miller #elif __linux__ #include "unix_iconv.h" #else -#include "" +#include #endif #include From a9da74aeb24884b78c9d96313da939eaa3e9f8e9 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 11 May 2021 10:48:28 -0500 Subject: [PATCH 18/35] Add liconv to extra_link_args for Mac OS build --- setup.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/setup.py b/setup.py index bf162998b9a07..7a2fb2bc55af9 100755 --- a/setup.py +++ b/setup.py @@ -588,6 +588,9 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): include = data.get("include", []) include.append(numpy.get_include()) + if name == "io.rdata._rdata" and is_platform_mac(): + extra_link_args.append(["-liconv"]) + obj = Extension( f"pandas.{name}", sources=sources, @@ -601,6 +604,10 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): extensions.append(obj) + if name == "io.rdata._rdata" and is_platform_mac(): + extra_link_args.remove("-liconv") + + # ---------------------------------------------------------------------- # ujson From 40862c51efe439d9fbcf6634fec54afec34bd1fe Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 11 May 2021 12:18:27 -0500 Subject: [PATCH 19/35] Slight fix to liconv in extra_link_args for Mac OS build --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7a2fb2bc55af9..8635c9eae4384 100755 --- a/setup.py +++ b/setup.py @@ -589,7 +589,7 @@ def 
srcpath(name=None, suffix=".pyx", subdir="src"): include.append(numpy.get_include()) if name == "io.rdata._rdata" and is_platform_mac(): - extra_link_args.append(["-liconv"]) + extra_link_args.append("-liconv") obj = Extension( f"pandas.{name}", From 63968196a2d9c5ab8cd260b0554b269b00c66934 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 11 May 2021 14:57:53 -0500 Subject: [PATCH 20/35] Adjust rdata include_dirs for libiconv on Mac OS --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 8635c9eae4384..689fcdbe20df9 100755 --- a/setup.py +++ b/setup.py @@ -589,7 +589,11 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): include.append(numpy.get_include()) if name == "io.rdata._rdata" and is_platform_mac(): - extra_link_args.append("-liconv") + # non-conda builds must adjust paths to libiconv .h and lib dirs + include = [ + os.path.join(os.environ["CONDA_PREFIX"], "include"), + os.path.join(os.environ["CONDA_PREFIX"], "lib"), + ] + include obj = Extension( f"pandas.{name}", @@ -604,10 +608,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): extensions.append(obj) - if name == "io.rdata._rdata" and is_platform_mac(): - extra_link_args.remove("-liconv") - - # ---------------------------------------------------------------------- # ujson From 749a04ec8f6c1970ce4f1031eff36b921d90aeef Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 11 May 2021 17:20:45 -0500 Subject: [PATCH 21/35] Add library_dirs to find libiconv on Mac OS --- setup.py | 64 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/setup.py b/setup.py index 689fcdbe20df9..9d17103029201 100755 --- a/setup.py +++ b/setup.py @@ -437,19 +437,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "pandas/_libs/tslibs/src/datetime/np_datetime_strings.h", ] -rdata_srcs = [ - "pandas/_libs/src/librdata/rdata_parser.c", - 
"pandas/_libs/src/librdata/rdata_read.c", - "pandas/_libs/src/librdata/rdata_write.c", - "pandas/_libs/src/librdata/rdata_io_unistd.c", - "pandas/_libs/src/librdata/rdata_error.c", - "pandas/_libs/src/librdata/rdata_bits.c", - "pandas/_libs/src/librdata/CKHashTable.c", -] - -if is_platform_windows(): - rdata_srcs.append("pandas/_libs/src/librdata/win_iconv.c") - ext_data = { "_libs.algos": { "pyxfile": "_libs/algos", @@ -570,10 +557,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "_libs.window.indexers": {"pyxfile": "_libs/window/indexers"}, "_libs.writers": {"pyxfile": "_libs/writers"}, "io.sas._sas": {"pyxfile": "io/sas/sas"}, - "io.rdata._rdata": { - "pyxfile": "io/rdata/_rdata", - "sources": rdata_srcs, - }, } extensions = [] @@ -654,6 +637,53 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): # ---------------------------------------------------------------------- +# ---------------------------------------------------------------------- +# rdata + +rdata_srcs = [ + "pandas/io/rdata/_rdata.pyx", + "pandas/_libs/src/librdata/rdata_parser.c", + "pandas/_libs/src/librdata/rdata_read.c", + "pandas/_libs/src/librdata/rdata_write.c", + "pandas/_libs/src/librdata/rdata_io_unistd.c", + "pandas/_libs/src/librdata/rdata_error.c", + "pandas/_libs/src/librdata/rdata_bits.c", + "pandas/_libs/src/librdata/CKHashTable.c", +] + +if is_platform_windows(): + rdata_srcs.append("pandas/_libs/src/librdata/win_iconv.c") + +include = [] +libs_dir = [] +libs = [] +if is_platform_mac(): + # non-conda builds must adjust paths to libiconv .h and lib dirs + include = [ + os.path.join(os.environ["CONDA_PREFIX"], "include"), + ] + libs_dir = [ + os.path.join(os.environ["CONDA_PREFIX"], "lib"), + ] + libs = ["liconv"] + +rdata_ext = Extension( + name="io.rdata._rdata", + sources=rdata_srcs, + include_dirs=include, + library_dirs=libs_dir, + libraries=libs, + language="c", + define_macros=macros, + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, +) + + 
+extensions.append(rdata_ext) + +# ---------------------------------------------------------------------- + if __name__ == "__main__": # Freeze to support parallel compilation when using spawn instead of fork From f5ab7cd1570487067e281f28fbd761601cd30512 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 11 May 2021 19:11:22 -0500 Subject: [PATCH 22/35] Resolve rdata extension name for compilation --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9d17103029201..95d4133af2e29 100755 --- a/setup.py +++ b/setup.py @@ -668,7 +668,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): libs = ["liconv"] rdata_ext = Extension( - name="io.rdata._rdata", + name="pandas.io.rdata._rdata", sources=rdata_srcs, include_dirs=include, library_dirs=libs_dir, From e9852e949ea3400364558eeebc981d17d26bd1dc Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 11 May 2021 20:04:21 -0500 Subject: [PATCH 23/35] Adjust include and library dir in rdata extension for Mac OS --- setup.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 95d4133af2e29..374954ce394b4 100755 --- a/setup.py +++ b/setup.py @@ -658,14 +658,10 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): libs_dir = [] libs = [] if is_platform_mac(): - # non-conda builds must adjust paths to libiconv .h and lib dirs - include = [ - os.path.join(os.environ["CONDA_PREFIX"], "include"), - ] - libs_dir = [ - os.path.join(os.environ["CONDA_PREFIX"], "lib"), - ] - libs = ["liconv"] + os.environ["DYLD_LIBRARY_PATH"] = "" + include = ["/usr/include"] + libs_dir = ["/usr/lib"] + libs = ["iconv"] rdata_ext = Extension( name="pandas.io.rdata._rdata", From a4963810e96e975f293e312f52f13ab1131e2b9a Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sat, 15 May 2021 12:57:32 -0500 Subject: [PATCH 24/35] Adjust docs, tests, and code re dtypes and pickling --- doc/source/user_guide/io.rst | 75 +++++- .../{io/rdata => 
_libs/src/librdata}/rdata.h | 6 +- pandas/_libs/src/librdata/win_iconv.c | 16 +- pandas/_libs/src/librdata/win_iconv.h | 16 +- pandas/core/frame.py | 14 +- pandas/io/rdata/__init__.py | 4 +- pandas/io/rdata/_rdata.pxd | 2 +- pandas/io/rdata/_rdata.pyx | 64 +++-- pandas/io/rdata/rdata_writer.py | 25 +- .../io/data/rdata/planetary_boundaries_df.rda | Bin 0 -> 197 bytes .../io/data/rdata/planetary_boundaries_df.rds | Bin 0 -> 178 bytes pandas/tests/io/data/rdata/ppm_df.rda | Bin 11221 -> 13418 bytes pandas/tests/io/test_rdata.py | 255 ++++++++++++++---- setup.py | 23 +- 14 files changed, 386 insertions(+), 114 deletions(-) rename pandas/{io/rdata => _libs/src/librdata}/rdata.h (98%) create mode 100644 pandas/tests/io/data/rdata/planetary_boundaries_df.rda create mode 100644 pandas/tests/io/data/rdata/planetary_boundaries_df.rds diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 45e6d80ca1f28..a7df3484f700a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -6055,12 +6055,9 @@ will occur: .. code-block:: ipython - In [608]: rds_file = os.path.join(file_path, "env_data_non_dfs.rda") + In [608]: rds_file = pd.read_rdata("env_data_non_dfs.rda") ... - LibrdataParserError: Invalid file, or file has unsupported features - - -.. _io.rdata_writer: + LibrdataReaderError: Invalid file, or file has unsupported features Finally, please note R's ``Date`` (without time component) will translate to ``datetime64`` in pandas. Also, R's date/time field type, ``POSIXct``, that can @@ -6086,6 +6083,36 @@ In pandas, conversion shows adjustment in hours to UTC: r_dfs = pd.read_rdata(os.path.join(file_path, "ppm_df.rda")) r_dfs["ppm_df"].tail() +Below is summary of how ``read_rdata`` handles data types between R and pandas. + +.. 
list-table:: + :widths: 25 25 25 + :header-rows: 1 + + * - R types + - Conversion notes + - pandas types + * - logical + - + - bool + * - integer + - + - int64 + * - numeric + - + - float64 + * - POSIXct + - UTC conversion + - datetime64[ns] + * - factor + - + - Categorical + * - character + - + - object + +.. _io.rdata_writer: + Writing R data '''''''''''''' @@ -6208,6 +6235,44 @@ Once exported, the single DataFrame can be read or loaded in R: 144 Fluorinated gases 2018 182.7824 145 Total 2018 6676.6496 +Please note R does not support all dtypes of pandas. For special dtypes, +you may have to handle data in either end to fit your specific data needs. + +Below is summary of how ``write_rdata`` handles data types between pandas +and R in order to translate pandas simpler dtypes to R's atomic types. + +.. list-table:: + :widths: 25 25 25 + :header-rows: 1 + + * - pandas types + - Conversion notes + - R types + * - bool + - + - logical + * - any uint/int + - + - integer + * - any float + - + - numeric + * - datetime64[ns] + - + - POSIXct + * - datetime64[ns, tz] + - remove tz awareness + - POSIXct + * - timedelta + - convert to seconds + - numeric + * - object + - + - character + * - all other dtypes + - convert to string + - character + .. 
_io.stata: Stata format diff --git a/pandas/io/rdata/rdata.h b/pandas/_libs/src/librdata/rdata.h similarity index 98% rename from pandas/io/rdata/rdata.h rename to pandas/_libs/src/librdata/rdata.h index 7d49be71f16c4..216c2cbab11d0 100644 --- a/pandas/io/rdata/rdata.h +++ b/pandas/_libs/src/librdata/rdata.h @@ -2,8 +2,8 @@ Copyright (c) 2020 Evan Miller */ -#ifndef PANDAS_IO_RDATA_RDATA_H_ -#define PANDAS_IO_RDATA_RDATA_H_ +#ifndef PANDAS__LIBS_SRC_LIBRDATA_RDATA_H_ +#define PANDAS__LIBS_SRC_LIBRDATA_RDATA_H_ #include #include @@ -254,4 +254,4 @@ rdata_error_t rdata_end_file(rdata_writer_t *writer); } // extern c block #endif -#endif // PANDAS_IO_RDATA_RDATA_H_ +#endif // PANDAS__LIBS_SRC_LIBRDATA_RDATA_H_ diff --git a/pandas/_libs/src/librdata/win_iconv.c b/pandas/_libs/src/librdata/win_iconv.c index b772cd1bb8a22..dd5ddc5882abd 100644 --- a/pandas/_libs/src/librdata/win_iconv.c +++ b/pandas/_libs/src/librdata/win_iconv.c @@ -1,8 +1,16 @@ /* - * No Copyright. - * - * iconv implementation using Win32 API to convert. - * This file is placed in the public domain. + +win-iconv - iconv implementation using Win32 API to convert. +Written in 2009-2016 by Yukihiro Nakadaira +and contributors to win-iconv + +To the extent possible under law, the author(s) have dedicated all copyright +and related and neighboring rights to this software to the public domain +worldwide. This software is distributed without any warranty. + +You should have received a copy of the CC0 Public Domain Dedication along with +this software. If not, see http://creativecommons.org/publicdomain/zero/1.0/. + */ /* for WC_NO_BEST_FIT_CHARS */ diff --git a/pandas/_libs/src/librdata/win_iconv.h b/pandas/_libs/src/librdata/win_iconv.h index ac30123596971..da6e9fa4ab96a 100644 --- a/pandas/_libs/src/librdata/win_iconv.h +++ b/pandas/_libs/src/librdata/win_iconv.h @@ -1,8 +1,16 @@ /* - * No Copyright. - * - * iconv implementation using Win32 API to convert. - * This file is placed in the public domain. 
+ +win-iconv - iconv implementation using Win32 API to convert. +Written in 2009-2016 by Yukihiro Nakadaira +and contributors to win-iconv + +To the extent possible under law, the author(s) have dedicated all copyright +and related and neighboring rights to this software to the public domain +worldwide. This software is distributed without any warranty. + +You should have received a copy of the CC0 Public Domain Dedication along with +this software. If not, see http://creativecommons.org/publicdomain/zero/1.0/. + */ #ifndef PANDAS__LIBS_SRC_LIBRDATA_WIN_ICONV_H_ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 39473a199e721..6eee0d70b6b2e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2335,10 +2335,10 @@ def to_rdata( {storage_options} - Returns - ------- - None - Either None for successful output or raises an error. + Raises + ------ + LibrdataWriterError + * If DataFrame types or values not translatable to R data types. See Also -------- @@ -2415,7 +2415,7 @@ def to_rdata( """ from pandas.io.rdata.rdata_writer import RDataWriter - r = RDataWriter( + RDataWriter( self, path_or_buffer=path_or_buffer, file_format=file_format, @@ -2423,9 +2423,7 @@ def to_rdata( index=index, compression=compression, storage_options=storage_options, - ) - - return r.write_data() + ).write_data() @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/io/rdata/__init__.py b/pandas/io/rdata/__init__.py index aee8bdaa19c2c..41ebfa536ec3b 100644 --- a/pandas/io/rdata/__init__.py +++ b/pandas/io/rdata/__init__.py @@ -1 +1,3 @@ -from pandas.io.rdata.rdata_reader import read_rdata # noqa +from pandas.io.rdata.rdata_reader import read_rdata + +__all__ = ["read_rdata"] diff --git a/pandas/io/rdata/_rdata.pxd b/pandas/io/rdata/_rdata.pxd index 0189f8e94bcf9..1ed11347d72be 100644 --- a/pandas/io/rdata/_rdata.pxd +++ b/pandas/io/rdata/_rdata.pxd @@ -13,7 +13,7 @@ from libc.time 
cimport ( ) -cdef extern from 'rdata.h': +cdef extern from '../../_libs/src/librdata/rdata.h': ctypedef enum rdata_type_t: RDATA_TYPE_STRING, diff --git a/pandas/io/rdata/_rdata.pyx b/pandas/io/rdata/_rdata.pyx index 7f0004dc5428f..e9008dcd5c78d 100644 --- a/pandas/io/rdata/_rdata.pyx +++ b/pandas/io/rdata/_rdata.pyx @@ -212,13 +212,16 @@ cdef int handle_dim_name(const char *name, int index, void *ctx) except *: return 0 -class LibrdataParserError(Exception): +class LibrdataReaderError(Exception): """ Base error class to capture exceptions in librdata parsing. """ pass +cdef int length = 40 + + cdef class LibrdataReader: """ Base class to read RData files. @@ -242,7 +245,7 @@ cdef class LibrdataReader: int dims dict dim_str - cpdef read_rdata(self, rfile): + def read_rdata(self, rfile): self.rparser = rdata_parser_init() self.colidx = 0 @@ -289,10 +292,20 @@ cdef class LibrdataReader: if err != RDATA_OK: msg = rdata_error_message(err) - raise LibrdataParserError(msg) + raise LibrdataReaderError(msg) return self.rvalues + cdef bytes get_rparser(self): + return (self.rparser)[:sizeof(rdata_parser_t)*length] + + def __reduce__(self): + rparser = self.get_rparser() + return (rebuild_reader, (rparser,)) + +cpdef object rebuild_reader(bytes data): + return LibrdataReader() + class LibrdataWriterError(Exception): """ @@ -334,23 +347,23 @@ cdef class LibrdataWriter(): py_col = rdata_get_column(self.writer, i) rdata_begin_column(self.writer, py_col, self.row_count) - if vtype == "bool": + if vtype == RDATA_TYPE_LOGICAL: for k, v in vdata.items(): rdata_append_logical_value(self.writer, v) - if vtype.startswith(("int", "uint")): + if vtype == RDATA_TYPE_INT32: for k, v in vdata.items(): rdata_append_int32_value(self.writer, v) - if vtype.startswith("float"): + if vtype == RDATA_TYPE_REAL: for k, v in vdata.items(): rdata_append_real_value(self.writer, v) - if vtype.startswith("datetime64"): + if vtype == RDATA_TYPE_TIMESTAMP: for k, v in vdata.items(): 
rdata_append_timestamp_value(self.writer, v) - if vtype == "object": + if vtype == RDATA_TYPE_STRING: for k, v in vdata.items(): if v == v: rdata_append_string_value(self.writer, v) @@ -359,7 +372,7 @@ cdef class LibrdataWriter(): rdata_end_column(self.writer, py_col) - cpdef write_rdata(self, rfile, rdict, rformat, tbl_name=None): + def write_rdata(self, rfile, rdict, rformat, tbl_name=None): self.rdict = rdict self.file_name = rfile.encode("utf-8") @@ -374,19 +387,9 @@ cdef class LibrdataWriter(): self.rtypes = { "bool": RDATA_TYPE_LOGICAL, - "int8": RDATA_TYPE_INT32, - "int16": RDATA_TYPE_INT32, - "int32": RDATA_TYPE_INT32, - "int64": RDATA_TYPE_INT32, - "uint8": RDATA_TYPE_INT32, - "uint16": RDATA_TYPE_INT32, - "uint32": RDATA_TYPE_INT32, - "uint64": RDATA_TYPE_INT32, - "float8": RDATA_TYPE_REAL, - "float16": RDATA_TYPE_REAL, - "float32": RDATA_TYPE_REAL, - "float64": RDATA_TYPE_REAL, - "datetime64[ns]": RDATA_TYPE_TIMESTAMP, + "int": RDATA_TYPE_INT32, + "float": RDATA_TYPE_REAL, + "datetime": RDATA_TYPE_TIMESTAMP, "object": RDATA_TYPE_STRING } @@ -415,7 +418,7 @@ cdef class LibrdataWriter(): self.rdict["dtypes"].items() ) ): - self.write_col_data(n, kd, vd, kt, vt) + self.write_col_data(n, kd, vd, kt, self.rtypes[vt]) except (TypeError, ValueError, UnicodeDecodeError): self.close_rdata() @@ -435,3 +438,18 @@ cdef class LibrdataWriter(): _close(self.fd) ELSE: close(self.fd) + + cdef bytes get_writer(self): + return (self.writer)[:sizeof(rdata_writer_t)*length] + + cdef bytes get_py_col(self): + return (self.py_col)[:sizeof(rdata_column_t)*length] + + def __reduce__(self): + writer = self.get_writer() + py_col = self.get_py_col() + return (rebuild_writer, (writer, py_col)) + + +cpdef object rebuild_writer(bytes data1, bytes data2): + return LibrdataWriter() diff --git a/pandas/io/rdata/rdata_writer.py b/pandas/io/rdata/rdata_writer.py index dddfe9684353f..5bb5b4f3f3b90 100644 --- a/pandas/io/rdata/rdata_writer.py +++ b/pandas/io/rdata/rdata_writer.py @@ -130,10 
+130,8 @@ def write_data(self) -> None: """ Write DataFrames to R data files. - Converts special class variables (Categorical, datetimes, etc.), - then exports dictionaries of each column to write to disk. For - datetimes, data is converted to epoch seconds with timezone - handling for midnight time-stamped datetimes to align to R types. + Converts non-primitive and non-datetimes to object to align to R + atomic types, then exports dictionaries of each column with meta data. """ self.frame = ( @@ -142,7 +140,8 @@ def write_data(self) -> None: else self.frame.reset_index(drop=True) ) - for col in self.frame.select_dtypes(include="category").columns: + excl_types = ["bool", "number", "object", "datetime", "datetimetz", "timedelta"] + for col in self.frame.select_dtypes(exclude=excl_types).columns: self.frame[col] = self.frame[col].astype(str) for col in self.frame.select_dtypes(include=["datetimetz"]).columns: @@ -153,6 +152,22 @@ def write_data(self) -> None: rdict = {"dtypes": {k: str(v) for k, v in self.frame.dtypes.to_dict().items()}} + for k, v in rdict["dtypes"].items(): + if any(x in v for x in ("bool", "Boolean")): + rdict["dtypes"][k] = "bool" + + elif any(x in v for x in ("int", "uint", "Int", "UInt")): + rdict["dtypes"][k] = "int" + + elif any(x in v for x in ("float", "Float")): + rdict["dtypes"][k] = "float" + + elif any(x in v for x in ("datetime", "Datetime")): + rdict["dtypes"][k] = "datetime" + + elif any(x in v for x in ("object", "string", "String")): + rdict["dtypes"][k] = "object" + for col in self.frame.select_dtypes(include=["datetime"]).columns: self.frame[col] = self.frame[col].values.view("int64") / (10 ** 9) diff --git a/pandas/tests/io/data/rdata/planetary_boundaries_df.rda b/pandas/tests/io/data/rdata/planetary_boundaries_df.rda new file mode 100644 index 0000000000000000000000000000000000000000..0c3308434ccbb41f409d938e620a7244610d474d GIT binary patch literal 197 zcmV;$06PC4iwFP!000000}FDAFy@NjVqjokW?*4pWng9nG8tGyL)>&N7#LWXfE-4k 
z1SgPYPf3eUNi0bPDisEDfl|zTAn8LK>%Xck0BJ)}&YGN)SX>Nb2!Its%?R)h_KZj_ zfpXcv+!A!1B~|%(sZgCTK@j){v(ibB`}Fh%n3YWbNw9_aKX!vrU67ZUn~Lss=G@FY zI5V*V&2E@hE?^KP>ZKI{4L~!Lvnan@4`whg$Y78U{sRHT1q}ZH`9=EdjR61vs!mVI literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/planetary_boundaries_df.rds b/pandas/tests/io/data/rdata/planetary_boundaries_df.rds new file mode 100644 index 0000000000000000000000000000000000000000..b370d2bd13785bbcf969c10b54e47567472b3cc6 GIT binary patch literal 178 zcmV;j08RfNiwFP!000001B>8dU|?WoU}0coU}gm}8CXL@+;lB~V!}WU6OiTu(u{{V z)_+x70Fq~60`Y+wI6;b&a}tY-feZn#Vu&DnfPb)OL~;p~%Le9_pzAED%Fj!M>Vye` zz(1IkPJ-O0r#HZ?Wc*KpElmHh8;t6Lyu{p8bhk6-X6C_}i4|ye!?bdxB$g!Vr4<1U gKr@uHD8F0}W-u?vV2}^~0|Ced0L-Pg4Q&Ac02A#?X#fBK literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/rdata/ppm_df.rda b/pandas/tests/io/data/rdata/ppm_df.rda index 5b967767c7216f84425fb3255f058f461ca54c2f..b900815050a559d007bd1726abfd1cb0f951e2a5 100644 GIT binary patch literal 13418 zcmY+~c|6m9;K1>3r%I)ggjknULM6GkZwcQntEA;zs1{;Lj$vDuq(miA*g}yzSFW)T zl4Hp+=Vnbd8#Z>@_4$q84n~=F+Jy~`25Y+vy+g(>&_g`tQK1>?cWEZM2;lL85jgzIlrd2r%AkLh&XkOV*;rXcJ`W#z6N?)0V9$7x z!#Q)UE5akl^`WIL^|o>X`)3vMvYg__8Mf~Ji+B$e7TZSb$`Oy})(2=#9!O0mG}ax@ zmyGx9GN=z2F}4{mA%1l@`KZ4oH;O#@8P*Gj2(QL&A_AqQ@C_m_qhc>VhI!f?URb5EnUCn^0&XVs!);@9Do zf=mOe+Pv`oqRAJpo%2z!mUDAc&aHL%8RqbN_a2A4Bv_keWmEaZ9(d0K$L^(v|7DQi z@$oxYp00;d2H~_oBOz?P_~LX)L{RD=99Mf~+5{wgGzj-%R}8|#{@9liZ=W$JB_4`S zAB0Or{w*cmo4ry>tO$)CgwHdd4Z_RvamtaaUn%k9c%2V+E@9XQ8w|=V0Rky4H@B_`V3H=hi?WL$O8rn5=U@V zhKEaZ=dfNrMsrwPhmnF*u*F0{@)Rh7o_|UY4{8Wa1VNkxt^BxJb+_=s@a_k@aZFcE z6yvhjbUB=C9d)l8@k%t4|vv_7N}`TO&u~crM)J_Tcy2J|J_1tYSiUv-~!jB6jH=ER0=Vh z3MqwDvR0HrZcH1OLKaR&l|qIhU8bRK)lSpUvNv0$>VLvL;^X)K(FOX~Qsvh<)smF7 zbFv9EM`RfQsyQxsCK}N#F^78=CWcRQD-rKY+G)DH;e?F~IxprE6m^uM7{`t%$CemJ z2Z!$wjvH^GvLtd<%H~X$PYN!(WO}=}*NXGSI1r{On&4qVC)m$l)oVv&Q|6`EZ|quf zt;>9)znA00yDRN+;e?P}reIO~eTrbk?)UCd;|4_8(cbdXj^Q4Sr1W%3^KcP8 z|HqGp8%3`;!m~~kAg2=g*xV}eCW`${vmR{iQuydAU$|h_&$>06_T;o(QL$$@dUcL# 
zHibhI^-B5`IhmOPZ0W`TQNTOqRNMfKSjiv52kKso{8P$wMHlUqKiKu(sPIyP#ripN$#@pi=2p;y-;;CAchU+vg-!g9SgS3dZ1H7e~#D8WOYwh zI2nZmb+UY_ik!ovS?nrUVQ;OTQDk7>N_K@K?2|%GbvF*#;2#$nDL$G9kw?cMckdZ+ zMZ{~-IpOTiyJp1(xdi`=8dL8ZcYppDaF0Q(Iz=Mf&bTo()eFfDz0ygz^Zc(aJMsKH zG7hKVEuVYX%%Iw^r(YFBv*%Z_)BQQ(w#-ncCi1Ez=h1;w9QnIq^x;g4Vzfr>yMn+_ zFW{LPAf}(z+YM$c91mhMX)OH zCRgWwYtYnIJBfHPUD$L6LlrUYKFLruVE{i!SpTc=Y2et`fi?{@V8sY5`A?`nc$35*@b7V0?~%nSX1_EjgsSA6fpiB2z+?n zn}9dTn0YHXTY`Tn>ClcnEx$ppQYQE0N5%Xw{H*-19qB31p{B7FfL*@5Q&F=0@KqVS zG@w*WEaPsT!`>io`2+kMyfcSQ{*)*t<_rAiu>AzzKR`b9?lYjE$mqL*)qcE`C^@IR zZSj7(mJ)A%H}%1uce5)6x%0c0XJ(qX`@*GnUgO)s*niXhaV#vQV#3dhhfynHKqC=k z5*eI~{zW1WRQC*%$X~l&?XtB4Ab6=Ne}KushstD| zJO$hjoo@k*`XSqZstUcG3S=-;IqGyjq(EAJ@sv^TvOkKI7p3w}TwqqB9Kq~*Lf8|O zEXIIQxjm@}k?uu0Fr*3V$-=&)mvj{KpX?B~pK-*iv@P^otmQ$88H$+HI$j$6LDr^mFgt|Pf{mQe@C%=%xyZo;0e@@n5o$?Q2tL%$E- z0xqPFH06eo45LpI_tHk51Z~AI-K)4Eyab|Ef&#caF~YN zA5=5>%xvDJ&yVpOR1yCojpOoOpWt5K)1>Qd^cX7*!q7sj#?%{*`;>DZKe*VGh&GZ; z*Bo);$6@_znd)$#IAwe@nCjGsSY`IP@Z%CK_DQ}+b;p6{=$vQZIUeXacrJLs9xwiQ z*X1!j|E{Ze#;QXejSOO|VH&D#7bGYf7F*Vo#_!aZWuH)aEYh3L;AX1tXv#fhzO^BO zhHt70s+%GQBK@Dl^Xd`EbMx%4q7#jA7~F*q&kFHtHK>Jj#@bB$8La-XaAl&8o{$)N ze>*r;^7xhXBXP@3*=~9Amp>1fNnfmEnMp4%Uf`du_Bg>m9dT88Y+-wq=P0Zuk6B(q zQ^_JVTv4vCv{twcEgaHSPL?*? z<_v!bE-z!26MJ-f5ZJWzERy1BQx*vW)#^b2;`f@5NhQR~;#MxDNqwY36H^xAM(l5 z7yq=!#k2g=qy5UQKZjM02z^fc(=xVck3rrY6J;*FVJ7umt5D`{mNGYR%%tn?yCi#G z#HpiSV@6FRdy7shM}MGC!Ka7JPvJiXC@%mNBd9XZH8s&2uc&QCZ#YUA?nwd+a#wkcM0UMeNJ|e%?xarqPP?Jl zN8D<{Gq;o>T%oT=#*zhlYsjW$xV>tq(*sEp?dNaUp7!GB6&kd=H@sGyaavFw=!H)I zsgjAP1Ha)17L3neKV6R4D^I>u^^uwMdm7ba@TvD~7W53zng#v&bel5#ugc8ND$}h{ z&UmlfLp9L9kMBEyP>ztr&%t@+if!mi zqc97@BLLq2cWJ7R25PKV4^)>p5UZFk&NO|x?syPftrCek4T z)fOqPS(oydTzEh_NsBBA+NQW!zPic1A~|&O&WJH8CVUp+5OMv@bYiSZ!qk2ne}8xC z^GpHv^E;i#(5DABYrBymBPRcF9-CYZ_7wlD{)=)a(`Q? 
z64UGL#5292F%ZJxE>hw|LnMimxV5Htgp~M3ZtEKH*Rk?3F+J)5m%MIrtKv$}8X70r z(wJb^;?Pv*g4;uY79nD+S;H}E0WS~8M!4ZIP;}HMuB8!vbHhaQ<|cBOjY;$f0;vjp zI(krcfjY0eomid{*X%MT1~~>r8`5Pz9!(j^YCqXl-Xh|2HLe;kk}9@XBi5Cqx}Ax5 zK|iYKJ!#^#jUrZPHn~r{4EfJ(q-jG;xUqrcqdi!e4TZavqyBB^-wrX)z<%mp>geA$ z-R$udtGbj)E;@y;s1D154$nGeLESEC_81st=Q5MR^6|=`UMdsyg_$J0Q_W1mNp*S* z>VK?MX8N)+ca@igS5yNtiE<}Snd_gF@ox^KfdT^!2<+k)O;MoF2S#y6oy5N?beh@} zvKsDzT+uo_y7U>hh1k7CFN%@LEy6}0wAlqyAqGvwKKmJo!yNip3$NmbRK8t9oQvE{ zQ+r<{x1r|=)nqo=-xy`xh&N2^zY#5&T`^>_@s$W>{O=|mldb5Z?@ct&M~4fyppSx; zm*&ykU*o`9D+pQ8Ck7Y!?c3iSlw6M|Z9*TtnXb$vUb*ZQ59PAQDawb2jVT|n1yGKT z;@=4ozp< zwXK6#?@Hvd%U7=>62+hKpvppZbiyR{kYtBZgB{;eyNpzlc%T4Z5Je3-p2h(wEK^pb z<_c6m4ZWpxo~FRvDg3#_XskXPq-VJz>tJycJp!;V;UTI1EFnr?L1~}@vb6x_X z+4rW4rHJ;(u3^4_@08sp`NA4ii!w(B6B{w>g$y68@nXDQ5RCns7!{_ZVR{<4cY&}m-s6paH^9J0B2qVkExlj-?nyV>vc_1|DO3m3-UU8EgR%_H8q z`rKP3SA2#AJO&^%04gS~0W@jAZW%ltcs_?cqA+O$^u@%LK#C9cfKVANVT$)+e6f~` zStW#BTZ|7@e=!<&d6O&Mepa9&JY3z++2O@+#+3ahRy#s_P-7#(4i_hNjp z){9XQhF@EZ57uBYn!<4J#bg6PeCi8@7oVR+)U*|S0QC6OM~b<=L`NWk#c2WR#Y9a& zoeNo}3>Ku|Gy`Zc5d?_n07eG6FJCdJ%&BPuAIN6u=|N}{W{XhG2iU0$L`&K}2yMlz z6FPsvKa^jd&h>$4NMpZX*DPkcTrC^$=0hIh9Df08B6Bja+LGK%td*d*2a+Mt65}5L z%ESTh7UL?$$1LUrFtixQQtVc#GV7&nvb1+Cu$y1s59vcAI_KWAiK(L4lEk>_|2^T(|OIw5(UqVADF~*nB7)qMsOK6%(DdS6MDN3;MB{Y~4d3*_t ztYkjEh}eXU0oD=ME?GrX0yZdgWai?)CA#88o=a@Si_l9}h!=6VR2s1vxDPA?RwxW) zP%)lM9LF{S4wuctzwHzKf-89V1$ymgPTG#X=Z<0 z1b;wWj$P%YxkzuFT1;#cyKzY~TL~URT!+=5{Ig^eMME=k}nN~D3BAy2{|F2XJcRw~vKOc%+V#kpK44c`cPgk8;9 zzGNL|(dtFO4uz#GBqDr?TEwE2Hp}&vB|dg++CzhXBi_d@inD0Lk`WA%#J$JR8bYSr0?b>l(RIGMib+($ssh^-Mlx%0I9K`kOfxYSs|He8q6SjJ|FdKbe$mP$>+y@0FIk6Qw0aS6 zqS~G({2G|4fY83){+Zp+nRz#8IrQ3EOf9gIBC25I6tYI32vzHE+A zzf~f)m#cqQrw(y$RR?Oy)`S^vY6FS=j{GUrJj@z0+#We%*lf36cLv4fX#YJSJGZNW z&L^1-qp$V#GNxm`@E*Q(7!|H-fzv#Q@#cOV8;7YoAq4B}qNC#;V?Fv7?TYi$y@RW$a7>#SAF30{< zkpBg0TO+;@PuCB`rzO@?hwBI3V~Tq}b6QC!@(tzlx^n3XNzn)3xOZacFfO5cPF(di zEawjO1TZ*7y;z~M0sf%={2(=^eh*Wk8dlCs@njy?1i!U#bjDg)ne6HAstA7>YLFCJ 
z7$poCl&*p#jFa#3Q=h`A&d6nw+l`nlg5O+D&PD30=3k_XKdL!D8s+dg&OeyI4@C11 zB+Rh&lUef~dT|dlLFOP$xVt*s1`%)47BWhDuBYi|@*qN#O~Iz%UI^4(#Pznwnl{Z}Dn{%+)UG)m~VzJoi0^4t*@tH|Xbaj7fHmb5NKkG%Co+?? zABo54#d_KKnQtLdPnr*yJGxAp{mZyA=q=wc7`mPQ^N8-)woRhj!=UqWoQHW^;x-%V zJBz^YX0N5{+<4${3cZ^KS_#EYlzRd}=l@N*)|s#P+b#2{5a=#1{ibDzMBWo_tY zIAWtr@hCWWqpd4vM@JlDRakvrZ?%a!1MYw};PtkP!>%0p6ZT{~{k!YS;E1(yHE(lu z+f;G_Bt8IfagQ7t-#N4j$d>o^(42 zFB&j*AWUWQr)BSs;L_7o)9o)iDp#ulI3#+f);%m9OSKrLL!>#qH~ER>);FAG1^-3Y ztOqh`N}DZzYx(n&c!9InAEBDK@{1dvSfd(@7{%kI)UlbP3enl;@dmMh}rPI^7Yzta|&p zjHjHlvPlB%I@p@=r)}T#%m>KvOa4*QY^Ux6-K4l?yQofc#>rNygV+nlu>Jn-L|yEQ zxj&<;@o_}V7{iD4qET^b)+846Z`+|^ znrxPU_)a@`OH4Z$D4vOYi{fs#cEhUCpNOt7f?L}?_kdx>*|>u0%;(QVyEPf1WW!dQ za2HcqSn`QBcV_4sbmA5RhAPad84Ei=Zz$M7B3@_CtTUC}tM_J3_@A>*H}J454cD;^ z(xHzf!{o;#RKb=Rf;W>HnIg2_0*A4%r))ZPG#O}39ywXf)6j?EVrKw5D?=t?ZNCI$ zQ!Y#&aq__HMAkKxk|cj3Uqz$^cS#?!&-$jwW(83St(;JgJ1sngQ#W+c#_bP-rV?E4n@T}WL1U2-Co=3{lL=lv1Kbi;(*plsox$wb9$$IN0?!VCnzetdDQ!f zZTJRRyFEa0BwITGbs;+B*?9+#*;)rkO}k39d$XlUtt)X1e}|ZN1Q<<+kEKJ#7G7ZP zl&e+4%_aSE@QnSXyWo3R$2#XZwhk+xe%oM)U37`IdNuHK71;{4oPVqJJMPmS`J+`ozcqS4Wh0OiryTbj`&d?VS%l& zguZ+HqO4&!(eX4SdrJtjCws|Ej9CZ*#E9(Rc^LW)tZ!v;Avg(G^c zb+M=k?@`o`+z6XN&@=fm8z3RGF86(AdG2BLLaefXrmv$$2<4QYrSBf zawVH<8ha_+5uUa_1ER+MjM%A2_(m-KuJHnUmYW#zEY>C;v0R`+BN_0a;jB6E#JeZ; zS7l?XsCL*L`d$OpB-iOzSni<@6pPkI3z_Fbue$w-bFx*keVk3nZCprx^=9N&lmkiQ z=XKd|JU-1YmhhBeGb0ZgO+0wZ!!g;6pOC+YK7}X9edcorjG-Tl!3edAB06|(BqFi2 zc@p+4{TYp}bS)Up`FxVfdH8Il5#oNTqSdcQZY+eAlT+Z@^S@#6W`i z2=8^(AM~i#OW}|55!Kg_sP~eEA8Fh6;jg?Qc&&{%!b;u+cYSN)VhP4(Sxz~Vx2RX zFU!K<{e)w|!QG|gW@E;^sO>YyPqyWJ?|;Ld6q8T#{xZ4CNWpqlL@2miLvh!ul&A1# zMBfIsXMlG>2j4~#)KnUD?iiPb+X1AwTvZ9S)r^xCMlqSjU&2Q}t~Z=3gA7GwBSF)S z<5&=%c~*{L4#bL8E;#(gWmu>LkKu*e?GMr2Y~c}c35r$9v|6TnjMNGC!=JW} zrdF7I!*)xBpW@js5MSA|06w zeRNBARGjU*>MeO~qD$7&h77@IcVrOcysS_vR28GOxR<}%DKl+6wS z-)E^zzBMTTCqJUF*^QJQL!B6Kcl4SMguc@Ji|(9TkFxFJl+Ri|3+|FI_S@;b6h*eF zPRty9nQ}w-?&X|}{0aMjDYV#dumIXH@4_ktuWH!C#c;0a-sPfu8%3v77S=jiEc`Uj 
zgW{Tc9uo{vj*e&lpw+O$*Ak;~*+wYjz#d0~;`R~CtN0xmU&yZ_tdii0?Vb^dA@^EI zvlRsk!CiEXsPt^zBX=mdJ9s^bg1s+I@KBi>@|~A6RwG>`|7OO%@4EqW`!O$Sr3!DX zvF3N9CHd8VJL*c?7D$b#!(8+(TyUA1rzP1kVH19mhT??mK-i>z9D{F9L7=?Ob`hz^ zl4U{PgJY|Y!Mkg|w{v!+6Udg9Wb5`cOWtdk;W^A})KsTq!v)?#I^tO3?1X6h0~_qx z>7&?;ba;BYY5Ih1y36M_&Y|?b2h6DF8c42du@*23`1g{O1MH@E#O~qm(sc_z945@u z#&eS+C=>5PVtHm3RA5CqYG!>KK4&n%i2@^<-_Z;H`oT-*4>mTMUpE1xwwAoLA)#7_ z9!8sT8lC3%I&~lEemfC6@>(R{PAi6SWZ=0uCEX= z!0edbV(JDi+pu_Hmbv4Md^5k}mW+<>YY(sZ*DLC1EiY;{BXRIuTtfL=_%;PQ{JuUj z7d2PIEPW~M3}I*&lW8WLWG5lofRoZK=%d$7JCAMim6kv1CU=<>EGccNFCdFx>!S+QAFwY?_~A1%$|elXa-&!Eik3IPYVu#1yEx%` z1xJJDq}w&A^&Xu8nhWR2h7(A*K3t|P((#Rz54C`2eHv=s`*L(yrc@8m9XyZtT;AWJ zV6jfr7mNFoqJu%cHnkSHb*nGnWyu{5mfNKhQ68OYR9%i?@^)eIl<&y{f)K4cC{J{le&Q{>b^gN4=@u@k91$piL;*5&y5=(bsMkY zIex}MTiCuxe+4f3BYQn5sqIe2ZZGB9{|KoS%W>?3!Sfkop(~`4j5zc!`-UdLb|S`r z>Yq6gz9W*_h3fabFALZ;Uowpi!XW-k60|?V?#m2_*x+(b4T=*6+Y5ApCyR4X==pZD z_O^LzQ_)#*kMe{RZ~hsQ?mk8^P-##n#6YHzK|&pKYRV|*C4%#>Ed@F@Ha^CU6?uef z{_!T>;d%CVMTDC+jG@~>BftKhogQk3@#O7YHg_C1lm07?b>_Ga>zA+;2^OvDDxAy* z9Mx!07!UnZzQ>AmlC&T$eae9S(&H2aHT2e+|Ds%put|^3jz5(DJdF4CNL!R5A#CU-$PEd<;s zBk0BX8QH8eQ8Pk37Mr_;RR3SQ)eI_`p)-%Z&vDViAhmvwiT2X2A6TB=#KNX1{2e?M z7E$ZNah{(oEXVyP>v=8vZzqVfu=gdrik7(5|+jYN-qhC^PJW{Z>Ul==hRiTyE82V6}>G3P%??YV4zr^JAe?C2$4Inqi84d*2Z)LhGB%pgWMahXwV+Mp}IgE9^lxh1Oa9N7H|+ zWK>@_U$utfZ{>}?gFscpg*@Mahkx56TtwqX+!Vn6N8~pzcW(1R-8GiMML*pYRU9*~ zBKM~G>!A0Hi>Esv(m?KT3Hzp{vvjzP1>;iGr|r0eUzqh2#|pI$SKJOnqx8m|_{MXg z{ZXuf;kCHlPe#xL_IyrR{NHxP@evc9%VZnm$wScY;~h-A6!U+J7NW}wxj`=t;tlx@fW5uuztt3j0BbGm6L6A28{FLs0>8P z_E;-DGfYOfm0BNG7okm*lz7bC+TrF;U~w#E7m0oN=gfi`7WxHD(5L91yZsFE3u>j; z{kvgam1PnOh7r2X{Ja8V}UX)y(gUS?K zhV?s(O#bF44V#x}Mi$NaK|ZGb=??=C=x+(Gj+NMc{^$+oQO0?BrXp2T^v$MM@N9ma z{QAIYQB)vI@O!R6ULbTa$Y^_TiB6WULU%I0ep5WbJEr2bVISarF!G6h&l=Wex?^Y` zDoRge&w&|^`74IRzLRh^gT3qwIiIqKS28 ziRF!XZ9_5uBVs1z1j>uVKz%Z7MVagUIv zs|)1Uc?Il{5Q<|X!579VJOw6)69nmgELcBhHJxu^tZK9wc}Q&9fk0 zbBo1hWAEX{j@HBcN6F`F^Sg&lA{#Ci(>+6pF%TAZobW?zjd<1(t>_O903-Z~QxQa` 
zyqQ!HXiTjVul#4#(eOC>@fmrJ^j(5oFQ!OJEEY|QHhD5C`cPXf5WMu(OpMAvl&wiB zg?-P)lh%+Bs}39udTPfL)&*%>{l9z9>TYW-dJFzuW zBp`g`oM%iB%PHl&k`M2!LtkfC1pA8B%ZSEKP|Uxvh<)+kngDyf-rZWH$s4_FPUMZ zUpUj1`}#yleU8Ef66!G}9`$pBb}CZaYFuHuq+!LJDHOwSR(_ne!s=Gu98((S8qO2< zNu=l5Q>&e^@5vyZ*j#j$kcXa8=@pAXz%eJDqR_D4cbrlyfir8rB<$RgA2q}!F(j|TOnziGg{C==uRaED8H$F zQglR?g#9@$5B=VU>jqVpM5c-iLWu9Br$c`&WMllSqt3IRf=gx!JOkSbvsp!9os4xM zMEQtJQiX9LcFv`eVkfJGX|n#XPWdfd%Fiyw&Pq-3MsKSWKMz)lD<0cRCYYqM--!-{ zSX&uaVEcgytR*~^Tu_-4oGY&%JC*>Ib%1YC%sZtEtCRF5c`padAqX$WPODT>`y}tR z5Srsx{~J|8zG5B4)q>*E=?y;S0A_{L#R7)K(8*wRS zoai#wGcC%_Ph7D5Nhn4h6%-7ZGR9Ug&dB;~vKNoc)j+Fo<*6{K-zEPr!>UA${SND5 ztp4i{J=;%}DQ*=bDJlKfa=4p+-MgHC5F$&kuh;4$eTV?h)17Ja&B2%x4N^tAJlQcK zJ`Vy(BDp}0L4v?Y|GG)Rs0?!p%qW1|1iGX$BB4ZOnq7Ug#Pntc-y?@^I`lCq!H&1^ z!-MmNK#_4o_B=(!NChTMbfO%1Ai9LcA1^YPTnNCGObR^Ph(bIHr_z8O2TAX@yzT|4 zskS%T{n{CZAp*{TNz=_E3cF;RZ|SzRLwm&Gje48Mq6F-LsXcJrNfO0QNs>@W%4Iv9$gi7KawOS`N=VCf*zE`@LQ%O}Ny&AU z5N30WB<6m%88OXeWB2?2Yk$wjo2Q#so2Qt&8`2Sh zk)u_9m*3=dzO*Z3?}}%C)g~WiH$=;8d5@L{4$PvzGtJaT?jA>v2Ba?@Mct#D3Q*o) z2u8%TK&uHK6cq3b1@{d8{o2Aa_et>^v^4co&I=$t=-IAZ*DeT0>;QT5Vj-rA2Lbm5 zJe(LgcEr>)C}X02zKYs#4!vOKfDwR(p(D!Mj*hvAh2}9CJZh=*59y;Fome(=%=u+j zaOe_=EV$?;pg@HkBtZ{%UNbr3*Hi3|kq3tk9TG42G(+`nRGhC)N%XTmmLHyvl@&9+ zz4MJ0wQGS8ZRP5~`BPYw0 zyt@SPWNN?CJBVkM-sFF&V@1dUgg^IRhzL?-maIIp!DEOJa7!uw}Ja@+M z7GX?GFb`o~`%JCiKY8@%TOgtO`i^r!k5>lP9k3hT!{)zr};7Bb4It zekR^r>b^|acJ7Q|SeHjSAfQ+b|GR*f^Bq1?IGM4f85x8H^;}agtj2=+v~s~3wd5_$ z-=cA*r3G^bv9*uu2St1iI^_mMKbhYzpd4RG8dg@FDIQi1a?^*EM_)$|Bd*pK4kJbb9u6aFX9flP zQB|Yqai4`$2kEC|Px?1aznh4vsToO=GS9Awnr=e@GnpMsbb(^UtnIA=rLh}GTGeSe z+vOpU0Nj2)1sr}=ri4_!5>a-O%0(3KhRw_PujOy2l;4};zbapNB_=A(h8!O&OYUM@ zly2tkiOP~~hB`|3iEri#VwH`Sw4_jDo3sRGxmgM&g`Qea9$Y#7QF-zzaD3j)&9B?N z8~%Y9Ia6PRBfs;sq^~)47h#2Ky2S|4)!d2;Z4QJYvW=v#2*k3e%C`h!V6N54h`V2P znsh0Wd`=5$NSz{HX5z@@KZbV$Wymst3cDapg@p)RGT$biQ}GtMDaql6^5fqjJ>0fO zux(z3u8j(x!ZP*Pl-e!Em|Nn$CM(agROM-!}TV4Las)S&Cs8O97+`@X~2=7VPfP&NQxXwfb0mUbtEJ$ 
z?ggfJ^DWq;7D?LJqwjX=V2{$|Uc8a5a7W%qzvy_6Z{L4Ij)e0S(9PJRi~0v7cWV#2 z@J6nQ&^^8(6JGh$S&K{{AB8Q-&VY>x%Ip+-GX?Tvy{Q6Ov_ck9-p>5Cj8F8<5>ak3 zE-m9bq8F!>FXv5Klx1)4Jyv=yioYt$>RFHTJPY~T9IEeST1MS-z@FocZ@ct2Z~Whq z=((D*>CJk?KqL3|h(M{L;K@)GFZeKgOo4N54M4{+fH7P7m*g&Zr={esawTJ~!eNO# zS0VrInomvc{tiTP-ict~Q2PjCx7XH@U@-6d7-E-9+>8yMr5Q;UN^$@4Mn+~qJ-*ED zfAXnaNW@lb_+E;!WP#Lonm6Kxdp}n(U-m1XT8`^*B6uP8Jc87P%XUi^Y*C)Pkz69M zmo*iObsaC?PciEd5t2*i&RI&AY~bIw(#VRP46HtCD}>bS+2k8awZwQ(92sK|12^Z{ z(u`yaKiuQ!p5lN07o!k3tebM8IxtK$C&YfWSXhhvGUI$gM=hNlWD!E$mc&N&+N^R*?@IFBk)o9}lE;bB~F|$GgT16Em-^IPO=r9D{^Qb#dhC zbBUTjZ%Yq1?GXGhB2Bzn+k^(?{)HL7S;+YnGQR;)`_q!)S?$H0ud*>dIV+}Fxfe>p z#?ZvctU)B&+O|8hA2{NYTi8Ll6qM-+`vzD0Z`egSrp z-DN6i@!h%=J7xJ&7dypv*TM>P9&g49+%T@Z5qGv%k1rxo=tRJJp%Wp}oi07T^e#Z^a50z4l8M>^=eK88u_l<4ZC3;EjNzF7ZYXGx>8B6oq+@FTP|vpQ@N^ z%BSLxem%a*0)J+#WpTwd(Zlz&Che=0I=K80Q-+OqvRE8#zZ3z5UWod645-f~D~4cQ zSZuYIBAx?>-qV5bCPBCf&2v&W0bwm&(br2Hi!Pg?>EvBj zhHto`ev`DtfAQ(^nLE#or(bk0XFYLm`VW&2YkHp#WFZ(xfEjMyvJ@sl9zzR&`vtWDn+qlRu&EiS zBO#{hT0XV0`$7-%`ip}=XZZXeyD<)XQ*L;ToytdZ z8m<6KDX?^4l(>Jwr3DjH`PrBKD@y4FtRP@JANNwer>HY<1eO}*U@AS`+s*n!*mEB3 zDo86pLZ8a__;r{ZhM#Une2Xy7g_}>m2sQC&H4{JP9)`EifPou__V1GHv57jxJH2$p z3kdyh-f6=TN8ahYfvddJ3Ub|C8mWRZmzFcDgH5HvHe*wjS*DVLtrB20GCXN6ZDr+X zkICB+NRP=(^W7ekTit%ZRAH76`%c0;5}t!s5+SBRUx8_1BXHYH7LaClM?$}M=S}&$ zfA5*i+wdzbKH*&H2K(l1Dy5q}C`C(DUVE3dYLr(Phdu^F4E2vy#c+o8%Fjb%5l`Z9DYP=8V)i(hi2%-a$%gOt7`5;JyJM@hwu!}vWL zh@w{yIl3zr+Z1OS$D8Q;^vz|9y=}48AfhG7^D8k{ShW27zOQ*ahY@BKmy+VY7t`u# z7Ka~@on~~!U;}LO4DaV|#e8@AdP+)E%n`=86M=)Rs=EFs-h)iZYyVHMa$lScG zS}M0MYp|^$6zI^8J4ZyEIn8iOX_vtgkKr#y#MCLZXnP9BPxfBbEjJ;Bc=VW<&jP0> z@VYTDRlS=_J8Jt6?{u~-0hrn>Bn9U84oC{h?(UKl1l~206f7?7lN1<-Yhim2Md(FCZu)P5mTd=+J=D@;^@~zlj z&r*shX=3nbQ{B?&UnF6)f0IGGb{yzz-G0nC6}*9f^Cxn|OtTFLiPWAEaBl+m*rtMF z+n9drA#;by_Y2Hl&dtT5u(ME4pG-~(ahn#7_^gd;75urL$<$j1NyS3mL@3ubHOE_T zgNHTgGqftsZa{u;&`*iZRm zk0BmwZGi=vVLoi|Jse2E??CQW&!tt~c@M<900<+b$0T6ra*v4_Lmdbs27^qe)R9O{=A6zc71BH6iN-bl_(L(#o>X)*-s+NGlbONO9OFy 
zfl44aESU8X%xZjn`ycFCFu4KUAZ;Ipwc-v7Tt5&V$hR&OdP8-jMZ?g`e6~6kDtuvd zH0k{W-Q5r(>k4SUsIKXr8L?|fA0cj^(`XZlzw_Rs$YKls6IW-VX5yDtUYtY*Vgf!A ze~1_pCeF_0|0~2vID4BxF2Wc6)CT1y8IW@_YRhVxlm}LGS4M3G0(~Gi8V2%a7|J4C zJBr90@pehaTvShXlP*KmmAn@m9}a;5ZN@(Y$FCy%rxHs5DAy?0Db!_KRgTD3ZBjWP zTjeeyau5vWqzZ!x(4ACa=mIE>_!#AS${#A5D67_}Y@`5`1`0bFN@T4PCR(MTa$dB` zTqRJn%0R_Sw8}vR%z;kg8;D(iF0)QKdNqz->@z;;w2`Y32e5|3QDpuQ6N$ijHqk|9?&D075Ku z1AYtqPnGTPRez{#g##2@Wu_umjli1CcGFJi0x#(VK& zjPQt)t3*3EHj~H(dMsrFUR$wEW-jvNoKk7!tlF#cjNqHdxk}lj3Zo3bZluRh)R1dc)RC*yR6s}o`?pniumU2(MmVdOM6B{%B~s&n zCczERSp1*L&9rqYIQ2NFdws*TDbZ~Wz{?OMgL3jz;G9)7 z6$ob)Sp~;grBL~uvnpC8g#m7cJ_J2bc?bg7O=^BwyA*53aQULX^q_9?-j^Lq57Co` zU(}Z3&fkl2lz%8T$h1WlR1S((-Bz8 z>o$o&BXRc-&23uD?JVj)O!lJ?E4cx$=olw^pYvY61s9`-rcAcokVy<3-c1fM`#qmP zJUOJGu(RSS#mF*wy# zSSFsqWC_JR)AsVO+H4%pwktxJY{0MhO|aja_|Q7J>(8?$zJUwrO>~OJL{~o3U{vSx zRLDaQZi7dAR-20*n|>l)@DkBg^^(w4YoGp6hMKLK+91;N??n1Kb%1;WzwhNFza)3T zU*fu=N$k_&qjL8uS@^bhl--|+{z(*W81Ev~P8*L1$jNEeV?Lg30x1RRKjhbG#yk+}d2XEh3%`kvUK!sG&FUsw1!?XrzeplHL2w`W8iZF*D{W=j zFzGIl5;86LeR}%(wZg*|6+JOO`FWsKhmBxiwWA-ZQ$|8S;9)=TqkXp8-1rE~W z5KDPp10fMOyUVyk%f=@&WPmbn zU6N~-ipxZm7k*TNrlYb}cH)#b>QOqMfcsR3h6g3rvZnDyvOwpNZ&|SNtmjUbooAJF zo^mVO15}Ghbb{3yeyGg6qFZg%L~t)HOao zIcYIkeR~}7frrxh*h+OzQ_}5}8swKN)N|U+%UKDOv%lERt)|JMVc%<| z;31va9%{>1G3)A?`j5keCMlbt)_o`u=kjLAz)TZ$q6367?LZ1$OJm}PdnC|9NX<#% zq1pC8GYHDCB+stD_p298-v(vg0TP}av_-AHs?txj&Ck6^AUi)wlg8Oq&rs?L_%yGV zf&ugTiDxefOG5{8$2I2SrLlH4f_wRG=L5)f+!Df|sIH`s8t^iC={AoZEvmglFS_zg?fSsKlT+!pz>x7BtVr3;oz%*WMQR1EA3v}(2p zz0X((7p{>oWkHrFDeWyoiITu&iU#);m}=w)%?;j#Bfh=~`H~hvJAvVLpB`(QL?R^y zF-tkqOKe6?-M!fu>6&G)-Fe*M^RFlTzqgYJ(YTJ~0L$f_Z=r$s0euq?^Lb%LrG-OH zhI9J+^ac_<#z6p5QsQ)Q36bH=QG%ZzriAyAyo2yO)=Nd+nSar+->0U?zg$eGSn1pA>zL*YSK!7Zw*4u0hz8P&soX|2d34OSH9=%g3Ew06lGnyWU> zk>~aMoMoA(Ct5bZ*vY~9_74qa>xWL&nGHKMeYxw1BsV*#mqyMr zayAaz#nCC=%;;Nm&+nh6HL7{SVCBkv`tr>w`oEHq#9X7f4l=9LuaY}k{iV9p+Ja^~ zXwN%ZKvRE7prt+$YRK$QKyy%VL7*jfqQiFOFq`z#l>}b^H7kA_wW$mh{#z9)*!IQ& 
zSgh|I+l0-e(`a*3A49oauV`yn=PPCt)6p^2Wi!=6oij^#xZooDZ<)bk{Pd#$3x#VrN6ceU-KP5R zi-vFI$7Orjo3p7a(4+kcHi8WSUQzfr=bDI}qMA(Hx5NniTMD#P_;FnsbNKK}+DFUp zn2q1xJVQ*9f=Mv=D{cB>S$n{JXc)&}@jBsts1bNsOPp-+QycO#Q%Z>8(w2Yciy`+z zjV&87LA<@tel=;&PgluP_S{)H?q{0hsqgl`^-kh7SF~axmSFg-!amaEg@q8DZR-?e zHmUWQZsrrsEa45B?QA#IeXe@D@;Kh0qrDEBNJ(T={9>vf6aJ<~=WCLk*BaC5$;Lct zrVhEi?w|Ei_ zuX1hFWM-7j9IOKxsniWuo+yOgAx0A8Mcn`F&sdQA0)K*6xbo~6`*lJiW*WvffLyjc zGS=HZeJ|`{7CU%pq{|mLP_Vp$jgfH)JnqiO1bvC2VvB(F6dI8k71r)^2c9CWvPwO+ zzymwkV=Rxo`35GrV97gjG}>dHs@*?oLaSab)Deit;Z=JzMX9@*P97jnAmO6iFj5WG9pT5=FLpyge@D)f2X4E-B?*ydUi+~LkYl@{l#w& zs5!%_Cq`CJ4d0U2es2)xp_UD%iXH0{L~hBKxKIqfZvQ0Adci&gVR>QjK?Tms`mxY> zWi7wm=12?7Zkg0&MV~JR{z6DCywFt}?!=&G4+oqxJjKPIpQCKxU)939uAJ=rh9bow zI{oMw5!p|b3^;45?kitYkex>KJ8wAa5X!J?9Wcb(-JvDoj!^l#DO1h;$YoAq!>MxK*wu?y)Af+)J0@tqy! zC!w9L*d)WV5mZ~0yn$;gGp4hr#!+Q&=-{F@hv7=Cx&V%kn&2Dk%h72#QI_)mg?)vCE_HbUr#dxP=lBjJ0W+Vlo*T)YClw=^AXuR}?x zVOqht-B=6Uxn{+hH?)6N2pu1&!mMoRC$zaFaCfr7T=iqgxX;8-%kiAVdpJ-qxB(&# zv|;AiX**aGPT?A`S%SiD{+{JjXm}dxK4DtCC1|BCUAkXj&F(lpM{XBETZqta_~CtG zFE={+3o&ZrQ;?}6B$i8UW|IwE4BJF;shQ;j1o<>ROBhDnx-dZ$-hQiFtYNg5g0W7BjE(ZJU9I6 z?JdHfDXCswl9Kr=#Xgs5!y6!NWB$i{c`?{9LXj_DYu+ADGeM>%%FaSotbe##DsfGo z&t>jftdJD$id|{-n#H`rn*;&*oHXj^{5LRqhlgBk#TVI)H(I%fQH!KE!%b*hbYZob z0P+{86*8aH2sUp|`D4Xr9JWSW-YELj0>aTZ{S-fZYBOEe+QCkRhyWaFP>!?1dp%hXIBV}`CCDHb}wSBhzi{U+4Oi5S8%?<}OT1HzgyWONolC?Szn z`%+$a1o5VY;=gfE?Ee$4mvh)~lFc$#U$M*%Ab}1Tc#GkWDrcWj`_mNmqs>aHqGIaP z3xRxDiMPGjDlByDrgw1Ll)`*6)EJAGDcfkbid+*U>oEAdCitOYSDERO)HMTY{EN69 z`>^Rj$A!`OUh;N!Lf^YTZXKtUAYb>JpYck1eSrVMul}%oiRG=2Wy>!&B^=uNZ|t%D zd#18|onR2jChK?^?;SG(TlLYg#RZJkz;H8*v$9oJ6xIU z^wdAkVavka;R_T0m?nBtE^ju|-Ws4KiP--?yd3?3_BJr3)lHM0II76iHs0sfmwICK zg?ew@xo&Xo+G5YhF-nNl7Z=X4=NkU*N8e*3Tgx0bTk7!OZPq^o?5qpWm^P{sV!to> zQ)U+ivvF#m0W{-Be%Z(7`FrWT!U3dTBJvoL*~!}7lsmB5iPgN^cLOQoUj(! 
z1y16>vg01`Hs!_Gn#2e*diymD+ooEFOD49Q@F|}+85gjx?bo_#*&?7bKermnuH^y$ zCmU9p9C_bg<3DdJJ>#)wcUS1 zDkN#pq+#>}0X(8lDQ+FXp%L;2QQe?zGQ_d%L+N5q+hu&klT5n@WkljqbH1dz{en78l?pIy2p z9Oj`+wmkey!rQ^I)TFIO1Fy+FTA^g_*<>4P?2-+j4No-S?-1|2CkKU!Dz~hcLrPbS@ycf?Fy)K z7=2199;c%vHVdcTaF-4Qbt|a;*w*O`7HH_0o6P{dcyP~VA_RP6=fG0fLa`#gd)I}j z^eR_qXx@7Zwe6rqSj}LcXdsytkrDZN`Mt93k@NIGB=IdP^kvZHdKW!Ku~%-}sA8j} zgknqhOlC!%U{)NWhF)_6=b~)EAGEK4+YG&Iq6M{G76(CL5PwuAngD$q zL(A*MBVCDe?-WJYkUT$`eoAXmxSR8U6TdsCCpO8{i@glAh^@UYyZFf|039E>i9z1Y*Fz4+-%e1bz4G*M7MRPYig~u-!}`ODrc1fkrn1n=U{8P-&s?JeK^>BFe(cIIlsjj{Sl5Aq8pvF$kg- zS0@uAEeCxk5TV#R&|17&mzzUj68fX0P|y#`CHN27rwjGx|0nEdKp9O(v~xUbmeOqV zB=sfKsN`Rm9^f#&slCRO&$N{mHpmHIc=gArR6Qw2qt$t6By76w(lpt7uGn_|%bfgF zO&4lFZmW2&jF#@^vP-qnBb5V@0W_Gzm=2;b<79wR`;-k>%5X6Kj>9h*pac!T0c8;YZol;TFzKc&ky$^ zvOj|0X5%2VhE%?pE)#Sv_eKF1)oXT`vKReC68KU-JvoI{AQ)VMORbnGlEUR8i>v;2 zl>xM&bM;{)H*lZjqVrZgZl$t{;qNy|6RmoDVhtZ2&7PT=qv}iSl`LlI!%va_$L&aQk!#jz#Jf*Ypj;&f$MV-eVPWJ)}cY zuLK5R5RsyJ)!!aCWMMYU!a+K6p>RHF`G^o4qR5`!;>#mWU;mM!WDr{YmY`!v%L8Fr z-vacOikAJj zos}QvN9K@cYpxGkQB5nrxw^Jw7QWYKNp}`kIhcX6rHv&;Vq}MyuOjbdFvU?PL0m`y zzcYLm;?25FOY4-TCd1x+MEt59^6g7z72z_3_92QrdeX{S=hRL0M?kNq?=GpQjK%+u z)9sm^?B6R8edWPisq+2Dlw!y%>Uj+jdAQYWJ~Q$|L;V&gzmDENkpnJCLKoKHj=-)b z#|TQOv*fq(EaCbYf7sapx(s(@KAZVTf|R^{E6Db<&z>%(Wq*W&$La*;esS|i5Zp>YSmeRysT{)aHCf!t*lLO2xGi_Yw# z?m20a8$T=mRx130p+rI!j|!07ok!##Ogc;W`6D4?e&`x;R#DDHv{A1yqcM9FL7ExL zfJ^c^e&V3gx(L%S?ofL?UyN=v?3qCG=Lf6LZZ;c7{*8C0l4%wb*Lq@V<<3E`4SQnV z)M*%YQ#|SLk39~r#8Nw{SRbtxFx{AJ=oRHySG(y^ Date: Sat, 15 May 2021 17:12:39 -0500 Subject: [PATCH 25/35] Add compression test, adjust test to fit 32-bit OS, and Mac condition in setup.py --- pandas/tests/io/test_rdata.py | 17 ++++++++++++----- setup.py | 4 +--- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/test_rdata.py b/pandas/tests/io/test_rdata.py index 342719ea5229f..3f99c2e2f2983 100644 --- a/pandas/tests/io/test_rdata.py +++ b/pandas/tests/io/test_rdata.py @@ -508,10 +508,10 @@ def 
test_write_index_false(rtype): assert "index" not in r_df.columns -# COMPRESS +# COMPRESSION -def test_write_compress_all(rtype, comp): +def test_write_all_compression(rtype, comp): with tm.ensure_clean("test.out") as path: ghg_df.to_rdata(path, file_format=rtype, compression=comp, index=False) r_dfs = read_rdata(path, file_format=rtype, compression=comp, rownames=False) @@ -522,10 +522,17 @@ def test_write_compress_all(rtype, comp): tm.assert_frame_equal(output, expected) -def test_write_compress_zip(rtype): +def test_write_zip_compression(rtype): with tm.ensure_clean("test.out") as path: with pytest.raises(ValueError, match=("not a supported value for compression")): - ghg_df.to_rdata(path, file_format=rtype, index=False, compression="zip") + ghg_df.to_rdata(path, file_format=rtype, compression="zip") + + +def test_write_read_mismatched_compression(rtype): + with tm.ensure_clean("test.out") as path: + with pytest.raises(gzip.BadGzipFile, match=("Not a gzipped file")): + ghg_df.to_rdata(path, file_format=rtype, compression=None) + read_rdata(path, file_format=rtype) # RDA_NAMES @@ -592,7 +599,7 @@ def test_write_read_dtypes(rtype, comp): ), "interval": interval_range(start=10, periods=6, freq=10 * 2), "bool": [False, True, True, True, False, False], - "int": [2 ** 31 - 1, 1, -(2 ** 31) + 1, -1, 0, 10 ** 9], + "int": [2 ** 20 - 1, 1, -(2 ** 20) + 1, -1, 0, 10 ** 9], "float": [0, np.pi, float("nan"), np.e, np.euler_gamma, 0], "string": array( ["acidification", "change", "loss", "use", "depletion", "aersols"], diff --git a/setup.py b/setup.py index ca3311359a684..2e7daa46313fe 100755 --- a/setup.py +++ b/setup.py @@ -371,6 +371,7 @@ def run(self): extra_compile_args.append("-Wno-error=unreachable-code") # rdata requires system iconv library + os.environ["DYLD_LIBRARY_PATH"] = "" rdata_includes = ["/usr/include"] rdata_libs_dir = ["/usr/lib"] rdata_libs = ["iconv"] @@ -664,9 +665,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): if is_platform_windows(): 
rdata_srcs.append("pandas/_libs/src/librdata/win_iconv.c") -if is_platform_mac(): - os.environ["DYLD_LIBRARY_PATH"] = "" - rdata_ext = Extension( name="pandas.io.rdata._rdata", sources=rdata_srcs, From 01c0807a60dda6e9e7820e7a1bb114533fe7dc20 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sat, 15 May 2021 18:20:41 -0500 Subject: [PATCH 26/35] Add gzip skip in new test for < PY 3.8 --- pandas/tests/io/test_rdata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/io/test_rdata.py b/pandas/tests/io/test_rdata.py index 3f99c2e2f2983..6b0ffdb070b44 100644 --- a/pandas/tests/io/test_rdata.py +++ b/pandas/tests/io/test_rdata.py @@ -9,6 +9,7 @@ import pytest from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime +from pandas.compat import PY38 import pandas.util._test_decorators as td from pandas import ( @@ -528,6 +529,10 @@ def test_write_zip_compression(rtype): ghg_df.to_rdata(path, file_format=rtype, compression="zip") +@pytest.mark.skipif( + not PY38, + reason=("gzip.BadGzipFile exception added in 3.8"), +) def test_write_read_mismatched_compression(rtype): with tm.ensure_clean("test.out") as path: with pytest.raises(gzip.BadGzipFile, match=("Not a gzipped file")): From f5f2e99634ddd9f8436e28559a1e5b5510df0e08 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 16 May 2021 18:25:48 -0500 Subject: [PATCH 27/35] Add try/except for encoding, fix S3 read in tests and docs, and reduce integer for 32-bit test --- doc/source/user_guide/io.rst | 12 ++- pandas/io/rdata/_rdata.pyx | 8 +- pandas/tests/io/test_rdata.py | 141 +++++++++++++++++++++++++++++++++- 3 files changed, 149 insertions(+), 12 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index a7df3484f700a..0d95190e2f5f1 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -6041,21 +6041,19 @@ To read from URL, pass link directly into method: airlines = pd.read_rdata(url, file_format="rda") airlines -To read from an 
Amazon S3 bucket, point to the storage path. This also raises -another issue. Any R data encoded in non utf-8 is currently not supported: +To read from an Amazon S3 bucket, point to the storage path: -.. code-block:: ipython +.. ipython:: python - In [608]: ghcran = pd.read_rdata("s3://public-r-data/ghcran.Rdata") - ... - UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 45: invalid continuation byte + ghcran = pd.read_rdata("s3://public-r-data/ghcran.Rdata", compression=None) + ghcran Also, remember if R data files do not contain any data frame object, a parsing error will occur: .. code-block:: ipython - In [608]: rds_file = pd.read_rdata("env_data_non_dfs.rda") + In [610]: rds_file = pd.read_rdata("env_data_non_dfs.rda") ... LibrdataReaderError: Invalid file, or file has unsupported features diff --git a/pandas/io/rdata/_rdata.pyx b/pandas/io/rdata/_rdata.pyx index e9008dcd5c78d..33b5b41afc6d9 100644 --- a/pandas/io/rdata/_rdata.pyx +++ b/pandas/io/rdata/_rdata.pyx @@ -108,7 +108,13 @@ cdef int handle_text_value(const char *value, int index, void *ctx) except *: """ lbr = ctx - lbr.rtext[index] = value if value != NULL else None + if value != NULL: + try: + lbr.rtext[index] = value + except UnicodeDecodeError: + lbr.rtext[index] = None + else: + lbr.rtext[index] = None if index == (lbr.rows - 1): lbr.rvalues[lbr.tblname]["dtypes"][lbr.colidx] = "str" diff --git a/pandas/tests/io/test_rdata.py b/pandas/tests/io/test_rdata.py index 6b0ffdb070b44..9bed00517017e 100644 --- a/pandas/tests/io/test_rdata.py +++ b/pandas/tests/io/test_rdata.py @@ -309,13 +309,146 @@ def test_read_wrong_url(): # S3 +@pytest.mark.slow @tm.network @td.skip_if_no("s3fs") def test_read_rda_s3(): # Public Data of CRAN Packages on GitHub - s3 = "s3://public-r-data/ghcran.Rdata" - with pytest.raises(SystemError, match=("returned a result with an error set")): - read_rdata(s3, compression=None) + rda_s3 = "s3://public-r-data/ghcran.Rdata" + r_df = read_rdata(rda_s3, 
compression=None, rownames=False) + + # below needed to pass codespell on keyword + r_df["ghcran"].columns.values[107] = "Repository" + + # test structure and not static data since data changes daily + expected_cols = [ + "Package", + "Type", + "Title", + "Version", + "Date", + "Author", + "Maintainer", + "Description", + "License", + "Depends", + "Suggests", + "NeedsCompilation", + "Packaged", + "Repository", + "Date/Publication", + "Contact", + "Imports", + "VignetteBuilder", + "Encoding", + "SystemRequirements", + "RoxygenNote", + "LazyLoad", + "URL", + "Authors@R", + "Classification/ACM", + "Classification/JEL", + "LinkingTo", + "BugReports", + "LazyData", + "Keywords", + "Repository/R-Forge/Project", + "Repository/R-Forge/Revision", + "Repository/R-Forge/DateTimeStamp", + "biocViews", + "Collate", + "Copyright", + "ByteCompile", + "ZipData", + "BuildVignettes", + "Additional_repositories", + "Acknowledgements", + "MailingList", + "Enhances", + "Classification/MSC", + "OS_type", + "BuildManual", + "BuildResaveData", + "References", + "Note", + "X-CRAN-Original-Maintainer", + "RcppModules", + "Data", + "BioViews", + "lazy-loading", + "URLNote", + "Reference", + "KeepSource", + "LazyDataCompression", + "Language", + "Requires", + "Dependencies", + "X-CRAN-Comment", + "Citation", + "Biarch", + "Published", + "RequiredLauncherGeneration", + "SuggestsNote", + "Priority", + "Acknowledgments", + "Revision", + "License_is_FOSS", + "License_restricts_use", + "Archs", + "LazyDataNote", + "Affiliations", + "LicenseDetails", + "SCM", + "Classification/ACM-2012", + "X-CRAN-Original-Package", + "Dialect", + "Limitations", + "Check", + "Recommends", + "LastChangedDate", + "LastChangedRevision", + "SVNRevision", + "X-CRAN-Original-OS_type", + "RcmdrModels", + "Log-Exceptions", + "Models", + "DateNote", + "SystemRequirementsNote", + "Url", + "Reverse depends", + "Lazyload", + "DependsNote", + "VersionSplus", + "MaintainerSplus", + "VersionNote", + "Disclaimer", + "LicenseNote", 
+ "Namespace", + "Address", + "Keyword", + "Contributors", + "NOTE", + "Acknowledgement", + "Repository", + "Lazydata", + "RdMacros", + "HowToCite", + "Publication", + "Reference Manual", + "Special Acknowledgement", + "SysDataCompression", + "DisplayMode", + "Nickname", + "BuildKeepEmpty", + "Twitter", + "Remotes", + "SystemRequirement", + "Github", + ] + + assert isinstance(r_df, dict) + assert isinstance(r_df["ghcran"], DataFrame) + assert r_df["ghcran"].columns.tolist() == expected_cols # TYPE @@ -604,7 +737,7 @@ def test_write_read_dtypes(rtype, comp): ), "interval": interval_range(start=10, periods=6, freq=10 * 2), "bool": [False, True, True, True, False, False], - "int": [2 ** 20 - 1, 1, -(2 ** 20) + 1, -1, 0, 10 ** 9], + "int": [2 ** 10 - 1, 1, -(2 ** 10) + 1, -1, 0, 10 ** 9], "float": [0, np.pi, float("nan"), np.e, np.euler_gamma, 0], "string": array( ["acidification", "change", "loss", "use", "depletion", "aersols"], From 7299ee56cf4f7a63e520aa1e4272cef85e262537 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 16 May 2021 21:42:51 -0500 Subject: [PATCH 28/35] Replace integer for float in timestamps to fit 32-bit limit --- pandas/core/frame.py | 2 +- pandas/tests/io/test_rdata.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6eee0d70b6b2e..29ccb4b508a6d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2338,7 +2338,7 @@ def to_rdata( Raises ------ LibrdataWriterError - * If DataFrame types or values not translatable to R data types. + * If DataFrame types or values do not conform to R data types. 
See Also -------- diff --git a/pandas/tests/io/test_rdata.py b/pandas/tests/io/test_rdata.py index 9bed00517017e..2a4b4da46be4f 100644 --- a/pandas/tests/io/test_rdata.py +++ b/pandas/tests/io/test_rdata.py @@ -720,10 +720,10 @@ def test_write_read_dtypes(rtype, comp): dts = [ Timestamp.min.ceil("S"), - Timestamp(-(10 ** 18)), + Timestamp(-(10.0 ** 18.0)).ceil("S"), Timestamp(0), Timestamp.now().floor("S"), - Timestamp(10 ** 18), + Timestamp(10.0 ** 18.0).floor("S"), Timestamp.max.floor("S"), ] @@ -737,7 +737,7 @@ def test_write_read_dtypes(rtype, comp): ), "interval": interval_range(start=10, periods=6, freq=10 * 2), "bool": [False, True, True, True, False, False], - "int": [2 ** 10 - 1, 1, -(2 ** 10) + 1, -1, 0, 10 ** 9], + "int": [2 ** 20 - 1, 1, -(2 ** 20) + 1, -1, 0, 10 ** 9], "float": [0, np.pi, float("nan"), np.e, np.euler_gamma, 0], "string": array( ["acidification", "change", "loss", "use", "depletion", "aersols"], From 6a35bfacec3407d6a5a486e4fba1fe284db96077 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 16 May 2021 23:49:12 -0500 Subject: [PATCH 29/35] Use C long long for large timevalue to work on 32 and 64-bit --- pandas/io/rdata/_rdata.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/rdata/_rdata.pyx b/pandas/io/rdata/_rdata.pyx index 33b5b41afc6d9..47a9f0318bcb6 100644 --- a/pandas/io/rdata/_rdata.pyx +++ b/pandas/io/rdata/_rdata.pyx @@ -344,6 +344,7 @@ cdef class LibrdataWriter(): dict rdict dict rformats dict rtypes + long long timeval bytes file_name bytes tbl_name rdata_writer_t *writer @@ -367,7 +368,8 @@ cdef class LibrdataWriter(): if vtype == RDATA_TYPE_TIMESTAMP: for k, v in vdata.items(): - rdata_append_timestamp_value(self.writer, v) + self.timeval = v + rdata_append_timestamp_value(self.writer, self.timeval) if vtype == RDATA_TYPE_STRING: for k, v in vdata.items(): From 7b35651d79767417b006720ec4d915068736def5 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 17 May 2021 07:17:07 -0500 
Subject: [PATCH 30/35] Adjust timestamps in test to work on 32 and 64-bit machines --- pandas/io/rdata/_rdata.pyx | 4 +--- pandas/tests/io/test_rdata.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/io/rdata/_rdata.pyx b/pandas/io/rdata/_rdata.pyx index 47a9f0318bcb6..33b5b41afc6d9 100644 --- a/pandas/io/rdata/_rdata.pyx +++ b/pandas/io/rdata/_rdata.pyx @@ -344,7 +344,6 @@ cdef class LibrdataWriter(): dict rdict dict rformats dict rtypes - long long timeval bytes file_name bytes tbl_name rdata_writer_t *writer @@ -368,8 +367,7 @@ cdef class LibrdataWriter(): if vtype == RDATA_TYPE_TIMESTAMP: for k, v in vdata.items(): - self.timeval = v - rdata_append_timestamp_value(self.writer, self.timeval) + rdata_append_timestamp_value(self.writer, v) if vtype == RDATA_TYPE_STRING: for k, v in vdata.items(): diff --git a/pandas/tests/io/test_rdata.py b/pandas/tests/io/test_rdata.py index 2a4b4da46be4f..8de0881c3957b 100644 --- a/pandas/tests/io/test_rdata.py +++ b/pandas/tests/io/test_rdata.py @@ -720,10 +720,10 @@ def test_write_read_dtypes(rtype, comp): dts = [ Timestamp.min.ceil("S"), - Timestamp(-(10.0 ** 18.0)).ceil("S"), + Timestamp("1950-01-01").ceil("S"), Timestamp(0), + Timestamp("2000-01-01").floor("S"), Timestamp.now().floor("S"), - Timestamp(10.0 ** 18.0).floor("S"), Timestamp.max.floor("S"), ] From 0ab02ecae63f25c8fb3b7315e960426011e9dc0f Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 17 May 2021 09:13:07 -0500 Subject: [PATCH 31/35] Add skip for 32-bit in dtypes test --- pandas/tests/io/test_rdata.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_rdata.py b/pandas/tests/io/test_rdata.py index 8de0881c3957b..16674f32d3d7e 100644 --- a/pandas/tests/io/test_rdata.py +++ b/pandas/tests/io/test_rdata.py @@ -9,7 +9,10 @@ import pytest from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime -from pandas.compat import PY38 +from pandas.compat import ( + IS64, + 
PY38, +) import pandas.util._test_decorators as td from pandas import ( @@ -715,14 +718,18 @@ def test_write_read_utc_dateteime(): # DTYPES +@pytest.mark.skipif( + not IS64, + reason=("large dtypes not supported in 32-bit"), +) def test_write_read_dtypes(rtype, comp): rda_name = "pandas_dataframe" if rtype == "rda" else "r_dataframe" dts = [ Timestamp.min.ceil("S"), - Timestamp("1950-01-01").ceil("S"), + Timestamp(-(10 ** 18)), Timestamp(0), - Timestamp("2000-01-01").floor("S"), + Timestamp(10 ** 18), Timestamp.now().floor("S"), Timestamp.max.floor("S"), ] @@ -737,7 +744,7 @@ def test_write_read_dtypes(rtype, comp): ), "interval": interval_range(start=10, periods=6, freq=10 * 2), "bool": [False, True, True, True, False, False], - "int": [2 ** 20 - 1, 1, -(2 ** 20) + 1, -1, 0, 10 ** 9], + "int": [2 ** 31 - 1, 1, -(2 ** 31) + 1, -1, 0, 10 ** 9], "float": [0, np.pi, float("nan"), np.e, np.euler_gamma, 0], "string": array( ["acidification", "change", "loss", "use", "depletion", "aersols"], From a51f8de013157d254ab8c93eedeb8edf82f879ab Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 18 May 2021 23:05:41 -0500 Subject: [PATCH 32/35] Adjust rdata section of user_guide/io.rst docs --- doc/source/user_guide/io.rst | 89 +++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 26 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 0d95190e2f5f1..a5af89af7aec0 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5939,8 +5939,10 @@ and `rda`_ data formats. .. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/save -For example, consider the following generated data.frames in R using environment -data samples from US EPA, UK BGCI, and NOAA pubilc data: +To walk through an example, consider the following generated data.frames in R +using natural environment data samples from US EPA, UK BGCI, and NOAA pubilc data. 
+As shown below, each data.frame is saved individually in .rds types and all together +in an .rda type. .. code-block:: r @@ -5953,7 +5955,15 @@ data samples from US EPA, UK BGCI, and NOAA pubilc data: row.names = c(141:145), stringsAsFactors = FALSE ) + ghg_df + gas year emissions + 141 Carbon dioxide 2018 5424.8815 + 142 Methane 2018 634.4571 + 143 Nitrous oxide 2018 434.5286 + 144 Fluorinated gases 2018 182.7824 + 145 Total 2018 6676.6496 + # SAVE SINGLE OBJECT saveRDS(ghg_df, file="ghg_df.rds") plants_df <- data.frame( @@ -5965,10 +5975,18 @@ data samples from US EPA, UK BGCI, and NOAA pubilc data: row.names = c(16:20), stringsAsFactors = FALSE ) + plants_df + plant_group status count + 16 Pteridophytes Data Deficient 398 + 17 Pteridophytes Extinct 65 + 18 Pteridophytes Not Threatened 1294 + 19 Pteridophytes Possibly Threatened 408 + 20 Pteridophytes Threatened 1275 + # SAVE SINGLE OBJECT saveRDS(plants_df, file="plants_df.rds") - sea_ice_df_new <- data.frame( + sea_ice_df <- data.frame( year = c(2016, 2017, 2018, 2019, 2020), mo = c(12, 12, 12, 12, 12), data.type = c("Goddard", "Goddard", "Goddard", "Goddard", "NRTSI-G"), @@ -5978,12 +5996,22 @@ data samples from US EPA, UK BGCI, and NOAA pubilc data: row.names = c(1012:1016), stringsAsFactors = FALSE ) - + sea_ice_df + year mo data.type region extent area + 1012 2016 12 Goddard S 8.28 5.51 + 1013 2017 12 Goddard S 9.48 6.23 + 1014 2018 12 Goddard S 9.19 5.59 + 1015 2019 12 Goddard S 9.41 6.59 + 1016 2020 12 NRTSI-G S 10.44 6.50 + + # SAVE SINGLE OBJECT saveRDS(sea_ice_df, file="sea_ice_df.rds") + # SAVE MULTIPLE OBJECTS save(ghg_df, plants_df, sea_ice_df, file="env_data_dfs.rda") -With ``read_rdata``, you can read these above .rds or .rda files: +With ``read_rdata``, you can read these above .rds and .rda files, both +generating a dictionary of DataFrame(s): .. ipython:: python :suppress: @@ -5994,8 +6022,8 @@ With ``read_rdata``, you can read these above .rds or .rda files: .. 
ipython:: python rds_file = os.path.join(file_path, "ghg_df.rds") - ghg_df = pd.read_rdata(rds_file)["r_dataframe"].tail() - ghg_df + env_df = pd.read_rdata(rds_file) + {k: df.tail() for k, df in env_df.items()} rda_file = os.path.join(file_path, "env_data_dfs.rda") env_dfs = pd.read_rdata(rda_file) @@ -6010,13 +6038,14 @@ To ignore the rownames of data.frame, use option ``rownames=False``: plants_df -To select specific objects in .rda, pass a list of names into ``select_frames``: +To select specific objects in .rda, pass a list of names into ``select_frames``. +By default, all objects are returned. .. ipython:: python rda_file = os.path.join(file_path, "env_data_dfs.rda") - env_dfs = pd.read_rdata(rda_file, select_frames=["sea_ice_df"]) - env_dfs + sub_env_dfs = pd.read_rdata(rda_file, select_frames=["sea_ice_df"]) + sub_env_dfs To read from a file-like object, read object in argument, ``path_or_buffer``: @@ -6059,7 +6088,7 @@ will occur: Finally, please note R's ``Date`` (without time component) will translate to ``datetime64`` in pandas. Also, R's date/time field type, ``POSIXct``, that can -carry varying timezones will translate to UTC time in pandas. For example, in R, +carry timezones will translate to UTC time in pandas. For example, in R, the following data sample from an .rda shows date/time in 'America/Chicago' local timezone: @@ -6099,6 +6128,9 @@ Below is summary of how ``read_rdata`` handles data types between R and pandas. * - numeric - - float64 + * - Date + - + - datetime64[ns] * - POSIXct - UTC conversion - datetime64[ns] @@ -6123,17 +6155,19 @@ For a single DataFrame in rds type, pass in a file or buffer in method: .. ipython:: python - plants_df.to_rdata("plants_df.rds") + env_dfs["plants_df"].to_rdata("plants_df.rds") For a single DataFrame in RData or rda types, pass in a file or buffer in method and optionally give it a name: .. 
ipython:: python - ghg_df.to_rdata("ghg_df.rda", rda_name="ghg_df") + env_dfs["ghg_df"].to_rdata("ghg_df.rda", rda_name="ghg_df") + +.. note:: -While RData and rda types can hold multiple R objects, this method currently -only supports writing out a single DataFrame. + While RData and rda types can hold multiple R objects, this method currently + only supports writing out a single DataFrame. Even write to a buffer and read its content (and be sure to adjust default ``gzip`` compression to ``compression=None``): @@ -6161,7 +6195,7 @@ will output as a named column or multiple columns for MultiIndex. .. ipython:: python - ghg_df.rename_axis(None).to_rdata("ghg_df.rds") + env_dfs["ghg_df"].rename_axis(None).to_rdata("ghg_df.rds") pd.read_rdata("ghg_df.rds")["r_dataframe"].tail() @@ -6169,7 +6203,7 @@ To ignore the index, use ``index=False``: .. ipython:: python - ghg_df.rename_axis(None).to_rdata("ghg_df.rds", index=False) + env_dfs["ghg_df"].rename_axis(None).to_rdata("ghg_df.rds", index=False) pd.read_rdata("ghg_df.rds")["r_dataframe"].tail() @@ -6179,10 +6213,10 @@ is "gzip" or "gz". Notice size difference of compressed and uncompressed files: .. ipython:: python - plants_df.to_rdata("plants_df_gz.rds") - plants_df.to_rdata("plants_df_bz2.rds", compression="bz2") - plants_df.to_rdata("plants_df_xz.rds", compression="xz") - plants_df.to_rdata("plants_df_non_comp.rds", compression=None) + env_dfs["plants_df"].to_rdata("plants_df_gz.rds") + env_dfs["plants_df"].to_rdata("plants_df_bz2.rds", compression="bz2") + env_dfs["plants_df"].to_rdata("plants_df_xz.rds", compression="xz") + env_dfs["plants_df"].to_rdata("plants_df_non_comp.rds", compression=None) os.stat("plants_df_gz.rds").st_size os.stat("plants_df_bz2.rds").st_size @@ -6193,7 +6227,7 @@ Like other IO methods, ``storage_options`` are enabled to write to those platfor .. 
code-block:: ipython - ghg_df.to_rdata( + env_dfs["ghg_df"].to_rdata( "s3://path/to/my/storage/pandas_df.rda", storage_options={"user": "xxx", "password": "???"} ) @@ -6233,10 +6267,13 @@ Once exported, the single DataFrame can be read or loaded in R: 144 Fluorinated gases 2018 182.7824 145 Total 2018 6676.6496 -Please note R does not support all dtypes of pandas. For special dtypes, -you may have to handle data in either end to fit your specific data needs. +.. note:: + + R does not support all dtypes of pandas. For special dtypes, you may + have to prepare or clean data in either end (R or pandas side) to + meet your specific data needs. -Below is summary of how ``write_rdata`` handles data types between pandas +Below is summary of how ``to_rdata`` handles data types between pandas and R in order to translate pandas simpler dtypes to R's atomic types. .. list-table:: @@ -6249,7 +6286,7 @@ and R in order to translate pandas simpler dtypes to R's atomic types. * - bool - - logical - * - any uint/int + * - any uint or int - - integer * - any float From 67613aaf84dc3c274bba217dce39127906fdc930 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 21 Jun 2021 15:06:30 -0500 Subject: [PATCH 33/35] Adjust setup.py per comments --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index bb0603a05b18c..317ea89fa44c1 100755 --- a/setup.py +++ b/setup.py @@ -371,7 +371,6 @@ def run(self): extra_compile_args.append("-Wno-error=unreachable-code") # rdata requires system iconv library - os.environ["DYLD_LIBRARY_PATH"] = "" rdata_includes = ["/usr/include"] rdata_libs_dir = ["/usr/lib"] rdata_libs = ["iconv"] @@ -582,8 +581,8 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): include = data.get("include", []) include.append(numpy.get_include()) - if name == "io.rdata._rdata" and is_platform_mac(): - # non-conda builds must adjust paths to libiconv .h and lib dirs + if name == "io.rdata._rdata" and is_platform_mac() and 
"CONDA_PREFIX" in os.environ: + # conda builds on mac must adjust paths to libiconv and lib dirs include = [ os.path.join(os.environ["CONDA_PREFIX"], "include"), os.path.join(os.environ["CONDA_PREFIX"], "lib"), From dc56c8288137babf21d1556c80d45f24eb6d1875 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 21 Jun 2021 16:04:52 -0500 Subject: [PATCH 34/35] Remove conda prefix condition for mac in setup.py --- setup.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/setup.py b/setup.py index 317ea89fa44c1..ddd6b0315f2ba 100755 --- a/setup.py +++ b/setup.py @@ -581,13 +581,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): include = data.get("include", []) include.append(numpy.get_include()) - if name == "io.rdata._rdata" and is_platform_mac() and "CONDA_PREFIX" in os.environ: - # conda builds on mac must adjust paths to libiconv and lib dirs - include = [ - os.path.join(os.environ["CONDA_PREFIX"], "include"), - os.path.join(os.environ["CONDA_PREFIX"], "lib"), - ] + include - undef_macros = [] if ( From a56cf384df5a73e79be78d4afa19dc89d1614963 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Thu, 24 Jun 2021 15:26:19 -0500 Subject: [PATCH 35/35] Remove extraneous lines --- setup.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.py b/setup.py index ddd6b0315f2ba..3d682610a2e46 100755 --- a/setup.py +++ b/setup.py @@ -650,8 +650,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): extensions.append(ujson_ext) -# ---------------------------------------------------------------------- - # ---------------------------------------------------------------------- # rdata @@ -685,7 +683,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): # ---------------------------------------------------------------------- - if __name__ == "__main__": # Freeze to support parallel compilation when using spawn instead of fork multiprocessing.freeze_support()