diff --git a/LICENSES/LIBRDATA_LICENSE b/LICENSES/LIBRDATA_LICENSE new file mode 100644 index 0000000000000..4f24e6b9127ff --- /dev/null +++ b/LICENSES/LIBRDATA_LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2013-2020 Evan Miller (except where otherwise noted) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c2b030d732ba9..edcf852749deb 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -31,6 +31,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` binary;`ORC Format `__;:ref:`read_orc`; + binary;`R `__;:ref:`read_rdata`;:ref:`to_rdata` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; binary;`SPSS `__;:ref:`read_spss`; @@ -5927,6 +5928,407 @@ respective functions from ``pandas-gbq``. 
Full documentation can be found `here `__. + +.. _io.rdata: + +R data format +------------- + +.. _io.rdata_reader: + +Reading R data +'''''''''''''' + +.. versionadded:: 1.3.0 + +The top-level function ``read_rdata`` will read the native serialization types +in the R language and environment. For .RData and its synonymous shorthand, .rda, +which can hold multiple R objects, the method will return a ``dict`` of ``DataFrames``. +For .rds types, which contain only a single R object, the method will return a ``dict`` +of a single ``DataFrame``. + +.. note:: + + Since any R object can be saved in these types, this method will only return + data.frame objects or objects coercible to data.frames including matrices, + tibbles, and data.tables. + +For more information on R serialization data types, see docs on `rds`_ +and `rda`_ data formats. + +.. _rds: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/readRDS + +.. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/save + +To walk through an example, consider the following generated data.frames in R +using natural environment data samples from US EPA, UK BGCI, and NOAA public data. +As shown below, each data.frame is saved individually in .rds types and all together +in an .rda type. + +.. 
code-block:: r + + ghg_df <- data.frame( + gas = c("Carbon dioxide", "Methane", "Nitrous oxide", + "Fluorinated gases", "Total"), + year = c(2018, 2018, 2018, 2018, 2018), + emissions = c(5424.88150213288, 634.457127078267, 434.528555376666, + 182.782432461777, 6676.64961704959), + row.names = c(141:145), + stringsAsFactors = FALSE + ) + ghg_df + gas year emissions + 141 Carbon dioxide 2018 5424.8815 + 142 Methane 2018 634.4571 + 143 Nitrous oxide 2018 434.5286 + 144 Fluorinated gases 2018 182.7824 + 145 Total 2018 6676.6496 + + # SAVE SINGLE OBJECT + saveRDS(ghg_df, file="ghg_df.rds") + + plants_df <- data.frame( + plant_group = c("Pteridophytes", "Pteridophytes", "Pteridophytes", + "Pteridophytes", "Pteridophytes"), + status = c("Data Deficient", "Extinct", "Not Threatened", + "Possibly Threatened", "Threatened"), + count = c(398, 65, 1294, 408, 1275), + row.names = c(16:20), + stringsAsFactors = FALSE + ) + plants_df + plant_group status count + 16 Pteridophytes Data Deficient 398 + 17 Pteridophytes Extinct 65 + 18 Pteridophytes Not Threatened 1294 + 19 Pteridophytes Possibly Threatened 408 + 20 Pteridophytes Threatened 1275 + + # SAVE SINGLE OBJECT + saveRDS(plants_df, file="plants_df.rds") + + sea_ice_df <- data.frame( + year = c(2016, 2017, 2018, 2019, 2020), + mo = c(12, 12, 12, 12, 12), + data.type = c("Goddard", "Goddard", "Goddard", "Goddard", "NRTSI-G"), + region = c("S", "S", "S", "S", "S"), + extent = c(8.28, 9.48, 9.19, 9.41, 10.44), + area = c(5.51, 6.23, 5.59, 6.59, 6.5), + row.names = c(1012:1016), + stringsAsFactors = FALSE + ) + sea_ice_df + year mo data.type region extent area + 1012 2016 12 Goddard S 8.28 5.51 + 1013 2017 12 Goddard S 9.48 6.23 + 1014 2018 12 Goddard S 9.19 5.59 + 1015 2019 12 Goddard S 9.41 6.59 + 1016 2020 12 NRTSI-G S 10.44 6.50 + + # SAVE SINGLE OBJECT + saveRDS(sea_ice_df, file="sea_ice_df.rds") + + # SAVE MULTIPLE OBJECTS + save(ghg_df, plants_df, sea_ice_df, file="env_data_dfs.rda") + +With ``read_rdata``, you can read 
these above .rds and .rda files, both +generating a dictionary of DataFrame(s): + +.. ipython:: python + :suppress: + + rel_path = os.path.join("..", "pandas", "tests", "io", "data", "rdata") + file_path = os.path.abspath(rel_path) + +.. ipython:: python + + rds_file = os.path.join(file_path, "ghg_df.rds") + env_df = pd.read_rdata(rds_file) + {k: df.tail() for k, df in env_df.items()} + + rda_file = os.path.join(file_path, "env_data_dfs.rda") + env_dfs = pd.read_rdata(rda_file) + {k: df.tail() for k, df in env_dfs.items()} + +To ignore the rownames of data.frame, use option ``rownames=False``: + +.. ipython:: python + + rds_file = os.path.join(file_path, "plants_df.rds") + plants_df = pd.read_rdata(rds_file, rownames=False)["r_dataframe"].tail() + plants_df + + +To select specific objects in .rda, pass a list of names into ``select_frames``. +By default, all objects are returned. + +.. ipython:: python + + rda_file = os.path.join(file_path, "env_data_dfs.rda") + sub_env_dfs = pd.read_rdata(rda_file, select_frames=["sea_ice_df"]) + sub_env_dfs + +To read from a file-like object, read object in argument, ``path_or_buffer``: + +.. ipython:: python + + rds_file = os.path.join(file_path, "plants_df.rds") + with open(rds_file, "rb") as f: + plants_df = pd.read_rdata( + f, + file_format="rds", + )["r_dataframe"] + + plants_df + +To read from URL, pass link directly into method: + +.. ipython:: python + + url = ("https://github.com/hadley/nycflights13/" + "blob/master/data/airlines.rda?raw=true") + + airlines = pd.read_rdata(url, file_format="rda") + airlines + +To read from an Amazon S3 bucket, point to the storage path: + +.. ipython:: python + + ghcran = pd.read_rdata("s3://public-r-data/ghcran.Rdata", compression=None) + ghcran + +Also, remember if R data files do not contain any data frame object, a parsing error +will occur: + +.. code-block:: ipython + + In [610]: rds_file = pd.read_rdata("env_data_non_dfs.rda") + ... 
+ LibrdataReaderError: Invalid file, or file has unsupported features + +Finally, please note R's ``Date`` (without time component) will translate to +``datetime64`` in pandas. Also, R's date/time field type, ``POSIXct``, that can +carry timezones will translate to UTC time in pandas. For example, in R, +the following data sample from an .rda shows date/time in 'America/Chicago' local +timezone: + +.. code-block:: r + + load("ppm_df.rda") + tail(ppm_df, 5) + date decimal_date monthly_average deseasonalized num_days std_dev_of_days unc_of_mon_mean + 612 2020-12-16 17:42:25 2020.958 414.25 414.98 30 0.47 0.17 + 613 2021-01-16 05:17:31 2021.042 415.52 415.26 29 0.44 0.16 + 614 2021-02-15 15:00:00 2021.125 416.75 415.93 28 1.02 0.37 + 615 2021-03-18 01:42:28 2021.208 417.64 416.18 28 0.86 0.31 + 616 2021-04-17 12:17:31 2021.292 419.05 416.23 24 1.12 0.44 + +In pandas, conversion shows adjustment in hours to UTC: + +.. ipython:: python + + r_dfs = pd.read_rdata(os.path.join(file_path, "ppm_df.rda")) + r_dfs["ppm_df"].tail() + +Below is summary of how ``read_rdata`` handles data types between R and pandas. + +.. list-table:: + :widths: 25 25 25 + :header-rows: 1 + + * - R types + - Conversion notes + - pandas types + * - logical + - + - bool + * - integer + - + - int64 + * - numeric + - + - float64 + * - Date + - + - datetime64[ns] + * - POSIXct + - UTC conversion + - datetime64[ns] + * - factor + - + - Categorical + * - character + - + - object + +.. _io.rdata_writer: + +Writing R data +'''''''''''''' + +.. versionadded:: 1.3.0 + +The method :func:`~pandas.core.frame.DataFrame.to_rdata` will write a DataFrame +into R data files (.RData, .rda, and .rds). + +For a single DataFrame in rds type, pass in a file or buffer in method: + +.. ipython:: python + + env_dfs["plants_df"].to_rdata("plants_df.rds") + +For a single DataFrame in RData or rda types, pass in a file or buffer in method +and optionally give it a name: + +.. 
ipython:: python + + env_dfs["ghg_df"].to_rdata("ghg_df.rda", rda_name="ghg_df") + +.. note:: + + While RData and rda types can hold multiple R objects, this method currently + only supports writing out a single DataFrame. + +Even write to a buffer and read its content (and be sure to adjust default +``gzip`` compression to ``compression=None``): + +.. ipython:: python + + with BytesIO() as b_io: + env_dfs["sea_ice_df"].to_rdata( + b_io, + file_format="rda", + index=False, + compression=None, + ) + print( + pd.read_rdata( + b_io.getvalue(), + file_format="rda", + rownames=False, + compression=None, + )["pandas_dataframe"].tail() + ) + +While DataFrame index will not map into R rownames, by default ``index=True`` +will output as a named column or multiple columns for MultiIndex. + +.. ipython:: python + + env_dfs["ghg_df"].rename_axis(None).to_rdata("ghg_df.rds") + + pd.read_rdata("ghg_df.rds")["r_dataframe"].tail() + +To ignore the index, use ``index=False``: + +.. ipython:: python + + env_dfs["ghg_df"].rename_axis(None).to_rdata("ghg_df.rds", index=False) + + pd.read_rdata("ghg_df.rds")["r_dataframe"].tail() + +By default, these R serialized types are compressed files in either gzip, bzip2, +or xz algorithms. Similar to R, the default ``compression`` type in this method +is "gzip" or "gz". Notice size difference of compressed and uncompressed files: + +.. ipython:: python + + env_dfs["plants_df"].to_rdata("plants_df_gz.rds") + env_dfs["plants_df"].to_rdata("plants_df_bz2.rds", compression="bz2") + env_dfs["plants_df"].to_rdata("plants_df_xz.rds", compression="xz") + env_dfs["plants_df"].to_rdata("plants_df_non_comp.rds", compression=None) + + os.stat("plants_df_gz.rds").st_size + os.stat("plants_df_bz2.rds").st_size + os.stat("plants_df_xz.rds").st_size + os.stat("plants_df_non_comp.rds").st_size + +Like other IO methods, ``storage_options`` are enabled to write to those platforms: + +.. 
code-block:: ipython + + env_dfs["ghg_df"].to_rdata( + "s3://path/to/my/storage/pandas_df.rda", + storage_options={"user": "xxx", "password": "???"} + ) + +.. ipython:: python + :suppress: + + os.remove("ghg_df.rds") + os.remove("ghg_df.rda") + os.remove("plants_df.rds") + os.remove("plants_df_gz.rds") + os.remove("plants_df_bz2.rds") + os.remove("plants_df_xz.rds") + os.remove("plants_df_non_comp.rds") + +Once exported, the single DataFrame can be read or loaded in R: + +.. code-block:: r + + plants_df <- readRDS("plants_df.rds") + plants_df + plant_group status count + 16 Pteridophytes Data Deficient 398 + 17 Pteridophytes Extinct 65 + 18 Pteridophytes Not Threatened 1294 + 19 Pteridophytes Possibly Threatened 408 + 20 Pteridophytes Threatened 1275 + + load("ghg_df.rda") + + mget(list=ls()) + $ghg_df + gas year emissions + 141 Carbon dioxide 2018 5424.8815 + 142 Methane 2018 634.4571 + 143 Nitrous oxide 2018 434.5286 + 144 Fluorinated gases 2018 182.7824 + 145 Total 2018 6676.6496 + +.. note:: + + R does not support all dtypes of pandas. For special dtypes, you may + have to prepare or clean data in either end (R or pandas side) to + meet your specific data needs. + +Below is summary of how ``to_rdata`` handles data types between pandas +and R in order to translate pandas simpler dtypes to R's atomic types. + +.. list-table:: + :widths: 25 25 25 + :header-rows: 1 + + * - pandas types + - Conversion notes + - R types + * - bool + - + - logical + * - any uint or int + - + - integer + * - any float + - + - numeric + * - datetime64[ns] + - + - POSIXct + * - datetime64[ns, tz] + - remove tz awareness + - POSIXct + * - timedelta + - convert to seconds + - numeric + * - object + - + - character + * - all other dtypes + - convert to string + - character + .. _io.stata: Stata format @@ -5982,6 +6384,7 @@ outside of this range, the variable is cast to ``int16``. 115 dta file format. 
Attempting to write *Stata* dta files with strings longer than 244 characters raises a ``ValueError``. + .. _io.stata_reader: Reading from Stata format diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b92e414f2055e..0e7f74376125f 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -112,6 +112,109 @@ both XPath 1.0 and XSLT 1.0 are available. (:issue:`27554`) For more, see :ref:`io.xml` in the user guide on IO tools. +.. _whatsnew_130.read_to_rdata: + +Read and write R data files +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We added I/O support to read and write R data files (.RData, .rda, .rds) using +:func:`pandas.read_rdata` and :meth:`DataFrame.to_rdata`. Both methods rely on +the `librdata`_ C library to support open source data migration between R and +Python pandas. (:issue:`40287`) + +.. _librdata: https://github.com/WizardMac/librdata + +For example, consider the below generated data frame and matrix in R: + +.. code-block:: r + + In [1]: carbon_ppm_df <- data.frame( + ...: year = c(2020, 2020, 2020, 2021, 2021), + ...: month = c(10, 11, 12, 1, 2), + ...: monthly_average = c(411.51, 413.11, 414.25, 415.52, 416.75), + ...: num_days = c(30, 27, 30, 29, 28), + ...: st_dev_of_days = c(0.22, 0.8, 0.48, 0.44, 1.01), + ...: unc_mon_mean = c(0.08, 0.29, 0.17, 0.16, 0.36) + ...: ) + + In [2]: iucn_species_mtx <- matrix( + ...: c(102, 79, 159, 63, 30, 13, 267, 35, 85, + ...: 30, 10, 5, 1, 2, 7, 14, 2, 2, + ...: 409, 121, 22, 75, 40, 78, 134, 146, 28, + ...: 29, 6, 0, 0, 0, 12, 2, 1, 0, + ...: 3770, 627, 223, 365, 332, 699, 604, 663, 225, + ...: 6972, 989, 460, 730, 588, 1302, 518, 1060, 542, + ...: 7089, 1219, 798, 831, 538, 1051, 975, 719, 556, + ...: 2990, 4251, 52, 2819, 1220, 914, 1648, 1184, 845, + ...: 43885, 20685, 11158, 10865, 8492, 8192, 7326, 7212, 5940 + ...: ), + ...: ncol=9, nrow=9, + ...: dimnames = list( + ...: c("MAGNOLIOPSIDA", "ACTINOPTERYGII", "AVES", + ...: "INSECTA", "REPTILIA", "LILIOPSIDA", 
+ ...: "GASTROPODA", "AMPHIBIA", "MAMMALIA"), + ...: c("EX", "EW", "CR(PE)", "CR(PEW)", "CR", + ...: "EN", "VU", "DD", "Total") + ...: ) + ...: ) + + In [3]: saveRDS(carbon_ppm_df, "ppm_df_r.rds") + In [4]: save(carbon_ppm_df, iucn_species_mtx, file="env_objs_r.rda") + +Now, both R data files can be read in pandas; each call returns a ``dict`` of DataFrames +(a single ``r_dataframe`` entry for .rds types, one entry per object for .RData and .rda types): + +.. code-block:: ipython + + In [1]: ppm_df = pd.read_rdata("ppm_df_r.rds")["r_dataframe"] + In [2]: ppm_df + Out[2]: + year month monthly_average deseasonalized num_days std_dev_of_days unc_of_mon_mean + rownames + 1 2020 12 414.25 414.98 30 0.47 0.17 + 2 2021 1 415.52 415.26 29 0.44 0.16 + 3 2021 2 416.75 415.93 28 1.02 0.37 + 4 2021 3 417.64 416.18 28 0.86 0.31 + 5 2021 4 419.05 416.23 24 1.12 0.44 + + In [4]: env_objs = pd.read_rdata("env_objs_r.rda") + Out[4]: + {'carbon_ppm_df': + year month monthly_average deseasonalized num_days std_dev_of_days unc_of_mon_mean + rownames + 1 2020 12 414.25 414.98 30 0.47 0.17 + 2 2021 1 415.52 415.26 29 0.44 0.16 + 3 2021 2 416.75 415.93 28 1.02 0.37 + 4 2021 3 417.64 416.18 28 0.86 0.31 + 5 2021 4 419.05 416.23 24 1.12 0.44 + + [5 rows x 7 columns], + 'iucn_species_mtx': + EX EW CR(PE) CR(PEW) CR EN VU DD Total + rownames + MAGNOLIOPSIDA 102 30 409 29 3770 6972 7089 2990 43885 + ACTINOPTERYGII 79 10 121 6 627 989 1219 4251 20685 + AVES 159 5 22 0 223 460 798 52 11158 + INSECTA 63 1 75 0 365 730 831 2819 10865 + REPTILIA 30 2 40 0 332 588 538 1220 8492 + LILIOPSIDA 13 7 78 12 699 1302 1051 914 8192 + GASTROPODA 267 14 134 2 604 518 975 1648 7326 + AMPHIBIA 35 2 146 1 663 1060 719 1184 7212 + + [8 rows x 9 columns]} + +Additionally, pandas data can be written back out into the same R data files: + +.. 
code-block:: ipython + + In [5]: ppm_df.to_rdata("ppm_df_py.rds") + In [6]: env_objs['iucn_species_mtx'].to_rdata( + ...: "iucn_species_py.rda", + ...: rda_name="iucn_species_df" + ...: ) + +For more, see :ref:`io.rdata` in the user guide on IO tools. + .. _whatsnew_130.enhancements.styler: Styler enhancements @@ -234,7 +337,6 @@ For example: df df.rolling("2D", center=True).mean() - .. _whatsnew_130.enhancements.other: Other enhancements diff --git a/pandas/__init__.py b/pandas/__init__.py index db4043686bcbb..498696938d079 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -171,6 +171,7 @@ read_stata, read_sas, read_spss, + read_rdata, ) from pandas.io.json import _json_normalize as json_normalize diff --git a/pandas/_libs/src/librdata/CKHashTable.c b/pandas/_libs/src/librdata/CKHashTable.c new file mode 100644 index 0000000000000..c0312e3f5dc74 --- /dev/null +++ b/pandas/_libs/src/librdata/CKHashTable.c @@ -0,0 +1,350 @@ +// CKHashTable - A simple hash table +// Copyright 2010-2020 Evan Miller (see LICENSE) + +#include "CKHashTable.h" + +/* + SipHash reference C implementation + + Copyright (c) 2012 Jean-Philippe Aumasson + Copyright (c) 2012 Daniel J. Bernstein + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+ */
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint8_t u8;
+
+
+/* Rotate the 64-bit value x left by b bits. */
+#define ROTL(x, b) (u64)( ((x) << (b)) | ( (x) >> (64 - (b))) )
+
+/* Store the 32-bit value v into p[0..3], least-significant byte first. */
+#define U32TO8_LE(p, v) \
+(p)[0] = (u8)((v) ); (p)[1] = (u8)((v) >> 8); \
+(p)[2] = (u8)((v) >> 16); (p)[3] = (u8)((v) >> 24);
+
+/* Store the 64-bit value v into p[0..7], least-significant byte first. */
+#define U64TO8_LE(p, v) \
+U32TO8_LE((p), (u32)((v) )); \
+U32TO8_LE((p) + 4, (u32)((v) >> 32));
+
+/* Load a 64-bit value from p[0..7], least-significant byte first. */
+#define U8TO64_LE(p) \
+(((u64)((p)[0])) | \
+((u64)((p)[1]) << 8) | \
+((u64)((p)[2]) << 16) | \
+((u64)((p)[3]) << 24) | \
+((u64)((p)[4]) << 32) | \
+((u64)((p)[5]) << 40) | \
+((u64)((p)[6]) << 48) | \
+((u64)((p)[7]) << 56))
+
+/* One mixing round over the four state words v0..v3 of the caller. */
+#define SIPROUND \
+do { \
+v0 += v1; v1=ROTL(v1, 13); v1 ^= v0; v0=ROTL(v0, 32); \
+v2 += v3; v3=ROTL(v3, 16); v3 ^= v2; \
+v0 += v3; v3=ROTL(v3, 21); v3 ^= v0; \
+v2 += v1; v1=ROTL(v1, 17); v1 ^= v2; v2=ROTL(v2, 32); \
+} while (0)
+
+/*
+ * SipHash-1-2: one SIPROUND per 8-byte input word, two SIPROUNDs in the
+ * finalization.
+ *
+ *   out    receives the 8-byte hash, least-significant byte first
+ *   in     input bytes
+ *   inlen  number of input bytes
+ *   k      16-byte key, read as two little-endian 64-bit words
+ *
+ * Always returns 0.
+ */
+static int siphash(
+    unsigned char *out,
+    const unsigned char *in,
+    unsigned long long inlen,
+    const unsigned char *k) {
+    /* "somepseudorandomlygeneratedbytes" */
+    u64 v0 = 0x736f6d6570736575ULL;
+    u64 v1 = 0x646f72616e646f6dULL;
+    u64 v2 = 0x6c7967656e657261ULL;
+    u64 v3 = 0x7465646279746573ULL;
+    u64 b;
+    u64 k0 = U8TO64_LE(k);
+    u64 k1 = U8TO64_LE(k + 8);
+    u64 m;
+    /* `end` marks the last whole 8-byte word; `left` counts the tail bytes. */
+    const u8 *end = in + inlen - ( inlen % sizeof( u64 ) );
+    const int left = inlen & 7;
+    /* The final word carries the input length in its top byte. */
+    b = ((u64)inlen) << 56;
+    v3 ^= k1;
+    v2 ^= k0;
+    v1 ^= k1;
+    v0 ^= k0;
+
+    for ( ; in != end; in += 8 ) {
+        m = U8TO64_LE(in);
+
+        v3 ^= m;
+
+        SIPROUND;
+
+        v0 ^= m;
+    }
+
+    /* Cases fall through on purpose so every remaining tail byte is
+     * folded into b. */
+    switch ( left ) {
+        case 7: b |= ((u64)in[ 6]) << 48;
+
+        case 6: b |= ((u64)in[ 5]) << 40;
+
+        case 5: b |= ((u64)in[ 4]) << 32;
+
+        case 4: b |= ((u64)in[ 3]) << 24;
+
+        case 3: b |= ((u64)in[ 2]) << 16;
+
+        case 2: b |= ((u64)in[ 1]) << 8;
+
+        case 1: b |= ((u64)in[ 0]); break;
+
+        case 0: break;
+    }
+
+    v3 ^= b;
+
+    SIPROUND;
+
+    v0 ^= b;
+    v2 ^= 0xff;
+
+    SIPROUND;
+    SIPROUND;
+
+    b = v0 ^ v1 ^ v2 ^ v3;
+    U64TO8_LE(out, b);
+    return 0;
+}
+
+inline
uint64_t ck_hash_str(const char *str, size_t keylen) {
+    /* Hash the first keylen bytes of str with siphash() under an all-zero
+     * 16-byte key; the 8 output bytes are returned as a uint64_t. */
+    uint64_t hash;
+    unsigned char k[16] = { 0 };
+    siphash((unsigned char *)&hash, (const unsigned char *)str, keylen, k);
+    return hash;
+}
+
+/* Look up a float key; the key is the raw sizeof(float) bytes of `key`. */
+const void *ck_float_hash_lookup(float key, ck_hash_table_t *table) {
+    return ck_str_n_hash_lookup((const char *)&key, sizeof(float), table);
+}
+
+/* Insert under a float key (raw bytes); see ck_str_n_hash_insert. */
+int ck_float_hash_insert(
+    float key,
+    const void *value,
+    ck_hash_table_t *table
+) {
+    return ck_str_n_hash_insert(
+        (const char *)&key,
+        sizeof(float),
+        value,
+        table);
+}
+
+/* Look up a double key; the key is the raw sizeof(double) bytes of `key`. */
+const void *ck_double_hash_lookup(double key, ck_hash_table_t *table) {
+    return ck_str_n_hash_lookup((const char *)&key, sizeof(double), table);
+}
+
+/* Insert under a double key (raw bytes); see ck_str_n_hash_insert. */
+int ck_double_hash_insert(
+    double key,
+    const void *value,
+    ck_hash_table_t *table
+) {
+    return ck_str_n_hash_insert(
+        (const char *)&key,
+        sizeof(double),
+        value,
+        table);
+}
+
+/* Look up a NUL-terminated string key (length taken with strlen). */
+const void *ck_str_hash_lookup(const char *key, ck_hash_table_t *table) {
+    size_t keylen = strlen(key);
+    return ck_str_n_hash_lookup(key, keylen, table);
+}
+
+/*
+ * Look up the keylen bytes at `key`.
+ *
+ * Returns the stored value pointer, or NULL when the table is empty, keylen
+ * is 0, or the key is not present.
+ */
+const void *ck_str_n_hash_lookup(
+    const char *key,
+    size_t keylen,
+    ck_hash_table_t *table
+) {
+    if (table->count == 0)
+        return NULL;
+
+    if (keylen == 0)
+        return NULL;
+
+    /* Linear probing: start at hash % capacity and walk forward, wrapping,
+     * until the key is found, an empty slot is hit, or we are back at the
+     * starting slot. */
+    uint64_t hash_key = ck_hash_str(key, keylen);
+    hash_key %= table->capacity;
+    uint64_t end = hash_key;
+    do {
+        char *this_key = &table->keys[table->entries[hash_key].key_offset];
+        size_t this_keylen = table->entries[hash_key].key_length;
+        /* A zero key_length marks an unused slot, which ends the probe. */
+        if (this_keylen == 0)
+            return NULL;
+        if (this_keylen == keylen && memcmp(this_key, key, keylen) == 0) {
+            return table->entries[hash_key].value;
+        }
+        hash_key++;
+        hash_key %= table->capacity;
+    } while (hash_key != end);
+    return NULL;
+}
+
+/* Insert under a NUL-terminated string key (length taken with strlen). */
+int ck_str_hash_insert(
+    const char *key,
+    const void *value,
+    ck_hash_table_t *table
+) {
+    size_t keylen = strlen(key);
+    return ck_str_n_hash_insert(key, keylen, value, table);
+}
+
+/* Insert an entry whose key bytes are already stored in table->keys. */
+static int ck_hash_insert_nocopy(
+    off_t key_offset,
+    size_t keylen,
+    uint64_t hash_key,
+    const void
*value,
+    ck_hash_table_t *table
+) {
+    /*
+     * The key bytes already live in table->keys at key_offset, so only the
+     * slot is filled in; no key data is copied.
+     * Returns 1 when the entry was stored or updated, 0 when the table has
+     * no capacity or no usable slot was found.
+     */
+    if (table->capacity == 0)
+        return 0;
+
+    hash_key %= table->capacity;
+    /*
+     * Start and stop at the home slot so that each of the `capacity` slots
+     * is examined exactly once, the same probe shape ck_str_n_hash_insert
+     * uses. Stopping at "home - 1" would leave one slot unprobed and would
+     * never enter the loop for a one-slot table.
+     */
+    uint64_t end = hash_key;
+    do {
+        ck_hash_entry_t *entry = &table->entries[hash_key];
+        if (entry->key_length == 0) {
+            table->count++;
+            entry->key_offset = key_offset;
+            entry->key_length = keylen;
+            entry->value = value;
+            return 1;
+        } else if (entry->key_length == keylen &&
+                    entry->key_offset == key_offset) {
+            /* Same stored key (same offset and length): replace the value. */
+            entry->value = value;
+            return 1;
+        }
+        hash_key++;
+        hash_key %= table->capacity;
+    } while (hash_key != end);
+    return 0;
+}
+
+/*
+ * Insert the keylen bytes at `key` with `value`, or replace the value if the
+ * key is already present. The key bytes are copied into table->keys.
+ *
+ * Returns 1 on success. Returns 0 when the table has zero capacity, keylen
+ * is 0, the table or its key storage could not be grown, or no slot was
+ * found; in those cases the table contents are left as they were.
+ */
+int ck_str_n_hash_insert(
+    const char *key,
+    size_t keylen,
+    const void *value,
+    ck_hash_table_t *table
+) {
+    if (table->capacity == 0)
+        return 0;
+
+    if (keylen == 0)
+        return 0;
+
+    /* Grow the slot array once it is 75% full. */
+    if (table->count >= 0.75 * table->capacity) {
+        if (ck_hash_table_grow(table) == -1) {
+            return 0;
+        }
+    }
+
+    uint64_t hash_key = ck_hash_str(key, keylen);
+    hash_key %= table->capacity;
+    uint64_t end = hash_key;
+    do {
+        ck_hash_entry_t *entry = &table->entries[hash_key];
+        char *this_key = &table->keys[entry->key_offset];
+        if (entry->key_length == 0) {
+            /*
+             * Make room for the key bytes before touching any bookkeeping,
+             * so a failed allocation leaves the table unchanged.
+             */
+            if (keylen > SIZE_MAX - table->keys_used)
+                return 0;
+            size_t needed = table->keys_used + keylen;
+            if (needed > table->keys_capacity) {
+                /* Doubling from 0 never terminates, so start from 1. */
+                size_t new_capacity =
+                    table->keys_capacity ? table->keys_capacity : 1;
+                while (new_capacity < needed) {
+                    if (new_capacity > SIZE_MAX / 2)
+                        return 0;
+                    new_capacity *= 2;
+                }
+                /* Keep the old pointer until realloc is known to succeed;
+                 * on failure the original buffer is still valid. */
+                char *new_keys = realloc(table->keys, new_capacity);
+                if (new_keys == NULL)
+                    return 0;
+                table->keys = new_keys;
+                table->keys_capacity = new_capacity;
+            }
+            table->count++;
+            memcpy(table->keys + table->keys_used, key, keylen);
+            entry->key_offset = table->keys_used;
+            entry->key_length = keylen;
+            table->keys_used += keylen;
+            entry->value = value;
+            return 1;
+        } else if (entry->key_length == keylen &&
+                    memcmp(this_key, key, keylen) == 0) {
+            table->entries[hash_key].value = value;
+            return 1;
+        }
+        hash_key++;
+        hash_key %= table->capacity;
+    } while (hash_key != end);
+    return 0;
+}
+
+/*
+ * Allocate a table with key storage of num_entries * mean_key_length bytes.
+ * Returns NULL if any allocation fails.
+ */
+ck_hash_table_t *ck_hash_table_init(
+    size_t num_entries,
+    size_t mean_key_length
+) {
+    ck_hash_table_t *table;
+    if ((table = malloc(sizeof(ck_hash_table_t))) == NULL)
+        return NULL;
+
+    if ((table->keys = malloc(num_entries *
mean_key_length)) == NULL) {
+        free(table);
+        return NULL;
+    }
+    table->keys_capacity = num_entries * mean_key_length;
+
+    /* Allocate twice as many slots as requested entries. */
+    num_entries *= 2;
+
+    if ((table->entries = malloc(
+        num_entries * sizeof(ck_hash_entry_t))) == NULL
+    ) {
+        free(table->keys);
+        free(table);
+        return NULL;
+    }
+    table->capacity = num_entries;
+    ck_hash_table_wipe(table);
+    return table;
+}
+
+/* Release the slot array, the key storage and the table itself.
+ * A NULL table is ignored. */
+void ck_hash_table_free(ck_hash_table_t *table) {
+    if (table == NULL)
+        return;
+    free(table->entries);
+    free(table->keys);  /* free(NULL) is a no-op, no guard needed */
+    free(table);
+}
+
+/* Remove every entry but keep the allocated storage for reuse. */
+void ck_hash_table_wipe(ck_hash_table_t *table) {
+    table->keys_used = 0;
+    table->count = 0;
+    memset(table->entries, 0, table->capacity * sizeof(ck_hash_entry_t));
+}
+
+/*
+ * Double the number of slots and re-insert every existing entry.
+ *
+ * Returns 0 on success and -1 on failure. On failure the table is left
+ * exactly as it was before the call (same slot array, capacity and count),
+ * so it remains usable and nothing is leaked.
+ */
+int ck_hash_table_grow(ck_hash_table_t *table) {
+    ck_hash_entry_t *old_entries = table->entries;
+    size_t old_capacity = table->capacity;
+    size_t old_count = table->count;
+
+    if (old_capacity > SIZE_MAX / 2)
+        return -1;
+    size_t new_capacity = 2 * old_capacity;
+
+    /* Allocate into a local first: assigning straight to table->entries
+     * would lose the only pointer to the old array if calloc fails. */
+    ck_hash_entry_t *new_entries = calloc(
+        new_capacity,
+        sizeof(ck_hash_entry_t));
+    if (new_entries == NULL)
+        return -1;
+
+    table->entries = new_entries;
+    table->capacity = new_capacity;
+    table->count = 0;
+    /* size_t index: capacity is a size_t and may exceed UINT_MAX. */
+    for (size_t i = 0; i < old_capacity; i++) {
+        if (old_entries[i].key_length != 0) {
+            char *this_key = &table->keys[old_entries[i].key_offset];
+            uint64_t hash_key = ck_hash_str(
+                this_key,
+                old_entries[i].key_length);
+            if (!ck_hash_insert_nocopy(
+                old_entries[i].key_offset,
+                old_entries[i].key_length,
+                hash_key,
+                old_entries[i].value, table)
+            ) {
+                /* Roll back to the old slot array. */
+                free(new_entries);
+                table->entries = old_entries;
+                table->capacity = old_capacity;
+                table->count = old_count;
+                return -1;
+            }
+        }
+    }
+    free(old_entries);
+    return 0;
+}
diff --git a/pandas/_libs/src/librdata/CKHashTable.h b/pandas/_libs/src/librdata/CKHashTable.h
new file mode 100644
index 0000000000000..17190e02e3521
--- /dev/null
+++ b/pandas/_libs/src/librdata/CKHashTable.h
@@ -0,0 +1,55 @@
+// CKHashTable - A simple hash table
+// Copyright 2010-2020 Evan Miller (see LICENSE)
+
+#ifndef PANDAS__LIBS_SRC_LIBRDATA_CKHASHTABLE_H_
+#define PANDAS__LIBS_SRC_LIBRDATA_CKHASHTABLE_H_
+
+#include <sys/types.h>
+#include <stdint.h>
+
+// One slot of the table; key_length == 0 marks an unused slot.
+typedef struct ck_hash_entry_s {
+    off_t key_offset;
+    size_t key_length;
+    const void *value;
+}
ck_hash_entry_t;
+
+// Open-addressing hash table whose key bytes are stored back to back in
+// one shared buffer.
+typedef struct ck_hash_table_s {
+    size_t capacity;           // number of slots in `entries`
+    size_t count;              // number of slots currently in use
+    ck_hash_entry_t *entries;  // slot array
+    char *keys;                // buffer holding the bytes of every key
+    size_t keys_used;          // bytes of `keys` already occupied
+    size_t keys_capacity;      // allocated size of `keys` in bytes
+} ck_hash_table_t;
+
+// Insert functions return 1 on success and 0 on failure.
+// Lookup functions return the stored value pointer, or NULL if not found.
+int ck_str_hash_insert(
+    const char *key, const void *value, ck_hash_table_t *table
+);
+const void *ck_str_hash_lookup(const char *key, ck_hash_table_t *table);
+
+// Same as above for keys given as an explicit byte length.
+int ck_str_n_hash_insert(
+    const char *key, size_t keylen, const void *value, ck_hash_table_t *table
+);
+const void *ck_str_n_hash_lookup(
+    const char *key, size_t keylen, ck_hash_table_t *table
+);
+
+// Float and double keys are hashed by their raw object bytes.
+int ck_float_hash_insert(
+    float key, const void *value, ck_hash_table_t *table
+);
+const void *ck_float_hash_lookup(float key, ck_hash_table_t *table);
+
+int ck_double_hash_insert(
+    double key, const void *value, ck_hash_table_t *table
+);
+const void *ck_double_hash_lookup(double key, ck_hash_table_t *table);
+
+// Returns NULL if allocation fails.
+ck_hash_table_t *ck_hash_table_init(
+    size_t num_entries, size_t mean_key_length
+);
+void ck_hash_table_wipe(ck_hash_table_t *table);
+// Returns 0 on success, -1 on failure.
+int ck_hash_table_grow(ck_hash_table_t *table);
+void ck_hash_table_free(ck_hash_table_t *table);
+uint64_t ck_hash_str(const char *str, size_t keylen);
+
+#endif // PANDAS__LIBS_SRC_LIBRDATA_CKHASHTABLE_H_
diff --git a/pandas/_libs/src/librdata/rdata.h b/pandas/_libs/src/librdata/rdata.h
new file mode 100644
index 0000000000000..216c2cbab11d0
--- /dev/null
+++ b/pandas/_libs/src/librdata/rdata.h
@@ -0,0 +1,257 @@
+/*
+Copyright (c) 2020 Evan Miller
+*/
+
+#ifndef PANDAS__LIBS_SRC_LIBRDATA_RDATA_H_
+#define PANDAS__LIBS_SRC_LIBRDATA_RDATA_H_
+
+/* NOTE(review): the four #include directives below have no header names;
+ * they look stripped during extraction. Restore from the upstream header
+ * before compiling. TODO confirm. */
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum rdata_type_e {
+    RDATA_TYPE_STRING,
+    RDATA_TYPE_INT32,
+    RDATA_TYPE_REAL,
+    RDATA_TYPE_LOGICAL,
+    RDATA_TYPE_TIMESTAMP,
+    RDATA_TYPE_DATE
+} rdata_type_t;
+
+typedef enum rdata_error_e {
+    RDATA_OK,
+    RDATA_ERROR_OPEN = 1,
+    RDATA_ERROR_SEEK,
+    RDATA_ERROR_READ,
+    RDATA_ERROR_MALLOC,
+    RDATA_ERROR_USER_ABORT,
+
RDATA_ERROR_PARSE, + RDATA_ERROR_WRITE, + RDATA_ERROR_FACTOR, + RDATA_ERROR_UNSUPPORTED_COMPRESSION, + RDATA_ERROR_UNSUPPORTED_CHARSET, + RDATA_ERROR_CONVERT, + RDATA_ERROR_CONVERT_BAD_STRING, + RDATA_ERROR_CONVERT_LONG_STRING, + RDATA_ERROR_CONVERT_SHORT_STRING, + RDATA_ERROR_UNSUPPORTED_S_EXPRESSION, + RDATA_ERROR_UNSUPPORTED_STORAGE_CLASS +} rdata_error_t; + +typedef enum rdata_file_format_e { + RDATA_WORKSPACE, + RDATA_SINGLE_OBJECT +} rdata_file_format_t; + +const char *rdata_error_message(rdata_error_t error_code); + +typedef int (*rdata_column_handler)(const char *name, rdata_type_t type, + void *data, long count, void *ctx); +typedef int (*rdata_table_handler)(const char *name, void *ctx); +typedef int (*rdata_text_value_handler)( + const char *value, int index, void *ctx +); +typedef int (*rdata_column_name_handler)( + const char *value, int index, void *ctx +); +typedef void (*rdata_error_handler)(const char *error_message, void *ctx); +typedef int (*rdata_progress_handler)(double progress, void *ctx); + +#if defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +typedef __int64 rdata_off_t; +#elif defined _WIN32 || defined __CYGWIN__ +typedef _off64_t rdata_off_t; +#elif defined _AIX +typedef off64_t rdata_off_t; +#else +typedef off_t rdata_off_t; +#endif + +typedef enum rdata_io_flags_e { + RDATA_SEEK_SET, + RDATA_SEEK_CUR, + RDATA_SEEK_END +} rdata_io_flags_t; + +typedef int (*rdata_open_handler)(const char *path, void *io_ctx); +typedef int (*rdata_close_handler)(void *io_ctx); +typedef rdata_off_t (*rdata_seek_handler)( + rdata_off_t offset, rdata_io_flags_t whence, void *io_ctx +); +typedef ssize_t (*rdata_read_handler)(void *buf, size_t nbyte, void *io_ctx); +typedef rdata_error_t (*rdata_update_handler)( + long file_size, + rdata_progress_handler progress_handler, + void *user_ctx, + void *io_ctx +); + +typedef struct rdata_io_s { + rdata_open_handler open; + rdata_close_handler close; + rdata_seek_handler seek; + rdata_read_handler read; + 
rdata_update_handler update; + void *io_ctx; + int external_io; +} rdata_io_t; + +typedef struct rdata_parser_s { + rdata_table_handler table_handler; + rdata_column_handler column_handler; + rdata_column_name_handler column_name_handler; + rdata_column_name_handler row_name_handler; + rdata_text_value_handler text_value_handler; + rdata_text_value_handler value_label_handler; + rdata_column_handler dim_handler; + rdata_text_value_handler dim_name_handler; + rdata_error_handler error_handler; + rdata_io_t *io; +} rdata_parser_t; + +rdata_parser_t *rdata_parser_init(void); +void rdata_parser_free(rdata_parser_t *parser); + +rdata_error_t rdata_set_table_handler( + rdata_parser_t *parser, rdata_table_handler table_handler +); +rdata_error_t rdata_set_column_handler( + rdata_parser_t *parser, rdata_column_handler column_handler +); +rdata_error_t rdata_set_column_name_handler( + rdata_parser_t *parser, rdata_column_name_handler column_name_handler +); +rdata_error_t rdata_set_row_name_handler( + rdata_parser_t *parser, rdata_column_name_handler row_name_handler +); +rdata_error_t rdata_set_text_value_handler( + rdata_parser_t *parser, rdata_text_value_handler text_value_handler +); +rdata_error_t rdata_set_value_label_handler( + rdata_parser_t *parser, rdata_text_value_handler value_label_handler +); +rdata_error_t rdata_set_dim_handler( + rdata_parser_t *parser, rdata_column_handler dim_handler +); +rdata_error_t rdata_set_dim_name_handler( + rdata_parser_t *parser, rdata_text_value_handler dim_name_handler +); +rdata_error_t rdata_set_error_handler( + rdata_parser_t *parser, rdata_error_handler error_handler +); +rdata_error_t rdata_set_open_handler( + rdata_parser_t *parser, rdata_open_handler open_handler +); +rdata_error_t rdata_set_close_handler( + rdata_parser_t *parser, rdata_close_handler close_handler +); +rdata_error_t rdata_set_seek_handler( + rdata_parser_t *parser, rdata_seek_handler seek_handler +); +rdata_error_t rdata_set_read_handler( + 
/*
 * One column registered on a writer; instances are handed back by
 * rdata_add_column() and rdata_get_column().
 */
typedef struct rdata_column_s {
    rdata_type_t type;
    int index;  /* NOTE(review): presumably the column's position in the writer -- confirm */
    char name[256];    /* fixed-size name buffer */
    char label[1024];  /* fixed-size label buffer; see rdata_column_set_label() */

    int32_t factor_count;  /* NOTE(review): presumably the number of entries in factor -- confirm */
    char **factor;         /* factor strings; see rdata_column_add_factor() */
} rdata_column_t;
/* Returns non-zero when the lowest-addressed byte of an int holds its
 * least significant bits. */
int machine_is_little_endian() {
    const int probe = 1;
    const char *first_byte = (const char *)&probe;
    return first_byte[0];
}

/* Swap the two bytes of a 16-bit value. */
uint16_t byteswap2(uint16_t num) {
    uint16_t upper = (uint16_t)(num >> 8);
    uint16_t lower = (uint16_t)(num << 8);
    return (uint16_t)(upper | lower);
}

/* Reverse the four bytes of a 32-bit value: swap halves, then swap the
 * bytes inside each half. */
uint32_t byteswap4(uint32_t num) {
    uint32_t halves = (num >> 16) | (num << 16);
    return ((halves >> 8) & 0x00FF00FFu) | ((halves << 8) & 0xFF00FF00u);
}

/* Reverse the eight bytes of a 64-bit value by successive halving. */
uint64_t byteswap8(uint64_t num) {
    uint64_t v = (num >> 32) | (num << 32);
    v = ((v >> 16) & 0x0000FFFF0000FFFFULL) |
        ((v << 16) & 0xFFFF0000FFFF0000ULL);
    return ((v >> 8) & 0x00FF00FF00FF00FFULL) |
           ((v << 8) & 0xFF00FF00FF00FF00ULL);
}

/* Reverse the byte order of a float by round-tripping its bits through a
 * 32-bit integer. */
float byteswap_float(float num) {
    uint32_t bits = 0;
    memcpy(&bits, &num, sizeof(bits));
    bits = byteswap4(bits);
    memcpy(&num, &bits, sizeof(bits));
    return num;
}
/*
Copyright (c) 2020 Evan Miller
*/

//
// rdata_bits.h - Bit-twiddling utility functions
//

#ifndef PANDAS__LIBS_SRC_LIBRDATA_RDATA_BITS_H_
#define PANDAS__LIBS_SRC_LIBRDATA_RDATA_BITS_H_

/* The prototypes below use the fixed-width integer types, so pull them in
 * here rather than relying on every includer having done so first. */
#include <stdint.h>

/* Returns non-zero when the host is little-endian. */
int machine_is_little_endian(void);

/* Reverse the byte order of a 16-, 32- or 64-bit unsigned integer. */
uint16_t byteswap2(uint16_t num);
uint32_t byteswap4(uint32_t num);
uint64_t byteswap8(uint64_t num);

/* Reverse the byte order of a floating-point value's representation. */
float byteswap_float(float num);
double byteswap_double(double num);

#endif  // PANDAS__LIBS_SRC_LIBRDATA_RDATA_BITS_H_
/* Everything between push and pop is laid out with 1-byte packing. */
#pragma pack(push, 1)

/*
 * Version header. rdata_parse() fills it directly from the input with
 * read_st(ctx, &v2_header, sizeof(v2_header)) and then byte-swaps the three
 * version fields when ctx->machine_needs_byteswap is set.
 */
typedef struct rdata_v2_header_s {
    char header[2];
    /* rdata_parse() reads an encoding string next when this equals 3. */
    uint32_t format_version;
    uint32_t writer_version;
    uint32_t reader_version;
} rdata_v2_header_t;

/*
 * Bit-field breakdown of an S-expression header; the widths add up to
 * 32 bits.
 * NOTE(review): bit-field ordering is implementation-defined in C, so
 * confirm how the reader maps stream bytes onto these fields.
 */
typedef struct rdata_sexptype_header_s {
    /* NOTE(review): presumably one of the RDATA_SEXPTYPE_* or
     * RDATA_PSEUDO_SXP_* codes -- confirm in the reader. */
    unsigned int type:8;
    unsigned int object:1;
    unsigned int attributes:1;
    unsigned int tag:1;
    unsigned int unused:1;
    unsigned int gp:16;
    unsigned int padding:4;
} rdata_sexptype_header_t;

/* A decoded header together with three 32-bit values associated with it.
 * NOTE(review): the meaning of attributes/tag/ref is set by the reader code
 * that fills this struct -- confirm there. */
typedef struct rdata_sexptype_info_s {
    rdata_sexptype_header_t header;
    int32_t attributes;
    int32_t tag;
    int32_t ref;
} rdata_sexptype_info_t;

#pragma pack(pop)
/*
 * Special type codes occupying the range 238-255, above the ordinary
 * RDATA_SEXPTYPE_* values.
 * NOTE(review): these appear to mirror the pseudo-SEXP codes written by R's
 * serializer in place of a real type -- confirm against the R sources
 * before relying on the meaning of any individual value.
 */
#define RDATA_PSEUDO_SXP_REF 255
#define RDATA_PSEUDO_SXP_NIL 254
#define RDATA_PSEUDO_SXP_GLOBAL_ENVIRONMENT 253
#define RDATA_PSEUDO_SXP_UNBOUND_VALUE 252
#define RDATA_PSEUDO_SXP_MISSING_ARGUMENT 251
#define RDATA_PSEUDO_SXP_BASE_NAMESPACE 250
#define RDATA_PSEUDO_SXP_NAMESPACE 249
#define RDATA_PSEUDO_SXP_PACKAGE 248
#define RDATA_PSEUDO_SXP_PERSIST 247
#define RDATA_PSEUDO_SXP_CLASS_REF 246
#define RDATA_PSEUDO_SXP_GENERIC_REF 245
#define RDATA_PSEUDO_SXP_BYTE_CODE_REP_DEF 244
#define RDATA_PSEUDO_SXP_BYTE_CODE_REP_REF 243
#define RDATA_PSEUDO_SXP_EMPTY_ENVIRONMENT 242
#define RDATA_PSEUDO_SXP_BASE_ENVIRONMENT 241

/* NOTE(review): by name, variants of LANGUAGE_OBJECT and PAIRLIST that
 * carry attributes, plus the ALTREP marker -- confirm in the reader. */
#define RDATA_SEXPTYPE_LANGUAGE_OBJECT_ATTR 240
#define RDATA_SEXPTYPE_PAIRLIST_ATTR 239
#define RDATA_PSEUDO_SXP_ALTREP 238
b/pandas/_libs/src/librdata/rdata_io_unistd.c @@ -0,0 +1,101 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#include +#include +#if defined _WIN32 || defined __CYGWIN__ + #include +#else + #include +#endif + + +#include "rdata.h" +#include "rdata_io_unistd.h" + +#if defined _WIN32 || defined __CYGWIN__ +#define UNISTD_OPEN_OPTIONS O_RDONLY | O_BINARY +#elif defined _AIX +#define UNISTD_OPEN_OPTIONS O_RDONLY | O_LARGEFILE +#else +#define UNISTD_OPEN_OPTIONS O_RDONLY +#endif + +#if defined _WIN32 || defined _AIX +#define lseek lseek +#endif + + +int rdata_unistd_open_handler(const char *path, void *io_ctx) { + int fd = open(path, UNISTD_OPEN_OPTIONS); + ((rdata_unistd_io_ctx_t*) io_ctx)->fd = fd; + return fd; +} + +int rdata_unistd_close_handler(void *io_ctx) { + int fd = ((rdata_unistd_io_ctx_t*) io_ctx)->fd; + if (fd != -1) + return close(fd); + else + return 0; +} + +rdata_off_t rdata_unistd_seek_handler( + rdata_off_t offset, + rdata_io_flags_t whence, + void *io_ctx +) { + int flag = 0; + switch (whence) { + case RDATA_SEEK_SET: + flag = SEEK_SET; + break; + case RDATA_SEEK_CUR: + flag = SEEK_CUR; + break; + case RDATA_SEEK_END: + flag = SEEK_END; + break; + default: + return -1; + } + int fd = ((rdata_unistd_io_ctx_t*) io_ctx)->fd; + return lseek(fd, offset, flag); +} + +ssize_t rdata_unistd_read_handler(void *buf, size_t nbyte, void *io_ctx) { + int fd = ((rdata_unistd_io_ctx_t*) io_ctx)->fd; + ssize_t out = read(fd, buf, nbyte); + return out; +} + +rdata_error_t rdata_unistd_update_handler(long file_size, + rdata_progress_handler progress_handler, void *user_ctx, + void *io_ctx) { + if (!progress_handler) + return RDATA_OK; + + int fd = ((rdata_unistd_io_ctx_t*) io_ctx)->fd; + long current_offset = lseek(fd, 0, SEEK_CUR); + + if (current_offset == -1) + return RDATA_ERROR_SEEK; + + if (progress_handler(1.0 * current_offset / file_size, user_ctx)) + return RDATA_ERROR_USER_ABORT; + + return RDATA_OK; +} + +void rdata_unistd_io_init(rdata_parser_t *parser) { + 
rdata_set_open_handler(parser, rdata_unistd_open_handler); + rdata_set_close_handler(parser, rdata_unistd_close_handler); + rdata_set_seek_handler(parser, rdata_unistd_seek_handler); + rdata_set_read_handler(parser, rdata_unistd_read_handler); + rdata_set_update_handler(parser, rdata_unistd_update_handler); + + rdata_unistd_io_ctx_t *io_ctx = calloc(1, sizeof(rdata_unistd_io_ctx_t)); + io_ctx->fd = -1; + rdata_set_io_ctx(parser, (void*) io_ctx); +} diff --git a/pandas/_libs/src/librdata/rdata_io_unistd.h b/pandas/_libs/src/librdata/rdata_io_unistd.h new file mode 100644 index 0000000000000..661010c76c4aa --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_io_unistd.h @@ -0,0 +1,26 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#ifndef PANDAS__LIBS_SRC_LIBRDATA_RDATA_IO_UNISTD_H_ +#define PANDAS__LIBS_SRC_LIBRDATA_RDATA_IO_UNISTD_H_ + +typedef struct rdata_unistd_io_ctx_s { + int fd; +} rdata_unistd_io_ctx_t; + +int rdata_unistd_open_handler(const char *path, void *io_ctx); +int rdata_unistd_close_handler(void *io_ctx); +rdata_off_t rdata_unistd_seek_handler( + rdata_off_t offset, rdata_io_flags_t whence, void *io_ctx +); +ssize_t rdata_unistd_read_handler(void *buf, size_t nbytes, void *io_ctx); +rdata_error_t rdata_unistd_update_handler( + long file_size, + rdata_progress_handler progress_handler, + void *user_ctx, + void *io_ctx +); +void rdata_unistd_io_init(rdata_parser_t *parser); + +#endif // PANDAS__LIBS_SRC_LIBRDATA_RDATA_IO_UNISTD_H_ diff --git a/pandas/_libs/src/librdata/rdata_parser.c b/pandas/_libs/src/librdata/rdata_parser.c new file mode 100644 index 0000000000000..5d948a449fba3 --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_parser.c @@ -0,0 +1,147 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#include +#include "rdata.h" +#include "rdata_io_unistd.h" + +rdata_parser_t *rdata_parser_init() { + rdata_parser_t *parser = calloc(1, sizeof(rdata_parser_t)); + parser->io = calloc(1, sizeof(rdata_io_t)); + rdata_unistd_io_init(parser); + return parser; 
+} + +void rdata_parser_free(rdata_parser_t *parser) { + if (parser) { + if (parser->io) + free(parser->io); + free(parser); + } +} + +rdata_error_t rdata_set_table_handler( + rdata_parser_t *parser, + rdata_table_handler table_handler +) { + parser->table_handler = table_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_column_handler( + rdata_parser_t *parser, + rdata_column_handler column_handler +) { + parser->column_handler = column_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_column_name_handler( + rdata_parser_t *parser, + rdata_column_name_handler column_name_handler +) { + parser->column_name_handler = column_name_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_row_name_handler( + rdata_parser_t *parser, + rdata_column_name_handler row_name_handler +) { + parser->row_name_handler = row_name_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_text_value_handler( + rdata_parser_t *parser, + rdata_text_value_handler text_value_handler +) { + parser->text_value_handler = text_value_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_value_label_handler( + rdata_parser_t *parser, + rdata_text_value_handler value_label_handler +) { + parser->value_label_handler = value_label_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_dim_handler( + rdata_parser_t *parser, + rdata_column_handler dim_handler +) { + parser->dim_handler = dim_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_dim_name_handler( + rdata_parser_t *parser, + rdata_text_value_handler dim_name_handler +) { + parser->dim_name_handler = dim_name_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_error_handler( + rdata_parser_t *parser, + rdata_error_handler error_handler +) { + parser->error_handler = error_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_open_handler( + rdata_parser_t *parser, + rdata_open_handler open_handler +) { + parser->io->open = open_handler; + return RDATA_OK; +} + +rdata_error_t 
rdata_set_close_handler( + rdata_parser_t *parser, + rdata_close_handler close_handler +) { + parser->io->close = close_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_seek_handler( + rdata_parser_t *parser, + rdata_seek_handler seek_handler +) { + parser->io->seek = seek_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_read_handler( + rdata_parser_t *parser, + rdata_read_handler read_handler +) { + parser->io->read = read_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_update_handler( + rdata_parser_t *parser, + rdata_update_handler update_handler +) { + parser->io->update = update_handler; + return RDATA_OK; +} + +rdata_error_t rdata_set_io_ctx( + rdata_parser_t *parser, + void *io_ctx +) { + if (!parser->io->external_io) + free(parser->io->io_ctx); + + parser->io->io_ctx = io_ctx; + parser->io->external_io = 1; + + return RDATA_OK; +} diff --git a/pandas/_libs/src/librdata/rdata_read.c b/pandas/_libs/src/librdata/rdata_read.c new file mode 100644 index 0000000000000..dbc165a2273dc --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_read.c @@ -0,0 +1,2172 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +// +// rdata_rdata.c +// + +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) +#include "win_iconv.h" +#elif __linux__ +#include "unix_iconv.h" +#else +#include +#endif + +#include +#include + +#if HAVE_BZIP2 +#include +#endif + +#if HAVE_APPLE_COMPRESSION +#include +#endif + +#if HAVE_ZLIB +#include +#endif + +#if HAVE_LZMA +#include +#endif + +#include "rdata.h" +#include "rdata_internal.h" + +#define RDATA_CLASS_POSIXCT 0x01 +#define RDATA_CLASS_DATE 0x02 + +#define STREAM_BUFFER_SIZE 65536 +#define MAX_ARRAY_DIMENSIONS 3 + +/* ICONV_CONST defined by autotools during configure according + * to the current platform. 
/*
 * Per-parse state. rdata_ctx_init() allocates it zero-filled, rdata_parse()
 * copies the parser's callbacks into it, and free_rdata_ctx() releases it.
 */
typedef struct rdata_ctx_s {
    /* Set to 1 by rdata_ctx_init() on little-endian hosts. */
    int machine_needs_byteswap;
    /* Callbacks copied from the rdata_parser_t by rdata_parse(). */
    rdata_table_handler table_handler;
    rdata_column_handler column_handler;
    rdata_column_name_handler column_name_handler;
    rdata_column_name_handler row_name_handler;
    rdata_text_value_handler text_value_handler;
    rdata_text_value_handler value_label_handler;
    rdata_column_handler dim_handler;
    rdata_text_value_handler dim_name_handler;
    rdata_error_handler error_handler;
    /* Opaque pointer passed to rdata_parse() by the caller. */
    void *user_ctx;
    /* At most one decompressor is active; read_st() dispatches on whichever
     * of these pointers is non-NULL and falls back to raw I/O otherwise. */
#if HAVE_BZIP2
    bz_stream *bz_strm;
#endif
#if HAVE_APPLE_COMPRESSION
    compression_stream *compression_strm;
#endif
#if HAVE_ZLIB
    z_stream *z_strm;
#endif
#if HAVE_LZMA
    lzma_stream *lzma_strm;
#endif
    /* STREAM_BUFFER_SIZE-byte buffer of raw input fed to the decompressor;
     * allocated by the init_*_stream() helpers. */
    void *strm_buffer;
    rdata_io_t *io;
    /* Running total of bytes handed out by read_st(). */
    size_t bytes_read;

    rdata_atom_table_t *atom_table;
    /* NOTE(review): presumably a mask of RDATA_CLASS_* bits -- confirm
     * where it is assigned. */
    unsigned int column_class;

    /* Opened by rdata_parse() when the file's declared encoding is not
     * UTF-8; left NULL otherwise. */
    iconv_t converter;

    int32_t dims[MAX_ARRAY_DIMENSIONS];
    /* Initialised to false by rdata_parse(). */
    bool is_dimnames;
} rdata_ctx_t;
rdata_sexptype_header_t header, + const char *name, + rdata_ctx_t *ctx); +static rdata_error_t read_value_vector_cb( + rdata_sexptype_header_t header, + const char *name, + rdata_column_handler column_handler, + void *user_ctx, + rdata_ctx_t *ctx); +static rdata_error_t read_character_string( + char **key, + rdata_ctx_t *ctx); +static rdata_error_t read_generic_list( + int attributes, + rdata_ctx_t *ctx); +static rdata_error_t read_altrep_vector( + const char *name, + rdata_ctx_t *ctx); +static rdata_error_t read_attributes(int (*handle_attribute)( + char *key, + rdata_sexptype_info_t val_info, + rdata_ctx_t *ctx), + rdata_ctx_t *ctx); +static rdata_error_t recursive_discard( + rdata_sexptype_header_t sexptype_header, + rdata_ctx_t *ctx); + +static void *rdata_malloc(size_t len) { + if (len == 0) + return NULL; + + return malloc(len); +} + +static void *rdata_realloc(void *buf, size_t len) { + if (len == 0) + return NULL; + + return realloc(buf, len); +} + +static int atom_table_add(rdata_atom_table_t *table, char *key) { + table->data = realloc(table->data, sizeof(char *) * (table->count + 1)); + table->data[table->count++] = strdup(key); + return table->count; +} + +static char *atom_table_lookup(rdata_atom_table_t *table, int index) { + if (index <= 0 || index > table->count) { + return NULL; + } + return table->data[(index-1)]; +} + +#if HAVE_BZIP2 +static ssize_t read_st_bzip2(rdata_ctx_t *ctx, void *buffer, size_t len) { + ssize_t bytes_written = 0; + int error = 0; + int result = BZ_OK; + while (1) { + ssize_t start_out = ctx->bz_strm->total_out_lo32 + + ((ssize_t)ctx->bz_strm->total_out_hi32 << 32LL); + + ctx->bz_strm->next_out = (char *)buffer + bytes_written; + ctx->bz_strm->avail_out = len - bytes_written; + + result = BZ2_bzDecompress(ctx->bz_strm); + + if (result != BZ_OK && result != BZ_STREAM_END) { + error = -1; + break; + } + + bytes_written += ctx->bz_strm->total_out_lo32 + + ((ssize_t)ctx->bz_strm->total_out_hi32 << 32LL) - start_out; + + if 
#if HAVE_APPLE_COMPRESSION
/*
 * Decompress up to `len` bytes into `buffer` using Apple's compression
 * stream API, refilling the input buffer from the I/O layer as needed.
 *
 * Returns the number of bytes produced, or a negative value on error (-1
 * for a decoder failure, otherwise the read handler's own error value).
 */
static ssize_t read_st_compression(
    rdata_ctx_t *ctx,
    void *buffer,
    size_t len
) {
    ssize_t bytes_written = 0;
    int error = 0;
    compression_status result = COMPRESSION_STATUS_OK;
    size_t start_size = len;

    ctx->compression_strm->dst_ptr = (unsigned char *)buffer;
    ctx->compression_strm->dst_size = len;

    while (1) {
        start_size = ctx->compression_strm->dst_size;

        result = compression_stream_process(ctx->compression_strm, 0);

        if (result == COMPRESSION_STATUS_OK) {
            bytes_written += start_size - ctx->compression_strm->dst_size;
        } else {
            error = -1;
            break;
        }

        if (ctx->compression_strm->src_size == 0) {
            int bytes_read = 0;
            /* Refill the raw input buffer. The destination must be
             * strm_buffer: reading into the stream struct itself would
             * overrun it and leave src_ptr pointing at stale data. */
            bytes_read = ctx->io->read(
                ctx->strm_buffer,
                STREAM_BUFFER_SIZE,
                ctx->io->io_ctx);
            if (bytes_read < 0) {
                error = bytes_read;
                break;
            }
            if (bytes_read == 0) {
                /* End of input: flush whatever the decoder still holds. */
                start_size = ctx->compression_strm->dst_size;
                result = compression_stream_process(
                    ctx->compression_strm,
                    COMPRESSION_STREAM_FINALIZE);
                if (result == COMPRESSION_STATUS_END) {
                    bytes_written += (
                        start_size - ctx->compression_strm->dst_size);
                } else {
                    error = -1;
                }
                break;
            }

            ctx->compression_strm->src_ptr = ctx->strm_buffer;
            ctx->compression_strm->src_size = bytes_read;
        }
        if (bytes_written == len)
            break;
    }

    if (error != 0)
        return error;

    return bytes_written;
}
#endif /* HAVE_APPLE_COMPRESSION */
#if HAVE_LZMA
/*
 * Decompress up to `len` bytes into `buffer` with liblzma, refilling the
 * input buffer from the I/O layer whenever the decoder runs dry.
 *
 * Returns the number of bytes produced (possibly fewer than `len` at end of
 * stream or end of input), or a negative value on error.
 */
static ssize_t read_st_lzma(rdata_ctx_t *ctx, void *buffer, size_t len) {
    ssize_t bytes_written = 0;
    int error = 0;
    int result = LZMA_OK;
    while (1) {
        /* total_out is a 64-bit counter; keep the snapshot the same width
         * so the delta stays right where long is only 32 bits. */
        uint64_t start_out = ctx->lzma_strm->total_out;

        ctx->lzma_strm->next_out = (unsigned char *)buffer + bytes_written;
        ctx->lzma_strm->avail_out = len - bytes_written;

        result = lzma_code(ctx->lzma_strm, LZMA_RUN);

        if (result != LZMA_OK && result != LZMA_STREAM_END) {
            error = -1;
            break;
        }

        bytes_written += (ssize_t)(ctx->lzma_strm->total_out - start_out);

        if (result == LZMA_STREAM_END)
            break;

        if (ctx->lzma_strm->avail_in == 0) {
            int bytes_read = 0;
            bytes_read = ctx->io->read(
                ctx->strm_buffer,
                STREAM_BUFFER_SIZE,
                ctx->io->io_ctx);
            if (bytes_read < 0) {
                error = bytes_read;
                break;
            }
            if (bytes_read == 0)
                break;

            ctx->lzma_strm->next_in = ctx->strm_buffer;
            ctx->lzma_strm->avail_in = bytes_read;
        }
        if (bytes_written == len)
            break;
    }

    if (error != 0)
        return error;

    return bytes_written;
}
#endif /* HAVE_LZMA */
+static ssize_t read_st(rdata_ctx_t *ctx, void *buffer, size_t len) { + ssize_t bytes_read = 0; + + if (len == 0) + return 0; + +#if HAVE_BZIP2 + if (ctx->bz_strm) { + bytes_read = read_st_bzip2(ctx, buffer, len); + } else // NOLINT +#endif +#if HAVE_APPLE_COMPRESSION + if (ctx->compression_strm) { + bytes_read = read_st_compression(ctx, buffer, len); + } else // NOLINT +#endif +#if HAVE_ZLIB + if (ctx->z_strm) { + bytes_read = read_st_z(ctx, buffer, len); + } else // NOLINT +#endif +#if HAVE_LZMA + if (ctx->lzma_strm) { + bytes_read = read_st_lzma(ctx, buffer, len); + } else // NOLINT +#endif + { + bytes_read = ctx->io->read(buffer, len, ctx->io->io_ctx); + } + + if (bytes_read > 0) { + ctx->bytes_read += bytes_read; + } + + return bytes_read; +} + +static int lseek_st(rdata_ctx_t *ctx, size_t len) { + if (0 +#if HAVE_BZIP2 + || ctx->bz_strm +#endif +#if HAVE_APPLE_COMPRESSION + || ctx->compression_strm +#endif +#if HAVE_ZLIB + || ctx->z_strm +#endif +#if HAVE_LZMA + || ctx->lzma_strm +#endif + ) { + int retval = 0; + char *buf = rdata_malloc(len); + + int result_st = read_st(ctx, buf, len); + + if (result_st > 0) { + if (buf == NULL) { + retval = -1; + } else if ((size_t)result_st != len) { + retval = -1; + } + } else { + if (buf == NULL) { + retval = -1; + } else { + retval = -1; + } + } + + if (buf) + free(buf); + + return retval; + } + + return ctx->io->seek(len, SEEK_CUR, ctx->io->io_ctx); +} + +static rdata_error_t init_bz_stream(rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + ctx->strm_buffer = malloc(STREAM_BUFFER_SIZE); + int bytes_read = ctx->io->read( + ctx->strm_buffer, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read <= 0) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + +#if HAVE_BZIP2 + ctx->bz_strm = calloc(1, sizeof(bz_stream)); + ctx->bz_strm->next_in = ctx->strm_buffer; + ctx->bz_strm->avail_in = bytes_read; + + if (BZ2_bzDecompressInit(ctx->bz_strm, 0, 0) != BZ_OK) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } 
+#else + retval = RDATA_ERROR_UNSUPPORTED_COMPRESSION; + goto cleanup; +#endif + +cleanup: + return retval; +} + +static rdata_error_t init_z_stream(rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + ctx->strm_buffer = malloc(STREAM_BUFFER_SIZE); + int bytes_read = ctx->io->read( + ctx->strm_buffer, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read <= 0) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + +#if HAVE_ZLIB + ctx->z_strm = calloc(1, sizeof(z_stream)); + ctx->z_strm->next_in = ctx->strm_buffer; + ctx->z_strm->avail_in = bytes_read; + + if (inflateInit2(ctx->z_strm, (15+32)) != Z_OK) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } +#else + retval = RDATA_ERROR_UNSUPPORTED_COMPRESSION; + goto cleanup; +#endif + +cleanup: + return retval; +} + +static rdata_error_t init_lzma_stream(rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + ctx->strm_buffer = malloc(STREAM_BUFFER_SIZE); + int bytes_read = ctx->io->read( + ctx->strm_buffer, + STREAM_BUFFER_SIZE, + ctx->io->io_ctx); + if (bytes_read <= 0) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + +#if HAVE_APPLE_COMPRESSION + ctx->compression_strm = calloc(1, sizeof(compression_stream)); + + if (compression_stream_init( + ctx->compression_strm, + COMPRESSION_STREAM_DECODE, + COMPRESSION_LZMA) == COMPRESSION_STATUS_ERROR + ) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + + ctx->compression_strm->src_ptr = ctx->strm_buffer; + ctx->compression_strm->src_size = bytes_read; +#elif HAVE_LZMA + ctx->lzma_strm = calloc(1, sizeof(lzma_stream)); + + if (lzma_stream_decoder(ctx->lzma_strm, UINT64_MAX, 0) != LZMA_OK) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + + ctx->lzma_strm->next_in = ctx->strm_buffer; + ctx->lzma_strm->avail_in = bytes_read; +#else + retval = RDATA_ERROR_UNSUPPORTED_COMPRESSION; + goto cleanup; +#endif + +cleanup: + return retval; +} + +static rdata_error_t init_stream(rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + char header[5]; + + if 
(ctx->io->read( + &header, + sizeof(header), + ctx->io->io_ctx) != sizeof(header) + ) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if (ctx->io->seek(0, SEEK_SET, ctx->io->io_ctx) == -1) { + retval = RDATA_ERROR_SEEK; + goto cleanup; + } + + if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h' && + header[3] >= '0' && header[3] <= '9') { + return init_bz_stream(ctx); + } + if (header[0] == '\x1f' && header[1] == '\x8b') { + return init_z_stream(ctx); + } + if (strncmp("\xFD" "7zXZ", header, sizeof(header)) == 0) { + return init_lzma_stream(ctx); + } + +cleanup: + return retval; +} + +static rdata_error_t reset_stream(rdata_ctx_t *ctx) { +#if HAVE_BZIP2 + if (ctx->bz_strm) { + BZ2_bzDecompressEnd(ctx->bz_strm); + free(ctx->bz_strm); + ctx->bz_strm = NULL; + } +#endif +#if HAVE_APPLE_COMPRESSION + if (ctx->compression_strm) { + compression_stream_destroy(ctx->compression_strm); + free(ctx->compression_strm); + ctx->compression_strm = NULL; + } +#endif +#if HAVE_ZLIB + if (ctx->z_strm) { + inflateEnd(ctx->z_strm); + free(ctx->z_strm); + ctx->z_strm = NULL; + } +#endif +#if HAVE_LZMA + if (ctx->lzma_strm) { + lzma_end(ctx->lzma_strm); + free(ctx->lzma_strm); + ctx->lzma_strm = NULL; + } +#endif + + if (ctx->io->seek(0, SEEK_SET, ctx->io->io_ctx) == -1) { + return RDATA_ERROR_SEEK; + } + return init_stream(ctx); +} + +static rdata_error_t rdata_convert( + char *dst, + size_t dst_len, + const char *src, + size_t src_len, + iconv_t converter +) { + if (dst_len == 0) { + return RDATA_ERROR_CONVERT_LONG_STRING; + } else if (converter) { + size_t dst_left = dst_len - 1; + char *dst_end = dst; + size_t status = iconv(converter, ( + ICONV_CONST char **)&src, + &src_len, + &dst_end, + &dst_left); + if (status == (size_t)-1) { + if (errno == E2BIG) { + return RDATA_ERROR_CONVERT_LONG_STRING; + } else if (errno == EILSEQ) { + return RDATA_ERROR_CONVERT_BAD_STRING; + } else if (errno != EINVAL) { + /* EINVAL indicates improper truncation; accept it */ + return 
RDATA_ERROR_CONVERT; + } + } + dst[dst_len - dst_left - 1] = '\0'; + } else if (src_len + 1 > dst_len) { + return RDATA_ERROR_CONVERT_LONG_STRING; + } else { + memcpy(dst, src, src_len); + dst[src_len] = '\0'; + } + return RDATA_OK; +} + +rdata_ctx_t *rdata_ctx_init(rdata_io_t *io, const char *filename) { + int fd = io->open(filename, io->io_ctx); + if (fd == -1) { + return NULL; + } + rdata_ctx_t *ctx = calloc(1, sizeof(rdata_ctx_t)); + rdata_atom_table_t *atom_table = malloc(sizeof(rdata_atom_table_t)); + + atom_table->count = 0; + atom_table->data = NULL; + + ctx->atom_table = atom_table; + + ctx->machine_needs_byteswap = 0; + if (machine_is_little_endian()) { + ctx->machine_needs_byteswap = 1; + } + + ctx->io = io; + + return ctx; +} + +void free_rdata_ctx(rdata_ctx_t *ctx) { + if (ctx->io) { + ctx->io->close(ctx->io->io_ctx); + } + if (ctx->atom_table) { + if (ctx->atom_table->data) { + int i; + for (i=0; i < ctx->atom_table->count; i++) + free(ctx->atom_table->data[i]); + free(ctx->atom_table->data); + } + free(ctx->atom_table); + } +#if HAVE_BZIP2 + if (ctx->bz_strm) { + BZ2_bzDecompressEnd(ctx->bz_strm); + free(ctx->bz_strm); + } +#endif +#if HAVE_APPLE_COMPRESSION + if (ctx->compression_strm) { + compression_stream_destroy(ctx->compression_strm); + free(ctx->compression_strm); + } +#endif +#if HAVE_ZLIB + if (ctx->z_strm) { + inflateEnd(ctx->z_strm); + free(ctx->z_strm); + } +#endif +#if HAVE_LZMA + if (ctx->lzma_strm) { + lzma_end(ctx->lzma_strm); + free(ctx->lzma_strm); + } +#endif + if (ctx->strm_buffer) { + free(ctx->strm_buffer); + } + if (ctx->converter) { + iconv_close(ctx->converter); + } + free(ctx); +} + +rdata_error_t rdata_parse( + rdata_parser_t *parser, + const char *filename, + void *user_ctx +) { + int is_rdata = 0; + rdata_error_t retval = RDATA_OK; + rdata_v2_header_t v2_header; + rdata_ctx_t *ctx = rdata_ctx_init(parser->io, filename); + char *encoding = NULL; + + if (ctx == NULL) { + retval = RDATA_ERROR_OPEN; + goto cleanup; + } + + 
ctx->user_ctx = user_ctx; + ctx->table_handler = parser->table_handler; + ctx->column_handler = parser->column_handler; + ctx->column_name_handler = parser->column_name_handler; + ctx->row_name_handler = parser->row_name_handler; + ctx->text_value_handler = parser->text_value_handler; + ctx->value_label_handler = parser->value_label_handler; + ctx->dim_handler = parser->dim_handler; + ctx->dim_name_handler = parser->dim_name_handler; + ctx->error_handler = parser->error_handler; + + ctx->is_dimnames = false; + + if ((retval = init_stream(ctx)) != RDATA_OK) { + goto cleanup; + } + + char header_line[5]; + if (read_st( + ctx, &header_line, + sizeof(header_line)) != sizeof(header_line) + ) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (memcmp("RDX", header_line, 3) == 0 && header_line[4] == '\n') { + is_rdata = 1; + } else { + reset_stream(ctx); + } + + if (read_st(ctx, &v2_header, sizeof(v2_header)) != sizeof(v2_header)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if (ctx->machine_needs_byteswap) { + v2_header.format_version = byteswap4(v2_header.format_version); + v2_header.writer_version = byteswap4(v2_header.writer_version); + v2_header.reader_version = byteswap4(v2_header.reader_version); + } + + int32_t hdr_result = header_line[3] - '0'; + + if (hdr_result > 0) { + if (is_rdata && v2_header.format_version != (uint32_t)hdr_result) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + } else { + if (is_rdata) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + } + + if (v2_header.format_version == 3) { + retval = read_character_string(&encoding, ctx); + if (retval != RDATA_OK) + goto cleanup; + + if (strcmp("UTF-8", encoding) != 0) { + if ((ctx->converter = iconv_open("UTF-8", encoding)) + == (iconv_t)-1 + ) { + ctx->converter = NULL; + retval = RDATA_ERROR_UNSUPPORTED_CHARSET; + goto cleanup; + } + } + } + + if (is_rdata) { + retval = read_environment(NULL, ctx); + } else { + retval = read_toplevel_object(NULL, NULL, ctx); + } + if (retval 
!= RDATA_OK) + goto cleanup; + + char test; + + if (read_st(ctx, &test, 1) == 1) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + +cleanup: + if (encoding) + free(encoding); + if (ctx) { + free_rdata_ctx(ctx); + } + + return retval; +} + + +static rdata_error_t read_toplevel_object( + const char *table_name, + const char *key, + rdata_ctx_t *ctx +) { + rdata_sexptype_info_t sexptype_info; + rdata_error_t retval = RDATA_OK; + + sexptype_info.attributes = 0; + sexptype_info.tag = 0; + sexptype_info.ref = 0; + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + if (sexptype_info.header.type == RDATA_SEXPTYPE_REAL_VECTOR || + sexptype_info.header.type == RDATA_SEXPTYPE_INTEGER_VECTOR || + sexptype_info.header.type == RDATA_SEXPTYPE_LOGICAL_VECTOR) { + if (table_name == NULL && ctx->table_handler) { + if (ctx->table_handler(key, ctx->user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + + if ((retval = read_value_vector( + sexptype_info.header, + key, + ctx)) != RDATA_OK + ) + goto cleanup; + } else if (sexptype_info.header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR) { + if (table_name == NULL && ctx->table_handler) { + if (ctx->table_handler(key, ctx->user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + int32_t length; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + if (ctx->column_handler) { + if (ctx->column_handler( + key, + RDATA_TYPE_STRING, NULL, + length, ctx->user_ctx) + ) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + + if ((retval = read_string_vector_n( + sexptype_info.header.attributes, + length, + ctx->text_value_handler, + ctx->user_ctx, ctx)) != RDATA_OK) + goto cleanup; + } else if (sexptype_info.header.type == RDATA_PSEUDO_SXP_ALTREP) { + if (table_name == NULL && ctx->table_handler) { + if (ctx->table_handler(key, ctx->user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + if ((retval = 
read_altrep_vector(key, ctx)) != RDATA_OK) + goto cleanup; + } else if (sexptype_info.header.type == RDATA_SEXPTYPE_GENERIC_VECTOR && + sexptype_info.header.object && sexptype_info.header.attributes) { + if (table_name != NULL) { + retval = recursive_discard(sexptype_info.header, ctx); + } else { + if (ctx->table_handler) { + if (ctx->table_handler(key, ctx->user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + retval = read_generic_list(sexptype_info.header.attributes, ctx); + } + if (retval != RDATA_OK) + goto cleanup; + } else { + if ((retval = recursive_discard(sexptype_info.header, ctx)) + != RDATA_OK + ) + goto cleanup; + } + +cleanup: + + return retval; +} + +static rdata_error_t read_environment( + const char *table_name, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + char *key = NULL; + + while (1) { + rdata_sexptype_info_t sexptype_info; + + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + if (sexptype_info.header.type == RDATA_PSEUDO_SXP_NIL) + break; + + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + if ((retval = recursive_discard( + sexptype_info.header, + ctx)) != RDATA_OK) + goto cleanup; + continue; + } + + if ((key = atom_table_lookup( + ctx->atom_table, + sexptype_info.ref)) == NULL) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + if ((retval = read_toplevel_object(table_name, key, ctx)) != RDATA_OK) + goto cleanup; + } + +cleanup: + + return retval; +} + +static rdata_error_t read_sexptype_header( + rdata_sexptype_info_t *header_info, + rdata_ctx_t *ctx +) { + uint32_t sexptype; + rdata_sexptype_header_t header; + rdata_error_t retval = RDATA_OK; + if (read_st(ctx, &sexptype, sizeof(sexptype)) != sizeof(sexptype)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) + sexptype = byteswap4(sexptype); + + memcpy(&header, &sexptype, sizeof(sexptype)); + uint32_t attributes = 0, tag = 0, ref = 0; + + if (header.type == 
RDATA_SEXPTYPE_PAIRLIST_ATTR) { + header.attributes = 1; + header.type = RDATA_SEXPTYPE_PAIRLIST; + } + if (header.type == RDATA_SEXPTYPE_LANGUAGE_OBJECT_ATTR) { + header.attributes = 1; + header.type = RDATA_SEXPTYPE_LANGUAGE_OBJECT; + } + if (header.type == RDATA_SEXPTYPE_PAIRLIST) { + if (header.attributes) { + if (read_st( + ctx, + &attributes, + sizeof(attributes)) != sizeof(attributes) + ) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) { + attributes = byteswap4(attributes); /* swap the word just read */ + } + } + if (header.tag) { + if (read_st(ctx, &tag, sizeof(tag)) != sizeof(tag)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) + tag = byteswap4(tag); + } + + if (tag == 1) { + rdata_sexptype_info_t key_info; + + if ((retval = read_sexptype_header(&key_info, ctx)) != RDATA_OK) + goto cleanup; + + if (key_info.header.type != RDATA_SEXPTYPE_CHARACTER_STRING) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + char *key = NULL; + if ((retval = read_character_string(&key, ctx)) != RDATA_OK) + goto cleanup; + + ref = atom_table_add(ctx->atom_table, key); + + free(key); + } else if ((tag & 0xFF) == RDATA_PSEUDO_SXP_REF) { + ref = (tag >> 8); + } + } + if (header.type == RDATA_PSEUDO_SXP_REF) { + ref = (sexptype >> 8); + } + + header_info->header = header; + header_info->attributes = attributes; + header_info->tag = tag; + header_info->ref = ref; + +cleanup: + + return retval; +} + +static int handle_class_name(const char *buf, int i, void *ctx) { + unsigned int *column_class = (unsigned int *)ctx; + if (buf) { + if (strcmp(buf, "POSIXct") == 0) { + *column_class |= RDATA_CLASS_POSIXCT; + } + if (strcmp(buf, "Date") == 0) { + *column_class |= RDATA_CLASS_DATE; + } + } + return RDATA_OK; +} + +static int handle_vector_attribute( + char *key, + rdata_sexptype_info_t val_info, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + if (strcmp(key, "levels") == 0) { + retval = 
read_string_vector( + val_info.header.attributes, + ctx->value_label_handler, + ctx->user_ctx, ctx); + } else if (strcmp(key, "class") == 0) { + ctx->column_class = 0; + retval = read_string_vector( + val_info.header.attributes, + &handle_class_name, + &ctx->column_class, ctx); + } else if (strcmp(key, "dim") == 0) { + if (val_info.header.type == RDATA_SEXPTYPE_INTEGER_VECTOR) { + int32_t length; + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + if ((uint32_t)length <= sizeof(ctx->dims)/sizeof(ctx->dims[0])) { + int buf_len = length * sizeof(int32_t); + if (read_st(ctx, ctx->dims, buf_len) != buf_len) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) { + int i; + for (i=0; i < length; i++) { + ctx->dims[i] = byteswap4(ctx->dims[i]); + } + } + if (ctx->dim_handler) { + if (ctx->dim_handler( + key, + RDATA_TYPE_INT32, + ctx->dims, length, + ctx->user_ctx) + ) { + retval = RDATA_ERROR_USER_ABORT; + } + } + } + } + } else if (strcmp(key, "dimnames") == 0) { + ctx->is_dimnames = true; + retval = read_generic_list(val_info.header.attributes, ctx); + } else { + retval = recursive_discard(val_info.header, ctx); + } +cleanup: + return retval; +} + +static rdata_error_t read_character_string(char **key, rdata_ctx_t *ctx) { + uint32_t length; + char *string = NULL; + char *utf8_string = NULL; + rdata_error_t retval = RDATA_OK; + + if (read_st(ctx, &length, sizeof(length)) != sizeof(length)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if (ctx->machine_needs_byteswap) + length = byteswap4(length); + + if ((int32_t)length == -1 || length == 0) { + *key = strdup(""); + return RDATA_OK; + } + + if ((int32_t)length < 0) { /* length is unsigned; test the sign bit */ + return RDATA_ERROR_PARSE; + } + + if ((string = rdata_malloc(length)) == NULL) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + + if (read_st(ctx, string, length) != length) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if ((utf8_string = rdata_malloc(4*length+1)) == NULL) { + retval 
= RDATA_ERROR_MALLOC; + goto cleanup; + } + + retval = rdata_convert( + utf8_string, + 4 * length + 1, + string, length, + ctx->converter); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + if (string) + free(string); + + if (retval == RDATA_OK) { + *key = utf8_string; + } else if (utf8_string) { + free(utf8_string); + } + + return retval; +} + +static int handle_data_frame_attribute( + char *key, + rdata_sexptype_info_t val_info, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + + if (strcmp(key, "names") == 0 && + val_info.header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR + ) { + retval = read_string_vector( + val_info.header.attributes, + ctx->column_name_handler, ctx->user_ctx, ctx); + } else if (strcmp(key, "row.names") == 0 && + val_info.header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR + ) { + retval = read_string_vector( + val_info.header.attributes, + ctx->row_name_handler, + ctx->user_ctx, ctx); + } else if (strcmp(key, "label.table") == 0) { + retval = recursive_discard(val_info.header, ctx); + } else { + retval = recursive_discard(val_info.header, ctx); + } + + return retval; +} + +static rdata_error_t read_attributes(int (*handle_attribute)( + char *key, + rdata_sexptype_info_t val_info, + rdata_ctx_t *ctx), + rdata_ctx_t *ctx +) { + rdata_sexptype_info_t pairlist_info, val_info; + rdata_error_t retval = RDATA_OK; + char *key = NULL; + + retval = read_sexptype_header(&pairlist_info, ctx); + if (retval != RDATA_OK) + goto cleanup; + + while (pairlist_info.header.type == RDATA_SEXPTYPE_PAIRLIST) { + /* value */ + if ((retval = read_sexptype_header(&val_info, ctx)) != RDATA_OK) + goto cleanup; + + if (handle_attribute) { + if ((key = atom_table_lookup( + ctx->atom_table, pairlist_info.ref)) == NULL) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + if ((retval = handle_attribute(key, val_info, ctx)) != RDATA_OK) + goto cleanup; + } else { + if ((retval = recursive_discard( + val_info.header, + ctx)) != RDATA_OK + ) + goto cleanup; + } + + 
/* next */ + if ((retval = read_sexptype_header(&pairlist_info, ctx)) != RDATA_OK) + goto cleanup; + } + +cleanup: + return retval; +} + +static rdata_error_t read_wrap_real(const char *name, rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t sexptype_info; + /* pairlist */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + /* representation */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + if ((retval = read_value_vector( + sexptype_info.header, + name, + ctx)) != RDATA_OK + ) + goto cleanup; + + /* alt representation */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if ((retval = recursive_discard(sexptype_info.header, ctx)) != RDATA_OK) + goto cleanup; + + /* nil */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_PSEUDO_SXP_NIL) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + +cleanup: + return retval; +} + +static rdata_error_t read_compact_intseq( + const char *name, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t sexptype_info; + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + int32_t length; + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + if (length != 3) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + double vals[3]; + if (read_st(ctx, vals, sizeof(vals)) != sizeof(vals)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + if (ctx->machine_needs_byteswap) { + vals[0] = byteswap_double(vals[0]); + vals[1] = byteswap_double(vals[1]); + vals[2] = byteswap_double(vals[2]); + } + + if (sexptype_info.header.attributes) { + if ((retval = read_attributes( + &handle_vector_attribute, ctx)) != RDATA_OK + ) + 
goto cleanup; + } + + if (ctx->column_handler) { + int32_t *integers = rdata_malloc(vals[0] * sizeof(int32_t)); + int32_t val = vals[1]; + for (int i=0; i < vals[0]; i++) { + integers[i] = val; + val += vals[2]; + } + int cb_retval = ctx->column_handler( + name, + RDATA_TYPE_INT32, + integers, + vals[0], ctx->user_ctx); + free(integers); + if (cb_retval) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + + /* nil */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_PSEUDO_SXP_NIL) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } +cleanup: + return retval; +} + +static int deferred_string_handler( + const char *name, + enum rdata_type_e type, + void *vals, + long length, + void *user_ctx +) { + rdata_ctx_t *ctx = (rdata_ctx_t *)user_ctx; + if (ctx->column_handler) + ctx->column_handler( + name, + RDATA_TYPE_STRING, + NULL, + length, + ctx->user_ctx); + if (ctx->text_value_handler) { + for (int i=0; i < length; i++) { + char buf[128] = { 0 }; + if (type == RDATA_TYPE_REAL) { + snprintf(buf, sizeof(buf), "%.0lf", ((double *)vals)[i]); + } else if (type == RDATA_TYPE_INT32) { + snprintf(buf, sizeof(buf), "%d", ((int32_t *)vals)[i]); + } + ctx->text_value_handler(buf, i, ctx->user_ctx); + } + } + return 0; +} + +static rdata_error_t read_deferred_string( + const char *name, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t sexptype_info; + /* pairlist */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + /* representation */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + + if ((retval = read_value_vector_cb( + sexptype_info.header, + name, + &deferred_string_handler, + ctx, + ctx)) != RDATA_OK + ) + goto cleanup; + + /* alt representation */ + if ((retval 
= read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if ((retval = recursive_discard(sexptype_info.header, ctx)) != RDATA_OK) + goto cleanup; + + /* nil */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_PSEUDO_SXP_NIL) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + +cleanup: + return retval; +} + +static rdata_error_t read_altrep_vector( + const char *name, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t sexptype_info; + /* pairlist */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + /* class name */ + char *class = NULL; + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type == RDATA_SEXPTYPE_SYMBOL) { + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_CHARACTER_STRING) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + if ((retval = read_character_string(&class, ctx)) != RDATA_OK) + goto cleanup; + + atom_table_add(ctx->atom_table, class); + } else if (sexptype_info.header.type == RDATA_PSEUDO_SXP_REF) { + if ((class = atom_table_lookup( + ctx->atom_table, + sexptype_info.ref)) == NULL + ) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + } else { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + /* package and class ID */ + if ((retval = read_sexptype_header(&sexptype_info, ctx)) != RDATA_OK) + goto cleanup; + if (sexptype_info.header.type != RDATA_SEXPTYPE_PAIRLIST) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + if ((retval = recursive_discard(sexptype_info.header, ctx)) != RDATA_OK) + goto cleanup; + + if (strcmp(class, "wrap_real") == 0) { + if ((retval = read_wrap_real(name, ctx)) != RDATA_OK) + 
goto cleanup; + } else if (strcmp(class, "compact_intseq") == 0) { + if ((retval = read_compact_intseq(name, ctx)) != RDATA_OK) + goto cleanup; + } else if (strcmp(class, "deferred_string") == 0) { + if ((retval = read_deferred_string(name, ctx)) != RDATA_OK) + goto cleanup; + } else { + if (ctx->error_handler) { + char error_buf[1024]; + snprintf( + error_buf, + sizeof(error_buf), + "Unrecognized ALTREP class: %s\n", + class); + ctx->error_handler(error_buf, ctx->user_ctx); + } + retval = RDATA_ERROR_UNSUPPORTED_STORAGE_CLASS; + } +cleanup: + return retval; +} + +static rdata_error_t read_generic_list(int attributes, rdata_ctx_t *ctx) { + rdata_error_t retval = RDATA_OK; + int32_t length; + unsigned int i; + rdata_sexptype_info_t sexptype_info; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + for (i=0; i < (uint32_t)length; i++) { + if ((retval = read_sexptype_header( + &sexptype_info, ctx)) != RDATA_OK + ) + goto cleanup; + + if (sexptype_info.header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR) { + int32_t vec_length; + + if ((retval = read_length(&vec_length, ctx)) != RDATA_OK) + goto cleanup; + if (ctx->is_dimnames) { + retval = read_string_vector_n( + sexptype_info.header.attributes, + vec_length, + ctx->dim_name_handler, + ctx->user_ctx, ctx); + } else { + if (ctx->column_handler) { + if (ctx->column_handler( + NULL, + RDATA_TYPE_STRING, + NULL, + vec_length, + ctx->user_ctx) + ) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + retval = read_string_vector_n( + sexptype_info.header.attributes, + vec_length, + ctx->text_value_handler, + ctx->user_ctx, ctx); + } + } else if (sexptype_info.header.type == RDATA_PSEUDO_SXP_ALTREP) { + retval = read_altrep_vector(NULL, ctx); + } else if (sexptype_info.header.type == RDATA_PSEUDO_SXP_NIL) { + if (ctx->is_dimnames && + ctx->dim_name_handler && + i < sizeof(ctx->dims)/sizeof(ctx->dims[0]) + ) { + int j; + for (j=0; j < ctx->dims[i]; j++) { + ctx->dim_name_handler(NULL, j, 
ctx->user_ctx); + } + } + } else { + retval = read_value_vector(sexptype_info.header, NULL, ctx); + } + if (retval != RDATA_OK) + goto cleanup; + } + + if (attributes) { + if ((retval = read_attributes( + &handle_data_frame_attribute, + ctx)) != RDATA_OK + ) + goto cleanup; + } + +cleanup: + + if (ctx->is_dimnames) + ctx->is_dimnames = false; + + return retval; +} + +static rdata_error_t read_length(int32_t *outLength, rdata_ctx_t *ctx) { + int32_t length; + rdata_error_t retval = RDATA_OK; + + if (read_st(ctx, &length, sizeof(length)) != sizeof(length)) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if (ctx->machine_needs_byteswap) + length = byteswap4(length); + + if (outLength) + *outLength = length; + +cleanup: + + return retval; +} + +static rdata_error_t read_string_vector_n( + int attributes, + int32_t length, + rdata_text_value_handler text_value_handler, + void *callback_ctx, + rdata_ctx_t *ctx +) { + int32_t string_length; + rdata_error_t retval = RDATA_OK; + rdata_sexptype_info_t info; + size_t buffer_size = 4096; + char *buffer = NULL; + size_t utf8_buffer_size = 16384; + char *utf8_buffer = NULL; + int i; + + buffer = rdata_malloc(buffer_size); + if (ctx->converter) + utf8_buffer = rdata_malloc(utf8_buffer_size); + + for (i=0; i < length; i++) { + if ((retval = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if (info.header.type != RDATA_SEXPTYPE_CHARACTER_STRING) { + retval = RDATA_ERROR_PARSE; + goto cleanup; + } + + if ((retval = read_length(&string_length, ctx)) != RDATA_OK) + goto cleanup; + + int32_t str_len_calc = string_length + 1; + if (str_len_calc > 0) { + if ((uint32_t)str_len_calc > buffer_size) { + buffer_size = str_len_calc; + if ((buffer = rdata_realloc(buffer, buffer_size)) == NULL) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + } + } + + if (string_length >= 0) { + if (read_st(ctx, buffer, string_length) != string_length) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + buffer[string_length] = 
'\0'; + } + + if (text_value_handler) { + int cb_retval = 0; + if (string_length < 0) { + cb_retval = text_value_handler(NULL, i, callback_ctx); + } else if (!ctx->converter) { + cb_retval = text_value_handler(buffer, i, callback_ctx); + } else { + int32_t str_len_calc = 4*string_length + 1; + if (str_len_calc >= 0) { + if ((uint32_t)str_len_calc > utf8_buffer_size) { + utf8_buffer_size = str_len_calc; + if ((utf8_buffer = rdata_realloc( + utf8_buffer, utf8_buffer_size)) == NULL + ) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + } + } + + retval = rdata_convert( + utf8_buffer, + utf8_buffer_size, + buffer, string_length, + ctx->converter); + if (retval != RDATA_OK) + goto cleanup; + + cb_retval = text_value_handler(utf8_buffer, i, callback_ctx); + } + if (cb_retval) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + } + + if (attributes) { + if ((retval = read_attributes( + &handle_vector_attribute, + ctx)) != RDATA_OK) + goto cleanup; + } + +cleanup: + + if (buffer) + free(buffer); + if (utf8_buffer) + free(utf8_buffer); + + return retval; +} + +static rdata_error_t read_string_vector( + int attributes, + rdata_text_value_handler text_value_handler, + void *callback_ctx, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + int32_t length; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + return retval; + + return read_string_vector_n( + attributes, + length, + text_value_handler, + callback_ctx, + ctx); +} + +static rdata_error_t read_value_vector_cb( + rdata_sexptype_header_t header, + const char *name, + rdata_column_handler column_handler, + void *user_ctx, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + int32_t length; + size_t input_elem_size = 0; + void *vals = NULL; + size_t buf_len = 0; + enum rdata_type_e output_data_type; + unsigned int i; + + switch (header.type) { + case RDATA_SEXPTYPE_REAL_VECTOR: + input_elem_size = sizeof(double); + output_data_type = RDATA_TYPE_REAL; + break; + case 
RDATA_SEXPTYPE_INTEGER_VECTOR: + input_elem_size = sizeof(int32_t); + output_data_type = RDATA_TYPE_INT32; + break; + case RDATA_SEXPTYPE_LOGICAL_VECTOR: + input_elem_size = sizeof(int32_t); + output_data_type = RDATA_TYPE_LOGICAL; + break; + default: + retval = RDATA_ERROR_PARSE; + break; + } + if (retval != RDATA_OK) + goto cleanup; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + buf_len = length * input_elem_size; + + if (buf_len) { + vals = rdata_malloc(buf_len); + if (vals == NULL) { + retval = RDATA_ERROR_MALLOC; + goto cleanup; + } + + ssize_t result_st = read_st(ctx, vals, buf_len); + + if (result_st > 0) { + if ((size_t)result_st != buf_len) { + retval = RDATA_ERROR_READ; + goto cleanup; + } + } else { + retval = RDATA_ERROR_READ; + goto cleanup; + } + + if (ctx->machine_needs_byteswap) { + if (input_elem_size == sizeof(double)) { + double *d_vals = (double *)vals; + for (i=0; i < buf_len/sizeof(double); i++) { + d_vals[i] = byteswap_double(d_vals[i]); + } + } else { + uint32_t *i_vals = (uint32_t *)vals; + for (i=0; i < buf_len/sizeof(uint32_t); i++) { + i_vals[i] = byteswap4(i_vals[i]); + } + } + } + } + + ctx->column_class = 0; + if (header.attributes) { + if ((retval = read_attributes( + &handle_vector_attribute, + ctx)) != RDATA_OK) + goto cleanup; + } + if (ctx->column_class == RDATA_CLASS_POSIXCT) + output_data_type = RDATA_TYPE_TIMESTAMP; + if (ctx->column_class == RDATA_CLASS_DATE) + output_data_type = RDATA_TYPE_DATE; + + if (column_handler) { + if (column_handler(name, output_data_type, vals, length, user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + +cleanup: + if (vals) + free(vals); + + return retval; +} + +static rdata_error_t read_value_vector( + rdata_sexptype_header_t header, + const char *name, + rdata_ctx_t *ctx +) { + return read_value_vector_cb( + header, + name, + ctx->column_handler, + ctx->user_ctx, ctx); +} + +static rdata_error_t discard_vector( + rdata_sexptype_header_t 
sexptype_header, + size_t element_size, + rdata_ctx_t *ctx +) { + int32_t length; + rdata_error_t retval = RDATA_OK; + + if ((retval = read_length(&length, ctx)) != RDATA_OK) + goto cleanup; + + if (length > 0) { + if (lseek_st(ctx, length * element_size) == -1) { + return RDATA_ERROR_SEEK; + } + } else if (ctx->error_handler) { + char error_buf[1024]; + snprintf( + error_buf, + sizeof(error_buf), + "Vector with non-positive length: %d\n", + length); + ctx->error_handler(error_buf, ctx->user_ctx); + } + + if (sexptype_header.attributes) { + rdata_sexptype_info_t temp_info; + if ((retval = read_sexptype_header(&temp_info, ctx)) != RDATA_OK) + goto cleanup; + + retval = recursive_discard(temp_info.header, ctx); + } + +cleanup: + + return retval; +} + +static rdata_error_t discard_character_string( + int add_to_table, + rdata_ctx_t *ctx +) { + rdata_error_t retval = RDATA_OK; + char *key = NULL; + + if ((retval = read_character_string(&key, ctx)) != RDATA_OK) + goto cleanup; + + if (strlen(key) > 0 && add_to_table) { + atom_table_add(ctx->atom_table, key); + } + + free(key); + +cleanup: + + return retval; +} + +static rdata_error_t discard_pairlist( + rdata_sexptype_header_t sexptype_header, + rdata_ctx_t *ctx +) { + rdata_sexptype_info_t temp_info; + rdata_error_t error = 0; + while (1) { + switch (sexptype_header.type) { + case RDATA_SEXPTYPE_PAIRLIST: + /* value */ + if ((error = read_sexptype_header( + &temp_info, + ctx)) != RDATA_OK) + return error; + if ((error = recursive_discard( + temp_info.header, + ctx)) != RDATA_OK) + return error; + + /* tail */ + if ((error = read_sexptype_header( + &temp_info, + ctx)) != RDATA_OK) + return error; + sexptype_header = temp_info.header; + break; + case RDATA_PSEUDO_SXP_NIL: + goto done; + default: + return RDATA_ERROR_PARSE; + } + } +done: + + return 0; +} + +static rdata_error_t recursive_discard( + rdata_sexptype_header_t sexptype_header, + rdata_ctx_t *ctx +) { + uint32_t length; + rdata_sexptype_info_t info; + 
rdata_sexptype_info_t prot, tag; + + rdata_error_t error = 0; + unsigned int i; + + switch (sexptype_header.type) { + case RDATA_SEXPTYPE_SYMBOL: + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + break; + case RDATA_PSEUDO_SXP_PERSIST: + case RDATA_PSEUDO_SXP_NAMESPACE: + case RDATA_PSEUDO_SXP_PACKAGE: + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + break; + case RDATA_SEXPTYPE_BUILTIN_FUNCTION: + case RDATA_SEXPTYPE_SPECIAL_FUNCTION: + error = discard_character_string(0, ctx); + break; + case RDATA_SEXPTYPE_PAIRLIST: + error = discard_pairlist(sexptype_header, ctx); + break; + case RDATA_SEXPTYPE_CHARACTER_STRING: + error = discard_character_string(1, ctx); + break; + case RDATA_SEXPTYPE_RAW_VECTOR: + error = discard_vector(sexptype_header, 1, ctx); + break; + case RDATA_SEXPTYPE_LOGICAL_VECTOR: + error = discard_vector(sexptype_header, 4, ctx); + break; + case RDATA_SEXPTYPE_INTEGER_VECTOR: + error = discard_vector(sexptype_header, 4, ctx); + break; + case RDATA_SEXPTYPE_REAL_VECTOR: + error = discard_vector(sexptype_header, 8, ctx); + break; + case RDATA_SEXPTYPE_COMPLEX_VECTOR: + error = discard_vector(sexptype_header, 16, ctx); + break; + case RDATA_SEXPTYPE_CHARACTER_VECTOR: + case RDATA_SEXPTYPE_GENERIC_VECTOR: + case RDATA_SEXPTYPE_EXPRESSION_VECTOR: + if (read_st(ctx, &length, sizeof(length)) != sizeof(length)) { + return RDATA_ERROR_READ; + } + if (ctx->machine_needs_byteswap) + length = byteswap4(length); + + for (i=0; i < length; i++) { + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if (sexptype_header.type == RDATA_SEXPTYPE_CHARACTER_VECTOR) { + if (info.header.type != RDATA_SEXPTYPE_CHARACTER_STRING) { + error = RDATA_ERROR_PARSE; + goto cleanup; + } + + if ((error = 
discard_character_string(0, ctx)) != RDATA_OK) + goto cleanup; + } else if ((error = recursive_discard( + info.header, + ctx)) != RDATA_OK) { + goto cleanup; + } + } + if (sexptype_header.attributes) { + if ((error = read_attributes(NULL, ctx)) != RDATA_OK) + goto cleanup; + } + break; + case RDATA_SEXPTYPE_DOT_DOT_DOT: + case RDATA_SEXPTYPE_PROMISE: + case RDATA_SEXPTYPE_LANGUAGE_OBJECT: + case RDATA_SEXPTYPE_CLOSURE: + if (sexptype_header.attributes) { + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + } + if (sexptype_header.tag) { + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + } + /* CAR */ + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + + /* CDR */ + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + break; + case RDATA_SEXPTYPE_EXTERNAL_POINTER: + read_sexptype_header(&prot, ctx); + recursive_discard(prot.header, ctx); + + read_sexptype_header(&tag, ctx); + recursive_discard(tag.header, ctx); + break; + case RDATA_SEXPTYPE_ENVIRONMENT: + /* locked */ + if (lseek_st(ctx, sizeof(uint32_t)) == -1) { + return RDATA_ERROR_SEEK; + } + + rdata_sexptype_info_t enclosure, frame, hash_table, attributes; + read_sexptype_header(&enclosure, ctx); + recursive_discard(enclosure.header, ctx); + + read_sexptype_header(&frame, ctx); + recursive_discard(frame.header, ctx); + + read_sexptype_header(&hash_table, ctx); + recursive_discard(hash_table.header, ctx); + + read_sexptype_header(&attributes, ctx); + recursive_discard(attributes.header, ctx); + /* + if (sexptype_header.attributes) { + if (lseek(ctx->fd, sizeof(uint32_t), 
SEEK_CUR) == -1) { + return RDATA_ERROR_SEEK; + } + } */ + break; + case RDATA_PSEUDO_SXP_REF: + case RDATA_PSEUDO_SXP_NIL: + case RDATA_PSEUDO_SXP_GLOBAL_ENVIRONMENT: + case RDATA_PSEUDO_SXP_UNBOUND_VALUE: + case RDATA_PSEUDO_SXP_MISSING_ARGUMENT: + case RDATA_PSEUDO_SXP_BASE_NAMESPACE: + case RDATA_PSEUDO_SXP_EMPTY_ENVIRONMENT: + case RDATA_PSEUDO_SXP_BASE_ENVIRONMENT: + break; + case RDATA_PSEUDO_SXP_ALTREP: + /* class, package, type */ + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + + while (1) { + if ((error = read_sexptype_header(&info, ctx)) != RDATA_OK) + goto cleanup; + if (info.header.type == RDATA_SEXPTYPE_PAIRLIST) + continue; + if (info.header.type == RDATA_PSEUDO_SXP_NIL) + break; + if ((error = recursive_discard(info.header, ctx)) != RDATA_OK) + goto cleanup; + } + break; + default: + if (ctx->error_handler) { + char error_buf[1024]; + snprintf( + error_buf, + sizeof(error_buf), + "Unhandled S-Expression: %d", + sexptype_header.type); + ctx->error_handler(error_buf, ctx->user_ctx); + } + return RDATA_ERROR_UNSUPPORTED_S_EXPRESSION; + } +cleanup: + + return error; +} diff --git a/pandas/_libs/src/librdata/rdata_write.c b/pandas/_libs/src/librdata/rdata_write.c new file mode 100644 index 0000000000000..0383dd85f4ace --- /dev/null +++ b/pandas/_libs/src/librdata/rdata_write.c @@ -0,0 +1,704 @@ +/* +Copyright (c) 2020 Evan Miller +*/ + +#include +#include +#include + +#include "CKHashTable.h" +#include "rdata.h" +#include "rdata_internal.h" + +#define R_TAG 0x01 +#define R_OBJECT 0x02 +#define R_ATTRIBUTES 0x04 + +#define INITIAL_COLUMNS_CAPACITY 100 + +#ifdef _WIN32 +#define timegm _mkgmtime +#endif + +rdata_writer_t *rdata_writer_init( + rdata_data_writer write_callback, + rdata_file_format_t format +) { + rdata_writer_t *writer = calloc(1, sizeof(rdata_writer_t)); + writer->file_format = format; + writer->bswap = machine_is_little_endian(); 
+ writer->atom_table = ck_hash_table_init(100, 24); + writer->data_writer = write_callback; + + writer->columns_capacity = INITIAL_COLUMNS_CAPACITY; + writer->columns = malloc( + writer->columns_capacity * sizeof(rdata_column_t *)); + + return writer; +} + +void rdata_writer_free(rdata_writer_t *writer) { + ck_hash_table_free(writer->atom_table); + int i, j; + for (i=0; i < writer->columns_count; i++) { + rdata_column_t *column = writer->columns[i]; + for (j=0; j < column->factor_count; j++) { + free(column->factor[j]); + } + free(column->factor); + free(column); + } + free(writer->columns); + free(writer); +} + +rdata_column_t *rdata_add_column( + rdata_writer_t *writer, + const char *name, + rdata_type_t type +) { + if (writer->columns_count == writer->columns_capacity) { + writer->columns_capacity *= 2; + writer->columns = realloc(writer->columns, + writer->columns_capacity * sizeof(rdata_column_t *)); + } + rdata_column_t *new_column = calloc(1, sizeof(rdata_column_t)); + + new_column->index = writer->columns_count++; + + writer->columns[new_column->index] = new_column; + + new_column->type = type; + + if (name) { + snprintf(new_column->name, sizeof(new_column->name), "%s", name); + } + + return new_column; +} + +rdata_column_t *rdata_get_column(rdata_writer_t *writer, int32_t j) { + return writer->columns[j]; +} + +rdata_error_t rdata_column_set_label( + rdata_column_t *column, + const char *label +) { + snprintf(column->label, sizeof(column->label), "%s", label); + return RDATA_OK; +} + +rdata_error_t rdata_column_add_factor( + rdata_column_t *column, + const char *factor +) { + if (column->type != RDATA_TYPE_INT32) + return RDATA_ERROR_FACTOR; + + char *factor_copy = malloc(strlen(factor)+1); + strcpy(factor_copy, factor); // NOLINT + + column->factor_count++; + column->factor = realloc( + column->factor, + sizeof(char *) * column->factor_count); + column->factor[column->factor_count-1] = factor_copy; + + return RDATA_OK; +} + +static rdata_error_t 
rdata_write_bytes( + rdata_writer_t *writer, + const void *data, size_t len +) { + size_t bytes_written = writer->data_writer(data, len, writer->user_ctx); + if (bytes_written < len) { + return RDATA_ERROR_WRITE; + } + writer->bytes_written += bytes_written; + return RDATA_OK; +} + +static rdata_error_t rdata_write_integer( + rdata_writer_t *writer, + int32_t val +) { + if (writer->bswap) { + val = byteswap4(val); + } + return rdata_write_bytes(writer, &val, sizeof(val)); +} + +static rdata_error_t rdata_write_double(rdata_writer_t *writer, double val) { + if (writer->bswap) { + val = byteswap_double(val); + } + return rdata_write_bytes(writer, &val, sizeof(val)); +} + +static rdata_error_t rdata_write_header( + rdata_writer_t *writer, + int type, + int flags +) { + rdata_sexptype_header_t header; + memset(&header, 0, sizeof(header)); + + header.type = type; + header.object = !!(flags & R_OBJECT); + header.tag = !!(flags & R_TAG); + header.attributes = !!(flags & R_ATTRIBUTES); + + uint32_t sexp_int; + + memcpy(&sexp_int, &header, sizeof(header)); + + return rdata_write_integer(writer, sexp_int); +} + +static rdata_error_t rdata_write_string( + rdata_writer_t *writer, + const char *string +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_header(writer, RDATA_SEXPTYPE_CHARACTER_STRING, 0); + if (retval != RDATA_OK) + goto cleanup; + + ssize_t len = string ? 
(ssize_t)strlen(string) : -1; + + retval = rdata_write_integer(writer, len); + if (retval != RDATA_OK) + goto cleanup; + + if (len > 0) + return rdata_write_bytes(writer, string, len); + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_pairlist_key( + rdata_writer_t *writer, + const char *key +) { + rdata_error_t retval = RDATA_OK; + ck_hash_table_t *atom_table = (ck_hash_table_t *)writer->atom_table; + uint64_t ref = (uint64_t)ck_str_hash_lookup(key, atom_table); + if (ref == 0) { + ck_str_hash_insert(key, (void *)(atom_table->count + 1), atom_table); + + retval = rdata_write_integer(writer, 1); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_string(writer, key); + } else { + retval = rdata_write_integer(writer, (ref << 8) | 0xFF); + } + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_pairlist_header( + rdata_writer_t *writer, + const char *key +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_header(writer, RDATA_SEXPTYPE_PAIRLIST, R_TAG); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_pairlist_key(writer, key); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_attributed_vector_header( + rdata_writer_t *writer, int type, + int32_t size +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_header(writer, type, R_OBJECT | R_ATTRIBUTES); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_integer(writer, size); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_simple_vector_header( + rdata_writer_t *writer, + int type, + int32_t size +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_header(writer, type, 0); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_integer(writer, size); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_write_class_pairlist( 
+ rdata_writer_t *writer, + const char *class +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_pairlist_header(writer, "class"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, + 1); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_string(writer, class); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +rdata_error_t rdata_begin_file( + rdata_writer_t *writer, + void *user_ctx +) { + rdata_error_t retval = RDATA_OK; + + writer->user_ctx = user_ctx; + + if (writer->file_format == RDATA_WORKSPACE) { + retval = rdata_write_bytes(writer, "RDX2\n", 5); + if (retval != RDATA_OK) + goto cleanup; + } + + rdata_v2_header_t v2_header; + memcpy(v2_header.header, "X\n", sizeof("X\n")-1); + v2_header.format_version = 2; + v2_header.reader_version = 131840; + v2_header.writer_version = 131840; + + if (writer->bswap) { + v2_header.format_version = byteswap4(v2_header.format_version); + v2_header.reader_version = byteswap4(v2_header.reader_version); + v2_header.writer_version = byteswap4(v2_header.writer_version); + } + + retval = rdata_write_bytes(writer, &v2_header, sizeof(v2_header)); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +rdata_error_t rdata_begin_table( + rdata_writer_t *writer, + const char *variable_name +) { + rdata_error_t retval = RDATA_OK; + + if (writer->file_format == RDATA_WORKSPACE) { + retval = rdata_write_pairlist_header(writer, variable_name); + if (retval != RDATA_OK) + goto cleanup; + } + + retval = rdata_write_attributed_vector_header( + writer, + RDATA_SEXPTYPE_GENERIC_VECTOR, + writer->columns_count); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_begin_factor_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_attributed_vector_header( + writer, + 
RDATA_SEXPTYPE_INTEGER_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_factor_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + int i; + + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_pairlist_header(writer, "levels"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header(writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, column->factor_count); + if (retval != RDATA_OK) + goto cleanup; + + for (i=0; i < column->factor_count; i++) { + retval = rdata_write_string(writer, column->factor[i]); + if (retval != RDATA_OK) + goto cleanup; + } + + retval = rdata_write_class_pairlist(writer, "factor"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_begin_real_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_REAL_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_real_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + return RDATA_OK; +} + +static rdata_error_t rdata_begin_timestamp_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_attributed_vector_header( + writer, + RDATA_SEXPTYPE_REAL_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_timestamp_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_class_pairlist(writer, "POSIXct"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_begin_date_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_attributed_vector_header( + 
writer, + RDATA_SEXPTYPE_REAL_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_date_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_class_pairlist(writer, "Date"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + if (retval != RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +static rdata_error_t rdata_begin_integer_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_INTEGER_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_integer_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + return RDATA_OK; +} + +static rdata_error_t rdata_begin_logical_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_LOGICAL_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_logical_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + return RDATA_OK; +} + +static rdata_error_t rdata_begin_string_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + return rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, + row_count); +} + +static rdata_error_t rdata_end_string_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + return RDATA_OK; +} + +rdata_error_t rdata_begin_column( + rdata_writer_t *writer, + rdata_column_t *column, + int32_t row_count +) { + rdata_type_t type = column->type; + + if (type == RDATA_TYPE_INT32) { + if (column->factor_count) + return rdata_begin_factor_column(writer, column, row_count); + return rdata_begin_integer_column(writer, column, row_count); + } + if (type == RDATA_TYPE_REAL) + return rdata_begin_real_column(writer, column, row_count); + if (type == RDATA_TYPE_TIMESTAMP) + return 
rdata_begin_timestamp_column(writer, column, row_count); + if (type == RDATA_TYPE_DATE) + return rdata_begin_date_column(writer, column, row_count); + if (type == RDATA_TYPE_LOGICAL) + return rdata_begin_logical_column(writer, column, row_count); + if (type == RDATA_TYPE_STRING) + return rdata_begin_string_column(writer, column, row_count); + + return RDATA_OK; +} + +rdata_error_t rdata_append_real_value( + rdata_writer_t *writer, + double value +) { + return rdata_write_double(writer, value); +} + +rdata_error_t rdata_append_int32_value( + rdata_writer_t *writer, + int32_t value +) { + return rdata_write_integer(writer, value); +} + +rdata_error_t rdata_append_timestamp_value( + rdata_writer_t *writer, + time_t value +) { + return rdata_write_double(writer, value); +} + +rdata_error_t rdata_append_date_value( + rdata_writer_t *writer, + struct tm *value +) { + return rdata_write_double(writer, timegm(value) / 86400); +} + +rdata_error_t rdata_append_logical_value( + rdata_writer_t *writer, + int value +) { + if (value < 0) + return rdata_write_integer(writer, INT32_MIN); + + return rdata_write_integer(writer, (value > 0)); +} + +rdata_error_t rdata_append_string_value( + rdata_writer_t *writer, + const char *value +) { + return rdata_write_string(writer, value); +} + +rdata_error_t rdata_end_column( + rdata_writer_t *writer, + rdata_column_t *column +) { + rdata_type_t type = column->type; + + if (type == RDATA_TYPE_INT32) { + if (column->factor_count) + return rdata_end_factor_column(writer, column); + return rdata_end_integer_column(writer, column); + } + if (type == RDATA_TYPE_REAL) + return rdata_end_real_column(writer, column); + if (type == RDATA_TYPE_TIMESTAMP) + return rdata_end_timestamp_column(writer, column); + if (type == RDATA_TYPE_DATE) + return rdata_end_date_column(writer, column); + if (type == RDATA_TYPE_LOGICAL) + return rdata_end_logical_column(writer, column); + if (type == RDATA_TYPE_STRING) + return rdata_end_string_column(writer, column); + 
+ return RDATA_OK; +} + +rdata_error_t rdata_end_table( + rdata_writer_t *writer, + int32_t row_count, + const char *datalabel +) { + int i; + rdata_error_t retval = RDATA_OK; + + retval = rdata_write_pairlist_header(writer, "datalabel"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header( + writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, + 1); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_string(writer, datalabel); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_pairlist_header(writer, "names"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header(writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, writer->columns_count); + if (retval != RDATA_OK) + goto cleanup; + + for (i=0; i < writer->columns_count; i++) { + retval = rdata_write_string(writer, writer->columns[i]->name); + if (retval != RDATA_OK) + goto cleanup; + } + + retval = rdata_write_pairlist_header(writer, "var.labels"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header(writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, writer->columns_count); + if (retval != RDATA_OK) + goto cleanup; + + for (i=0; i < writer->columns_count; i++) { + retval = rdata_write_string(writer, writer->columns[i]->label); + if (retval != RDATA_OK) + goto cleanup; + } + + retval = rdata_write_class_pairlist(writer, "data.frame"); + if (retval != RDATA_OK) + goto cleanup; + + if (row_count > 0) { + retval = rdata_write_pairlist_header(writer, "row.names"); + if (retval != RDATA_OK) + goto cleanup; + + retval = rdata_write_simple_vector_header(writer, + RDATA_SEXPTYPE_CHARACTER_VECTOR, row_count); + if (retval != RDATA_OK) + goto cleanup; + + char buf[128]; + for (i=0; i < row_count; i++) { + snprintf(buf, sizeof(buf), "%d", i+1); + retval = rdata_write_string(writer, buf); + if (retval != RDATA_OK) + goto cleanup; + } + } + + retval = rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + if (retval != 
RDATA_OK) + goto cleanup; + +cleanup: + return retval; +} + +rdata_error_t rdata_end_file(rdata_writer_t *writer) { + if (writer->file_format == RDATA_WORKSPACE) + return rdata_write_header(writer, RDATA_PSEUDO_SXP_NIL, 0); + + return RDATA_OK; +} diff --git a/pandas/_libs/src/librdata/unix_iconv.h b/pandas/_libs/src/librdata/unix_iconv.h new file mode 100644 index 0000000000000..58ee38c36dd9c --- /dev/null +++ b/pandas/_libs/src/librdata/unix_iconv.h @@ -0,0 +1,60 @@ +/* Copyright (C) 1997-2020 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef PANDAS__LIBS_SRC_LIBRDATA_UNIX_ICONV_H_ +#define PANDAS__LIBS_SRC_LIBRDATA_UNIX_ICONV_H_ + +#ifndef _ICONV_H +#define _ICONV_H 1 + +#include +#define __need_size_t +#include + + +__BEGIN_DECLS + +/* Identifier for conversion method from one codeset to another. */ +typedef void *iconv_t; + + +/* Allocate descriptor for code conversion from codeset FROMCODE to + codeset TOCODE. + + This function is a possible cancellation point and therefore not + marked with __THROW. */ +extern iconv_t iconv_open(const char *__tocode, const char *__fromcode); + +/* Convert at most *INBYTESLEFT bytes from *INBUF according to the + code conversion algorithm specified by CD and place up to + *OUTBYTESLEFT bytes in buffer at *OUTBUF. 
*/ +extern size_t iconv(iconv_t __cd, char **__restrict __inbuf, + size_t *__restrict __inbytesleft, + char **__restrict __outbuf, + size_t *__restrict __outbytesleft); + +/* Free resources allocated for descriptor CD for code conversion. + + This function is a possible cancellation point and therefore not + marked with __THROW. */ +extern int iconv_close(iconv_t __cd); + +__END_DECLS + +#endif /* iconv.h */ + +#endif // PANDAS__LIBS_SRC_LIBRDATA_UNIX_ICONV_H_ diff --git a/pandas/_libs/src/librdata/win_iconv.c b/pandas/_libs/src/librdata/win_iconv.c new file mode 100644 index 0000000000000..dd5ddc5882abd --- /dev/null +++ b/pandas/_libs/src/librdata/win_iconv.c @@ -0,0 +1,2236 @@ +/* + +win-iconv - iconv implementation using Win32 API to convert. +Written in 2009-2016 by Yukihiro Nakadaira +and contributors to win-iconv + +To the extent possible under law, the author(s) have dedicated all copyright +and related and neighboring rights to this software to the public domain +worldwide. This software is distributed without any warranty. + +You should have received a copy of the CC0 Public Domain Dedication along with +this software. If not, see http://creativecommons.org/publicdomain/zero/1.0/. 
+ + */ + +/* for WC_NO_BEST_FIT_CHARS */ +#ifndef WINVER +# define WINVER 0x0500 +#endif + +#define STRICT +#include "win_iconv.h" +#include +#include +#include +#include + +#ifdef __GNUC__ +#define UNUSED __attribute__((unused)) +#else +#define UNUSED +#endif + +/* WORKAROUND: */ +#ifndef UNDER_CE +#define GetProcAddressA GetProcAddress +#endif + +#if 0 +# define MAKE_EXE +# define MAKE_DLL +# define USE_LIBICONV_DLL +#endif + +#if !defined(DEFAULT_LIBICONV_DLL) +# define DEFAULT_LIBICONV_DLL "" +#endif + +#define MB_CHAR_MAX 16 + +#define UNICODE_MODE_BOM_DONE 1 +#define UNICODE_MODE_SWAPPED 2 + +#define FLAG_USE_BOM 1 +#define FLAG_TRANSLIT 2 +#define FLAG_IGNORE 4 + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; + +typedef void* iconv_t; + +iconv_t iconv_open(const char *tocode, const char *fromcode); +int iconv_close(iconv_t cd); +size_t iconv( + iconv_t cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft); + +/* libiconv interface for vim */ +#if defined(MAKE_DLL) +int iconvctl(iconv_t cd, int request, void* argument) { + /* not supported */ + return 0; +} +#endif + +typedef struct compat_t compat_t; +typedef struct csconv_t csconv_t; +typedef struct rec_iconv_t rec_iconv_t; + +typedef iconv_t (*f_iconv_open)(const char *tocode, const char *fromcode); +typedef int (*f_iconv_close)(iconv_t cd); +typedef size_t (*f_iconv)( + iconv_t cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft); +typedef int* (*f_errno)(void); +typedef int (*f_mbtowc)( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +typedef int (*f_wctomb)( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +typedef int (*f_mblen)(csconv_t *cv, const uchar *buf, int bufsize); +typedef int (*f_flush)(csconv_t *cv, uchar *buf, int bufsize); + +#define COMPAT_IN 1 +#define COMPAT_OUT 2 + +/* unicode mapping for compatibility 
with other conversion table. */ +struct compat_t { + uint in; + uint out; + uint flag; +}; + +struct csconv_t { + int codepage; + int flags; + f_mbtowc mbtowc; + f_wctomb wctomb; + f_mblen mblen; + f_flush flush; + DWORD mode; + compat_t *compat; +}; + +struct rec_iconv_t { + iconv_t cd; + f_iconv_close iconv_close; + f_iconv iconv; + f_errno _errno; + csconv_t from; + csconv_t to; +#if defined(USE_LIBICONV_DLL) + HMODULE hlibiconv; +#endif +}; + +static int win_iconv_open( + rec_iconv_t *cd, + const char *tocode, + const char *fromcode); +static int win_iconv_close(iconv_t cd); +static size_t win_iconv( + iconv_t cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft); + +static int load_mlang(void); +static int make_csconv(const char *name, csconv_t *cv); +static int name_to_codepage(const char *name); +static uint utf16_to_ucs4(const ushort *wbuf); +static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize); +static int mbtowc_flags(int codepage); +static int must_use_null_useddefaultchar(int codepage); +static char *strrstr(const char *str, const char *token); +static char *xstrndup(const char *s, size_t n); +static int seterror(int err); + +#if defined(USE_LIBICONV_DLL) +static int libiconv_iconv_open( + rec_iconv_t *cd, + const char *tocode, + const char *fromcode); +static PVOID MyImageDirectoryEntryToData( + LPVOID Base, + BOOLEAN MappedAsImage, + USHORT DirectoryEntry, + PULONG Size); +static FARPROC find_imported_function( + HMODULE hModule, + const char *funcname); + +static HMODULE hwiniconv; +#endif + +static int sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); +static int dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); +static int mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); +static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize); +static int eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize); + +static int kernel_mbtowc( + csconv_t *cv, + const uchar *buf, + int 
bufsize, + ushort *wbuf, + int *wbufsize); +static int kernel_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int mlang_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +static int mlang_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int utf16_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +static int utf16_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int utf32_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +static int utf32_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int iso2022jp_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize); +static int iso2022jp_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize); +static int iso2022jp_flush( + csconv_t *cv, + uchar *buf, + int bufsize); + +static struct { + int codepage; + const char *name; +} codepage_alias[] = { + {65001, "CP65001"}, + {65001, "UTF8"}, + {65001, "UTF-8"}, + + {1200, "CP1200"}, + {1200, "UTF16LE"}, + {1200, "UTF-16LE"}, + {1200, "UCS2LE"}, + {1200, "UCS-2LE"}, + {1200, "UCS-2-INTERNAL"}, + + {1201, "CP1201"}, + {1201, "UTF16BE"}, + {1201, "UTF-16BE"}, + {1201, "UCS2BE"}, + {1201, "UCS-2BE"}, + {1201, "unicodeFFFE"}, + + {12000, "CP12000"}, + {12000, "UTF32LE"}, + {12000, "UTF-32LE"}, + {12000, "UCS4LE"}, + {12000, "UCS-4LE"}, + + {12001, "CP12001"}, + {12001, "UTF32BE"}, + {12001, "UTF-32BE"}, + {12001, "UCS4BE"}, + {12001, "UCS-4BE"}, + +#ifndef GLIB_COMPILATION + /* + * Default is big endian. + * See rfc2781 4.3 Interpreting text labelled as UTF-16. 
+ */ + {1201, "UTF16"}, + {1201, "UTF-16"}, + {1201, "UCS2"}, + {1201, "UCS-2"}, + {12001, "UTF32"}, + {12001, "UTF-32"}, + {12001, "UCS-4"}, + {12001, "UCS4"}, +#else + /* Default is little endian, because the platform is */ + {1200, "UTF16"}, + {1200, "UTF-16"}, + {1200, "UCS2"}, + {1200, "UCS-2"}, + {12000, "UTF32"}, + {12000, "UTF-32"}, + {12000, "UCS4"}, + {12000, "UCS-4"}, +#endif + + /* copy from libiconv `iconv -l` */ + /* !IsValidCodePage(367) */ + {20127, "ANSI_X3.4-1968"}, + {20127, "ANSI_X3.4-1986"}, + {20127, "ASCII"}, + {20127, "CP367"}, + {20127, "IBM367"}, + {20127, "ISO-IR-6"}, + {20127, "ISO646-US"}, + {20127, "ISO_646.IRV:1991"}, + {20127, "US"}, + {20127, "US-ASCII"}, + {20127, "CSASCII"}, + + /* !IsValidCodePage(819) */ + {1252, "CP819"}, + {1252, "IBM819"}, + {28591, "ISO-8859-1"}, + {28591, "ISO-IR-100"}, + {28591, "ISO8859-1"}, + {28591, "ISO_8859-1"}, + {28591, "ISO_8859-1:1987"}, + {28591, "L1"}, + {28591, "LATIN1"}, + {28591, "CSISOLATIN1"}, + + {1250, "CP1250"}, + {1250, "MS-EE"}, + {1250, "WINDOWS-1250"}, + + {1251, "CP1251"}, + {1251, "MS-CYRL"}, + {1251, "WINDOWS-1251"}, + + {1252, "CP1252"}, + {1252, "MS-ANSI"}, + {1252, "WINDOWS-1252"}, + + {1253, "CP1253"}, + {1253, "MS-GREEK"}, + {1253, "WINDOWS-1253"}, + + {1254, "CP1254"}, + {1254, "MS-TURK"}, + {1254, "WINDOWS-1254"}, + + {1255, "CP1255"}, + {1255, "MS-HEBR"}, + {1255, "WINDOWS-1255"}, + + {1256, "CP1256"}, + {1256, "MS-ARAB"}, + {1256, "WINDOWS-1256"}, + + {1257, "CP1257"}, + {1257, "WINBALTRIM"}, + {1257, "WINDOWS-1257"}, + + {1258, "CP1258"}, + {1258, "WINDOWS-1258"}, + + {850, "850"}, + {850, "CP850"}, + {850, "IBM850"}, + {850, "CSPC850MULTILINGUAL"}, + + /* !IsValidCodePage(862) */ + {862, "862"}, + {862, "CP862"}, + {862, "IBM862"}, + {862, "CSPC862LATINHEBREW"}, + + {866, "866"}, + {866, "CP866"}, + {866, "IBM866"}, + {866, "CSIBM866"}, + + /* !IsValidCodePage(154) */ + {154, "CP154"}, + {154, "CYRILLIC-ASIAN"}, + {154, "PT154"}, + {154, "PTCP154"}, + {154, 
"CSPTCP154"}, + + /* !IsValidCodePage(1133) */ + {1133, "CP1133"}, + {1133, "IBM-CP1133"}, + + {874, "CP874"}, + {874, "WINDOWS-874"}, + + /* !IsValidCodePage(51932) */ + {51932, "CP51932"}, + {51932, "MS51932"}, + {51932, "WINDOWS-51932"}, + {51932, "EUC-JP"}, + + {932, "CP932"}, + {932, "MS932"}, + {932, "SHIFFT_JIS"}, + {932, "SHIFFT_JIS-MS"}, + {932, "SJIS"}, + {932, "SJIS-MS"}, + {932, "SJIS-OPEN"}, + {932, "SJIS-WIN"}, + {932, "WINDOWS-31J"}, + {932, "WINDOWS-932"}, + {932, "CSWINDOWS31J"}, + + {50221, "CP50221"}, + {50221, "ISO-2022-JP"}, + {50221, "ISO-2022-JP-MS"}, + {50221, "ISO2022-JP"}, + {50221, "ISO2022-JP-MS"}, + {50221, "MS50221"}, + {50221, "WINDOWS-50221"}, + + {936, "CP936"}, + {936, "GBK"}, + {936, "MS936"}, + {936, "WINDOWS-936"}, + + {950, "CP950"}, + {950, "BIG5"}, + {950, "BIG5HKSCS"}, + {950, "BIG5-HKSCS"}, + + {949, "CP949"}, + {949, "UHC"}, + {949, "EUC-KR"}, + + {1361, "CP1361"}, + {1361, "JOHAB"}, + + {437, "437"}, + {437, "CP437"}, + {437, "IBM437"}, + {437, "CSPC8CODEPAGE437"}, + + {737, "CP737"}, + + {775, "CP775"}, + {775, "IBM775"}, + {775, "CSPC775BALTIC"}, + + {852, "852"}, + {852, "CP852"}, + {852, "IBM852"}, + {852, "CSPCP852"}, + + /* !IsValidCodePage(853) */ + {853, "CP853"}, + + {855, "855"}, + {855, "CP855"}, + {855, "IBM855"}, + {855, "CSIBM855"}, + + {857, "857"}, + {857, "CP857"}, + {857, "IBM857"}, + {857, "CSIBM857"}, + + /* !IsValidCodePage(858) */ + {858, "CP858"}, + + {860, "860"}, + {860, "CP860"}, + {860, "IBM860"}, + {860, "CSIBM860"}, + + {861, "861"}, + {861, "CP-IS"}, + {861, "CP861"}, + {861, "IBM861"}, + {861, "CSIBM861"}, + + {863, "863"}, + {863, "CP863"}, + {863, "IBM863"}, + {863, "CSIBM863"}, + + {864, "CP864"}, + {864, "IBM864"}, + {864, "CSIBM864"}, + + {865, "865"}, + {865, "CP865"}, + {865, "IBM865"}, + {865, "CSIBM865"}, + + {869, "869"}, + {869, "CP-GR"}, + {869, "CP869"}, + {869, "IBM869"}, + {869, "CSIBM869"}, + + /* !IsValidCodePage(1152) */ + {1125, "CP1125"}, + + /* + * Code Page Identifiers 
+ * http://msdn2.microsoft.com/en-us/library/ms776446.aspx + */ + {37, "IBM037"}, /* IBM EBCDIC US-Canada */ + {437, "IBM437"}, /* OEM United States */ + {500, "IBM500"}, /* IBM EBCDIC International */ + {708, "ASMO-708"}, /* Arabic (ASMO 708) */ + /* 709 Arabic (ASMO-449+, BCON V4) */ + /* 710 Arabic - Transparent Arabic */ + {720, "DOS-720"}, /* Arabic (Transparent ASMO); Arabic (DOS) */ + {737, "ibm737"}, /* OEM Greek (formerly 437G); Greek (DOS) */ + {775, "ibm775"}, /* OEM Baltic; Baltic (DOS) */ + {850, "ibm850"}, /* OEM Multilingual Latin 1; Western European (DOS) */ + {852, "ibm852"}, /* OEM Latin 2; Central European (DOS) */ + {855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */ + {857, "ibm857"}, /* OEM Turkish; Turkish (DOS) */ + {858, "IBM00858"}, /* OEM Multilingual Latin 1 + Euro symbol */ + {860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */ + {861, "ibm861"}, /* OEM Icelandic; Icelandic (DOS) */ + {862, "DOS-862"}, /* OEM Hebrew; Hebrew (DOS) */ + {863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */ + {864, "IBM864"}, /* OEM Arabic; Arabic (864) */ + {865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */ + {866, "cp866"}, /* OEM Russian; Cyrillic (DOS) */ + {869, "ibm869"}, /* OEM Modern Greek; Greek, Modern (DOS) */ + /* + * IBM EBCDIC Multilingual/ROECE (Latin 2); + * IBM EBCDIC Multilingual Latin 2 + */ + {870, "IBM870"}, + /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */ + {874, "windows-874"}, + {875, "cp875"}, /* IBM EBCDIC Greek Modern */ + {932, "shift_jis"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ + {932, "shift-jis"}, /* alternative name for it */ + /* + * ANSI/OEM Simplified Chinese (PRC, Singapore); + * Chinese Simplified (GB2312) + */ + {936, "gb2312"}, + {949, "ks_c_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */ + /* + * ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); + * Chinese Traditional (Big5) + */ + {950, "big5"}, + /* + * ANSI/OEM Traditional Chinese (Hong Kong SAR); + * 
Chinese Traditional (Big5-HKSCS) + */ + {950, "big5hkscs"}, + {950, "big5-hkscs"}, /* alternative name for it */ + {1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */ + {1047, "IBM01047"}, /* IBM EBCDIC Latin 1/Open System */ + /* + * IBM EBCDIC US-Canada (037 + Euro symbol); + * IBM EBCDIC (US-Canada-Euro) + */ + {1140, "IBM01140"}, + /* + * IBM EBCDIC Germany (20273 + Euro symbol); + * IBM EBCDIC (Germany-Euro) + */ + {1141, "IBM01141"}, + /* + * IBM EBCDIC Denmark-Norway (20277 + Euro symbol); + * IBM EBCDIC (Denmark-Norway-Euro) + */ + {1142, "IBM01142"}, + /* + * IBM EBCDIC Finland-Sweden (20278 + Euro symbol); + * IBM EBCDIC (Finland-Sweden-Euro) + */ + {1143, "IBM01143"}, + /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */ + {1144, "IBM01144"}, + /* + * IBM EBCDIC Latin America-Spain (20284 + Euro symbol); + * IBM EBCDIC (Spain-Euro) + */ + {1145, "IBM01145"}, + /* + * IBM EBCDIC United Kingdom (20285 + Euro symbol); + * IBM EBCDIC (UK-Euro) + */ + {1146, "IBM01146"}, + /* + * IBM EBCDIC France (20297 + Euro symbol); + * IBM EBCDIC (France-Euro) + */ + {1147, "IBM01147"}, + /* + * IBM EBCDIC International (500 + Euro symbol); + * IBM EBCDIC (International-Euro) + */ + {1148, "IBM01148"}, + /* + * IBM EBCDIC Icelandic (20871 + Euro symbol); + * IBM EBCDIC (Icelandic-Euro) + */ + {1149, "IBM01149"}, + /* ANSI Central European; Central European (Windows) */ + {1250, "windows-1250"}, + {1251, "windows-1251"}, /* ANSI Cyrillic; Cyrillic (Windows) */ + {1252, "windows-1252"}, /* ANSI Latin 1; Western European (Windows) */ + {1253, "windows-1253"}, /* ANSI Greek; Greek (Windows) */ + {1254, "windows-1254"}, /* ANSI Turkish; Turkish (Windows) */ + {1255, "windows-1255"}, /* ANSI Hebrew; Hebrew (Windows) */ + {1256, "windows-1256"}, /* ANSI Arabic; Arabic (Windows) */ + {1257, "windows-1257"}, /* ANSI Baltic; Baltic (Windows) */ + {1258, "windows-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */ + {1361, "Johab"}, /* Korean (Johab) */ + 
{10000, "macintosh"}, /* MAC Roman; Western European (Mac) */ + {10001, "x-mac-japanese"}, /* Japanese (Mac) */ + /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ + {10002, "x-mac-chinesetrad"}, + {10003, "x-mac-korean"}, /* Korean (Mac) */ + {10004, "x-mac-arabic"}, /* Arabic (Mac) */ + {10005, "x-mac-hebrew"}, /* Hebrew (Mac) */ + {10006, "x-mac-greek"}, /* Greek (Mac) */ + {10007, "x-mac-cyrillic"}, /* Cyrillic (Mac) */ + /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ + {10008, "x-mac-chinesesimp"}, + {10010, "x-mac-romanian"}, /* Romanian (Mac) */ + {10017, "x-mac-ukrainian"}, /* Ukrainian (Mac) */ + {10021, "x-mac-thai"}, /* Thai (Mac) */ + {10029, "x-mac-ce"}, /* MAC Latin 2; Central European (Mac) */ + {10079, "x-mac-icelandic"}, /* Icelandic (Mac) */ + {10081, "x-mac-turkish"}, /* Turkish (Mac) */ + {10082, "x-mac-croatian"}, /* Croatian (Mac) */ + {20000, "x-Chinese_CNS"}, /* CNS Taiwan; Chinese Traditional (CNS) */ + {20001, "x-cp20001"}, /* TCA Taiwan */ + {20002, "x_Chinese-Eten"}, /* Eten Taiwan; Chinese Traditional (Eten) */ + {20003, "x-cp20003"}, /* IBM5550 Taiwan */ + {20004, "x-cp20004"}, /* TeleText Taiwan */ + {20005, "x-cp20005"}, /* Wang Taiwan */ + /* + * IA5 (IRV International Alphabet No. 
5, 7-bit); + * Western European (IA5) + */ + {20105, "x-IA5"}, + {20106, "x-IA5-German"}, /* IA5 German (7-bit) */ + {20107, "x-IA5-Swedish"}, /* IA5 Swedish (7-bit) */ + {20108, "x-IA5-Norwegian"}, /* IA5 Norwegian (7-bit) */ + {20127, "us-ascii"}, /* US-ASCII (7-bit) */ + {20261, "x-cp20261"}, /* T.61 */ + {20269, "x-cp20269"}, /* ISO 6937 Non-Spacing Accent */ + {20273, "IBM273"}, /* IBM EBCDIC Germany */ + {20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */ + {20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */ + {20280, "IBM280"}, /* IBM EBCDIC Italy */ + {20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */ + {20285, "IBM285"}, /* IBM EBCDIC United Kingdom */ + {20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */ + {20297, "IBM297"}, /* IBM EBCDIC France */ + {20420, "IBM420"}, /* IBM EBCDIC Arabic */ + {20423, "IBM423"}, /* IBM EBCDIC Greek */ + {20424, "IBM424"}, /* IBM EBCDIC Hebrew */ + {20833, "x-EBCDIC-KoreanExtended"}, /* IBM EBCDIC Korean Extended */ + {20838, "IBM-Thai"}, /* IBM EBCDIC Thai */ + {20866, "koi8-r"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */ + {20871, "IBM871"}, /* IBM EBCDIC Icelandic */ + {20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */ + {20905, "IBM905"}, /* IBM EBCDIC Turkish */ + /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ + {20924, "IBM00924"}, + {20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0121-1990) */ + /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ + {20936, "x-cp20936"}, + {20949, "x-cp20949"}, /* Korean Wansung */ + {21025, "cp1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ + /* 21027 (deprecated) */ + {21866, "koi8-u"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ + {28591, "iso-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ + {28591, "iso8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ + {28591, "iso_8859-1"}, + {28591, "iso_8859_1"}, + /* ISO 8859-2 Central European; Central European (ISO) */ + {28592, "iso-8859-2"}, + /* ISO 8859-2 Central 
European; Central European (ISO) */ + {28592, "iso8859-2"}, + {28592, "iso_8859-2"}, + {28592, "iso_8859_2"}, + {28593, "iso-8859-3"}, /* ISO 8859-3 Latin 3 */ + {28593, "iso8859-3"}, /* ISO 8859-3 Latin 3 */ + {28593, "iso_8859-3"}, + {28593, "iso_8859_3"}, + {28594, "iso-8859-4"}, /* ISO 8859-4 Baltic */ + {28594, "iso8859-4"}, /* ISO 8859-4 Baltic */ + {28594, "iso_8859-4"}, + {28594, "iso_8859_4"}, + {28595, "iso-8859-5"}, /* ISO 8859-5 Cyrillic */ + {28595, "iso8859-5"}, /* ISO 8859-5 Cyrillic */ + {28595, "iso_8859-5"}, + {28595, "iso_8859_5"}, + {28596, "iso-8859-6"}, /* ISO 8859-6 Arabic */ + {28596, "iso8859-6"}, /* ISO 8859-6 Arabic */ + {28596, "iso_8859-6"}, + {28596, "iso_8859_6"}, + {28597, "iso-8859-7"}, /* ISO 8859-7 Greek */ + {28597, "iso8859-7"}, /* ISO 8859-7 Greek */ + {28597, "iso_8859-7"}, + {28597, "iso_8859_7"}, + {28598, "iso-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ + {28598, "iso8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ + {28598, "iso_8859-8"}, + {28598, "iso_8859_8"}, + {28599, "iso-8859-9"}, /* ISO 8859-9 Turkish */ + {28599, "iso8859-9"}, /* ISO 8859-9 Turkish */ + {28599, "iso_8859-9"}, + {28599, "iso_8859_9"}, + {28603, "iso-8859-13"}, /* ISO 8859-13 Estonian */ + {28603, "iso8859-13"}, /* ISO 8859-13 Estonian */ + {28603, "iso_8859-13"}, + {28603, "iso_8859_13"}, + {28605, "iso-8859-15"}, /* ISO 8859-15 Latin 9 */ + {28605, "iso8859-15"}, /* ISO 8859-15 Latin 9 */ + {28605, "iso_8859-15"}, + {28605, "iso_8859_15"}, + {29001, "x-Europa"}, /* Europa 3 */ + {38598, "iso-8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ + {38598, "iso8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ + {38598, "iso_8859-8-i"}, + {38598, "iso_8859_8-i"}, + /* + * ISO 2022 Japanese with no halfwidth Katakana; + * Japanese (JIS) + */ + {50220, "iso-2022-jp"}, + /* + * ISO 2022 Japanese with halfwidth Katakana; + * Japanese (JIS-Allow 1 byte Kana) + */ + {50221, "csISO2022JP"}, + /* + * ISO 2022 Japanese JIS X 
0201-1989; + * Japanese (JIS-Allow 1 byte Kana - SO/SI) + */ + {50222, "iso-2022-jp"}, + {50225, "iso-2022-kr"}, /* ISO 2022 Korean */ + {50225, "iso2022-kr"}, /* ISO 2022 Korean */ + /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ + {50227, "x-cp50227"}, + /* 50229 ISO 2022 Traditional Chinese */ + /* 50930 EBCDIC Japanese (Katakana) Extended */ + /* 50931 EBCDIC US-Canada and Japanese */ + /* 50933 EBCDIC Korean Extended and Korean */ + /* 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese */ + /* 50936 EBCDIC Simplified Chinese */ + /* 50937 EBCDIC US-Canada and Traditional Chinese */ + /* 50939 EBCDIC Japanese (Latin) Extended and Japanese */ + {51932, "euc-jp"}, /* EUC Japanese */ + {51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */ + {51949, "euc-kr"}, /* EUC Korean */ + /* 51950 EUC Traditional Chinese */ + /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ + {52936, "hz-gb-2312"}, + /* + * Windows XP and later: GB18030 Simplified Chinese (4 byte); + * Chinese Simplified (GB18030) + */ + {54936, "GB18030"}, + {57002, "x-iscii-de"}, /* ISCII Devanagari */ + {57003, "x-iscii-be"}, /* ISCII Bengali */ + {57004, "x-iscii-ta"}, /* ISCII Tamil */ + {57005, "x-iscii-te"}, /* ISCII Telugu */ + {57006, "x-iscii-as"}, /* ISCII Assamese */ + {57007, "x-iscii-or"}, /* ISCII Oriya */ + {57008, "x-iscii-ka"}, /* ISCII Kannada */ + {57009, "x-iscii-ma"}, /* ISCII Malayalam */ + {57010, "x-iscii-gu"}, /* ISCII Gujarati */ + {57011, "x-iscii-pa"}, /* ISCII Punjabi */ + + {0, NULL} +}; + +/* + * SJIS SHIFTJIS table CP932 table + * ---- --------------------------- -------------------------------- + * 5C U+00A5 YEN SIGN U+005C REVERSE SOLIDUS + * 7E U+203E OVERLINE U+007E TILDE + * 815C U+2014 EM DASH U+2015 HORIZONTAL BAR + * 815F U+005C REVERSE SOLIDUS U+FF3C FULLWIDTH REVERSE SOLIDUS + * 8160 U+301C WAVE DASH U+FF5E FULLWIDTH TILDE + * 8161 U+2016 DOUBLE VERTICAL LINE U+2225 PARALLEL TO + * 817C U+2212 MINUS SIGN 
U+FF0D FULLWIDTH HYPHEN-MINUS + * 8191 U+00A2 CENT SIGN U+FFE0 FULLWIDTH CENT SIGN + * 8192 U+00A3 POUND SIGN U+FFE1 FULLWIDTH POUND SIGN + * 81CA U+00AC NOT SIGN U+FFE2 FULLWIDTH NOT SIGN + * + * EUC-JP and ISO-2022-JP should be compatible with CP932. + * + * Kernel and MLang have different Unicode mapping table. Make sure + * which API is used. + */ +static compat_t cp932_compat[] = { + {0x00A5, 0x005C, COMPAT_OUT}, + {0x203E, 0x007E, COMPAT_OUT}, + {0x2014, 0x2015, COMPAT_OUT}, + {0x301C, 0xFF5E, COMPAT_OUT}, + {0x2016, 0x2225, COMPAT_OUT}, + {0x2212, 0xFF0D, COMPAT_OUT}, + {0x00A2, 0xFFE0, COMPAT_OUT}, + {0x00A3, 0xFFE1, COMPAT_OUT}, + {0x00AC, 0xFFE2, COMPAT_OUT}, + {0, 0, 0} +}; + +static compat_t cp20932_compat[] = { + {0x00A5, 0x005C, COMPAT_OUT}, + {0x203E, 0x007E, COMPAT_OUT}, + {0x2014, 0x2015, COMPAT_OUT}, + {0xFF5E, 0x301C, COMPAT_OUT|COMPAT_IN}, + {0x2225, 0x2016, COMPAT_OUT|COMPAT_IN}, + {0xFF0D, 0x2212, COMPAT_OUT|COMPAT_IN}, + {0xFFE0, 0x00A2, COMPAT_OUT|COMPAT_IN}, + {0xFFE1, 0x00A3, COMPAT_OUT|COMPAT_IN}, + {0xFFE2, 0x00AC, COMPAT_OUT|COMPAT_IN}, + {0, 0, 0} +}; + +static compat_t *cp51932_compat = cp932_compat; + +/* cp20932_compat for kernel. cp932_compat for mlang. 
*/ +static compat_t *cp5022x_compat = cp932_compat; + +typedef HRESULT (WINAPI *CONVERTINETSTRING)( + LPDWORD lpdwMode, + DWORD dwSrcEncoding, + DWORD dwDstEncoding, + LPCSTR lpSrcStr, + LPINT lpnSrcSize, + LPBYTE lpDstStr, + LPINT lpnDstSize +); +typedef HRESULT (WINAPI *CONVERTINETMULTIBYTETOUNICODE)( + LPDWORD lpdwMode, + DWORD dwSrcEncoding, + LPCSTR lpSrcStr, + LPINT lpnMultiCharCount, + LPWSTR lpDstStr, + LPINT lpnWideCharCount +); +typedef HRESULT (WINAPI *CONVERTINETUNICODETOMULTIBYTE)( + LPDWORD lpdwMode, + DWORD dwEncoding, + LPCWSTR lpSrcStr, + LPINT lpnWideCharCount, + LPSTR lpDstStr, + LPINT lpnMultiCharCount +); +typedef HRESULT (WINAPI *ISCONVERTINETSTRINGAVAILABLE)( + DWORD dwSrcEncoding, + DWORD dwDstEncoding +); +typedef HRESULT (WINAPI *LCIDTORFC1766A)( + LCID Locale, + LPSTR pszRfc1766, + int nChar +); +typedef HRESULT (WINAPI *LCIDTORFC1766W)( + LCID Locale, + LPWSTR pszRfc1766, + int nChar +); +typedef HRESULT (WINAPI *RFC1766TOLCIDA)( + LCID *pLocale, + LPSTR pszRfc1766 +); +typedef HRESULT (WINAPI *RFC1766TOLCIDW)( + LCID *pLocale, + LPWSTR pszRfc1766 +); +static CONVERTINETSTRING ConvertINetString; +static CONVERTINETMULTIBYTETOUNICODE ConvertINetMultiByteToUnicode; +static CONVERTINETUNICODETOMULTIBYTE ConvertINetUnicodeToMultiByte; +static ISCONVERTINETSTRINGAVAILABLE IsConvertINetStringAvailable; +static LCIDTORFC1766A LcidToRfc1766A; +static RFC1766TOLCIDA Rfc1766ToLcidA; + +static int load_mlang(void) { + HMODULE h; + if (ConvertINetString != NULL) + return TRUE; + h = LoadLibrary(TEXT("mlang.dll")); + if (!h) + return FALSE; + ConvertINetString = + (CONVERTINETSTRING)GetProcAddressA(h, "ConvertINetString"); + ConvertINetMultiByteToUnicode = + (CONVERTINETMULTIBYTETOUNICODE)GetProcAddressA( + h, "ConvertINetMultiByteToUnicode"); + ConvertINetUnicodeToMultiByte = + (CONVERTINETUNICODETOMULTIBYTE)GetProcAddressA( + h, "ConvertINetUnicodeToMultiByte"); + IsConvertINetStringAvailable = + (ISCONVERTINETSTRINGAVAILABLE)GetProcAddressA( + h, 
"IsConvertINetStringAvailable"); + LcidToRfc1766A = + (LCIDTORFC1766A)GetProcAddressA(h, "LcidToRfc1766A"); + Rfc1766ToLcidA = + (RFC1766TOLCIDA)GetProcAddressA(h, "Rfc1766ToLcidA"); + return TRUE; +} + +iconv_t iconv_open(const char *tocode, const char *fromcode) { + rec_iconv_t *cd; + + cd = (rec_iconv_t *)calloc(1, sizeof(rec_iconv_t)); + if (cd == NULL) + return (iconv_t)(-1); + +#if defined(USE_LIBICONV_DLL) + errno = 0; + if (libiconv_iconv_open(cd, tocode, fromcode)) + return (iconv_t)cd; +#endif + + /* reset the errno to prevent reporting wrong error code. + * 0 for unsorted error. */ + errno = 0; + if (win_iconv_open(cd, tocode, fromcode)) + return (iconv_t)cd; + + free(cd); + + return (iconv_t)(-1); +} + +int iconv_close(iconv_t _cd) { + rec_iconv_t *cd = (rec_iconv_t *)_cd; + int r = cd->iconv_close(cd->cd); + int e = *(cd->_errno()); +#if defined(USE_LIBICONV_DLL) + if (cd->hlibiconv != NULL) + FreeLibrary(cd->hlibiconv); +#endif + free(cd); + errno = e; + return r; +} + +size_t iconv( + iconv_t _cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft +) { + rec_iconv_t *cd = (rec_iconv_t *)_cd; + size_t r = cd->iconv(cd->cd, inbuf, inbytesleft, outbuf, outbytesleft); + errno = *(cd->_errno()); + return r; +} + +static int win_iconv_open( + rec_iconv_t *cd, + const char *tocode, + const char *fromcode +) { + if (!make_csconv(fromcode, &cd->from) || !make_csconv(tocode, &cd->to)) + return FALSE; + cd->iconv_close = win_iconv_close; + cd->iconv = win_iconv; + cd->_errno = _errno; + cd->cd = (iconv_t)cd; + return TRUE; +} + +static int win_iconv_close(iconv_t cd UNUSED) { + return 0; +} + +static size_t win_iconv( + iconv_t _cd, + const char **inbuf, + size_t *inbytesleft, + char **outbuf, + size_t *outbytesleft +) { + rec_iconv_t *cd = (rec_iconv_t *)_cd; + ushort wbuf[MB_CHAR_MAX]; /* enough room for one character */ + int insize; + int outsize; + int wsize; + DWORD frommode; + DWORD tomode; + uint wc; + compat_t *cp; + 
int i; + + if (inbuf == NULL || *inbuf == NULL) { + if (outbuf != NULL && *outbuf != NULL && cd->to.flush != NULL) { + tomode = cd->to.mode; + outsize = cd->to.flush( + &cd->to, + (uchar *)*outbuf, + *outbytesleft); + if (outsize == -1) { + if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG) { + outsize = 0; + } else { + cd->to.mode = tomode; + return (size_t)(-1); + } + } + *outbuf += outsize; + *outbytesleft -= outsize; + } + cd->from.mode = 0; + cd->to.mode = 0; + return 0; + } + + while (*inbytesleft != 0) { + frommode = cd->from.mode; + tomode = cd->to.mode; + wsize = MB_CHAR_MAX; + + insize = cd->from.mbtowc( + &cd->from, + (const uchar *)*inbuf, + *inbytesleft, wbuf, &wsize); + if (insize == -1) { + if (cd->to.flags & FLAG_IGNORE) { + cd->from.mode = frommode; + insize = 1; + wsize = 0; + } else { + cd->from.mode = frommode; + return (size_t)(-1); + } + } + + if (wsize == 0) { + *inbuf += insize; + *inbytesleft -= insize; + continue; + } + + if (cd->from.compat != NULL) { + wc = utf16_to_ucs4(wbuf); + cp = cd->from.compat; + for (i = 0; cp[i].in != 0; ++i) { + if ((cp[i].flag & COMPAT_IN) && cp[i].out == wc) { + ucs4_to_utf16(cp[i].in, wbuf, &wsize); + break; + } + } + } + + if (cd->to.compat != NULL) { + wc = utf16_to_ucs4(wbuf); + cp = cd->to.compat; + for (i = 0; cp[i].in != 0; ++i) { + if ((cp[i].flag & COMPAT_OUT) && cp[i].in == wc) { + ucs4_to_utf16(cp[i].out, wbuf, &wsize); + break; + } + } + } + + outsize = cd->to.wctomb( + &cd->to, + wbuf, wsize, + (uchar *)*outbuf, + *outbytesleft); + if (outsize == -1) { + if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG) { + cd->to.mode = tomode; + outsize = 0; + } else { + cd->from.mode = frommode; + cd->to.mode = tomode; + return (size_t)(-1); + } + } + + *inbuf += insize; + *outbuf += outsize; + *inbytesleft -= insize; + *outbytesleft -= outsize; + } + + return 0; +} + +static int make_csconv(const char *_name, csconv_t *cv) { + CPINFO cpinfo; + int use_compat = TRUE; + int flag = 0; + char *name; + char *p; 
+ + name = xstrndup(_name, strlen(_name)); + if (name == NULL) + return FALSE; + + /* check for option "enc_name//opt1//opt2" */ + while ((p = strrstr(name, "//")) != NULL) { + if (_stricmp(p + 2, "nocompat") == 0) + use_compat = FALSE; + else if (_stricmp(p + 2, "translit") == 0) + flag |= FLAG_TRANSLIT; + else if (_stricmp(p + 2, "ignore") == 0) + flag |= FLAG_IGNORE; + *p = 0; + } + + cv->mode = 0; + cv->flags = flag; + cv->mblen = NULL; + cv->flush = NULL; + cv->compat = NULL; + cv->codepage = name_to_codepage(name); + if (cv->codepage == 1200 || cv->codepage == 1201) { + cv->mbtowc = utf16_mbtowc; + cv->wctomb = utf16_wctomb; + if (_stricmp(name, "UTF-16") == 0 || + _stricmp(name, "UTF16") == 0 || + _stricmp(name, "UCS-2") == 0 || + _stricmp(name, "UCS2") == 0 || + _stricmp(name, "UCS-2-INTERNAL") == 0) + cv->flags |= FLAG_USE_BOM; + } else if (cv->codepage == 12000 || cv->codepage == 12001) { + cv->mbtowc = utf32_mbtowc; + cv->wctomb = utf32_wctomb; + if (_stricmp(name, "UTF-32") == 0 || + _stricmp(name, "UTF32") == 0 || + _stricmp(name, "UCS-4") == 0 || + _stricmp(name, "UCS4") == 0) + cv->flags |= FLAG_USE_BOM; + } else if (cv->codepage == 65001) { + cv->mbtowc = kernel_mbtowc; + cv->wctomb = kernel_wctomb; + cv->mblen = utf8_mblen; + } else if ((cv->codepage == 50220 || + cv->codepage == 50221 || + cv->codepage == 50222) && load_mlang()) { + cv->mbtowc = iso2022jp_mbtowc; + cv->wctomb = iso2022jp_wctomb; + cv->flush = iso2022jp_flush; + } else if (cv->codepage == 51932 && load_mlang()) { + cv->mbtowc = mlang_mbtowc; + cv->wctomb = mlang_wctomb; + cv->mblen = eucjp_mblen; + } else if (IsValidCodePage(cv->codepage) + && GetCPInfo(cv->codepage, &cpinfo) != 0) { + cv->mbtowc = kernel_mbtowc; + cv->wctomb = kernel_wctomb; + if (cpinfo.MaxCharSize == 1) + cv->mblen = sbcs_mblen; + else if (cpinfo.MaxCharSize == 2) + cv->mblen = dbcs_mblen; + else + cv->mblen = mbcs_mblen; + } else { + /* not supported */ + free(name); + errno = EINVAL; + return FALSE; + } + + if 
(use_compat) { + switch (cv->codepage) { + case 932: cv->compat = cp932_compat; break; + case 20932: cv->compat = cp20932_compat; break; + case 51932: cv->compat = cp51932_compat; break; + case 50220: + case 50221: + case 50222: cv->compat = cp5022x_compat; break; + } + } + + free(name); + + return TRUE; +} + +static int name_to_codepage(const char *name) { + int i; + + if (*name == '\0' || + strcmp(name, "char") == 0) + return GetACP(); + else if (strcmp(name, "wchar_t") == 0) + return 1200; + else if (_strnicmp(name, "cp", 2) == 0) + return atoi(name + 2); /* CP123 */ + else if ('0' <= name[0] && name[0] <= '9') + return atoi(name); /* 123 */ + else if (_strnicmp(name, "xx", 2) == 0) + return atoi(name + 2); /* XX123 for debug */ + + for (i = 0; codepage_alias[i].name != NULL; ++i) + if (_stricmp(name, codepage_alias[i].name) == 0) + return codepage_alias[i].codepage; + return -1; +} + +/* + * http://www.faqs.org/rfcs/rfc2781.html + */ +static uint utf16_to_ucs4(const ushort *wbuf) { + uint wc = wbuf[0]; + if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) + wc = ((wbuf[0] & 0x3FF) << 10) + (wbuf[1] & 0x3FF) + 0x10000; + return wc; +} + +static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize) { + if (wc < 0x10000) { + wbuf[0] = wc; + *wbufsize = 1; + } else { + wc -= 0x10000; + wbuf[0] = 0xD800 | ((wc >> 10) & 0x3FF); + wbuf[1] = 0xDC00 | (wc & 0x3FF); + *wbufsize = 2; + } +} + +/* + * Check if codepage is one of those for which the dwFlags parameter + * to MultiByteToWideChar() must be zero. Return zero or + * MB_ERR_INVALID_CHARS. The docs in Platform SDK for Windows + * Server 2003 R2 claims that also codepage 65001 is one of these, but + * that doesn't seem to be the case. The MSDN docs for MSVS2008 leave + * out 65001 (UTF-8), and that indeed seems to be the case on XP, it + * works fine to pass MB_ERR_INVALID_CHARS in dwFlags when converting + * from UTF-8. 
+ */ +static int mbtowc_flags(int codepage) { + return (codepage == 50220 || codepage == 50221 || + codepage == 50222 || codepage == 50225 || + codepage == 50227 || codepage == 50229 || + codepage == 52936 || codepage == 54936 || + (codepage >= 57002 && codepage <= 57011) || + codepage == 65000 || codepage == 42) ? 0 : MB_ERR_INVALID_CHARS; +} + +/* + * Check if codepage is one those for which the lpUsedDefaultChar + * parameter to WideCharToMultiByte() must be NULL. The docs in + * Platform SDK for Windows Server 2003 R2 claims that this is the + * list below, while the MSDN docs for MSVS2008 claim that it is only + * for 65000 (UTF-7) and 65001 (UTF-8). This time the earlier Platform + * SDK seems to be correct, at least for XP. + */ +static int must_use_null_useddefaultchar(int codepage) { + return (codepage == 65000 || codepage == 65001 || + codepage == 50220 || codepage == 50221 || + codepage == 50222 || codepage == 50225 || + codepage == 50227 || codepage == 50229 || + codepage == 52936 || codepage == 54936 || + (codepage >= 57002 && codepage <= 57011) || + codepage == 42); +} + +static char * strrstr(const char *str, const char *token) { + int len = strlen(token); + const char *p = str + strlen(str); + + while (str <= --p) + if (p[0] == token[0] && strncmp(p, token, len) == 0) + return (char *)p; + return NULL; +} + +static char * xstrndup(const char *s, size_t n) { + char *p; + + p = (char *)malloc(n + 1); + if (p == NULL) + return NULL; + memcpy(p, s, n); + p[n] = '\0'; + return p; +} + +static int seterror(int err) { + errno = err; + return -1; +} + +#if defined(USE_LIBICONV_DLL) +static int libiconv_iconv_open( + rec_iconv_t *cd, + const char *tocode, + const char *fromcode +) { + HMODULE hlibiconv = NULL; + char *dllname; + const char *p; + const char *e; + f_iconv_open _iconv_open; + + /* + * always try to load dll, so that we can switch dll in runtime. 
+ */ + + /* XXX: getenv() can't get variable set by SetEnvironmentVariable() */ + p = getenv("WINICONV_LIBICONV_DLL"); + if (p == NULL) + p = DEFAULT_LIBICONV_DLL; + /* parse comma separated value */ + for ( ; *p != 0; p = (*e == ',') ? e + 1 : e) { + e = strchr(p, ','); + if (p == e) + continue; + else if (e == NULL) + e = p + strlen(p); + dllname = xstrndup(p, e - p); + if (dllname == NULL) + return FALSE; + hlibiconv = LoadLibraryA(dllname); + free(dllname); + if (hlibiconv != NULL) { + if (hlibiconv == hwiniconv) { + FreeLibrary(hlibiconv); + hlibiconv = NULL; + continue; + } + break; + } + } + + if (hlibiconv == NULL) + goto failed; + + _iconv_open = (f_iconv_open)GetProcAddressA( + hlibiconv, + "libiconv_open"); + if (_iconv_open == NULL) + _iconv_open = (f_iconv_open)GetProcAddressA( + hlibiconv, + "iconv_open"); + cd->iconv_close = (f_iconv_close)GetProcAddressA( + hlibiconv, + "libiconv_close"); + if (cd->iconv_close == NULL) + cd->iconv_close = (f_iconv_close)GetProcAddressA( + hlibiconv, + "iconv_close"); + cd->iconv = (f_iconv)GetProcAddressA( + hlibiconv, + "libiconv"); + if (cd->iconv == NULL) + cd->iconv = (f_iconv)GetProcAddressA( + hlibiconv, + "iconv"); + cd->_errno = (f_errno)find_imported_function( + hlibiconv, + "_errno"); + if (_iconv_open == NULL || cd->iconv_close == NULL + || cd->iconv == NULL || cd->_errno == NULL) + goto failed; + + cd->cd = _iconv_open(tocode, fromcode); + if (cd->cd == (iconv_t)(-1)) + goto failed; + + cd->hlibiconv = hlibiconv; + return TRUE; + +failed: + if (hlibiconv != NULL) + FreeLibrary(hlibiconv); + return FALSE; +} + +/* + * Reference: + * http://forums.belution.com/ja/vc/000/234/78s.shtml + * http://nienie.com/~masapico/api_ImageDirectoryEntryToData.html + * + * The formal way is + * imagehlp.h or dbghelp.h + * imagehlp.lib or dbghelp.lib + * ImageDirectoryEntryToData() + */ +#define TO_DOS_HEADER(base) ((PIMAGE_DOS_HEADER)(base)) +#define TO_NT_HEADERS(base) \ +((PIMAGE_NT_HEADERS)((LPBYTE)(base) + 
TO_DOS_HEADER(base)->e_lfanew)) +static PVOID MyImageDirectoryEntryToData( + LPVOID Base, + BOOLEAN MappedAsImage, + USHORT DirectoryEntry, + PULONG Size +) { + /* TODO: MappedAsImage? */ + PIMAGE_DATA_DIRECTORY p; + p = TO_NT_HEADERS(Base)->OptionalHeader.DataDirectory + DirectoryEntry; + if (p->VirtualAddress == 0) { + *Size = 0; + return NULL; + } + *Size = p->Size; + return (PVOID)((LPBYTE)Base + p->VirtualAddress); +} + +static FARPROC find_imported_function( + HMODULE hModule, + const char *funcname +) { + DWORD_PTR Base; + ULONG Size; + PIMAGE_IMPORT_DESCRIPTOR Imp; + PIMAGE_THUNK_DATA Address; /* Import Address Table */ + PIMAGE_THUNK_DATA Name; /* Import Name Table */ + PIMAGE_IMPORT_BY_NAME ImpName; + + Base = (DWORD_PTR)hModule; + Imp = (PIMAGE_IMPORT_DESCRIPTOR)MyImageDirectoryEntryToData( + (LPVOID)Base, + TRUE, + IMAGE_DIRECTORY_ENTRY_IMPORT, + &Size); + if (Imp == NULL) + return NULL; + for ( ; Imp->OriginalFirstThunk != 0; ++Imp) { + Address = (PIMAGE_THUNK_DATA)(Base + Imp->FirstThunk); + Name = (PIMAGE_THUNK_DATA)(Base + Imp->OriginalFirstThunk); + for ( ; Name->u1.Ordinal != 0; ++Name, ++Address) { + if (!IMAGE_SNAP_BY_ORDINAL(Name->u1.Ordinal)) { + ImpName = (PIMAGE_IMPORT_BY_NAME) + (Base + (DWORD_PTR)Name->u1.AddressOfData); + if (strcmp((char *)ImpName->Name, funcname) == 0) + return (FARPROC)Address->u1.Function; + } + } + } + return NULL; +} +#endif + +static int sbcs_mblen( + csconv_t *cv UNUSED, + const uchar *buf UNUSED, + int bufsize UNUSED +) { + return 1; +} + +static int dbcs_mblen( + csconv_t *cv, + const uchar *buf, + int bufsize +) { + int len = IsDBCSLeadByteEx(cv->codepage, buf[0]) ? 
2 : 1; + if (bufsize < len) + return seterror(EINVAL); + return len; +} + +static int mbcs_mblen( + csconv_t *cv, + const uchar *buf, + int bufsize +) { + int len = 0; + + if (cv->codepage == 54936) { + if (buf[0] <= 0x7F) len = 1; + else if (buf[0] >= 0x81 && buf[0] <= 0xFE && + bufsize >= 2 && + ((buf[1] >= 0x40 && buf[1] <= 0x7E) || + (buf[1] >= 0x80 && buf[1] <= 0xFE))) len = 2; + else if (buf[0] >= 0x81 && buf[0] <= 0xFE && + bufsize >= 4 && + buf[1] >= 0x30 && buf[1] <= 0x39) len = 4; + else + return seterror(EINVAL); + return len; + } else { + return seterror(EINVAL); + } +} + +static int utf8_mblen( + csconv_t *cv UNUSED, + const uchar *buf, + int bufsize +) { + int len = 0; + + if (buf[0] < 0x80) len = 1; + else if ((buf[0] & 0xE0) == 0xC0) len = 2; + else if ((buf[0] & 0xF0) == 0xE0) len = 3; + else if ((buf[0] & 0xF8) == 0xF0) len = 4; + else if ((buf[0] & 0xFC) == 0xF8) len = 5; + else if ((buf[0] & 0xFE) == 0xFC) len = 6; + + if (len == 0) + return seterror(EILSEQ); + else if (bufsize < len) + return seterror(EINVAL); + return len; +} + +static int eucjp_mblen( + csconv_t *cv UNUSED, + const uchar *buf, + int bufsize +) { + if (buf[0] < 0x80) { /* ASCII */ + return 1; + } else if (buf[0] == 0x8E) { /* JIS X 0201 */ + if (bufsize < 2) + return seterror(EINVAL); + else if (!(0xA1 <= buf[1] && buf[1] <= 0xDF)) + return seterror(EILSEQ); + return 2; + } else if (buf[0] == 0x8F) { /* JIS X 0212 */ + if (bufsize < 3) + return seterror(EINVAL); + else if (!(0xA1 <= buf[1] && buf[1] <= 0xFE) + || !(0xA1 <= buf[2] && buf[2] <= 0xFE)) + return seterror(EILSEQ); + return 3; + } else { /* JIS X 0208 */ + if (bufsize < 2) + return seterror(EINVAL); + else if (!(0xA1 <= buf[0] && buf[0] <= 0xFE) + || !(0xA1 <= buf[1] && buf[1] <= 0xFE)) + return seterror(EILSEQ); + return 2; + } +} + +static int kernel_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize +) { + int len; + + len = cv->mblen(cv, buf, bufsize); + if (len == -1) + 
return -1; + /* If converting from ASCII, reject 8bit + * chars. MultiByteToWideChar() doesn't. Note that for ASCII we + * know that the mblen function is sbcs_mblen() so len is 1. + */ + if (cv->codepage == 20127 && buf[0] >= 0x80) + return seterror(EILSEQ); + *wbufsize = MultiByteToWideChar( + cv->codepage, + mbtowc_flags(cv->codepage), + (const char *)buf, + len, + (wchar_t *)wbuf, *wbufsize); + if (*wbufsize == 0) + return seterror(EILSEQ); + return len; +} + +static int kernel_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize +) { + BOOL usedDefaultChar = 0; + BOOL *p = NULL; + int flags = 0; + int len; + + if (bufsize == 0) + return seterror(E2BIG); + if (!must_use_null_useddefaultchar(cv->codepage)) { + p = &usedDefaultChar; +#ifdef WC_NO_BEST_FIT_CHARS + if (!(cv->flags & FLAG_TRANSLIT)) + flags |= WC_NO_BEST_FIT_CHARS; +#endif + } + len = WideCharToMultiByte(cv->codepage, flags, + (const wchar_t *)wbuf, wbufsize, (char *)buf, bufsize, NULL, p); + if (len == 0) { + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) + return seterror(E2BIG); + return seterror(EILSEQ); + } else if (usedDefaultChar && !(cv->flags & FLAG_TRANSLIT)) { + return seterror(EILSEQ); + } else if (cv->mblen(cv, buf, len) != len) { /* validate result */ + return seterror(EILSEQ); + } + return len; +} + +/* + * It seems that the mode (cv->mode) is fixnum. + * For example, when converting iso-2022-jp(cp50221) to unicode: + * in ascii sequence: mode=0xC42C0000 + * in jisx0208 sequence: mode=0xC42C0001 + * "C42C" is same for each convert session. 
+ * It should be: ((codepage-1)<<16)|state + */ +static int mlang_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize +) { + int len; + int insize; + HRESULT hr; + + len = cv->mblen(cv, buf, bufsize); + if (len == -1) + return -1; + insize = len; + hr = ConvertINetMultiByteToUnicode(&cv->mode, cv->codepage, + (const char *)buf, &insize, (wchar_t *)wbuf, wbufsize); + if (hr != S_OK || insize != len) + return seterror(EILSEQ); + return len; +} + +static int mlang_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize) { + char tmpbuf[MB_CHAR_MAX]; /* enough room for one character */ + int tmpsize = MB_CHAR_MAX; + int insize = wbufsize; + HRESULT hr; + + hr = ConvertINetUnicodeToMultiByte(&cv->mode, cv->codepage, + (const wchar_t *)wbuf, &wbufsize, tmpbuf, &tmpsize); + if (hr != S_OK || insize != wbufsize) + return seterror(EILSEQ); + else if (bufsize < tmpsize) + return seterror(E2BIG); + else if (cv->mblen(cv, (uchar *)tmpbuf, tmpsize) != tmpsize) + return seterror(EILSEQ); + memcpy(buf, tmpbuf, tmpsize); + return tmpsize; +} + +static int utf16_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize +) { + int codepage = cv->codepage; + + /* swap endian: 1200 <-> 1201 */ + if (cv->mode & UNICODE_MODE_SWAPPED) + codepage ^= 1; + + if (bufsize < 2) + return seterror(EINVAL); + if (codepage == 1200) /* little endian */ + wbuf[0] = (buf[1] << 8) | buf[0]; + else if (codepage == 1201) /* big endian */ + wbuf[0] = (buf[0] << 8) | buf[1]; + + if ((cv->flags & FLAG_USE_BOM) && + !(cv->mode & UNICODE_MODE_BOM_DONE)) { + cv->mode |= UNICODE_MODE_BOM_DONE; + if (wbuf[0] == 0xFFFE) { + cv->mode |= UNICODE_MODE_SWAPPED; + *wbufsize = 0; + return 2; + } else if (wbuf[0] == 0xFEFF) { + *wbufsize = 0; + return 2; + } + } + + if (0xDC00 <= wbuf[0] && wbuf[0] <= 0xDFFF) + return seterror(EILSEQ); + if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) { + if (bufsize < 4) + return 
seterror(EINVAL); + if (codepage == 1200) /* little endian */ + wbuf[1] = (buf[3] << 8) | buf[2]; + else if (codepage == 1201) /* big endian */ + wbuf[1] = (buf[2] << 8) | buf[3]; + if (!(0xDC00 <= wbuf[1] && wbuf[1] <= 0xDFFF)) + return seterror(EILSEQ); + *wbufsize = 2; + return 4; + } + *wbufsize = 1; + return 2; +} + +static int utf16_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize +) { + if ((cv->flags & FLAG_USE_BOM) && + !(cv->mode & UNICODE_MODE_BOM_DONE)) { + int r; + + cv->mode |= UNICODE_MODE_BOM_DONE; + if (bufsize < 2) + return seterror(E2BIG); + if (cv->codepage == 1200) /* little endian */ + memcpy(buf, "\xFF\xFE", 2); + else if (cv->codepage == 1201) /* big endian */ + memcpy(buf, "\xFE\xFF", 2); + + r = utf16_wctomb(cv, wbuf, wbufsize, buf + 2, bufsize - 2); + if (r == -1) + return -1; + return r + 2; + } + + if (bufsize < 2) + return seterror(E2BIG); + if (cv->codepage == 1200) { /* little endian */ + buf[0] = (wbuf[0] & 0x00FF); + buf[1] = (wbuf[0] & 0xFF00) >> 8; + } else if (cv->codepage == 1201) { /* big endian */ + buf[0] = (wbuf[0] & 0xFF00) >> 8; + buf[1] = (wbuf[0] & 0x00FF); + } + if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) { + if (bufsize < 4) + return seterror(E2BIG); + if (cv->codepage == 1200) { /* little endian */ + buf[2] = (wbuf[1] & 0x00FF); + buf[3] = (wbuf[1] & 0xFF00) >> 8; + } else if (cv->codepage == 1201) { /* big endian */ + buf[2] = (wbuf[1] & 0xFF00) >> 8; + buf[3] = (wbuf[1] & 0x00FF); + } + return 4; + } + return 2; +} + +static int utf32_mbtowc( + csconv_t *cv, + const uchar *buf, + int bufsize, + ushort *wbuf, + int *wbufsize +) { + int codepage = cv->codepage; + uint wc = 0xD800; + + /* swap endian: 12000 <-> 12001 */ + if (cv->mode & UNICODE_MODE_SWAPPED) + codepage ^= 1; + + if (bufsize < 4) + return seterror(EINVAL); + if (codepage == 12000) /* little endian */ + wc = (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | buf[0]; + else if (codepage == 12001) /* big endian */ + wc = 
(buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; + + if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) { + cv->mode |= UNICODE_MODE_BOM_DONE; + if (wc == 0xFFFE0000) { + cv->mode |= UNICODE_MODE_SWAPPED; + *wbufsize = 0; + return 4; + } else if (wc == 0x0000FEFF) { + *wbufsize = 0; + return 4; + } + } + + if ((0xD800 <= wc && wc <= 0xDFFF) || 0x10FFFF < wc) + return seterror(EILSEQ); + ucs4_to_utf16(wc, wbuf, wbufsize); + return 4; +} + +static int utf32_wctomb( + csconv_t *cv, + ushort *wbuf, + int wbufsize, + uchar *buf, + int bufsize +) { + uint wc; + + if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) { + int r; + + cv->mode |= UNICODE_MODE_BOM_DONE; + if (bufsize < 4) + return seterror(E2BIG); + if (cv->codepage == 12000) /* little endian */ + memcpy(buf, "\xFF\xFE\x00\x00", 4); + else if (cv->codepage == 12001) /* big endian */ + memcpy(buf, "\x00\x00\xFE\xFF", 4); + + r = utf32_wctomb(cv, wbuf, wbufsize, buf + 4, bufsize - 4); + if (r == -1) + return -1; + return r + 4; + } + + if (bufsize < 4) + return seterror(E2BIG); + wc = utf16_to_ucs4(wbuf); + if (cv->codepage == 12000) { /* little endian */ + buf[0] = wc & 0x000000FF; + buf[1] = (wc & 0x0000FF00) >> 8; + buf[2] = (wc & 0x00FF0000) >> 16; + buf[3] = (wc & 0xFF000000) >> 24; + } else if (cv->codepage == 12001) { /* big endian */ + buf[0] = (wc & 0xFF000000) >> 24; + buf[1] = (wc & 0x00FF0000) >> 16; + buf[2] = (wc & 0x0000FF00) >> 8; + buf[3] = wc & 0x000000FF; + } + return 4; +} + +/* + * 50220: ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) + * 50221: ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow + * 1 byte Kana) + * 50222: ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte + * Kana - SO/SI) + * + * MultiByteToWideChar() and WideCharToMultiByte() behave differently + * depending on Windows version. On XP, WideCharToMultiByte() doesn't + * terminate result sequence with ascii escape. But Vista does. 
/*
 * Decode one unit of ISO-2022-JP input from buf.
 *
 * A unit is one of:
 *   - an escape sequence listed in iso2022jp_esc: switches the charset stored
 *     in cv->mode, produces no wide characters (*wbufsize = 0);
 *   - a shift-out / shift-in byte: switches the shift state stored in
 *     cv->mode, produces no wide characters;
 *   - a character of iesc[cs].len bytes in the current charset: converted
 *     through ConvertINetMultiByteToUnicode after being prefixed with the
 *     escape sequence of the current charset.
 *
 * Returns the number of input bytes consumed, or the value of seterror():
 * EINVAL when buf ends in the middle of a unit, EILSEQ for an unsupported
 * escape sequence or an unconvertible character.
 */
static int iso2022jp_mbtowc(
    csconv_t *cv,
    const uchar *buf,
    int bufsize,
    ushort *wbuf,
    int *wbufsize
) {
    iso2022_esc_t *iesc = iso2022jp_esc;
    char tmp[MB_CHAR_MAX];
    int insize;
    HRESULT hr;
    DWORD dummy = 0;
    int len;
    int esc_len;
    int cs;
    int shift;
    int i;

    if (buf[0] == 0x1B) {
        for (i = 0; iesc[i].esc != NULL; ++i) {
            esc_len = iesc[i].esc_len;
            if (bufsize < esc_len) {
                /* input is a proper prefix of a known escape sequence:
                 * report it as incomplete rather than invalid */
                if (strncmp((char *)buf, iesc[i].esc, bufsize) == 0)
                    return seterror(EINVAL);
            } else {
                if (strncmp((char *)buf, iesc[i].esc, esc_len) == 0) {
                    /* a charset switch also resets the shift state to SI */
                    cv->mode = ISO2022_MODE(iesc[i].cs, ISO2022_SI);
                    *wbufsize = 0;
                    return esc_len;
                }
            }
        }
        /* not supported escape sequence */
        return seterror(EILSEQ);
    } else if (buf[0] == iso2022_SO_seq[0]) {
        /* shift out: keep the charset, change only the shift state */
        cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SO);
        *wbufsize = 0;
        return 1;
    } else if (buf[0] == iso2022_SI_seq[0]) {
        /* shift in: keep the charset, change only the shift state */
        cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SI);
        *wbufsize = 0;
        return 1;
    }

    cs = ISO2022_MODE_CS(cv->mode);
    shift = ISO2022_MODE_SHIFT(cv->mode);

    /* reset the mode for informal sequence */
    if (buf[0] < 0x20) {
        cs = ISO2022JP_CS_ASCII;
        shift = ISO2022_SI;
    }

    /* iesc is indexed by charset id here, which relies on the order of the
     * iso2022jp_esc table matching the ISO2022JP_CS_* values */
    len = iesc[cs].len;
    if (bufsize < len)
        return seterror(EINVAL);
    /* every byte of the character must be 7-bit */
    for (i = 0; i < len; ++i)
        if (!(buf[i] < 0x80))
            return seterror(EILSEQ);
    /* build "<escape>[<SO>]<character>" in tmp for the converter */
    esc_len = iesc[cs].esc_len;
    memcpy(tmp, iesc[cs].esc, esc_len);
    if (shift == ISO2022_SO) {
        memcpy(tmp + esc_len, iso2022_SO_seq, 1);
        esc_len += 1;
    }
    memcpy(tmp + esc_len, buf, len);

    if ((cv->codepage == 50220 || cv->codepage == 50221
        || cv->codepage == 50222) && shift == ISO2022_SO) {
        /* XXX: shift-out cannot be used for mbtowc (both kernel and
         * mlang) */
        /* rebuild tmp with the JIS X 0201 kana escape instead of SO */
        esc_len = iesc[ISO2022JP_CS_JISX0201_KANA].esc_len;
        memcpy(tmp, iesc[ISO2022JP_CS_JISX0201_KANA].esc, esc_len);
        memcpy(tmp + esc_len, buf, len);
    }

    insize = len + esc_len;
    hr = ConvertINetMultiByteToUnicode(&dummy, cv->codepage,
        (const char *)tmp, &insize, (wchar_t *)wbuf, wbufsize);
    /* insize is in/out; a changed value is treated as a failed conversion */
    if (hr != S_OK || insize != len + esc_len)
        return seterror(EILSEQ);

    /* Check for conversion error.  Assuming defaultChar is 0x3F. */
    /* ascii should be converted from ascii */
    if (wbuf[0] == buf[0]
        && cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI))
        return seterror(EILSEQ);

    /* reset the mode for informal sequence */
    if (cv->mode != ISO2022_MODE(cs, shift))
        cv->mode = ISO2022_MODE(cs, shift);

    return len;
}
*/ + /* ascii should be converted from ascii */ + if (cs == ISO2022JP_CS_ASCII && !(wbuf[0] < 0x80)) + return seterror(EILSEQ); + else if (tmpsize < esc_len + len) + return seterror(EILSEQ); + + if (cv->mode == ISO2022_MODE(cs, shift)) { + /* remove escape sequence */ + if (esc_len != 0) + memmove(tmp, tmp + esc_len, len); + esc_len = 0; + } else { + if (cs == ISO2022JP_CS_ASCII) { + esc_len = iesc[ISO2022JP_CS_ASCII].esc_len; + memmove(tmp + esc_len, tmp, len); + memcpy(tmp, iesc[ISO2022JP_CS_ASCII].esc, esc_len); + } + if (ISO2022_MODE_SHIFT(cv->mode) == ISO2022_SO) { + /* shift-in before changing to other mode */ + memmove(tmp + 1, tmp, len + esc_len); + memcpy(tmp, iso2022_SI_seq, 1); + esc_len += 1; + } + } + + if (bufsize < len + esc_len) + return seterror(E2BIG); + memcpy(buf, tmp, len + esc_len); + cv->mode = ISO2022_MODE(cs, shift); + return len + esc_len; +} + +static int iso2022jp_flush( + csconv_t *cv, + uchar *buf, + int bufsize +) { + iso2022_esc_t *iesc = iso2022jp_esc; + int esc_len; + + if (cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI)) { + esc_len = 0; + if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) + esc_len += 1; + if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) + esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; + if (bufsize < esc_len) + return seterror(E2BIG); + + esc_len = 0; + if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) { + memcpy(buf, iso2022_SI_seq, 1); + esc_len += 1; + } + if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) { + memcpy(buf + esc_len, iesc[ISO2022JP_CS_ASCII].esc, + iesc[ISO2022JP_CS_ASCII].esc_len); + esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; + } + return esc_len; + } + return 0; +} + +#if defined(MAKE_DLL) && defined(USE_LIBICONV_DLL) +BOOL WINAPI DllMain( + HINSTANCE hinstDLL, + DWORD fdwReason, + LPVOID lpReserved +) { + switch ( fdwReason ) { + case DLL_PROCESS_ATTACH: + hwiniconv = (HMODULE)hinstDLL; + break; + case DLL_THREAD_ATTACH: + case DLL_THREAD_DETACH: + case DLL_PROCESS_DETACH: + break; 
+ } + return TRUE; +} +#endif + +#if defined(MAKE_EXE) +#include +#include +#include +int main(int argc, char **argv) { + char *fromcode = NULL; + char *tocode = NULL; + int i; + char inbuf[BUFSIZ]; + char outbuf[BUFSIZ]; + const char *pin; + char *pout; + size_t inbytesleft; + size_t outbytesleft; + size_t rest = 0; + iconv_t cd; + size_t r; + FILE *in = stdin; + FILE *out = stdout; + int ignore = 0; + char *p; + + _setmode(_fileno(stdin), _O_BINARY); + _setmode(_fileno(stdout), _O_BINARY); + + for (i = 1; i < argc; ++i) { + if (strcmp(argv[i], "-l") == 0) { + for (i = 0; codepage_alias[i].name != NULL; ++i) + printf("%s\n", codepage_alias[i].name); + return 0; + } + + if (strcmp(argv[i], "-f") == 0) { + fromcode = argv[++i]; + } else if (strcmp(argv[i], "-t") == 0) { + tocode = argv[++i]; + } else if (strcmp(argv[i], "-c") == 0) { + ignore = 1; + } else if (strcmp(argv[i], "--output") == 0) { + out = fopen(argv[++i], "wb"); + if (out == NULL) { + fprintf(stderr, "cannot open %s\n", argv[i]); + return 1; + } + } else { + in = fopen(argv[i], "rb"); + if (in == NULL) { + fprintf(stderr, "cannot open %s\n", argv[i]); + return 1; + } + break; + } + } + + if (fromcode == NULL || tocode == NULL) { + printf("usage: %s [-c] -f from-enc -t to-enc [file]\n", argv[0]); + return 0; + } + + if (ignore) { + p = tocode; + tocode = (char *)malloc(strlen(p) + strlen("//IGNORE") + 1); + if (tocode == NULL) { + perror("fatal error"); + return 1; + } + strcpy(tocode, p); //NOLINT + strcat(tocode, "//IGNORE"); //NOLINT + } + + cd = iconv_open(tocode, fromcode); + if (cd == (iconv_t)(-1)) { + perror("iconv_open error"); + return 1; + } + + while ((inbytesleft = fread( + inbuf + rest, 1, + sizeof(inbuf) - rest, in)) != 0 + || rest != 0) { + inbytesleft += rest; + pin = inbuf; + pout = outbuf; + outbytesleft = sizeof(outbuf); + r = iconv(cd, &pin, &inbytesleft, &pout, &outbytesleft); + fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out); + if (r == (size_t)(-1) && + errno != E2BIG && + 
(errno != EINVAL || feof(in))) { + perror("conversion error"); + return 1; + } + memmove(inbuf, pin, inbytesleft); + rest = inbytesleft; + } + pout = outbuf; + outbytesleft = sizeof(outbuf); + r = iconv(cd, NULL, NULL, &pout, &outbytesleft); + fwrite(outbuf, 1, sizeof(outbuf) - outbytesleft, out); + if (r == (size_t)(-1)) { + perror("conversion error"); + return 1; + } + + iconv_close(cd); + + return 0; +} +#endif diff --git a/pandas/_libs/src/librdata/win_iconv.h b/pandas/_libs/src/librdata/win_iconv.h new file mode 100644 index 0000000000000..da6e9fa4ab96a --- /dev/null +++ b/pandas/_libs/src/librdata/win_iconv.h @@ -0,0 +1,48 @@ +/* + +win-iconv - iconv implementation using Win32 API to convert. +Written in 2009-2016 by Yukihiro Nakadaira +and contributors to win-iconv + +To the extent possible under law, the author(s) have dedicated all copyright +and related and neighboring rights to this software to the public domain +worldwide. This software is distributed without any warranty. + +You should have received a copy of the CC0 Public Domain Dedication along with +this software. If not, see http://creativecommons.org/publicdomain/zero/1.0/. 
    @doc(storage_options=generic._shared_docs["storage_options"])
    def to_rdata(
        self,
        path_or_buffer: FilePathOrBuffer,
        file_format: str = "infer",
        rda_name: str = "pandas_dataframe",
        index: bool = True,
        compression: CompressionOptions = "gzip",
        storage_options: StorageOptions = None,
    ) -> None:
        """
        Render a DataFrame to R data (.RData, .rda, .rds).

        .. versionadded:: 1.3.0

        Parameters
        ----------
        path_or_buffer : a valid str, path object or file-like object
            Any valid string path is acceptable.

        file_format : {{'infer', 'rda', 'rdata', 'rds'}}, default 'infer'
            R serialization type generated from native commands: base::save
            (that saves multiple objects) or base::saveRDS (that saves a
            single object to disk). Default 'infer' will use extension in file
            name to determine the format type.

        rda_name : str, default "pandas_dataframe"
            Name for R data.frame in RData/rda file.

        index : bool, default True
            Include index or MultiIndex in output as separate columns. Since
            DataFrame indexes can include multiple columns and R rownames can
            only include one column, DataFrame index will not map to R
            data.frame rownames.

        compression : {{'gzip', 'bz2', 'xz', None}}, default 'gzip'
            Compression type passed to the writer for the output file.

        {storage_options}

        Raises
        ------
        LibrdataWriterError
            * If DataFrame types or values do not conform to R data types.

        See Also
        --------
        to_stata : Convert DataFrame to a Stata dataset.

        Notes
        -----
        For more information on R serialization data types, see docs on
        rda_ and rds_ formats.

        .. _rda: https://www.rdocumentation.org/packages/base/versions/3.6.2/\
topics/save
        .. _rds: https://www.rdocumentation.org/packages/base/versions/3.6.2/\
topics/readRDS

        Examples
        --------
        To save an .rds file which only contains a single DataFrame:

        >>> ghg_df = pd.DataFrame(
        ...     {{'gas': ['Carbon dioxide', 'Methane',
        ...               'Nitrous oxide',
        ...               'Fluorinated gases',
        ...               'Total'],
        ...       'year': [2018, 2018, 2018, 2018, 2018],
        ...       'emissions': [5424.88, 634.46, 434.53,
        ...                     182.78, 6676.65]
        ...       }})
        >>> ghg_df.to_rdata("ghg_df.rds")  # doctest: +SKIP

        >>> R_code = '''
        ... ghg_df <- readRDS("ghg_df.rds")
        ... ghg_df
        ...   index               gas year emissions
        ... 1     0    Carbon dioxide 2018   5424.88
        ... 2     1           Methane 2018    634.46
        ... 3     2     Nitrous oxide 2018    434.53
        ... 4     3 Fluorinated gases 2018    182.78
        ... 5     4             Total 2018   6676.65
        ... '''

        To save an .RData or .rda file:

        >>> plants_df = pd.DataFrame(
        ...     {{'plant_group': ['Pteridophytes',
        ...                       'Pteridophytes',
        ...                       'Pteridophytes',
        ...                       'Pteridophytes',
        ...                       'Pteridophytes'],
        ...       'status': ['Data Deficient',
        ...                  'Extinct',
        ...                  'Not Threatened',
        ...                  'Possibly Threatened',
        ...                  'Threatened'],
        ...       'count': [398, 65, 1294, 408, 1275]
        ...       }})
        >>> plants_df.to_rdata(
        ...     "plants_df.rda",
        ...     rda_name="plants_df",
        ... )  # doctest: +SKIP

        >>> R_code = '''
        ... load("plants_df.rda")
        ...
        ... mget(ls())
        ... $plants_df
        ...   index   plant_group              status count
        ... 1     0 Pteridophytes      Data Deficient   398
        ... 2     1 Pteridophytes             Extinct    65
        ... 3     2 Pteridophytes      Not Threatened  1294
        ... 4     3 Pteridophytes Possibly Threatened   408
        ... 5     4 Pteridophytes          Threatened  1275
        ... '''
        """
        # Imported here rather than at module level; all arguments are
        # forwarded unchanged to the writer, which does the actual work.
        from pandas.io.rdata.rdata_writer import RDataWriter

        RDataWriter(
            self,
            path_or_buffer=path_or_buffer,
            file_format=file_format,
            rda_name=rda_name,
            index=index,
            compression=compression,
            storage_options=storage_options,
        ).write_data()
RDATA_ERROR_SEEK, + RDATA_ERROR_READ, + RDATA_ERROR_MALLOC, + RDATA_ERROR_USER_ABORT, + RDATA_ERROR_PARSE, + RDATA_ERROR_WRITE, + RDATA_ERROR_FACTOR, + RDATA_ERROR_UNSUPPORTED_COMPRESSION, + RDATA_ERROR_UNSUPPORTED_CHARSET, + RDATA_ERROR_CONVERT, + RDATA_ERROR_CONVERT_BAD_STRING, + RDATA_ERROR_CONVERT_LONG_STRING, + RDATA_ERROR_CONVERT_SHORT_STRING, + RDATA_ERROR_UNSUPPORTED_S_EXPRESSION, + RDATA_ERROR_UNSUPPORTED_STORAGE_CLASS + + ctypedef enum rdata_file_format_t: + RDATA_WORKSPACE, + RDATA_SINGLE_OBJECT + + cdef const char *rdata_error_message(rdata_error_t error_code) + + ctypedef int (*rdata_column_handler)( + const char *name, rdata_type_t type, + void *data, long count, void *ctx + ) except * + ctypedef int ( + *rdata_table_handler)(const char *name, void *ctx + ) except * + ctypedef int ( + *rdata_text_value_handler)(const char *value, int index, void *ctx + ) except * + ctypedef int ( + *rdata_column_name_handler)(const char *value, int index, void *ctx + ) except * + ctypedef void (*rdata_error_handler)(const char *error_message, void *ctx) + ctypedef int (*rdata_progress_handler)(double progress, void *ctx) + + IF UNAME_SYSNAME == "AIX": + ctypedef off64_t rdata_off_t + ELSE: + ctypedef off_t rdata_off_t + + # Read API + + ctypedef enum rdata_io_flags_t: + RDATA_SEEK_SET, + RDATA_SEEK_CUR, + RDATA_SEEK_END + + ctypedef int (*rdata_open_handler)(const char *path, void *io_ctx) + ctypedef int (*rdata_close_handler)(void *io_ctx) + ctypedef rdata_off_t ( + *rdata_seek_handler + )(rdata_off_t offset, rdata_io_flags_t whence, void *io_ctx) + ctypedef ssize_t ( + *rdata_read_handler + )(void *buf, size_t nbyte, void *io_ctx) + ctypedef rdata_error_t ( + *rdata_update_handler + )( + long file_size, + rdata_progress_handler progress_handler, + void *user_ctx, + void *io_ctx + ) + + ctypedef struct rdata_io_t: + rdata_open_handler open + rdata_close_handler close + rdata_seek_handler seek + rdata_read_handler read + rdata_update_handler update + void *io_ctx + 
int external_io + + ctypedef struct rdata_parser_t: + rdata_table_handler table_handler + rdata_column_handler column_handler + rdata_column_name_handler column_name_handler + rdata_column_name_handler row_name_handler + rdata_text_value_handler text_value_handler + rdata_text_value_handler value_label_handler + rdata_column_handler dim_handler + rdata_text_value_handler dim_name_handler + rdata_error_handler error_handler + rdata_io_t *io + + cdef rdata_parser_t *rdata_parser_init() + cdef void rdata_parser_free(rdata_parser_t *parser) + + cdef rdata_error_t rdata_set_table_handler( + rdata_parser_t *parser, rdata_table_handler table_handler + ) + cdef rdata_error_t rdata_set_column_handler( + rdata_parser_t *parser, rdata_column_handler column_handler + ) + cdef rdata_error_t rdata_set_column_name_handler( + rdata_parser_t *parser, rdata_column_name_handler column_name_handler + ) + cdef rdata_error_t rdata_set_row_name_handler( + rdata_parser_t *parser, rdata_column_name_handler row_name_handler + ) + cdef rdata_error_t rdata_set_text_value_handler( + rdata_parser_t *parser, rdata_text_value_handler text_value_handler + ) + cdef rdata_error_t rdata_set_value_label_handler( + rdata_parser_t *parser, rdata_text_value_handler value_label_handler + ) + cdef rdata_error_t rdata_set_dim_handler( + rdata_parser_t *parser, rdata_column_handler dim_handler + ) + cdef rdata_error_t rdata_set_dim_name_handler( + rdata_parser_t *parser, rdata_text_value_handler dim_name_handler + ) + cdef rdata_error_t rdata_set_error_handler( + rdata_parser_t *parser, rdata_error_handler error_handler + ) + cdef rdata_error_t rdata_set_open_handler( + rdata_parser_t *parser, rdata_open_handler open_handler + ) + cdef rdata_error_t rdata_set_close_handler( + rdata_parser_t *parser, rdata_close_handler close_handler + ) + cdef rdata_error_t rdata_set_seek_handler( + rdata_parser_t *parser, rdata_seek_handler seek_handler + ) + cdef rdata_error_t rdata_set_read_handler( + rdata_parser_t 
*parser, rdata_read_handler read_handler + ) + cdef rdata_error_t rdata_set_update_handler( + rdata_parser_t *parser, rdata_update_handler update_handler + ) + cdef rdata_error_t rdata_set_io_ctx( + rdata_parser_t *parser, void *io_ctx + ) + cdef rdata_error_t rdata_parse( + rdata_parser_t *parser, const char *filename, void *user_ctx + ) + + # Write API + ctypedef ssize_t ( + *rdata_data_writer)(const void *data, size_t len, void *ctx + ) + + ctypedef struct rdata_column_t: + rdata_type_t type + int index + char name[256] + char label[1024] + + int32_t factor_count + char **factor + + ctypedef struct rdata_writer_t: + rdata_file_format_t file_format + rdata_data_writer data_writer + size_t bytes_written + + rdata_error_handler error_handler + void *user_ctx + + void *atom_table + int bswap + + rdata_column_t **columns + int32_t columns_count + int32_t columns_capacity + + cdef rdata_writer_t *rdata_writer_init( + rdata_data_writer write_callback, rdata_file_format_t format + ) + cdef void rdata_writer_free(rdata_writer_t *writer) + + cdef rdata_column_t *rdata_add_column( + rdata_writer_t *writer, const char *name, rdata_type_t type + ) + + cdef rdata_error_t rdata_column_set_label( + rdata_column_t *column, const char *label + ) + cdef rdata_error_t rdata_column_add_factor( + rdata_column_t *column, const char *factor + ) + + cdef rdata_column_t *rdata_get_column(rdata_writer_t *writer, int32_t j) + + cdef rdata_error_t rdata_begin_file(rdata_writer_t *writer, void *ctx) + cdef rdata_error_t rdata_begin_table( + rdata_writer_t *writer, const char *variable_name + ) + cdef rdata_error_t rdata_begin_column( + rdata_writer_t *writer, rdata_column_t *column, int32_t row_count + ) + + cdef rdata_error_t rdata_append_real_value( + rdata_writer_t *writer, double value + ) + cdef rdata_error_t rdata_append_int32_value( + rdata_writer_t *writer, int32_t value + ) + cdef rdata_error_t rdata_append_timestamp_value( + rdata_writer_t *writer, time_t value + ) + cdef 
cdef int handle_table(const char *name, void *ctx) except *:
    """
    Retrieves original R object name.

    Called once per data frame in RData files,
    and zero times on RDS files.

    Resets the per-table parsing state on the reader and makes sure
    ``rvalues`` has an entry for the table about to be parsed.

    Returns 0 so that parsing continues.
    """
    # ctx is the reader object that read_rdata hands to rdata_parse as
    # user context.
    lbr = ctx

    # Start every table with clean per-table state.
    lbr.colidx = 0
    lbr.rows = 0
    lbr.rlevels = {}
    lbr.rtext = {}
    lbr.is_factor = False
    lbr.rownames = {}
    lbr.colnames = {}
    lbr.dims = 0
    lbr.dim_str = {}

    # A NULL name keeps whatever table name the reader currently holds.
    if name != NULL:
        lbr.tblname = name

    if "r_dataframe" in lbr.rvalues.keys():
        # Re-key the placeholder entry created in read_rdata under the
        # table's name instead of creating a second entry.
        lbr.rvalues[lbr.tblname] = lbr.rvalues.pop("r_dataframe")
    else:
        lbr.rvalues[lbr.tblname] = {
            "data": {},
            "dtypes": {},
            "colnames": None,
            "rownames": None
        }
    return 0    # non-zero to abort processing
cdef int handle_dim(
    const char *name,
    rdata_type_t dtype,
    void *data,
    long count,
    void *ctx
) except *:
    """
    Parses meta data on non-dataframe objects.

    Called once for objects with R dims (matrices, arrays, etc.).
    Special condition for character matrices.

    Records the number of dimensions (``dims``) and their sizes
    (``dim_str``) on the reader. When the current table holds exactly one
    column of dtype "str", that column is split into ``cols`` columns of
    ``rows`` values each.

    Returns 0 so that parsing continues.
    """
    # ctx is the reader object that read_rdata hands to rdata_parse as
    # user context.
    lbr = ctx

    # data is read as `count` C ints, one per dimension.
    cdef int *rdims = data

    lbr.dims = count
    lbr.dim_str = {i: rdims[i] for i in range(count)}

    if lbr.rvalues[lbr.tblname]["dtypes"] == {0: "str"}:
        # Flat list of the values parsed so far for the single text column.
        dim_data = list(lbr.rvalues[lbr.tblname]["data"][0].values())

        n = 0
        # NOTE(review): this unpacking only works for exactly two
        # dimensions; any other count raises ValueError -- confirm that
        # higher-rank character arrays are not expected here.
        rows, cols = lbr.dim_str.values()

        # Consecutive runs of `rows` values become one column each.
        for col in range(cols):
            lbr.rvalues[lbr.tblname]["dtypes"][col] = "str"
            # Stored as a list slice here, not as an index -> value dict.
            lbr.rvalues[lbr.tblname]["data"][col] = dim_data[n:n+rows]
            n += rows

    return 0
cdef class LibrdataReader:
    """
    Base class to read RData files.

    Class interfaces with librdata C library to build dictionaries
    of each data frame including data content and meta (dtypes, colnames,
    and rownames). Callbacks above are used in ``rdata_`` method attributes.
    """
    # Parser handle; created and freed inside read_rdata.
    cdef rdata_parser_t *rparser
    # Parsing state, public so the module-level callbacks can update it
    # through the reader object they receive as context.
    cdef public:
        int colidx
        int rows
        dict rlevels
        dict rtext
        bint is_factor
        dict rownames
        dict colnames
        dict rtypes
        str tblname
        dict rvalues
        int dims
        dict dim_str

    def read_rdata(self, rfile):
        """
        Parse ``rfile`` and return the collected tables.

        Returns ``self.rvalues``: a dict keyed by table name whose values
        are dicts with "data", "dtypes", "colnames" and "rownames" entries.

        Raises LibrdataReaderError with librdata's message when the parse
        does not return RDATA_OK.
        """
        self.rparser = rdata_parser_init()

        # Reset all parsing state so the reader can be reused.
        self.colidx = 0
        self.rows = 0
        self.rlevels = {}
        self.rtext = {}
        self.is_factor = False
        self.rownames = {}
        self.colnames = {}
        self.dims = 0
        self.dim_str = {}
        # librdata column type -> dtype label stored under "dtypes".
        self.rtypes = {
            rdata_type_t.RDATA_TYPE_LOGICAL: "bool",
            rdata_type_t.RDATA_TYPE_INT32: "int",
            rdata_type_t.RDATA_TYPE_REAL: "float",
            rdata_type_t.RDATA_TYPE_DATE: "date",
            rdata_type_t.RDATA_TYPE_TIMESTAMP: "datetime",
            rdata_type_t.RDATA_TYPE_STRING: "str"
        }
        # Placeholder table, used until a table callback supplies a name.
        self.tblname = "r_dataframe"
        self.rvalues = {
            "r_dataframe": {
                "data": {},
                "dtypes": {},
                "colnames": None,
                "rownames": None
            }
        }

        err = RDATA_OK
        # The trailing `break` means this body runs at most once. Each
        # assignment to `err` overwrites the previous one, so only the
        # result of rdata_parse reaches the check after the loop.
        # NOTE(review): failures of the individual setters are therefore
        # not detected -- confirm this is intended.
        while err == RDATA_OK:
            err = rdata_set_table_handler(self.rparser, handle_table)
            err = rdata_set_dim_handler(self.rparser, handle_dim)
            err = rdata_set_column_handler(self.rparser, handle_column)
            err = rdata_set_text_value_handler(self.rparser, handle_text_value)
            err = rdata_set_value_label_handler(self.rparser, handle_value_label)
            err = rdata_set_column_name_handler(self.rparser, handle_column_name)
            err = rdata_set_row_name_handler(self.rparser, handle_row_name)
            err = rdata_set_dim_name_handler(self.rparser, handle_dim_name)

            # `self` is the user context the callbacks receive as `ctx`.
            err = rdata_parse(self.rparser, rfile, self)
            rdata_parser_free(self.rparser)
            break

        if err != RDATA_OK:
            msg = rdata_error_message(err)
            raise LibrdataReaderError(msg)

        return self.rvalues

    cdef bytes get_rparser(self):
        # Returns sizeof(rdata_parser_t) * length bytes starting at
        # self.rparser.
        # NOTE(review): read_rdata has already passed this pointer to
        # rdata_parser_free, and the slice covers `length` structs rather
        # than one -- confirm this read is valid.
        return (self.rparser)[:sizeof(rdata_parser_t)*length]

    def __reduce__(self):
        # Pickle support: the parser bytes are handed to rebuild_reader,
        # which returns a fresh LibrdataReader without using them.
        rparser = self.get_rparser()
        return (rebuild_reader, (rparser,))
cdef class LibrdataWriter():
    """
    Base class to write RData files.

    Class interfaces with librdata C library to iterate through dictionaries
    of each DataFrame column according to corresponding dtype.
    Single callback above is used in exposed ``init`` method.
    """
    cdef:
        int fd                      # descriptor of the output file
        int row_count               # number of rows in every column
        dict rdict                  # {"data": {...}, "dtypes": {...}}
        dict rformats               # file format name -> rdata_file_format_t
        dict rtypes                 # dtype label -> rdata_type_t
        bytes file_name
        bytes tbl_name
        rdata_writer_t *writer
        rdata_column_t *py_col

    cdef write_col_data(self, i, kdata, vdata, ktype, vtype):
        """
        Write column number ``i``: every value of the mapping ``vdata`` is
        appended with the librdata call matching ``vtype``.

        ``kdata`` and ``ktype`` are not used in the body.
        """
        # NOTE(review): this assigns a local name, not self.py_col, so the
        # attribute read by get_py_col is never set here -- confirm.
        py_col = rdata_get_column(self.writer, i)
        rdata_begin_column(self.writer, py_col, self.row_count)

        if vtype == RDATA_TYPE_LOGICAL:
            for k, v in vdata.items():
                rdata_append_logical_value(self.writer, v)

        if vtype == RDATA_TYPE_INT32:
            for k, v in vdata.items():
                rdata_append_int32_value(self.writer, v)

        if vtype == RDATA_TYPE_REAL:
            for k, v in vdata.items():
                rdata_append_real_value(self.writer, v)

        if vtype == RDATA_TYPE_TIMESTAMP:
            for k, v in vdata.items():
                rdata_append_timestamp_value(self.writer, v)

        if vtype == RDATA_TYPE_STRING:
            for k, v in vdata.items():
                # v == v is False only for NaN-like values, which are
                # written as NULL strings.
                if v == v:
                    rdata_append_string_value(self.writer, v)
                else:
                    rdata_append_string_value(self.writer, NULL)

        rdata_end_column(self.writer, py_col)

    def write_rdata(self, rfile, rdict, rformat, tbl_name=None):
        """
        Write the columns described by ``rdict`` to the file ``rfile``.

        ``rdict`` must provide "data" (column -> mapping of values) and
        "dtypes" (column -> one of the keys of ``self.rtypes``).
        ``rformat`` must be one of "rdata", "rda" or "rds".

        Raises LibrdataWriterError when writing a column raises TypeError,
        ValueError or UnicodeDecodeError.
        """

        self.rdict = rdict
        self.file_name = rfile.encode("utf-8")
        # NOTE(review): tbl_name defaults to None but is encoded
        # unconditionally, so the default raises AttributeError -- confirm
        # callers always pass a name.
        self.tbl_name = tbl_name.encode("utf-8")
        # Row count is taken from the first column only.
        self.row_count = len(next(iter(rdict["data"].items()))[1])

        self.rformats = {
            "rdata": RDATA_WORKSPACE,
            "rda": RDATA_WORKSPACE,
            "rds": RDATA_SINGLE_OBJECT
        }

        self.rtypes = {
            "bool": RDATA_TYPE_LOGICAL,
            "int": RDATA_TYPE_INT32,
            "float": RDATA_TYPE_REAL,
            "datetime": RDATA_TYPE_TIMESTAMP,
            "object": RDATA_TYPE_STRING
        }

        # NOTE(review): the descriptor is not checked for failure, and the
        # flags contain no truncate flag -- confirm the behaviour when the
        # target file already exists.
        IF UNAME_SYSNAME == "Windows":
            self.fd = _sopen(
                self.file_name,
                _O_CREAT | _O_WRONLY | _O_BINARY | _O_U8TEXT,
                _SH_DENYNO,
                _S_IREAD | _S_IWRITE
            )
        ELSE:
            self.fd = open(self.file_name, O_CREAT | O_WRONLY, 0644)

        self.writer = rdata_writer_init(write_data, self.rformats[rformat])

        # Declare every column before any data is written. A dtype label
        # missing from self.rtypes raises KeyError here, outside the
        # try block below.
        for k, v in self.rdict["dtypes"].items():
            rdata_add_column(self.writer, k, self.rtypes[v])

        # The address of the descriptor is the context that write_data
        # receives on every call. Return codes of the rdata_* calls in this
        # method are not inspected.
        rdata_begin_file(self.writer, &self.fd)
        rdata_begin_table(self.writer, self.tbl_name)

        try:
            # "data" and "dtypes" are walked in lockstep, so they must list
            # the columns in the same order.
            for n, ((kd, vd), (kt, vt)) in enumerate(
                zip(
                    self.rdict["data"].items(),
                    self.rdict["dtypes"].items()
                )
            ):
                self.write_col_data(n, kd, vd, kt, self.rtypes[vt])

        except (TypeError, ValueError, UnicodeDecodeError):
            # NOTE(review): the file is closed here but rdata_writer_free
            # is not called on this path -- confirm.
            self.close_rdata()
            raise LibrdataWriterError(
                "DataFrame contains one more invalid types or data values. "
                "that does not conform to R data types."
            )

        rdata_end_table(self.writer, self.row_count, "pandas_dataframe")
        rdata_end_file(self.writer)

        self.close_rdata()
        rdata_writer_free(self.writer)

    cdef close_rdata(self):
        # Close the output descriptor with the platform's close call.
        IF UNAME_SYSNAME == "Windows":
            _close(self.fd)
        ELSE:
            close(self.fd)

    cdef bytes get_writer(self):
        # Returns sizeof(rdata_writer_t) * length bytes starting at
        # self.writer.
        # NOTE(review): write_rdata frees the writer before returning --
        # confirm this read is valid.
        return (self.writer)[:sizeof(rdata_writer_t)*length]

    cdef bytes get_py_col(self):
        # Returns sizeof(rdata_column_t) * length bytes starting at
        # self.py_col.
        return (self.py_col)[:sizeof(rdata_column_t)*length]

    def __reduce__(self):
        # Pickle support: both byte strings are handed to rebuild_writer,
        # which returns a fresh LibrdataWriter without using them.
        writer = self.get_writer()
        py_col = self.get_py_col()
        return (rebuild_writer, (writer, py_col))
+ +This IO module interfaces with the librdata C library by Evan Miller: + https://github.com/WizardMac/librdata +""" +from __future__ import annotations + +import io +import os +from tempfile import TemporaryDirectory + +from pandas._typing import ( + Buffer, + CompressionOptions, + FilePathOrBuffer, + StorageOptions, +) +from pandas.util._decorators import doc + +from pandas.core.dtypes.common import is_list_like + +from pandas.core.api import to_datetime +from pandas.core.arrays import Categorical +from pandas.core.frame import ( + DataFrame, + Index, + Series, +) +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + file_exists, + get_handle, + is_fsspec_url, + is_url, + stringify_path, +) +from pandas.io.rdata._rdata import LibrdataReader + + +@doc(storage_options=_shared_docs["storage_options"]) +def read_rdata( + path_or_buffer: FilePathOrBuffer, + file_format: str = "infer", + select_frames: list[str] | None = None, + rownames: bool = True, + compression: CompressionOptions = "gzip", + storage_options: StorageOptions = None, +) -> dict[str, DataFrame]: + r""" + Read R data (.RData, .rda, .rds) into a ``dict`` of DataFrames. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : str, path object, or file-like object + Any valid file path is acceptable. The string could be a URL. + Valid URL schemes include http, ftp, s3, and file. + + file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer' + R serialization type as output from R's base::save or base::saveRDS + commands. Default 'infer' will use extension in file name + to determine the format type. + + select_frames : list, default returns all DataFrames + Selected names of DataFrames to return from RData and rda types that + can contain multiple objects. + + rownames : bool, default True + Include original rownames in R data frames to map into a DataFrame index.
+ + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'gzip' + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2, zip or xz if path_or_buffer is a string ending in + '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression + otherwise. If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. This method will + default to 'gzip' since 'gzip' is the default compression in R for + RData and rds types. + + {storage_options} + + Returns + ------- + Dict of DataFrames + Depends on R data type where rds format returns a ``dict`` of a single + DataFrame and RData or rda formats can return ``dict`` of one or more + DataFrames. + + See Also + -------- + read_sas : Read SAS datasets into DataFrame. + read_stata : Read Stata datasets into DataFrame. + read_spss : Read SPSS datasets into DataFrame. + + Notes + ----- + Any R data file that contains a non-data.frame object may raise parsing errors. + Method will return data.frame and data.frame like objects such as tibbles and + data.tables. For more information on R serialization data types, see docs on + `rds`__ + and `rda`__ + formats. + + Examples + -------- + For an .rds file which only contains a single R object, method returns a + ``dict`` of a single DataFrame: + + >>> R_code = ''' + ... ghg_df <- data.frame( + ... gas = c('Carbon dioxide', + ... 'Methane', + ... 'Nitrous oxide', + ... 'Fluorinated gases', + ... 'Total'), + ... year = c(2018, + ... 2018, + ... 2018, + ... 2018, + ... 2018), + ... emissions = c(5424.88, + ... 634.46, + ... 434.53, + ... 182.78, + ... 6676.65) + ... ) + ... saveRDS(ghg_df, file="ghg_df.rds") + ...
''' + + >>> ghg_df = pd.read_rdata("ghg_df.rds") # doctest: +SKIP + >>> ghg_df # doctest: +SKIP + {{'r_dataframe': + gas year emissions + rownames + 1 Carbon dioxide 2018 5424.88 + 2 Methane 2018 634.46 + 3 Nitrous oxide 2018 434.53 + 4 Fluorinated gases 2018 182.79 + 5 Total 2018 6676.65}} + + For an .RData or .rda file which can contain multiple R objects, method + returns a ``dict`` of DataFrames: + + >>> R_code = ''' + ... plants_df <- data.frame( + ... plant_group = c('Pteridophytes', + ... 'Pteridophytes', + ... 'Pteridophytes', + ... 'Pteridophytes', + ... 'Pteridophytes'), + ... status = c('Data Deficient', + ... 'Extinct', + ... 'Not Threatened', + ... 'Possibly Threatened', + ... 'Threatened'), + ... count = c(398, 65, 1294, 408, 1275) + ... ) + ... sea_ice_df <- data.frame( + ... year = c(2016, 2017, 2018, 2019, 2020), + ... mo = c(12, 12, 12, 12, 12), + ... data.type = c('Goddard', + ... 'Goddard', + ... 'Goddard', + ... 'Goddard', + ... 'NRTSI-G'), + ... region = c('S', 'S', 'S', 'S', 'S'), + ... extent = c(8.28, 9.48, 9.19, 9.41, 10.44), + ... area = c(5.51, 6.23, 5.59, 6.59, 6.5) + ... ) + ... save(ghg_df, plants_df, sea_ice_df, file="env_data_dfs.rda") + ...
''' + + >>> env_dfs = pd.read_rdata("env_data_dfs.rda") # doctest: +SKIP + >>> env_dfs # doctest: +SKIP + {{'ghg_df': + gas year emissions + rownames + 1 Carbon dioxide 2018 5424.88 + 2 Methane 2018 634.46 + 3 Nitrous oxide 2018 434.53 + 4 Fluorinated gases 2018 182.79 + 5 Total 2018 6676.65, + 'plants_df': + plant_group status count + rownames + 1 Pteridophytes Data Deficient 398 + 2 Pteridophytes Extinct 65 + 3 Pteridophytes Not Threatened 1294 + 4 Pteridophytes Possibly Threatened 408 + 5 Pteridophytes Threatened 1275, + 'sea_ice_df': + year mo data.type region extent area + rownames + 1 2016 12 Goddard S 8.28 5.51 + 2 2017 12 Goddard S 9.48 6.23 + 3 2018 12 Goddard S 9.19 5.59 + 4 2019 12 Goddard S 9.41 6.59 + 5 2020 12 NRTSI-G S 10.44 6.50}} + """ + + rdr = _RDataReader( + path_or_buffer, + file_format, + select_frames, + rownames, + compression, + storage_options, + ) + + r_dfs = rdr.parse_data() + + return r_dfs + + +def get_data_from_filepath( + filepath_or_buffer, + encoding, + compression, + storage_options, +) -> str | bytes | Buffer: + """ + Extract raw R data. + + The method accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, BytesIO) + 3. R data file in ascii or binary content + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. 
+ """ + filepath_or_buffer = stringify_path(filepath_or_buffer) + + if ( + not isinstance(filepath_or_buffer, str) + or is_url(filepath_or_buffer) + or is_fsspec_url(filepath_or_buffer) + or file_exists(filepath_or_buffer) + ): + with get_handle( + filepath_or_buffer, + "rb", + encoding=encoding, + compression=compression, + storage_options=storage_options, + is_text=False, + ) as handle_obj: + filepath_or_buffer = ( + handle_obj.handle.read() + if hasattr(handle_obj.handle, "read") + else handle_obj.handle + ) + else: + raise FileNotFoundError(f"{filepath_or_buffer} file cannot be found.") + + return filepath_or_buffer + + +def preprocess_data(data) -> io.StringIO | io.BytesIO: + """ + Convert extracted raw data. + + This method will return underlying data of extracted R data formats. + The data either has a `read` attribute (e.g. a file object or a + StringIO/BytesIO) or is bytes that represents the R data. + """ + + if isinstance(data, str): + data = io.StringIO(data) + + elif isinstance(data, bytes): + data = io.BytesIO(data) + + return data + + +class _RDataReader: + """ + Internal subclass to parse R data files into dict of DataFrames. + + Parameters + ---------- + path_or_buffer : a valid str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. + + file_format : {{'infer', 'rdata', 'rda', 'rds'}}, default 'infer' + R serialization type. + + select_frames : list, default None + Selected names of DataFrames to return from R data. + + rownames : bool, default True + Include original rownames in R data frames. + + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + Compression type for on-the-fly decompression of on-disk data. + If 'infer', then use extension for gzip, bz2, zip or xz. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc. 
+ """ + + def __init__( + self, + path_or_buffer, + file_format, + select_frames, + rownames, + compression, + storage_options, + ) -> None: + self.path_or_buffer = path_or_buffer + self.file_format = file_format.lower() + self.select_frames = select_frames + self.rownames = rownames + self.compression = compression + self.storage_options = storage_options + self.verify_params() + + def verify_params(self) -> None: + """ + Verify user entries of parameters. + + This method will check the values and types of select parameters + and raise appropriate errors. + """ + + path_ext: str | None = ( + os.path.splitext(self.path_or_buffer.lower())[1][1:] + if isinstance(self.path_or_buffer, str) + else None + ) + + if self.file_format not in ["infer", "rdata", "rda", "rds"]: + raise ValueError( + f"'{self.file_format}' is not a valid value for file_format" + ) + + if ( + self.file_format == "infer" + and isinstance(self.path_or_buffer, str) + and path_ext not in ["rdata", "rda", "rds"] + ) or (self.file_format == "infer" and not isinstance(self.path_or_buffer, str)): + raise ValueError( + f"Unable to infer file format from file name: {self.path_or_buffer}. " + "Please use known R data type (rdata, rda, rds)." + ) + + if self.file_format == "infer" and isinstance(path_ext, str): + self.file_format = path_ext + + if self.select_frames is not None and not is_list_like(self.select_frames): + raise TypeError( + f"{type(self.select_frames).__name__} is " + "not a valid type for select_frames" + ) + + def buffer_to_disk(self, tmp_dir: str) -> str: + """ + Convert path or buffer to disk file. + + This method will convert path_or_buffer to temp file + to parse RData from disk. 
+ """ + + r_temp = os.path.join(tmp_dir, "rdata.rda") + + handle_data = get_data_from_filepath( + filepath_or_buffer=self.path_or_buffer, + encoding="utf-8", + compression=self.compression, + storage_options=self.storage_options, + ) + + with preprocess_data(handle_data) as r_data: + if isinstance(r_data, io.BytesIO): + with open(r_temp, "wb") as f: + f.write(r_data.read()) + + return r_temp + + def build_frame(self, data_dict: dict) -> DataFrame: + """ + Builds DataFrame from raw, nested parsed RData dict. + + Converts special class variables (bools, factors, dates, datetimes), + then binds all columns together with DataFrame constructor. + """ + + final_dict = { + k: Series(v) + for k, v in data_dict["data"].items() + if k not in ["dtypes", "colnames", "rownames"] + } + + rdf = DataFrame(data=final_dict) + + for col, dtype in data_dict["dtypes"].items(): + if dtype == "bool": + rdf[col] = rdf[col].astype(bool) + + if dtype == "factor": + rdf[col] = Categorical(rdf[col]) + + if dtype == "date": + rdf[col] = to_datetime(rdf[col], unit="d") + + if dtype == "datetime": + rdf[col] = to_datetime(rdf[col], unit="s") + + colnames = ( + None + if data_dict["colnames"] is None + else list(data_dict["colnames"].values()) + ) + if colnames is not None: + rdf.columns = Index(colnames) + + rownames = ( + None + if data_dict["rownames"] is None + else list(data_dict["rownames"].values()) + ) + if self.rownames: + if rownames is not None: + rdf.index = Index(rownames) + else: + rdf.index += 1 + rdf.index.name = "rownames" + + return rdf + + def parse_data(self) -> dict[str, DataFrame]: + """ + Parse R data files into DataFrames + + This method will retrieve dictionary of R data and build + DataFrame for each item in data file + """ + + lbr = LibrdataReader() + + with TemporaryDirectory() as tmp_dir: + r_temp = self.buffer_to_disk(tmp_dir) + rdict = lbr.read_rdata(r_temp) + + r_dfs = {k: self.build_frame(v) for k, v in rdict.items()} + + if self.select_frames: + r_dfs = {k: v for 
k, v in r_dfs.items() if k in self.select_frames} + + return r_dfs diff --git a/pandas/io/rdata/rdata_writer.py b/pandas/io/rdata/rdata_writer.py new file mode 100644 index 0000000000000..5bb5b4f3f3b90 --- /dev/null +++ b/pandas/io/rdata/rdata_writer.py @@ -0,0 +1,189 @@ +""" +Write R data files (RData, rda, rds). + +This IO module interfaces with the librdata C library by Evan Miller: + https://github.com/WizardMac/librdata +""" +from __future__ import annotations + +import os +from tempfile import TemporaryDirectory + +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + StorageOptions, +) + +from pandas.core.frame import DataFrame + +from pandas.io.common import get_handle +from pandas.io.rdata._rdata import LibrdataWriter + + +class RDataWriter: + """ + Class to write pandas DataFrames into R data files. + + Parameters + ---------- + path_or_buffer : a valid str, path object or file-like object + Any valid string path is acceptable. + + file_format : {'infer', 'rdata', 'rda', 'rds'}, default 'infer' + R serialization type. + + rda_name : str, default "pandas_dataframe" + Name for exported DataFrame in rda file. + + index : bool, default True + Include index or MultiIndex in output as separate columns. + + compression : {'gzip', 'bz2', 'xz', None}, default 'gzip' + Compression type for on-the-fly compression of the output data. + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, + e.g. host, port, username, password, etc.
+ """ + + def __init__( + self, + frame: DataFrame, + path_or_buffer: FilePathOrBuffer, + file_format: str = "infer", + rda_name: str = "pandas_dataframe", + index: bool = True, + compression: CompressionOptions = "gzip", + storage_options: StorageOptions = None, + ) -> None: + self.frame = frame + self.path_or_buffer = path_or_buffer + self.file_format = file_format.lower() + self.rda_name = rda_name + self.index = index + self.compression = compression + self.storage_options = storage_options + self.verify_params() + + def verify_params(self) -> None: + """ + Verify user entries of parameters. + + This method will check the values and types of select parameters + and raise appropriate errors. + """ + + path_ext: str | None = ( + os.path.splitext(self.path_or_buffer.lower())[1][1:] + if isinstance(self.path_or_buffer, str) + else None + ) + + if self.file_format not in ["infer", "rdata", "rda", "rds"]: + raise ValueError( + f"{self.file_format} is not a valid value for file_format." + ) + + if ( + self.file_format == "infer" + and isinstance(self.path_or_buffer, str) + and path_ext not in ["rdata", "rda", "rds"] + ): + raise ValueError( + f"Unable to infer file format from file name: {self.path_or_buffer}" + "Please use known R data type (rdata, rda, rds)." + ) + + if self.file_format == "infer" and isinstance(path_ext, str): + self.file_format = path_ext + + if self.compression is not None and self.compression not in [ + "gzip", + "bz2", + "xz", + ]: + raise ValueError( + f"{self.compression} is not a supported value for compression." + ) + + def disk_to_buffer(self, r_file: str) -> None: + """ + Save temp file to path or buffer. + + This method will convert written R data to path_or_buffer. 
+ """ + + with open(r_file, "rb") as rdata: + with get_handle( + self.path_or_buffer, + "wb", + compression=self.compression, + storage_options=self.storage_options, + is_text=False, + ) as handles: + handles.handle.write(rdata.read()) # type: ignore[arg-type] + + return None + + def write_data(self) -> None: + """ + Write DataFrames to R data files. + + Converts non-primitive and non-datetimes to object to align to R + atomic types, then exports dictionaries of each column with meta data. + """ + + self.frame = ( + self.frame.reset_index() + if self.index + else self.frame.reset_index(drop=True) + ) + + excl_types = ["bool", "number", "object", "datetime", "datetimetz", "timedelta"] + for col in self.frame.select_dtypes(exclude=excl_types).columns: + self.frame[col] = self.frame[col].astype(str) + + for col in self.frame.select_dtypes(include=["datetimetz"]).columns: + self.frame[col] = self.frame[col].dt.tz_localize(None) + + for col in self.frame.select_dtypes(include=["timedelta"]).columns: + self.frame[col] = self.frame[col].dt.total_seconds() + + rdict = {"dtypes": {k: str(v) for k, v in self.frame.dtypes.to_dict().items()}} + + for k, v in rdict["dtypes"].items(): + if any(x in v for x in ("bool", "Boolean")): + rdict["dtypes"][k] = "bool" + + elif any(x in v for x in ("int", "uint", "Int", "UInt")): + rdict["dtypes"][k] = "int" + + elif any(x in v for x in ("float", "Float")): + rdict["dtypes"][k] = "float" + + elif any(x in v for x in ("datetime", "Datetime")): + rdict["dtypes"][k] = "datetime" + + elif any(x in v for x in ("object", "string", "String")): + rdict["dtypes"][k] = "object" + + for col in self.frame.select_dtypes(include=["datetime"]).columns: + self.frame[col] = self.frame[col].values.view("int64") / (10 ** 9) + + rdict["data"] = self.frame.to_dict() + + lbw = LibrdataWriter() + + with TemporaryDirectory() as tmp_dir: + r_temp = os.path.join(tmp_dir, "rdata.rda") + lbw.write_rdata( + rfile=r_temp, + rdict=rdict, + rformat=self.file_format, + 
tbl_name=self.rda_name, + ) + + self.disk_to_buffer(r_temp) + + return None diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 38984238ecf65..225b4805717c3 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -163,6 +163,7 @@ class TestPDApi(Base): "read_xml", "read_json", "read_pickle", + "read_rdata", "read_sas", "read_sql", "read_sql_query", diff --git a/pandas/tests/io/data/rdata/climate_non_utf8_df.rda b/pandas/tests/io/data/rdata/climate_non_utf8_df.rda new file mode 100644 index 0000000000000..a506806405f5e Binary files /dev/null and b/pandas/tests/io/data/rdata/climate_non_utf8_df.rda differ diff --git a/pandas/tests/io/data/rdata/climate_non_utf8_df.rds b/pandas/tests/io/data/rdata/climate_non_utf8_df.rds new file mode 100644 index 0000000000000..85a65550ad80f Binary files /dev/null and b/pandas/tests/io/data/rdata/climate_non_utf8_df.rds differ diff --git a/pandas/tests/io/data/rdata/env_data_dfs.rda b/pandas/tests/io/data/rdata/env_data_dfs.rda new file mode 100644 index 0000000000000..07fbef3ecb00d Binary files /dev/null and b/pandas/tests/io/data/rdata/env_data_dfs.rda differ diff --git a/pandas/tests/io/data/rdata/env_data_non_dfs.rda b/pandas/tests/io/data/rdata/env_data_non_dfs.rda new file mode 100644 index 0000000000000..e1b6bbb5e117e Binary files /dev/null and b/pandas/tests/io/data/rdata/env_data_non_dfs.rda differ diff --git a/pandas/tests/io/data/rdata/env_data_objs.rda b/pandas/tests/io/data/rdata/env_data_objs.rda new file mode 100644 index 0000000000000..61731d7774e45 Binary files /dev/null and b/pandas/tests/io/data/rdata/env_data_objs.rda differ diff --git a/pandas/tests/io/data/rdata/ghg_df.rds b/pandas/tests/io/data/rdata/ghg_df.rds new file mode 100644 index 0000000000000..18c91b7acf9d7 Binary files /dev/null and b/pandas/tests/io/data/rdata/ghg_df.rds differ diff --git a/pandas/tests/io/data/rdata/ghg_t_tests.rds b/pandas/tests/io/data/rdata/ghg_t_tests.rds new file mode 100644 
index 0000000000000..e58879d33c1c8 Binary files /dev/null and b/pandas/tests/io/data/rdata/ghg_t_tests.rds differ diff --git a/pandas/tests/io/data/rdata/planetary_boundaries_df.rda b/pandas/tests/io/data/rdata/planetary_boundaries_df.rda new file mode 100644 index 0000000000000..0c3308434ccbb Binary files /dev/null and b/pandas/tests/io/data/rdata/planetary_boundaries_df.rda differ diff --git a/pandas/tests/io/data/rdata/planetary_boundaries_df.rds b/pandas/tests/io/data/rdata/planetary_boundaries_df.rds new file mode 100644 index 0000000000000..b370d2bd13785 Binary files /dev/null and b/pandas/tests/io/data/rdata/planetary_boundaries_df.rds differ diff --git a/pandas/tests/io/data/rdata/plants_arry.rds b/pandas/tests/io/data/rdata/plants_arry.rds new file mode 100644 index 0000000000000..e1d7032acebeb Binary files /dev/null and b/pandas/tests/io/data/rdata/plants_arry.rds differ diff --git a/pandas/tests/io/data/rdata/plants_df.rds b/pandas/tests/io/data/rdata/plants_df.rds new file mode 100644 index 0000000000000..5b9f58f6483ba Binary files /dev/null and b/pandas/tests/io/data/rdata/plants_df.rds differ diff --git a/pandas/tests/io/data/rdata/ppm_df.csv b/pandas/tests/io/data/rdata/ppm_df.csv new file mode 100644 index 0000000000000..4a2663110dca3 --- /dev/null +++ b/pandas/tests/io/data/rdata/ppm_df.csv @@ -0,0 +1,757 @@ +"year","month","decimal_date","monthly_average","de_seasonalized","num_days","st_dev_of_days","unc_mon_mean" +1958,3,1958.2027,315.7,314.43,-1,-9.99,-0.99 +1958,4,1958.2877,317.45,315.16,-1,-9.99,-0.99 +1958,5,1958.3699,317.51,314.71,-1,-9.99,-0.99 +1958,6,1958.4548,317.24,315.14,-1,-9.99,-0.99 +1958,7,1958.537,315.86,315.18,-1,-9.99,-0.99 +1958,8,1958.6219,314.93,316.18,-1,-9.99,-0.99 +1958,9,1958.7068,313.2,316.08,-1,-9.99,-0.99 +1958,10,1958.789,312.43,315.41,-1,-9.99,-0.99 +1958,11,1958.874,313.33,315.2,-1,-9.99,-0.99 +1958,12,1958.9562,314.67,315.43,-1,-9.99,-0.99 +1959,1,1959.0411,315.58,315.55,-1,-9.99,-0.99 
+1959,2,1959.126,316.48,315.86,-1,-9.99,-0.99 +1959,3,1959.2027,316.65,315.38,-1,-9.99,-0.99 +1959,4,1959.2877,317.72,315.41,-1,-9.99,-0.99 +1959,5,1959.3699,318.29,315.49,-1,-9.99,-0.99 +1959,6,1959.4548,318.15,316.03,-1,-9.99,-0.99 +1959,7,1959.537,316.54,315.86,-1,-9.99,-0.99 +1959,8,1959.6219,314.8,316.06,-1,-9.99,-0.99 +1959,9,1959.7068,313.84,316.73,-1,-9.99,-0.99 +1959,10,1959.789,313.33,316.33,-1,-9.99,-0.99 +1959,11,1959.874,314.81,316.68,-1,-9.99,-0.99 +1959,12,1959.9562,315.58,316.35,-1,-9.99,-0.99 +1960,1,1960.041,316.43,316.4,-1,-9.99,-0.99 +1960,2,1960.1257,316.98,316.36,-1,-9.99,-0.99 +1960,3,1960.2049,317.58,316.28,-1,-9.99,-0.99 +1960,4,1960.2896,319.03,316.7,-1,-9.99,-0.99 +1960,5,1960.3716,320.04,317.22,-1,-9.99,-0.99 +1960,6,1960.4563,319.59,317.47,-1,-9.99,-0.99 +1960,7,1960.5383,318.18,317.52,-1,-9.99,-0.99 +1960,8,1960.623,315.9,317.19,-1,-9.99,-0.99 +1960,9,1960.7077,314.17,317.08,-1,-9.99,-0.99 +1960,10,1960.7896,313.83,316.83,-1,-9.99,-0.99 +1960,11,1960.8743,315,316.88,-1,-9.99,-0.99 +1960,12,1960.9563,316.19,316.96,-1,-9.99,-0.99 +1961,1,1961.0411,316.89,316.86,-1,-9.99,-0.99 +1961,2,1961.126,317.7,317.08,-1,-9.99,-0.99 +1961,3,1961.2027,318.54,317.26,-1,-9.99,-0.99 +1961,4,1961.2877,319.48,317.16,-1,-9.99,-0.99 +1961,5,1961.3699,320.58,317.76,-1,-9.99,-0.99 +1961,6,1961.4548,319.77,317.63,-1,-9.99,-0.99 +1961,7,1961.537,318.57,317.88,-1,-9.99,-0.99 +1961,8,1961.6219,316.79,318.06,-1,-9.99,-0.99 +1961,9,1961.7068,314.99,317.9,-1,-9.99,-0.99 +1961,10,1961.789,315.31,318.32,-1,-9.99,-0.99 +1961,11,1961.874,316.1,317.99,-1,-9.99,-0.99 +1961,12,1961.9562,317.01,317.79,-1,-9.99,-0.99 +1962,1,1962.0411,317.94,317.91,-1,-9.99,-0.99 +1962,2,1962.126,318.55,317.92,-1,-9.99,-0.99 +1962,3,1962.2027,319.68,318.39,-1,-9.99,-0.99 +1962,4,1962.2877,320.57,318.24,-1,-9.99,-0.99 +1962,5,1962.3699,321.02,318.18,-1,-9.99,-0.99 +1962,6,1962.4548,320.62,318.47,-1,-9.99,-0.99 +1962,7,1962.537,319.61,318.92,-1,-9.99,-0.99 
+1962,8,1962.6219,317.4,318.68,-1,-9.99,-0.99 +1962,9,1962.7068,316.25,319.17,-1,-9.99,-0.99 +1962,10,1962.789,315.42,318.45,-1,-9.99,-0.99 +1962,11,1962.874,316.69,318.58,-1,-9.99,-0.99 +1962,12,1962.9562,317.7,318.47,-1,-9.99,-0.99 +1963,1,1963.0411,318.74,318.7,-1,-9.99,-0.99 +1963,2,1963.126,319.07,318.44,-1,-9.99,-0.99 +1963,3,1963.2027,319.86,318.57,-1,-9.99,-0.99 +1963,4,1963.2877,321.38,319.05,-1,-9.99,-0.99 +1963,5,1963.3699,322.25,319.4,-1,-9.99,-0.99 +1963,6,1963.4548,321.48,319.32,-1,-9.99,-0.99 +1963,7,1963.537,319.74,319.05,-1,-9.99,-0.99 +1963,8,1963.6219,317.77,319.05,-1,-9.99,-0.99 +1963,9,1963.7068,316.21,319.14,-1,-9.99,-0.99 +1963,10,1963.789,315.99,319.02,-1,-9.99,-0.99 +1963,11,1963.874,317.07,318.97,-1,-9.99,-0.99 +1963,12,1963.9562,318.35,319.13,-1,-9.99,-0.99 +1964,1,1964.041,319.57,319.54,-1,-9.99,-0.99 +1964,2,1964.1257,320.01,319.37,-1,-9.99,-0.99 +1964,3,1964.2049,320.74,319.41,-1,-9.99,-0.99 +1964,4,1964.2896,321.84,319.45,-1,-9.99,-0.99 +1964,5,1964.3716,322.26,319.4,-1,-9.99,-0.99 +1964,6,1964.4563,321.89,319.75,-1,-9.99,-0.99 +1964,7,1964.5383,320.44,319.77,-1,-9.99,-0.99 +1964,8,1964.623,318.69,320,-1,-9.99,-0.99 +1964,9,1964.7077,316.7,319.66,-1,-9.99,-0.99 +1964,10,1964.7896,316.87,319.91,-1,-9.99,-0.99 +1964,11,1964.8743,317.68,319.58,-1,-9.99,-0.99 +1964,12,1964.9563,318.71,319.49,-1,-9.99,-0.99 +1965,1,1965.0411,319.44,319.4,-1,-9.99,-0.99 +1965,2,1965.126,320.44,319.81,-1,-9.99,-0.99 +1965,3,1965.2027,320.89,319.59,-1,-9.99,-0.99 +1965,4,1965.2877,322.14,319.78,-1,-9.99,-0.99 +1965,5,1965.3699,322.17,319.3,-1,-9.99,-0.99 +1965,6,1965.4548,321.87,319.7,-1,-9.99,-0.99 +1965,7,1965.537,321.21,320.51,-1,-9.99,-0.99 +1965,8,1965.6219,318.87,320.15,-1,-9.99,-0.99 +1965,9,1965.7068,317.81,320.77,-1,-9.99,-0.99 +1965,10,1965.789,317.3,320.36,-1,-9.99,-0.99 +1965,11,1965.874,318.87,320.78,-1,-9.99,-0.99 +1965,12,1965.9562,319.42,320.2,-1,-9.99,-0.99 +1966,1,1966.0411,320.62,320.59,-1,-9.99,-0.99 
+1966,2,1966.126,321.6,320.96,-1,-9.99,-0.99 +1966,3,1966.2027,322.39,321.08,-1,-9.99,-0.99 +1966,4,1966.2877,323.7,321.34,-1,-9.99,-0.99 +1966,5,1966.3699,324.08,321.2,-1,-9.99,-0.99 +1966,6,1966.4548,323.75,321.57,-1,-9.99,-0.99 +1966,7,1966.537,322.38,321.68,-1,-9.99,-0.99 +1966,8,1966.6219,320.36,321.65,-1,-9.99,-0.99 +1966,9,1966.7068,318.64,321.6,-1,-9.99,-0.99 +1966,10,1966.789,318.1,321.17,-1,-9.99,-0.99 +1966,11,1966.874,319.78,321.7,-1,-9.99,-0.99 +1966,12,1966.9562,321.03,321.81,-1,-9.99,-0.99 +1967,1,1967.0411,322.33,322.29,-1,-9.99,-0.99 +1967,2,1967.126,322.5,321.86,-1,-9.99,-0.99 +1967,3,1967.2027,323.04,321.73,-1,-9.99,-0.99 +1967,4,1967.2877,324.42,322.04,-1,-9.99,-0.99 +1967,5,1967.3699,325,322.12,-1,-9.99,-0.99 +1967,6,1967.4548,324.09,321.91,-1,-9.99,-0.99 +1967,7,1967.537,322.54,321.84,-1,-9.99,-0.99 +1967,8,1967.6219,320.92,322.21,-1,-9.99,-0.99 +1967,9,1967.7068,319.25,322.23,-1,-9.99,-0.99 +1967,10,1967.789,319.39,322.47,-1,-9.99,-0.99 +1967,11,1967.874,320.73,322.65,-1,-9.99,-0.99 +1967,12,1967.9562,321.96,322.75,-1,-9.99,-0.99 +1968,1,1968.041,322.57,322.54,-1,-9.99,-0.99 +1968,2,1968.1257,323.15,322.51,-1,-9.99,-0.99 +1968,3,1968.2049,323.89,322.55,-1,-9.99,-0.99 +1968,4,1968.2896,325.02,322.62,-1,-9.99,-0.99 +1968,5,1968.3716,325.57,322.68,-1,-9.99,-0.99 +1968,6,1968.4563,325.36,323.19,-1,-9.99,-0.99 +1968,7,1968.5383,324.14,323.46,-1,-9.99,-0.99 +1968,8,1968.623,322.11,323.43,-1,-9.99,-0.99 +1968,9,1968.7077,320.33,323.32,-1,-9.99,-0.99 +1968,10,1968.7896,320.25,323.33,-1,-9.99,-0.99 +1968,11,1968.8743,321.32,323.25,-1,-9.99,-0.99 +1968,12,1968.9563,322.89,323.69,-1,-9.99,-0.99 +1969,1,1969.0411,324,323.97,-1,-9.99,-0.99 +1969,2,1969.126,324.42,323.77,-1,-9.99,-0.99 +1969,3,1969.2027,325.63,324.31,-1,-9.99,-0.99 +1969,4,1969.2877,326.66,324.27,-1,-9.99,-0.99 +1969,5,1969.3699,327.38,324.48,-1,-9.99,-0.99 +1969,6,1969.4548,326.71,324.51,-1,-9.99,-0.99 +1969,7,1969.537,325.88,325.17,-1,-9.99,-0.99 
+1969,8,1969.6219,323.66,324.97,-1,-9.99,-0.99 +1969,9,1969.7068,322.38,325.37,-1,-9.99,-0.99 +1969,10,1969.789,321.78,324.88,-1,-9.99,-0.99 +1969,11,1969.874,322.86,324.79,-1,-9.99,-0.99 +1969,12,1969.9562,324.12,324.91,-1,-9.99,-0.99 +1970,1,1970.0411,325.06,325.03,-1,-9.99,-0.99 +1970,2,1970.126,325.98,325.34,-1,-9.99,-0.99 +1970,3,1970.2027,326.93,325.61,-1,-9.99,-0.99 +1970,4,1970.2877,328.13,325.74,-1,-9.99,-0.99 +1970,5,1970.3699,328.08,325.16,-1,-9.99,-0.99 +1970,6,1970.4548,327.67,325.46,-1,-9.99,-0.99 +1970,7,1970.537,326.34,325.63,-1,-9.99,-0.99 +1970,8,1970.6219,324.69,325.99,-1,-9.99,-0.99 +1970,9,1970.7068,323.1,326.1,-1,-9.99,-0.99 +1970,10,1970.789,323.06,326.18,-1,-9.99,-0.99 +1970,11,1970.874,324.01,325.95,-1,-9.99,-0.99 +1970,12,1970.9562,325.13,325.93,-1,-9.99,-0.99 +1971,1,1971.0411,326.17,326.14,-1,-9.99,-0.99 +1971,2,1971.126,326.68,326.03,-1,-9.99,-0.99 +1971,3,1971.2027,327.17,325.85,-1,-9.99,-0.99 +1971,4,1971.2877,327.79,325.38,-1,-9.99,-0.99 +1971,5,1971.3699,328.93,326,-1,-9.99,-0.99 +1971,6,1971.4548,328.57,326.36,-1,-9.99,-0.99 +1971,7,1971.537,327.36,326.65,-1,-9.99,-0.99 +1971,8,1971.6219,325.43,326.74,-1,-9.99,-0.99 +1971,9,1971.7068,323.36,326.37,-1,-9.99,-0.99 +1971,10,1971.789,323.56,326.69,-1,-9.99,-0.99 +1971,11,1971.874,324.8,326.75,-1,-9.99,-0.99 +1971,12,1971.9562,326.01,326.82,-1,-9.99,-0.99 +1972,1,1972.041,326.77,326.73,-1,-9.99,-0.99 +1972,2,1972.1257,327.63,326.98,-1,-9.99,-0.99 +1972,3,1972.2049,327.75,326.39,-1,-9.99,-0.99 +1972,4,1972.2896,329.72,327.29,-1,-9.99,-0.99 +1972,5,1972.3716,330.07,327.14,-1,-9.99,-0.99 +1972,6,1972.4563,329.09,326.88,-1,-9.99,-0.99 +1972,7,1972.5383,328.04,327.36,-1,-9.99,-0.99 +1972,8,1972.623,326.32,327.67,-1,-9.99,-0.99 +1972,9,1972.7077,324.84,327.87,-1,-9.99,-0.99 +1972,10,1972.7896,325.2,328.33,-1,-9.99,-0.99 +1972,11,1972.8743,326.5,328.45,-1,-9.99,-0.99 +1972,12,1972.9563,327.55,328.36,-1,-9.99,-0.99 +1973,1,1973.0411,328.55,328.51,-1,-9.99,-0.99 
+1973,2,1973.126,329.56,328.91,-1,-9.99,-0.99 +1973,3,1973.2027,330.3,328.96,-1,-9.99,-0.99 +1973,4,1973.2877,331.5,329.08,-1,-9.99,-0.99 +1973,5,1973.3699,332.48,329.54,-1,-9.99,-0.99 +1973,6,1973.4548,332.07,329.84,-1,-9.99,-0.99 +1973,7,1973.537,330.87,330.15,-1,-9.99,-0.99 +1973,8,1973.6219,329.31,330.63,-1,-9.99,-0.99 +1973,9,1973.7068,327.51,330.55,-1,-9.99,-0.99 +1973,10,1973.789,327.18,330.32,-1,-9.99,-0.99 +1973,11,1973.874,328.16,330.13,-1,-9.99,-0.99 +1973,12,1973.9562,328.64,329.45,-1,-9.99,-0.99 +1974,1,1974.0411,329.35,329.32,-1,-9.99,-0.99 +1974,2,1974.126,330.71,330.05,-1,-9.99,-0.99 +1974,3,1974.2027,331.48,330.14,-1,-9.99,-0.99 +1974,4,1974.2877,332.65,330.22,-1,-9.99,-0.99 +1974,5,1974.375,333.19,330.22,13,0.31,0.16 +1974,6,1974.4583,332.2,329.79,25,0.37,0.14 +1974,7,1974.5417,331.07,330.21,24,0.24,0.09 +1974,8,1974.625,329.15,330.54,26,0.31,0.12 +1974,9,1974.7083,327.33,330.44,22,0.47,0.19 +1974,10,1974.7917,327.28,330.52,24,0.22,0.09 +1974,11,1974.875,328.31,330.5,26,0.43,0.16 +1974,12,1974.9583,329.58,330.56,29,0.29,0.1 +1975,1,1975.0417,330.73,330.84,29,0.43,0.15 +1975,2,1975.125,331.46,330.85,26,0.46,0.17 +1975,3,1975.2083,331.94,330.37,17,0.33,0.15 +1975,4,1975.2917,333.11,330.53,23,0.59,0.24 +1975,5,1975.375,333.95,330.97,28,0.35,0.13 +1975,6,1975.4583,333.42,331.01,27,0.48,0.18 +1975,7,1975.5417,331.97,331.12,24,0.45,0.18 +1975,8,1975.625,329.95,331.33,24,0.47,0.18 +1975,9,1975.7083,328.5,331.6,22,0.53,0.22 +1975,10,1975.7917,328.36,331.61,11,0.21,0.12 +1975,11,1975.875,329.38,331.57,18,0.31,0.14 +1975,12,1975.9583,330.62,331.6,-1,-9.99,-0.99 +1976,1,1976.0417,331.56,331.67,19,0.23,0.1 +1976,2,1976.125,332.74,332.13,22,0.49,0.2 +1976,3,1976.2083,333.36,331.79,18,0.52,0.23 +1976,4,1976.2917,334.74,332.16,18,0.77,0.35 +1976,5,1976.375,334.72,331.75,21,0.56,0.23 +1976,6,1976.4583,333.98,331.56,15,0.21,0.1 +1976,7,1976.5417,333.08,332.22,15,0.24,0.12 +1976,8,1976.625,330.68,332.07,23,0.51,0.2 +1976,9,1976.7083,328.96,332.07,13,0.69,0.37 
+1976,10,1976.7917,328.72,331.97,19,0.57,0.25 +1976,11,1976.875,330.16,332.35,25,0.36,0.14 +1976,12,1976.9583,331.62,332.6,20,0.38,0.16 +1977,1,1977.0417,332.68,332.77,23,0.4,0.16 +1977,2,1977.125,333.17,332.57,20,0.34,0.15 +1977,3,1977.2083,334.96,333.4,23,0.51,0.21 +1977,4,1977.2917,336.14,333.54,20,0.5,0.21 +1977,5,1977.375,336.93,333.99,20,0.31,0.13 +1977,6,1977.4583,336.17,333.79,22,0.4,0.16 +1977,7,1977.5417,334.88,334,20,0.23,0.1 +1977,8,1977.625,332.56,333.9,18,0.46,0.21 +1977,9,1977.7083,331.29,334.36,19,0.46,0.2 +1977,10,1977.7917,331.28,334.5,23,0.29,0.12 +1977,11,1977.875,332.46,334.69,21,0.43,0.18 +1977,12,1977.9583,333.6,334.59,25,0.36,0.14 +1978,1,1978.0417,334.94,335.01,22,0.52,0.21 +1978,2,1978.125,335.26,334.59,25,0.5,0.19 +1978,3,1978.2083,336.66,335,28,0.59,0.21 +1978,4,1978.2917,337.69,335.07,18,0.44,0.2 +1978,5,1978.375,338.02,335.07,26,0.46,0.17 +1978,6,1978.4583,338.01,335.59,17,0.31,0.15 +1978,7,1978.5417,336.5,335.65,20,0.32,0.14 +1978,8,1978.625,334.42,335.87,19,0.32,0.14 +1978,9,1978.7083,332.36,335.51,17,0.75,0.35 +1978,10,1978.7917,332.45,335.72,21,0.34,0.14 +1978,11,1978.875,333.76,335.99,24,0.25,0.1 +1978,12,1978.9583,334.91,335.89,26,0.33,0.12 +1979,1,1979.0417,336.14,336.22,27,0.55,0.2 +1979,2,1979.125,336.69,336,25,0.3,0.11 +1979,3,1979.2083,338.27,336.56,21,0.63,0.26 +1979,4,1979.2917,338.82,336.11,24,0.67,0.26 +1979,5,1979.375,339.24,336.24,20,0.5,0.22 +1979,6,1979.4583,339.26,336.83,19,0.35,0.15 +1979,7,1979.5417,337.54,336.69,26,0.59,0.22 +1979,8,1979.625,335.72,337.2,24,0.6,0.23 +1979,9,1979.7083,333.98,337.19,19,0.65,0.29 +1979,10,1979.7917,334.24,337.57,25,0.42,0.16 +1979,11,1979.875,335.32,337.59,27,0.3,0.11 +1979,12,1979.9583,336.81,337.83,22,0.23,0.09 +1980,1,1980.0417,337.9,338.13,29,0.57,0.2 +1980,2,1980.125,338.34,337.85,26,0.49,0.18 +1980,3,1980.2083,340.07,338.51,23,0.54,0.22 +1980,4,1980.2917,340.93,338.31,24,0.29,0.11 +1980,5,1980.375,341.45,338.4,24,0.54,0.21 +1980,6,1980.4583,341.36,338.85,20,0.39,0.17 
+1980,7,1980.5417,339.45,338.56,26,0.6,0.22 +1980,8,1980.625,337.67,339.07,16,1.05,0.5 +1980,9,1980.7083,336.25,339.37,15,0.69,0.34 +1980,10,1980.7917,336.14,339.4,26,0.26,0.1 +1980,11,1980.875,337.3,339.46,27,0.26,0.1 +1980,12,1980.9583,338.29,339.26,24,0.25,0.1 +1981,1,1981.0417,339.29,339.42,28,0.39,0.14 +1981,2,1981.125,340.55,339.97,25,0.65,0.25 +1981,3,1981.2083,341.63,340.09,25,0.48,0.19 +1981,4,1981.2917,342.6,340,26,0.46,0.17 +1981,5,1981.375,343.04,339.98,30,0.19,0.07 +1981,6,1981.4583,342.54,340.05,25,0.29,0.11 +1981,7,1981.5417,340.82,339.92,24,0.46,0.18 +1981,8,1981.625,338.48,339.87,25,0.48,0.18 +1981,9,1981.7083,336.95,340.16,27,0.55,0.2 +1981,10,1981.7917,337.05,340.39,25,0.39,0.15 +1981,11,1981.875,338.58,340.74,26,0.31,0.12 +1981,12,1981.9583,339.91,340.85,20,0.28,0.12 +1982,1,1982.0417,340.93,341.09,28,0.3,0.11 +1982,2,1982.125,341.76,341.15,24,0.49,0.19 +1982,3,1982.2083,342.78,341.18,17,0.41,0.19 +1982,4,1982.2917,343.96,341.34,7,0.42,0.31 +1982,5,1982.375,344.77,341.68,27,0.37,0.14 +1982,6,1982.4583,343.88,341.42,27,0.37,0.14 +1982,7,1982.5417,342.42,341.61,28,0.35,0.13 +1982,8,1982.625,340.24,341.64,25,0.61,0.23 +1982,9,1982.7083,338.38,341.56,21,0.59,0.25 +1982,10,1982.7917,338.41,341.77,26,0.5,0.19 +1982,11,1982.875,339.44,341.58,24,0.39,0.15 +1982,12,1982.9583,340.78,341.7,26,0.3,0.11 +1983,1,1983.0417,341.57,341.75,28,0.47,0.17 +1983,2,1983.125,342.79,342.24,24,0.37,0.15 +1983,3,1983.2083,343.37,341.86,27,0.88,0.32 +1983,4,1983.2917,345.4,342.78,23,0.29,0.12 +1983,5,1983.375,346.14,342.97,28,0.51,0.19 +1983,6,1983.4583,345.76,343.29,20,0.3,0.13 +1983,7,1983.5417,344.32,343.56,22,0.57,0.23 +1983,8,1983.625,342.51,343.89,16,0.73,0.35 +1983,9,1983.7083,340.46,343.59,15,0.5,0.25 +1983,10,1983.7917,340.53,343.86,20,0.31,0.13 +1983,11,1983.875,341.79,343.92,27,0.33,0.12 +1983,12,1983.9583,343.2,344.12,21,0.25,0.1 +1984,1,1984.0417,344.21,344.32,23,0.4,0.16 +1984,2,1984.125,344.92,344.38,23,0.32,0.13 +1984,3,1984.2083,345.68,344.26,19,0.3,0.13 
+1984,4,1984.2917,347.14,344.54,2,-9.99,-0.99 +1984,5,1984.375,347.78,344.59,20,0.42,0.18 +1984,6,1984.4583,347.16,344.72,20,0.31,0.13 +1984,7,1984.5417,345.79,345.02,18,0.33,0.15 +1984,8,1984.625,343.74,345.11,12,0.45,0.25 +1984,9,1984.7083,341.59,344.75,14,0.72,0.37 +1984,10,1984.7917,341.86,345.19,12,0.36,0.2 +1984,11,1984.875,343.31,345.4,18,0.41,0.19 +1984,12,1984.9583,345,345.88,14,0.53,0.27 +1985,1,1985.0417,345.48,345.59,25,0.38,0.14 +1985,2,1985.125,346.41,345.91,15,0.37,0.18 +1985,3,1985.2083,347.91,346.57,17,0.34,0.16 +1985,4,1985.2917,348.66,346.1,21,0.61,0.25 +1985,5,1985.375,349.28,346.13,20,0.51,0.22 +1985,6,1985.4583,348.65,346.22,21,0.34,0.14 +1985,7,1985.5417,346.91,346.08,17,0.36,0.17 +1985,8,1985.625,345.26,346.57,16,0.57,0.27 +1985,9,1985.7083,343.47,346.58,24,0.57,0.22 +1985,10,1985.7917,343.35,346.6,20,0.29,0.13 +1985,11,1985.875,344.73,346.82,21,0.4,0.17 +1985,12,1985.9583,346.12,347.04,26,0.62,0.23 +1986,1,1986.0417,346.78,346.82,25,0.31,0.12 +1986,2,1986.125,347.48,346.97,25,0.45,0.17 +1986,3,1986.2083,348.25,346.94,16,0.7,0.34 +1986,4,1986.2917,349.86,347.32,19,0.38,0.17 +1986,5,1986.375,350.52,347.42,18,0.31,0.14 +1986,6,1986.4583,349.98,347.6,17,0.25,0.11 +1986,7,1986.5417,348.25,347.43,20,0.47,0.2 +1986,8,1986.625,346.17,347.51,18,0.48,0.21 +1986,9,1986.7083,345.48,348.61,17,0.63,0.29 +1986,10,1986.7917,344.82,348.04,25,0.32,0.12 +1986,11,1986.875,346.22,348.28,21,0.3,0.13 +1986,12,1986.9583,347.48,348.36,24,0.35,0.14 +1987,1,1987.0417,348.73,348.66,25,0.46,0.17 +1987,2,1987.125,348.92,348.23,25,0.58,0.22 +1987,3,1987.2083,349.81,348.39,21,0.35,0.15 +1987,4,1987.2917,351.4,348.86,26,0.68,0.25 +1987,5,1987.375,352.15,349.09,28,0.37,0.13 +1987,6,1987.4583,351.58,349.28,22,0.21,0.09 +1987,7,1987.5417,350.21,349.51,17,0.73,0.34 +1987,8,1987.625,348.2,349.65,15,0.85,0.42 +1987,9,1987.7083,346.66,349.85,23,0.61,0.24 +1987,10,1987.7917,346.72,349.96,22,0.41,0.17 +1987,11,1987.875,348.08,350.14,23,0.33,0.13 
+1987,12,1987.9583,349.28,350.14,27,0.2,0.08 +1988,1,1988.0417,350.51,350.49,24,0.21,0.08 +1988,2,1988.125,351.7,350.99,23,0.57,0.23 +1988,3,1988.2083,352.5,350.99,25,0.78,0.3 +1988,4,1988.2917,353.67,351.03,27,0.48,0.18 +1988,5,1988.375,354.35,351.22,28,0.37,0.13 +1988,6,1988.4583,353.88,351.55,26,0.3,0.11 +1988,7,1988.5417,352.8,352.15,27,0.49,0.18 +1988,8,1988.625,350.49,352.01,26,0.62,0.23 +1988,9,1988.7083,348.97,352.18,26,0.47,0.18 +1988,10,1988.7917,349.37,352.62,26,0.31,0.12 +1988,11,1988.875,350.42,352.53,25,0.2,0.08 +1988,12,1988.9583,351.62,352.52,28,0.36,0.13 +1989,1,1989.0417,353.07,352.99,28,0.45,0.16 +1989,2,1989.125,353.43,352.69,25,0.38,0.15 +1989,3,1989.2083,354.08,352.6,29,0.53,0.19 +1989,4,1989.2917,355.72,353.07,28,0.47,0.17 +1989,5,1989.375,355.95,352.78,27,0.49,0.18 +1989,6,1989.4583,355.44,353.06,26,0.42,0.16 +1989,7,1989.5417,354.05,353.38,26,0.41,0.15 +1989,8,1989.625,351.84,353.43,25,0.48,0.18 +1989,9,1989.7083,350.09,353.37,24,0.69,0.27 +1989,10,1989.7917,350.33,353.57,25,0.34,0.13 +1989,11,1989.875,351.55,353.68,27,0.36,0.13 +1989,12,1989.9583,352.91,353.84,27,0.48,0.18 +1990,1,1990.0417,353.86,353.78,25,0.34,0.13 +1990,2,1990.125,355.1,354.37,28,0.66,0.24 +1990,3,1990.2083,355.75,354.27,27,0.57,0.21 +1990,4,1990.2917,356.38,353.76,28,0.55,0.2 +1990,5,1990.375,357.38,354.23,28,0.3,0.11 +1990,6,1990.4583,356.39,354.02,29,0.4,0.14 +1990,7,1990.5417,354.89,354.24,30,0.89,0.31 +1990,8,1990.625,353.06,354.68,22,0.62,0.25 +1990,9,1990.7083,351.38,354.69,27,0.72,0.26 +1990,10,1990.7917,351.69,354.94,28,0.3,0.11 +1990,11,1990.875,353.14,355.18,24,0.2,0.08 +1990,12,1990.9583,354.41,355.26,28,0.51,0.19 +1991,1,1991.0417,354.93,354.9,28,0.51,0.18 +1991,2,1991.125,355.82,355.11,26,0.54,0.2 +1991,3,1991.2083,357.33,355.79,30,0.73,0.25 +1991,4,1991.2917,358.77,356.13,30,0.66,0.23 +1991,5,1991.375,359.23,356.1,29,0.52,0.19 +1991,6,1991.4583,358.23,355.88,29,0.3,0.11 +1991,7,1991.5417,356.3,355.69,24,0.46,0.18 +1991,8,1991.625,353.97,355.6,23,0.39,0.15 
+1991,9,1991.7083,352.34,355.66,27,0.37,0.14 +1991,10,1991.7917,352.43,355.69,27,0.25,0.09 +1991,11,1991.875,353.89,355.87,28,0.25,0.09 +1991,12,1991.9583,355.21,356.02,30,0.34,0.12 +1992,1,1992.0417,356.34,356.29,31,0.6,0.21 +1992,2,1992.125,357.21,356.47,27,0.56,0.21 +1992,3,1992.2083,357.97,356.38,24,0.72,0.28 +1992,4,1992.2917,359.22,356.51,27,0.53,0.2 +1992,5,1992.375,359.71,356.52,26,0.74,0.28 +1992,6,1992.4583,359.43,357.07,30,0.49,0.17 +1992,7,1992.5417,357.15,356.58,25,0.63,0.24 +1992,8,1992.625,354.99,356.67,24,0.62,0.24 +1992,9,1992.7083,353.01,356.36,25,0.98,0.38 +1992,10,1992.7917,353.41,356.72,29,0.56,0.2 +1992,11,1992.875,354.42,356.48,29,0.34,0.12 +1992,12,1992.9583,355.68,356.5,31,0.32,0.11 +1993,1,1993.0417,357.1,357.06,28,0.58,0.21 +1993,2,1993.125,357.42,356.54,28,0.49,0.18 +1993,3,1993.2083,358.59,356.88,30,0.72,0.25 +1993,4,1993.2917,359.39,356.71,25,0.53,0.2 +1993,5,1993.375,360.3,357.14,30,0.45,0.16 +1993,6,1993.4583,359.64,357.24,28,0.35,0.13 +1993,7,1993.5417,357.46,356.87,25,0.78,0.3 +1993,8,1993.625,355.76,357.44,27,0.62,0.23 +1993,9,1993.7083,354.14,357.51,23,0.73,0.29 +1993,10,1993.7917,354.23,357.61,28,0.29,0.11 +1993,11,1993.875,355.53,357.65,29,0.26,0.09 +1993,12,1993.9583,357.03,357.92,29,0.28,0.1 +1994,1,1994.0417,358.36,358.25,27,0.33,0.12 +1994,2,1994.125,359.04,358.21,25,0.5,0.19 +1994,3,1994.2083,360.11,358.41,29,0.82,0.29 +1994,4,1994.2917,361.36,358.59,28,0.5,0.18 +1994,5,1994.375,361.78,358.59,30,0.45,0.16 +1994,6,1994.4583,360.94,358.57,27,0.3,0.11 +1994,7,1994.5417,359.51,358.91,31,0.41,0.14 +1994,8,1994.625,357.59,359.29,24,0.43,0.17 +1994,9,1994.7083,355.86,359.3,24,0.58,0.23 +1994,10,1994.7917,356.21,359.63,28,0.28,0.1 +1994,11,1994.875,357.65,359.8,28,0.51,0.18 +1994,12,1994.9583,359.1,359.96,28,0.46,0.17 +1995,1,1995.0417,360.04,359.91,30,0.47,0.16 +1995,2,1995.125,361,360.18,28,0.52,0.19 +1995,3,1995.2083,361.98,360.37,29,0.78,0.28 +1995,4,1995.2917,363.44,360.76,29,0.65,0.23 
+1995,5,1995.375,363.83,360.73,29,0.66,0.24 +1995,6,1995.4583,363.33,360.98,27,0.37,0.14 +1995,7,1995.5417,361.78,361.1,28,0.36,0.13 +1995,8,1995.625,359.33,360.93,24,0.7,0.28 +1995,9,1995.7083,358.32,361.71,24,0.68,0.26 +1995,10,1995.7917,358.14,361.52,29,0.26,0.09 +1995,11,1995.875,359.61,361.75,26,0.24,0.09 +1995,12,1995.9583,360.82,361.67,30,0.36,0.12 +1996,1,1996.0417,362.2,361.98,29,0.38,0.13 +1996,2,1996.125,363.36,362.47,28,0.55,0.2 +1996,3,1996.2083,364.28,362.64,28,0.67,0.24 +1996,4,1996.2917,364.69,361.99,29,0.59,0.21 +1996,5,1996.375,365.25,362.23,30,0.57,0.2 +1996,6,1996.4583,365.06,362.82,30,0.38,0.13 +1996,7,1996.5417,363.69,362.98,31,0.32,0.11 +1996,8,1996.625,361.55,363.13,27,0.49,0.18 +1996,9,1996.7083,359.69,363.14,25,0.75,0.29 +1996,10,1996.7917,359.72,363.12,29,0.32,0.11 +1996,11,1996.875,361.04,363.18,29,0.29,0.1 +1996,12,1996.9583,362.39,363.23,29,0.36,0.13 +1997,1,1997.0417,363.24,363.03,31,0.4,0.14 +1997,2,1997.125,364.21,363.4,28,0.62,0.22 +1997,3,1997.2083,364.65,363.02,31,0.4,0.14 +1997,4,1997.2917,366.48,363.82,21,0.46,0.19 +1997,5,1997.375,366.77,363.87,29,0.53,0.19 +1997,6,1997.4583,365.73,363.56,27,0.23,0.09 +1997,7,1997.5417,364.46,363.74,24,0.47,0.18 +1997,8,1997.625,362.4,363.98,25,0.57,0.22 +1997,9,1997.7083,360.44,363.83,26,0.63,0.24 +1997,10,1997.7917,360.98,364.28,27,0.32,0.12 +1997,11,1997.875,362.65,364.71,30,0.31,0.11 +1997,12,1997.9583,364.51,365.28,30,0.41,0.14 +1998,1,1998.0417,365.39,365.19,30,0.43,0.15 +1998,2,1998.125,366.1,365.29,28,0.62,0.23 +1998,3,1998.2083,367.36,365.73,31,0.82,0.28 +1998,4,1998.2917,368.79,366.17,29,0.63,0.22 +1998,5,1998.375,369.56,366.68,30,0.77,0.27 +1998,6,1998.4583,369.13,366.95,28,0.24,0.09 +1998,7,1998.5417,367.98,367.29,23,0.65,0.26 +1998,8,1998.625,366.1,367.69,30,0.3,0.1 +1998,9,1998.7083,364.16,367.51,28,0.4,0.14 +1998,10,1998.7917,364.54,367.82,30,0.26,0.09 +1998,11,1998.875,365.67,367.7,23,0.25,0.1 +1998,12,1998.9583,367.3,368.05,26,0.36,0.14 
+1999,1,1999.0417,368.35,368.13,27,0.47,0.17 +1999,2,1999.125,369.28,368.46,21,0.47,0.2 +1999,3,1999.2083,369.84,368.24,25,0.81,0.31 +1999,4,1999.2917,371.15,368.62,29,0.67,0.24 +1999,5,1999.375,371.12,368.31,26,0.59,0.22 +1999,6,1999.4583,370.46,368.3,26,0.44,0.16 +1999,7,1999.5417,369.61,368.93,27,0.63,0.23 +1999,8,1999.625,367.06,368.63,25,0.38,0.14 +1999,9,1999.7083,364.95,368.28,28,0.74,0.27 +1999,10,1999.7917,365.52,368.8,31,0.28,0.1 +1999,11,1999.875,366.88,368.86,28,0.25,0.09 +1999,12,1999.9583,368.26,368.93,26,0.29,0.11 +2000,1,2000.0417,369.45,369.24,26,0.47,0.18 +2000,2,2000.125,369.71,368.99,19,0.48,0.21 +2000,3,2000.2083,370.75,369.24,30,0.47,0.16 +2000,4,2000.2917,371.98,369.44,27,0.58,0.21 +2000,5,2000.375,371.75,368.87,28,0.53,0.19 +2000,6,2000.4583,371.87,369.66,28,0.24,0.09 +2000,7,2000.5417,370.02,369.36,25,0.31,0.12 +2000,8,2000.625,368.27,369.87,27,0.42,0.15 +2000,9,2000.7083,367.15,370.46,25,0.36,0.14 +2000,10,2000.7917,367.18,370.42,30,0.27,0.09 +2000,11,2000.875,368.53,370.48,25,0.3,0.12 +2000,12,2000.9583,369.83,370.46,30,0.38,0.13 +2001,1,2001.0417,370.76,370.6,30,0.56,0.2 +2001,2,2001.125,371.69,370.95,26,0.61,0.23 +2001,3,2001.2083,372.63,371.06,26,0.46,0.17 +2001,4,2001.2917,373.55,370.99,29,0.56,0.2 +2001,5,2001.375,374.03,371.11,24,0.41,0.16 +2001,6,2001.4583,373.4,371.17,26,0.37,0.14 +2001,7,2001.5417,371.68,371.08,25,0.62,0.24 +2001,8,2001.625,369.78,371.39,27,0.6,0.22 +2001,9,2001.7083,368.34,371.61,28,0.49,0.18 +2001,10,2001.7917,368.61,371.85,31,0.33,0.11 +2001,11,2001.875,369.94,371.92,24,0.24,0.09 +2001,12,2001.9583,371.42,372.09,29,0.4,0.14 +2002,1,2002.0417,372.7,372.48,28,0.52,0.19 +2002,2,2002.125,373.37,372.49,28,0.66,0.24 +2002,3,2002.2083,374.3,372.61,24,0.62,0.24 +2002,4,2002.2917,375.19,372.54,29,0.55,0.19 +2002,5,2002.375,375.93,372.98,29,0.57,0.2 +2002,6,2002.4583,375.69,373.46,28,0.46,0.17 +2002,7,2002.5417,374.16,373.58,25,0.47,0.18 +2002,8,2002.625,372.03,373.7,28,0.65,0.24 
+2002,9,2002.7083,370.93,374.29,23,0.74,0.3 +2002,10,2002.7917,370.73,374.06,31,0.62,0.21 +2002,11,2002.875,372.43,374.52,29,0.43,0.15 +2002,12,2002.9583,373.98,374.72,31,0.46,0.16 +2003,1,2003.0417,375.07,374.82,30,0.51,0.18 +2003,2,2003.125,375.82,374.95,27,0.58,0.21 +2003,3,2003.2083,376.64,374.99,28,0.63,0.23 +2003,4,2003.2917,377.92,375.24,27,0.37,0.14 +2003,5,2003.375,378.78,375.73,30,0.78,0.27 +2003,6,2003.4583,378.46,376.21,25,0.39,0.15 +2003,7,2003.5417,376.88,376.37,29,0.7,0.25 +2003,8,2003.625,374.57,376.27,23,0.57,0.23 +2003,9,2003.7083,373.34,376.65,25,0.37,0.14 +2003,10,2003.7917,373.31,376.65,30,0.33,0.12 +2003,11,2003.875,374.84,376.99,26,0.45,0.17 +2003,12,2003.9583,376.17,376.93,27,0.4,0.15 +2004,1,2004.0417,377.17,376.96,30,0.45,0.16 +2004,2,2004.125,378.05,377.19,29,0.74,0.26 +2004,3,2004.2083,379.06,377.4,27,0.84,0.31 +2004,4,2004.2917,380.54,377.8,26,0.52,0.19 +2004,5,2004.375,380.8,377.66,28,0.61,0.22 +2004,6,2004.4583,379.87,377.57,21,0.47,0.19 +2004,7,2004.5417,377.65,377.12,25,0.5,0.19 +2004,8,2004.625,376.17,377.9,16,0.45,0.21 +2004,9,2004.7083,374.43,377.8,15,0.56,0.28 +2004,10,2004.7917,374.63,378,29,0.19,0.07 +2004,11,2004.875,376.33,378.49,29,0.62,0.22 +2004,12,2004.9583,377.68,378.48,30,0.29,0.1 +2005,1,2005.0417,378.63,378.37,31,0.32,0.11 +2005,2,2005.125,379.91,379.1,24,0.6,0.24 +2005,3,2005.2083,380.95,379.45,26,1.16,0.44 +2005,4,2005.2917,382.48,379.84,26,0.53,0.2 +2005,5,2005.375,382.64,379.49,31,0.61,0.21 +2005,6,2005.4583,382.4,380.07,28,0.21,0.08 +2005,7,2005.5417,380.93,380.38,29,0.38,0.13 +2005,8,2005.625,378.93,380.61,26,0.53,0.2 +2005,9,2005.7083,376.89,380.2,27,0.51,0.19 +2005,10,2005.7917,377.19,380.5,14,0.15,0.08 +2005,11,2005.875,378.54,380.69,23,0.45,0.18 +2005,12,2005.9583,380.31,381.09,26,0.39,0.15 +2006,1,2006.0417,381.58,381.33,24,0.31,0.12 +2006,2,2006.125,382.4,381.58,25,0.51,0.2 +2006,3,2006.2083,382.86,381.32,29,0.55,0.2 +2006,4,2006.2917,384.8,382.11,25,0.49,0.19 +2006,5,2006.375,385.22,382.06,24,0.45,0.17 
+2006,6,2006.4583,384.24,381.93,28,0.43,0.16 +2006,7,2006.5417,382.65,382.1,24,0.32,0.12 +2006,8,2006.625,380.6,382.27,27,0.47,0.17 +2006,9,2006.7083,379.04,382.35,25,0.42,0.16 +2006,10,2006.7917,379.33,382.66,23,0.4,0.16 +2006,11,2006.875,380.35,382.52,29,0.39,0.14 +2006,12,2006.9583,382.02,382.84,27,0.38,0.14 +2007,1,2007.0417,383.1,382.88,24,0.76,0.3 +2007,2,2007.125,384.12,383.22,21,0.81,0.34 +2007,3,2007.2083,384.81,383.17,27,0.63,0.23 +2007,4,2007.2917,386.73,383.95,25,0.76,0.29 +2007,5,2007.375,386.78,383.56,29,0.64,0.23 +2007,6,2007.4583,386.33,384.06,26,0.42,0.16 +2007,7,2007.5417,384.73,384.25,27,0.44,0.16 +2007,8,2007.625,382.24,383.95,22,0.64,0.26 +2007,9,2007.7083,381.2,384.56,21,0.45,0.19 +2007,10,2007.7917,381.37,384.72,29,0.19,0.07 +2007,11,2007.875,382.7,384.9,30,0.31,0.11 +2007,12,2007.9583,384.19,385.07,22,0.34,0.14 +2008,1,2008.0417,385.78,385.54,31,0.56,0.19 +2008,2,2008.125,386.06,385.2,26,0.58,0.22 +2008,3,2008.2083,386.28,384.72,30,0.6,0.21 +2008,4,2008.2917,387.34,384.71,22,1.19,0.49 +2008,5,2008.375,388.78,385.69,25,0.57,0.22 +2008,6,2008.4583,387.99,385.68,23,0.49,0.2 +2008,7,2008.5417,386.6,386.04,10,0.96,0.58 +2008,8,2008.625,384.32,385.98,25,0.66,0.25 +2008,9,2008.7083,383.41,386.68,27,0.34,0.12 +2008,10,2008.7917,383.21,386.49,23,0.27,0.11 +2008,11,2008.875,384.41,386.59,28,0.29,0.11 +2008,12,2008.9583,385.79,386.64,29,0.27,0.1 +2009,1,2009.0417,387.17,386.86,30,0.38,0.13 +2009,2,2009.125,387.7,386.81,26,0.49,0.18 +2009,3,2009.2083,389.04,387.54,28,0.68,0.25 +2009,4,2009.2917,389.76,387.15,29,0.85,0.3 +2009,5,2009.375,390.36,387.24,30,0.51,0.18 +2009,6,2009.4583,389.7,387.46,29,0.6,0.21 +2009,7,2009.5417,388.25,387.77,22,0.31,0.13 +2009,8,2009.625,386.29,387.99,28,0.62,0.22 +2009,9,2009.7083,384.95,388.22,28,0.56,0.2 +2009,10,2009.7917,384.64,387.88,30,0.31,0.11 +2009,11,2009.875,386.23,388.36,30,0.29,0.1 +2009,12,2009.9583,387.63,388.43,20,0.47,0.2 +2010,1,2010.0417,388.91,388.62,30,0.92,0.32 
+2010,2,2010.125,390.41,389.47,20,1.31,0.56 +2010,3,2010.2083,391.37,389.85,25,1.05,0.4 +2010,4,2010.2917,392.67,390.12,26,0.65,0.24 +2010,5,2010.375,393.21,390.09,29,0.65,0.23 +2010,6,2010.4583,392.38,390.1,28,0.42,0.15 +2010,7,2010.5417,390.41,389.94,29,0.47,0.17 +2010,8,2010.625,388.54,390.21,26,0.41,0.16 +2010,9,2010.7083,387.03,390.32,29,0.55,0.19 +2010,10,2010.7917,387.43,390.72,31,0.27,0.09 +2010,11,2010.875,388.87,390.99,29,0.42,0.15 +2010,12,2010.9583,389.99,390.8,29,0.47,0.17 +2011,1,2011.0417,391.5,391.2,29,0.88,0.31 +2011,2,2011.125,392.05,391.12,28,0.47,0.17 +2011,3,2011.2083,392.8,391.28,29,0.97,0.35 +2011,4,2011.2917,393.44,390.84,28,0.73,0.26 +2011,5,2011.375,394.41,391.24,29,0.93,0.33 +2011,6,2011.4583,393.95,391.65,28,0.45,0.16 +2011,7,2011.5417,392.72,392.24,26,0.71,0.26 +2011,8,2011.625,390.33,392.03,27,0.42,0.15 +2011,9,2011.7083,389.28,392.6,26,0.31,0.12 +2011,10,2011.7917,389.19,392.52,30,0.17,0.06 +2011,11,2011.875,390.48,392.63,28,0.26,0.1 +2011,12,2011.9583,392.06,392.86,26,0.37,0.14 +2012,1,2012.0417,393.31,393.08,30,0.77,0.27 +2012,2,2012.125,394.04,393.21,26,1.19,0.45 +2012,3,2012.2083,394.59,393,30,0.63,0.22 +2012,4,2012.2917,396.38,393.65,29,0.59,0.21 +2012,5,2012.375,396.93,393.73,30,0.5,0.17 +2012,6,2012.4583,395.91,393.64,28,0.59,0.21 +2012,7,2012.5417,394.56,394.12,26,0.3,0.11 +2012,8,2012.625,392.59,394.36,30,0.52,0.18 +2012,9,2012.7083,391.32,394.74,26,0.42,0.16 +2012,10,2012.7917,391.27,394.63,28,0.23,0.08 +2012,11,2012.875,393.2,395.24,29,0.53,0.19 +2012,12,2012.9583,394.57,395.27,29,0.44,0.16 +2013,1,2013.0417,395.78,395.63,28,0.6,0.22 +2013,2,2013.125,397.03,396.24,25,0.57,0.22 +2013,3,2013.2083,397.66,396.08,30,0.71,0.25 +2013,4,2013.2917,398.64,395.8,22,0.59,0.24 +2013,5,2013.375,400.02,396.65,28,0.37,0.13 +2013,6,2013.4583,398.81,396.48,26,0.43,0.16 +2013,7,2013.5417,397.51,397.12,21,0.52,0.22 +2013,8,2013.625,395.39,397.26,27,0.45,0.16 +2013,9,2013.7083,393.72,397.23,26,0.35,0.13 
+2013,10,2013.7917,393.9,397.24,28,0.16,0.06 +2013,11,2013.875,395.36,397.34,30,0.6,0.21 +2013,12,2013.9583,397.03,397.78,30,0.48,0.17 +2014,1,2014.0417,398.04,397.74,31,0.49,0.17 +2014,2,2014.125,398.27,397.46,27,0.51,0.19 +2014,3,2014.2083,399.91,398.38,22,0.84,0.34 +2014,4,2014.2917,401.51,398.64,26,0.5,0.19 +2014,5,2014.375,401.96,398.57,22,0.51,0.21 +2014,6,2014.4583,401.43,399.11,28,0.36,0.13 +2014,7,2014.5417,399.38,398.95,25,0.56,0.21 +2014,8,2014.625,397.32,399.2,21,0.22,0.09 +2014,9,2014.7083,395.64,399.2,21,0.56,0.24 +2014,10,2014.7917,396.29,399.69,24,0.75,0.29 +2014,11,2014.875,397.55,399.63,27,0.38,0.14 +2014,12,2014.9583,399.15,399.88,29,0.61,0.22 +2015,1,2015.0417,400.18,399.92,30,0.55,0.19 +2015,2,2015.125,400.55,399.78,28,0.63,0.23 +2015,3,2015.2083,401.74,400.23,24,1.02,0.4 +2015,4,2015.2917,403.34,400.47,26,0.86,0.32 +2015,5,2015.375,404.15,400.71,30,0.32,0.11 +2015,6,2015.4583,402.97,400.66,29,0.47,0.17 +2015,7,2015.5417,401.46,401.1,24,0.57,0.22 +2015,8,2015.625,399.11,401.03,28,0.74,0.27 +2015,9,2015.7083,397.82,401.43,25,0.32,0.12 +2015,10,2015.7917,398.49,401.88,28,0.56,0.2 +2015,11,2015.875,400.27,402.22,25,0.58,0.22 +2015,12,2015.9583,402.06,402.72,30,0.67,0.23 +2016,1,2016.0417,402.73,402.46,27,0.56,0.21 +2016,2,2016.125,404.25,403.41,25,1.11,0.43 +2016,3,2016.2083,405.06,403.55,28,0.81,0.29 +2016,4,2016.2917,407.6,404.78,23,1.04,0.41 +2016,5,2016.375,407.9,404.42,29,0.5,0.18 +2016,6,2016.4583,406.99,404.59,26,0.6,0.23 +2016,7,2016.5417,404.59,404.23,28,0.88,0.32 +2016,8,2016.625,402.45,404.39,24,0.6,0.23 +2016,9,2016.7083,401.23,404.84,25,0.44,0.17 +2016,10,2016.7917,401.79,405.22,29,0.3,0.11 +2016,11,2016.875,403.72,405.73,27,0.72,0.26 +2016,12,2016.9583,404.64,405.33,29,0.44,0.16 +2017,1,2017.0417,406.36,406.05,27,0.68,0.25 +2017,2,2017.125,406.66,405.82,26,0.71,0.27 +2017,3,2017.2083,407.54,406.06,24,1.03,0.4 +2017,4,2017.2917,409.22,406.38,26,0.86,0.32 +2017,5,2017.375,409.89,406.38,27,0.57,0.21 
+2017,6,2017.4583,409.08,406.69,26,0.54,0.2 +2017,7,2017.5417,407.33,407,28,0.61,0.22 +2017,8,2017.625,405.32,407.29,29,0.32,0.12 +2017,9,2017.7083,403.57,407.16,26,0.37,0.14 +2017,10,2017.7917,403.82,407.21,27,0.3,0.11 +2017,11,2017.875,405.31,407.34,26,0.41,0.15 +2017,12,2017.9583,407,407.71,31,0.57,0.2 +2018,1,2018.0417,408.15,407.89,29,0.55,0.19 +2018,2,2018.125,408.52,407.65,28,0.52,0.19 +2018,3,2018.2083,409.59,408.09,29,0.65,0.23 +2018,4,2018.2917,410.45,407.65,21,0.9,0.38 +2018,5,2018.375,411.44,407.94,24,0.86,0.33 +2018,6,2018.4583,410.99,408.59,29,0.61,0.22 +2018,7,2018.5417,408.9,408.55,27,0.46,0.17 +2018,8,2018.625,407.16,409.07,31,0.28,0.1 +2018,9,2018.7083,405.71,409.28,29,0.45,0.16 +2018,10,2018.7917,406.19,409.61,30,0.32,0.11 +2018,11,2018.875,408.21,410.24,24,0.56,0.22 +2018,12,2018.9583,409.27,410.01,30,0.5,0.17 +2019,1,2019.0417,411.03,410.78,26,1.26,0.47 +2019,2,2019.125,411.96,411.09,27,1.14,0.42 +2019,3,2019.2083,412.18,410.68,28,1.12,0.4 +2019,4,2019.2917,413.54,410.74,27,0.6,0.22 +2019,5,2019.375,414.86,411.37,28,0.5,0.18 +2019,6,2019.4583,414.16,411.76,27,0.36,0.13 +2019,7,2019.5417,411.97,411.62,25,0.82,0.31 +2019,8,2019.625,410.18,412.09,29,0.33,0.12 +2019,9,2019.7083,408.79,412.36,29,0.35,0.13 +2019,10,2019.7917,408.75,412.17,29,0.31,0.11 +2019,11,2019.875,410.48,412.5,26,0.4,0.15 +2019,12,2019.9583,411.98,412.72,31,0.4,0.14 +2020,1,2020.0417,413.61,413.35,29,0.73,0.26 +2020,2,2020.125,414.34,413.47,28,0.69,0.25 +2020,3,2020.2083,414.74,413.24,26,0.33,0.12 +2020,4,2020.2917,416.45,413.65,28,0.65,0.24 +2020,5,2020.375,417.31,413.81,27,0.61,0.23 +2020,6,2020.4583,416.62,414.22,27,0.45,0.16 +2020,7,2020.5417,414.61,414.26,30,0.57,0.2 +2020,8,2020.625,412.78,414.69,25,0.25,0.1 +2020,9,2020.7083,411.52,415.1,29,0.31,0.11 +2020,10,2020.7917,411.51,414.92,30,0.22,0.08 +2020,11,2020.875,413.11,415.14,27,0.8,0.29 +2020,12,2020.9583,414.25,415,30,0.48,0.17 +2021,1,2021.0417,415.52,415.26,29,0.44,0.16 +2021,2,2021.125,416.75,415.88,28,1.01,0.36 
diff --git a/pandas/tests/io/data/rdata/ppm_df.rda b/pandas/tests/io/data/rdata/ppm_df.rda new file mode 100644 index 0000000000000..b900815050a55 Binary files /dev/null and b/pandas/tests/io/data/rdata/ppm_df.rda differ diff --git a/pandas/tests/io/data/rdata/ppm_df.rds b/pandas/tests/io/data/rdata/ppm_df.rds new file mode 100644 index 0000000000000..242a3e2b11236 Binary files /dev/null and b/pandas/tests/io/data/rdata/ppm_df.rds differ diff --git a/pandas/tests/io/data/rdata/ppm_ts.rds b/pandas/tests/io/data/rdata/ppm_ts.rds new file mode 100644 index 0000000000000..3f49b7d24f6b0 Binary files /dev/null and b/pandas/tests/io/data/rdata/ppm_ts.rds differ diff --git a/pandas/tests/io/data/rdata/sea_ice_df.rds b/pandas/tests/io/data/rdata/sea_ice_df.rds new file mode 100644 index 0000000000000..23229ca9a87db Binary files /dev/null and b/pandas/tests/io/data/rdata/sea_ice_df.rds differ diff --git a/pandas/tests/io/data/rdata/species_mtx.rds b/pandas/tests/io/data/rdata/species_mtx.rds new file mode 100644 index 0000000000000..aa9ebe379e50a Binary files /dev/null and b/pandas/tests/io/data/rdata/species_mtx.rds differ diff --git a/pandas/tests/io/test_rdata.py b/pandas/tests/io/test_rdata.py new file mode 100644 index 0000000000000..16674f32d3d7e --- /dev/null +++ b/pandas/tests/io/test_rdata.py @@ -0,0 +1,872 @@ +import gzip +from io import BytesIO +import os +import pickle +import shutil +from urllib.error import HTTPError + +import numpy as np +import pytest + +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime +from pandas.compat import ( + IS64, + PY38, +) +import pandas.util._test_decorators as td + +from pandas import ( + Categorical, + DataFrame, + Timestamp, + array, + interval_range, + period_range, + to_datetime, +) +import pandas._testing as tm +from pandas.arrays import SparseArray + +from pandas.io.rdata._rdata import ( + LibrdataReader, + LibrdataReaderError, + LibrdataWriter, + LibrdataWriterError, +) +from pandas.io.rdata.rdata_reader 
import read_rdata + +ghg_df = DataFrame( + { + "gas": { + 141: "Carbon dioxide", + 142: "Methane", + 143: "Nitrous oxide", + 144: "Fluorinated gases", + 145: "Total", + }, + "year": {141: 2018, 142: 2018, 143: 2018, 144: 2018, 145: 2018}, + "emissions": { + 141: 5424.881502132882, + 142: 634.4571270782675, + 143: 434.52855537666636, + 144: 182.78243246177678, + 145: 6676.649617049592, + }, + } +).rename_axis("rownames") + +plants_df = DataFrame( + { + "plant_group": { + 16: "Pteridophytes", + 17: "Pteridophytes", + 18: "Pteridophytes", + 19: "Pteridophytes", + 20: "Pteridophytes", + }, + "status": { + 16: "Data Deficient", + 17: "Extinct", + 18: "Not Threatened", + 19: "Possibly Threatened", + 20: "Threatened", + }, + "count": {16: 398, 17: 65, 18: 1294, 19: 408, 20: 1275}, + } +).rename_axis("rownames") + +sea_ice_df = DataFrame( + { + "year": {1012: 2016, 1013: 2017, 1014: 2018, 1015: 2019, 1016: 2020}, + "mo": {1012: 12, 1013: 12, 1014: 12, 1015: 12, 1016: 12}, + "data.type": { + 1012: "Goddard", + 1013: "Goddard", + 1014: "Goddard", + 1015: "Goddard", + 1016: "NRTSI-G", + }, + "region": {1012: "S", 1013: "S", 1014: "S", 1015: "S", 1016: "S"}, + "extent": {1012: 8.28, 1013: 9.48, 1014: 9.19, 1015: 9.41, 1016: 10.44}, + "area": {1012: 5.51, 1013: 6.23, 1014: 5.59, 1015: 6.59, 1016: 6.5}, + } +).rename_axis("rownames") + +ppm_df = DataFrame( + { + "date": { + 754: Timestamp("2020-12-16 23:42:25.920000256"), + 755: Timestamp("2021-01-16 11:17:31.199999744"), + 756: Timestamp("2021-02-15 21:00:00"), + 757: Timestamp("2021-03-18 06:42:28.800000256"), + 758: Timestamp("2021-04-17 17:17:31.199999744"), + }, + "decimal_date": { + 754: 2020.9583, + 755: 2021.0417, + 756: 2021.125, + 757: 2021.2083, + 758: 2021.2917, + }, + "monthly_average": { + 754: 414.25, + 755: 415.52, + 756: 416.75, + 757: 417.64, + 758: 419.05, + }, + "deseasonalized": { + 754: 414.98, + 755: 415.26, + 756: 415.93, + 757: 416.18, + 758: 416.23, + }, + "num_days": {754: 30, 755: 29, 756: 28, 757: 
28, 758: 24}, + "std_dev_of_days": {754: 0.47, 755: 0.44, 756: 1.02, 757: 0.86, 758: 1.12}, + "unc_of_mon_mean": {754: 0.17, 755: 0.16, 756: 0.37, 757: 0.31, 758: 0.44}, + } +).rename_axis("rownames") + + +@pytest.fixture(params=["rda", "rds"]) +def rtype(request): + return request.param + + +@pytest.fixture(params=[None, "gzip", "bz2", "xz"]) +def comp(request): + return request.param + + +# RDATA READER + + +# PATH_OR_BUFFER + + +def test_read_rds_file(datapath): + filename = datapath("io", "data", "rdata", "ghg_df.rds") + r_dfs = read_rdata(filename) + + tm.assert_frame_equal(ghg_df, r_dfs["r_dataframe"].tail()) + + +def test_read_rda_file(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + r_dfs = read_rdata(filename) + + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_read_rds_filelike(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + r_dfs = read_rdata(f, file_format="rds") + + tm.assert_frame_equal(sea_ice_df, r_dfs["r_dataframe"].tail()) + + +def test_read_rda_filelike(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + r_dfs = read_rdata(f, file_format="rda") + + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +def test_bytesio_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_dfs = read_rdata(b_io, file_format="rds") + + tm.assert_frame_equal(sea_ice_df, r_dfs["r_dataframe"].tail()) + + +def 
test_bytesio_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with open(filename, "rb") as f: + with BytesIO(f.read()) as b_io: + r_dfs = read_rdata(b_io, file_format="rda") + + assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"] + + tm.assert_frame_equal(ghg_df, r_dfs["ghg_df"].tail()) + tm.assert_frame_equal(plants_df, r_dfs["plants_df"].tail()) + tm.assert_frame_equal(sea_ice_df, r_dfs["sea_ice_df"].tail()) + + +# FILE FORMAT + + +def test_read_wrong_format(datapath): + with pytest.raises(ValueError, match="not a valid value for file_format"): + filename = datapath("io", "data", "rdata", "plants_df.rds") + read_rdata(filename, file_format="r") + + +def test_read_wrong_file(): + with pytest.raises(FileNotFoundError, match="file cannot be found"): + filename = os.path.join("data", "rdata", "plants_df.rda") + read_rdata(filename) + + +def test_read_rds_non_df(datapath): + with pytest.raises( + LibrdataReaderError, + match="Invalid file, or file has unsupported features", + ): + filename = datapath("io", "data", "rdata", "ppm_ts.rds") + read_rdata(filename) + + +def test_read_rda_non_dfs(datapath): + with pytest.raises( + LibrdataReaderError, + match="Invalid file, or file has unsupported features", + ): + filename = datapath("io", "data", "rdata", "env_data_non_dfs.rda") + read_rdata(filename) + + +def test_read_not_rda_file(datapath): + with pytest.raises( + LibrdataReaderError, match="The file contains an unrecognized object" + ): + filename = datapath("io", "data", "rdata", "ppm_df.csv") + read_rdata(filename, file_format="rda", compression=None) + + +def test_bytes_read_infer_rds(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f) + + +def test_bytes_read_infer_rda(datapath): + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + + with 
pytest.raises(ValueError, match="Unable to infer file format from file name"): + with open(filename, "rb") as f: + read_rdata(f) + + +# URL + + +@tm.network +def test_read_rda_url(): + url_df = DataFrame( + { + "carrier": {1: "9E", 2: "AA", 3: "AS", 4: "B6", 5: "DL"}, + "name": { + 1: "Endeavor Air Inc.", + 2: "American Airlines Inc.", + 3: "Alaska Airlines Inc.", + 4: "JetBlue Airways", + 5: "Delta Air Lines Inc.", + }, + } + ).rename_axis("rownames") + + url = ( + "https://github.com/hadley/nycflights13/blob/master/data/airlines.rda?raw=true" + ) + r_dfs = read_rdata(url, file_format="rda") + + tm.assert_frame_equal(url_df, r_dfs["airlines"].head()) + + +@tm.network +def test_read_unable_infer_format(): + with pytest.raises(ValueError, match="Unable to infer file format from file name"): + url = ( + "https://github.com/hadley/nycflights13/" + "blob/master/data/airlines.rda?raw=true" + ) + read_rdata(url) + + +@tm.network +def test_read_wrong_url(): + with pytest.raises(HTTPError, match="HTTP Error 404: Not Found"): + url = "https://example.com/data.rdata" + read_rdata(url) + + +# S3 + + +@pytest.mark.slow +@tm.network +@td.skip_if_no("s3fs") +def test_read_rda_s3(): + # Public Data of CRAN Packages on GitHub + rda_s3 = "s3://public-r-data/ghcran.Rdata" + r_df = read_rdata(rda_s3, compression=None, rownames=False) + + # below needed to pass codespell on keyword + r_df["ghcran"].columns.values[107] = "Repository" + + # test structure and not static data since data changes daily + expected_cols = [ + "Package", + "Type", + "Title", + "Version", + "Date", + "Author", + "Maintainer", + "Description", + "License", + "Depends", + "Suggests", + "NeedsCompilation", + "Packaged", + "Repository", + "Date/Publication", + "Contact", + "Imports", + "VignetteBuilder", + "Encoding", + "SystemRequirements", + "RoxygenNote", + "LazyLoad", + "URL", + "Authors@R", + "Classification/ACM", + "Classification/JEL", + "LinkingTo", + "BugReports", + "LazyData", + "Keywords", + 
"Repository/R-Forge/Project", + "Repository/R-Forge/Revision", + "Repository/R-Forge/DateTimeStamp", + "biocViews", + "Collate", + "Copyright", + "ByteCompile", + "ZipData", + "BuildVignettes", + "Additional_repositories", + "Acknowledgements", + "MailingList", + "Enhances", + "Classification/MSC", + "OS_type", + "BuildManual", + "BuildResaveData", + "References", + "Note", + "X-CRAN-Original-Maintainer", + "RcppModules", + "Data", + "BioViews", + "lazy-loading", + "URLNote", + "Reference", + "KeepSource", + "LazyDataCompression", + "Language", + "Requires", + "Dependencies", + "X-CRAN-Comment", + "Citation", + "Biarch", + "Published", + "RequiredLauncherGeneration", + "SuggestsNote", + "Priority", + "Acknowledgments", + "Revision", + "License_is_FOSS", + "License_restricts_use", + "Archs", + "LazyDataNote", + "Affiliations", + "LicenseDetails", + "SCM", + "Classification/ACM-2012", + "X-CRAN-Original-Package", + "Dialect", + "Limitations", + "Check", + "Recommends", + "LastChangedDate", + "LastChangedRevision", + "SVNRevision", + "X-CRAN-Original-OS_type", + "RcmdrModels", + "Log-Exceptions", + "Models", + "DateNote", + "SystemRequirementsNote", + "Url", + "Reverse depends", + "Lazyload", + "DependsNote", + "VersionSplus", + "MaintainerSplus", + "VersionNote", + "Disclaimer", + "LicenseNote", + "Namespace", + "Address", + "Keyword", + "Contributors", + "NOTE", + "Acknowledgement", + "Repository", + "Lazydata", + "RdMacros", + "HowToCite", + "Publication", + "Reference Manual", + "Special Acknowledgement", + "SysDataCompression", + "DisplayMode", + "Nickname", + "BuildKeepEmpty", + "Twitter", + "Remotes", + "SystemRequirement", + "Github", + ] + + assert isinstance(r_df, dict) + assert isinstance(r_df["ghcran"], DataFrame) + assert r_df["ghcran"].columns.tolist() == expected_cols + + +# TYPE + + +def test_read_rds_df_output(datapath): + filename = datapath("io", "data", "rdata", "sea_ice_df.rds") + r_dfs = read_rdata(filename) + + assert isinstance(r_dfs, dict) + 
assert list(r_dfs.keys()) == ["r_dataframe"]
+
+
+def test_read_rda_dict_output(datapath):
+    """An .rda file is returned as a dict keyed by the stored object names."""
+    filename = datapath("io", "data", "rdata", "env_data_dfs.rda")
+    r_dfs = read_rdata(filename)
+
+    assert isinstance(r_dfs, dict)
+    assert list(r_dfs.keys()) == ["ghg_df", "plants_df", "sea_ice_df"]
+
+
+# SELECT_FRAMES
+
+
+def test_read_select_frames_rda_dfs(datapath):
+    """``select_frames`` limits the result to the requested object names."""
+    filename = datapath("io", "data", "rdata", "env_data_dfs.rda")
+    r_dfs = read_rdata(filename, select_frames=["ghg_df", "sea_ice_df"])
+
+    # dict membership tests keys directly; no intermediate list is needed
+    assert "plants_df" not in r_dfs
+    assert "ghg_df" in r_dfs
+    assert "sea_ice_df" in r_dfs
+
+
+def test_read_wrong_select_frames(datapath):
+    """Passing a bare string as ``select_frames`` raises TypeError."""
+    # resolve the path first so only the read itself can satisfy the raises block
+    filename = datapath("io", "data", "rdata", "env_data_dfs.rda")
+    with pytest.raises(TypeError, match="not a valid type for select_frames"):
+        read_rdata(filename, select_frames="plants_df")
+
+
+# ROWNAMES
+
+
+def test_read_rownames_true_rds(datapath):
+    """``rownames=True`` names the index ``rownames`` for an .rds file."""
+    filename = datapath("io", "data", "rdata", "sea_ice_df.rds")
+    r_df = read_rdata(filename, rownames=True)["r_dataframe"]
+
+    # assert the type instead of guarding on it, so the name check always runs
+    assert isinstance(r_df, DataFrame)
+    assert r_df.index.name == "rownames"
+
+
+def test_read_rownames_false_rds(datapath):
+    """``rownames=False`` does not name the index ``rownames`` for an .rds file."""
+    filename = datapath("io", "data", "rdata", "sea_ice_df.rds")
+    r_df = read_rdata(filename, rownames=False)["r_dataframe"]
+
+    # assert the type instead of guarding on it, so the name check always runs
+    assert isinstance(r_df, DataFrame)
+    assert r_df.index.name != "rownames"
+
+
+def test_read_rownames_true_rda(datapath):
+    """``rownames=True`` names the index ``rownames`` on every .rda frame."""
+    filename = datapath("io", "data", "rdata", "env_data_dfs.rda")
+    r_dfs = read_rdata(filename, rownames=True)
+
+    assert r_dfs["ghg_df"].index.name == "rownames"
+    assert r_dfs["plants_df"].index.name == "rownames"
+    assert r_dfs["sea_ice_df"].index.name == "rownames"
+
+
+def test_read_rownames_false_rda(datapath):
+    """``rownames=False`` does not name the index ``rownames`` on any .rda frame."""
+    filename = datapath("io", "data", "rdata", "env_data_dfs.rda")
+    r_dfs = read_rdata(filename, rownames=False)
+
+    assert r_dfs["ghg_df"].index.name != "rownames"
+    assert r_dfs["plants_df"].index.name != "rownames"
+    assert
r_dfs["sea_ice_df"].index.name != "rownames" + + +# ENCODING + + +def test_non_utf8_data(datapath, rtype): + filename = datapath("io", "data", "rdata", f"climate_non_utf8_df.{rtype}") + with pytest.raises(SystemError, match=("returned a result with an error set")): + read_rdata(filename) + + +# DATE / TIME + + +def test_utc_datetime_convert(datapath): + filename = datapath("io", "data", "rdata", "ppm_df.rda") + r_dfs = read_rdata(filename) + + assert str(r_dfs["ppm_df"]["date"].dtype) == "datetime64[ns]" + + tm.assert_frame_equal(ppm_df, r_dfs["ppm_df"].tail()) + + +def test_read_outbound_dates(datapath, rtype): + filename = datapath("io", "data", "rdata", f"planetary_boundaries_df.{rtype}") + with pytest.raises( + OutOfBoundsDatetime, match=("cannot convert input with unit 's'") + ): + read_rdata(filename) + + +# RDATA WRITER + +# PATH_OR_BUFFER + + +def test_write_read_file(rtype): + with tm.ensure_clean("test.out") as path: + ghg_df.to_rdata(path, file_format=rtype, index=False) + r_dfs = read_rdata(path, file_format=rtype, rownames=False) + + expected = ghg_df.reset_index(drop=True) + output = r_dfs["pandas_dataframe"] if rtype == "rda" else r_dfs["r_dataframe"] + + tm.assert_frame_equal(output, expected) + + +def test_write_read_pathlib(rtype): + from pathlib import Path + + with tm.ensure_clean_dir() as tmp_dir: + tmp_file = Path(tmp_dir).joinpath("test.out") + sea_ice_df.to_rdata(tmp_file, file_format=rtype, index=False) + r_dfs = read_rdata(tmp_file, file_format=rtype, rownames=False) + + expected = sea_ice_df.reset_index(drop=True) + output = r_dfs["pandas_dataframe"] if rtype == "rda" else r_dfs["r_dataframe"] + + tm.assert_frame_equal(output, expected) + + +def test_write_read_filelike(rtype): + with BytesIO() as b_io: + sea_ice_df.to_rdata(b_io, file_format=rtype, compression=None, index=False) + r_dfs = read_rdata( + b_io.getvalue(), + file_format=rtype, + rownames=False, + compression=None, + ) + + expected = sea_ice_df.reset_index(drop=True) + output 
= r_dfs["pandas_dataframe"] if rtype == "rda" else r_dfs["r_dataframe"]
+
+        tm.assert_frame_equal(output, expected)
+
+
+# FILE FORMAT
+
+
+def test_write_wrong_format():
+    """An unsupported ``file_format`` value raises ValueError on write."""
+    with tm.ensure_clean("test.rda") as path:
+        with pytest.raises(ValueError, match=("not a valid value for file_format")):
+            ghg_df.to_rdata(path, file_format="csv")
+
+
+def test_write_unable_to_infer():
+    """Writing to a path without an extension and no ``file_format`` raises."""
+    with tm.ensure_clean("test") as path:
+        with pytest.raises(
+            ValueError, match=("Unable to infer file format from file name")
+        ):
+            ghg_df.to_rdata(path)
+
+
+# INDEX
+
+
+def test_write_index_true(rtype):
+    """``index=True`` writes the index out as an ``index`` column."""
+    with tm.ensure_clean("test.out") as path:
+        plants_df.rename_axis(None).to_rdata(path, file_format=rtype, index=True)
+        r_dfs = read_rdata(path, file_format=rtype)
+
+        # read_rdata returns a dict for both formats; an .rds frame is stored
+        # under "r_dataframe". Using the dict itself made the check below vacuous.
+        r_df = r_dfs["r_dataframe"] if rtype == "rds" else r_dfs["pandas_dataframe"]
+
+        assert isinstance(r_df, DataFrame)
+        assert "index" in r_df.columns
+
+
+def test_write_index_false(rtype):
+    """``index=False`` does not write an ``index`` column."""
+    with tm.ensure_clean("test.out") as path:
+        plants_df.rename_axis(None).to_rdata(path, file_format=rtype, index=False)
+        r_dfs = read_rdata(path, file_format=rtype)
+
+        # read_rdata returns a dict for both formats; an .rds frame is stored
+        # under "r_dataframe". Using the dict itself made the check below vacuous.
+        r_df = r_dfs["r_dataframe"] if rtype == "rds" else r_dfs["pandas_dataframe"]
+
+        assert isinstance(r_df, DataFrame)
+        assert "index" not in r_df.columns
+
+
+# COMPRESSION
+
+
+def test_write_all_compression(rtype, comp):
+    """A frame round-trips unchanged when written and read with ``comp``."""
+    with tm.ensure_clean("test.out") as path:
+        ghg_df.to_rdata(path, file_format=rtype, compression=comp, index=False)
+        r_dfs = read_rdata(path, file_format=rtype, compression=comp, rownames=False)
+
+        expected = ghg_df.reset_index(drop=True)
+        output = r_dfs["pandas_dataframe"] if rtype == "rda" else r_dfs["r_dataframe"]
+
+        tm.assert_frame_equal(output, expected)
+
+
+def test_write_zip_compression(rtype):
+    """``compression="zip"`` is rejected with ValueError on write."""
+    with tm.ensure_clean("test.out") as path:
+        with pytest.raises(ValueError, match=("not a supported value for compression")):
+            ghg_df.to_rdata(path, file_format=rtype, compression="zip")
+
+
+@pytest.mark.skipif(
+    not PY38,
+    reason=("gzip.BadGzipFile exception added in 3.8"),
+)
+def test_write_read_mismatched_compression(rtype):
+    """Reading a ``compression=None`` file without ``compression`` raises."""
+    with tm.ensure_clean("test.out") as path:
+        # write outside the raises block so only the read can satisfy it
+        ghg_df.to_rdata(path, file_format=rtype, compression=None)
+
+        with pytest.raises(gzip.BadGzipFile, match=("Not a gzipped file")):
+            read_rdata(path, file_format=rtype)
+
+
+# RDA_NAMES
+
+
+def test_write_new_rda_name():
+    """``rda_name`` sets the object name under which the frame is stored."""
+    with tm.ensure_clean("test.rda") as path:
+        ghg_df.to_rdata(path, rda_name="py_df")
+        r_dfs = read_rdata(path)
+
+        assert "py_df" in r_dfs
+
+
+# PROBLEM DATA
+
+
+def test_write_nested_list(rtype, comp):
+    """A column holding an array in every cell is rejected by the writer."""
+    # work on a copy: adding the column to the module-level ``plants_df`` would
+    # leak into every other test that uses that fixture
+    nested_df = plants_df.copy()
+    nested_df["plants_dict"] = nested_df["plant_group"].apply(
+        lambda x: plants_df["plant_group"].unique()
+    )
+    with tm.ensure_clean("test") as path:
+        with pytest.raises(
+            LibrdataWriterError,
+            match=("DataFrame contains one more invalid types or data values"),
+        ):
+            nested_df.to_rdata(path, file_format=rtype, compression=comp)
+
+
+# DATE / TIME
+
+
+def test_write_read_utc_dateteime():
+    """Datetimes round-trip through .rda once floored to whole seconds."""
+    with tm.ensure_clean("test.rda") as path:
+        ppm_df.to_rdata(path, index=False)
+        r_dfs = read_rdata(path, rownames=False)
+
+        # reset_index returns a new frame, so flooring its "date" column leaves
+        # the module-level ``ppm_df`` fixture unmodified for other tests
+        expected = ppm_df.reset_index(drop=True)
+        expected["date"] = expected["date"].dt.floor("S")
+
+        tm.assert_frame_equal(expected, r_dfs["pandas_dataframe"])
+
+
+# DTYPES
+
+
+@pytest.mark.skipif(
+    not IS64,
+    reason=("large dtypes not supported in 32-bit"),
+)
+def test_write_read_dtypes(rtype, comp):
+    rda_name = "pandas_dataframe" if rtype == "rda" else "r_dataframe"
+
+    dts = [
+        Timestamp.min.ceil("S"),
+        Timestamp(-(10 ** 18)),
+        Timestamp(0),
+        Timestamp(10 ** 18),
+        Timestamp.now().floor("S"),
+        Timestamp.max.floor("S"),
+    ]
+
+    arr = np.random.randn(6)
+    arr[2:-2] = np.nan
+
+    dtypes_df = DataFrame(
+        {
+            "categ": Categorical(
+                ["ocean", "climate", "biosphere", "land", "freshwater", "atmosphere"]
+            ),
+            "interval": interval_range(start=10, periods=6, freq=10 * 2),
+            "bool": [False, True, True, True, False, False],
+            "int": [2 ** 31 - 1, 1, -(2 ** 31) + 1, -1, 0, 10 ** 9],
+            "float": [0, np.pi, float("nan"), np.e, np.euler_gamma, 0],
+
"string": array( + ["acidification", "change", "loss", "use", "depletion", "aersols"], + dtype="string", + ), + "sparse": SparseArray(arr), + "period": period_range( + start="2021-01-01 00:00:00", end="2021-06-01 00:00:00", freq="M" + ), + "datetime": to_datetime(dts), + "datetime_tz": to_datetime(dts).tz_localize("utc"), + "timedelta": [(dt - Timestamp(0)) for dt in dts], + } + ) + + with tm.ensure_clean("test") as path: + dtypes_df.to_rdata(path, file_format=rtype, index=False, compression=comp) + r_df = read_rdata(path, file_format=rtype, rownames=False, compression=comp)[ + rda_name + ] + + # convert non-primitive and non-datetimes to objects not supported in R + excl_types = ["bool", "number", "object", "datetime", "datetimetz", "timedelta"] + for col in dtypes_df.select_dtypes(exclude=excl_types).columns: + dtypes_df[col] = dtypes_df[col].astype(str) + + # convert special types + dtypes_df["sparse"] = np.array(dtypes_df["sparse"].values, dtype="float64") + dtypes_df["datetime_tz"] = dtypes_df["datetime_tz"].dt.tz_localize(None) + dtypes_df["timedelta"] = dtypes_df["timedelta"].dt.total_seconds() + + tm.assert_frame_equal(dtypes_df, r_df) + + +# CYTHON CLASSES + + +def test_reader_unpickled(datapath, rtype): + if rtype == "rda": + filename = datapath("io", "data", "rdata", "env_data_dfs.rda") + rda_name = "sea_ice_df" + elif rtype == "rds": + filename = datapath("io", "data", "rdata", "plants_df.rds") + rda_name = "r_dataframe" + + lbr1 = LibrdataReader() + + with tm.ensure_clean("test.pkl") as pklpath: + with open(pklpath, "wb") as f_w: + pickle.dump(lbr1, f_w) + + with open(pklpath, "rb") as f_r: + lbr2 = pickle.load(f_r) + + with tm.ensure_clean("test") as r_temp: + # need to decompress to temp file + with gzip.open(filename, "rb") as f_r: + with open(r_temp, "wb") as f_w: + shutil.copyfileobj(f_r, f_w) + + df_output = read_rdata( + r_temp, file_format=rtype, compression=None, rownames=False + )[rda_name].to_dict() + + cy_output = lbr2.read_rdata(r_temp) + 
+ lbr_output = { + vcol: vdata + for (kdata, vdata), (kcol, vcol) in zip( + cy_output[rda_name]["data"].items(), cy_output[rda_name]["colnames"].items() + ) + } + + assert lbr_output == df_output + + +def test_writer_unpickled(datapath, rtype): + rda_name = "test_frame" if rtype == "rda" else "r_dataframe" + + lbw1 = LibrdataWriter() + + with tm.ensure_clean("test.pkl") as pklpath: + with open(pklpath, "wb") as f_w: + pickle.dump(lbw1, f_w) + + with open(pklpath, "rb") as f_r: + lbw2 = pickle.load(f_r) + + rdict = {"dtypes": {k: str(v) for k, v in ghg_df.dtypes.to_dict().items()}} + for k, v in rdict["dtypes"].items(): + if any(x in v for x in ("bool", "Boolean")): + rdict["dtypes"][k] = "bool" + + elif any(x in v for x in ("int", "uint", "Int", "UInt")): + rdict["dtypes"][k] = "int" + + elif any(x in v for x in ("float", "Float")): + rdict["dtypes"][k] = "float" + + elif any(x in v for x in ("datetime", "Datetime")): + rdict["dtypes"][k] = "datetime" + + elif any(x in v for x in ("object", "string", "String")): + rdict["dtypes"][k] = "object" + + rdict["data"] = ghg_df.reset_index(drop=True).to_dict() + + expected = ghg_df.reset_index(drop=True) + + with tm.ensure_clean("test") as r_temp: + lbw2.write_rdata( + rfile=r_temp, + rdict=rdict, + rformat=rtype, + tbl_name="test_frame", + ) + + output = read_rdata( + r_temp, + file_format=rtype, + rownames=False, + compression=None, + )[rda_name] + + tm.assert_frame_equal(output, expected) diff --git a/setup.py b/setup.py index 337719053585c..3d682610a2e46 100755 --- a/setup.py +++ b/setup.py @@ -225,6 +225,7 @@ class CheckSDist(sdist_class): "pandas/_libs/window/indexers.pyx", "pandas/_libs/writers.pyx", "pandas/io/sas/sas.pyx", + "pandas/io/rdata/_rdata.pyx", ] _cpp_pyxfiles = [ @@ -327,6 +328,11 @@ def run(self): extra_compile_args = [] extra_link_args = [] + +rdata_includes = [] +rdata_libs_dir = [] +rdata_libs = [] + if is_platform_windows(): if debugging_symbols_requested: extra_compile_args.append("/Z7") @@ -364,6 
+370,11 @@ def run(self): # https://github.com/pandas-dev/pandas/issues/35559 extra_compile_args.append("-Wno-error=unreachable-code") + # rdata requires system iconv library + rdata_includes = ["/usr/include"] + rdata_libs_dir = ["/usr/lib"] + rdata_libs = ["iconv"] + # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled @@ -640,7 +651,37 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): extensions.append(ujson_ext) # ---------------------------------------------------------------------- +# rdata + +rdata_srcs = [ + "pandas/io/rdata/_rdata.pyx", + "pandas/_libs/src/librdata/rdata_parser.c", + "pandas/_libs/src/librdata/rdata_read.c", + "pandas/_libs/src/librdata/rdata_write.c", + "pandas/_libs/src/librdata/rdata_io_unistd.c", + "pandas/_libs/src/librdata/rdata_error.c", + "pandas/_libs/src/librdata/rdata_bits.c", + "pandas/_libs/src/librdata/CKHashTable.c", +] +if is_platform_windows(): + rdata_srcs.append("pandas/_libs/src/librdata/win_iconv.c") + +rdata_ext = Extension( + name="pandas.io.rdata._rdata", + sources=rdata_srcs, + include_dirs=rdata_includes, + library_dirs=rdata_libs_dir, + libraries=rdata_libs, + language="c", + define_macros=macros, + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, +) + +extensions.append(rdata_ext) + +# ---------------------------------------------------------------------- if __name__ == "__main__": # Freeze to support parallel compilation when using spawn instead of fork