diff --git a/pymc/data.py b/pymc/data.py
index 7e306f19e3..13393c4924 100644
--- a/pymc/data.py
+++ b/pymc/data.py
@@ -21,7 +21,7 @@
 from typing import cast
 
 import numpy as np
-import pandas as pd
+from numpy.typing import ArrayLike
 import pytensor
 import pytensor.tensor as pt
 import xarray as xr
@@ -204,7 +204,7 @@ def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size:
 
 def determine_coords(
     model,
-    value: pd.DataFrame | pd.Series | xr.DataArray,
+    value: ArrayLike,
     dims: Sequence[str | None] | None = None,
     coords: dict[str, Sequence | np.ndarray] | None = None,
 ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None]]:
@@ -348,7 +348,7 @@ def Data(
     ----------
     name : str
         The name for this variable.
-    value : array_like or pandas.Series, pandas.Dataframe
+    value : array_like or pandas.Series, pandas.DataFrame, polars.DataFrame, polars.Series, xarray.DataArray
         A value to associate with this variable.
     dims : str or tuple of str, optional
         Dimension names of the random variables (as opposed to the shapes of these
diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py
index cc7204c28a..69ed5a25c4 100644
--- a/pymc/pytensorf.py
+++ b/pymc/pytensorf.py
@@ -18,6 +18,10 @@
 
 import numpy as np
 import pandas as pd
+try:
+    import polars as pl
+except ImportError:
+    pl = None  # polars is an optional dependency
 import pytensor
 import pytensor.tensor as pt
 import scipy.sparse as sps
@@ -111,6 +115,18 @@ def convert_data(data) -> np.ndarray | Variable:
             ret = np.ma.MaskedArray(vals, mask)
         else:
             ret = vals
+    elif pl is not None and isinstance(data, (pl.DataFrame, pl.Series)):
+        # polars objects: only Series exposes `is_null`, so a DataFrame mask is built column-wise
+        vals = data.to_numpy()
+        if isinstance(data, pl.DataFrame):
+            mask = data.select(pl.all().is_null()).to_numpy()
+        else:
+            mask = data.is_null().to_numpy()
+        if mask.any():
+            # there are missing values
+            ret = np.ma.MaskedArray(vals, mask)
+        else:
+            ret = vals
     elif isinstance(data, np.ndarray):
         if isinstance(data, np.ma.MaskedArray):
             if not data.mask.any():
@@ -141,11 +157,17 @@ def convert_data(data) -> np.ndarray | Variable:
     # Otherwise we only convert the precision.
     return smarttypeX(ret)
 
 
 @_as_tensor_variable.register(pd.Series)
 @_as_tensor_variable.register(pd.DataFrame)
 def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
     return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+
+
+if pl is not None:
+    # Reuse the pandas converter for polars objects, which also expose `to_numpy()`.
+    _as_tensor_variable.register(pl.Series, dataframe_to_tensor_variable)
+    _as_tensor_variable.register(pl.DataFrame, dataframe_to_tensor_variable)
 
 
 def extract_obs_data(x: TensorVariable) -> np.ndarray:
diff --git a/tests/test_data.py b/tests/test_data.py
index c8472359f1..2d3b3dd7fd 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -404,6 +404,22 @@ def test_implicit_coords_dataframe(self, seeded_test):
         assert "columns" in pmodel.coords
         assert pmodel.named_vars_to_dims == {"observations": ("rows", "columns")}
 
+    def test_implicit_coords_polars_dataframe(self, seeded_test):
+        """A polars DataFrame passed to pm.Data registers the requested dims and coords."""
+        pl = pytest.importorskip("polars")
+        N_rows = 5
+        N_cols = 7
+        df_data = pl.DataFrame({f"Column {c+1}": np.random.normal(size=(N_rows,)) for c in range(N_cols)})
+        df_data = df_data.with_row_index("rows")
+
+        # infer coordinates from index and columns of the DataFrame
+        with pm.Model() as pmodel:
+            pm.Data("observations", df_data, dims=("rows", "columns"), infer_dims_and_coords=True)
+
+        assert "rows" in pmodel.coords
+        assert "columns" in pmodel.coords
+        assert pmodel.named_vars_to_dims == {"observations": ("rows", "columns")}
+
     def test_implicit_coords_xarray(self):
         xr = pytest.importorskip("xarray")
         data = xr.DataArray([[1, 2, 3], [4, 5, 6]], dims=("y", "x"))
diff --git a/tests/test_pytensorf.py b/tests/test_pytensorf.py
index e8881451bf..709f2728a9 100644
--- a/tests/test_pytensorf.py
+++ b/tests/test_pytensorf.py
@@ -66,6 +66,20 @@ def test_pd_dataframe_as_tensor_variable(np_array: np.ndarray) -> None:
     df = pd.DataFrame(np_array)
     np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array)
 
+
+@pytest.mark.parametrize(
+    argnames="np_array",
+    argvalues=[
+        np.array([[1.0], [2.0], [-1.0]]),
+        np.array([[1.0, 1.0, 1.0], [0.0, 0.0, 0.0]]),
+        np.ones(shape=(10, 1)),
+    ],
+)
+def test_polars_dataframe_as_tensor_variable(np_array: np.ndarray) -> None:
+    pl = pytest.importorskip("polars")
+    df = pl.DataFrame(np_array)
+    np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array)
+
 
 @pytest.mark.parametrize(
     argnames="np_array",