Source code for kumoai.experimental.rfm.backend.local.table
from typing import Sequence, cast
import pandas as pd
from kumoapi.model_plan import MissingType
from kumoai.experimental.rfm.base import (
ColumnSpec,
DataBackend,
SourceColumn,
SourceForeignKey,
Table,
)
[docs]
class LocalTable(Table):
    r"""An in-memory table that wraps a :class:`pandas.DataFrame`.

    A :class:`LocalTable` holds all of the relevant metadata, *i.e.* the
    selected columns, the semantic type of each column, the primary key and
    the time columns.
    :class:`LocalTable` objects are used to create a :class:`Graph`.

    .. code-block:: python

        import pandas as pd
        import kumoai.experimental.rfm as rfm

        # Load data from a CSV file:
        df = pd.read_csv("data.csv")

        # Create a table from a `pandas.DataFrame` and infer its metadata ...
        table = rfm.LocalTable(df, name="my_table").infer_metadata()

        # ... or create a table explicitly:
        table = rfm.LocalTable(
            df=df,
            name="my_table",
            primary_key="id",
            time_column="time",
            end_time_column=None,
        )

        # Verify metadata:
        table.print_metadata()

        # Change the semantic type of a column:
        table[column].stype = "text"

    Args:
        df: The data frame that backs this table.
        name: The name of this table.
        primary_key: The name of the primary key column, if the table has
            one.
        time_column: The name of the time column, if the table has one.
        end_time_column: The name of the end time column, if the table has
            one.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        name: str,
        primary_key: MissingType | str | None = MissingType.VALUE,
        time_column: str | None = None,
        end_time_column: str | None = None,
    ) -> None:
        """Validate ``df`` and initialize the table.

        Raises:
            ValueError: If ``df`` is empty, if its columns form a
                :class:`pandas.MultiIndex`, if column names are duplicated,
                or if any column name is the empty string.
        """
        # The checks run in a fixed order, so the first violated constraint
        # determines the reported error.
        if df.empty:
            raise ValueError("Data frame is empty")

        header = df.columns
        if isinstance(header, pd.MultiIndex):
            raise ValueError("Data frame must not have a multi-index")
        if not header.is_unique:
            raise ValueError("Data frame must have unique column names")
        for label in header:
            if label == '':
                raise ValueError(
                    "Data frame must have non-empty column names")

        # Shallow copy: a new frame object is stored without duplicating the
        # underlying data.
        self._data = df.copy(deep=False)

        # NOTE(review): `_data` is assigned before the base initializer runs,
        # presumably because `Table.__init__` calls the `_get_*` hooks below
        # -- confirm against the base class before reordering.
        super().__init__(
            name=name,
            primary_key=primary_key,
            time_column=time_column,
            end_time_column=end_time_column,
        )

    @property
    def backend(self) -> DataBackend:
        """The backend of this table, which is ``DataBackend.LOCAL``."""
        local_backend = DataBackend.LOCAL
        return cast(DataBackend, local_backend)

    def _get_source_columns(self) -> list[SourceColumn]:
        """Describe every column of the wrapped data frame.

        Each column is reported without a data type, as nullable, and as
        neither a primary key nor a unique key.
        """
        source_columns: list[SourceColumn] = []
        for label in self._data.columns:
            source_columns.append(
                SourceColumn(
                    name=label,
                    dtype=None,
                    is_primary_key=False,
                    is_unique_key=False,
                    is_nullable=True,
                ))
        return source_columns

    def _get_source_foreign_keys(self) -> list[SourceForeignKey]:
        """Return an empty list: no foreign keys are reported here."""
        no_foreign_keys: list[SourceForeignKey] = []
        return no_foreign_keys

    def _get_source_sample_df(self) -> pd.DataFrame:
        """Return the stored data frame itself as the sample."""
        sample = self._data
        return sample

    def _get_expr_sample_df(
        self,
        columns: Sequence[ColumnSpec],
    ) -> pd.DataFrame:
        """Reject column expressions.

        Raises:
            RuntimeError: Always; ``columns`` is never inspected.
        """
        table_kind = self.__class__.__name__
        message = (f"Column expressions are not supported in "
                   f"'{table_kind}'. Please apply your "
                   f"expressions on the `pd.DataFrame` directly.")
        raise RuntimeError(message)

    def _get_num_rows(self) -> int | None:
        """Return the number of rows of the stored data frame."""
        num_rows = len(self._data)
        return num_rows