Source code for kumoapi.typing
import builtins
from typing import TYPE_CHECKING
import pydantic
from kumoapi.common import StrEnum
# ``True`` when the major component of the installed pydantic version string
# is 2 or higher.
WITH_PYDANTIC_V2 = int(pydantic.__version__.partition('.')[0]) >= 2
if TYPE_CHECKING:
    # Static type checkers always resolve the name to the real pandas class.
    from pandas import DateOffset
else:
    try:
        from pandas import DateOffset
    except ImportError:

        class DateOffset:
            """Stand-in bound to ``DateOffset`` when pandas cannot be imported.

            Importing this module still succeeds without pandas; the error is
            raised only once something tries to construct an offset.
            """
            def __init__(self, *args, **kwargs) -> None:
                raise ModuleNotFoundError("No module named 'pandas'")
[docs]class Stype(StrEnum):
r"""The semantic type of a column.
A semantic type denotes the semantic meaning of a column, and determines
the preprocessing that is applied to the column. Semantic types can be
passed to methods in the SDK as strings (*e.g.* ``"numerical"``).
.. note::
For more information about how to select a semantic type, please
refer to https://docs.kumo.ai/docs/column-preprocessing.
Attributes:
numerical: A numerical column. Typically integers or floats.
categorical: A categorical column. Typically boolean or string values
typically a single token in length.
multicategorical: A multi-categorical column. Typically a concatenation
of multiple categories under a single string representation.
ID: A column holding IDs. Typically numerical values used to uniquely
identify different entities.
text: A text column. String values typically multiple tokens in length,
where the actual language content of the value has semantic
meaning.
timestamp: A date/time column.
sequence: A column holding sequences/embeddings. Consists of lists of
floats, all of equal length, and are typically the output of
another AI model
image: A column holding image URLs.
"""
numerical = 'numerical'
categorical = 'categorical'
multicategorical = 'multicategorical'
ID = 'ID'
text = 'text'
timestamp = 'timestamp'
sequence = 'sequence'
image = 'image'
unsupported = 'unsupported'
[docs] def to_parent_stype(self) -> 'Stype':
r"""Convert the semantic type to its parent type.
Most semantic types are their own parent type. However, ``ID`` is
converted to ``categorical`` because it is a special case.
"""
return self if self != Stype.ID else Stype.categorical
[docs] def supports_dtype(self, dtype: 'Dtype') -> bool:
r"""Whether a :class:`Stype` supports a :class:`Dtype`."""
if self == Stype.numerical:
return dtype.is_numerical()
if self == Stype.categorical:
return dtype.is_bool() or dtype.is_numerical() or dtype.is_string()
if self == Stype.multicategorical:
return dtype.is_string() or dtype.is_list()
if self == Stype.ID:
return dtype.is_int() or dtype.is_string() or dtype.is_float()
if self == Stype.text:
return dtype in {Dtype.string}
if self == Stype.timestamp:
return dtype.is_maybe_timestamp()
if self == Stype.sequence:
return dtype in {
Dtype.floatlist,
Dtype.intlist,
Dtype.string,
}
if self == Stype.image:
return dtype in {Dtype.string}
assert self == Stype.unsupported
return True
[docs]class Dtype(StrEnum):
r"""The data type of a column.
A data type represents how the data of a column is physically stored. Data
types can be passed to methods in the SDK as strings (*e.g.* ``"int"``).
Attributes:
bool: A boolean column.
int: An integer column.
float: An floating-point column.
date: A column holding a date.
time: A column holding a timestamp.
floatlist: A column holding a list of floating-point values.
intlist: A column holding a list of integers.
binary: A column containing binary data.
stringlist: A column containing list of strings.
"""
# Booleans:
bool = 'bool'
# Integers:
int = 'int'
byte = 'byte'
int16 = 'int16'
int32 = 'int32'
int64 = 'int64'
# Floating point numbers:
float = 'float'
float32 = 'float32'
float64 = 'float64'
# Strings:
string = 'string'
binary = 'binary'
# Time:
date = 'date'
time = 'time'
timedelta = 'timedelta'
# Nested lists:
floatlist = 'floatlist'
intlist = 'intlist'
stringlist = 'stringlist'
# Unsupported:
unsupported = 'unsupported'
[docs] def is_bool(self) -> builtins.bool:
r"""Whether the :class:`Dtype` holds booleans."""
return self in {Dtype.bool}
[docs] def is_int(self) -> builtins.bool:
r"""Whether the :class:`Dtype` holds integers."""
return self in {
Dtype.int, Dtype.byte, Dtype.int16, Dtype.int32, Dtype.int64
}
[docs] def is_float(self) -> builtins.bool:
r"""Whether the :class:`Dtype` holds floating point numbers."""
return self in {Dtype.float, Dtype.float32, Dtype.float64}
[docs] def is_numerical(self) -> builtins.bool:
r"""Whether the :class:`Dtype` holds numbers."""
return self.is_int() or self.is_float() or self == Dtype.timedelta
[docs] def is_string(self) -> builtins.bool:
r"""Whether the :class:`Dtype` holds strings."""
return self in {Dtype.string, Dtype.binary}
[docs] def is_timestamp(self) -> builtins.bool:
r"""Whether the :class:`Dtype` holds timestamps."""
return self in {Dtype.date, Dtype.time}
[docs] def is_maybe_timestamp(self) -> builtins.bool:
r"""Whether the :class:`Dtype` holds castable timestamps."""
return self.is_timestamp() or self in {Dtype.string}
[docs] def is_list(self) -> builtins.bool:
r"""Whether the :class:`Dtype` holds nested lists."""
return self in {Dtype.floatlist, Dtype.intlist, Dtype.stringlist}
[docs] def is_unsupported(self) -> builtins.bool:
r"""Whether the :class:`Dtype` holds unsupported types."""
return self in {Dtype.unsupported}
@property
def default_stype(self) -> Stype:
r"""Returns the default semantic type of this data type."""
if self.is_bool():
return Stype.categorical
if self.is_numerical():
return Stype.numerical
if self == Dtype.binary:
return Stype.categorical
if self == Dtype.string:
return Stype.text
if self.is_timestamp():
return Stype.timestamp
if self in {Dtype.stringlist}:
return Stype.multicategorical
if self in {Dtype.floatlist, Dtype.intlist}:
return Stype.sequence
assert self == Dtype.unsupported
return Stype.unsupported
class ColStatType(StrEnum):
# Any:
COUNT = 'COUNT'
NUM_NA = 'NUM_NA'
NA_FRACTION = 'NA_FRACTION'
INVALID_FRACTION = 'INVALID_FRACTION'
# Numerical, Temporal
MIN = 'MIN'
MAX = 'MAX'
# Numerical:
MEAN = 'MEAN'
QUANTILES = 'QUANTILES'
QUANTILE25 = 'QUANTILE25'
MEDIAN = 'MEDIAN'
QUANTILE75 = 'QUANTILE75'
STD = 'STD'
KURTOSIS = 'KURTOSIS'
HISTOGRAM = 'HISTOGRAM'
# num irrational entries (which are included in NA count and treated as NA)
NUM_IRRATIONAL = 'NUM_IRRATIONAL'
# Categorical:
# NUM_UNIQUE and NUM_UNIQUE_MULTI count empty strings / NA values as their
# own category. CATEGORY_COUNTS and MULTI_CATEGORY_COUNTS do not include
# empty strings / NA values as their own category.
NUM_UNIQUE = 'NUM_UNIQUE'
NUM_UNIQUE_MULTI = 'NUM_UNIQUE_MULTI'
CATEGORY_COUNTS = 'CATEGORY_COUNTS'
MULTI_CATEGORY_COUNTS = 'MULTI_CATEGORY_COUNTS'
UNIQUE_FRACTION = 'UNIQUE_FRACTION'
# The separator to use for the multi-categorical column:
MULTI_CATEGORIES_SEPARATOR = 'MULTI_CATEGORIES_SEPARATOR'
# Strings:
STRING_AVG_LEN = 'STRING_AVG_LEN'
STRING_MAX_LEN = 'STRING_MAX_LEN'
STRING_AVG_TOKENS = 'STRING_AVG_TOKENS'
STRING_MAX_TOKENS = 'STRING_MAX_TOKENS'
STRING_GLOVE_OVERLAP = 'STRING_GLOVE_OVERLAP'
STRING_AVG_NON_CHAR = 'STRING_AVG_NON_CHAR'
STRING_ARR_MIN_LEN = 'STRING_ARR_MIN_LEN'
STRING_ARR_MAX_LEN = 'STRING_ARR_MAX_LEN'
# Sequence:
SEQUENCE_MAX_LENGTH = 'SEQUENCE_MAX_LENGTH'
SEQUENCE_MIN_LENGTH = 'SEQUENCE_MIN_LENGTH'
SEQUENCE_MEAN = 'SEQUENCE_MEAN'
SEQUENCE_STD = 'SEQUENCE_STD'
class TimeUnit(StrEnum):
r"""Defines the unit of a time."""
SECONDS = 'seconds'
MINUTES = 'minutes'
HOURS = 'hours'
DAYS = 'days'
WEEKS = 'weeks'
MONTHS = 'months'
def to_offset(self) -> DateOffset:
return DateOffset(**{self: 1})