# Source code for kumoapi.typing
from kumoapi.common import StrEnum
[docs]class Stype(StrEnum):
r"""The semantic type of a column.
A semantic type denotes the semantic meaning of a column, and determines
the preprocessing that is applied to the column. Semantic types can be
passed to methods in the SDK as strings (*e.g.* ``"numerical"``).
.. note::
For more information about how to select a semantic type, please
refer to https://docs.kumo.ai/docs/column-preprocessing.
Attributes:
numerical: A numerical column. Typically integers or floats.
categorical: A categorical column. Typically boolean or string values
typically a single token in length.
multicategorical: A multi-categorical column. Typically a concatenation
of multiple categories under a single string representation.
ID: A column holding IDs. Typically numerical values used to uniquely
identify different entities.
text: A text column. String values typically multiple tokens in length,
where the actual language content of the value has semantic
meaning.
timestamp: A date/time column.
sequence: A column holding sequences/embeddings. Consists of lists of
floats, all of equal length, and are typically the output of
another AI model
image: A column holding image URLs.
"""
numerical = 'numerical'
categorical = 'categorical'
multicategorical = 'multicategorical'
ID = 'ID'
text = 'text'
timestamp = 'timestamp'
sequence = 'sequence'
image = 'image'
unsupported = 'unsupported'
[docs]class Dtype(StrEnum):
r"""The data type of a column.
A data type represents how the data of a column is physically stored. Data
types can be passed to methods in the SDK as strings (*e.g.* ``"int"``).
Attributes:
bool: A boolean column.
int: An integer column.
float: An floating-point column.
date: A column holding a date.
time: A column holding a timestamp.
floatlist: A column holding a list of floating-point values.
intlist: A column holding a list of integers.
binary: A column containing binary data.
"""
bool = 'bool'
int = 'int'
byte = 'byte'
int16 = 'int16'
int32 = 'int32'
int64 = 'int64'
float = 'float'
float32 = 'float32'
float64 = 'float64'
string = 'string'
date = 'date'
time = 'time'
timedelta = 'timedelta'
floatlist = 'floatlist'
intlist = 'intlist'
binary = 'binary'
unsupported = 'unsupported'
class ColStatType(StrEnum):
    r"""Identifiers for per-column statistics.

    Every member's value is the same string as its name. Members are grouped
    below by the kind of column the statistic applies to, as indicated by the
    section comments.
    """
    # Any column:
    COUNT = 'COUNT'
    NUM_NA = 'NUM_NA'
    NA_FRACTION = 'NA_FRACTION'
    INVALID_FRACTION = 'INVALID_FRACTION'
    # Numerical and temporal columns:
    MIN = 'MIN'
    MAX = 'MAX'
    # Numerical columns:
    MEAN = 'MEAN'
    QUANTILES = 'QUANTILES'
    QUANTILE25 = 'QUANTILE25'
    MEDIAN = 'MEDIAN'
    QUANTILE75 = 'QUANTILE75'
    STD = 'STD'
    KURTOSIS = 'KURTOSIS'
    HISTOGRAM = 'HISTOGRAM'
    # Number of irrational entries (which are included in the NA count and
    # treated as NA).
    NUM_IRRATIONAL = 'NUM_IRRATIONAL'
    # Categorical columns:
    # NUM_UNIQUE and NUM_UNIQUE_MULTI count empty strings / NA values as their
    # own category. CATEGORY_COUNTS and MULTI_CATEGORY_COUNTS do not include
    # empty strings / NA values as their own category.
    NUM_UNIQUE = 'NUM_UNIQUE'
    NUM_UNIQUE_MULTI = 'NUM_UNIQUE_MULTI'
    CATEGORY_COUNTS = 'CATEGORY_COUNTS'
    MULTI_CATEGORY_COUNTS = 'MULTI_CATEGORY_COUNTS'
    UNIQUE_FRACTION = 'UNIQUE_FRACTION'
    # The separator to use for the multi-categorical column:
    MULTI_CATEGORIES_SEPARATOR = 'MULTI_CATEGORIES_SEPARATOR'
    # String columns:
    STRING_AVG_LEN = 'STRING_AVG_LEN'
    STRING_MAX_LEN = 'STRING_MAX_LEN'
    STRING_AVG_TOKENS = 'STRING_AVG_TOKENS'
    STRING_MAX_TOKENS = 'STRING_MAX_TOKENS'
    STRING_GLOVE_OVERLAP = 'STRING_GLOVE_OVERLAP'
    STRING_AVG_NON_CHAR = 'STRING_AVG_NON_CHAR'
    STRING_ARR_MIN_LEN = 'STRING_ARR_MIN_LEN'
    STRING_ARR_MAX_LEN = 'STRING_ARR_MAX_LEN'
    # Sequence columns:
    SEQUENCE_MAX_LENGTH = 'SEQUENCE_MAX_LENGTH'
    SEQUENCE_MIN_LENGTH = 'SEQUENCE_MIN_LENGTH'