Source code for kumoapi.encoder

# flake8: noqa

import warnings
from abc import ABC, abstractmethod
from dataclasses import field, fields
from typing import Any, Dict, Literal, Optional, Set, Union, get_args

from pydantic import PositiveInt
from pydantic.dataclasses import dataclass

from kumoapi.common import StrEnum
from kumoapi.typing import ColStatType, Stype

# Silence any warning whose message matches "fields may not start with an
# underscore": the encoder dataclasses in this module deliberately declare
# underscore-prefixed fields (`_stats`, `_target_`).
# NOTE(review): presumably this warning is emitted by `pydantic` - confirm.
warnings.filterwarnings('ignore', "fields may not start with an underscore")


class NAStrategy(StrEnum):
    r"""Strategies supported by Kumo for imputing null (missing) values."""

    #: Fill missing values with zeros.
    ZERO = 'zero'
    #: Fill missing values with the mean.
    MEAN = 'mean'
    #: Regard missing values as a separate category.
    SEPARATE = 'separate'
    #: Fill missing values with the most frequent value.
    MOST_FREQUENT = 'most_frequent'
    #: Kept for backward compatibility only. Do not use.
    RAISE = 'raise'

    def __repr__(self) -> str:
        # Show the plain string value instead of the default enum repr.
        return self.value
class Scaler(StrEnum):
    r"""Strategies supported by Kumo for scaling numerical values."""

    #: Z-score normalization.
    #: Equivalent to `StandardScaler <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html>`_.
    STANDARD = 'standard'

    #: Subtract the minimum value and divide by the range.
    #: Equivalent to `MinMaxScaler <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html>`_.
    MINMAX = 'minmax'

    #: Subtract the median and divide by the range between the first and
    #: third quartiles. Equivalent to `RobustScaler <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html>`_.
    ROBUST = 'robust'

    def __repr__(self) -> str:
        # Show the plain string value instead of the default enum repr.
        return self.value
@dataclass
class Encoder(ABC):
    r"""Abstract base class of the encoder configurations in this module.

    Subclasses must implement the :obj:`supported_stypes` and
    :obj:`required_stats` properties.
    """
    def __post_init__(self) -> None:
        r"""Normalizes :obj:`na_strategy` and validates :obj:`_target_`.

        Raises:
            ValueError: If :obj:`_target_` is set to a value that is not
                listed in the type arguments of the field's annotation.
        """
        # Coerce `na_strategy` (when the subclass defines it) into the enum.
        if hasattr(self, 'na_strategy'):
            self.na_strategy = NAStrategy(self.na_strategy)

        # Let `pydantic` break on invalid `_target_` names. Needed because
        # `pydantic` doesn't check for type-safety in underscore attributes.
        target = getattr(self, '_target_', None)
        if target is None:
            return

        target_field = [
            fld for fld in fields(self.__class__) if fld.name == '_target_'
        ][0]
        allowed_targets = get_args(target_field.type)
        if target not in allowed_targets:
            cls_name = self.__class__.__name__
            raise ValueError(
                f"Unsupported `_target_={target}` for '{cls_name}' encoder")

    @property
    @abstractmethod
    def supported_stypes(self) -> Set[Stype]:
        r"""The set of semantic types handled by this encoder."""
        pass

    @property
    @abstractmethod
    def required_stats(self) -> Set[ColStatType]:
        r"""The set of column statistics needed by this encoder."""
        pass
@dataclass
class Null(Encoder):
    r"""The :class:`Null` encoder leaves its corresponding column
    un-encoded, i.e. encoding is skipped for that column."""
    name: Literal['Null'] = field(default='Null', repr=False)

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # Deprecated:
    _target_: Literal['kumo.encoder.encoder.Null'] = field(
        default='kumo.encoder.encoder.Null',
        repr=False,
    )

    @property
    def supported_stypes(self) -> Set[Stype]:
        # Every member of `Stype` is accepted.
        every_stype = set(Stype)
        return every_stype

    @property
    def required_stats(self) -> Set[ColStatType]:
        # No column statistics are requested.
        return set()
@dataclass
class Numerical(Encoder):
    r"""The :class:`Numerical` encoder handles a numerical column: values are
    normalized according to :obj:`scaler`, and null values are imputed
    according to :obj:`na_strategy`."""
    #: The :obj:`~kumoapi.encoder.Scaler` to apply, one of "standard",
    #: "minmax", or "robust".
    scaler: Optional[Scaler] = None

    #: The null value imputation strategy to apply.
    na_strategy: Literal[
        NAStrategy.ZERO,
        NAStrategy.MEAN,
        NAStrategy.RAISE,
    ] = NAStrategy.MEAN

    name: Literal['Numerical'] = field(default='Numerical', repr=False)

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # Deprecated:
    _target_: Literal['kumo.encoder.numerical.Numerical'] = field(
        default='kumo.encoder.numerical.Numerical',
        repr=False,
    )

    @property
    def supported_stypes(self) -> Set[Stype]:
        return {Stype.numerical}

    @property
    def required_stats(self) -> Set[ColStatType]:
        needed = set()

        # Mean imputation needs the column mean.
        if self.na_strategy is NAStrategy.MEAN:
            needed.add(ColStatType.MEAN)

        # Each scaler contributes the statistics it normalizes with.
        if self.scaler is Scaler.STANDARD:
            needed.update((ColStatType.MEAN, ColStatType.STD))
        elif self.scaler is Scaler.MINMAX:
            needed.update((ColStatType.MIN, ColStatType.MAX))
        elif self.scaler is Scaler.ROBUST:
            needed.add(ColStatType.QUANTILES)

        return needed
@dataclass
class MaxLogNumerical(Encoder):
    r"""The :class:`MaxLogNumerical` encoder handles a numerical column by
    first applying the transformation

    .. math::

        \log \left( \frac{\text{feature} - (\text{min} - 1)}{1.0} \right)

    with null values imputed according to :obj:`na_strategy`."""
    #: The null value imputation strategy to apply.
    na_strategy: Literal[
        NAStrategy.ZERO,
        NAStrategy.MEAN,
        NAStrategy.RAISE,
    ] = NAStrategy.MEAN

    name: Literal['MaxLogNumerical'] = field(
        default='MaxLogNumerical',
        repr=False,
    )

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # Deprecated:
    _target_: Literal['kumo.encoder.numerical.MaxLogNumerical'] = field(
        default='kumo.encoder.numerical.MaxLogNumerical',
        repr=False,
    )

    @property
    def supported_stypes(self) -> Set[Stype]:
        return {Stype.numerical}

    @property
    def required_stats(self) -> Set[ColStatType]:
        # The minimum is always requested; the mean only for mean imputation.
        needed = {ColStatType.MIN}
        if self.na_strategy is NAStrategy.MEAN:
            needed.add(ColStatType.MEAN)
        return needed
@dataclass
class MinLogNumerical(Encoder):
    r"""The :class:`MinLogNumerical` encoder handles a numerical column by
    first applying the transformation

    .. math::

        \log \left( \frac{\text{feature} - (\text{max} + 1)}{-1.0} \right)

    with null values imputed according to :obj:`na_strategy`."""
    #: The null value imputation strategy to apply.
    na_strategy: Literal[
        NAStrategy.ZERO,
        NAStrategy.MEAN,
        NAStrategy.RAISE,
    ] = NAStrategy.MEAN

    name: Literal['MinLogNumerical'] = field(
        default='MinLogNumerical',
        repr=False,
    )

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # NOTE(review): the other encoders in this module mark `_target_` as
    # deprecated; presumably the same applies here - confirm.
    _target_: Literal['kumo.encoder.numerical.MinLogNumerical'] = field(
        default='kumo.encoder.numerical.MinLogNumerical',
        repr=False,
    )

    @property
    def supported_stypes(self) -> Set[Stype]:
        return {Stype.numerical}

    @property
    def required_stats(self) -> Set[ColStatType]:
        # The maximum is always requested; the mean only for mean imputation.
        needed = {ColStatType.MAX}
        if self.na_strategy is NAStrategy.MEAN:
            needed.add(ColStatType.MEAN)
        return needed
@dataclass
class Index(Encoder):
    r"""The :class:`Index` encoder handles a categorical column: every unique
    value whose frequency is above :obj:`min_occ` is assigned its own
    embedding of size :obj:`channels` from the model plan, while all values
    below this frequency share one common embedding."""
    #: The minimum frequency of distinct values.
    min_occ: PositiveInt = 1

    #: The null value imputation strategy to apply.
    na_strategy: Literal[
        NAStrategy.ZERO,
        NAStrategy.SEPARATE,
        NAStrategy.MOST_FREQUENT,
        NAStrategy.RAISE,
    ] = NAStrategy.SEPARATE

    name: Literal['Index'] = field(default='Index', repr=False)

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # Deprecated:
    _target_: Literal[
        'kumo.encoder.categorical.Index',
        'kumo.encoder.categorical.OneHot',  # Backward compatibility.
    ] = field(
        default='kumo.encoder.categorical.Index',
        repr=False,
    )

    @property
    def supported_stypes(self) -> Set[Stype]:
        accepted = {Stype.categorical, Stype.ID}
        return accepted

    @property
    def required_stats(self) -> Set[ColStatType]:
        needed = {ColStatType.CATEGORY_COUNTS}
        return needed
@dataclass
class Hash(Encoder):
    r"""The :class:`Hash` encoder handles a categorical column: each value is
    hashed to range :obj:`[0..num_components]`, and the hashed value selects
    the corresponding embedding (with size :obj:`channels` from the model
    plan)."""
    #: The number of distinct categories after hashing.
    num_components: PositiveInt

    #: The null value imputation strategy to apply.
    na_strategy: Literal[
        NAStrategy.SEPARATE,
        NAStrategy.MOST_FREQUENT,
    ] = NAStrategy.SEPARATE

    name: Literal['Hash'] = field(default='Hash', repr=False)

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # Deprecated:
    _target_: Literal['kumo.encoder.categorical.Hash'] = field(
        default='kumo.encoder.categorical.Hash',
        repr=False,
    )

    @property
    def supported_stypes(self) -> Set[Stype]:
        accepted = {Stype.categorical, Stype.ID}
        return accepted

    @property
    def required_stats(self) -> Set[ColStatType]:
        needed = {ColStatType.CATEGORY_COUNTS}
        return needed
@dataclass
class MultiCategorical(Encoder):
    r"""A :class:`MultiCategorical` encoder encodes its corresponding
    multicategorical column by treating each categorical value independently,
    and fusing the results."""
    #: The minimum frequency of distinct values.
    min_occ: PositiveInt = 1

    #: The specified null value imputation strategy.
    na_strategy: Literal[
        NAStrategy.ZERO,
        NAStrategy.SEPARATE,
        NAStrategy.MOST_FREQUENT,
    ] = NAStrategy.ZERO

    name: Literal['MultiCategorical'] = field(
        default='MultiCategorical',
        repr=False,
    )

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # Deprecated:
    _target_: Literal['kumo.encoder.categorical.MultiCategorical'] = field(
        default='kumo.encoder.categorical.MultiCategorical',
        repr=False,
    )

    @property
    def supported_stypes(self) -> Set[Stype]:
        # A property receives the instance, so the parameter is named `self`
        # (it was previously misnamed `cls`).
        return {Stype.multicategorical}

    @property
    def required_stats(self) -> Set[ColStatType]:
        return {
            ColStatType.MULTI_CATEGORY_COUNTS,
            ColStatType.MULTI_CATEGORIES_SEPARATOR,
        }
@dataclass
class GloVe(Encoder):
    r"""A :class:`GloVe` encoder uses embeddings from the `GloVe
    <https://nlp.stanford.edu/projects/glove/>`_ project to embed text in a
    semantically meaningful manner."""
    #: Options for the GloVe model to be used.
    model_name: Literal[
        'glove.test',
        'glove.6B',
        'glove.42B',
        'glove.840B',
        'glove_twitter.27B',
    ] = 'glove.6B'

    #: The embedding dimension. Must correspond to the :obj:`model_name`.
    embedding_dim: int = 50

    na_strategy: Literal[NAStrategy.ZERO] = field(  # No need to show/modify.
        default=NAStrategy.ZERO,
        repr=False,
    )

    name: Literal['GloVe'] = field(default='GloVe', repr=False)

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # Deprecated:
    _target_: Literal['kumo.encoder.sequential.GloVe'] = field(
        default='kumo.encoder.sequential.GloVe',
        repr=False,
    )

    def __post_init__(self) -> None:
        r"""Runs the base-class checks and validates that
        :obj:`embedding_dim` is available for the chosen :obj:`model_name`.

        Raises:
            ValueError: If :obj:`model_name` is not a known GloVe model, or
                if :obj:`embedding_dim` is not supported by that model.
        """
        super().__post_init__()

        # Keys must stay in sync with the `model_name` literal above. The
        # previous fallback branch asserted on 'glove.twitter.27B', which the
        # literal does not allow, so 'glove_twitter.27B' could never pass.
        valid_dims_by_model = {
            'glove.test': {10},
            'glove.6B': {50, 100, 200, 300},
            'glove.42B': {300},
            'glove.840B': {300},
            'glove_twitter.27B': {25, 50, 100, 200},
        }

        valid_embedding_dims = valid_dims_by_model.get(self.model_name)
        if valid_embedding_dims is None:
            # Raise explicitly rather than `assert`, which is stripped by -O.
            raise ValueError(f"Unsupported GloVe model '{self.model_name}' "
                             f"(expected one of "
                             f"{list(valid_dims_by_model)})")

        if self.embedding_dim not in valid_embedding_dims:
            raise ValueError(f"GloVe model '{self.model_name}' only supports "
                             f"embedding dimensions {valid_embedding_dims} "
                             f"(got {self.embedding_dim})")

    @property
    def supported_stypes(self) -> Set[Stype]:
        return {Stype.text}

    @property
    def required_stats(self) -> Set[ColStatType]:
        # No column statistics are requested.
        return set()
@dataclass
class NumericalList(Encoder):
    r"""The :class:`NumericalList` encoder handles numerical sequences: the
    sequences are used as input features as-is, without any applied
    transformations."""
    na_strategy: Literal[NAStrategy.ZERO] = field(  # No need to show/modify.
        default=NAStrategy.ZERO,
        repr=False,
    )

    name: Literal['NumericalList'] = field(default='NumericalList', repr=False)

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # Deprecated:
    _target_: Literal['kumo.encoder.numerical.NumericalList'] = field(
        default='kumo.encoder.numerical.NumericalList',
        repr=False,
    )

    @property
    def supported_stypes(self) -> Set[Stype]:
        accepted = {Stype.sequence}
        return accepted

    @property
    def required_stats(self) -> Set[ColStatType]:
        # Both the minimum and the maximum sequence length are requested.
        needed = {
            ColStatType.SEQUENCE_MIN_LENGTH,
            ColStatType.SEQUENCE_MAX_LENGTH,
        }
        return needed
@dataclass(repr=False)
class Datetime(Encoder):
    r"""The :class:`Datetime` encoder handles a date or time value and
    represents it at the granularities selected by the user."""
    #: Whether to include minute-granularity features.
    include_minute: bool = True
    #: Whether to include hour-granularity features.
    include_hour: bool = True
    #: Whether to include week-granularity features.
    include_day_of_week: bool = True
    #: Whether to include month-granularity features.
    include_day_of_month: bool = True
    #: Whether to include day-of-year-granularity features.
    include_day_of_year: bool = True
    #: Whether to include year-granularity features.
    include_year: bool = True

    num_year_periods: Optional[PositiveInt] = None  # TODO: document?

    na_strategy: Literal[NAStrategy.ZERO] = field(  # No need to show/modify.
        default=NAStrategy.ZERO,
        repr=False,
    )

    name: Literal['Datetime'] = field(default='Datetime', repr=False)

    _stats: Dict[ColStatType, Any] = field(default_factory=dict, repr=False)

    # Deprecated:
    _target_: Literal['kumo.encoder.temporal.Datetime'] = field(
        default='kumo.encoder.temporal.Datetime',
        repr=False,
    )

    @property
    def supported_stypes(self) -> Set[Stype]:
        return {Stype.timestamp}

    @property
    def required_stats(self) -> Set[ColStatType]:
        # Min/max are only requested when year features are included.
        return ({ColStatType.MIN, ColStatType.MAX}
                if self.include_year else set())

    def __repr__(self) -> str:
        # Only show arguments that diverge from the default; fields declared
        # with `repr=False` are never shown.
        shown = []
        for fld in fields(self):
            if not fld.repr:
                continue
            current = getattr(self, fld.name)
            if current != fld.default:
                shown.append(f'{fld.name}={current}')
        joined = ', '.join(shown)
        return f'{self.__class__.__name__}({joined})'
# Type alias covering every concrete encoder dataclass defined in this module.
EncoderType = Union[ Null, Numerical, MaxLogNumerical, MinLogNumerical, Index, Hash, MultiCategorical, GloVe, NumericalList, Datetime, ]