Skip to content

API: modeling

skyulf.modeling

Modeling module for Skyulf.

BaseModelApplier

Bases: ABC

Source code in skyulf-core/skyulf/modeling/base.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class BaseModelApplier(ABC):
    """Abstract interface for producing predictions from a fitted model artifact.

    Subclasses must implement ``predict``. ``predict_proba`` is optional: the
    default implementation returns ``None`` to signal "not supported".
    """

    @abstractmethod
    def predict(
        self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any
    ) -> Union[pd.Series, Any]:
        """
        Generate predictions for ``df`` using ``model_artifact``.

        Args:
            df: Input data to predict on.
            model_artifact: The fitted model object. Its concrete type is
                decided by the implementing subclass.

        Returns:
            The predictions; per the annotation a ``pd.Series`` or another
            implementation-specific container.
        """

    def predict_proba(
        self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any
    ) -> Optional[Union[pd.DataFrame, SkyulfDataFrame]]:
        """
        Generate prediction probabilities, if the model supports them.

        Args:
            df: Input data to predict on.
            model_artifact: The fitted model object.

        Returns:
            A DataFrame whose columns are the classes, or ``None`` when
            probabilities are not available. This base implementation always
            returns ``None``; subclasses override it to add support.
        """
        # Default: probabilities are not supported.
        return None

predict(df, model_artifact) abstractmethod

Generates predictions.

Source code in skyulf-core/skyulf/modeling/base.py
67
68
69
70
71
72
73
@abstractmethod
def predict(
    self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any
) -> Union[pd.Series, Any]:
    """
    Generate predictions for ``df`` using ``model_artifact``.

    Args:
        df: Input data to predict on.
        model_artifact: The fitted model object. Its concrete type is
            decided by the implementing subclass.

    Returns:
        The predictions; per the annotation a ``pd.Series`` or another
        implementation-specific container.
    """

predict_proba(df, model_artifact)

Generates prediction probabilities if supported. Returns DataFrame where columns are classes.

Source code in skyulf-core/skyulf/modeling/base.py
75
76
77
78
79
80
81
82
def predict_proba(
    self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any
) -> Optional[Union[pd.DataFrame, SkyulfDataFrame]]:
    """
    Generate prediction probabilities, if the model supports them.

    Args:
        df: Input data to predict on.
        model_artifact: The fitted model object.

    Returns:
        A DataFrame whose columns are the classes, or ``None`` when
        probabilities are not available. This default implementation always
        returns ``None``.
    """
    # Default: probabilities are not supported.
    return None

BaseModelCalculator

Bases: ABC

Source code in skyulf-core/skyulf/modeling/base.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
class BaseModelCalculator(ABC):
    """Abstract interface for objects that train a model.

    Implementations must provide ``problem_type`` and ``fit``. The remaining
    members (``default_params``, ``prepare_tuning_params`` and
    ``build_tuning_search_space``) are optional hooks with neutral defaults.
    """

    @property
    @abstractmethod
    def problem_type(self) -> str:
        """Return the kind of problem solved: 'classification' or 'regression'."""

    @property
    def default_params(self) -> Dict[str, Any]:
        """Default hyperparameters for the model (empty unless overridden)."""
        return {}

    def prepare_tuning_params(self, config: Dict[str, Any]) -> None:
        """Hook for structural models (e.g. ensembles) to absorb their
        sub-estimator selection before the tuner builds the base model.

        No-op for plain models. Ensembles override this to inject the resolved
        ``estimators`` (and ``final_estimator``) into ``default_params`` so
        the tuner can construct a valid meta-estimator.

        Args:
            config: The model configuration; unused by this base implementation.
        """
        return None

    def build_tuning_search_space(self, config: Dict[str, Any], strategy: str) -> Dict[str, Any]:
        """Hook: let a model auto-build its tuning search space.

        Returns an empty dict for plain models (the caller keeps the
        user-provided space). Ensembles override this to expand their base
        learners' parameter grids into nested ``<name>__<param>`` keys.

        Args:
            config: The model configuration; unused by this base implementation.
            strategy: The tuning strategy identifier; unused by this base
                implementation.

        Returns:
            The generated search space, or ``{}`` when the model builds none.
        """
        return {}

    @abstractmethod
    def fit(
        self,
        X: Union[pd.DataFrame, SkyulfDataFrame],
        y: Union[pd.Series, Any],
        config: Dict[str, Any],
        progress_callback: Optional[Callable[..., None]] = None,
        log_callback: Optional[Callable[[str], None]] = None,
        validation_data: Optional[
            tuple[Union[pd.DataFrame, SkyulfDataFrame], Union[pd.Series, Any]]
        ] = None,
    ) -> Any:
        """
        Train the model and return the (serializable) model object.

        Args:
            X: Training features.
            y: Training target.
            config: Model configuration, including hyperparameter overrides.
            progress_callback: Optional callable for progress reporting; its
                call signature is left to the implementation.
            log_callback: Optional callable that receives a log message string.
            validation_data: Optional ``(X, y)`` pair. Presumably a held-out
                validation set; how it is used is up to the implementation.

        Returns:
            The trained model object.
        """

default_params property

Default hyperparameters for the model.

problem_type abstractmethod property

Returns 'classification' or 'regression'.

build_tuning_search_space(config, strategy)

Hook: let a model auto-build its tuning search space.

Returns an empty dict for plain models (the caller keeps the user-provided space). Ensembles override this to expand their base learners' parameter grids into nested <name>__<param> keys.

Source code in skyulf-core/skyulf/modeling/base.py
40
41
42
43
44
45
46
47
def build_tuning_search_space(self, config: Dict[str, Any], strategy: str) -> Dict[str, Any]:
    """Hook: let a model auto-build its tuning search space.

    Returns an empty dict for plain models (the caller keeps the
    user-provided space). Ensembles override this to expand their base
    learners' parameter grids into nested ``<name>__<param>`` keys.

    Args:
        config: The model configuration; unused by this default implementation.
        strategy: The tuning strategy identifier; unused by this default
            implementation.

    Returns:
        The generated search space, or ``{}`` when the model builds none.
    """
    return {}

fit(X, y, config, progress_callback=None, log_callback=None, validation_data=None) abstractmethod

Trains the model. Returns the model object (serializable).

Source code in skyulf-core/skyulf/modeling/base.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
@abstractmethod
def fit(
    self,
    X: Union[pd.DataFrame, SkyulfDataFrame],
    y: Union[pd.Series, Any],
    config: Dict[str, Any],
    progress_callback: Optional[Callable[..., None]] = None,
    log_callback: Optional[Callable[[str], None]] = None,
    validation_data: Optional[
        tuple[Union[pd.DataFrame, SkyulfDataFrame], Union[pd.Series, Any]]
    ] = None,
) -> Any:
    """
    Train the model and return the (serializable) model object.

    Args:
        X: Training features.
        y: Training target.
        config: Model configuration, including hyperparameter overrides.
        progress_callback: Optional callable for progress reporting; its
            call signature is left to the implementation.
        log_callback: Optional callable that receives a log message string.
        validation_data: Optional ``(X, y)`` pair. Presumably a held-out
            validation set; how it is used is up to the implementation.

    Returns:
        The trained model object.
    """

prepare_tuning_params(config)

Hook for structural models (e.g. ensembles) to absorb their sub-estimator selection before the tuner builds the base model.

No-op for plain models. Ensembles override this to inject the resolved `estimators` (and `final_estimator`) into `default_params` so the tuner can construct a valid meta-estimator.

Source code in skyulf-core/skyulf/modeling/base.py
30
31
32
33
34
35
36
37
38
def prepare_tuning_params(self, config: Dict[str, Any]) -> None:
    """Hook for structural models (e.g. ensembles) to absorb their
    sub-estimator selection before the tuner builds the base model.

    No-op for plain models. Ensembles override this to inject the resolved
    ``estimators`` (and ``final_estimator``) into ``default_params`` so
    the tuner can construct a valid meta-estimator.

    Args:
        config: The model configuration; unused by this default implementation.
    """
    return None

BernoulliNBApplier

Bases: SklearnApplier

Bernoulli Naive Bayes Applier.

Source code in skyulf-core/skyulf/modeling/naive_bayes.py
56
57
class BernoulliNBApplier(SklearnApplier):
    """Applier for fitted Bernoulli Naive Bayes models.

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

BernoulliNBCalculator

Bases: SklearnCalculator

Bernoulli Naive Bayes Calculator.

Source code in skyulf-core/skyulf/modeling/naive_bayes.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
@NodeRegistry.register("bernoulli_nb", BernoulliNBApplier)
@node_meta(
    id="bernoulli_nb",
    name="Bernoulli Naive Bayes (binary / text)",
    category="Modeling",
    description=(
        "Naive Bayes classifier designed for binary/boolean features. "
        "Each feature is treated as a binary indicator of a token's presence. "
        "Also works with continuous features via a binarization threshold."
    ),
    params={"alpha": 1.0, "binarize": 0.0, "fit_prior": True},
    tags=["text", "nlp", "classification", "naive_bayes"],
)
class BernoulliNBCalculator(SklearnCalculator):
    """Calculator node that trains a ``BernoulliNB`` classifier.

    Registered under the ``"bernoulli_nb"`` key and paired with
    ``BernoulliNBApplier`` for inference.
    """

    def __init__(self):
        # Baseline hyperparameters handed to the SklearnCalculator base class.
        defaults = {"alpha": 1.0, "binarize": 0.0, "fit_prior": True}
        super().__init__(
            model_class=BernoulliNB,
            default_params=defaults,
            problem_type="classification",
        )

    @property
    def problem_type(self) -> str:
        """Return the problem type, which is always ``"classification"``."""
        return "classification"

CalibratedClassifierApplier

Bases: SklearnApplier

Calibrated Classifier Applier (well-calibrated predict_proba).

Source code in skyulf-core/skyulf/modeling/classification.py
86
87
class CalibratedClassifierApplier(SklearnApplier):
    """Applier for fitted calibrated classifiers (well-calibrated predict_proba).

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

CalibratedClassifierCalculator

Bases: SklearnCalculator

Calibrated Classifier Calculator with a selectable base estimator.

The frontend sends base_estimator as a string key (e.g. "random_forest"); it is resolved here into a fresh estimator instance before CalibratedClassifierCV is constructed. Defaults to logistic regression for backward compatibility.

Source code in skyulf-core/skyulf/modeling/classification.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
@NodeRegistry.register("calibrated_classifier", CalibratedClassifierApplier)
@node_meta(
    id="calibrated_classifier",
    name="Calibrated Classifier",
    category="Modeling",
    description=(
        "Wraps a base classifier with CalibratedClassifierCV so predicted "
        "probabilities are well-calibrated (Platt/sigmoid or isotonic)."
    ),
    params={"base_estimator": "logistic_regression", "method": "sigmoid", "cv": 5},
    tags=["requires_scaling"],
)
class CalibratedClassifierCalculator(SklearnCalculator):
    """Calculator for ``CalibratedClassifierCV`` with a selectable base estimator.

    The configuration carries ``base_estimator`` as a string key (e.g.
    ``"random_forest"``). Before the model is built, that key is swapped for a
    freshly constructed estimator stored under ``estimator``. When no key is
    given, the logistic-regression default from ``__init__`` applies.
    """

    # Selectable base estimators, keyed by the string the config carries.
    # Every factory returns a new instance; each estimator must support
    # ``predict_proba`` (or ``decision_function``) so calibration is meaningful.
    BASE_ESTIMATORS: Dict[str, Callable[[], BaseEstimator]] = {
        "logistic_regression": lambda: LogisticRegression(max_iter=1000),
        "random_forest": lambda: RandomForestClassifier(n_estimators=100, random_state=42),
        "gradient_boosting": lambda: GradientBoostingClassifier(random_state=42),
        "decision_tree": lambda: DecisionTreeClassifier(random_state=42),
        "gaussian_nb": lambda: GaussianNB(),
        "svc": lambda: SVC(probability=True, random_state=42),
    }

    def __init__(self):
        # Defaults used when the config does not select a base estimator.
        defaults = {
            "estimator": LogisticRegression(max_iter=1000),
            "method": "sigmoid",
            "cv": 5,
        }
        super().__init__(
            model_class=CalibratedClassifierCV,
            default_params=defaults,
            problem_type="classification",
        )

    def fit(
        self,
        X: Any,
        y: Any,
        config: Dict[str, Any],
        progress_callback: Optional[Callable[..., Any]] = None,
        log_callback: Optional[Callable[..., Any]] = None,
        validation_data: Any = None,
    ) -> Any:
        """Resolve the base estimator in ``config``, then delegate to the parent ``fit``."""
        resolved_config = self._resolve_base_estimator(config)
        return super().fit(
            X, y, resolved_config, progress_callback, log_callback, validation_data
        )

    @classmethod
    def _resolve_base_estimator(cls, config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Replace a ``base_estimator`` string key with an estimator instance.

        Handles both the flat config shape and the nested ``{"params": {...}}``
        shape. The input is never mutated; copies are edited and returned.
        An unknown key logs a warning and falls back to logistic regression.
        A non-string ``base_estimator`` value is removed without replacement.

        Args:
            config: The model configuration, possibly ``None`` or empty.

        Returns:
            A new config dict with ``base_estimator`` removed and, when a string
            key was supplied, ``estimator`` set. Empty/``None`` input yields ``{}``.
        """
        if not config:
            return {}

        outer = dict(config)
        has_nested_params = isinstance(outer.get("params"), dict)
        # Edit a copy of the nested dict when present, else the top-level copy.
        target = dict(outer["params"]) if has_nested_params else outer

        requested = target.pop("base_estimator", None)
        if isinstance(requested, str):
            try:
                build = cls.BASE_ESTIMATORS[requested]
            except KeyError:
                logger.warning(
                    "Unknown base_estimator '%s'; falling back to logistic_regression.",
                    requested,
                )
                build = cls.BASE_ESTIMATORS["logistic_regression"]
            target["estimator"] = build()

        if not has_nested_params:
            return target
        outer["params"] = target
        return outer

HyperparameterField dataclass

Describe a single tunable hyperparameter.

Source code in skyulf-core/skyulf/modeling/hyperparameters/_field.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
@dataclass
class HyperparameterField:
    """Metadata record describing one tunable hyperparameter."""

    name: str
    label: str
    # Kind of field: "number", "select" or "boolean".
    type: str
    default: Any
    description: str = ""
    min: Optional[float] = None
    max: Optional[float] = None
    step: Optional[float] = None
    # Choices for the "select" type, e.g. [{"label": "L1", "value": "l1"}].
    options: Optional[List[Dict[str, Any]]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the field as a plain dict keyed by attribute name."""
        as_mapping = asdict(self)
        return as_mapping

LogisticRegressionApplier

Bases: SklearnApplier

Logistic Regression Applier.

Source code in skyulf-core/skyulf/modeling/classification.py
57
58
class LogisticRegressionApplier(SklearnApplier):
    """Applier for fitted Logistic Regression models.

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

LogisticRegressionCalculator

Bases: SklearnCalculator

Logistic Regression Calculator.

Source code in skyulf-core/skyulf/modeling/classification.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
@NodeRegistry.register("logistic_regression", LogisticRegressionApplier)
@node_meta(
    id="logistic_regression",
    name="Logistic Regression",
    category="Modeling",
    description="Linear model for classification.",
    params={"max_iter": 1000, "solver": "lbfgs", "random_state": 42},
    tags=["requires_scaling"],
)
class LogisticRegressionCalculator(SklearnCalculator):
    """Calculator node that trains a ``LogisticRegression`` classifier.

    Registered under the ``"logistic_regression"`` key and paired with
    ``LogisticRegressionApplier`` for inference.
    """

    def __init__(self):
        # Baseline hyperparameters handed to the SklearnCalculator base class.
        defaults = {
            "max_iter": 1000,
            "solver": "lbfgs",
            "random_state": 42,
        }
        super().__init__(
            model_class=LogisticRegression,
            default_params=defaults,
            problem_type="classification",
        )

MultinomialNBApplier

Bases: SklearnApplier

Multinomial Naive Bayes Applier.

Source code in skyulf-core/skyulf/modeling/naive_bayes.py
21
22
class MultinomialNBApplier(SklearnApplier):
    """Applier for fitted Multinomial Naive Bayes models.

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

MultinomialNBCalculator

Bases: SklearnCalculator

Multinomial Naive Bayes Calculator.

Source code in skyulf-core/skyulf/modeling/naive_bayes.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
@NodeRegistry.register("multinomial_nb", MultinomialNBApplier)
@node_meta(
    id="multinomial_nb",
    name="Multinomial Naive Bayes (counts / text)",
    category="Modeling",
    description=(
        "Naive Bayes classifier for multinomially-distributed features "
        "(e.g. token counts or TF-IDF). "
        "Requires non-negative input features."
    ),
    params={"alpha": 1.0, "fit_prior": True},
    tags=["text", "nlp", "classification", "naive_bayes"],
)
class MultinomialNBCalculator(SklearnCalculator):
    """Calculator node that trains a ``MultinomialNB`` classifier.

    Registered under the ``"multinomial_nb"`` key and paired with
    ``MultinomialNBApplier`` for inference.
    """

    def __init__(self):
        # Baseline hyperparameters handed to the SklearnCalculator base class.
        defaults = {"alpha": 1.0, "fit_prior": True}
        super().__init__(
            model_class=MultinomialNB,
            default_params=defaults,
            problem_type="classification",
        )

    @property
    def problem_type(self) -> str:
        """Return the problem type, which is always ``"classification"``."""
        return "classification"

RandomForestClassifierApplier

Bases: SklearnApplier

Random Forest Classifier Applier.

Source code in skyulf-core/skyulf/modeling/classification.py
174
175
class RandomForestClassifierApplier(SklearnApplier):
    """Applier for fitted Random Forest classifiers.

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

RandomForestClassifierCalculator

Bases: SklearnCalculator

Random Forest Classifier Calculator.

Source code in skyulf-core/skyulf/modeling/classification.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
@NodeRegistry.register("random_forest_classifier", RandomForestClassifierApplier)
@node_meta(
    id="random_forest_classifier",
    name="Random Forest Classifier",
    category="Modeling",
    description="Ensemble of decision trees.",
    params={"n_estimators": 50, "max_depth": 10, "min_samples_split": 5},
)
class RandomForestClassifierCalculator(SklearnCalculator):
    """Calculator node that trains a ``RandomForestClassifier``.

    Registered under the ``"random_forest_classifier"`` key and paired with
    ``RandomForestClassifierApplier`` for inference.
    """

    def __init__(self):
        # Baseline hyperparameters handed to the SklearnCalculator base class.
        # Note they include keys beyond those listed in the node metadata.
        defaults = {
            "n_estimators": 50,
            "max_depth": 10,
            "min_samples_split": 5,
            "min_samples_leaf": 2,
            "n_jobs": -1,
            "random_state": 42,
        }
        super().__init__(
            model_class=RandomForestClassifier,
            default_params=defaults,
            problem_type="classification",
        )

RandomForestRegressorApplier

Bases: SklearnApplier

Random Forest Regressor Applier.

Source code in skyulf-core/skyulf/modeling/regression.py
107
108
class RandomForestRegressorApplier(SklearnApplier):
    """Applier for fitted Random Forest regressors.

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

RandomForestRegressorCalculator

Bases: SklearnCalculator

Random Forest Regressor Calculator.

Source code in skyulf-core/skyulf/modeling/regression.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
@NodeRegistry.register("random_forest_regressor", RandomForestRegressorApplier)
@node_meta(
    id="random_forest_regressor",
    name="Random Forest Regressor",
    category="Modeling",
    description="Ensemble of decision trees for regression.",
    params={"n_estimators": 50, "max_depth": 10, "min_samples_split": 5},
)
class RandomForestRegressorCalculator(SklearnCalculator):
    """Calculator node that trains a ``RandomForestRegressor``.

    Registered under the ``"random_forest_regressor"`` key and paired with
    ``RandomForestRegressorApplier`` for inference.
    """

    def __init__(self):
        # Baseline hyperparameters handed to the SklearnCalculator base class.
        # Note they include keys beyond those listed in the node metadata.
        defaults = {
            "n_estimators": 50,
            "max_depth": 10,
            "min_samples_split": 5,
            "min_samples_leaf": 2,
            "n_jobs": -1,
            "random_state": 42,
        }
        super().__init__(
            model_class=RandomForestRegressor,
            default_params=defaults,
            problem_type="regression",
        )

RidgeRegressionApplier

Bases: SklearnApplier

Ridge Regression Applier.

Source code in skyulf-core/skyulf/modeling/regression.py
78
79
class RidgeRegressionApplier(SklearnApplier):
    """Applier for fitted Ridge Regression models.

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

RidgeRegressionCalculator

Bases: SklearnCalculator

Ridge Regression Calculator.

Source code in skyulf-core/skyulf/modeling/regression.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
@NodeRegistry.register("ridge_regression", RidgeRegressionApplier)
@node_meta(
    id="ridge_regression",
    name="Ridge Regression",
    category="Modeling",
    description="Linear least squares with l2 regularization.",
    params={"alpha": 1.0, "solver": "auto", "random_state": 42},
    tags=["requires_scaling"],
)
class RidgeRegressionCalculator(SklearnCalculator):
    """Calculator node that trains a ``Ridge`` regressor.

    Registered under the ``"ridge_regression"`` key and paired with
    ``RidgeRegressionApplier`` for inference.
    """

    def __init__(self):
        # Baseline hyperparameters handed to the SklearnCalculator base class.
        defaults = {
            "alpha": 1.0,
            "solver": "auto",
            "random_state": 42,
        }
        super().__init__(
            model_class=Ridge,
            default_params=defaults,
            problem_type="regression",
        )

SGDClassifierApplier

Bases: SklearnApplier

Stochastic Gradient Descent Classifier Applier.

Source code in skyulf-core/skyulf/modeling/classification.py
553
554
class SGDClassifierApplier(SklearnApplier):
    """Applier for fitted Stochastic Gradient Descent classifiers.

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

SGDClassifierCalculator

Bases: SklearnCalculator

SGD Classifier Calculator.

Source code in skyulf-core/skyulf/modeling/classification.py
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
@NodeRegistry.register("sgd_classifier", SGDClassifierApplier)
@node_meta(
    id="sgd_classifier",
    name="SGD Classifier (text / linear)",
    category="Modeling",
    description=(
        "Linear classifiers (SVM, logistic regression, etc.) with SGD training. "
        "Highly efficient for high-dimensional sparse/dense text representations "
        "and large datasets."
    ),
    params={
        "loss": "log_loss",
        "penalty": "l2",
        "alpha": 0.0001,
        "l1_ratio": 0.15,
        "max_iter": 1000,
        "random_state": 42,
    },
    tags=["text", "nlp", "classification", "linear", "requires_scaling"],
)
class SGDClassifierCalculator(SklearnCalculator):
    """Calculator node that trains an ``SGDClassifier``.

    Registered under the ``"sgd_classifier"`` key and paired with
    ``SGDClassifierApplier`` for inference.
    """

    def __init__(self):
        # Baseline hyperparameters handed to the SklearnCalculator base class.
        defaults = {
            "loss": "log_loss",
            "penalty": "l2",
            "alpha": 0.0001,
            "l1_ratio": 0.15,
            "max_iter": 1000,
            "random_state": 42,
        }
        super().__init__(
            model_class=SGDClassifier,
            default_params=defaults,
            problem_type="classification",
        )

SklearnApplier

Bases: BaseModelApplier

Base applier for Scikit-Learn models.

Source code in skyulf-core/skyulf/modeling/sklearn_wrapper.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
class SklearnApplier(BaseModelApplier):
    """Applier that runs inference with a fitted Scikit-Learn style model.

    Input data is converted through ``SklearnBridge.to_sklearn`` before it is
    handed to the model, and results are wrapped in pandas containers.
    """

    def predict(self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any) -> Any:
        """Return ``model_artifact.predict`` results as a ``pd.Series``.

        The Series reuses ``df.index`` when the input exposes one. Inputs
        without an ``index`` attribute get the default pandas index.
        """
        features, _ = SklearnBridge.to_sklearn(df)
        raw_predictions = model_artifact.predict(features)

        # Keep the caller's row labels when available. Inputs lacking an
        # ``index`` attribute fall back to the default index, which is
        # acceptable for predictions.
        row_index = getattr(df, "index", None)
        return pd.Series(raw_predictions, index=row_index)

    def predict_proba(
        self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any
    ) -> Optional[Any]:
        """Return class probabilities as a ``pd.DataFrame``.

        Returns ``None`` when the model has no ``predict_proba`` method.
        Columns are taken from ``model_artifact.classes_`` when present,
        otherwise pandas assigns default column labels.
        """
        if not hasattr(model_artifact, "predict_proba"):
            return None

        features, _ = SklearnBridge.to_sklearn(df)
        probabilities = model_artifact.predict_proba(features)

        row_index = getattr(df, "index", None)
        class_labels = getattr(model_artifact, "classes_", None)
        return pd.DataFrame(probabilities, index=row_index, columns=class_labels)

SklearnCalculator

Bases: BaseModelCalculator

Base calculator for Scikit-Learn models.

Source code in skyulf-core/skyulf/modeling/sklearn_wrapper.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
class SklearnCalculator(BaseModelCalculator):
    """Generic trainer for estimators that follow the Scikit-Learn API.

    Holds an estimator class, its default hyperparameters and the problem
    type. ``fit`` merges config overrides, builds the estimator and trains it.
    """

    def __init__(
        self,
        model_class: Type[BaseEstimator],
        default_params: Dict[str, Any],
        problem_type: str,
    ):
        self._problem_type = problem_type
        self._default_params = default_params
        # `Any` because sklearn stubs make BaseEstimator subclasses appear non-callable.
        self.model_class: Any = model_class

    @property
    def default_params(self) -> Dict[str, Any]:
        """Default hyperparameters supplied at construction time."""
        return self._default_params

    @property
    def problem_type(self) -> str:
        """Problem type supplied at construction time."""
        return self._problem_type

    def fit(
        self,
        X: Union[pd.DataFrame, SkyulfDataFrame],
        y: Union[pd.Series, Any],
        config: Dict[str, Any],
        progress_callback=None,
        log_callback=None,
        validation_data=None,
    ) -> Any:
        """Build the estimator from defaults plus config overrides and train it.

        Two config shapes are accepted:

        * Nested (preferred): ``{"params": {...}}``. When the ``"params"`` key
          is present it is the only source of overrides, even if empty.
        * Flat (legacy): hyperparameters at the top level. Reserved
          bookkeeping keys and dict-valued entries are ignored.

        Args:
            X: Training features.
            y: Training target.
            config: Configuration holding hyperparameter overrides.
            progress_callback: Accepted for interface compatibility; unused here.
            log_callback: Optional callable given the initialization message.
            validation_data: Accepted for interface compatibility; unused here.

        Returns:
            The fitted estimator instance.
        """
        import inspect

        # 1. Start from a shallow copy of the defaults and layer overrides on top.
        params = self.default_params.copy()
        if config:
            if "params" in config:
                overrides = config["params"]
            else:
                non_param_keys = {
                    "type",
                    "target_column",
                    "node_id",
                    "step_type",
                    "inputs",
                }
                overrides = {
                    key: value
                    for key, value in config.items()
                    if not (key in non_param_keys or isinstance(value, dict))
                }
            if overrides:
                params.update(overrides)

        msg = f"Initializing {self.model_class.__name__} with params: {params}"
        logger.info(msg)
        if log_callback:
            log_callback(msg)

        # 2. Keep only arguments the constructor names explicitly. Constructors
        # that take **kwargs (e.g. XGBoost 2.x) receive everything, because the
        # name check would wrongly reject their valid parameters.
        ctor_params = inspect.signature(self.model_class).parameters
        takes_var_keyword = any(
            param.kind == inspect.Parameter.VAR_KEYWORD for param in ctor_params.values()
        )

        if takes_var_keyword:
            ctor_kwargs = params
        else:
            ctor_kwargs = {
                name: value for name, value in params.items() if name in ctor_params
            }
            rejected = set(params) - set(ctor_kwargs)
            if rejected:
                logger.warning(
                    f"Dropped parameters not supported by {self.model_class.__name__}: {rejected}"
                )

        model = self.model_class(**ctor_kwargs)

        # 3. Convert the inputs through the bridge (handles Polars/Pandas/Wrappers)
        # and train.
        features, target = SklearnBridge.to_sklearn((X, y))
        model.fit(features, target)
        return model

fit(X, y, config, progress_callback=None, log_callback=None, validation_data=None)

Fit the Scikit-Learn model.

Source code in skyulf-core/skyulf/modeling/sklearn_wrapper.py
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def fit(
    self,
    X: Union[pd.DataFrame, SkyulfDataFrame],
    y: Union[pd.Series, Any],
    config: Dict[str, Any],
    progress_callback=None,
    log_callback=None,
    validation_data=None,
) -> Any:
    """Build the estimator from defaults plus config overrides and train it.

    Two config shapes are accepted:

    * Nested (preferred): ``{"params": {...}}``. When the ``"params"`` key
      is present it is the only source of overrides, even if empty.
    * Flat (legacy): hyperparameters at the top level. Reserved
      bookkeeping keys and dict-valued entries are ignored.

    Args:
        X: Training features.
        y: Training target.
        config: Configuration holding hyperparameter overrides.
        progress_callback: Accepted for interface compatibility; unused here.
        log_callback: Optional callable given the initialization message.
        validation_data: Accepted for interface compatibility; unused here.

    Returns:
        The fitted estimator instance.
    """
    import inspect

    # 1. Start from a shallow copy of the defaults and layer overrides on top.
    params = self.default_params.copy()
    if config:
        if "params" in config:
            overrides = config["params"]
        else:
            non_param_keys = {
                "type",
                "target_column",
                "node_id",
                "step_type",
                "inputs",
            }
            overrides = {
                key: value
                for key, value in config.items()
                if not (key in non_param_keys or isinstance(value, dict))
            }
        if overrides:
            params.update(overrides)

    msg = f"Initializing {self.model_class.__name__} with params: {params}"
    logger.info(msg)
    if log_callback:
        log_callback(msg)

    # 2. Keep only arguments the constructor names explicitly. Constructors
    # that take **kwargs (e.g. XGBoost 2.x) receive everything, because the
    # name check would wrongly reject their valid parameters.
    ctor_params = inspect.signature(self.model_class).parameters
    takes_var_keyword = any(
        param.kind == inspect.Parameter.VAR_KEYWORD for param in ctor_params.values()
    )

    if takes_var_keyword:
        ctor_kwargs = params
    else:
        ctor_kwargs = {
            name: value for name, value in params.items() if name in ctor_params
        }
        rejected = set(params) - set(ctor_kwargs)
        if rejected:
            logger.warning(
                f"Dropped parameters not supported by {self.model_class.__name__}: {rejected}"
            )

    model = self.model_class(**ctor_kwargs)

    # 3. Convert the inputs through the bridge (handles Polars/Pandas/Wrappers)
    # and train.
    features, target = SklearnBridge.to_sklearn((X, y))
    model.fit(features, target)
    return model

StackingClassifierApplier

Bases: SklearnApplier

Stacking Classifier Applier (meta-learner over base classifiers).

Source code in skyulf-core/skyulf/modeling/ensemble.py
458
459
class StackingClassifierApplier(SklearnApplier):
    """Applier for fitted stacking classifiers (meta-learner over base classifiers).

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

StackingClassifierCalculator

Bases: _BaseEnsembleCalculator

Stacking Classifier Calculator with selectable base + final learners.

Source code in skyulf-core/skyulf/modeling/ensemble.py
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
@NodeRegistry.register("stacking_classifier", StackingClassifierApplier)
@node_meta(
    id="stacking_classifier",
    name="Stacking Classifier",
    category="Ensemble",
    description=(
        "Trains a final classifier on the out-of-fold predictions of several "
        "base classifiers. Uses internal CV folds to avoid leakage."
    ),
    params={
        "base_estimators": ["random_forest", "gradient_boosting", "svc"],
        "final_estimator": "logistic_regression",
        "cv": 5,
    },
    tags=["requires_scaling"],
)
class StackingClassifierCalculator(_BaseEnsembleCalculator):
    """Calculator for a ``StackingClassifier`` with selectable base and final learners.

    NOTE(review): the class-level constants below are presumably consumed by
    ``_BaseEnsembleCalculator``, which is not visible here; confirm their
    exact meaning there.
    """

    BASE_ESTIMATORS = BASE_ESTIMATORS_CLF
    DEFAULT_KEYS = ("random_forest", "gradient_boosting", "svc")
    DEFAULT_FINAL_KEY = "logistic_regression"
    MODEL_KEY = "stacking_classifier"
    IS_STACKING = True

    def __init__(self):
        # Only the fold count is defaulted here.
        defaults = {"cv": 5}
        super().__init__(
            model_class=StackingClassifier,
            default_params=defaults,
            problem_type="classification",
        )

StackingRegressorApplier

Bases: SklearnApplier

Stacking Regressor Applier (meta-learner over base regressors).

Source code in skyulf-core/skyulf/modeling/ensemble.py
530
531
class StackingRegressorApplier(SklearnApplier):
    """Applier for fitted stacking regressors (meta-learner over base regressors).

    Defines nothing of its own: all prediction behaviour is inherited
    unchanged from ``SklearnApplier``.
    """

StackingRegressorCalculator

Bases: _BaseEnsembleCalculator

Stacking Regressor Calculator with selectable base + final learners.

Source code in skyulf-core/skyulf/modeling/ensemble.py
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
@NodeRegistry.register("stacking_regressor", StackingRegressorApplier)
@node_meta(
    id="stacking_regressor",
    name="Stacking Regressor",
    category="Ensemble",
    description=(
        "Trains a final regressor on the out-of-fold predictions of several "
        "base regressors. Uses internal CV folds to avoid leakage."
    ),
    params={
        "base_estimators": ["random_forest", "gradient_boosting", "ridge"],
        "final_estimator": "ridge",
        "cv": 5,
    },
    tags=["requires_scaling"],
)
class StackingRegressorCalculator(_BaseEnsembleCalculator):
    """Calculator for a ``StackingRegressor`` with selectable base and final learners.

    NOTE(review): the class-level constants below are presumably consumed by
    ``_BaseEnsembleCalculator``, which is not visible here; confirm their
    exact meaning there.
    """

    BASE_ESTIMATORS = BASE_ESTIMATORS_REG
    DEFAULT_KEYS = ("random_forest", "gradient_boosting", "ridge")
    DEFAULT_FINAL_KEY = "ridge"
    MODEL_KEY = "stacking_regressor"
    IS_STACKING = True

    def __init__(self):
        # Only the fold count is defaulted here.
        defaults = {"cv": 5}
        super().__init__(
            model_class=StackingRegressor,
            default_params=defaults,
            problem_type="regression",
        )

StatefulEstimator

Source code in skyulf-core/skyulf/modeling/base.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
class StatefulEstimator:
    """Couples a calculator (training) with an applier (inference) for one node.

    The object returned by ``calculator.fit`` is stored on ``self.model`` by
    ``fit_predict`` / ``refit`` and is later handed to the applier for
    predictions and to the evaluation helpers in ``evaluate``.
    """

    def __init__(self, calculator: BaseModelCalculator, applier: BaseModelApplier, node_id: str):
        self.calculator = calculator
        self.applier = applier
        self.node_id = node_id
        self.model = None  # In-memory model storage; set by fit_predict() / refit()

    @staticmethod
    def _drop_target(frame: Any, target_column: str) -> Any:
        """Return ``frame`` without ``target_column``.

        Tries the pandas-style ``drop(columns=[...])`` first and falls back to
        the positional ``drop([...])`` form for frames whose ``drop`` rejects
        the ``columns=`` keyword (the original code labels this case "Polars").
        """
        try:
            return frame.drop(columns=[target_column])
        except TypeError:
            return frame.drop([target_column])

    @staticmethod
    def _is_empty_split(frame: Any) -> bool:
        """Tell whether a split's feature container holds no data.

        ``None`` counts as empty. Otherwise ``.empty`` (pandas) is preferred,
        then ``.is_empty()`` (Polars), then ``len(frame) == 0``.
        """
        if frame is None:
            return True
        if hasattr(frame, "empty"):
            return frame.empty
        if hasattr(frame, "is_empty"):
            return frame.is_empty()
        return len(frame) == 0

    def _features_for_prediction(self, split: Any, target_column: str) -> Any:
        """Return the feature part of ``split`` with the target column removed.

        ``split`` is either an ``(X, y)`` tuple or a single frame. For a tuple
        the target is only dropped from ``X`` when ``y`` is ``None`` (the
        target may then still be embedded in ``X``).
        """
        if isinstance(split, tuple):
            X, y = split
            if y is None and hasattr(X, "columns") and target_column in X.columns:
                return self._drop_target(X, target_column)
            return X

        if target_column in split.columns:
            return self._drop_target(split, target_column)
        return split

    def _extract_xy(self, data: Any, target_column: str) -> tuple[Any, Any]:
        """Split ``data`` into ``(X, y)``.

        Args:
            data: Either a 2-tuple ``(X, y)`` or a single frame that contains
                ``target_column``.
            target_column: Name of the target column.

        Returns:
            ``(X, y)``. A tuple input is returned as-is unless ``y`` is ``None``
            and ``X`` contains the target, in which case ``X`` is split.

        Raises:
            ValueError: If a frame lacks ``target_column`` or ``data`` is not a
                supported container.
        """
        if isinstance(data, tuple) and len(data) == 2:
            X, y = data[0], data[1]
            # If y is None but X is a DataFrame containing the target, extract it
            if y is None and hasattr(X, "columns") and target_column in X.columns:
                return self._extract_xy(X, target_column)
            return X, y

        engine = get_engine(data)

        if engine.name == EngineName.POLARS:
            if target_column not in data.columns:
                raise ValueError(f"Target column '{target_column}' not found in data")
            X = data.drop([target_column])
            y = data.select(target_column).to_series()
            return X, y

        # Pandas / generic DataFrame-like objects
        if hasattr(data, "columns"):
            if target_column not in data.columns:
                raise ValueError(f"Target column '{target_column}' not found in data")

            if hasattr(data, "drop"):
                try:
                    return data.drop(columns=[target_column]), data[target_column]
                except TypeError:
                    # drop() does not accept columns=; fall through to attribute access
                    pass

            # NOTE(review): this fallback returns ``data`` unchanged as X, i.e. the
            # target column is still part of the features - confirm this is intended.
            if hasattr(data, target_column):
                return data, getattr(data, target_column)

        raise ValueError(f"Unexpected data type: {type(data)}")

    def cross_validate(
        self,
        dataset: SplitDataset,
        target_column: str,
        config: Dict[str, Any],
        n_folds: int = 5,
        cv_type: str = "k_fold",
        shuffle: bool = True,
        random_state: int = 42,
        time_column: Optional[str] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> Dict[str, Any]:
        """
        Performs cross-validation on the training split.

        Only ``dataset.train`` is used; every other argument is forwarded
        unchanged to ``perform_cross_validation``, whose result is returned.
        """
        # Import here to avoid circular dependency if any
        from .cross_validation import perform_cross_validation

        X_train, y_train = self._extract_xy(dataset.train, target_column)

        return perform_cross_validation(
            calculator=self.calculator,
            applier=self.applier,
            X=X_train,
            y=y_train,
            config=config,
            n_folds=n_folds,
            cv_type=cv_type,
            shuffle=shuffle,
            random_state=random_state,
            time_column=time_column,
            progress_callback=progress_callback,
            log_callback=log_callback,
        )

    def fit_predict(
        self,
        dataset: Union[SplitDataset, pd.DataFrame, Tuple[pd.DataFrame, pd.Series]],
        target_column: str,
        config: Dict[str, Any],
        progress_callback: Optional[Callable[[int, int], None]] = None,
        log_callback: Optional[Callable[[str], None]] = None,
        job_id: str = "unknown",
    ) -> Dict[str, pd.Series]:
        """
        Fits the model on training data and returns predictions for all splits.

        Args:
            dataset: A ``SplitDataset``, a single pandas DataFrame (used as the
                training split with an empty test split), a
                ``(train_df, test_df)`` tuple, or an ``(X, y)`` tuple.
            target_column: Name of the target column.
            config: Passed through to ``calculator.fit``.
            progress_callback: Passed through to ``calculator.fit``.
            log_callback: Passed through to ``calculator.fit``; also receives
                the "no test set" warning.
            job_id: Accepted for interface compatibility; not used here.

        Returns:
            Dict with a ``"train"`` entry, plus ``"test"`` when the test split
            is non-empty and ``"validation"`` when a validation split exists.
        """
        # Handle raw DataFrame or Tuple input by wrapping it in a dummy SplitDataset
        if isinstance(dataset, pd.DataFrame):
            dataset = SplitDataset(train=dataset, test=pd.DataFrame(), validation=None)
        elif isinstance(dataset, tuple):
            # A tuple whose first element is a DataFrame containing the target is
            # read as (train_df, test_df); anything else is read as (X, y).
            elem0 = dataset[0]
            if isinstance(elem0, pd.DataFrame) and target_column in elem0.columns:
                train_df, test_df = dataset
                dataset = SplitDataset(train=train_df, test=test_df, validation=None)  # type: ignore
            else:
                msg = (
                    "WARNING: No test set provided. Using entire input as training data. "
                    "Ensure data was split BEFORE preprocessing to avoid data leakage."
                )
                logger.warning(msg)
                if log_callback:
                    log_callback(msg)

                dataset = SplitDataset(
                    train=cast(Any, dataset), test=pd.DataFrame(), validation=None
                )

        # 1. Prepare Data
        X_train, y_train = self._extract_xy(dataset.train, target_column)

        validation_data = None
        if dataset.validation is not None:
            X_val, y_val = self._extract_xy(dataset.validation, target_column)
            validation_data = (X_val, y_val)

        # 2. Train Model
        self.model = self.calculator.fit(
            X_train,
            y_train,
            config,
            progress_callback=progress_callback,
            log_callback=log_callback,
            validation_data=validation_data,
        )

        # 3. Predict on all splits
        predictions = {}
        predictions["train"] = self.applier.predict(X_train, self.model)

        # Test predictions are skipped when the test split is missing or empty
        test_frame = dataset.test[0] if isinstance(dataset.test, tuple) else dataset.test
        if not self._is_empty_split(test_frame):
            X_test = self._features_for_prediction(dataset.test, target_column)
            predictions["test"] = self.applier.predict(X_test, self.model)

        # Validation predictions use the same target-dropping rules as the test
        # split, including the positional drop() fallback.
        if dataset.validation is not None:
            X_val = self._features_for_prediction(dataset.validation, target_column)
            predictions["validation"] = self.applier.predict(X_val, self.model)

        return predictions

    def refit(
        self,
        dataset: SplitDataset,
        target_column: str,
        config: Dict[str, Any],
        job_id: str = "unknown",
    ) -> None:
        """
        Refits the model on Train + Validation data and updates the artifact.

        Without a validation split this simply delegates to ``fit_predict``.
        The result of ``calculator.fit`` replaces ``self.model``.
        """
        if dataset.validation is None:
            # Fallback to normal fit if no validation set
            self.fit_predict(dataset, target_column, config, job_id=job_id)
            return

        # 1. Prepare Combined Data
        X_train, y_train = self._extract_xy(dataset.train, target_column)
        X_val, y_val = self._extract_xy(dataset.validation, target_column)

        # NOTE(review): pd.concat only accepts pandas objects, so this path
        # presumably expects pandas splits - confirm for other engines.
        X_combined = pd.concat([X_train, X_val], axis=0)
        y_combined = pd.concat([y_train, y_val], axis=0)

        # 2. Train Model
        self.model = self.calculator.fit(X_combined, y_combined, config)

    def evaluate(  # noqa: C901
        self, dataset: SplitDataset, target_column: str, job_id: str = "unknown"
    ) -> Any:
        """
        Evaluates the model on all splits and returns a detailed report.

        Returns:
            Dict with ``"problem_type"``, ``"splits"`` (per-split result of the
            classification/regression evaluation helper, or ``None`` when a
            split could not be evaluated) and ``"raw_data"`` (job/node ids plus
            the raw ``y_true`` / ``y_pred`` / optional ``y_proba`` per split).

        Raises:
            ValueError: If no model has been trained yet, or the calculator
                reports an unknown problem type.
        """
        # Import here to avoid circular dependency
        from ._evaluation.classification import evaluate_classification_model
        from ._evaluation.regression import evaluate_regression_model

        if self.model is None:
            raise ValueError("Model has not been trained yet. Call fit_predict() first.")

        problem_type = self.calculator.problem_type

        splits_payload = {}

        # Container for raw predictions
        evaluation_data: Dict[str, Any] = {
            "job_id": job_id,
            "node_id": self.node_id,
            "problem_type": problem_type,
            "splits": {},
        }

        # Helper to evaluate a single split
        def evaluate_split(split_name: str, data: Any):
            if isinstance(data, tuple):
                X, y = data
                if y is None:
                    # The target may still be embedded in X; without it there
                    # is nothing to compare the predictions against.
                    if not hasattr(X, "columns") or target_column not in X.columns:
                        return None  # Cannot evaluate without target
                    y = X[target_column]
                    X = self._drop_target(X, target_column)
            elif isinstance(data, pd.DataFrame):
                if target_column not in data.columns:
                    return None  # Cannot evaluate without target
                X = data.drop(columns=[target_column])
                y = data[target_column]
            else:
                # Only tuples and pandas DataFrames are evaluated here
                return None

            y_pred = self.applier.predict(X, self.model)

            # Try to get probabilities for classification
            y_proba = None
            if problem_type == "classification":
                y_proba_df = self.applier.predict_proba(X, self.model)
                if y_proba_df is not None:
                    y_proba = {
                        "classes": y_proba_df.columns.tolist(),
                        "values": y_proba_df.values.tolist(),
                    }

            split_data = {
                "y_true": y.tolist() if hasattr(y, "tolist") else list(y),
                "y_pred": (y_pred.tolist() if hasattr(y_pred, "tolist") else list(y_pred)),
            }

            if y_proba is not None:
                split_data["y_proba"] = y_proba

            evaluation_data["splits"][split_name] = split_data

            # Unpack model if it's a tuple (from Tuner): the first element is
            # taken to be the model by convention.
            model_to_evaluate = self.model
            if isinstance(self.model, tuple) and len(self.model) == 2:
                model_to_evaluate = self.model[0]

            if problem_type == "classification":
                return evaluate_classification_model(
                    model=model_to_evaluate, dataset_name=split_name, X_test=X, y_test=y
                )
            elif problem_type == "regression":
                return evaluate_regression_model(
                    model=model_to_evaluate, dataset_name=split_name, X_test=X, y_test=y
                )
            else:
                raise ValueError(f"Unknown problem type: {problem_type}")

        # 2. Evaluate Train
        splits_payload["train"] = evaluate_split("train", dataset.train)

        # 3. Evaluate Test (only when it is a non-empty DataFrame or (X, y) tuple)
        has_test = False
        if isinstance(dataset.test, pd.DataFrame):
            has_test = not dataset.test.empty
        elif isinstance(dataset.test, tuple):
            has_test = len(dataset.test) == 2 and len(dataset.test[0]) > 0

        if has_test:
            splits_payload["test"] = evaluate_split("test", dataset.test)

        # 4. Evaluate Validation (same non-empty rules as the test split)
        if dataset.validation is not None:
            has_val = False
            if isinstance(dataset.validation, pd.DataFrame):
                has_val = not dataset.validation.empty
            elif isinstance(dataset.validation, tuple):
                has_val = len(dataset.validation) == 2 and len(dataset.validation[0]) > 0

            if has_val:
                splits_payload["validation"] = evaluate_split("validation", dataset.validation)

        # Return report object (simplified for now, assuming schema matches)
        return {
            "problem_type": problem_type,
            "splits": splits_payload,
            "raw_data": evaluation_data,
        }

cross_validate(dataset, target_column, config, n_folds=5, cv_type='k_fold', shuffle=True, random_state=42, time_column=None, progress_callback=None, log_callback=None)

Performs cross-validation on the training split.

Source code in skyulf-core/skyulf/modeling/base.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def cross_validate(
    self,
    dataset: SplitDataset,
    target_column: str,
    config: Dict[str, Any],
    n_folds: int = 5,
    cv_type: str = "k_fold",
    shuffle: bool = True,
    random_state: int = 42,
    time_column: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    log_callback: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """
    Runs cross-validation over the training split only.

    ``dataset.train`` is split into features and target; every other argument
    is forwarded unchanged to ``perform_cross_validation``, whose result is
    returned as-is.
    """
    # Deferred import: kept local to avoid a circular dependency, if any
    from .cross_validation import perform_cross_validation

    features, target = self._extract_xy(dataset.train, target_column)

    # Options passed straight through to the CV routine
    cv_options = {
        "config": config,
        "n_folds": n_folds,
        "cv_type": cv_type,
        "shuffle": shuffle,
        "random_state": random_state,
        "time_column": time_column,
        "progress_callback": progress_callback,
        "log_callback": log_callback,
    }
    return perform_cross_validation(
        calculator=self.calculator,
        applier=self.applier,
        X=features,
        y=target,
        **cv_options,
    )

evaluate(dataset, target_column, job_id='unknown')

Evaluates the model on all splits and returns a detailed report.

Source code in skyulf-core/skyulf/modeling/base.py
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
def evaluate(  # noqa: C901
    self, dataset: SplitDataset, target_column: str, job_id: str = "unknown"
) -> Any:
    """
    Evaluates the model on all splits and returns a detailed report.

    Returns:
        Dict with ``"problem_type"``, ``"splits"`` (per-split result of the
        classification/regression evaluation helper, or ``None`` when a split
        could not be evaluated) and ``"raw_data"`` (job/node ids plus the raw
        ``y_true`` / ``y_pred`` / optional ``y_proba`` per split).

    Raises:
        ValueError: If no model has been trained yet, or the calculator
            reports an unknown problem type.
    """
    # Import here to avoid circular dependency
    from ._evaluation.classification import evaluate_classification_model
    from ._evaluation.regression import evaluate_regression_model

    if self.model is None:
        raise ValueError("Model has not been trained yet. Call fit_predict() first.")

    problem_type = self.calculator.problem_type

    splits_payload = {}

    # Container for raw predictions
    evaluation_data: Dict[str, Any] = {
        "job_id": job_id,
        "node_id": self.node_id,
        "problem_type": problem_type,
        "splits": {},
    }

    # Helper to evaluate a single split
    def evaluate_split(split_name: str, data: Any):
        if isinstance(data, tuple):
            X, y = data
            if y is None:
                # The target may still be embedded in X; without it there is
                # nothing to compare the predictions against.
                if not hasattr(X, "columns") or target_column not in X.columns:
                    return None  # Cannot evaluate without target
                y = X[target_column]
                try:
                    X = X.drop(columns=[target_column])
                except TypeError:
                    # drop() without a columns= keyword (positional form)
                    X = X.drop([target_column])
        elif isinstance(data, pd.DataFrame):
            if target_column not in data.columns:
                return None  # Cannot evaluate without target
            X = data.drop(columns=[target_column])
            y = data[target_column]
        else:
            # Only tuples and pandas DataFrames are evaluated here
            return None

        y_pred = self.applier.predict(X, self.model)

        # Try to get probabilities for classification
        y_proba = None
        if problem_type == "classification":
            y_proba_df = self.applier.predict_proba(X, self.model)
            if y_proba_df is not None:
                y_proba = {
                    "classes": y_proba_df.columns.tolist(),
                    "values": y_proba_df.values.tolist(),
                }

        split_data = {
            "y_true": y.tolist() if hasattr(y, "tolist") else list(y),
            "y_pred": (y_pred.tolist() if hasattr(y_pred, "tolist") else list(y_pred)),
        }

        if y_proba is not None:
            split_data["y_proba"] = y_proba

        evaluation_data["splits"][split_name] = split_data

        # Unpack model if it's a tuple (from Tuner): the first element is taken
        # to be the model by convention.
        model_to_evaluate = self.model
        if isinstance(self.model, tuple) and len(self.model) == 2:
            model_to_evaluate = self.model[0]

        if problem_type == "classification":
            return evaluate_classification_model(
                model=model_to_evaluate, dataset_name=split_name, X_test=X, y_test=y
            )
        elif problem_type == "regression":
            return evaluate_regression_model(
                model=model_to_evaluate, dataset_name=split_name, X_test=X, y_test=y
            )
        else:
            raise ValueError(f"Unknown problem type: {problem_type}")

    # 2. Evaluate Train
    splits_payload["train"] = evaluate_split("train", dataset.train)

    # 3. Evaluate Test (only when it is a non-empty DataFrame or (X, y) tuple)
    has_test = False
    if isinstance(dataset.test, pd.DataFrame):
        has_test = not dataset.test.empty
    elif isinstance(dataset.test, tuple):
        has_test = len(dataset.test) == 2 and len(dataset.test[0]) > 0

    if has_test:
        splits_payload["test"] = evaluate_split("test", dataset.test)

    # 4. Evaluate Validation (same non-empty rules as the test split)
    if dataset.validation is not None:
        has_val = False
        if isinstance(dataset.validation, pd.DataFrame):
            has_val = not dataset.validation.empty
        elif isinstance(dataset.validation, tuple):
            has_val = len(dataset.validation) == 2 and len(dataset.validation[0]) > 0

        if has_val:
            splits_payload["validation"] = evaluate_split("validation", dataset.validation)

    # Return report object (simplified for now, assuming schema matches)
    return {
        "problem_type": problem_type,
        "splits": splits_payload,
        "raw_data": evaluation_data,
    }

fit_predict(dataset, target_column, config, progress_callback=None, log_callback=None, job_id='unknown')

Fits the model on training data and returns predictions for all splits.

Source code in skyulf-core/skyulf/modeling/base.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
def fit_predict(
    self,
    dataset: Union[SplitDataset, pd.DataFrame, Tuple[pd.DataFrame, pd.Series]],
    target_column: str,
    config: Dict[str, Any],
    progress_callback: Optional[Callable[[int, int], None]] = None,
    log_callback: Optional[Callable[[str], None]] = None,
    job_id: str = "unknown",
) -> Dict[str, pd.Series]:
    """
    Fits the model on training data and returns predictions for all splits.

    Args:
        dataset: A ``SplitDataset``, a single pandas DataFrame (used as the
            training split with an empty test split), a ``(train_df, test_df)``
            tuple, or an ``(X, y)`` tuple.
        target_column: Name of the target column.
        config: Passed through to ``calculator.fit``.
        progress_callback: Passed through to ``calculator.fit``.
        log_callback: Passed through to ``calculator.fit``; also receives the
            "no test set" warning.
        job_id: Accepted for interface compatibility; not used here.

    Returns:
        Dict with a ``"train"`` entry, plus ``"test"`` when the test split is
        non-empty and ``"validation"`` when a validation split exists.
    """

    def _drop_target(frame: Any) -> Any:
        """Drop the target column, pandas-style first, positional form second."""
        try:
            return frame.drop(columns=[target_column])
        except TypeError:
            # drop() without a columns= keyword (labelled "Polars" originally)
            return frame.drop([target_column])

    def _features_only(split: Any) -> Any:
        """Return the features of an (X, y) tuple or frame without the target."""
        if isinstance(split, tuple):
            X, y = split
            # If y is None, the target may still be in X - drop it
            if y is None and hasattr(X, "columns") and target_column in X.columns:
                return _drop_target(X)
            return X
        if target_column in split.columns:
            return _drop_target(split)
        return split

    # Handle raw DataFrame or Tuple input by wrapping it in a dummy SplitDataset
    if isinstance(dataset, pd.DataFrame):
        dataset = SplitDataset(train=dataset, test=pd.DataFrame(), validation=None)
    elif isinstance(dataset, tuple):
        # A tuple whose first element is a DataFrame containing the target is
        # read as (train_df, test_df); anything else is read as (X, y).
        elem0 = dataset[0]
        if isinstance(elem0, pd.DataFrame) and target_column in elem0.columns:
            train_df, test_df = dataset
            dataset = SplitDataset(train=train_df, test=test_df, validation=None)  # type: ignore
        else:
            msg = (
                "WARNING: No test set provided. Using entire input as training data. "
                "Ensure data was split BEFORE preprocessing to avoid data leakage."
            )
            logger.warning(msg)
            if log_callback:
                log_callback(msg)

            dataset = SplitDataset(
                train=cast(Any, dataset), test=pd.DataFrame(), validation=None
            )

    # 1. Prepare Data
    X_train, y_train = self._extract_xy(dataset.train, target_column)

    validation_data = None
    if dataset.validation is not None:
        X_val, y_val = self._extract_xy(dataset.validation, target_column)
        validation_data = (X_val, y_val)

    # 2. Train Model
    self.model = self.calculator.fit(
        X_train,
        y_train,
        config,
        progress_callback=progress_callback,
        log_callback=log_callback,
        validation_data=validation_data,
    )

    # 3. Predict on all splits
    predictions = {}
    predictions["train"] = self.applier.predict(X_train, self.model)

    # Test predictions are skipped when the test split is missing or empty
    test_frame = dataset.test[0] if isinstance(dataset.test, tuple) else dataset.test
    if test_frame is None:
        is_test_empty = True
    elif hasattr(test_frame, "empty"):
        is_test_empty = test_frame.empty
    elif hasattr(test_frame, "is_empty"):
        # Polars
        is_test_empty = test_frame.is_empty()
    else:
        is_test_empty = len(test_frame) == 0

    if not is_test_empty:
        predictions["test"] = self.applier.predict(_features_only(dataset.test), self.model)

    # Validation predictions use the same target-dropping rules as the test
    # split, including the positional drop() fallback.
    if dataset.validation is not None:
        predictions["validation"] = self.applier.predict(
            _features_only(dataset.validation), self.model
        )

    return predictions

refit(dataset, target_column, config, job_id='unknown')

Refits the model on Train + Validation data and updates the artifact.

Source code in skyulf-core/skyulf/modeling/base.py
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
def refit(
    self,
    dataset: SplitDataset,
    target_column: str,
    config: Dict[str, Any],
    job_id: str = "unknown",
) -> None:
    """
    Retrains on Train + Validation and stores the new model on ``self.model``.

    When the dataset has no validation split the call is delegated to
    ``fit_predict`` instead. Nothing is returned.
    """
    if dataset.validation is not None:
        # Split both parts into features / target ...
        train_features, train_target = self._extract_xy(dataset.train, target_column)
        val_features, val_target = self._extract_xy(dataset.validation, target_column)

        # ... stack them row-wise (pd.concat, so pandas objects are expected) ...
        all_features = pd.concat([train_features, val_features], axis=0)
        all_target = pd.concat([train_target, val_target], axis=0)

        # ... and train on the combined data.
        self.model = self.calculator.fit(all_features, all_target, config)
        return

    # No validation split: a normal fit is all that can be done
    self.fit_predict(dataset, target_column, config, job_id=job_id)

VotingClassifierApplier

Bases: SklearnApplier

Voting Classifier Applier (hard/soft vote over base classifiers).

Source code in skyulf-core/skyulf/modeling/ensemble.py
422
423
class VotingClassifierApplier(SklearnApplier):
    """Voting Classifier Applier (hard/soft vote over base classifiers).

    Defines no members of its own; all behaviour is inherited from
    ``SklearnApplier``. It is the applier registered for the
    ``"voting_classifier"`` node.
    """

VotingClassifierCalculator

Bases: _BaseEnsembleCalculator

Voting Classifier Calculator with selectable base learners.

Source code in skyulf-core/skyulf/modeling/ensemble.py
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
@NodeRegistry.register("voting_classifier", VotingClassifierApplier)
@node_meta(
    id="voting_classifier",
    name="Voting Classifier",
    category="Ensemble",
    description=(
        "Combines several classifiers by majority vote (hard) or averaged "
        "probabilities (soft). Fits each base model once; no internal CV."
    ),
    params={
        "base_estimators": ["random_forest", "logistic_regression", "gradient_boosting"],
        "voting": "soft",
    },
    tags=["requires_scaling"],
)
class VotingClassifierCalculator(_BaseEnsembleCalculator):
    """Calculator for the ``voting_classifier`` node.

    Configures ``_BaseEnsembleCalculator`` with the classification estimator
    registry, the default base learner keys and scikit-learn's
    ``VotingClassifier`` as the model class.
    """

    # Node identity and ensemble flavour
    MODEL_KEY = "voting_classifier"
    HAS_VOTING = True

    # Learner registry plus the keys used when the config names none
    BASE_ESTIMATORS = BASE_ESTIMATORS_CLF
    DEFAULT_KEYS = ("random_forest", "logistic_regression", "gradient_boosting")

    def __init__(self) -> None:
        """Initialise the base calculator for a soft-voting classification model."""
        super().__init__(
            problem_type="classification",
            default_params={"voting": "soft"},
            model_class=VotingClassifier,
        )

VotingRegressorApplier

Bases: SklearnApplier

Voting Regressor Applier (averaged predictions over base regressors).

Source code in skyulf-core/skyulf/modeling/ensemble.py
496
497
class VotingRegressorApplier(SklearnApplier):
    """Voting Regressor Applier (averaged predictions over base regressors).

    Defines no members of its own; all behaviour is inherited from
    ``SklearnApplier``. It is the applier registered for the
    ``"voting_regressor"`` node.
    """

VotingRegressorCalculator

Bases: _BaseEnsembleCalculator

Voting Regressor Calculator with selectable base learners.

Source code in skyulf-core/skyulf/modeling/ensemble.py
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
@NodeRegistry.register("voting_regressor", VotingRegressorApplier)
@node_meta(
    id="voting_regressor",
    name="Voting Regressor",
    category="Ensemble",
    description=(
        "Averages the predictions of several regressors (optionally weighted). "
        "Fits each base model once; no internal CV."
    ),
    params={
        "base_estimators": ["linear_regression", "random_forest", "gradient_boosting"],
    },
    tags=["requires_scaling"],
)
class VotingRegressorCalculator(_BaseEnsembleCalculator):
    """Calculator for the ``voting_regressor`` node.

    Configures ``_BaseEnsembleCalculator`` with the regression estimator
    registry, the default base learner keys and scikit-learn's
    ``VotingRegressor`` as the model class.
    """

    # Node identity
    MODEL_KEY = "voting_regressor"

    # Learner registry plus the keys used when the config names none
    BASE_ESTIMATORS = BASE_ESTIMATORS_REG
    DEFAULT_KEYS = ("linear_regression", "random_forest", "gradient_boosting")

    def __init__(self) -> None:
        """Initialise the base calculator for a regression voting model (no default params)."""
        super().__init__(
            problem_type="regression",
            default_params={},
            model_class=VotingRegressor,
        )

get_default_search_space(model_key, strategy='random')

Return the default search space for model_key.

For grid-based strategies (grid / halving_grid) the trimmed GRID_SEARCH_SPACES dict is used so the cartesian product stays manageable. All other strategies (random, halving_random, optuna) use the richer DEFAULT_SEARCH_SPACES.

Source code in skyulf-core/skyulf/modeling/hyperparameters/_registry.py
472
473
474
475
476
477
478
479
480
481
482
def get_default_search_space(model_key: str, strategy: str = "random") -> Dict[str, Any]:
    """Look up the default hyperparameter search space for *model_key*.

    Strategies listed in ``_GRID_STRATEGIES`` prefer the entry in
    ``GRID_SEARCH_SPACES`` and fall back to ``DEFAULT_SEARCH_SPACES`` when the
    model has no grid-specific entry. Every other strategy reads
    ``DEFAULT_SEARCH_SPACES`` directly.

    Args:
        model_key: Registry key of the model.
        strategy: Name of the search strategy.

    Returns:
        The registered search space, or an empty dict when *model_key* is
        unknown to the consulted registries.
    """
    grid_based = strategy in _GRID_STRATEGIES
    rich_space = DEFAULT_SEARCH_SPACES.get(model_key, {})
    return GRID_SEARCH_SPACES.get(model_key, rich_space) if grid_based else rich_space

perform_cross_validation(calculator, applier, X, y, config, n_folds=5, cv_type='k_fold', shuffle=True, random_state=42, time_column=None, progress_callback=None, log_callback=None)

Performs cross-validation using the strategy selected by cv_type (K-Fold by default).

Parameters:

Name Type Description Default
calculator BaseModelCalculator

The model calculator (fit logic).

required
applier BaseModelApplier

The model applier (predict logic).

required
X Union[DataFrame, SkyulfDataFrame]

Features.

required
y Union[Series, Any]

Target.

required
config Dict[str, Any]

Model configuration.

required
n_folds int

Number of folds.

5
cv_type str

Type of CV: k_fold, stratified_k_fold, time_series_split, shuffle_split, or nested_cv.

'k_fold'
shuffle bool

Whether to shuffle data before splitting (for KFold/Stratified).

True
random_state int

Random seed for shuffling.

42
time_column Optional[str]

Optional column name for sorting when using time_series_split.

None
progress_callback Optional[Callable[[int, int], None]]

Optional callback(current_fold, total_folds).

None
log_callback Optional[Callable[[str], None]]

Optional callback for logging messages.

None

Returns:

Type Description
Dict[str, Any]

Dict containing aggregated metrics and per-fold details.

Source code in skyulf-core/skyulf/modeling/cross_validation.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
def perform_cross_validation(
    calculator: BaseModelCalculator,
    applier: BaseModelApplier,
    X: Union[pd.DataFrame, SkyulfDataFrame],
    y: Union[pd.Series, Any],
    config: Dict[str, Any],
    n_folds: int = 5,
    cv_type: str = "k_fold",  # k_fold, stratified_k_fold, time_series_split, shuffle_split, nested_cv
    shuffle: bool = True,
    random_state: int = 42,
    time_column: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    log_callback: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """
    Cross-validate a model and collect per-fold and aggregated metrics.

    The splitter is chosen from ``cv_type``: ``time_series_split`` and
    ``shuffle_split`` (fixed 20% test size) map to their scikit-learn
    splitters, ``stratified_k_fold`` is honoured only for classification
    calculators, and anything else uses plain K-Fold. ``nested_cv`` is
    delegated entirely to ``_perform_nested_cv``.

    Args:
        calculator: The model calculator (fit logic); its ``problem_type``
            selects the metric function.
        applier: The model applier (predict logic). Only forwarded to the
            nested-CV routine; the other strategies here do not use it.
        X: Features.
        y: Target.
        config: Model configuration passed to ``calculator.fit``.
        n_folds: Number of folds / splits.
        cv_type: Type of CV (see the list beside the parameter).
        shuffle: Whether to shuffle before splitting (K-Fold / stratified
            K-Fold only).
        random_state: Seed given to the splitter; ignored by the K-Fold
            variants when ``shuffle`` is false.
        time_column: Optional column name handed to the time-ordering step
            when using ``time_series_split`` on a pandas frame.
        progress_callback: Optional callback(current_fold, total_folds).
        log_callback: Optional callback for logging messages.

    Returns:
        Dict with ``aggregated_metrics``, the per-fold ``folds`` list and the
        ``cv_config`` that was used (or whatever the nested-CV routine
        returns for ``nested_cv``).
    """
    import logging

    logger = logging.getLogger(__name__)
    problem_type = calculator.problem_type

    if log_callback:
        log_callback(f"Starting Cross-Validation (Folds: {n_folds}, Type: {cv_type})")

    time_series_requested = cv_type == "time_series_split"

    # Chronological ordering is only applied to pandas frames.
    if time_series_requested and isinstance(X, pd.DataFrame):
        X, y = _sort_by_time(X, y, time_column, log_callback, logger)

    # Nested CV has its own routine; hand everything over and stop here.
    if cv_type == "nested_cv":
        return _perform_nested_cv(
            calculator=calculator,
            applier=applier,
            X=X,
            y=y,
            config=config,
            n_folds=n_folds,
            shuffle=shuffle,
            random_state=random_state,
            progress_callback=progress_callback,
            log_callback=log_callback,
        )

    is_classification = problem_type == "classification"

    # --- Choose the splitter -------------------------------------------
    if time_series_requested:
        splitter = TimeSeriesSplit(n_splits=n_folds)
    elif cv_type == "shuffle_split":
        splitter = ShuffleSplit(n_splits=n_folds, test_size=0.2, random_state=random_state)
    else:
        # Stratification only makes sense for classification; every other
        # combination falls back to plain K-Fold.
        stratified = cv_type == "stratified_k_fold" and is_classification
        splitter_cls = StratifiedKFold if stratified else KFold
        splitter = splitter_cls(
            n_splits=n_folds,
            shuffle=shuffle,
            random_state=random_state if shuffle else None,
        )

    def _rows(data, positions):
        # Positional selection that keeps the container type: pandas objects
        # go through .iloc, anything else (Polars, numpy) is indexed directly.
        return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

    if is_classification:
        evaluate = calculate_classification_metrics
        key_metric = "accuracy"
    else:
        evaluate = calculate_regression_metrics
        key_metric = "r2"

    fold_results = []
    fold_metrics = []

    # Index generation runs on the bridge's sklearn-compatible view, while the
    # original X / y are what actually gets sliced and passed to the calculator.
    features_np, target_np = SklearnBridge.to_sklearn((X, y))

    # --- Run the folds --------------------------------------------------
    for fold_idx, (train_rows, val_rows) in enumerate(splitter.split(features_np, target_np)):
        if progress_callback:
            progress_callback(fold_idx + 1, n_folds)

        if log_callback:
            log_callback(f"Processing Fold {fold_idx + 1}/{n_folds}...")

        X_fit = _rows(X, train_rows)
        X_holdout = _rows(X, val_rows)
        y_fit = _rows(y, train_rows)
        y_holdout = _rows(y, val_rows)

        fitted = calculator.fit(X_fit, y_fit, config)
        metrics = evaluate(fitted, X_holdout, y_holdout)

        if log_callback:
            # Report one headline metric per fold.
            score = metrics.get(key_metric, 0.0)
            log_callback(f"Fold {fold_idx + 1} completed. {key_metric}: {score:.4f}")

        cleaned = sanitize_metrics(metrics)
        # Predictions are deliberately not stored per fold to keep the result light.
        fold_results.append({"fold": fold_idx + 1, "metrics": cleaned})
        fold_metrics.append(cast(Dict[str, float], cleaned))

    # --- Aggregate across folds ----------------------------------------
    aggregated = _aggregate_metrics(fold_metrics)

    if log_callback:
        log_callback(f"Cross-Validation Completed. Aggregated Metrics: {aggregated}")

    cv_config = {
        "n_folds": n_folds,
        "cv_type": cv_type,
        "shuffle": shuffle,
        "random_state": random_state,
    }
    return {
        "aggregated_metrics": aggregated,
        "folds": fold_results,
        "cv_config": cv_config,
    }