API: modeling

skyulf.modeling

Modeling module for Skyulf.

BaseModelApplier

Bases: ABC

Source code in skyulf-core\skyulf\modeling\base.py
class BaseModelApplier(ABC):
    @abstractmethod
    def predict(self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any) -> Union[pd.Series, Any]:
        """
        Generates predictions.
        """
        pass

    def predict_proba(
        self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any
    ) -> Optional[Union[pd.DataFrame, SkyulfDataFrame]]:
        """
        Generates prediction probabilities if supported.
        Returns DataFrame where columns are classes.
        """
        return None
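
A subclass only has to implement predict; predict_proba is optional and defaults to returning None. Below is a minimal sketch of a custom applier, assuming the import path skyulf.modeling.base (mirroring the file path above); the class itself is hypothetical:

from typing import Any, Union

import pandas as pd

from skyulf.modeling.base import BaseModelApplier


class ConstantApplier(BaseModelApplier):
    """Illustrative applier whose model artifact is just a constant value."""

    def predict(self, df: Union[pd.DataFrame, Any], model_artifact: Any) -> pd.Series:
        # The "artifact" here is the constant produced at fit time.
        index = df.index if hasattr(df, "index") else None
        return pd.Series([model_artifact] * len(df), index=index)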

predict(df, model_artifact) abstractmethod

Generates predictions.


predict_proba(df, model_artifact)

Generates prediction probabilities if supported. Returns DataFrame where columns are classes.


BaseModelCalculator

Bases: ABC

Source code in skyulf-core\skyulf\modeling\base.py
class BaseModelCalculator(ABC):
    @property
    @abstractmethod
    def problem_type(self) -> str:
        """Returns 'classification' or 'regression'."""
        pass

    @property
    def default_params(self) -> Dict[str, Any]:
        """Default hyperparameters for the model."""
        return {}

    @abstractmethod
    def fit(
        self,
        X: Union[pd.DataFrame, SkyulfDataFrame],
        y: Union[pd.Series, Any],
        config: Dict[str, Any],
        progress_callback: Optional[Callable[..., None]] = None,
        log_callback: Optional[Callable[[str], None]] = None,
        validation_data: Optional[tuple[Union[pd.DataFrame, SkyulfDataFrame], Union[pd.Series, Any]]] = None,
    ) -> Any:
        """
        Trains the model. Returns the model object (serializable).
        """
        pass
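
A concrete calculator implements problem_type and fit, and may override default_params. A minimal sketch under the same import-path assumption; the baseline model is hypothetical:

from typing import Any, Dict

from skyulf.modeling.base import BaseModelCalculator


class MeanRegressorCalculator(BaseModelCalculator):
    """Illustrative calculator whose artifact is the mean of the training target."""

    @property
    def problem_type(self) -> str:
        return "regression"

    def fit(
        self,
        X,
        y,
        config: Dict[str, Any],
        progress_callback=None,
        log_callback=None,
        validation_data=None,
    ) -> Any:
        # The returned artifact only needs to be serializable; a float suffices.
        return float(sum(y) / len(y))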

default_params property

Default hyperparameters for the model.

problem_type abstractmethod property

Returns 'classification' or 'regression'.

fit(X, y, config, progress_callback=None, log_callback=None, validation_data=None) abstractmethod

Trains the model. Returns the model object (serializable).


HyperparameterField dataclass

Describe a single tunable hyperparameter.

Source code in skyulf-core\skyulf\modeling\hyperparameters.py
@dataclass
class HyperparameterField:
    """Describe a single tunable hyperparameter."""

    name: str
    label: str
    type: str  # "number", "select", "boolean"
    default: Any
    description: str = ""
    min: Optional[float] = None
    max: Optional[float] = None
    step: Optional[float] = None
    options: Optional[List[Dict[str, Any]]] = (
        None  # For 'select' type: [{"label": "L1", "value": "l1"}]
    )

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)
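
For example, a field describing an L2 regularization strength could be declared as follows (the values are illustrative, and the import path skyulf.modeling.hyperparameters is assumed from the file path above):

from skyulf.modeling.hyperparameters import HyperparameterField

alpha_field = HyperparameterField(
    name="alpha",
    label="Alpha",
    type="number",
    default=1.0,
    description="L2 regularization strength.",
    min=0.0,
    step=0.1,
)

schema = alpha_field.to_dict()  # plain dict, e.g. for serializing to a UI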

LogisticRegressionApplier

Bases: SklearnApplier

Logistic Regression Applier.

Source code in skyulf-core\skyulf\modeling\classification.py
class LogisticRegressionApplier(SklearnApplier):
    """Logistic Regression Applier."""

    pass

LogisticRegressionCalculator

Bases: SklearnCalculator

Logistic Regression Calculator.

Source code in skyulf-core\skyulf\modeling\classification.py
@NodeRegistry.register("logistic_regression", LogisticRegressionApplier)
class LogisticRegressionCalculator(SklearnCalculator):
    """Logistic Regression Calculator."""

    def __init__(self):
        super().__init__(
            model_class=LogisticRegression,
            default_params={
                "max_iter": 1000,
                "solver": "lbfgs",
                "random_state": 42,
            },
            problem_type="classification",
        )
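
A minimal sketch of the calculator/applier split in use, assuming the import path skyulf.modeling.classification; the toy data is illustrative:

import pandas as pd

from skyulf.modeling.classification import (
    LogisticRegressionApplier,
    LogisticRegressionCalculator,
)

X = pd.DataFrame({"x1": [0.0, 1.0, 2.0, 3.0], "x2": [1.0, 0.0, 1.0, 0.0]})
y = pd.Series([0, 0, 1, 1])

calculator = LogisticRegressionCalculator()
model = calculator.fit(X, y, config={"params": {"C": 0.5}})  # nested config style

applier = LogisticRegressionApplier()
preds = applier.predict(X, model)         # pd.Series of class labels
probas = applier.predict_proba(X, model)  # pd.DataFrame, one column per class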

RandomForestClassifierApplier

Bases: SklearnApplier

Random Forest Classifier Applier.

Source code in skyulf-core\skyulf\modeling\classification.py
class RandomForestClassifierApplier(SklearnApplier):
    """Random Forest Classifier Applier."""

    pass

RandomForestClassifierCalculator

Bases: SklearnCalculator

Random Forest Classifier Calculator.

Source code in skyulf-core\skyulf\modeling\classification.py
@NodeRegistry.register("random_forest_classifier", RandomForestClassifierApplier)
class RandomForestClassifierCalculator(SklearnCalculator):
    """Random Forest Classifier Calculator."""

    def __init__(self):
        super().__init__(
            model_class=RandomForestClassifier,
            default_params={
                "n_estimators": 50,
                "max_depth": 10,
                "min_samples_split": 5,
                "min_samples_leaf": 2,
                "n_jobs": -1,
                "random_state": 42,
            },
            problem_type="classification",
        )

RandomForestRegressorApplier

Bases: SklearnApplier

Random Forest Regressor Applier.

Source code in skyulf-core\skyulf\modeling\regression.py
class RandomForestRegressorApplier(SklearnApplier):
    """Random Forest Regressor Applier."""

    pass

RandomForestRegressorCalculator

Bases: SklearnCalculator

Random Forest Regressor Calculator.

Source code in skyulf-core\skyulf\modeling\regression.py
@NodeRegistry.register("random_forest_regressor", RandomForestRegressorApplier)
class RandomForestRegressorCalculator(SklearnCalculator):
    """Random Forest Regressor Calculator."""

    def __init__(self):
        super().__init__(
            model_class=RandomForestRegressor,
            default_params={
                "n_estimators": 50,
                "max_depth": 10,
                "min_samples_split": 5,
                "min_samples_leaf": 2,
                "n_jobs": -1,
                "random_state": 42,
            },
            problem_type="regression",
        )

RidgeRegressionApplier

Bases: SklearnApplier

Ridge Regression Applier.

Source code in skyulf-core\skyulf\modeling\regression.py
class RidgeRegressionApplier(SklearnApplier):
    """Ridge Regression Applier."""

    pass

RidgeRegressionCalculator

Bases: SklearnCalculator

Ridge Regression Calculator.

Source code in skyulf-core\skyulf\modeling\regression.py
@NodeRegistry.register("ridge_regression", RidgeRegressionApplier)
class RidgeRegressionCalculator(SklearnCalculator):
    """Ridge Regression Calculator."""

    def __init__(self):
        super().__init__(
            model_class=Ridge,
            default_params={
                "alpha": 1.0,
                "solver": "auto",
                "random_state": 42,
            },
            problem_type="regression",
        )

SklearnApplier

Bases: BaseModelApplier

Base applier for Scikit-Learn models.

Source code in skyulf-core\skyulf\modeling\sklearn_wrapper.py
class SklearnApplier(BaseModelApplier):
    """Base applier for Scikit-Learn models."""

    def predict(self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any) -> pd.Series:
        # Convert to Numpy
        X_np, _ = SklearnBridge.to_sklearn(df)

        preds = model_artifact.predict(X_np)

        # Return as Pandas Series for consistency
        # If input was Pandas, try to preserve index
        index = None
        if hasattr(df, "index"):
            index = df.index
        elif hasattr(df, "to_pandas"):
            # If it's a wrapper or Polars, we might lose the index unless we convert;
            # for now, a default index is acceptable for predictions.
            pass

        return pd.Series(preds, index=index)

    def predict_proba(
        self, df: Union[pd.DataFrame, SkyulfDataFrame], model_artifact: Any
    ) -> Optional[pd.DataFrame]:
        if not hasattr(model_artifact, "predict_proba"):
            return None

        X_np, _ = SklearnBridge.to_sklearn(df)
        probs = model_artifact.predict_proba(X_np)

        # Return as DataFrame
        index = None
        if hasattr(df, "index"):
            index = df.index

        # Column names usually 0, 1, etc. or classes_
        columns = None
        if hasattr(model_artifact, "classes_"):
            columns = model_artifact.classes_

        return pd.DataFrame(probs, index=index, columns=columns)
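
Because SklearnApplier only calls predict/predict_proba on the artifact, any fitted scikit-learn estimator serves as a sketch of the behavior above (pandas index preserved, probability columns taken from classes_); the import path is assumed from the file path above:

import pandas as pd
from sklearn.linear_model import LogisticRegression

from skyulf.modeling.sklearn_wrapper import SklearnApplier

X = pd.DataFrame({"a": [0.0, 1.0, 2.0, 3.0]}, index=[10, 11, 12, 13])
y = pd.Series([0, 0, 1, 1])

artifact = LogisticRegression().fit(X, y)
probas = SklearnApplier().predict_proba(X, artifact)

assert list(probas.index) == [10, 11, 12, 13]  # index preserved
assert list(probas.columns) == [0, 1]          # columns from classes_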

SklearnCalculator

Bases: BaseModelCalculator

Base calculator for Scikit-Learn models.

Source code in skyulf-core\skyulf\modeling\sklearn_wrapper.py
class SklearnCalculator(BaseModelCalculator):
    """Base calculator for Scikit-Learn models."""

    def __init__(
        self,
        model_class: Type[BaseEstimator],
        default_params: Dict[str, Any],
        problem_type: str,
    ):
        self.model_class = model_class
        self._default_params = default_params
        self._problem_type = problem_type

    @property
    def default_params(self) -> Dict[str, Any]:
        return self._default_params

    @property
    def problem_type(self) -> str:
        return self._problem_type

    def fit(
        self,
        X: Union[pd.DataFrame, SkyulfDataFrame],
        y: Union[pd.Series, Any],
        config: Dict[str, Any],
        progress_callback=None,
        log_callback=None,
        validation_data=None,
    ) -> Any:
        """Fit the Scikit-Learn model."""
        # 1. Merge Config with Defaults
        params = self.default_params.copy()
        if config:
            # We support two configuration structures:
            # 1. Nested: {'params': {'C': 1.0, ...}} - Preferred
            # 2. Flat: {'C': 1.0, 'type': '...', ...} - Legacy/Simple support

            # Check for explicit 'params' dictionary first
            overrides = config.get("params", {})

            # If 'params' key exists but is None or empty, check if there are other keys at top level
            # that might be params. But be careful not to mix them.
            # If config has 'params', we assume it's the source of truth.

            if not overrides and "params" not in config:
                # Fallback to flat config if 'params' key is completely missing
                reserved_keys = {
                    "type",
                    "target_column",
                    "node_id",
                    "step_type",
                    "inputs",
                }
                overrides = {
                    k: v
                    for k, v in config.items()
                    if k not in reserved_keys and not isinstance(v, dict)
                }

            if overrides:
                params.update(overrides)

        msg = f"Initializing {self.model_class.__name__} with params: {params}"
        logger.info(msg)
        if log_callback:
            log_callback(msg)

        # 2. Instantiate Model
        model = self.model_class(**params)

        # 3. Fit
        # Convert to Numpy using Bridge (handles Polars/Pandas/Wrappers)
        X_np, y_np = SklearnBridge.to_sklearn((X, y))

        model.fit(X_np, y_np)

        return model
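
The two accepted config shapes resolve to the same hyperparameters. A short runnable sketch of the merge behavior, with the import path and toy data assumed as in the earlier examples:

import pandas as pd

from skyulf.modeling.classification import LogisticRegressionCalculator

X = pd.DataFrame({"x": [0.0, 1.0, 2.0, 3.0]})
y = pd.Series([0, 0, 1, 1])
calc = LogisticRegressionCalculator()

m1 = calc.fit(X, y, {"params": {"max_iter": 500}})                     # nested (preferred)
m2 = calc.fit(X, y, {"type": "logistic_regression", "max_iter": 500})  # flat (legacy)

# Both merge over the defaults; reserved keys like "type" are ignored.
assert m1.get_params()["max_iter"] == m2.get_params()["max_iter"] == 500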

fit(X, y, config, progress_callback=None, log_callback=None, validation_data=None)

Fit the Scikit-Learn model.


StatefulEstimator

Source code in skyulf-core\skyulf\modeling\base.py
class StatefulEstimator:
    def __init__(
        self, calculator: BaseModelCalculator, applier: BaseModelApplier, node_id: str
    ):
        self.calculator = calculator
        self.applier = applier
        self.node_id = node_id
        self.model = None  # In-memory model storage

    def _extract_xy(
        self, data: Any, target_column: str
    ) -> tuple[Any, Any]:
        """Helper to extract X and y from DataFrame or Tuple."""
        if isinstance(data, tuple) and len(data) == 2:
            return data[0], data[1]

        engine = get_engine(data)

        if engine.name == "polars":
            if target_column not in data.columns:
                raise ValueError(f"Target column '{target_column}' not found in data")
            X = data.drop([target_column])
            y = data.select(target_column).to_series()
            return X, y

        # Pandas / Default
        # Check for DataFrame-like
        if hasattr(data, "columns"):
            if target_column not in data.columns:
                raise ValueError(f"Target column '{target_column}' not found in data")

            # Fallback for pure Pandas
            if isinstance(data, pd.DataFrame):
                return data.drop(columns=[target_column]), data[target_column]

        raise ValueError(f"Unexpected data type: {type(data)}")

    def cross_validate(
        self,
        dataset: SplitDataset,
        target_column: str,
        config: Dict[str, Any],
        n_folds: int = 5,
        cv_type: str = "k_fold",
        shuffle: bool = True,
        random_state: int = 42,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        log_callback: Optional[Callable[[str], None]] = None,
    ) -> Dict[str, Any]:
        """
        Performs cross-validation on the training split.
        """
        # Imported here to avoid a potential circular dependency
        from .cross_validation import perform_cross_validation

        X_train, y_train = self._extract_xy(dataset.train, target_column)

        return perform_cross_validation(
            calculator=self.calculator,
            applier=self.applier,
            X=X_train,
            y=y_train,
            config=config,
            n_folds=n_folds,
            cv_type=cv_type,
            shuffle=shuffle,
            random_state=random_state,
            progress_callback=progress_callback,
            log_callback=log_callback,
        )

    def fit_predict(
        self,
        dataset: Union[SplitDataset, pd.DataFrame, Tuple[pd.DataFrame, pd.Series]],
        target_column: str,
        config: Dict[str, Any],
        progress_callback: Optional[Callable[[int, int], None]] = None,
        log_callback: Optional[Callable[[str], None]] = None,
        job_id: str = "unknown",
    ) -> Dict[str, pd.Series]:
        """
        Fits the model on training data and returns predictions for all splits.
        """
        # Handle raw DataFrame or Tuple input by wrapping it in a dummy SplitDataset
        if isinstance(dataset, pd.DataFrame):
            dataset = SplitDataset(train=dataset, test=pd.DataFrame(), validation=None)
        elif isinstance(dataset, tuple):
            # Check if it's (train_df, test_df) or (X, y)
            elem0 = dataset[0]
            if isinstance(elem0, pd.DataFrame) and target_column in elem0.columns:
                # It's (train_df, test_df)
                train_df, test_df = dataset
                dataset = SplitDataset(train=train_df, test=test_df, validation=None)  # type: ignore
            else:
                # Fallback: Treat input as training data (e.g. X, y tuple) and initialize empty test set.
                msg = (
                    "WARNING: No test set provided. Using entire input as training data. "
                    "Ensure data was split BEFORE preprocessing to avoid data leakage."
                )
                logger.warning(msg)
                if log_callback:
                    log_callback(msg)

                dataset = SplitDataset(
                    train=dataset, test=pd.DataFrame(), validation=None
                )

        # 1. Prepare Data
        X_train, y_train = self._extract_xy(dataset.train, target_column)

        validation_data = None
        if dataset.validation is not None:
            X_val, y_val = self._extract_xy(dataset.validation, target_column)
            validation_data = (X_val, y_val)

        # 2. Train Model
        self.model = self.calculator.fit(
            X_train,
            y_train,
            config,
            progress_callback=progress_callback,
            log_callback=log_callback,
            validation_data=validation_data,
        )

        # 3. Predict on all splits
        predictions = {}

        # Train Predictions
        predictions["train"] = self.applier.predict(X_train, self.model)

        # Test Predictions
        is_test_empty = False
        test_df = None
        if isinstance(dataset.test, tuple):
            test_df = dataset.test[0]
        else:
            test_df = dataset.test

        if hasattr(test_df, "empty"):
            is_test_empty = test_df.empty
        else:
            # Polars
            is_test_empty = test_df.is_empty()

        if not is_test_empty:
            if isinstance(dataset.test, tuple):
                X_test, _ = dataset.test
            else:
                if target_column in dataset.test.columns:
                    try:
                        X_test = dataset.test.drop(columns=[target_column])
                    except TypeError:
                        # Polars
                        X_test = dataset.test.drop([target_column])
                else:
                    X_test = dataset.test
            predictions["test"] = self.applier.predict(X_test, self.model)

        # Validation Predictions
        if dataset.validation is not None:
            if isinstance(dataset.validation, tuple):
                X_val, _ = dataset.validation
            else:
                if target_column in dataset.validation.columns:
                    X_val = dataset.validation.drop(columns=[target_column])
                else:
                    X_val = dataset.validation
            predictions["validation"] = self.applier.predict(X_val, self.model)

        return predictions

    def refit(
        self,
        dataset: SplitDataset,
        target_column: str,
        config: Dict[str, Any],
        job_id: str = "unknown",
    ) -> None:
        """
        Refits the model on Train + Validation data and updates the artifact.
        """
        if dataset.validation is None:
            # Fallback to normal fit if no validation set
            self.fit_predict(dataset, target_column, config, job_id=job_id)
            return

        # 1. Prepare Combined Data
        X_train, y_train = self._extract_xy(dataset.train, target_column)
        X_val, y_val = self._extract_xy(dataset.validation, target_column)

        X_combined = pd.concat([X_train, X_val], axis=0)
        y_combined = pd.concat([y_train, y_val], axis=0)

        # 2. Train Model
        self.model = self.calculator.fit(X_combined, y_combined, config)

    def evaluate(  # noqa: C901
        self, dataset: SplitDataset, target_column: str, job_id: str = "unknown"
    ) -> Any:
        """
        Evaluates the model on all splits and returns a detailed report.
        """
        # Import here to avoid circular dependency
        from .evaluation.classification import evaluate_classification_model
        from .evaluation.regression import evaluate_regression_model

        if self.model is None:
            raise ValueError(
                "Model has not been trained yet. Call fit_predict() first."
            )

        problem_type = self.calculator.problem_type

        splits_payload = {}

        # Container for raw predictions
        evaluation_data = {
            "job_id": job_id,
            "node_id": self.node_id,
            "problem_type": problem_type,
            "splits": {},
        }

        # Helper to evaluate a single split
        def evaluate_split(split_name: str, data: Any):
            if isinstance(data, tuple):
                X, y = data
            elif isinstance(data, pd.DataFrame):
                if target_column not in data.columns:
                    return None  # Cannot evaluate without target
                X = data.drop(columns=[target_column])
                y = data[target_column]
            else:
                return None

            y_pred = self.applier.predict(X, self.model)

            # Try to get probabilities for classification
            y_proba = None
            if problem_type == "classification":
                y_proba_df = self.applier.predict_proba(X, self.model)
                if y_proba_df is not None:
                    y_proba = {
                        "classes": y_proba_df.columns.tolist(),
                        "values": y_proba_df.values.tolist(),
                    }

            split_data = {
                "y_true": y.tolist() if hasattr(y, "tolist") else list(y),
                "y_pred": (
                    y_pred.tolist() if hasattr(y_pred, "tolist") else list(y_pred)
                ),
            }

            if y_proba:
                split_data["y_proba"] = y_proba

            evaluation_data["splits"][split_name] = split_data

            # Unpack model if it's a tuple (from Tuner)
            model_to_evaluate = self.model
            if isinstance(self.model, tuple) and len(self.model) == 2:
                # Check if first element looks like a model (has fit/predict)
                # or if it's just a convention from TuningCalculator
                model_to_evaluate = self.model[0]

            if problem_type == "classification":
                return evaluate_classification_model(
                    model=model_to_evaluate, dataset_name=split_name, X_test=X, y_test=y
                )
            elif problem_type == "regression":
                return evaluate_regression_model(
                    model=model_to_evaluate, dataset_name=split_name, X_test=X, y_test=y
                )
            else:
                raise ValueError(f"Unknown problem type: {problem_type}")

        # 2. Evaluate Train
        splits_payload["train"] = evaluate_split("train", dataset.train)

        # 3. Evaluate Test
        has_test = False
        if isinstance(dataset.test, pd.DataFrame):
            has_test = not dataset.test.empty
        elif isinstance(dataset.test, tuple):
            has_test = len(dataset.test) == 2 and len(dataset.test[0]) > 0

        if has_test:
            splits_payload["test"] = evaluate_split("test", dataset.test)

        # 4. Evaluate Validation
        if dataset.validation is not None:
            has_val = False
            if isinstance(dataset.validation, pd.DataFrame):
                has_val = not dataset.validation.empty
            elif isinstance(dataset.validation, tuple):
                has_val = (
                    len(dataset.validation) == 2 and len(dataset.validation[0]) > 0
                )

            if has_val:
                splits_payload["validation"] = evaluate_split(
                    "validation", dataset.validation
                )

        # Return report object (simplified for now, assuming schema matches)
        return {
            "problem_type": problem_type,
            "splits": splits_payload,
            "raw_data": evaluation_data,
        }
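
Putting the pieces together, a minimal sketch of the train/predict flow, assuming the import paths used in the earlier examples; the toy data is illustrative:

import pandas as pd

from skyulf.modeling.base import StatefulEstimator
from skyulf.modeling.classification import (
    LogisticRegressionApplier,
    LogisticRegressionCalculator,
)

train = pd.DataFrame({"x": [0.0, 1.0, 2.0, 3.0], "target": [0, 0, 1, 1]})
test = pd.DataFrame({"x": [1.5, 2.5], "target": [0, 1]})

estimator = StatefulEstimator(
    calculator=LogisticRegressionCalculator(),
    applier=LogisticRegressionApplier(),
    node_id="model-1",
)

# Both tuple elements carry the target column, so the tuple is treated as
# (train_df, test_df) and wrapped into a SplitDataset internally.
predictions = estimator.fit_predict((train, test), target_column="target", config={})
print(predictions["train"])  # pd.Series of train predictions
print(predictions["test"])   # pd.Series of test predictions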

cross_validate(dataset, target_column, config, n_folds=5, cv_type='k_fold', shuffle=True, random_state=42, progress_callback=None, log_callback=None)

Performs cross-validation on the training split.


evaluate(dataset, target_column, job_id='unknown')

Evaluates the model on all splits and returns a detailed report.


fit_predict(dataset, target_column, config, progress_callback=None, log_callback=None, job_id='unknown')

Fits the model on training data and returns predictions for all splits.


refit(dataset, target_column, config, job_id='unknown')

Refits the model on Train + Validation data and updates the artifact.


perform_cross_validation(calculator, applier, X, y, config, n_folds=5, cv_type='k_fold', shuffle=True, random_state=42, progress_callback=None, log_callback=None)

Performs cross-validation (K-Fold by default; stratified, time-series, and shuffle splits are selected via cv_type).

Parameters:

    calculator (BaseModelCalculator): The model calculator (fit logic). Required.
    applier (BaseModelApplier): The model applier (predict logic). Required.
    X (Union[DataFrame, SkyulfDataFrame]): Features. Required.
    y (Union[Series, Any]): Target. Required.
    config (Dict[str, Any]): Model configuration. Required.
    n_folds (int): Number of folds. Default: 5.
    cv_type (str): Type of CV. Default: 'k_fold'.
    shuffle (bool): Whether to shuffle data before splitting (for KFold/Stratified). Default: True.
    random_state (int): Random seed for shuffling. Default: 42.
    progress_callback (Optional[Callable[[int, int], None]]): Optional callback(current_fold, total_folds). Default: None.
    log_callback (Optional[Callable[[str], None]]): Optional callback for logging messages. Default: None.

Returns:

    Dict[str, Any]: Dict containing aggregated metrics and per-fold details.

Source code in skyulf-core\skyulf\modeling\cross_validation.py
def perform_cross_validation(
    calculator: BaseModelCalculator,
    applier: BaseModelApplier,
    X: Union[pd.DataFrame, SkyulfDataFrame],
    y: Union[pd.Series, Any],
    config: Dict[str, Any],
    n_folds: int = 5,
    cv_type: str = "k_fold",  # k_fold, stratified_k_fold, time_series_split, shuffle_split
    shuffle: bool = True,
    random_state: int = 42,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    log_callback: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """
    Performs cross-validation (K-Fold by default; other split strategies are selected via cv_type).

    Args:
        calculator: The model calculator (fit logic).
        applier: The model applier (predict logic).
        X: Features.
        y: Target.
        config: Model configuration.
        n_folds: Number of folds.
        cv_type: Type of CV.
        shuffle: Whether to shuffle data before splitting (for KFold/Stratified).
        random_state: Random seed for shuffling.
        progress_callback: Optional callback(current_fold, total_folds).
        log_callback: Optional callback for logging messages.

    Returns:
        Dict containing aggregated metrics and per-fold details.
    """

    problem_type = calculator.problem_type

    if log_callback:
        log_callback(f"Starting Cross-Validation (Folds: {n_folds}, Type: {cv_type})")

    # 1. Setup Splitter
    if cv_type == "time_series_split":
        splitter = TimeSeriesSplit(n_splits=n_folds)
    elif cv_type == "shuffle_split":
        splitter = ShuffleSplit(
            n_splits=n_folds, test_size=0.2, random_state=random_state
        )
    elif cv_type == "stratified_k_fold" and problem_type == "classification":
        splitter = StratifiedKFold(
            n_splits=n_folds,
            shuffle=shuffle,
            random_state=random_state if shuffle else None,
        )
    else:
        # Default to KFold
        splitter = KFold(
            n_splits=n_folds,
            shuffle=shuffle,
            random_state=random_state if shuffle else None,
        )

    fold_results = []

    # Ensure numpy for splitting using the Bridge
    X_arr, y_arr = SklearnBridge.to_sklearn((X, y))

    # 2. Iterate Folds
    for fold_idx, (train_idx, val_idx) in enumerate(splitter.split(X_arr, y_arr)):
        if progress_callback:
            progress_callback(fold_idx + 1, n_folds)

        if log_callback:
            log_callback(f"Processing Fold {fold_idx + 1}/{n_folds}...")

        # Split Data
        # We slice the original X/y to preserve their type (Pandas/Polars) for the calculator
        # Polars supports slicing with numpy arrays via __getitem__
        # Pandas supports slicing via iloc

        if hasattr(X, "iloc"):
            X_train_fold = X.iloc[train_idx]
            X_val_fold = X.iloc[val_idx]
        else:
            # Polars or other
            X_train_fold = X[train_idx]
            X_val_fold = X[val_idx]

        if hasattr(y, "iloc"):
            y_train_fold = y.iloc[train_idx]
            y_val_fold = y.iloc[val_idx]
        else:
            # Polars Series or numpy array
            y_train_fold = y[train_idx]
            y_val_fold = y[val_idx]

        # Fit
        model_artifact = calculator.fit(X_train_fold, y_train_fold, config)

        # Evaluate
        if problem_type == "classification":
            metrics = calculate_classification_metrics(
                model_artifact, X_val_fold, y_val_fold
            )
        else:
            metrics = calculate_regression_metrics(
                model_artifact, X_val_fold, y_val_fold
            )

        if log_callback:
            # Log a key metric for the fold
            key_metric = "accuracy" if problem_type == "classification" else "r2"
            score = metrics.get(key_metric, 0.0)
            log_callback(f"Fold {fold_idx + 1} completed. {key_metric}: {score:.4f}")

        fold_results.append(
            {
                "fold": fold_idx + 1,
                "metrics": sanitize_metrics(metrics),
                # We could store predictions here if needed, but might be too heavy
            }
        )

    # 3. Aggregate
    fold_metrics = [cast(Dict[str, float], r["metrics"]) for r in fold_results]
    aggregated = _aggregate_metrics(fold_metrics)

    if log_callback:
        log_callback(f"Cross-Validation Completed. Aggregated Metrics: {aggregated}")

    return {
        "aggregated_metrics": aggregated,
        "folds": fold_results,
        "cv_config": {
            "n_folds": n_folds,
            "cv_type": cv_type,
            "shuffle": shuffle,
            "random_state": random_state,
        },
    }
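
A minimal usage sketch, assuming the import path skyulf.modeling.cross_validation (mirroring the file path above); the toy data is illustrative:

import pandas as pd

from skyulf.modeling.classification import (
    LogisticRegressionApplier,
    LogisticRegressionCalculator,
)
from skyulf.modeling.cross_validation import perform_cross_validation

X = pd.DataFrame({"x": [float(i) for i in range(20)]})
y = pd.Series([0] * 10 + [1] * 10)

result = perform_cross_validation(
    calculator=LogisticRegressionCalculator(),
    applier=LogisticRegressionApplier(),
    X=X,
    y=y,
    config={},
    n_folds=4,
    cv_type="stratified_k_fold",
    log_callback=print,
)

print(result["aggregated_metrics"])  # aggregated scores across folds
print(len(result["folds"]))          # -> 4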