Skip to content

API: preprocessing.bucketing

skyulf.preprocessing.bucketing

Binning / discretization nodes (GeneralBinning, CustomBinning, KBinsDiscretizer).

All Appliers share a single `BaseBinningApplier`, which routes through `apply_dual_engine`. Per-engine helpers live at module level so that each one keeps a low cyclomatic complexity (CCN). Fits are bound to sklearn / pandas (`pd.cut`, `pd.qcut`, `KBinsDiscretizer`), so they convert the input via `to_pandas` once at the top and do not use `fit_dual_engine`.

BaseBinningApplier

Bases: BaseApplier

Shared Applier for all binning Calculators.

Expects bin_edges in params: Dict[str, List[float]] mapping column names to bin edges.

Source code in skyulf-core/skyulf/preprocessing/bucketing.py
227
228
229
230
231
232
233
234
235
236
class BaseBinningApplier(BaseApplier):
    """Single Applier implementation used by every binning Calculator.

    ``params`` is expected to carry ``bin_edges``: a ``Dict[str, List[float]]``
    that maps each column name to its bin edges.
    """

    @apply_method
    def apply(self, X: Any, _y: Any, params: Dict[str, Any]) -> Any:
        """Delegate to ``apply_dual_engine`` with the bucketing engine helpers."""
        # Order matters: polars handler first, pandas handler second.
        engine_handlers = (_bucketing_apply_polars, _bucketing_apply_pandas)
        return apply_dual_engine(X, params, *engine_handlers)

CustomBinningCalculator

Bases: BaseCalculator

Apply user-supplied bin edges to selected columns.

Source code in skyulf-core/skyulf/preprocessing/bucketing.py
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
@NodeRegistry.register("CustomBinning", CustomBinningApplier)
@node_meta(
    id="CustomBinning",
    name="Custom Binning",
    category="Preprocessing",
    description="Bin data using custom edges.",
    params={"bins": [], "columns": []},
)
class CustomBinningCalculator(BaseCalculator):
    """Apply user-supplied bin edges to selected columns.

    The edges in ``config["bins"]`` are sorted ascending and recorded for every
    resolved column that is present in the input frame.
    """

    @fit_method
    def fit(self, X: Any, _y: Any, config: Dict[str, Any]) -> GeneralBinningArtifact:
        """Build a ``general_binning`` artifact from the configured edges.

        Args:
            X: Input data; converted with ``_to_pandas_for_fit`` before use.
            _y: Not used by this calculator.
            config: Node configuration. ``"bins"`` holds the edges shared by
                all columns; the columns themselves are resolved by
                ``resolve_columns``.

        Returns:
            An empty artifact when ``user_picked_no_columns(config)`` is true.
            Otherwise a mapping with ``"type"`` and ``"bin_edges"`` plus
            whatever ``_passthrough_artifact_options(config)`` contributes.
            ``"bin_edges"`` is empty when no bins were configured.
        """
        if user_picked_no_columns(config):
            return cast(GeneralBinningArtifact, {})

        X = _to_pandas_for_fit(X)
        columns = resolve_columns(X, config, detect_numeric_columns)
        bins = config.get("bins")

        bin_edges_map: Dict[str, List[float]] = {}
        if bins:
            sorted_bins = sorted(bins)
            # Each column gets its own list: sharing one list object would let
            # an in-place change to one column's edges alter every other column.
            bin_edges_map = {col: list(sorted_bins) for col in columns if col in X.columns}

        artifact: Dict[str, Any] = {
            "type": "general_binning",  # Reuses GeneralBinningApplier.
            "bin_edges": bin_edges_map,
        }
        artifact.update(_passthrough_artifact_options(config))
        return cast(GeneralBinningArtifact, artifact)

GeneralBinningCalculator

Bases: BaseCalculator

Master calculator that handles mixed strategies and per-column overrides.

Source code in skyulf-core/skyulf/preprocessing/bucketing.py
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
@NodeRegistry.register("GeneralBinning", GeneralBinningApplier)
@node_meta(
    id="GeneralBinning",
    name="General Binning",
    category="Preprocessing",
    description="Bin continuous data into intervals.",
    params={"n_bins": 5, "strategy": "uniform", "columns": []},
)
class GeneralBinningCalculator(BaseCalculator):
    """Primary binning calculator: mixed strategies plus per-column overrides."""

    @fit_method
    def fit(self, X: Any, _y: Any, config: Dict[str, Any]) -> GeneralBinningArtifact:
        """Fit bin edges for each resolved column and package them as an artifact.

        Returns an empty artifact when ``user_picked_no_columns(config)`` is
        true. Otherwise the result holds ``"type"``, ``"bin_edges"`` and
        ``"custom_labels"``, plus the entries contributed by
        ``_passthrough_artifact_options(config)``.
        """
        if user_picked_no_columns(config):
            return cast(GeneralBinningArtifact, {})

        frame = _to_pandas_for_fit(X)
        requested = resolve_columns(frame, config, detect_numeric_columns)

        # ``n_bins`` is the fallback for both the equal-width and the
        # equal-frequency bin counts when no dedicated setting is present.
        fallback_n_bins = config.get("n_bins", 5)
        defaults = {
            "default_n_bins": fallback_n_bins,
            "n_bins": config.get("equal_width_bins", fallback_n_bins),
            "q_bins": config.get("equal_frequency_bins", fallback_n_bins),
            "duplicates": config.get("duplicates", "drop"),
        }

        edges_by_column: Dict[str, List[float]] = {}
        labels_by_column: Dict[str, Any] = {}

        # Only columns actually present in the frame are fitted; the helper
        # receives both maps so it can record this column's results in them.
        present = [name for name in requested if name in frame.columns]
        for name in present:
            _fit_one_column_into_maps(
                frame, name, config, defaults, edges_by_column, labels_by_column
            )

        artifact: Dict[str, Any] = {
            "type": "general_binning",
            "bin_edges": edges_by_column,
            "custom_labels": labels_by_column,
            **_passthrough_artifact_options(config),
        }
        return cast(GeneralBinningArtifact, artifact)

KBinsDiscretizerCalculator

Bases: GeneralBinningCalculator

Thin wrapper around `GeneralBinningCalculator` that forces the `kbins` strategy.

Source code in skyulf-core/skyulf/preprocessing/bucketing.py
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
@NodeRegistry.register("KBinsDiscretizer", KBinsDiscretizerApplier)
@node_meta(
    id="KBinsDiscretizer",
    name="K-Bins Discretizer",
    category="Preprocessing",
    description="Bin continuous data into intervals using sklearn KBinsDiscretizer.",
    params={"n_bins": 5, "encode": "ordinal", "strategy": "quantile", "columns": []},
)
class KBinsDiscretizerCalculator(GeneralBinningCalculator):
    """Adapter that runs :class:`GeneralBinningCalculator` with the ``kbins`` strategy."""

    def fit(
        self,
        df: Union[pd.DataFrame, SkyulfDataFrame, Tuple[Any, ...], Any],
        config: Dict[str, Any],
    ) -> GeneralBinningArtifact:
        """Translate the node config into ``kbins_*`` keys and delegate to the parent.

        The caller's ``config`` is left untouched; a copy is modified instead.
        ``n_bins`` is mirrored into ``kbins_n_bins``, and any ``strategy`` other
        than ``"kbins"`` is moved to ``kbins_strategy`` before ``strategy``
        itself is forced to ``"kbins"``.
        """
        kbins_config = config.copy()
        kbins_config["strategy"] = "kbins"

        if "n_bins" in config:
            kbins_config["kbins_n_bins"] = config["n_bins"]

        # A missing strategy defaults to "kbins" here, so it is not copied over.
        requested_strategy = config.get("strategy", "kbins")
        if requested_strategy != "kbins":
            kbins_config["kbins_strategy"] = requested_strategy

        return super().fit(df, kbins_config)