Skip to content

Serialization

What is persisted

SkyulfPipeline.save() uses Python pickle to serialize the entire pipeline object.

That includes:

  • preprocessing fitted artifacts (per-step params)
  • the trained model (sklearn estimator object)

Practical guidance

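  • Load a saved pipeline in an environment that has the same library versions (e.g., Python, scikit-learn, pandas) as the one it was saved in; pickled objects are not guaranteed to load correctly across versions.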
  • Some preprocessing nodes store sklearn objects inside params (e.g., KNN/Iterative imputers, OneHotEncoder). Those are not JSON-serializable and require pickling.

Load and use

from __future__ import annotations

import tempfile
from pathlib import Path

import pandas as pd

from skyulf.pipeline import SkyulfPipeline

# Small example frame: "age" has two missing entries, "city" is categorical,
# and "target" is the 0/1 label column referenced by the config below.
df = pd.DataFrame(
  dict(
    age=[10, 20, None, 40, 50, 60, None, 80],
    city=["A", "B", "A", "C", "B", "A", "C", "B"],
    target=[0, 1, 0, 1, 1, 0, 1, 0],
  )
)

# Pipeline description passed to SkyulfPipeline: an ordered list of named
# preprocessing steps ("split", "impute", "encode") plus the model spec.
config = {
  "preprocessing": [
    dict(
      name="split",
      transformer="TrainTestSplitter",
      params=dict(
        test_size=0.2,
        validation_size=0.0,
        random_state=42,
        shuffle=True,
        stratify=True,
        target_column="target",
      ),
    ),
    dict(
      name="impute",
      transformer="SimpleImputer",
      params=dict(strategy="mean", columns=["age"]),
    ),
    dict(
      name="encode",
      transformer="OneHotEncoder",
      params=dict(columns=["city"], drop_original=True),
    ),
  ],
  "modeling": dict(
    type="random_forest_classifier",
    params=dict(n_estimators=50, random_state=42),
  ),
}

# Round trip: fit, pickle to a throwaway directory, load back, and predict.
with tempfile.TemporaryDirectory() as scratch_dir:
  model_path = Path(scratch_dir).joinpath("model.pkl")

  # Build the pipeline from the config, fit it on df, and write it to disk.
  pipeline = SkyulfPipeline(config)
  _ = pipeline.fit(df, target_column="target")
  pipeline.save(model_path)

  # Restore the pipeline from the pickle and predict on rows that carry
  # only the feature columns (no "target").
  loaded = SkyulfPipeline.load(model_path)
  new_df = pd.DataFrame({"age": [25, None], "city": ["A", "C"]})
  preds = loaded.predict(new_df)

# Printed after the temporary directory (and the pickle in it) is removed.
print(preds)