Recipes
This page contains practical patterns for common workflows.
Recipe: split, preprocess, train
If you want evaluation metrics, use a split step (or pass a SplitDataset).
from __future__ import annotations
import pandas as pd
from skyulf.pipeline import SkyulfPipeline
# In real usage you'd likely load a file:
# df = pd.read_csv("your_data.csv")
df = pd.DataFrame(
{
"free_text": [
" Hello World ",
"Skyulf is GREAT! ",
" hello\tworld ",
" ML pipelines ",
"Encode + scale",
" text cleaning ",
],
"country": ["TR", "TR", "DE", "DE", "TR", "DE"],
"age": [10, 20, 30, 40, 50, 60],
"target": [0, 1, 0, 1, 1, 0],
}
)
config = {
"preprocessing": [
{
"name": "split",
"transformer": "TrainTestSplitter",
"params": {
"test_size": 0.34,
"validation_size": 0.0,
"random_state": 42,
"shuffle": True,
"stratify": True,
"target_column": "target",
},
},
{
"name": "text_clean",
"transformer": "TextCleaning",
"params": {
"columns": ["free_text"],
"operations": [
{"op": "trim", "mode": "both"},
{"op": "case", "mode": "lower"},
{"op": "regex", "mode": "collapse_whitespace"},
],
},
},
{
"name": "encode",
"transformer": "OneHotEncoder",
"params": {
"columns": ["country", "free_text"],
"drop_original": True,
"handle_unknown": "ignore",
},
},
{
"name": "impute",
"transformer": "SimpleImputer",
"params": {"strategy": "mean", "columns": ["age"]},
},
{
"name": "scale",
"transformer": "StandardScaler",
"params": {"auto_detect": True},
},
],
"modeling": {"type": "random_forest_classifier", "params": {"n_estimators": 200}},
}
pipeline = SkyulfPipeline(config)
report = pipeline.fit(df, target_column="target")
print(report.get("modeling"))
Recipe: safe inference
At inference time you typically:
- load a persisted pipeline
- call `predict(df)` on a dataframe without the target column
from __future__ import annotations
import tempfile
from pathlib import Path
import pandas as pd
from skyulf.pipeline import SkyulfPipeline
df = pd.DataFrame(
{
"age": [10, 20, 30, 40, 50, 60],
"city": ["A", "B", "A", "C", "B", "A"],
"target": [0, 1, 0, 1, 1, 0],
}
)
config = {
"preprocessing": [
{
"name": "split",
"transformer": "TrainTestSplitter",
"params": {
"test_size": 0.2,
"validation_size": 0.0,
"random_state": 42,
"shuffle": True,
"stratify": True,
"target_column": "target",
},
},
{
"name": "impute",
"transformer": "SimpleImputer",
"params": {"strategy": "mean", "columns": ["age"]},
},
{
"name": "encode",
"transformer": "OneHotEncoder",
"params": {"columns": ["city"], "drop_original": True},
},
],
"modeling": {
"type": "random_forest_classifier",
"params": {"n_estimators": 50, "random_state": 42},
},
}
with tempfile.TemporaryDirectory() as tmp:
model_path = Path(tmp) / "model.pkl"
pipeline = SkyulfPipeline(config)
_ = pipeline.fit(df, target_column="target")
pipeline.save(model_path)
loaded = SkyulfPipeline.load(model_path)
new_df = pd.DataFrame({"age": [25, 55], "city": ["A", "B"]})
preds = loaded.predict(new_df)
print(preds)
Recipe: use a single component (debug)
For debugging, you can run one node directly.
import pandas as pd

from skyulf.preprocessing.imputation import SimpleImputerApplier, SimpleImputerCalculator

# A tiny frame with one missing value in column "A".
frame = pd.DataFrame({"A": [1, 2, None, 4]})
node_config = {"columns": ["A"], "strategy": "mean"}

# Fit the calculator to learn the fill value, then apply it to the frame.
fitted_params = SimpleImputerCalculator().fit(frame, node_config)
result = SimpleImputerApplier().apply(frame, fitted_params)

print(fitted_params)
print(result)