API: preprocessing.split

`skyulf.preprocessing.split`

`DataSplitter`

Splits a DataFrame into Train, Test, and optionally Validation sets.

Source code in skyulf-core\skyulf\preprocessing\split.py

class DataSplitter:
    """
    Splits a DataFrame into Train, Test, and optionally Validation sets.

    Splitting is delegated to ``train_test_split`` in two stages: the test
    partition is carved off first, then (only when ``validation_size > 0``) the
    remainder is split again into train and validation. Inputs whose engine is
    reported as ``"polars"`` are converted to pandas for the split and converted
    back to Polars before being returned.

    Args:
        test_size: Forwarded as ``test_size`` to the first split.
        validation_size: Size of the validation partition, expressed relative to
            the full dataset; it is rescaled by ``1 - test_size`` for the second
            split. ``0`` (the default) disables the validation split.
        random_state: Forwarded to every ``train_test_split`` call.
        shuffle: Forwarded to every ``train_test_split`` call.
        stratify_col: In :meth:`split`, the name of the column to stratify on.
            In :meth:`split_xy`, any truthy value means "stratify on ``y``".
    """

    def __init__(
        self,
        test_size: float = 0.2,
        validation_size: float = 0.0,
        random_state: int = 42,
        shuffle: bool = True,
        stratify_col: Optional[str] = None,
    ):
        self.test_size = test_size
        self.validation_size = validation_size
        self.random_state = random_state
        self.shuffle = shuffle
        self.stratify_col = stratify_col

    @staticmethod
    def _usable_stratify(labels: Any, for_validation: bool = False) -> Any:
        """Return *labels* if every class has at least 2 members, else ``None``.

        When the least populated class has fewer than 2 members a warning is
        logged and ``None`` is returned so the caller falls back to an
        unstratified split.

        Args:
            labels: Candidate stratification labels, or ``None``.
            for_validation: Selects the wording of the warning (first split vs.
                validation split).
        """
        if labels is None:
            return None

        # y may be a plain array-like (the signature allows ``Any``); only
        # pandas objects expose ``value_counts`` themselves, so wrap the rest.
        if hasattr(labels, "value_counts"):
            class_counts = labels.value_counts()
        else:
            class_counts = pd.Series(labels).value_counts()
        min_count = class_counts.min()

        if min_count < 2:
            if for_validation:
                logger.warning(
                    "Stratified validation split requested but the least populated class has only "
                    f"{min_count} member(s). Stratification will be disabled for validation split."
                )
            else:
                logger.warning(
                    f"Stratified split requested but the least populated class has only {min_count} "
                    "member(s). Stratification will be disabled."
                )
            return None
        return labels

    def split_xy(self, X: Union[pd.DataFrame, SkyulfDataFrame], y: Union[pd.Series, Any]) -> SplitDataset:
        """
        Splits X and y in parallel into train / test / (optional) validation.

        Args:
            X: Feature table. If its engine is ``"polars"`` it is converted to
                pandas for the split and the results are converted back.
            y: Target values aligned with ``X``. Used as the stratification
                labels whenever ``self.stratify_col`` is truthy.

        Returns:
            A ``SplitDataset`` whose ``train`` and ``test`` are ``(X, y)`` tuples
            and whose ``validation`` is an ``(X, y)`` tuple or ``None``.
        """
        engine = get_engine(X)
        is_polars = engine.name == "polars"

        if is_polars:
            # The split itself runs on pandas objects; convert back afterwards.
            X_pd = X.to_pandas()
            # y may be None or already a non-Polars object; only convert when possible.
            y_pd = y.to_pandas() if hasattr(y, "to_pandas") else y
        else:
            X_pd = X
            y_pd = y

        # If stratify is requested, use y (dropped again if a class is too small).
        stratify = self._usable_stratify(y_pd) if self.stratify_col else None

        # First split: Train+Val vs Test
        X_train_val, X_test, y_train_val, y_test = train_test_split(
            X_pd,
            y_pd,
            test_size=self.test_size,
            random_state=self.random_state,
            shuffle=self.shuffle,
            stratify=stratify,
        )

        validation = None
        if self.validation_size > 0:
            # validation_size is relative to the full data; rescale it to the
            # train+val remainder that is being split here.
            relative_val_size = self.validation_size / (1 - self.test_size)
            stratify_val = (
                self._usable_stratify(y_train_val, for_validation=True)
                if self.stratify_col
                else None
            )

            X_train, X_val, y_train, y_val = train_test_split(
                X_train_val,
                y_train_val,
                test_size=relative_val_size,
                random_state=self.random_state,
                shuffle=self.shuffle,
                stratify=stratify_val,
            )
            validation = (X_val, y_val)
        else:
            X_train, y_train = X_train_val, y_train_val

        # Convert back to Polars if needed
        if is_polars:
            import polars as pl

            def to_pl(df_or_series):
                # Only pandas objects are converted; None and anything else pass through.
                if isinstance(df_or_series, (pd.DataFrame, pd.Series)):
                    return pl.from_pandas(df_or_series)
                return df_or_series

            X_train = to_pl(X_train)
            y_train = to_pl(y_train)
            X_test = to_pl(X_test)
            y_test = to_pl(y_test)

            if validation is not None:
                validation = (to_pl(validation[0]), to_pl(validation[1]))

        return SplitDataset(
            train=(X_train, y_train), test=(X_test, y_test), validation=validation
        )

    def split(self, df: Union[pd.DataFrame, SkyulfDataFrame]) -> SplitDataset:
        """
        Splits a single DataFrame into train / test / (optional) validation.

        Args:
            df: Table to split. If its engine is ``"polars"`` it is converted to
                pandas for the split and the results are converted back.

        Returns:
            A ``SplitDataset`` whose ``train`` and ``test`` are frames and whose
            ``validation`` is a frame or ``None``.
        """
        engine = get_engine(df)
        is_polars = engine.name == "polars"

        if is_polars:
            # The split itself runs on pandas objects; convert back afterwards.
            df_pd = df.to_pandas()
        else:
            df_pd = df

        # Resolve the stratification column once; a configured-but-missing
        # column is reported instead of being silently ignored.
        stratify_on = None
        if self.stratify_col:
            if self.stratify_col in df_pd.columns:
                stratify_on = self.stratify_col
            else:
                logger.warning(
                    f"Stratify column '{self.stratify_col}' not found in DataFrame. "
                    "Stratification will be disabled."
                )

        stratify = (
            self._usable_stratify(df_pd[stratify_on]) if stratify_on is not None else None
        )

        train_val, test = train_test_split(
            df_pd,
            test_size=self.test_size,
            random_state=self.random_state,
            shuffle=self.shuffle,
            stratify=stratify,
        )

        validation = None
        if self.validation_size > 0:
            # validation_size is relative to the full data; rescale it to the
            # train+val remainder that is being split here.
            relative_val_size = self.validation_size / (1 - self.test_size)

            stratify_val = None
            if stratify_on is not None:
                stratify_val = self._usable_stratify(
                    train_val[stratify_on], for_validation=True
                )

            train, val = train_test_split(
                train_val,
                test_size=relative_val_size,
                random_state=self.random_state,
                shuffle=self.shuffle,
                stratify=stratify_val,
            )
            validation = val
        else:
            train = train_val

        # Convert back to Polars if needed
        if is_polars:
            import polars as pl
            train = pl.from_pandas(train)
            test = pl.from_pandas(test)
            if validation is not None:
                validation = pl.from_pandas(validation)

        return SplitDataset(train=train, test=test, validation=validation)

`split(df)`

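Splits a single DataFrame into train, test, and (when `validation_size > 0`) validation partitions, stratifying on `stratify_col` when it is set and present in the frame.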

Source code in skyulf-core\skyulf\preprocessing\split.py

def split(self, df: Union[pd.DataFrame, SkyulfDataFrame]) -> SplitDataset:
    """
    Splits a single DataFrame into train / test / (optional) validation.

    The test partition is carved off first with ``train_test_split``; when
    ``self.validation_size > 0`` the remainder is split again into train and
    validation. Stratification uses the column named by ``self.stratify_col``
    and is dropped (with a warning) when that column is missing or when its
    least populated class has fewer than 2 members.

    Args:
        df: Table to split. If its engine is ``"polars"`` it is converted to
            pandas for the split and the results are converted back.

    Returns:
        A ``SplitDataset`` whose ``train`` and ``test`` are frames and whose
        ``validation`` is a frame or ``None``.
    """
    engine = get_engine(df)
    is_polars = engine.name == "polars"

    # The split itself runs on pandas objects; convert back afterwards.
    df_pd = df.to_pandas() if is_polars else df

    def usable_labels(labels, for_validation=False):
        # Stratifying needs at least 2 members in every class; otherwise warn
        # and fall back to an unstratified split by returning None.
        min_count = labels.value_counts().min()
        if min_count < 2:
            if for_validation:
                logger.warning(
                    "Stratified validation split requested but the least populated class has only "
                    f"{min_count} member(s). Stratification will be disabled for validation split."
                )
            else:
                logger.warning(
                    f"Stratified split requested but the least populated class has only {min_count} "
                    "member(s). Stratification will be disabled."
                )
            return None
        return labels

    # Resolve the stratification column once; a configured-but-missing column
    # is reported instead of being silently ignored.
    stratify_on = None
    if self.stratify_col:
        if self.stratify_col in df_pd.columns:
            stratify_on = self.stratify_col
        else:
            logger.warning(
                f"Stratify column '{self.stratify_col}' not found in DataFrame. "
                "Stratification will be disabled."
            )

    stratify = usable_labels(df_pd[stratify_on]) if stratify_on is not None else None

    train_val, test = train_test_split(
        df_pd,
        test_size=self.test_size,
        random_state=self.random_state,
        shuffle=self.shuffle,
        stratify=stratify,
    )

    validation = None
    if self.validation_size > 0:
        # validation_size is relative to the full data; rescale it to the
        # train+val remainder that is being split here.
        relative_val_size = self.validation_size / (1 - self.test_size)

        stratify_val = None
        if stratify_on is not None:
            stratify_val = usable_labels(train_val[stratify_on], for_validation=True)

        train, val = train_test_split(
            train_val,
            test_size=relative_val_size,
            random_state=self.random_state,
            shuffle=self.shuffle,
            stratify=stratify_val,
        )
        validation = val
    else:
        train = train_val

    # Convert back to Polars if needed
    if is_polars:
        import polars as pl
        train = pl.from_pandas(train)
        test = pl.from_pandas(test)
        if validation is not None:
            validation = pl.from_pandas(validation)

    return SplitDataset(train=train, test=test, validation=validation)

`split_xy(X, y)`

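Splits a feature table `X` and its target `y` in parallel into train, test, and (when `validation_size > 0`) validation partitions; when `stratify_col` is set, the split is stratified on `y`.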

Source code in skyulf-core\skyulf\preprocessing\split.py

def split_xy(self, X: Union[pd.DataFrame, SkyulfDataFrame], y: Union[pd.Series, Any]) -> SplitDataset:
    """
    Splits X and y in parallel into train / test / (optional) validation.

    The test partition is carved off first with ``train_test_split``; when
    ``self.validation_size > 0`` the remainder is split again into train and
    validation. A truthy ``self.stratify_col`` requests stratification on
    ``y``; it is dropped (with a warning) when the least populated class has
    fewer than 2 members.

    Args:
        X: Feature table. If its engine is ``"polars"`` it is converted to
            pandas for the split and the results are converted back.
        y: Target values aligned with ``X``.

    Returns:
        A ``SplitDataset`` whose ``train`` and ``test`` are ``(X, y)`` tuples
        and whose ``validation`` is an ``(X, y)`` tuple or ``None``.
    """
    engine = get_engine(X)
    is_polars = engine.name == "polars"

    if is_polars:
        # The split itself runs on pandas objects; convert back afterwards.
        X_pd = X.to_pandas()
        # y may be None or already a non-Polars object; only convert when possible.
        y_pd = y.to_pandas() if hasattr(y, "to_pandas") else y
    else:
        X_pd = X
        y_pd = y

    def usable_labels(labels, for_validation=False):
        # Returns the labels to stratify on, or None when stratification is
        # not requested / not possible.
        if not self.stratify_col or labels is None:
            return None
        # y may be a plain array-like (the signature allows ``Any``); only
        # pandas objects expose ``value_counts`` themselves, so wrap the rest.
        if hasattr(labels, "value_counts"):
            class_counts = labels.value_counts()
        else:
            class_counts = pd.Series(labels).value_counts()
        min_count = class_counts.min()
        if min_count < 2:
            if for_validation:
                logger.warning(
                    "Stratified validation split requested but the least populated class has only "
                    f"{min_count} member(s). Stratification will be disabled for validation split."
                )
            else:
                logger.warning(
                    f"Stratified split requested but the least populated class has only {min_count} "
                    "member(s). Stratification will be disabled."
                )
            return None
        return labels

    # First split: Train+Val vs Test
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X_pd,
        y_pd,
        test_size=self.test_size,
        random_state=self.random_state,
        shuffle=self.shuffle,
        stratify=usable_labels(y_pd),
    )

    validation = None
    if self.validation_size > 0:
        # validation_size is relative to the full data; rescale it to the
        # train+val remainder that is being split here.
        relative_val_size = self.validation_size / (1 - self.test_size)

        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val,
            y_train_val,
            test_size=relative_val_size,
            random_state=self.random_state,
            shuffle=self.shuffle,
            stratify=usable_labels(y_train_val, for_validation=True),
        )
        validation = (X_val, y_val)
    else:
        X_train, y_train = X_train_val, y_train_val

    # Convert back to Polars if needed
    if is_polars:
        import polars as pl

        def to_pl(df_or_series):
            # Only pandas objects are converted; None and anything else pass through.
            if isinstance(df_or_series, (pd.DataFrame, pd.Series)):
                return pl.from_pandas(df_or_series)
            return df_or_series

        X_train = to_pl(X_train)
        y_train = to_pl(y_train)
        X_test = to_pl(X_test)
        y_test = to_pl(y_test)

        if validation is not None:
            validation = (to_pl(validation[0]), to_pl(validation[1]))

    return SplitDataset(
        train=(X_train, y_train), test=(X_test, y_test), validation=validation
    )