Source code for fontr.pipelines.data_processing.nodes

from typing import Any

import numpy as np
import pandas as pd


def get_label2idx_mapping(idx2label: pd.DataFrame) -> dict:
    """Build a label -> integer index lookup from the first column of a frame.

    The distinct values of the first column are sorted in ascending order and
    each one is assigned its position in that ordering (starting at 0).

    Args:
        idx2label (pd.DataFrame): frame whose first column holds the labels

    Returns:
        dict: mapping from each distinct label to its rank in sorted order
    """
    first_column = idx2label.iloc[:, 0]
    ordered_labels = sorted(first_column.unique())
    return dict(zip(ordered_labels, range(len(ordered_labels))))
def labeled_images_split(
    data: pd.DataFrame, parameters: dict[str, Any]
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split labeled image list to train, validation and test dataset

    The rows are shuffled with ``parameters["random_state_seed"]`` and then cut
    into three consecutive, non-overlapping parts whose sizes follow the
    fractions in ``parameters["data_split"]``.

    Args:
        data (pd.DataFrame): list of images
        parameters (dict[str, Any]): pipeline parameters; reads
            ``data_split.train_frac``, ``data_split.valid_frac``,
            ``data_split.test_frac`` and ``random_state_seed``

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: split dataset
            (train, valid, test)

    Raises:
        AssertionError: if the three fractions do not sum to 1.0
    """
    # TODO in-class balance in the split here
    split_config = parameters["data_split"]
    train_frac: float = split_config["train_frac"]
    valid_frac: float = split_config["valid_frac"]
    test_frac: float = split_config["test_frac"]

    total_frac = train_frac + valid_frac + test_frac
    assert np.isclose(
        1.0, total_frac
    ), f"data_split fractions must sum to 1.0, got {total_frac}"

    shuffled = data.sample(frac=1, random_state=parameters["random_state_seed"])

    # Boundaries are truncated towards zero, so any remainder rows land in test.
    n_rows = len(data)
    train_end = int(train_frac * n_rows)
    valid_end = int((train_frac + valid_frac) * n_rows)

    # Positional slicing keeps the results as DataFrames without going through
    # np.split, which depends on the deprecated DataFrame.swapaxes.
    train = shuffled.iloc[:train_end]
    valid = shuffled.iloc[train_end:valid_end]
    test = shuffled.iloc[valid_end:]
    return train, valid, test
def unlabeled_images_split(
    data: pd.DataFrame, parameters: dict[str, Any]
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split unlabeled image list to train, validation and test dataset

    The rows are shuffled with ``parameters["random_state_seed"]`` and then cut
    into three consecutive, non-overlapping parts whose sizes follow the
    fractions in ``parameters["data_split"]``.

    Args:
        data (pd.DataFrame): list of images
        parameters (dict[str, Any]): pipeline parameters; reads
            ``data_split.train_frac``, ``data_split.valid_frac``,
            ``data_split.test_frac`` and ``random_state_seed``

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: split dataset
            (train, valid, test)

    Raises:
        AssertionError: if the three fractions do not sum to 1.0
    """
    split_config = parameters["data_split"]
    train_frac: float = split_config["train_frac"]
    valid_frac: float = split_config["valid_frac"]
    test_frac: float = split_config["test_frac"]

    total_frac = train_frac + valid_frac + test_frac
    assert np.isclose(
        1.0, total_frac
    ), f"data_split fractions must sum to 1.0, got {total_frac}"

    shuffled = data.sample(frac=1, random_state=parameters["random_state_seed"])

    # Boundaries are truncated towards zero, so any remainder rows land in test.
    n_rows = len(data)
    train_end = int(train_frac * n_rows)
    valid_end = int((train_frac + valid_frac) * n_rows)

    # Positional slicing keeps the results as DataFrames without going through
    # np.split, which depends on the deprecated DataFrame.swapaxes.
    train = shuffled.iloc[:train_end]
    valid = shuffled.iloc[train_end:valid_end]
    test = shuffled.iloc[valid_end:]
    return train, valid, test