Source code for fontr.pipelines.bcf_preprocessing.nodes

import fsspec
import numpy as np
import pandas as pd
from kedro.io.core import get_protocol_and_path


[docs]def read_bcf_metadata( bcf_file: fsspec.core.OpenFile, ) -> tuple[fsspec.core.OpenFile, np.ndarray]: """ Reads metadata of bcf file using the potiner given as the argument. The .bcf format looks as follows: * 8 bytes - n number of the .png files in the .bcf file. * 8n - size of each .png file. * n - .png files stored as raw bytes. Args: bcf_file (fsspec.core.OpenFile): File descriptior to the .bcf file. Returns: tuple[fsspec.core.OpenFile, np.ndarray]: File descriptor to the .bcf file. Read sizes of the .png files. """ size = int(np.frombuffer(bcf_file.read(8), dtype=np.uint64)[0]) file_sizes = np.frombuffer(bcf_file.read(8 * size), dtype=np.uint64) return bcf_file, file_sizes
[docs]def upload_bcf_as_png( bcf_file: fsspec.core.OpenFile, file_sizes: np.ndarray, output_path: str ) -> None: """ Stores .png files stored in a .bcf files in a `output_path`. Args: bcf_file (fsspec.core.OpenFile): File descriptior to the .bcf file. file_sizes (np.ndarray): File sizes read in `read_bcf_metadata` node. output_path (str): Path where the .png files are stored """ offsets = np.append(np.uint64(0), np.add.accumulate(file_sizes)) protocol, _ = get_protocol_and_path(output_path) _fs = fsspec.filesystem(protocol) for i in range(len(file_sizes)): bcf_file.seek(np.uint64(len(offsets) * 8 + offsets[i])) out = bcf_file.read(offsets[i + 1] - offsets[i]) filename = f"{output_path}/{i}.png" with _fs.open(filename, "wb") as f: f.write(out) return None
[docs]def read_labels(label_file: fsspec.core.OpenFile) -> pd.DataFrame: """ Stores reads labels saved under `label_file` and converts it into a cvs file Args: label_file (fsspec.core.OpenFile): File descriptor to the .label file Returns: pd.DataFrame: Read labels as dataframe """ labels = np.frombuffer(label_file.read(), dtype=np.uint32) df_labels = pd.DataFrame(data=labels, columns=["labels"]) return df_labels
[docs]def upload_labels_as_csv( df_labels: pd.DataFrame, output_path: str ): # TODO: use fsspec here """ Stores passed `df_labels` in the `output_path`. Args: df_labels (pd.DataFrame): labels output_path (str): Pathe where the labels.csv file is stored """ df_labels.to_csv(f"{output_path}/labels.csv") return None