Source code for pyprophet._base

from dataclasses import dataclass, field
import os
import copy
import sys

from loguru import logger
from .io.util import (
    is_tsv_file,
    is_sqlite_file,
    is_parquet_file,
    is_valid_single_split_parquet_dir,
    is_valid_multi_split_parquet_dir,
)


[docs] @dataclass class BaseIOConfig: """ Base configuration class for I/O-related metadata and common attributes used across algorithms. Attributes: infile (str): Path to the input file (e.g., .osw, .parquet). outfile (str): Path to the output file to be written. file_type (str): Type of the input file (e.g., 'osw', 'parquet', 'parquet_split', 'tsv'). subsample_ratio (float): Subsampling ratio for large data. level (str): Scoring level (e.g., 'ms1', 'ms2', 'transition', 'alignment'). context (str): Context or mode in which the reader/writer operates (e.g., 'score_learn', 'score_apply', 'ipf', 'level_context'). prefix (str): Automatically derived from outfile (e.g., 'results/output' for 'results/output.osw'). """ infile: str outfile: str file_type: str = field(init=False) subsample_ratio: float level: str context: str prefix: str = field(init=False)
[docs] def __post_init__(self): """ Initialize the file_type and prefix attributes based on the input file. """ infile = self.infile if is_sqlite_file(infile): if infile.endswith(".osw"): self.file_type = "osw" elif infile.endswith(".sqmass") or infile.endswith(".sqMass"): self.file_type = "sqmass" elif is_parquet_file(infile): self.file_type = "parquet" elif is_valid_single_split_parquet_dir(infile): self.file_type = "parquet_split" elif is_valid_multi_split_parquet_dir(infile): self.file_type = "parquet_split_multi" elif is_tsv_file(infile): self.file_type = "tsv" else: logger.critical( f"Failed to infer file type for: {infile}. Supported formats are: " ".osw, .sqmass/.sqMass, .parquet, split parquet directories (.oswpq/oswpqd), or .tsv files.\n" f" - is_sqlite_file: {is_sqlite_file(infile)}\n" f" - endswith .osw: {infile.endswith('.osw')}\n" f" - endswith .sqmass/.sqMass: {infile.endswith('.sqmass') or infile.endswith('.sqMass')}\n" f" - is_parquet_file: {is_parquet_file(infile)}\n" f" - is_valid_single_split_parquet_dir: {is_valid_single_split_parquet_dir(infile)}\n" f" - is_valid_multi_split_parquet_dir: {is_valid_multi_split_parquet_dir(infile)}\n" f" - is_tsv_file: {is_tsv_file(infile)}" ) sys.exit(1) self.prefix = os.path.splitext( self.outfile )[ 0 ] # TODO: use pathlib instead to avoid potential cases where the outfile is a directory (split_parquet) file.oswpq/, which would result the prefix in being file.oswpq instead of file
[docs] def __str__(self): return ( f"BaseIOConfig(\ninfile='{self.infile}'\noutfile='{self.outfile}'\n" f"file_type='{self.file_type}'\nsubsample_ratio={self.subsample_ratio}\n" f"level='{self.level}'\ncontext='{self.context}'\nprefix='{self.prefix}')" )
[docs] def __repr__(self): return ( f"BaseIOConfig(infile='{self.infile}', outfile='{self.outfile}', " f"subsample_ratio={self.subsample_ratio}, level='{self.level}', " f"context='{self.context}')" )
[docs] def copy(self): """ Return a deep copy of the config object. """ return copy.deepcopy(self)