Source code for pyprophet.io.levels_context.osw

import os
import pickle
from shutil import copyfile
import sqlite3
from typing import Literal
import duckdb
import pandas as pd
import click
from loguru import logger
from ..util import check_sqlite_table, check_duckdb_table
from .._base import BaseOSWReader, BaseOSWWriter
from ..._config import LevelContextIOConfig


[docs] class OSWReader(BaseOSWReader): """ Class for reading and processing data from an OpenSWATH workflow OSW-sqlite based file. The OSWReader class provides methods to read different levels of data from the file and process it accordingly. It supports reading data for semi-supervised learning, IPF analysis, context level analysis. Attributes: infile (str): Input file path. outfile (str): Output file path. classifier (str): Classifier used for semi-supervised learning. level (str): Level used in semi-supervised learning (e.g., 'ms1', 'ms2', 'ms1ms2', 'transition', 'alignment'), or context level used peptide/protein/gene inference (e.g., 'global', 'experiment-wide', 'run-specific'). glyco (bool): Flag indicating whether analysis is glycoform-specific. Methods: read(): Read data from the input file based on the alogorithm. """
[docs] def __init__(self, config: LevelContextIOConfig): super().__init__(config)
[docs] def read(self) -> pd.DataFrame: self._create_indexes() try: con = duckdb.connect() con.execute("INSTALL sqlite_scanner;") con.execute("LOAD sqlite_scanner;") con.execute(f"ATTACH DATABASE '{self.infile}' AS osw (TYPE sqlite);") return self._read_using_duckdb(con) except ModuleNotFoundError as e: logger.warning( f"Warn: DuckDB sqlite_scanner failed, falling back to SQLite. Reason: {e}", ) con = sqlite3.connect(self.infile) return self._read_using_sqlite(con)
[docs] def _create_indexes(self): """ Always use a temporary SQLite connection to create indexes directly on the .osw file, since DuckDB doesn't seem to currently support creating indexes on attached SQLite databases. """ try: sqlite_con = sqlite3.connect(self.infile) if self.level == "peptide": index_statements = [ "CREATE INDEX IF NOT EXISTS idx_peptide_peptide_id ON PEPTIDE (ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_peptide_mapping_peptide_id ON PRECURSOR_PEPTIDE_MAPPING (PEPTIDE_ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_peptide_mapping_precursor_id ON PRECURSOR_PEPTIDE_MAPPING (PRECURSOR_ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_precursor_id ON PRECURSOR (ID);", "CREATE INDEX IF NOT EXISTS idx_feature_precursor_id ON FEATURE (PRECURSOR_ID);", "CREATE INDEX IF NOT EXISTS idx_feature_feature_id ON FEATURE (ID);", "CREATE INDEX IF NOT EXISTS idx_score_ms2_feature_id ON SCORE_MS2 (FEATURE_ID);", ] elif self.level == "glycopeptide": index_statements = [ "CREATE INDEX IF NOT EXISTS idx_glycopeptide_glycopeptide_id ON GLYCOPEPTIDE (ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_glycopeptide_mapping_glycopeptide_id ON PRECURSOR_GLYCOPEPTIDE_MAPPING (GLYCOPEPTIDE_ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_glycopeptide_mapping_precursor_id ON PRECURSOR_GLYCOPEPTIDE_MAPPING (PRECURSOR_ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_precursor_id ON PRECURSOR (ID);", "CREATE INDEX IF NOT EXISTS idx_feature_precursor_id ON FEATURE (PRECURSOR_ID);", "CREATE INDEX IF NOT EXISTS idx_feature_feature_id ON FEATURE (ID);", "CREATE INDEX IF NOT EXISTS idx_score_ms2_feature_id ON SCORE_MS2 (FEATURE_ID);", "CREATE INDEX IF NOT EXISTS idx_score_ms2_part_peptide_feature_id ON SCORE_MS2_PART_PEPTIDE (FEATURE_ID);", "CREATE INDEX IF NOT EXISTS idx_score_ms2_part_glycan_feature_id ON SCORE_MS2_PART_GLYCAN (FEATURE_ID);", ] elif self.level == "protein": index_statements = [ "CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_protein_id ON PEPTIDE_PROTEIN_MAPPING (PROTEIN_ID);", "CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID);", "CREATE INDEX IF NOT EXISTS idx_peptide_peptide_id ON PEPTIDE (ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_peptide_mapping_peptide_id ON PRECURSOR_PEPTIDE_MAPPING (PEPTIDE_ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_peptide_mapping_precursor_id ON PRECURSOR_PEPTIDE_MAPPING (PRECURSOR_ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_precursor_id ON PRECURSOR (ID);", "CREATE INDEX IF NOT EXISTS idx_feature_precursor_id ON FEATURE (PRECURSOR_ID);", "CREATE INDEX IF NOT EXISTS idx_feature_feature_id ON FEATURE (ID);", "CREATE INDEX IF NOT EXISTS idx_score_ms2_feature_id ON SCORE_MS2 (FEATURE_ID);", ] elif self.level == "gene": index_statements = [ " CREATE INDEX IF NOT EXISTS idx_peptide_gene_mapping_gene_id ON PEPTIDE_GENE_MAPPING (GENE_ID);", "CREATE INDEX IF NOT EXISTS idx_peptide_gene_mapping_peptide_id ON PEPTIDE_GENE_MAPPING (PEPTIDE_ID);", "CREATE INDEX IF NOT EXISTS idx_peptide_peptide_id ON PEPTIDE (ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_peptide_mapping_peptide_id ON PRECURSOR_PEPTIDE_MAPPING (PEPTIDE_ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_peptide_mapping_precursor_id ON PRECURSOR_PEPTIDE_MAPPING (PRECURSOR_ID);", "CREATE INDEX IF NOT EXISTS idx_precursor_precursor_id ON PRECURSOR (ID);", "CREATE INDEX IF NOT EXISTS idx_feature_precursor_id ON FEATURE (PRECURSOR_ID);", "CREATE INDEX IF NOT EXISTS idx_feature_feature_id ON FEATURE (ID);", "CREATE INDEX IF NOT EXISTS idx_score_ms2_feature_id ON SCORE_MS2 (FEATURE_ID);", ] if index_statements: for stmt in index_statements: try: sqlite_con.execute(stmt) except sqlite3.OperationalError as e: logger.warning(f"Warn: SQLite index creation failed: {e}") sqlite_con.commit() sqlite_con.close() except Exception as e: raise click.ClickException( f"Failed to create indexes via SQLite fallback: {e}" )
def _read_using_duckdb(self, con): level = self.level if level == "peptide": return self._read_pyp_peptide_duckdb(con) elif level == "glycopeptide": return self._read_pyp_glycopeptide_duckdb(con) elif level == "protein": return self._read_pyp_protein_duckdb(con) elif level == "gene": return self._read_pyp_gene_duckdb(con) else: raise click.ClickException(f"Unsupported level: {level}") def _read_using_sqlite(self, con): level = self.level if level == "peptide": return self._read_pyp_peptide_sqlite(con) elif level == "glycopeptide": return self._read_pyp_glycopeptide_sqlite(con) elif level == "protein": return self._read_pyp_protein_sqlite(con) elif level == "gene": return self._read_pyp_gene_sqlite(con) else: raise click.ClickException(f"Unsupported level: {level}") # ---------------------------- # DuckDB Queries # ---------------------------- def _fetch_tables_duckdb(self, con): tables = con.execute( "SELECT table_schema, table_name FROM information_schema.tables" ).fetchdf() return tables def _read_pyp_peptide_duckdb(self, con): if not check_duckdb_table(con, "main", "SCORE_MS2"): raise click.ClickException( f"Apply scoring to MS2-level data before running peptide-level scoring.\nTable Info:\n{self._fetch_tables_duckdb(con)}" ) cfg = self.config # LevelContextIOConfig instance if cfg.context_fdr == "global": run_id = "NULL" group_id = "PEPTIDE.ID" else: run_id = "RUN_ID" group_id = "RUN_ID || '_' || PEPTIDE_ID" logger.info("Reading peptide-level data ...") query = f""" SELECT {run_id} AS RUN_ID, {group_id} AS GROUP_ID, PEPTIDE.ID AS PEPTIDE_ID, PRECURSOR.DECOY AS DECOY, SCORE_MS2.SCORE AS SCORE, '{cfg.context_fdr}' AS CONTEXT FROM osw.PEPTIDE INNER JOIN osw.PRECURSOR_PEPTIDE_MAPPING ON PEPTIDE.ID = PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID INNER JOIN osw.PRECURSOR ON PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID = PRECURSOR.ID INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE SCORE_MS2.SCORE IS NOT NULL QUALIFY ROW_NUMBER() OVER (PARTITION BY {group_id} ORDER BY SCORE DESC) = 1 """ df = con.execute(query).fetchdf() return df.rename(columns=str.lower) def _read_pyp_glycopeptide_duckdb(self, con): if ( not check_duckdb_table(con, "main", "SCORE_MS2") or not check_duckdb_table(con, "main", "SCORE_MS2_PART_PEPTIDE") or not check_duckdb_table(con, "main", "SCORE_MS2_PART_GLYCAN") ): raise click.ClickException( f"Apply scoring to MS2-level data before running glycopeptide-level scoring.\nTable Info:\n{self._fetch_tables_duckdb(con)}" ) cfg = self.config if cfg.context_fdr == "global": run_id = "NULL" group_id = "GLYCOPEPTIDE.ID" else: run_id = "RUN_ID" group_id = "RUN_ID || '_' || GLYCOPEPTIDE.ID" logger.info("Reading glycopeptide-level data ...") query = f""" SELECT {run_id} AS RUN_ID, {group_id} AS GROUP_ID, GLYCOPEPTIDE.ID AS GLYCOPEPTIDE_ID, GLYCOPEPTIDE.DECOY_PEPTIDE, GLYCOPEPTIDE.DECOY_GLYCAN, SCORE_MS2.SCORE AS d_score_combined, SCORE_MS2_PART_PEPTIDE.SCORE AS d_score_peptide, SCORE_MS2_PART_GLYCAN.SCORE AS d_score_glycan, '{cfg.context_fdr}' AS CONTEXT FROM osw.GLYCOPEPTIDE INNER JOIN osw.PRECURSOR_GLYCOPEPTIDE_MAPPING ON GLYCOPEPTIDE.ID = PRECURSOR_GLYCOPEPTIDE_MAPPING.GLYCOPEPTIDE_ID INNER JOIN osw.PRECURSOR ON PRECURSOR_GLYCOPEPTIDE_MAPPING.PRECURSOR_ID = PRECURSOR.ID INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN osw.SCORE_MS2_PART_PEPTIDE ON FEATURE.ID = SCORE_MS2_PART_PEPTIDE.FEATURE_ID INNER JOIN osw.SCORE_MS2_PART_GLYCAN ON FEATURE.ID = SCORE_MS2_PART_GLYCAN.FEATURE_ID WHERE SCORE_MS2.SCORE IS NOT NULL AND SCORE_MS2_PART_PEPTIDE.SCORE IS NOT NULL AND SCORE_MS2_PART_GLYCAN.SCORE IS NOT NULL QUALIFY ROW_NUMBER() OVER (PARTITION BY {group_id} ORDER BY d_score_combined DESC) = 1 """ df = con.execute(query).fetchdf() return df.rename(columns=str.lower) def _read_pyp_protein_duckdb(self, con): if not check_duckdb_table(con, "main", "SCORE_MS2"): raise click.ClickException( f"Apply scoring to MS2-level data before running protein-level scoring.\nTable Info:\n{self._fetch_tables_duckdb(con)}" ) cfg = self.config if cfg.context_fdr == "global": run_id = "NULL" group_id = "PROTEIN.ID" else: run_id = "RUN_ID" group_id = "RUN_ID || '_' || PROTEIN_ID" logger.info("Reading protein-level data ...") query = f""" SELECT {run_id} AS RUN_ID, {group_id} AS GROUP_ID, PROTEIN.ID AS PROTEIN_ID, PRECURSOR.DECOY AS DECOY, SCORE, '{cfg.context_fdr}' AS CONTEXT FROM osw.PROTEIN INNER JOIN (SELECT PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID AS PEPTIDE_ID, PROTEIN_ID FROM (SELECT PEPTIDE_ID, COUNT(*) AS NUM_PROTEINS FROM osw.PEPTIDE_PROTEIN_MAPPING GROUP BY PEPTIDE_ID) AS PROTEINS_PER_PEPTIDE INNER JOIN osw.PEPTIDE_PROTEIN_MAPPING ON PROTEINS_PER_PEPTIDE.PEPTIDE_ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID WHERE NUM_PROTEINS == 1) AS PEPTIDE_PROTEIN_MAPPING ON PROTEIN.ID = PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID INNER JOIN osw.PEPTIDE ON PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID = PEPTIDE.ID INNER JOIN osw.PRECURSOR_PEPTIDE_MAPPING ON PEPTIDE.ID = PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID INNER JOIN osw.PRECURSOR ON PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID = PRECURSOR.ID INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE SCORE_MS2.SCORE IS NOT NULL QUALIFY ROW_NUMBER() OVER (PARTITION BY {group_id} ORDER BY SCORE DESC) = 1 """ df = con.execute(query).fetchdf() return df.rename(columns=str.lower) def _read_pyp_gene_duckdb(self, con): if not check_duckdb_table(con, "main", "SCORE_MS2"): raise click.ClickException( f"Apply scoring to MS2-level data before running gene-level scoring.\nTable Info:\n{self._fetch_tables_duckdb(con)}" ) cfg = self.config if cfg.context_fdr == "global": run_id = "NULL" group_id = "GENE.ID" else: run_id = "RUN_ID" group_id = "RUN_ID || '_' || GENE_ID" logger.info("Reading gene-level data ...") query = f""" SELECT {run_id} AS RUN_ID, {group_id} AS GROUP_ID, GENE.ID AS GENE_ID, PRECURSOR.DECOY AS DECOY, SCORE, '{cfg.context_fdr}' AS CONTEXT FROM osw.GENE INNER JOIN (SELECT PEPTIDE_GENE_MAPPING.PEPTIDE_ID AS PEPTIDE_ID, GENE_ID FROM (SELECT PEPTIDE_ID, COUNT(*) AS NUM_GENES FROM osw.PEPTIDE_GENE_MAPPING GROUP BY PEPTIDE_ID) AS GENES_PER_PEPTIDE INNER JOIN osw.PEPTIDE_GENE_MAPPING ON GENES_PER_PEPTIDE.PEPTIDE_ID = PEPTIDE_GENE_MAPPING.PEPTIDE_ID WHERE NUM_GENES == 1) AS PEPTIDE_GENE_MAPPING ON GENE.ID = PEPTIDE_GENE_MAPPING.GENE_ID INNER JOIN osw.PEPTIDE ON PEPTIDE_GENE_MAPPING.PEPTIDE_ID = PEPTIDE.ID INNER JOIN osw.PRECURSOR_PEPTIDE_MAPPING ON PEPTIDE.ID = PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID INNER JOIN osw.PRECURSOR ON PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID = PRECURSOR.ID INNER JOIN osw.FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID INNER JOIN osw.SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID WHERE SCORE_MS2.SCORE IS NOT NULL QUALIFY ROW_NUMBER() OVER (PARTITION BY {group_id} ORDER BY SCORE DESC) = 1 """ df = con.execute(query).fetchdf() return df.rename(columns=str.lower) # ---------------------------- # SQLite fallback # ---------------------------- def _read_pyp_peptide_sqlite(self, con): if not check_sqlite_table(con, "SCORE_MS2"): raise click.ClickException( "Apply scoring to MS2-level data before running peptide-level scoring." ) cfg = self.config # LevelContextIOConfig instance if cfg.context_fdr == "global": run_id = "NULL" group_id = "PEPTIDE.ID" else: run_id = "RUN_ID" group_id = 'RUN_ID || "_" || PEPTIDE.ID' logger.info("Reading peptide-level data ...") query = f""" SELECT {run_id} AS RUN_ID, {group_id} AS GROUP_ID, PEPTIDE.ID AS PEPTIDE_ID, PRECURSOR.DECOY AS DECOY, SCORE_MS2.SCORE AS SCORE, '{cfg.context_fdr}' AS CONTEXT FROM PEPTIDE INNER JOIN PRECURSOR_PEPTIDE_MAPPING ON PEPTIDE.ID = PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID INNER JOIN PRECURSOR ON PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID = PRECURSOR.ID INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID GROUP BY GROUP_ID HAVING MAX(SCORE) ORDER BY SCORE DESC """ df = pd.read_sql_query(query, con) return df.rename(columns=str.lower) def _read_pyp_glycopeptide_sqlite(self, con): if ( not check_sqlite_table(con, "SCORE_MS2") or not check_sqlite_table(con, "SCORE_MS2_PART_PEPTIDE") or not check_sqlite_table(con, "SCORE_MS2_PART_GLYCAN") ): raise click.ClickException( "Apply scoring to MS2-level data before running glycopeptide-level scoring." ) cfg = self.config if cfg.context_fdr == "global": run_id = "NULL" group_id = "GLYCOPEPTIDE.ID" else: run_id = "RUN_ID" group_id = 'RUN_ID || "_" || GLYCOPEPTIDE.ID' logger.info("Reading glycopeptide-level data ...") query = f""" SELECT {run_id} AS RUN_ID, {group_id} AS GROUP_ID, GLYCOPEPTIDE.ID AS GLYCOPEPTIDE_ID, GLYCOPEPTIDE.DECOY_PEPTIDE, GLYCOPEPTIDE.DECOY_GLYCAN, SCORE_MS2.SCORE AS d_score_combined, SCORE_MS2_PART_PEPTIDE.SCORE AS d_score_peptide, SCORE_MS2_PART_GLYCAN.SCORE AS d_score_glycan, "{cfg.context_fdr}" AS CONTEXT FROM GLYCOPEPTIDE INNER JOIN PRECURSOR_GLYCOPEPTIDE_MAPPING ON GLYCOPEPTIDE.ID = PRECURSOR_GLYCOPEPTIDE_MAPPING.GLYCOPEPTIDE_ID INNER JOIN PRECURSOR ON PRECURSOR_GLYCOPEPTIDE_MAPPING.PRECURSOR_ID = PRECURSOR.ID INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID INNER JOIN SCORE_MS2_PART_PEPTIDE ON FEATURE.ID = SCORE_MS2_PART_PEPTIDE.FEATURE_ID INNER JOIN SCORE_MS2_PART_GLYCAN ON FEATURE.ID = SCORE_MS2_PART_GLYCAN.FEATURE_ID GROUP BY GROUP_ID HAVING MAX(d_score_combined) ORDER BY d_score_combined DESC """ df = pd.read_sql_query(query, con) return df.rename(columns=str.lower) def _read_pyp_protein_sqlite(self, con): if not check_sqlite_table(con, "SCORE_MS2"): raise click.ClickException( "Apply scoring to MS2-level data before running protein-level scoring." ) cfg = self.config if cfg.context_fdr == "global": run_id = "NULL" group_id = "PROTEIN.ID" else: run_id = "RUN_ID" group_id = 'RUN_ID || "_" || PROTEIN.ID' logger.info("Reading protein-level data ...") query = f""" SELECT {run_id} AS RUN_ID, {group_id} AS GROUP_ID, PROTEIN.ID AS PROTEIN_ID, PRECURSOR.DECOY AS DECOY, SCORE, "{cfg.context_fdr}" AS CONTEXT FROM PROTEIN INNER JOIN (SELECT PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID AS PEPTIDE_ID, PROTEIN_ID FROM (SELECT PEPTIDE_ID, COUNT(*) AS NUM_PROTEINS FROM PEPTIDE_PROTEIN_MAPPING GROUP BY PEPTIDE_ID) AS PROTEINS_PER_PEPTIDE INNER JOIN PEPTIDE_PROTEIN_MAPPING ON PROTEINS_PER_PEPTIDE.PEPTIDE_ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID WHERE NUM_PROTEINS == 1) AS PEPTIDE_PROTEIN_MAPPING ON PROTEIN.ID = PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID INNER JOIN PEPTIDE ON PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID = PEPTIDE.ID INNER JOIN PRECURSOR_PEPTIDE_MAPPING ON PEPTIDE.ID = PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID INNER JOIN PRECURSOR ON PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID = PRECURSOR.ID INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID GROUP BY GROUP_ID HAVING MAX(SCORE) ORDER BY SCORE DESC """ df = pd.read_sql_query(query, con) return df.rename(columns=str.lower) def _read_pyp_gene_sqlite(self, con): if not check_sqlite_table(con, "SCORE_MS2"): raise click.ClickException( "Apply scoring to MS2-level data before running gene-level scoring." ) cfg = self.config if cfg.context_fdr == "global": run_id = "NULL" group_id = "GENE.ID" else: run_id = "RUN_ID" group_id = 'RUN_ID || "_" || GENE.ID' logger.info("Reading gene-level data ...") query = f""" SELECT {run_id} AS RUN_ID, {group_id} AS GROUP_ID, GENE.ID AS GENE_ID, PRECURSOR.DECOY AS DECOY, SCORE, "{cfg.context_fdr}" AS CONTEXT FROM GENE INNER JOIN (SELECT PEPTIDE_GENE_MAPPING.PEPTIDE_ID AS PEPTIDE_ID, GENE_ID FROM (SELECT PEPTIDE_ID, COUNT(*) AS NUM_GENES FROM PEPTIDE_GENE_MAPPING GROUP BY PEPTIDE_ID) AS GENES_PER_PEPTIDE INNER JOIN PEPTIDE_GENE_MAPPING ON GENES_PER_PEPTIDE.PEPTIDE_ID = PEPTIDE_GENE_MAPPING.PEPTIDE_ID WHERE NUM_GENES == 1) AS PEPTIDE_GENE_MAPPING ON GENE.ID = PEPTIDE_GENE_MAPPING.GENE_ID INNER JOIN PEPTIDE ON PEPTIDE_GENE_MAPPING.PEPTIDE_ID = PEPTIDE.ID INNER JOIN PRECURSOR_PEPTIDE_MAPPING ON PEPTIDE.ID = PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID INNER JOIN PRECURSOR ON PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID = PRECURSOR.ID INNER JOIN FEATURE ON PRECURSOR.ID = FEATURE.PRECURSOR_ID INNER JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID GROUP BY GROUP_ID HAVING MAX(SCORE) ORDER BY SCORE DESC """ df = pd.read_sql_query(query, con) return df.rename(columns=str.lower)
[docs] class OSWWriter(BaseOSWWriter): """ Class for writing OpenSWATH results to an OSW-sqlite based file. Attributes: infile (str): Input file path. outfile (str): Output file path. classifier (str): Classifier used for semi-supervised learning. level (str): Level used in semi-supervised learning (e.g., 'ms1', 'ms2', 'ms1ms2', 'transition', 'alignment'), or context level used peptide/protein/gene inference (e.g., 'global', 'experiment-wide', 'run-specific'). glyco (bool): Flag indicating whether analysis is glycoform-specific. Methods: save_results(result, pi0): Save the results to the output file based on the module using this class. save_weights(weights): Save the weights to the output file. """
[docs] def __init__(self, config: LevelContextIOConfig): super().__init__(config) self.context_level_id_map = { "peptide": "peptide_id", "protein": "protein_id", "gene": "gene_id", }
[docs] def save_results(self, result): """ Save the results to the output file based on the module using this class. """ context = self.config.context_fdr if self.infile != self.outfile: copyfile(self.infile, self.outfile) con = sqlite3.connect(self.outfile) if self.level == "glycopeptide": c = con.cursor() c.execute( 'SELECT count(name) FROM sqlite_master WHERE type="table" AND name="SCORE_GLYCOPEPTIDE"' ) if c.fetchone()[0] == 1: c.execute( 'DELETE FROM SCORE_GLYCOPEPTIDE WHERE CONTEXT =="%s"' % context ) c.fetchall() c.execute( 'SELECT count(name) FROM sqlite_master WHERE type="table" AND name="SCORE_GLYCOPEPTIDE_PART_PEPTIDE"' ) if c.fetchone()[0] == 1: c.execute( 'DELETE FROM SCORE_GLYCOPEPTIDE_PART_PEPTIDE WHERE CONTEXT =="%s"' % context ) c.fetchall() c.execute( 'SELECT count(name) FROM sqlite_master WHERE type="table" AND name="SCORE_GLYCOPEPTIDE_PART_GLYCAN"' ) if c.fetchone()[0] == 1: c.execute( 'DELETE FROM SCORE_GLYCOPEPTIDE_PART_GLYCAN WHERE CONTEXT =="%s"' % context ) c.fetchall() df = result[ [ "context", "run_id", "glycopeptide_id", "d_score_combined", "q_value", "pep", ] ] df.columns = [ "CONTEXT", "RUN_ID", "GLYCOPEPTIDE_ID", "SCORE", "QVALUE", "PEP", ] table = "SCORE_GLYCOPEPTIDE" df.to_sql( table, con, index=False, dtype={"RUN_ID": "INTEGER"}, if_exists="append" ) for part in ["peptide", "glycan"]: df = result[ [ "context", "run_id", "glycopeptide_id", "d_score_" + part, "pep_" + part, ] ] df.columns = ["CONTEXT", "RUN_ID", "GLYCOPEPTIDE_ID", "SCORE", "PEP"] table = "SCORE_GLYCOPEPTIDE_PART_" + part.upper() df.to_sql( table, con, index=False, dtype={"RUN_ID": "INTEGER"}, if_exists="append", ) else: c = con.cursor() c.execute( f'''SELECT count(name) FROM sqlite_master WHERE type="table" AND name="SCORE_{self.level.upper()}"''' ) if c.fetchone()[0] == 1: c.execute( f'''DELETE FROM SCORE_{self.level.upper()} WHERE CONTEXT =="{context}"''' ) c.fetchall() context_level_id = self.context_level_id_map.get(self.level) result = result[ [ "context", "run_id", context_level_id, "score", "p_value", "q_value", "pep", ] ] result.columns = [ "CONTEXT", "RUN_ID", context_level_id.upper(), "SCORE", "PVALUE", "QVALUE", "PEP", ] result.to_sql( f"SCORE_{self.level.upper()}", con, index=False, dtype={"RUN_ID": "INTEGER"}, if_exists="append", ) con.close()