From 4de63dc55d0abbe619136ab70f1ba2252fcc65ed Mon Sep 17 00:00:00 2001 From: Yoo HoJun Date: Thu, 18 Jun 2026 12:49:00 +0900 Subject: [PATCH 1/2] feat: LFQ/TMT Integration --- app.py | 1 + content/results_abundance.py | 140 ++- content/results_heatmap.py | 130 ++- content/results_pathway_analysis.py | 258 ++++ content/results_pca.py | 50 +- content/results_volcano.py | 235 ++-- requirements.txt | 2 +- src/WorkflowTest.py | 1678 +++++++++++++++++++-------- src/common/results_helpers.py | 352 ++++-- src/workflow/ParameterManager.py | 24 + src/workflow/StreamlitUI.py | 126 +- 11 files changed, 2255 insertions(+), 741 deletions(-) create mode 100644 content/results_pathway_analysis.py diff --git a/app.py b/app.py index 6c276f0..97f9a16 100644 --- a/app.py +++ b/app.py @@ -27,6 +27,7 @@ st.Page(Path("content", "results_pca.py"), title="PCA", icon="๐Ÿ“Š"), st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="๐Ÿ”ฅ"), st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="๐Ÿ“š"), + st.Page(Path("content", "results_pathway_analysis.py"), title="Pathway Analysis", icon="๐Ÿ“‰"), ], } diff --git a/content/results_abundance.py b/content/results_abundance.py index 3740bcc..a86f1a3 100644 --- a/content/results_abundance.py +++ b/content/results_abundance.py @@ -1,10 +1,10 @@ """Abundance (ProteomicsLFQ) Results Page.""" import streamlit as st import pandas as pd -import numpy as np from pathlib import Path from src.common.common import page_setup from src.common.results_helpers import get_workflow_dir, get_abundance_data +from src.workflow.ParameterManager import ParameterManager params = page_setup() st.title("Abundance Quantification") @@ -22,6 +22,10 @@ workflow_dir = get_workflow_dir(st.session_state["workspace"]) quant_dir = workflow_dir / "results" / "quant_results" +parameter_manager = ParameterManager(workflow_dir, "TOPP Workflow") + +workflow_params = parameter_manager.get_parameters_from_json() +analysis_mode = workflow_params.get("analysis-mode", "LFQ") if not quant_dir.exists(): st.info("No quantification results available yet. Please run the workflow first.") @@ -36,6 +40,60 @@ csv_file = csv_files[0] +def render_protein_table(pivot_df, group_map, is_lfq=True): + """Common function to render the protein-level abundance table""" + st.markdown("### Protein-Level Abundance Table") + st.info( + "This protein-level table is generated by grouping all PSMs that map to the " + "same protein and aggregating their intensities across samples.\n\n" + "Additionally, log2 fold change and p-values are calculated between sample groups." + ) + + # Display group comparison info + groups = sorted(set(group_map.values())) + if len(groups) >= 2: + group1, group2 = sorted(groups)[:2] + st.info(f"Statistical comparison: **{group2} vs {group1}**") + + if is_lfq: + # Handle LFQ mode columns (Raw Intensity) + id_col = "ProteinName" + exclude_cols = [id_col, "log2FC", "p-value", "PeptideSequence"] + sample_cols = [c for c in pivot_df.columns if c not in exclude_cols] + + pivot_df["Intensity"] = pivot_df[sample_cols].apply(list, axis=1) + display_cols = [id_col, "log2FC", "p-value", "Intensity"] + sample_cols + ["PeptideSequence"] + help_text = "Raw sample intensities" + y_min = None + else: + # Handle non-LFQ mode columns (Log2-transformed Intensity) + id_col = "protein" + exclude_cols = [id_col, "log2FC", "p-value", "p-adj", "n_proteins", "n_peptides", "protein_score"] + sample_cols = [c for c in pivot_df.columns if c not in exclude_cols and "ratio" not in c.lower()] + + pivot_df["Intensity"] = pivot_df[sample_cols].apply( + lambda row: [np.log2(v + 1) for v in row], axis=1 + ) + display_cols = [id_col, "log2FC", "p-value", "Intensity"] + sample_cols + help_text = "Sample intensities (log2 scale)" + y_min = 0 + + # Filter to available columns, then sort and display + available_cols = [c for c in display_cols if c in pivot_df.columns] + + st.dataframe( + pivot_df[available_cols].sort_values("p-value"), + column_config={ + "Intensity": st.column_config.BarChartColumn( + "Intensity", + help=help_text, + width="small", + y_min=y_min, + ), + }, + use_container_width=True, + ) + protein_tab, psm_tab = st.tabs(["Protein Table", "PSM-level Quantification Table"]) try: @@ -45,62 +103,44 @@ st.info("No data found in this file.") st.stop() - with protein_tab: - st.markdown("### Protein-Level Abundance Table") + result = get_abundance_data(st.session_state["workspace"]) - st.info( - "This protein-level table is generated by grouping all PSMs that map to the " - "same protein and aggregating their intensities across samples.\n\n" - "Additionally, log2 fold change and p-values are calculated between sample groups." - ) - - result = get_abundance_data(st.session_state["workspace"]) - if result is None: - st.warning("Could not compute abundance data. Please ensure sample groups are defined in the Configure page.") - st.page_link("content/workflow_configure.py", label="Go to Configure", icon="โš™๏ธ") - st.stop() + if analysis_mode == "LFQ": + protein_tab, psm_tab = st.tabs(["Protein Table", "PSM-level Quantification Table"]) - pivot_df, expr_df, group_map = result + with protein_tab: + if result is None: + st.warning("Could not compute abundance data. Please ensure sample groups are defined in the Configure page.") + # st.page_link("content/workflow_configure.py", label="Go to Configure", icon="โš™๏ธ") + st.stop() + + pivot_df, expr_df, group_map = result + render_protein_table(pivot_df, group_map, is_lfq=True) - # Display group comparison info - groups = sorted(set(group_map.values())) - if len(groups) >= 2: - group1, group2 = sorted(groups)[:2] - st.info(f"Statistical comparison: **{group2} vs {group1}**") + with psm_tab: + st.markdown("### PSM-level Quantification Table") + st.info( + "This table shows the PSM-level quantification data, including protein IDs, " + "peptide sequences, charge states, and intensities across samples. " + "Each row represents one peptide-spectrum match detected from the MS/MS analysis." + ) + st.dataframe(df, use_container_width=True) - # Get sample columns (between stats and PeptideSequence) - sample_cols = [c for c in pivot_df.columns if c not in ["ProteinName", "log2FC", "p-value", "PeptideSequence"]] + else: + pre_processing_tab, protein_tab = st.tabs(["Pre-processing", "Protein Table"]) - # Create bar chart column with log2-transformed values - pivot_df["Intensity"] = pivot_df[sample_cols].apply( - lambda row: [np.log2(v + 1) for v in row], axis=1 - ) + if result is None: + st.info("๐Ÿ’ก Please complete the configuration in the 'Configure' page to see results.") + st.stop() + + pivot_df, expr_df, group_map = result - # Reorder columns: place Intensity after p-value - display_cols = ["ProteinName", "log2FC", "p-value", "Intensity"] + sample_cols + ["PeptideSequence"] - display_df = pivot_df[display_cols] - - st.dataframe( - display_df.sort_values("p-value"), - column_config={ - "Intensity": st.column_config.BarChartColumn( - "Intensity", - help="Sample intensities (log2 scale)", - width="small", - y_min=0, - ), - }, - use_container_width=True, - ) + with pre_processing_tab: + st.write("### Final Results (Group row removed, Stats added)") + st.dataframe(pivot_df.head(10)) - with psm_tab: - st.markdown("### PSM-level Quantification Table") - st.info( - "This table shows the PSM-level quantification data, including protein IDs, " - "peptide sequences, charge states, and intensities across samples. " - "Each row represents one peptide-spectrum match detected from the MS/MS analysis." - ) - st.dataframe(df, use_container_width=True) + with protein_tab: + render_protein_table(pivot_df, group_map, is_lfq=False) except Exception as e: st.error(f"Failed to load {csv_file.name}: {e}") diff --git a/content/results_heatmap.py b/content/results_heatmap.py index 4ece3f4..72b8438 100644 --- a/content/results_heatmap.py +++ b/content/results_heatmap.py @@ -5,7 +5,8 @@ from scipy.cluster.hierarchy import linkage, leaves_list from scipy.spatial.distance import pdist from src.common.common import page_setup -from src.common.results_helpers import get_abundance_data +from src.common.results_helpers import get_abundance_data, get_workflow_dir +from src.workflow.ParameterManager import ParameterManager params = page_setup() st.title("Heatmap") @@ -29,48 +30,103 @@ pivot_df, expr_df, group_map = result -top_n = st.slider("Number of proteins", 20, 200, 50, key="heatmap_top_n") +workflow_dir = get_workflow_dir(st.session_state["workspace"]) +parameter_manager = ParameterManager(workflow_dir, "TOPP Workflow") -var_series = expr_df.var(axis=1) -top_proteins = var_series.sort_values(ascending=False).head(top_n).index -heatmap_df = expr_df.loc[top_proteins] -heatmap_z = heatmap_df.sub(heatmap_df.mean(axis=1), axis=0).div(heatmap_df.std(axis=1), axis=0) -heatmap_z = heatmap_z.replace([np.inf, -np.inf], np.nan).dropna() +workflow_params = parameter_manager.get_parameters_from_json() +analysis_mode = workflow_params.get("analysis-mode", "LFQ") -if not heatmap_z.empty: - row_linkage = linkage(pdist(heatmap_z.values), method="average") - row_order = leaves_list(row_linkage) +st.write("Workflow Analysis Mode:", analysis_mode) - col_linkage = linkage(pdist(heatmap_z.T.values), method="average") - col_order = leaves_list(col_linkage) +if analysis_mode == "LFQ": + top_n = st.slider("Number of proteins", 20, 200, 50, key="heatmap_top_n") - heatmap_clustered = heatmap_z.iloc[row_order, col_order] + var_series = expr_df.var(axis=1) + top_proteins = var_series.sort_values(ascending=False).head(top_n).index + heatmap_df = expr_df.loc[top_proteins] + heatmap_z = heatmap_df.sub(heatmap_df.mean(axis=1), axis=0).div(heatmap_df.std(axis=1), axis=0) + heatmap_z = heatmap_z.replace([np.inf, -np.inf], np.nan).dropna() - fig_heatmap = px.imshow( - heatmap_clustered, - labels=dict(x="Sample", y="Protein", color="Z-score"), - aspect="auto", - color_continuous_scale=[[0.0, "#3b6fb6"], [0.5, "white"], [1.0, "#b40426"]], - zmin=-3, zmax=3 - ) + if not heatmap_z.empty: + row_linkage = linkage(pdist(heatmap_z.values), method="average") + row_order = leaves_list(row_linkage) - fig_heatmap.update_layout( - height=700, - xaxis={'side': 'bottom'}, - yaxis={'side': 'left'} - ) + col_linkage = linkage(pdist(heatmap_z.T.values), method="average") + col_order = leaves_list(col_linkage) - fig_heatmap.update_xaxes(tickfont=dict(size=10)) - fig_heatmap.update_yaxes(tickfont=dict(size=8)) + heatmap_clustered = heatmap_z.iloc[row_order, col_order] - st.plotly_chart(fig_heatmap, use_container_width=True) + fig_heatmap = px.imshow( + heatmap_clustered, + labels=dict(x="Sample", y="Protein", color="Z-score"), + aspect="auto", + color_continuous_scale=[[0.0, "#3b6fb6"], [0.5, "white"], [1.0, "#b40426"]], + zmin=-3, zmax=3 + ) + + fig_heatmap.update_layout( + height=700, + xaxis={'side': 'bottom'}, + yaxis={'side': 'left'} + ) + + fig_heatmap.update_xaxes(tickfont=dict(size=10)) + fig_heatmap.update_yaxes(tickfont=dict(size=8)) + + st.plotly_chart(fig_heatmap, use_container_width=True) + else: + st.warning("Insufficient data to generate the heatmap.") + + st.markdown("---") + st.markdown("**Other visualizations:**") + col1, col2 = st.columns(2) + with col1: + st.page_link("content/results_volcano.py", label="Volcano Plot", icon="๐ŸŒ‹") + with col2: + st.page_link("content/results_pca.py", label="PCA", icon="๐Ÿ“Š") else: - st.warning("Insufficient data to generate the heatmap.") - -st.markdown("---") -st.markdown("**Other visualizations:**") -col1, col2 = st.columns(2) -with col1: - st.page_link("content/results_volcano.py", label="Volcano Plot", icon="๐ŸŒ‹") -with col2: - st.page_link("content/results_pca.py", label="PCA", icon="๐Ÿ“Š") + top_n = st.slider("Number of proteins", 20, 200, 50, key="heatmap_top_n") + + var_series = expr_df.var(axis=1) + top_proteins = var_series.sort_values(ascending=False).head(top_n).index + heatmap_df = expr_df.loc[top_proteins] + heatmap_z = heatmap_df.sub(heatmap_df.mean(axis=1), axis=0).div(heatmap_df.std(axis=1), axis=0) + heatmap_z = heatmap_z.replace([np.inf, -np.inf], np.nan).dropna() + + if not heatmap_z.empty: + row_linkage = linkage(pdist(heatmap_z.values), method="average") + row_order = leaves_list(row_linkage) + + col_linkage = linkage(pdist(heatmap_z.T.values), method="average") + col_order = leaves_list(col_linkage) + + heatmap_clustered = heatmap_z.iloc[row_order, col_order] + + fig_heatmap = px.imshow( + heatmap_clustered, + labels=dict(x="Sample", y="Protein", color="Z-score"), + aspect="auto", + color_continuous_scale=[[0.0, "#3b6fb6"], [0.5, "white"], [1.0, "#b40426"]], + zmin=-3, zmax=3 + ) + + fig_heatmap.update_layout( + height=700, + xaxis={'side': 'bottom'}, + yaxis={'side': 'left'} + ) + + fig_heatmap.update_xaxes(tickfont=dict(size=10)) + fig_heatmap.update_yaxes(tickfont=dict(size=8)) + + st.plotly_chart(fig_heatmap, width="stretch") + else: + st.warning("Insufficient data to generate the heatmap.") + + st.markdown("---") + st.markdown("**Other visualizations:**") + col1, col2 = st.columns(2) + with col1: + st.page_link("content/results_volcano.py", label="Volcano Plot", icon="๐ŸŒ‹") + with col2: + st.page_link("content/results_pca.py", label="PCA", icon="๐Ÿ“Š") diff --git a/content/results_pathway_analysis.py b/content/results_pathway_analysis.py new file mode 100644 index 0000000..f5eb4c1 --- /dev/null +++ b/content/results_pathway_analysis.py @@ -0,0 +1,258 @@ +import json +import mygene +import streamlit as st +import pandas as pd +import numpy as np +import plotly.express as px +import plotly.io as pio +from collections import defaultdict +from scipy.stats import fisher_exact +from pathlib import Path +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data + +# ================================ +# Page setup +# ================================ +params = page_setup() +st.title("ProteomicsLFQ Results") + +# ================================ +# Workspace check +# ================================ +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +# ================================ +# _run_go_enrichment function +# ================================ +def _run_go_enrichment(pivot_df: pd.DataFrame, results_dir: Path): + p_cutoff = 0.05 + fc_cutoff = 1.0 + + analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy() + + if analysis_df.empty: + st.error("No valid statistical data found for GO enrichment.") + st.write("โ— analysis_df is empty") + else: + with st.spinner("Fetching GO terms from MyGene.info API..."): + mg = mygene.MyGeneInfo() + + def get_clean_uniprot(name): + parts = str(name).split("|") + return parts[1] if len(parts) >= 2 else parts[0] + + analysis_df["UniProt"] = analysis_df["protein"].apply(get_clean_uniprot) + + bg_ids = analysis_df["UniProt"].dropna().astype(str).unique().tolist() + fg_ids = analysis_df[ + (analysis_df["p-value"] < p_cutoff) & + (analysis_df["log2FC"].abs() >= fc_cutoff) + ]["UniProt"].dropna().astype(str).unique().tolist() + # st.write("โœ… get_clean_uniprot applied") + + if len(fg_ids) < 3: + st.warning( + f"Not enough significant proteins " + f"(p < {p_cutoff}, |log2FC| โ‰ฅ {fc_cutoff}). " + f"Found: {len(fg_ids)}" + ) + st.write("โ— Not enough significant proteins") + else: + res_list = mg.querymany( + bg_ids, scopes="uniprot", fields="go", as_dataframe=False + ) + res_go = pd.DataFrame(res_list) + if "notfound" in res_go.columns: + res_go = res_go[res_go["notfound"] != True] + + def extract_go_terms(go_data, go_type): + if not isinstance(go_data, dict) or go_type not in go_data: + return [] + terms = go_data[go_type] + if isinstance(terms, dict): + terms = [terms] + return list({t.get("term") for t in terms if "term" in t}) + + for go_type in ["BP", "CC", "MF"]: + res_go[f"{go_type}_terms"] = res_go["go"].apply( + lambda x: extract_go_terms(x, go_type) + ) + + annotated_ids = set(res_go["query"].astype(str)) + fg_set = annotated_ids.intersection(fg_ids) + bg_set = annotated_ids + # st.write(f"โœ… fg_set bg_set are set") + + def run_go(go_type): + go2fg = defaultdict(set) + go2bg = defaultdict(set) + + for _, row in res_go.iterrows(): + uid = str(row["query"]) + for term in row[f"{go_type}_terms"]: + go2bg[term].add(uid) + if uid in fg_set: + go2fg[term].add(uid) + + records = [] + N_fg = len(fg_set) + N_bg = len(bg_set) + + for term, fg_genes in go2fg.items(): + a = len(fg_genes) + if a == 0: + continue + b = N_fg - a + c = len(go2bg[term]) - a + d = N_bg - (a + b + c) + + _, p = fisher_exact([[a, b], [c, d]], alternative="greater") + records.append({ + "GO_Term": term, + "Count": a, + "GeneRatio": f"{a}/{N_fg}", + "p_value": p, + }) + + df = pd.DataFrame(records) + if df.empty: + return None, None + + df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10)) + df = df.sort_values("p_value").head(20) + + # โœ… Plotly Figure + fig = px.bar( + df, + x="-log10(p)", + y="GO_Term", + orientation="h", + title=f"GO Enrichment ({go_type})", + ) + + # st.write(f"โœ… Plotly Figure generated") + + fig.update_layout( + yaxis=dict(autorange="reversed"), + height=500, + margin=dict(l=10, r=10, t=40, b=10), + ) + + return fig, df + + go_results = {} + + for go_type in ["BP", "CC", "MF"]: + fig, df_go = run_go(go_type) + if fig is not None: + go_results[go_type] = { + "fig": fig, + "df": df_go + } + # st.write(f"โœ… go_type generated") + + go_dir = results_dir / "go-terms" + go_dir.mkdir(parents=True, exist_ok=True) + + go_data = {} + + for go_type in ["BP", "CC", "MF"]: + if go_type in go_results: + fig = go_results[go_type]["fig"] + df = go_results[go_type]["df"] + + go_data[go_type] = { + "fig_json": fig.to_json(), # Figure โ†’ JSON string + "df_dict": df.to_dict(orient="records") # DataFrame โ†’ list of dicts + } + + go_json_file = go_dir / "go_results.json" + with open(go_json_file, "w") as f: + json.dump(go_data, f) + st.session_state["go_results"] = go_results + st.session_state["go_ready"] = True if go_data else False + # st.write("โœ… GO enrichment analysis complete") + +# ================================ +# Load abundance data +# ================================ +results_dir = Path(st.session_state["workspace"]) / "topp-workflow" / "results" / "quant_results" +result = get_abundance_data(st.session_state["workspace"]) +if result is None: + st.info("Abundance data not available. Please run the workflow and configure sample groups first.") + st.page_link("content/results_abundance.py", label="Go to Abundance", icon="๐Ÿ“‹") + st.stop() + +pivot_df, expr_df, group_map = result + +go_json_file = results_dir / "go-terms" / "go_results.json" + +go_input_df = pivot_df.copy() +if "ProteinName" in go_input_df.columns: + go_input_df = go_input_df.rename(columns={"ProteinName": "protein"}) + +_run_go_enrichment(go_input_df, results_dir) + +# ================================ +# Tabs +# ================================ +protein_tab, = st.tabs(["๐Ÿงฌ Protein Table"]) + +# ================================ +# Protein-level results +# ================================ +with protein_tab: + st.markdown("### ๐Ÿงฌ Protein-Level Abundance Table") + st.info( + "This protein-level table is generated by grouping all PSMs that map to the " + "same protein and aggregating their intensities across samples.\n\n" + "Additionally, log2 fold change and p-values are calculated between sample groups." + ) + + if pivot_df.empty: + st.info("No protein-level data available.") + else: + st.session_state["pivot_df"] = pivot_df + st.dataframe(pivot_df.sort_values("p-value"), width="stretch") + +# ====================================================== +# GO Enrichment Results +# ====================================================== +st.markdown("---") +st.subheader("๐Ÿงฌ GO Enrichment Analysis") + +if not go_json_file.exists(): + st.info("GO Enrichment results are not available yet. Please run the analysis first.") +else: + with open(go_json_file, "r") as f: + go_data = json.load(f) + + bp_tab, cc_tab, mf_tab = st.tabs([ + "๐Ÿงฌ Biological Process", + "๐Ÿ  Cellular Component", + "โš™๏ธ Molecular Function", + ]) + + for tab, go_type in zip([bp_tab, cc_tab, mf_tab], ["BP", "CC", "MF"]): + with tab: + if go_type not in go_data: + st.info(f"No enriched {go_type} terms found.") + continue + + fig_json = go_data[go_type]["fig_json"] + df_dict = go_data[go_type]["df_dict"] + + fig = pio.from_json(fig_json) + + df_go = pd.DataFrame(df_dict) + + if df_go.empty: + st.info(f"No enriched {go_type} terms found.") + else: + st.plotly_chart(fig, width="stretch") + + st.markdown(f"#### {go_type} Enrichment Results") + st.dataframe(df_go, width="stretch") \ No newline at end of file diff --git a/content/results_pca.py b/content/results_pca.py index 0b36469..dc64e9a 100644 --- a/content/results_pca.py +++ b/content/results_pca.py @@ -5,7 +5,8 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from src.common.common import page_setup -from src.common.results_helpers import get_abundance_data +from src.common.results_helpers import get_abundance_data, get_workflow_dir +from src.workflow.ParameterManager import ParameterManager params = page_setup() st.title("PCA Analysis") @@ -29,13 +30,25 @@ pivot_df, expr_df, group_map = result +workflow_dir = get_workflow_dir(st.session_state["workspace"]) +parameter_manager = ParameterManager(workflow_dir, "TOPP Workflow") +workflow_params = parameter_manager.get_parameters_from_json() +analysis_mode = workflow_params.get("analysis-mode", "LFQ") + +st.write("Workflow Analysis Mode:", analysis_mode) + top_n = 500 +if analysis_mode == "LFQ": + protein_col = "ProteinName" +else: + protein_col = "protein" + top_proteins = ( pivot_df - .dropna(subset=["p-value"]) - .sort_values("p-value", ascending=True) - .head(top_n)["ProteinName"] + .dropna(subset=["p-adj"]) + .sort_values("p-adj", ascending=True) + .head(top_n)[protein_col] ) expr_df_pca = expr_df.loc[ @@ -58,10 +71,25 @@ index=X.index ) -norm_map = { - k.replace(".mzML", ""): v - for k, v in group_map.items() -} +if analysis_mode == "LFQ": + norm_map = { + k.replace(".mzML", ""): v + for k, v in group_map.items() + } +else: + actual_sample_names = pca_df.index.tolist() + norm_map = {} + for k, v in group_map.items(): + try: + sample_idx = int(k) + 1 + target_substring = f"sample{sample_idx}[" + real_full_name = next((name for name in actual_sample_names if target_substring in name), None) + + if real_full_name: + norm_map[real_full_name] = v if v and v.strip() else "Unassigned" + except ValueError: + continue + pca_df["Group"] = pca_df.index.map(norm_map) fig_pca = px.scatter( @@ -79,9 +107,9 @@ height=600, ) -st.plotly_chart(fig_pca, use_container_width=True) +st.plotly_chart(fig_pca, width="stretch") -st.markdown(f"**Proteins used:** {expr_df_pca.shape[0]} (top {top_n} by p-value)") +st.markdown(f"**Proteins used:** {expr_df_pca.shape[0]} (top {top_n} by p-adj)") st.markdown("---") st.markdown("**Other visualizations:**") @@ -89,4 +117,4 @@ with col1: st.page_link("content/results_volcano.py", label="Volcano Plot", icon="๐ŸŒ‹") with col2: - st.page_link("content/results_heatmap.py", label="Heatmap", icon="๐Ÿ”ฅ") + st.page_link("content/results_heatmap.py", label="Heatmap", icon="๐Ÿ”ฅ") \ No newline at end of file diff --git a/content/results_volcano.py b/content/results_volcano.py index d3f09cc..5cc9b29 100644 --- a/content/results_volcano.py +++ b/content/results_volcano.py @@ -3,7 +3,8 @@ import plotly.express as px import numpy as np from src.common.common import page_setup -from src.common.results_helpers import get_abundance_data +from src.common.results_helpers import get_abundance_data, get_workflow_dir +from src.workflow.ParameterManager import ParameterManager params = page_setup() st.title("Volcano Plot") @@ -28,74 +29,164 @@ pivot_df, expr_df, group_map = result if pivot_df.empty: - st.info("No data available for volcano plot.") - st.stop() - -volcano_df = pivot_df.copy() -volcano_df = volcano_df.dropna(subset=["log2FC", "p-value"]) - -volcano_df["neg_log10_p"] = -np.log10(volcano_df["p-value"]) - -fc_thresh = st.slider( - "log2 Fold Change threshold", - min_value=0.5, - max_value=3.0, - value=1.0, - step=0.1, -) - -p_thresh = st.slider( - "p-value threshold", - min_value=0.001, - max_value=0.1, - value=0.05, - step=0.001, -) - -volcano_df["Significance"] = "Not significant" -volcano_df.loc[ - (volcano_df["p-value"] <= p_thresh) & (volcano_df["log2FC"] >= fc_thresh), - "Significance", -] = "Up-regulated" - -volcano_df.loc[ - (volcano_df["p-value"] <= p_thresh) & (volcano_df["log2FC"] <= -fc_thresh), - "Significance", -] = "Down-regulated" - -fig_volcano = px.scatter( - volcano_df, - x="log2FC", - y="neg_log10_p", - color="Significance", - hover_data=["ProteinName", "p-value"], -) - -fig_volcano.add_vline(x=fc_thresh, line_dash="dash") -fig_volcano.add_vline(x=-fc_thresh, line_dash="dash") -fig_volcano.add_hline(y=-np.log10(p_thresh), line_dash="dash") - -# Make x-axis symmetric around zero -max_abs_fc = volcano_df["log2FC"].abs().max() -x_range = [-max_abs_fc * 1.1, max_abs_fc * 1.1] # 10% padding - -fig_volcano.update_layout( - xaxis_title="log2 Fold Change", - yaxis_title="-log10(p-value)", - xaxis_range=x_range, - height=600, -) - -st.plotly_chart(fig_volcano, use_container_width=True) - -up_count = (volcano_df["Significance"] == "Up-regulated").sum() -down_count = (volcano_df["Significance"] == "Down-regulated").sum() -st.markdown(f"**Up-regulated:** {up_count} | **Down-regulated:** {down_count}") - -st.markdown("---") -st.markdown("**Other visualizations:**") -col1, col2 = st.columns(2) -with col1: - st.page_link("content/results_pca.py", label="PCA", icon="๐Ÿ“Š") -with col2: - st.page_link("content/results_heatmap.py", label="Heatmap", icon="๐Ÿ”ฅ") + st.info("No data available for volcano plot.") + st.stop() + +workflow_dir = get_workflow_dir(st.session_state["workspace"]) +parameter_manager = ParameterManager(workflow_dir, "TOPP Workflow") + +workflow_params = parameter_manager.get_parameters_from_json() +analysis_mode = workflow_params.get("analysis-mode", "LFQ") + +st.write("Workflow Analysis Mode:", analysis_mode) + +if analysis_mode == "LFQ": + volcano_df = pivot_df.copy() + volcano_df = volcano_df.dropna(subset=["log2FC", "p-adj"]) + + volcano_df["neg_log10_padj"] = -np.log10(volcano_df["p-adj"]) + + fc_thresh = st.slider( + "log2 Fold Change threshold", + min_value=0.5, + max_value=3.0, + value=1.0, + step=0.1, + ) + + p_thresh = st.slider( + "p-adj (FDR) threshold", + min_value=0.001, + max_value=0.1, + value=0.05, + step=0.001, + ) + + volcano_df["Significance"] = "Not significant" + volcano_df.loc[ + (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] >= fc_thresh), + "Significance", + ] = "Up-regulated" + + volcano_df.loc[ + (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] <= -fc_thresh), + "Significance", + ] = "Down-regulated" + + fig_volcano = px.scatter( + volcano_df, + x="log2FC", + y="neg_log10_padj", + color="Significance", + hover_data=["ProteinName", "log2FC", "p-value", "p-adj"], + color_discrete_map={ + "Up-regulated": "red", + "Down-regulated": "blue", + "Not significant": "lightgrey", + } + ) + + fig_volcano.add_vline(x=fc_thresh, line_dash="dash") + fig_volcano.add_vline(x=-fc_thresh, line_dash="dash") + fig_volcano.add_hline(y=-np.log10(p_thresh), line_dash="dash") + + # Make x-axis symmetric around zero + max_abs_fc = volcano_df["log2FC"].abs().max() + x_range = [-max_abs_fc * 1.1, max_abs_fc * 1.1] # 10% padding + + fig_volcano.update_layout( + xaxis_title="log2 Fold Change", + yaxis_title="-log10(p-adj)", + xaxis_range=x_range, + height=600, + ) + + st.plotly_chart(fig_volcano, use_container_width=True) + + up_count = (volcano_df["Significance"] == "Up-regulated").sum() + down_count = (volcano_df["Significance"] == "Down-regulated").sum() + st.markdown(f"**Up-regulated:** {up_count} | **Down-regulated:** {down_count}") + + st.markdown("---") + st.markdown("**Other visualizations:**") + col1, col2 = st.columns(2) + with col1: + st.page_link("content/results_pca.py", label="PCA", icon="๐Ÿ“Š") + with col2: + st.page_link("content/results_heatmap.py", label="Heatmap", icon="๐Ÿ”ฅ") +else: + # Threshold Selection UI + st.divider() + c1, c2 = st.columns(2) + with c1: + fc_thresh = st.slider( + "log2 Fold Change threshold", + min_value=0.1, + max_value=3.0, + value=1.0, + step=0.1, + ) + with c2: + p_thresh = st.slider( + "p-adj (FDR) threshold", + min_value=0.001, + max_value=0.1, + value=0.05, + step=0.001, + ) + + volcano_df = pivot_df.dropna(subset=["log2FC", "p-adj"]).copy() + volcano_df["neg_log10_padj"] = -np.log10(volcano_df["p-adj"]) + + volcano_df["Significance"] = "Not significant" + volcano_df.loc[ + (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] >= fc_thresh), + "Significance", + ] = "Up-regulated" + + volcano_df.loc[ + (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] <= -fc_thresh), + "Significance", + ] = "Down-regulated" + + fig_volcano = px.scatter( + volcano_df, + x="log2FC", + y="neg_log10_padj", + color="Significance", + hover_data=["protein", "log2FC", "p-value", "p-adj"], + color_discrete_map={ + "Up-regulated": "red", + "Down-regulated": "blue", + "Not significant": "lightgrey", + } + ) + + fig_volcano.add_vline(x=fc_thresh, line_dash="dash") + fig_volcano.add_vline(x=-fc_thresh, line_dash="dash") + fig_volcano.add_hline(y=-np.log10(p_thresh), line_dash="dash") + + # Make x-axis symmetric around zero + max_abs_fc = volcano_df["log2FC"].abs().max() + x_range = [-max_abs_fc * 1.1, max_abs_fc * 1.1] # 10% padding + + fig_volcano.update_layout( + xaxis_title="log2 Fold Change", + yaxis_title="-log10(p-adj)", + xaxis_range=x_range, + height=600, + ) + + st.plotly_chart(fig_volcano, width="stretch") + + up_count = (volcano_df["Significance"] == "Up-regulated").sum() + down_count = (volcano_df["Significance"] == "Down-regulated").sum() + st.markdown(f"**Up-regulated:** {up_count} | **Down-regulated:** {down_count}") + + st.markdown("---") + st.markdown("**Other visualizations:**") + col1, col2 = st.columns(2) + with col1: + st.page_link("content/results_pca.py", label="PCA", icon="๐Ÿ“Š") + with col2: + st.page_link("content/results_heatmap.py", label="Heatmap", icon="๐Ÿ”ฅ") diff --git a/requirements.txt b/requirements.txt index 510c539..2f5fd30 100644 --- a/requirements.txt +++ b/requirements.txt @@ -145,7 +145,7 @@ polars>=1.0.0 cython easypqp>=0.1.34 pyprophet>=2.2.0 - +mygene # Redis Queue dependencies (for online mode) redis>=5.0.0 rq>=1.16.0 diff --git a/src/WorkflowTest.py b/src/WorkflowTest.py index d94a37d..f6826d4 100644 --- a/src/WorkflowTest.py +++ b/src/WorkflowTest.py @@ -42,6 +42,29 @@ def configure(self) -> None: self.ui.select_input_file("mzML-files", multiple=True, reactive=True) self.ui.select_input_file("fasta-file", multiple=False) + self.params = self.parameter_manager.get_parameters_from_json() + saved_mode = self.params.get("analysis-mode", "LFQ") + + self.ui.input_widget( + key="analysis-mode", + default=saved_mode, + name="Analysis Mode", + widget_type="selectbox", + options=["LFQ", "TMT"], + help="Choose between Label-Free Quantification (LFQ) or Tandem Mass Tag (TMT) analysis.", + reactive=True + ) + + self.params = self.parameter_manager.get_parameters_from_json() + current_mode = self.params.get("analysis-mode", "LFQ") + + if current_mode == "LFQ": + self.render_lfq_tabs() + else: + self.render_tmt_tabs() + + def render_lfq_tabs(self): + st.subheader("LFQ Analysis Mode") t = st.tabs(["**Identification**", "**Rescoring**", "**Filtering**", "**Library Generation**", "**Quantification**", "**Group Selection**"]) with t[0]: @@ -65,10 +88,10 @@ def configure(self) -> None: st.info(""" **Decoy Database Settings:** * **method**: How decoy sequences are generated from target protein sequences. - *Reverse* creates decoys by reversing each sequence, while *shuffle* randomly - rearranges the amino acids. Both methods preserve the amino acid composition - of the original protein, ensuring decoys have similar properties to real sequences - for accurate false discovery rate (FDR) estimation. + *Reverse* creates decoys by reversing each sequence, while *shuffle* randomly + rearranges the amino acids. Both methods preserve the amino acid composition + of the original protein, ensuring decoys have similar properties to real sequences + for accurate false discovery rate (FDR) estimation. """) self.ui.input_TOPP( "DecoyDatabase", @@ -97,7 +120,7 @@ def configure(self) -> None: st.info(comet_info) comet_include = [":enzyme", "missed_cleavages", "fixed_modifications", "variable_modifications", - "instrument", "fragment_mass_tolerance", "fragment_error_units", "fragment_bin_offset"] + "instrument", "fragment_mass_tolerance", "fragment_error_units", "fragment_bin_offset"] if not self.params.get("generate-decoys", True): # Only show decoy_string when not generating decoys comet_include.append("PeptideIndexing:decoy_string") @@ -106,7 +129,7 @@ def configure(self) -> None: "CometAdapter", custom_defaults={ "threads": 8, - "instrument": "high_res", + "instrument": "low_res", "missed_cleavages": 2, "min_peptide_length": 6, "max_peptide_length": 40, @@ -115,16 +138,18 @@ def configure(self) -> None: "isotope_error": "0/1", "precursor_charge": "2:4", "precursor_mass_tolerance": 20.0, - "fragment_mass_tolerance": 0.02, - "fragment_bin_offset": 0.0, + "fragment_mass_tolerance": 0.6, + "fragment_bin_offset": 0.4, "max_variable_mods_in_peptide": 3, "minimum_peaks": 1, "clip_nterm_methionine": "true", - "PeptideIndexing:IL_equivalent": "true", + "PeptideIndexing:IL_equivalent": True, "PeptideIndexing:unmatched_action": "warn", "PeptideIndexing:decoy_string": "rev_", + "mass_recalibration": False, }, include_parameters=comet_include, + flag_parameters=["PeptideIndexing:IL_equivalent", "mass_recalibration"], exclude_parameters=["second_enzyme"], ) @@ -145,9 +170,10 @@ def configure(self) -> None: "subset_max_train": 300000, "decoy_pattern": "rev_", "score_type": "pep", - "post_processing_tdc": "true", + "post_processing_tdc": True, }, include_parameters=percolator_include, + flag_parameters=["post_processing_tdc"], exclude_parameters=["out_type"], ) @@ -243,6 +269,10 @@ def configure(self) -> None: "psmFDR": 0.01, "proteinFDR": 0.01, "picked_proteinFDR": "true", + "alignment_order": "star", + "protein_quantification": "unique_peptides", + "quantification_method": "feature_intensity", + "protein_inference": "aggregation", }, include_parameters=["intThreshold", "psmFDR", "proteinFDR"], ) @@ -293,6 +323,295 @@ def configure(self) -> None: if orphaned_keys: self.parameter_manager.save_parameters() + def render_tmt_tabs(self): + st.subheader("TMT Analysis Mode") + # Create tabs for different analysis steps. + t = st.tabs( + ["**IsobaricAnalyzer**", "**CometAdapter**", "**PercolatorAdapter**", "**IDFilter**", "**IDMapper**", "**FileMerger**", + "**ProteinInference**", "**IDFilter**", "**IDConflictResolver**", "**ProteinQuantifier**", "**Group Selection**"] + ) + with t[0]: + # Checkbox for decoy generation + # reactive=True ensures the parent configure() fragment re-runs when checkbox changes, + # so conditional UI (DecoyDatabase settings) updates immediately + self.ui.input_widget( + key="generate-decoys", + default=True, + name="Generate Decoy Database", + widget_type="checkbox", + help="Generate reversed decoy sequences for FDR calculation. Disable if your FASTA already contains decoys.", + reactive=True, + ) + + # Reload params to get current checkbox value after it was saved + self.params = self.parameter_manager.get_parameters_from_json() + + # Show DecoyDatabase settings if generating decoys + if self.params.get("generate-decoys", True): + st.info(""" + **Decoy Database Settings:** + * **method**: How decoy sequences are generated from target protein sequences. + *Reverse* creates decoys by reversing each sequence, while *shuffle* randomly + rearranges the amino acids. Both methods preserve the amino acid composition + of the original protein, ensuring decoys have similar properties to real sequences + for accurate false discovery rate (FDR) estimation. + """) + self.ui.input_TOPP( + "DecoyDatabase", + custom_defaults={ + "decoy_string": "rev_", + "decoy_string_position": "prefix", + "method": "reverse", + }, + include_parameters=["method"], + ) + + comet_info = """ + **Identification (Comet):** + * **enzyme**: The enzyme used for peptide digestion. + * **missed_cleavages**: Number of possible cleavage sites missed by the enzyme. It has no effect if enzyme is unspecific cleavage. + * **fixed_modifications**: Fixed modifications, specified using Unimod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)' + * **variable_modifications**: Variable modifications, specified using Unimod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)' + * **instrument**: Type of instrument (high_res or low_res). Use 'high_res' for high-resolution MS2 (Orbitrap, TOF), 'low_res' for ion trap. + * **fragment_mass_tolerance**: Fragment mass tolerance for MS2 matching. + * **fragment_bin_offset**: Offset for binning MS2 spectra. Typically 0.0 for high-res, 0.4 for low-res instruments. + """ + if not self.params.get("generate-decoys", True): + comet_info += """* **PeptideIndexing:decoy_string**: String that was appended (or prefixed - see 'decoy_string_position' flag below) to the accessions + in the protein database to indicate decoy proteins. + """ + st.info(comet_info) + + st.write(Path(self.workflow_dir, "results")) + + comet_include = [":enzyme", "missed_cleavages", "fixed_modifications", "variable_modifications", + "instrument", "fragment_mass_tolerance", "fragment_error_units", "fragment_bin_offset"] + if not self.params.get("generate-decoys", True): + # Only show decoy_string when not generating decoys + comet_include.append("PeptideIndexing:decoy_string") + + self.ui.input_TOPP( + "IsobaricAnalyzer", + custom_defaults={ + "tmt11plex:reference_channel": 126, + "type": "tmt11plex", + "extraction:select_activation": "auto", + "extraction:reporter_mass_shift": 0.002, + "extraction:min_reporter_intensity": 0.0, + "extraction:min_precursor_purity": 0.0, + "extraction:precursor_isotope_deviation": 10.0, + "quantification:isotope_correction": "false", + }, + tool_instance_name="IsobaricAnalyzer-TMT", + ) + with t[1]: + comet_include = [":enzyme", "missed_cleavages", "fixed_modifications", "variable_modifications", + "instrument", "fragment_mass_tolerance", "fragment_error_units", "fragment_bin_offset", "PeptideIndexing:IL_equivalent"] + self.ui.input_TOPP( + "CometAdapter", + custom_defaults={ + "PeptideIndexing:IL_equivalent": True, + "clip_nterm_methionine": "true", + "instrument": "high_res", + "missed_cleavages": 2, + "min_peptide_length": 6, + "max_peptide_length": 40, + "enzyme": "Trypsin/P", + "PeptideIndexing:unmatched_action": "warn", + "max_variable_mods_in_peptide": 3, + "precursor_mass_tolerance": 4.5, + "isotope_error": "0/1", + "precursor_error_units": "ppm", + "num_hits": 1, + "num_enzyme_termini": "fully", + "fragment_bin_offset": 0.0, + "minimum_peaks": 10, + "precursor_charge": "2:4", + "fragment_mass_tolerance": 0.015, + "PeptideIndexing:unmatched_action": "warn", + "variable_modifications": "Oxidation (M)\nAcetyl (Protein N-term)\nTMT6plex (K)\nTMT6plex (N-term)", + "debug": 0, + "force": True, + }, + include_parameters=comet_include, + flag_parameters=["PeptideIndexing:IL_equivalent", "force"], + exclude_parameters=["second_enzyme"], + tool_instance_name="CometAdapter-TMT", + ) + with t[2]: + st.info(""" + **Filtering (IDFilter):** + * **score:type_peptide**: Score used for filtering. If empty, the main score is used. + * **score:psm**: The score which should be reached by a peptide hit to be kept. (use 'NAN' to disable this filter) + """) + self.ui.input_TOPP( + "PercolatorAdapter", + custom_defaults={ + "subset_max_train": 300000, + "decoy_pattern": "DECOY_", + "score_type": "pep", + "post_processing_tdc": True, + "debug": 0, + }, + flag_parameters=["post_processing_tdc"], + tool_instance_name="PercolatorAdapter-TMT", + ) + + with t[3]: + self.ui.input_TOPP( + "IDFilter", + custom_defaults={ + "score:type_peptide": "q-value", + "score:psm": 0.10, + }, + tool_instance_name="IDFilter-strict", + ) + with t[4]: + st.info(""" + **Quantification (ProteomicsLFQ):** + * **intThreshold**: Peak intensity threshold applied in seed detection. + * **psmFDR**: FDR threshold for sub-protein level (e.g. 0.05=5%). Use -FDR_type to choose the level. Cutoff is applied at the highest level. If Bayesian inference was chosen, it is equivalent with a peptide FDR + * **proteinFDR**: Protein FDR threshold (0.05=5%). + """) + self.ui.input_TOPP( + "IDMapper", + custom_defaults={ + "threads": 8, + "debug": 0, + }, + tool_instance_name="IDMapper-TMT", + ) + with t[5]: + self.ui.input_TOPP( + "FileMerger", + custom_defaults={ + "in_type": "consensusXML", + "append_method": "append_cols", + "annotate_file_origin": True, + "threads": 8, + }, + flag_parameters=["annotate_file_origin"], + tool_instance_name="FileMerger-TMT", + ) + with t[6]: + self.ui.input_TOPP( + "ProteinInference", + custom_defaults={ + "threads": 8, + "picked_decoy_string": "DECOY_", + "picked_fdr": "true", + "protein_fdr": "true", + "Algorithm:use_shared_peptides": "true", + "Algorithm:annotate_indistinguishable_groups": "true", + "Algorithm:score_type": "PEP", + "Algorithm:score_aggregation_method": "best", + "Algorithm:min_peptides_per_protein": 1, + }, + tool_instance_name="ProteinInference-TMT", + ) + with t[7]: + # A single checkbox widget for workflow logic. + # self.ui.input_widget("run-python-script", False, "Run custom Python script") * + # Generate input widgets for a custom Python tool, located at src/python-tools. + # Parameters are specified within the file in the DEFAULTS dictionary. + # self.ui.input_python("example") * + self.ui.input_TOPP( + "IDFilter", + custom_defaults={ + "score:type_protein": "q-value", + "score:proteingroup": 0.01, + "score:psm": 0.01, + "delete_unreferenced_peptide_hits": True, + "remove_decoys": True + }, + flag_parameters=["delete_unreferenced_peptide_hits", "remove_decoys"], + tool_instance_name="IDFilter-lenient", + ) + with t[8]: + self.ui.input_TOPP( + "IDConflictResolver", + custom_defaults={ + "threads": 4, + }, + tool_instance_name="IDConflictResolver-TMT", + ) + + with t[9]: + self.ui.input_TOPP( + "ProteinQuantifier", + custom_defaults={ + "method": "top", + "top:N": 3, + "top:aggregate": "median", + "top:include_all": True, + "ratios": True, + "threads": 8, + "debug": 0, + }, + flag_parameters=["top:include_all", "ratios"], + tool_instance_name="ProteinQuantifier-TMT", + ) + with t[10]: + st.markdown("### ๐Ÿงช TMT Sample Group Assignment") + + # 1. Determine TMT type (e.g., tmt10plex, tmt16plex) + target_key = f"{self.parameter_manager.topp_param_prefix}IsobaricAnalyzer:1:type" + selected_tmt = st.session_state.get(target_key, "tmt12plex") + + if "tmt" in selected_tmt: + import re + # Extract the number to determine the plex count + num_plex_match = re.search(r'\d+', selected_tmt) + if num_plex_match: + num_plex = int(num_plex_match.group()) + all_channels = [f"sample{i+1}" for i in range(num_plex)] + + st.info( + "Enter a group name for each TMT channel.\n\n" + "Type **'skip'** for channels you wish to skip. (e.g., control, case, skip)" + ) + + # 2. Create an input_widget for each channel (automatically saved to params.json) + cols = st.columns(2) + for i, ch in enumerate(all_channels): + with cols[i % 2]: + self.ui.input_widget( + key=f"TMT-group-{ch}", + default="", + name=f"Group for {ch}", + widget_type="text", + help="Enter group name or 'skip' to ignore this channel.", + ) + + # 3. Read values from params.json and construct a dictionary in tmt_group_map format + # (This can be used later to filter DataFrames in subsequent logic) + self.params = self.parameter_manager.get_parameters_from_json() + + tmt_group_map = {} + for i, ch in enumerate(all_channels): + # Retrieve stored value (default is empty string) + group_val = self.params.get(f"TMT-group-{ch}", "") + tmt_group_map[str(i)] = group_val + + # For data inspection (remove if not needed) + if st.checkbox("Show current TMT mapping"): + st.json(tmt_group_map) + + # 4. Clean up parameters from unused/previous TMT settings + all_possible_channel_keys = {f"TMT-group-{ch}" for ch in all_channels} + orphaned_keys = [ + k for k in self.params.keys() + if k.startswith("TMT-group-") and k not in all_possible_channel_keys + ] + + if orphaned_keys: + for key in orphaned_keys: + del self.params[key] + self.parameter_manager.save_parameters() + + else: + st.warning("Please select a TMT type in the parameters first.") + def execution(self) -> bool: """ Refactored TOPP workflow execution: @@ -345,493 +664,946 @@ def execution(self) -> bool: st.info(f"Using original FASTA: {fasta_path.name}") database_fasta = fasta_path - # ================================ - # 1๏ธโƒฃ Directory setup - # ================================ - results_dir = Path(self.workflow_dir, "results") - comet_dir = results_dir / "comet_results" - perc_dir = results_dir / "percolator_results" - filter_dir = results_dir / "filter_results" - quant_dir = results_dir / "quant_results" + current_mode = self.params.get("analysis-mode", "LFQ") + st.write(f"Current analysis mode: **{current_mode}**") - results_dir = Path(self.workflow_dir, "input-files") + if current_mode == "LFQ": + self.logger.log("โš™๏ธ Running LFQ workflow") - for d in [comet_dir, perc_dir, filter_dir, quant_dir]: - d.mkdir(parents=True, exist_ok=True) + # ================================ + # 1๏ธโƒฃ Directory setup + # ================================ + results_dir = Path(self.workflow_dir, "results") + comet_dir = results_dir / "comet_results" + perc_dir = results_dir / "percolator_results" + filter_dir = results_dir / "psm_filter" + quant_dir = results_dir / "quant_results" + + results_dir = Path(self.workflow_dir, "input-files") + + for d in [comet_dir, perc_dir, filter_dir, quant_dir]: + d.mkdir(parents=True, exist_ok=True) + + self.logger.log("๐Ÿ“ Output directories created") + + # ================================ + # 2๏ธโƒฃ File path definitions (per sample) + # ================================ + comet_results = [] + percolator_results = [] + filter_results = [] + + for mz in in_mzML: + stem = Path(mz).stem + comet_results.append(str(comet_dir / f"{stem}_comet.idXML")) + percolator_results.append(str(perc_dir / f"{stem}_per.idXML")) + filter_results.append(str(filter_dir / f"{stem}_filter.idXML")) + + # ================================ + # 3๏ธโƒฃ Per-file processing + # ================================ + for i, mz in enumerate(in_mzML): + stem = Path(mz).stem + st.info(f"Processing sample: {stem}") + + self.logger.log("๐Ÿ”ฌ Starting per-sample processing...") + + # --- CometAdapter --- + self.logger.log("๐Ÿ”Ž Running peptide search...") + with st.spinner(f"CometAdapter ({stem})"): + comet_extra_params = {"database": str(database_fasta)} + if self.params.get("generate-decoys", True): + # Propagate decoy_string from DecoyDatabase + comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string - self.logger.log("๐Ÿ“ Output directories created") + if not self.executor.run_topp( + "CometAdapter", + { + "in": in_mzML, + "out": comet_results, + }, + comet_extra_params, + ): + self.logger.log("Workflow stopped due to error") + return False - # ================================ - # 2๏ธโƒฃ File path definitions (per sample) - # ================================ - comet_results = [] - percolator_results = [] - filter_results = [] + # Get fragment tolerance from CometAdapter parameters for visualization + comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") + frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) + frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da" + + # Build visualization cache for Comet results + results_dir_path = Path(self.workflow_dir, "results") + cache_dir = results_dir_path / "insight_cache" + cache_dir.mkdir(parents=True, exist_ok=True) + + # Get mzML directory + mzml_dir = Path(in_mzML[0]).parent + + # Build spectra cache (once, shared by all stages) + spectra_df = None + filename_to_index = {} + + for idxml_file in comet_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Build spectra cache (only once) + if spectra_df is None: + filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} + spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) - for mz in in_mzML: - stem = Path(mz).stem - comet_results.append(str(comet_dir / f"{stem}_comet.idXML")) - percolator_results.append(str(perc_dir / f"{stem}_per.idXML")) - filter_results.append(str(filter_dir / f"{stem}_filter.idXML")) + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) - # ================================ - # 3๏ธโƒฃ Per-file processing - # ================================ - for i, mz in enumerate(in_mzML): - stem = Path(mz).stem - st.info(f"Processing sample: {stem}") + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) - self.logger.log("๐Ÿ”ฌ Starting per-sample processing...") + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) - # --- CometAdapter --- - self.logger.log("๐Ÿ”Ž Running peptide search...") - with st.spinner(f"CometAdapter ({stem})"): - comet_extra_params = {"database": str(database_fasta)} - if self.params.get("generate-decoys", True): - # Propagate decoy_string from DecoyDatabase - comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string + self.logger.log("โœ… Peptide search complete") - if not self.executor.run_topp( - "CometAdapter", - { - "in": in_mzML, - "out": comet_results, - }, - comet_extra_params, - ): - self.logger.log("Workflow stopped due to error") - return False - - # Get fragment tolerance from CometAdapter parameters for visualization - comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") - frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) - frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da" - - # Build visualization cache for Comet results - results_dir_path = Path(self.workflow_dir, "results") - cache_dir = results_dir_path / "insight_cache" - cache_dir.mkdir(parents=True, exist_ok=True) - - # Get mzML directory - mzml_dir = Path(in_mzML[0]).parent - - # Build spectra cache (once, shared by all stages) - spectra_df = None - filename_to_index = {} - - for idxml_file in comet_results: - idxml_path = Path(idxml_file) - cache_id_prefix = idxml_path.stem - - # Parse idXML to DataFrame - id_df, spectra_data = parse_idxml(idxml_path) - - # Build spectra cache (only once) - if spectra_df is None: - filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} - spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index) - - # Initialize Table component (caches itself) - Table( - cache_id=f"table_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - column_definitions=[ - {"field": "sequence", "title": "Sequence"}, - {"field": "charge", "title": "Z", "sorter": "number"}, - {"field": "mz", "title": "m/z", "sorter": "number"}, - {"field": "rt", "title": "RT", "sorter": "number"}, - {"field": "score", "title": "Score", "sorter": "number"}, - {"field": "protein_accession", "title": "Proteins"}, - ], - initial_sort=[{"column": "score", "dir": "asc"}], - index_field="id_idx", - ) + # if not Path(comet_results).exists(): + # st.error(f"CometAdapter failed for {stem}") + # st.stop() - # Initialize Heatmap component - Heatmap( - cache_id=f"heatmap_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - x_column="rt", - y_column="mz", - intensity_column="score", - interactivity={"identification": "id_idx"}, - ) + # --- PercolatorAdapter --- + self.logger.log("๐Ÿ“Š Running rescoring...") + with st.spinner(f"PercolatorAdapter ({stem})"): + if not self.executor.run_topp( + "PercolatorAdapter", + { + "in": comet_results, + "out": percolator_results, + }, + {"decoy_pattern": decoy_string}, # Always propagated from upstream + ): + self.logger.log("Workflow stopped due to error") + return False - # Initialize SequenceView component - seq_view = SequenceView( - cache_id=f"seqview_{cache_id_prefix}", - sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - "id_idx": "sequence_id", - "charge": "precursor_charge", - }), - peaks_data=spectra_df.lazy(), - filters={ - "identification": "sequence_id", - "file": "file_index", - "spectrum": "scan_id", - }, - interactivity={"peak": "peak_id"}, - cache_path=str(cache_dir), - deconvolved=False, - annotation_config={ - "ion_types": ["b", "y"], - "neutral_losses": True, - "tolerance": frag_tol, - "tolerance_ppm": frag_tol_is_ppm, - }, - ) + # Build visualization cache for Percolator results + for idxml_file in percolator_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) - # Initialize LinePlot from SequenceView - LinePlot.from_sequence_view( - seq_view, - cache_id=f"lineplot_{cache_id_prefix}", - cache_path=str(cache_dir), - title="Annotated Spectrum", - styling={ - "unhighlightedColor": "#CCCCCC", - "highlightColor": "#E74C3C", - "selectedColor": "#F3A712", - }, - ) + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) - self.logger.log("โœ… Peptide search complete") + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) - # if not Path(comet_results).exists(): - # st.error(f"CometAdapter failed for {stem}") - # st.stop() + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) - # --- PercolatorAdapter --- - self.logger.log("๐Ÿ“Š Running rescoring...") - with st.spinner(f"PercolatorAdapter ({stem})"): - if not self.executor.run_topp( - "PercolatorAdapter", - { - "in": comet_results, - "out": percolator_results, - }, - {"decoy_pattern": decoy_string}, # Always propagated from upstream - ): - self.logger.log("Workflow stopped due to error") - return False - - # Build visualization cache for Percolator results - for idxml_file in percolator_results: - idxml_path = Path(idxml_file) - cache_id_prefix = idxml_path.stem - - # Parse idXML to DataFrame - id_df, spectra_data = parse_idxml(idxml_path) - - # Initialize Table component (caches itself) - Table( - cache_id=f"table_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - column_definitions=[ - {"field": "sequence", "title": "Sequence"}, - {"field": "charge", "title": "Z", "sorter": "number"}, - {"field": "mz", "title": "m/z", "sorter": "number"}, - {"field": "rt", "title": "RT", "sorter": "number"}, - {"field": "score", "title": "Score", "sorter": "number"}, - {"field": "protein_accession", "title": "Proteins"}, - ], - initial_sort=[{"column": "score", "dir": "asc"}], - index_field="id_idx", - ) + self.logger.log("โœ… Rescoring complete") - # Initialize Heatmap component - Heatmap( - cache_id=f"heatmap_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - x_column="rt", - y_column="mz", - intensity_column="score", - interactivity={"identification": "id_idx"}, - ) + # if not Path(percolator_results[i]).exists(): + # st.error(f"PercolatorAdapter failed for {stem}") + # st.stop() - # Initialize SequenceView component - seq_view = SequenceView( - cache_id=f"seqview_{cache_id_prefix}", - sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - "id_idx": "sequence_id", - "charge": "precursor_charge", - }), - peaks_data=spectra_df.lazy(), - filters={ - "identification": "sequence_id", - "file": "file_index", - "spectrum": "scan_id", - }, - interactivity={"peak": "peak_id"}, - cache_path=str(cache_dir), - deconvolved=False, - annotation_config={ - "ion_types": ["b", "y"], - "neutral_losses": True, - "tolerance": frag_tol, - "tolerance_ppm": frag_tol_is_ppm, - }, - ) + # --- IDFilter --- + self.logger.log("๐Ÿ”ง Filtering identifications...") + with st.spinner(f"IDFilter ({stem})"): + if not self.executor.run_topp( + "IDFilter", + { + "in": percolator_results, + "out": filter_results, + }, + ): + self.logger.log("Workflow stopped due to error") + return False - # Initialize LinePlot from SequenceView - LinePlot.from_sequence_view( - seq_view, - cache_id=f"lineplot_{cache_id_prefix}", - cache_path=str(cache_dir), - title="Annotated Spectrum", - styling={ - "unhighlightedColor": "#CCCCCC", - "highlightColor": "#E74C3C", - "selectedColor": "#F3A712", - }, - ) + # Build visualization cache for Filter results + for idxml_file in filter_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) - self.logger.log("โœ… Rescoring complete") + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) - # if not Path(percolator_results[i]).exists(): - # st.error(f"PercolatorAdapter failed for {stem}") - # st.stop() + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) - # --- IDFilter --- - self.logger.log("๐Ÿ”ง Filtering identifications...") - with st.spinner(f"IDFilter ({stem})"): - if not self.executor.run_topp( - "IDFilter", - { - "in": percolator_results, - "out": filter_results, - }, - ): - self.logger.log("Workflow stopped due to error") - return False - - # Build visualization cache for Filter results - for idxml_file in filter_results: - idxml_path = Path(idxml_file) - cache_id_prefix = idxml_path.stem - - # Parse idXML to DataFrame - id_df, spectra_data = parse_idxml(idxml_path) - - # Initialize Table component (caches itself) - Table( - cache_id=f"table_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - column_definitions=[ - {"field": "sequence", "title": "Sequence"}, - {"field": "charge", "title": "Z", "sorter": "number"}, - {"field": "mz", "title": "m/z", "sorter": "number"}, - {"field": "rt", "title": "RT", "sorter": "number"}, - {"field": "score", "title": "Score", "sorter": "number"}, - {"field": "protein_accession", "title": "Proteins"}, - ], - initial_sort=[{"column": "score", "dir": "asc"}], - index_field="id_idx", - ) + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) - # Initialize Heatmap component - Heatmap( - cache_id=f"heatmap_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - x_column="rt", - y_column="mz", - intensity_column="score", - interactivity={"identification": "id_idx"}, - ) + self.logger.log("โœ… Filtering complete") - # Initialize SequenceView component - seq_view = SequenceView( - cache_id=f"seqview_{cache_id_prefix}", - sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - "id_idx": "sequence_id", - "charge": "precursor_charge", - }), - peaks_data=spectra_df.lazy(), - filters={ - "identification": "sequence_id", - "file": "file_index", - "spectrum": "scan_id", - }, - interactivity={"peak": "peak_id"}, - cache_path=str(cache_dir), - deconvolved=False, - annotation_config={ - "ion_types": ["b", "y"], - "neutral_losses": True, - "tolerance": frag_tol, - "tolerance_ppm": frag_tol_is_ppm, - }, - ) + # if not Path(filter_results[i]).exists(): + # st.error(f"IDFilter failed for {stem}") + # st.stop() - # Initialize LinePlot from SequenceView - LinePlot.from_sequence_view( - seq_view, - cache_id=f"lineplot_{cache_id_prefix}", - cache_path=str(cache_dir), - title="Annotated Spectrum", - styling={ - "unhighlightedColor": "#CCCCCC", - "highlightColor": "#E74C3C", - "selectedColor": "#F3A712", - }, - ) + # ================================ + # EasyPQP Spectral Library Generation (optional) + # ================================ + if self.params.get("generate-library", False): + self.logger.log("๐Ÿ“š Building spectral library with EasyPQP...") + st.info("Building spectral library with EasyPQP...") + library_dir = Path(self.workflow_dir, "results", "library") + library_dir.mkdir(parents=True, exist_ok=True) + + psms_files, peaks_files = [], [] + + for filter_idxml in filter_results: + original_stem = Path(filter_idxml).stem.replace("_filter", "") + matching_mzml = next((m for m in in_mzML if Path(m).stem == original_stem), None) + if not matching_mzml: + self.logger.log(f"Warning: No matching mzML found for {filter_idxml}") + continue + + # easypqp library requires specific extensions for file recognition: + # - PSM files must contain 'psmpkl' โ†’ use .psmpkl extension + # - Peak files must contain 'peakpkl' โ†’ use .peakpkl extension + # After splitext(), stem will be just "{mzML_stem}" matching PSM base_name + psms_out = str(library_dir / f"{original_stem}.psmpkl") + peaks_out = str(library_dir / f"{original_stem}.peakpkl") + + convert_cmd = [ + "easypqp", "convert", + "--pepxml", filter_idxml, + "--spectra", matching_mzml, + "--psms", psms_out, + "--peaks", peaks_out + ] + if self.executor.run_command(convert_cmd): + psms_files.append(psms_out) + peaks_files.append(peaks_out) + + if psms_files: + # easypqp library outputs TSV format (despite common .pqp extension) + library_tsv = str(library_dir / "spectral_library.tsv") + library_cmd = ["easypqp", "library", "--out", library_tsv] + + if not self.params.get("library-use-fdr", False): + # --nofdr only skips FDR recalculation, NOT threshold filtering + # Set all thresholds to 1.0 to bypass filtering for pre-filtered input + library_cmd.extend([ + "--nofdr", + "--psm_fdr_threshold", "1.0", + "--peptide_fdr_threshold", "1.0", + "--protein_fdr_threshold", "1.0" + ]) + else: + # Apply user-specified FDR filtering + library_cmd.extend([ + "--psm_fdr_threshold", + str(self.params.get("library-psm-fdr", 0.01)), + "--peptide_fdr_threshold", + str(self.params.get("library-peptide-fdr", 0.01)), + "--protein_fdr_threshold", + str(self.params.get("library-protein-fdr", 0.01)) + ]) + + for psms, peaks in zip(psms_files, peaks_files): + library_cmd.extend([psms, peaks]) + + if self.executor.run_command(library_cmd): + self.logger.log("โœ… Spectral library created") + st.success("Spectral library created") + else: + self.logger.log("Warning: Failed to build spectral library") + else: + self.logger.log("Warning: No PSMs converted for library generation") + + st.success(f"โœ“ {stem} identification completed") + + # # ================================ + # # 4๏ธโƒฃ ProteomicsLFQ (cross-sample) + # # ================================ + self.logger.log("๐Ÿ“ˆ Running cross-sample quantification...") + st.info("Running ProteomicsLFQ (cross-sample quantification)") + + quant_mztab = str(quant_dir / "openms_quant.mzTab") + quant_cxml = str(quant_dir / "openms.consensusXML") + quant_msstats = str(quant_dir / "openms_msstats.csv") + + with st.spinner("ProteomicsLFQ"): + combined_in = " ".join(in_mzML) + combined_ids = " ".join(filter_results) + self.logger.log(f"COMBINED_IN {combined_in}", 1) + self.logger.log(f"COMBINED_IN_TYPE {type(combined_in).__name__}", 1) + self.logger.log(f"FILTER_RESULTS = {filter_results}", 1) + self.logger.log(f"FILTER_RESULTS_LEN = {len(filter_results)}", 1) + + # โœ… Streamlit output (debug view) + st.markdown("### ๐Ÿ” ProteomicsLFQ Input Debug") + st.write("**combined_in:**", combined_in) + st.write("**combined_in type:**", type(combined_in).__name__) + + st.write("**combined_ids:**", combined_ids) + st.write("**combined_ids type:**", type(combined_ids).__name__) + + if not self.executor.run_topp( + "ProteomicsLFQ", + { + "in": [in_mzML], + "ids": [filter_results], + "out": [quant_mztab], + "out_cxml": [quant_cxml], + "out_msstats": [quant_msstats], + }, + { + "fasta": str(database_fasta), + "threads": 12, + # Disable FAIMS/IM handling to avoid segfault in OpenMS 3.5.0 + "PeptideQuantification:extract:IM_window": "0.0", + "PeptideQuantification:faims:merge_features": "false", + }, + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("โœ… Quantification complete") + + # if not Path(quant_mztab).exists(): + # st.error("ProteomicsLFQ failed: mzTab not created") + # st.stop() + + + # ================================ + # 5๏ธโƒฃ Final report + # # ================================ + st.success("๐ŸŽ‰ TOPP workflow completed successfully") + st.write("๐Ÿ“ Results directory:") + st.code(str(results_dir)) + + + st.write("๐Ÿ“„ Generated files:") + st.write(f"- mzTab: {quant_mztab}") + st.write(f"- consensusXML: {quant_cxml}") + st.write(f"- MSstats CSV: {quant_msstats}") + + return True + else: + self.logger.log("โš™๏ธ Running TMT workflow") - self.logger.log("โœ… Filtering complete") + results_dir = Path(self.workflow_dir, "results") + iso_dir = results_dir / "isobaric_consensusXML" + comet_dir = results_dir / "comet_results" + perc_dir = results_dir / "percolator_results" + psm_filter_dir = results_dir / "psm_filter" + map_dir = results_dir / "idmapper" + merge_dir = results_dir / "merged" + protein_dir = results_dir / "protein" + msstats_dir = results_dir / "msstats" + quant_dir = results_dir / "quant_results" + + iso_consensus = [] + comet_results = [] + percolator_results = [] + psm_filtered = [] + mapped_ids = [] + + for d in [ + iso_dir, comet_dir, perc_dir, psm_filter_dir, + map_dir, merge_dir, protein_dir, msstats_dir, quant_dir + ]: + d.mkdir(parents=True, exist_ok=True) + + for mz in in_mzML: + stem = Path(mz).stem + iso_consensus.append(str(iso_dir / f"{stem}_iso.consensusXML")) + comet_results.append(str(comet_dir / f"{stem}_comet.idXML")) + percolator_results.append(str(perc_dir / f"{stem}_comet_perc.idXML")) + psm_filtered.append(str(psm_filter_dir / f"{stem}_comet_perc_filter.idXML")) + mapped_ids.append(str(map_dir / f"{stem}_comet_perc_filter_map.consensusXML")) + + merged_id = str(merge_dir / "ID_mapper_merge.consensusXML") + protein_id = str(protein_dir / "ID_mapper_merge_epi.consensusXML") + protein_filter = str(protein_dir / "ID_mapper_merge_epi_filter.consensusXML") + protein_resolved = str(protein_dir / "ID_mapper_merge_epi_filter_resconf.consensusXML") + consensus_out = str(quant_dir / "openms_design_protein_openms.csv") + + # --- IsobaricAnalyzer --- + self.logger.log("๐Ÿท๏ธ Running isobaric labeling analysis...") + with st.spinner("IsobaricAnalyzer"): + if not self.executor.run_topp( + "IsobaricAnalyzer", + { + "in": in_mzML, + "out": iso_consensus, + }, + tool_instance_name="IsobaricAnalyzer-TMT", + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("โœ… IsobaricAnalyzer complete") + + # --- CometAdapter --- + self.logger.log("๐Ÿ”Ž Running peptide search...") + with st.spinner(f"CometAdapter ({stem})"): + comet_extra_params = {"database": str(database_fasta)} + if self.params.get("generate-decoys", True): + # Propagate decoy_string from DecoyDatabase + comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string + if not self.executor.run_topp( + "CometAdapter", + { + "in": in_mzML, + "out": comet_results, + }, + comet_extra_params, + tool_instance_name="CometAdapter-TMT", + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("โœ… CometAdapter complete") + + # Get fragment tolerance from CometAdapter parameters for visualization + comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") + frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) + frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da" + + # Build visualization cache for Comet results + results_dir_path = Path(self.workflow_dir, "results") + cache_dir = results_dir_path / "insight_cache" + cache_dir.mkdir(parents=True, exist_ok=True) + + # Get mzML directory + mzml_dir = Path(in_mzML[0]).parent + + # Build spectra cache (once, shared by all stages) + spectra_df = None + filename_to_index = {} + + for idxml_file in comet_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Build spectra cache (only once) + if spectra_df is None: + filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} + spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) - # if not Path(filter_results[i]).exists(): - # st.error(f"IDFilter failed for {stem}") - # st.stop() + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) - # ================================ - # EasyPQP Spectral Library Generation (optional) - # ================================ - if self.params.get("generate-library", False): - self.logger.log("๐Ÿ“š Building spectral library with EasyPQP...") - st.info("Building spectral library with EasyPQP...") - library_dir = Path(self.workflow_dir, "results", "library") - library_dir.mkdir(parents=True, exist_ok=True) - - psms_files, peaks_files = [], [] - - for filter_idxml in filter_results: - original_stem = Path(filter_idxml).stem.replace("_filter", "") - matching_mzml = next((m for m in in_mzML if Path(m).stem == original_stem), None) - if not matching_mzml: - self.logger.log(f"Warning: No matching mzML found for {filter_idxml}") - continue - - # easypqp library requires specific extensions for file recognition: - # - PSM files must contain 'psmpkl' โ†’ use .psmpkl extension - # - Peak files must contain 'peakpkl' โ†’ use .peakpkl extension - # After splitext(), stem will be just "{mzML_stem}" matching PSM base_name - psms_out = str(library_dir / f"{original_stem}.psmpkl") - peaks_out = str(library_dir / f"{original_stem}.peakpkl") - - convert_cmd = [ - "easypqp", "convert", - "--pepxml", filter_idxml, - "--spectra", matching_mzml, - "--psms", psms_out, - "--peaks", peaks_out - ] - if self.executor.run_command(convert_cmd): - psms_files.append(psms_out) - peaks_files.append(peaks_out) - - if psms_files: - # easypqp library outputs TSV format (despite common .pqp extension) - library_tsv = str(library_dir / "spectral_library.tsv") - library_cmd = ["easypqp", "library", "--out", library_tsv] - - if not self.params.get("library-use-fdr", False): - # --nofdr only skips FDR recalculation, NOT threshold filtering - # Set all thresholds to 1.0 to bypass filtering for pre-filtered input - library_cmd.extend([ - "--nofdr", - "--psm_fdr_threshold", "1.0", - "--peptide_fdr_threshold", "1.0", - "--protein_fdr_threshold", "1.0" - ]) - else: - # Apply user-specified FDR filtering - library_cmd.extend([ - "--psm_fdr_threshold", - str(self.params.get("library-psm-fdr", 0.01)), - "--peptide_fdr_threshold", - str(self.params.get("library-peptide-fdr", 0.01)), - "--protein_fdr_threshold", - str(self.params.get("library-protein-fdr", 0.01)) - ]) - - for psms, peaks in zip(psms_files, peaks_files): - library_cmd.extend([psms, peaks]) - - if self.executor.run_command(library_cmd): - self.logger.log("โœ… Spectral library created") - st.success("Spectral library created") - else: - self.logger.log("Warning: Failed to build spectral library") - else: - self.logger.log("Warning: No PSMs converted for library generation") + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) - st.success(f"โœ“ {stem} identification completed") + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) - # # ================================ - # # 4๏ธโƒฃ ProteomicsLFQ (cross-sample) - # # ================================ - self.logger.log("๐Ÿ“ˆ Running cross-sample quantification...") - st.info("Running ProteomicsLFQ (cross-sample quantification)") + self.logger.log("โœ… Peptide search complete") - quant_mztab = str(quant_dir / "openms_quant.mzTab") - quant_cxml = str(quant_dir / "openms.consensusXML") - quant_msstats = str(quant_dir / "openms_msstats.csv") + # --- PercolatorAdapter --- + self.logger.log("๐Ÿ“Š Running rescoring...") + with st.spinner(f"PercolatorAdapter"): + if not self.executor.run_topp( + "PercolatorAdapter", + { + "in": comet_results, + "out": percolator_results, + }, + tool_instance_name="PercolatorAdapter-TMT", + ): + self.logger.log("Workflow stopped due to error") + return False + # Build visualization cache for Percolator results + for idxml_file in percolator_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) - with st.spinner("ProteomicsLFQ"): - combined_in = " ".join(in_mzML) - combined_ids = " ".join(filter_results) - self.logger.log(f"COMBINED_IN {combined_in}", 1) - self.logger.log(f"COMBINED_IN_TYPE {type(combined_in).__name__}", 1) - self.logger.log(f"FILTER_RESULTS = {filter_results}", 1) - self.logger.log(f"FILTER_RESULTS_LEN = {len(filter_results)}", 1) + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) - # โœ… Streamlit output (debug view) - st.markdown("### ๐Ÿ” ProteomicsLFQ Input Debug") - st.write("**combined_in:**", combined_in) - st.write("**combined_in type:**", type(combined_in).__name__) + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) - st.write("**combined_ids:**", combined_ids) - st.write("**combined_ids type:**", type(combined_ids).__name__) + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) + self.logger.log("โœ… PercolatorAdapter complete") + + # --- IDFilter --- + self.logger.log("๐Ÿ”ง Filtering identifications...") + with st.spinner(f"IDFilter"): if not self.executor.run_topp( - "ProteomicsLFQ", + "IDFilter", + { + "in": percolator_results, + "out": psm_filtered, + }, + tool_instance_name="IDFilter-strict" + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("โœ… IDFilter-strict complete") + + # Build visualization cache for Filter results + for idxml_file in psm_filtered: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) + + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) + + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) + + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) + + # --- IDMapper --- + self.logger.log("๐Ÿ—บ๏ธ Mapping IDs to isobaric consensus features...") + for iso, psm, mapped in zip(iso_consensus, psm_filtered, mapped_ids): + iso_stem = Path(iso).stem + with st.spinner(f"IDMapper ({iso_stem})"): + if not self.executor.run_topp( + "IDMapper", { - "in": [in_mzML], - "ids": [filter_results], - "out": [quant_mztab], - "out_cxml": [quant_cxml], - "out_msstats": [quant_msstats], + "in": [iso], + "id": [psm], + "out": [mapped], }, - { - "fasta": str(database_fasta), - "psmFDR": 0.5, - "proteinFDR": 0.5, - "threads": 12, - # Disable FAIMS/IM handling to avoid segfault in OpenMS 3.5.0 - "PeptideQuantification:extract:IM_window": "0.0", - "PeptideQuantification:faims:merge_features": "false", - } + tool_instance_name="IDMapper-TMT", ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("โœ… IDMapper complete") + + # --- FileMerger --- + self.logger.log("๐Ÿ”— Merging mapped consensus files...") + with st.spinner("FileMerger"): + if not self.executor.run_topp( + "FileMerger", + { + "in": mapped_ids, + "out": [merged_id], + }, + tool_instance_name="FileMerger-TMT", + ): self.logger.log("Workflow stopped due to error") return False - self.logger.log("โœ… Quantification complete") - - # if not Path(quant_mztab).exists(): - # st.error("ProteomicsLFQ failed: mzTab not created") - # st.stop() + self.logger.log("โœ… FileMerger complete") + # --- ProteinInference --- + self.logger.log("๐Ÿงฉ Running protein inference...") + with st.spinner("ProteinInference"): + if not self.executor.run_topp( + "ProteinInference", + { + "in": [merged_id], + "out": [protein_id], + }, + tool_instance_name="ProteinInference-TMT", + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("โœ… ProteinInference complete") - # ================================ - # 5๏ธโƒฃ Final report - # # ================================ - st.success("๐ŸŽ‰ TOPP workflow completed successfully") - st.write("๐Ÿ“ Results directory:") - st.code(str(results_dir)) - + # --- IDFilter-lenient (Protein) --- + self.logger.log("๐Ÿ”ฌ Filtering proteins...") + with st.spinner("IDFilter (Protein)"): + if not self.executor.run_topp( + "IDFilter", + { + "in": [protein_id], + "out": [protein_filter], + }, + tool_instance_name="IDFilter-lenient" + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("โœ… IDFilter-lenient (Protein) complete") - st.write("๐Ÿ“„ Generated files:") - st.write(f"- mzTab: {quant_mztab}") - st.write(f"- consensusXML: {quant_cxml}") - st.write(f"- MSstats CSV: {quant_msstats}") + # ================================ + # โœจ NEW: 8๏ธโƒฃ IDConflictResolver (protein_filter โ†’ protein_resolved) + # ================================ + self.logger.log("โš–๏ธ Resolving ID conflicts...") + with st.spinner("IDConflictResolver"): + if not self.executor.run_topp( + "IDConflictResolver", + { + "in": [protein_filter], + "out": [protein_resolved], + }, + tool_instance_name="IDConflictResolver-TMT", + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("โœ… IDConflictResolver complete") - return True + # ================================ + # โœจ NEW: ๐Ÿ”Ÿ ProteinQuantifier (protein_resolved โ†’ consensus_out) + # ================================ + self.logger.log("๐Ÿ“ Running protein quantification...") + with st.spinner("ProteinQuantifier"): + if not self.executor.run_topp( + "ProteinQuantifier", + { + "in": [protein_resolved], + "out": [consensus_out], + }, + tool_instance_name="ProteinQuantifier-TMT", + ): + self.logger.log("Workflow stopped due to error") + return False + self.logger.log("โœ… ProteinQuantifier complete") + self.logger.log("๐Ÿ“„ Generating protein table...") + + self.logger.log("๐ŸŽ‰ WORKFLOW FINISHED") @st.fragment def results(self) -> None: diff --git a/src/common/results_helpers.py b/src/common/results_helpers.py index 0db0d07..eaf4e66 100644 --- a/src/common/results_helpers.py +++ b/src/common/results_helpers.py @@ -8,7 +8,7 @@ from scipy.stats import ttest_ind from pyopenms import IdXMLFile, MSExperiment, MzMLFile from src.workflow.ParameterManager import ParameterManager - +from statsmodels.stats.multitest import multipletests def get_workflow_dir(workspace): """Get the workflow directory path.""" @@ -197,103 +197,289 @@ def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None: workflow_dir = get_workflow_dir(Path(workspace_path)) quant_dir = workflow_dir / "results" / "quant_results" - if not quant_dir.exists(): - return None + parameter_manager = ParameterManager(workflow_dir, "TOPP Workflow") - csv_files = sorted(quant_dir.glob("*.csv")) - if not csv_files: - return None + workflow_params = parameter_manager.get_parameters_from_json() + analysis_mode = workflow_params.get("analysis-mode", "LFQ") - csv_file = csv_files[0] + if analysis_mode == "LFQ": + if not quant_dir.exists(): + return None - try: - df = pd.read_csv(csv_file) - except Exception: - return None + csv_files = sorted(quant_dir.glob("*.csv")) + if not csv_files: + return None - if df.empty: - return None + csv_file = csv_files[0] - # Get group mapping from parameters - param_manager = ParameterManager(workflow_dir) - params = param_manager.get_parameters_from_json() - group_map = { - key[11:]: value # Remove "mzML-group-" prefix - for key, value in params.items() - if key.startswith("mzML-group-") and value - } + try: + df = pd.read_csv(csv_file) + except Exception: + return None - if not group_map: - return None + if df.empty: + return None - df["Sample"] = df["Reference"].str.replace(".mzML", "", regex=False) - df["Group"] = df["Reference"].map(group_map) - df = df.dropna(subset=["Group"]) + # Get group mapping from parameters + param_manager = ParameterManager(workflow_dir) + params = param_manager.get_parameters_from_json() + group_map = { + key[11:]: value # Remove "mzML-group-" prefix + for key, value in params.items() + if key.startswith("mzML-group-") and value + } - groups = sorted(df["Group"].unique()) + if not group_map: + return None - if len(groups) < 2: - return None + df["Sample"] = df["Reference"].str.replace(".mzML", "", regex=False) + df["Group"] = df["Reference"].map(group_map) + df = df.dropna(subset=["Group"]) - group1, group2 = groups[:2] + groups = sorted(df["Group"].unique()) - # Compute statistics per protein - stats_rows = [] - for protein, protein_df in df.groupby("ProteinName"): - g1_vals = protein_df[protein_df["Group"] == group1]["Intensity"].values - g2_vals = protein_df[protein_df["Group"] == group2]["Intensity"].values + if len(groups) < 2: + return None - if len(g1_vals) < 2 or len(g2_vals) < 2: - pval = np.nan - else: - _, pval = ttest_ind(g1_vals, g2_vals, equal_var=False) - - mean_g1 = np.mean(g1_vals) if len(g1_vals) > 0 else np.nan - mean_g2 = np.mean(g2_vals) if len(g2_vals) > 0 else np.nan - - log2fc = np.log2(mean_g2 / mean_g1) if mean_g1 > 0 else np.nan - - stats_rows.append({ - "ProteinName": protein, - "log2FC": log2fc, - "p-value": pval, - }) - - stats_df = pd.DataFrame(stats_rows) - - # Order samples by group (group2 first, then group1) - sample_group_df = df[["Sample", "Group"]].drop_duplicates() - group2_samples = sample_group_df[sample_group_df["Group"] == group2]["Sample"].tolist() - group1_samples = sample_group_df[sample_group_df["Group"] == group1]["Sample"].tolist() - all_samples = group2_samples + group1_samples - - # Build pivot table - pivot_list = [] - for protein, group_df in df.groupby("ProteinName"): - peptides = ";".join(group_df["PeptideSequence"].unique()) - intensity_dict = group_df.groupby("Sample")["Intensity"].sum().to_dict() - intensity_dict_complete = { - sample: intensity_dict.get(sample, 0) - for sample in all_samples - } - row = { - "ProteinName": protein, - **intensity_dict_complete, - "PeptideSequence": peptides, - } - pivot_list.append(row) + group1, group2 = groups[:2] + + # Compute statistics per protein + stats_rows = [] + for protein, protein_df in df.groupby("ProteinName"): + g1_vals = protein_df[protein_df["Group"] == group1]["Intensity"].values + g2_vals = protein_df[protein_df["Group"] == group2]["Intensity"].values - pivot_df = pd.DataFrame(pivot_list) - pivot_df = pivot_df.merge(stats_df, on="ProteinName", how="left") - pivot_df = pivot_df[["ProteinName", "log2FC", "p-value"] + all_samples + ["PeptideSequence"]] + if len(g1_vals) < 2 or len(g2_vals) < 2: + pval = np.nan + else: + _, pval = ttest_ind(g1_vals, g2_vals, equal_var=False) - # Build expression matrix (log2-transformed) - expr_df = pivot_df.set_index("ProteinName")[all_samples] - expr_df = expr_df.replace(0, np.nan) - expr_df = np.log2(expr_df + 1) - expr_df = expr_df.dropna() + mean_g1 = np.mean(g1_vals) if len(g1_vals) > 0 else np.nan + mean_g2 = np.mean(g2_vals) if len(g2_vals) > 0 else np.nan + + log2fc = np.log2(mean_g2 / mean_g1) if mean_g1 > 0 else np.nan + + stats_rows.append({ + "ProteinName": protein, + "log2FC": log2fc, + "p-value": pval, + }) - return pivot_df, expr_df, group_map + stats_df = pd.DataFrame(stats_rows) + + if not stats_df.empty: + mask = stats_df["p-value"].notna() + if mask.any(): + _, p_adj, _, _ = multipletests(stats_df.loc[mask, "p-value"], method="fdr_bh") + stats_df.loc[mask, "p-adj"] = p_adj + else: + stats_df["p-adj"] = np.nan + + # Order samples by group (group2 first, then group1) + sample_group_df = df[["Sample", "Group"]].drop_duplicates() + group2_samples = sample_group_df[sample_group_df["Group"] == group2]["Sample"].tolist() + group1_samples = sample_group_df[sample_group_df["Group"] == group1]["Sample"].tolist() + all_samples = group2_samples + group1_samples + + # Build pivot table + pivot_list = [] + for protein, group_df in df.groupby("ProteinName"): + peptides = ";".join(group_df["PeptideSequence"].unique()) + intensity_dict = group_df.groupby("Sample")["Intensity"].sum().to_dict() + intensity_dict_complete = { + sample: intensity_dict.get(sample, 0) + for sample in all_samples + } + row = { + "ProteinName": protein, + **intensity_dict_complete, + "PeptideSequence": peptides, + } + pivot_list.append(row) + + pivot_df = pd.DataFrame(pivot_list) + pivot_df = pivot_df.merge(stats_df, on="ProteinName", how="left") + pivot_df = pivot_df[["ProteinName", "log2FC", "p-value", "p-adj"] + all_samples + ["PeptideSequence"]] + + # Build expression matrix (log2-transformed) + expr_df = pivot_df.set_index("ProteinName")[all_samples] + expr_df = expr_df.replace(0, np.nan) + expr_df = np.log2(expr_df + 1) + expr_df = expr_df.dropna() + + return pivot_df, expr_df, group_map + + else: + if not quant_dir.exists(): + return None + + csv_files = sorted(quant_dir.glob("*.csv")) + if not csv_files: + return None + + csv_file = csv_files[0] + + try: + df = pd.read_csv(csv_file, sep="\t", comment="#", engine="python") + except Exception: + return None + + if df.empty: + return None + + # ratio column removal + df = df.loc[:, ~df.columns.str.contains('ratio', case=False)] + + # exclude_indices = st.session_state.get("tmt_exclude_indices", []) + # group_map = st.session_state.get("tmt_group_map", {}) + # Get group mapping from parameters + parameter_manager = ParameterManager(Path(workflow_dir), "TOPP Workflow") + params = parameter_manager.get_parameters_from_json() + group_map = {} + for key, value in params.items(): + if key.startswith("TMT-group-") and value: + # Extract the numeric part from keys like "TMT-group-sample1" + match = re.search(r'sample(\d+)', key) + if match: + # Subtract 1 to convert to a 0-based index (0, 1, 2...). + # If your samples are already 0-based, remove the -1 adjustment. + index = str(int(match.group(1)) - 1) + group_map[index] = value + + # 1. Extract keys labeled as "skip" from group_map as integer list + exclude_indices = [ + int(k) for k, v in group_map.items() if v.lower() == "skip" + ] + + # 2. Remove "skip" entries from group_map (keep only actual group info) + group_map = { + int(k): v for k, v in group_map.items() if v.lower() != "skip" + } + + start_column_offset = 4 + + # st.write("exclude_indices:", exclude_indices) + # st.write("group_map:", group_map) + + if not group_map: + st.warning("โš ๏ธ Group mapping information is missing. Please configure sample groups in the Setup page.") + return None + + if exclude_indices: + # st.write("Current columns:", df.columns.tolist()) + # st.write("Number of columns:", len(df.columns)) + # st.write("Exclude indices:", exclude_indices) + # st.write("Offset:", start_column_offset) + cols_to_drop = [df.columns[i + start_column_offset] for i in exclude_indices] + df_cleaned = df.drop(columns=cols_to_drop) + else: + df_cleaned = df.copy() + + if group_map: + # Create new row data (defaulting to empty strings) + # Create a list with the same length as the column order of df_cleaned + new_row = [""] * len(df_cleaned.columns) + new_row[0] = "Group" + + # Get the column names of the current dataframe as a list + current_cols = df_cleaned.columns.tolist() + original_cols = df.columns.tolist() + + for col_name in current_cols[start_column_offset:]: + # Check the original index position of this column + original_idx = original_cols.index(col_name) - start_column_offset + col_pos = current_cols.index(col_name) + new_row[col_pos] = group_map.get(original_idx, "NA") + + # Insert the row at the top of the dataframe + # Create a new DF and concatenate to prepend the row to existing data + group_df = pd.DataFrame([new_row], columns=df_cleaned.columns) + df_with_groups = pd.concat([group_df, df_cleaned], ignore_index=True) + + # drop_msg = f"{len(exclude_indices)} channels dropped" if exclude_indices else "No channels dropped" + # st.success(f"โœ… {drop_msg} and Group names have been inserted at the top of the data.") + + # st.write("### Data Preview with Group Information") + # st.dataframe(df_with_groups.head(10)) + + if group_map and len(set(group_map.values())) >= 2: + # Prepare data for calculation + # Extract group information from row 0 of df_with_groups (the newly added Group row) + # Actual sample data starts from the 5th column (index 4) + group_info_row = df_with_groups.iloc[0] + + # Get unique group names (excluding NA) + unique_groups = sorted([g for g in set(group_map.values()) if g != "NA"]) + g1_name, g2_name = unique_groups[0], unique_groups[1] + + # Extract numerical data for statistical calculation (from row 1 and column index 4 onwards) + # Convert to numeric type (to prevent calculation errors) + numeric_data = df_with_groups.iloc[1:, 4:].apply(pd.to_numeric, errors='coerce') + + # Column indexing by group + # Categorize columns based on the values in the Group row + g1_cols = [col for col in numeric_data.columns if group_info_row[col] == g1_name] + g2_cols = [col for col in numeric_data.columns if group_info_row[col] == g2_name] + + # Calculate log2FC and p-value for each row + def run_stats(row): + v1 = row[g1_cols].dropna() + v2 = row[g2_cols].dropna() + + # log2FC (Group2 / Group1) + m1, m2 = v1.mean(), v2.mean() + l2fc = np.log2(m2 / m1) if m1 > 0 and m2 > 0 else np.nan + + # p-value (T-test) + if len(v1) > 1 and len(v2) > 1: + _, pval = ttest_ind(v1, v2, equal_var=False) + else: + pval = np.nan + return pd.Series([l2fc, pval]) + + stats_results = numeric_data.apply(run_stats, axis=1) + stats_results.columns = ['log2FC', 'p-value'] + # Add Adjusted p-value (FDR) calculation + if not stats_results['p-value'].isna().all(): + # Select only rows that contain p-values + mask = stats_results['p-value'].notna() + # Apply Benjamini-Hochberg (BH) correction + _, p_adj, _, _ = multipletests(stats_results.loc[mask, 'p-value'], method='fdr_bh') + stats_results.loc[mask, 'p-adj'] = p_adj + else: + stats_results['p-adj'] = np.nan + + # Construct the final dataframe (Based on df_cleaned - excluding the group row) + # Insert calculation results into the 2nd and 3rd column positions + pivot_df = df_cleaned.copy() + pivot_df.insert(1, "log2FC", stats_results['log2FC'].values) + pivot_df.insert(2, "p-value", stats_results['p-value'].values) + pivot_df.insert(3, "p-adj", stats_results['p-adj'].values) + + # st.success(f"Analysis Complete: {g1_name} (n={len(g1_cols)}) vs {g2_name} (n={len(g2_cols)})") + + # Set the first column ('protein') of final_df as the index + protein_col = pivot_df.columns[0] + sample_cols = current_cols[start_column_offset:] # Identify actual sample column names + + # Select sample columns and create a matrix + expr_df = pivot_df.set_index(protein_col)[sample_cols] + + # Replace 0 with NaN (to prevent log transformation errors) + expr_df = expr_df.replace(0, np.nan) + + # Log2 transformation (data normalization) + expr_df = np.log2(expr_df + 1) + + # Remove proteins (rows) with any missing values + expr_df = expr_df.dropna() + + return pivot_df, expr_df, group_map + else: + st.warning("โš ๏ธ At least two distinct groups are required for statistical analysis.") + else: + st.warning("โš ๏ธ No group mapping information is set. Please check the Configure page.") + return None def get_abundance_data(workspace: Path) -> tuple | None: diff --git a/src/workflow/ParameterManager.py b/src/workflow/ParameterManager.py index b0c3626..19e8700 100644 --- a/src/workflow/ParameterManager.py +++ b/src/workflow/ParameterManager.py @@ -129,6 +129,30 @@ def get_parameters_from_json(self) -> dict: except: st.error("**ERROR**: Attempting to load an invalid JSON parameter file. Reset to defaults.") return {} + + def get_merged_params(self, tool_instance_name: str, ini_params: dict = None) -> dict: + """ + Three-layer parameter merge: ini defaults < _defaults < user overrides. + + Args: + tool_instance_name: Instance name (or tool name) to look up in params.json. + ini_params: Base parameters from the .ini file. Optional โ€” callers that + don't need the ini layer (e.g., run_topp, which passes -ini separately) + can omit this. + + Returns: + Merged dict with the effective value for each parameter. + """ + params = self.get_parameters_from_json() + defaults = params.get("_defaults", {}).get(tool_instance_name, {}) + user = params.get(tool_instance_name, {}) + + merged = {} + if ini_params: + merged.update(ini_params) + merged.update(defaults) + merged.update(user) + return merged def get_topp_parameters(self, tool: str) -> dict: """ diff --git a/src/workflow/StreamlitUI.py b/src/workflow/StreamlitUI.py index 3182639..730f9b5 100644 --- a/src/workflow/StreamlitUI.py +++ b/src/workflow/StreamlitUI.py @@ -612,10 +612,12 @@ def input_TOPP( num_cols: int = 4, exclude_parameters: List[str] = [], include_parameters: List[str] = [], + flag_parameters: List[str] = [], display_tool_name: bool = True, display_subsections: bool = True, display_subsection_tabs: bool = False, custom_defaults: dict = {}, + tool_instance_name: str = None, ) -> None: """ Generates input widgets for TOPP tool parameters dynamically based on the tool's @@ -627,33 +629,59 @@ def input_TOPP( num_cols (int, optional): Number of columns to use for the layout. Defaults to 3. exclude_parameters (List[str], optional): List of parameter names to exclude from the widget. Defaults to an empty list. include_parameters (List[str], optional): List of parameter names to include in the widget. Defaults to an empty list. + flag_parameters (List[str], optional): List of parameter names that should + be treated as no-value CLI flags during command construction. display_tool_name (bool, optional): Whether to display the TOPP tool name. Defaults to True. display_subsections (bool, optional): Whether to split parameters into subsections based on the prefix. Defaults to True. display_subsection_tabs (bool, optional): Whether to display main subsections in separate tabs (if more than one main section). Defaults to False. custom_defaults (dict, optional): Dictionary of custom defaults to use. Defaults to an empty dict. + tool_instance_name (str, optional): A unique instance name for this tool + invocation. Allows multiple instances of the same TOPP tool with + independent parameters (e.g., two IDFilter calls). If not provided, + defaults to topp_tool_name. The instance name is used for session + state keys and parameter storage, while topp_tool_name is used for + the actual tool executable and ini file creation. """ + # Default instance name to the tool name when not provided + if tool_instance_name is None: + tool_instance_name = topp_tool_name + + # Register instance-name โ†’ real-tool-name mapping in session state + if "_topp_tool_instance_map" not in st.session_state: + st.session_state["_topp_tool_instance_map"] = {} + st.session_state["_topp_tool_instance_map"][tool_instance_name] = topp_tool_name + if "_topp_flag_params" not in st.session_state: + st.session_state["_topp_flag_params"] = {} + st.session_state["_topp_flag_params"][tool_instance_name] = list(flag_parameters) + # Persist flag metadata so execution still sees it outside UI reruns/session context. + params = self.parameter_manager.get_parameters_from_json() + if "_flag_params" not in params: + params["_flag_params"] = {} + params["_flag_params"][tool_instance_name] = list(flag_parameters) + with open(self.parameter_manager.params_file, "w", encoding="utf-8") as f: + json.dump(params, f, indent=4) if not display_subsections: display_subsection_tabs = False if display_subsection_tabs: display_subsections = True - # write defaults ini files + # Create pristine ini file (never mutated with custom defaults) ini_file_path = Path(self.parameter_manager.ini_dir, f"{topp_tool_name}.ini") - ini_existed = ini_file_path.exists() if not self.parameter_manager.create_ini(topp_tool_name): st.error(f"TOPP tool **'{topp_tool_name}'** not found.") return - if not ini_existed: - # update custom defaults if necessary - if custom_defaults: - param = poms.Param() - poms.ParamXMLFile().load(str(ini_file_path), param) - for key, value in custom_defaults.items(): - encoded_key = f"{topp_tool_name}:1:{key}".encode() - if encoded_key in param.keys(): - param.setValue(encoded_key, value) - poms.ParamXMLFile().store(str(ini_file_path), param) + + # Seed custom defaults into params.json under _defaults key + if custom_defaults: + params = self.parameter_manager.get_parameters_from_json() + if "_defaults" not in params: + params["_defaults"] = {} + params["_defaults"][tool_instance_name] = custom_defaults + with open(self.parameter_manager.params_file, "w", encoding="utf-8") as f: + json.dump(params, f, indent=4) + # Refresh self.params so widget resolution sees _defaults + self.params = self.parameter_manager.get_parameters_from_json() # read into Param object param = poms.Param() @@ -723,6 +751,7 @@ def _matches_parameter(pattern: str, key: bytes) -> bool: ":".join(key.decode().split(":")[:-1]) ), } + p["is_flag"] = (b"flag" in param.getTags(key)) # Parameter sections and subsections as string (e.g. "section:subsection") if display_subsections: p["sections"] = ":".join( @@ -730,18 +759,18 @@ def _matches_parameter(pattern: str, key: bytes) -> bool: ) params.append(p) - # for each parameter in params_decoded - # if a parameter with custom default value exists, use that value - # else check if the parameter is already in self.params, if yes take the value from self.params + # Build ini_params dict for three-layer merge + ini_params = {} + for p in params: + name = p["key"].decode().split(":1:")[1] + ini_params[name] = p["value"] + + # Resolve effective values: ini < _defaults < user overrides + merged = self.parameter_manager.get_merged_params(tool_instance_name, ini_params=ini_params) for p in params: name = p["key"].decode().split(":1:")[1] - if topp_tool_name in self.params: - if name in self.params[topp_tool_name]: - p["value"] = self.params[topp_tool_name][name] - elif name in custom_defaults: - p["value"] = custom_defaults[name] - elif name in custom_defaults: - p["value"] = custom_defaults[name] + if name in merged: + p["value"] = merged[name] # Ensure list parameters stay as lists after loading from JSON # (JSON may store single-item lists as strings) if p["original_is_list"] and isinstance(p["value"], str): @@ -775,7 +804,7 @@ def _matches_parameter(pattern: str, key: bytes) -> bool: # Display tool name if required if display_tool_name: - st.markdown(f"**{topp_tool_name}**") + st.markdown(f"**{tool_instance_name}**") tab_names = [k for k in param_sections.keys() if ":" not in k] tabs = None @@ -803,23 +832,53 @@ def display_TOPP_params(params: dict, num_cols): cols = st.columns(num_cols) i = 0 for p in params: - # get key and name - key = f"{self.parameter_manager.topp_param_prefix}{p['key'].decode()}" + # get key and name โ€“ use tool_instance_name in session state key + key_str = p['key'].decode() + if tool_instance_name != topp_tool_name: + key_str = key_str.replace(f"{topp_tool_name}:1:", f"{tool_instance_name}:1:", 1) + key = f"{self.parameter_manager.topp_param_prefix}{key_str}" name = p["name"] try: # sometimes strings with newline, handle as list if isinstance(p["value"], str) and "\n" in p["value"]: p["value"] = p["value"].split("\n") + # no-value CLI flag parameters should be shown as checkboxes + if p.get("is_flag", False): + flag_default = p["value"] + if isinstance(flag_default, str): + flag_default = flag_default.lower() in {"true", "1", "yes", "on"} + else: + flag_default = bool(flag_default) + # Streamlit widget keys persist in session_state and can override + # updated custom_defaults. Normalize and seed key explicitly. + if key in st.session_state: + current = st.session_state[key] + if isinstance(current, str): + st.session_state[key] = current.lower() in {"true", "1", "yes", "on"} + else: + st.session_state[key] = bool(current) + else: + st.session_state[key] = flag_default + cols[i].selectbox( + name, + options=[True, False], + index=0 if st.session_state[key] else 1, + format_func=lambda x: "True" if x else "False", + help=p["description"], + key=key, + ) # bools - if isinstance(p["value"], bool): - cols[i].markdown("##") - cols[i].checkbox( + elif isinstance(p["value"], bool): + bool_value = ( + (p["value"] == "true") + if type(p["value"]) == str + else p["value"] + ) + cols[i].selectbox( name, - value=( - (p["value"] == "true") - if type(p["value"]) == str - else p["value"] - ), + options=[True, False], + index=0 if bool_value else 1, + format_func=lambda x: "True" if x else "False", help=p["description"], key=key, ) @@ -904,7 +963,6 @@ def on_multiselect_change(dk=display_key, tk=key): cols[i].error(f"Error in parameter **{p['name']}**.") print('Error parsing "' + p["name"] + '": ' + str(e)) - for section, params in param_sections.items(): if tabs is None: show_subsection_header(section, display_subsections) From 033a2958ed83bc0e1aada71018fac2fa29da70d4 Mon Sep 17 00:00:00 2001 From: Yoo HoJun Date: Mon, 22 Jun 2026 16:54:12 +0900 Subject: [PATCH 2/2] fix: resolve merge conflicts and add tool_instance-name support to run_topp --- src/WorkflowTest.py | 574 +------------------------------- src/workflow/CommandExecutor.py | 81 +++-- 2 files changed, 58 insertions(+), 597 deletions(-) diff --git a/src/WorkflowTest.py b/src/WorkflowTest.py index 3f60e67..ebf7e65 100644 --- a/src/WorkflowTest.py +++ b/src/WorkflowTest.py @@ -148,6 +148,7 @@ def render_lfq_tabs(self): "max_variable_mods_in_peptide": 3, "minimum_peaks": 1, "clip_nterm_methionine": "true", + "variable_modifications": "Oxidation (M)\nAcetyl (Protein N-term)", "PeptideIndexing:IL_equivalent": True, "PeptideIndexing:unmatched_action": "warn", "PeptideIndexing:decoy_string": "rev_", @@ -672,7 +673,6 @@ def execution(self) -> bool: current_mode = self.params.get("analysis-mode", "LFQ") st.write(f"Current analysis mode: **{current_mode}**") -<<<<<<< HEAD if current_mode == "LFQ": self.logger.log("โš™๏ธ Running LFQ workflow") @@ -684,24 +684,11 @@ def execution(self) -> bool: perc_dir = results_dir / "percolator_results" filter_dir = results_dir / "psm_filter" quant_dir = results_dir / "quant_results" -======= - for d in [comet_dir, perc_dir, filter_dir, quant_dir]: - d.mkdir(parents=True, exist_ok=True) ->>>>>>> upstream/main results_dir = Path(self.workflow_dir, "input-files") -<<<<<<< HEAD for d in [comet_dir, perc_dir, filter_dir, quant_dir]: d.mkdir(parents=True, exist_ok=True) -======= - # # ================================ - # # 2๏ธโƒฃ File path definitions (per sample) - # # ================================ - comet_results = [] - percolator_results = [] - filter_results = [] ->>>>>>> upstream/main self.logger.log("๐Ÿ“ Output directories created") @@ -727,7 +714,6 @@ def execution(self) -> bool: self.logger.log("๐Ÿ”ฌ Starting per-sample processing...") -<<<<<<< HEAD # --- CometAdapter --- self.logger.log("๐Ÿ”Ž Running peptide search...") with st.spinner(f"CometAdapter ({stem})"): @@ -735,391 +721,6 @@ def execution(self) -> bool: if self.params.get("generate-decoys", True): # Propagate decoy_string from DecoyDatabase comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string -======= - # Get fragment tolerance from CometAdapter parameters for visualization - comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") - frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) - frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da" - - # Build visualization cache for Comet results - results_dir_path = Path(self.workflow_dir, "results") - cache_dir = results_dir_path / "insight_cache" - cache_dir.mkdir(parents=True, exist_ok=True) - - # Get mzML directory - mzml_dir = Path(in_mzML[0]).parent - - # Build spectra cache (once, shared by all stages) - spectra_df = None - filename_to_index = {} - - for idxml_file in comet_results: - idxml_path = Path(idxml_file) - cache_id_prefix = idxml_path.stem - - # Parse idXML to DataFrame - id_df, spectra_data = parse_idxml(idxml_path) - - # Build spectra cache (only once) - if spectra_df is None: - filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} - spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index) - - # Initialize Table component (caches itself) - Table( - cache_id=f"table_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - column_definitions=[ - {"field": "sequence", "title": "Sequence"}, - {"field": "charge", "title": "Z", "sorter": "number"}, - {"field": "mz", "title": "m/z", "sorter": "number"}, - {"field": "rt", "title": "RT", "sorter": "number"}, - {"field": "score", "title": "Score", "sorter": "number"}, - {"field": "protein_accession", "title": "Proteins"}, - ], - initial_sort=[{"column": "score", "dir": "asc"}], - index_field="id_idx", - ) - - # Initialize Heatmap component - Heatmap( - cache_id=f"heatmap_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - x_column="rt", - y_column="mz", - intensity_column="score", - interactivity={"identification": "id_idx"}, - ) - - # Initialize SequenceView component - seq_view = SequenceView( - cache_id=f"seqview_{cache_id_prefix}", - sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - "id_idx": "sequence_id", - "charge": "precursor_charge", - }), - peaks_data=spectra_df.lazy(), - filters={ - "identification": "sequence_id", - "file": "file_index", - "spectrum": "scan_id", - }, - interactivity={"peak": "peak_id"}, - cache_path=str(cache_dir), - deconvolved=False, - annotation_config={ - "ion_types": ["b", "y"], - "neutral_losses": True, - "tolerance": frag_tol, - "tolerance_ppm": frag_tol_is_ppm, - }, - ) - - # Initialize LinePlot from SequenceView - LinePlot.from_sequence_view( - seq_view, - cache_id=f"lineplot_{cache_id_prefix}", - cache_path=str(cache_dir), - title="Annotated Spectrum", - styling={ - "unhighlightedColor": "#CCCCCC", - "highlightColor": "#E74C3C", - "selectedColor": "#F3A712", - }, - ) - - self.logger.log("โœ… Peptide search complete") - - # --- PercolatorAdapter --- - self.logger.log("๐Ÿ“Š Running rescoring...") - with st.spinner(f"PercolatorAdapter ({stem})"): - if not self.executor.run_topp( - "PercolatorAdapter", - { - "in": comet_results, - "out": percolator_results, - }, - {"decoy_pattern": decoy_string}, # Always propagated from upstream - ): - self.logger.log("Workflow stopped due to error") - return False - - # Build visualization cache for Percolator results - for idxml_file in percolator_results: - idxml_path = Path(idxml_file) - cache_id_prefix = idxml_path.stem - - # Parse idXML to DataFrame - id_df, spectra_data = parse_idxml(idxml_path) - - # Initialize Table component (caches itself) - Table( - cache_id=f"table_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - column_definitions=[ - {"field": "sequence", "title": "Sequence"}, - {"field": "charge", "title": "Z", "sorter": "number"}, - {"field": "mz", "title": "m/z", "sorter": "number"}, - {"field": "rt", "title": "RT", "sorter": "number"}, - {"field": "score", "title": "Score", "sorter": "number"}, - {"field": "protein_accession", "title": "Proteins"}, - ], - initial_sort=[{"column": "score", "dir": "asc"}], - index_field="id_idx", - ) - - # Initialize Heatmap component - Heatmap( - cache_id=f"heatmap_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - x_column="rt", - y_column="mz", - intensity_column="score", - interactivity={"identification": "id_idx"}, - ) - - # Initialize SequenceView component - seq_view = SequenceView( - cache_id=f"seqview_{cache_id_prefix}", - sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - "id_idx": "sequence_id", - "charge": "precursor_charge", - }), - peaks_data=spectra_df.lazy(), - filters={ - "identification": "sequence_id", - "file": "file_index", - "spectrum": "scan_id", - }, - interactivity={"peak": "peak_id"}, - cache_path=str(cache_dir), - deconvolved=False, - annotation_config={ - "ion_types": ["b", "y"], - "neutral_losses": True, - "tolerance": frag_tol, - "tolerance_ppm": frag_tol_is_ppm, - }, - ) - - # Initialize LinePlot from SequenceView - LinePlot.from_sequence_view( - seq_view, - cache_id=f"lineplot_{cache_id_prefix}", - cache_path=str(cache_dir), - title="Annotated Spectrum", - styling={ - "unhighlightedColor": "#CCCCCC", - "highlightColor": "#E74C3C", - "selectedColor": "#F3A712", - }, - ) - - self.logger.log("โœ… Rescoring complete") - - # if not Path(percolator_results[i]).exists(): - # st.error(f"PercolatorAdapter failed for {stem}") - # st.stop() - - # --- IDFilter --- - self.logger.log("๐Ÿ”ง Filtering identifications...") - with st.spinner(f"IDFilter ({stem})"): - if not self.executor.run_topp( - "IDFilter", - { - "in": percolator_results, - "out": filter_results, - }, - ): - self.logger.log("Workflow stopped due to error") - return False - - # Build visualization cache for Filter results - for idxml_file in filter_results: - idxml_path = Path(idxml_file) - cache_id_prefix = idxml_path.stem - - # Parse idXML to DataFrame - id_df, spectra_data = parse_idxml(idxml_path) - - # Initialize Table component (caches itself) - Table( - cache_id=f"table_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - column_definitions=[ - {"field": "sequence", "title": "Sequence"}, - {"field": "charge", "title": "Z", "sorter": "number"}, - {"field": "mz", "title": "m/z", "sorter": "number"}, - {"field": "rt", "title": "RT", "sorter": "number"}, - {"field": "score", "title": "Score", "sorter": "number"}, - {"field": "protein_accession", "title": "Proteins"}, - ], - initial_sort=[{"column": "score", "dir": "asc"}], - index_field="id_idx", - ) - - # Initialize Heatmap component - Heatmap( - cache_id=f"heatmap_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - x_column="rt", - y_column="mz", - intensity_column="score", - interactivity={"identification": "id_idx"}, - ) - - # Initialize SequenceView component - seq_view = SequenceView( - cache_id=f"seqview_{cache_id_prefix}", - sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - "id_idx": "sequence_id", - "charge": "precursor_charge", - }), - peaks_data=spectra_df.lazy(), - filters={ - "identification": "sequence_id", - "file": "file_index", - "spectrum": "scan_id", - }, - interactivity={"peak": "peak_id"}, - cache_path=str(cache_dir), - deconvolved=False, - annotation_config={ - "ion_types": ["b", "y"], - "neutral_losses": True, - "tolerance": frag_tol, - "tolerance_ppm": frag_tol_is_ppm, - }, - ) - - # Initialize LinePlot from SequenceView - LinePlot.from_sequence_view( - seq_view, - cache_id=f"lineplot_{cache_id_prefix}", - cache_path=str(cache_dir), - title="Annotated Spectrum", - styling={ - "unhighlightedColor": "#CCCCCC", - "highlightColor": "#E74C3C", - "selectedColor": "#F3A712", - }, - ) - - self.logger.log("โœ… Filtering complete") - - # if not Path(filter_results[i]).exists(): - # st.error(f"IDFilter failed for {stem}") - # st.stop() - - # ================================ - # EasyPQP Spectral Library Generation (optional) - # ================================ - if self.params.get("generate-library", False): - self.logger.log("๐Ÿ“š Building spectral library with EasyPQP...") - st.info("Building spectral library with EasyPQP...") - library_dir = Path(self.workflow_dir, "results", "library") - library_dir.mkdir(parents=True, exist_ok=True) - - psms_files, peaks_files = [], [] - - for filter_idxml in filter_results: - original_stem = Path(filter_idxml).stem.replace("_filter", "") - matching_mzml = next((m for m in in_mzML if Path(m).stem == original_stem), None) - if not matching_mzml: - self.logger.log(f"Warning: No matching mzML found for {filter_idxml}") - continue - - # easypqp library requires specific extensions for file recognition: - # - PSM files must contain 'psmpkl' โ†’ use .psmpkl extension - # - Peak files must contain 'peakpkl' โ†’ use .peakpkl extension - # After splitext(), stem will be just "{mzML_stem}" matching PSM base_name - psms_out = str(library_dir / f"{original_stem}.psmpkl") - peaks_out = str(library_dir / f"{original_stem}.peakpkl") - - convert_cmd = [ - "easypqp", "convert", - "--pepxml", filter_idxml, - "--spectra", matching_mzml, - "--psms", psms_out, - "--peaks", peaks_out - ] - if self.executor.run_command(convert_cmd): - psms_files.append(psms_out) - peaks_files.append(peaks_out) - - if psms_files: - # easypqp library outputs TSV format (despite common .pqp extension) - library_tsv = str(library_dir / "spectral_library.tsv") - library_cmd = ["easypqp", "library", "--out", library_tsv] - - if not self.params.get("library-use-fdr", False): - # --nofdr only skips FDR recalculation, NOT threshold filtering - # Set all thresholds to 1.0 to bypass filtering for pre-filtered input - library_cmd.extend([ - "--nofdr", - "--psm_fdr_threshold", "1.0", - "--peptide_fdr_threshold", "1.0", - "--protein_fdr_threshold", "1.0" - ]) - else: - # Apply user-specified FDR filtering - library_cmd.extend([ - "--psm_fdr_threshold", - str(self.params.get("library-psm-fdr", 0.01)), - "--peptide_fdr_threshold", - str(self.params.get("library-peptide-fdr", 0.01)), - "--protein_fdr_threshold", - str(self.params.get("library-protein-fdr", 0.01)) - ]) - - for psms, peaks in zip(psms_files, peaks_files): - library_cmd.extend([psms, peaks]) - - if self.executor.run_command(library_cmd): - self.logger.log("โœ… Spectral library created") - st.success("Spectral library created") - else: - self.logger.log("Warning: Failed to build spectral library") - else: - self.logger.log("Warning: No PSMs converted for library generation") - - st.success(f"โœ“ {stem} identification completed") - - # ================================ - # 4๏ธโƒฃ ProteomicsLFQ (cross-sample) - # ================================ - self.logger.log("๐Ÿ“ˆ Running cross-sample quantification...") - st.info("Running ProteomicsLFQ (cross-sample quantification)") - - quant_mztab = str(quant_dir / "openms_quant.mzTab") - quant_cxml = str(quant_dir / "openms.consensusXML") - quant_msstats = str(quant_dir / "openms_msstats.csv") - - with st.spinner("ProteomicsLFQ"): - combined_in = " ".join(in_mzML) - combined_ids = " ".join(filter_results) - self.logger.log(f"COMBINED_IN {combined_in}", 1) - self.logger.log(f"COMBINED_IN_TYPE {type(combined_in).__name__}", 1) - self.logger.log(f"FILTER_RESULTS = {filter_results}", 1) - self.logger.log(f"FILTER_RESULTS_LEN = {len(filter_results)}", 1) - - # โœ… Streamlit output (debug view) - st.markdown("### ๐Ÿ” ProteomicsLFQ Input Debug") - st.write("**combined_in:**", combined_in) - st.write("**combined_in type:**", type(combined_in).__name__) - - st.write("**combined_ids:**", combined_ids) - st.write("**combined_ids type:**", type(combined_ids).__name__) ->>>>>>> upstream/main if not self.executor.run_topp( "CometAdapter", @@ -1132,7 +733,6 @@ def execution(self) -> bool: self.logger.log("Workflow stopped due to error") return False -<<<<<<< HEAD # Get fragment tolerance from CometAdapter parameters for visualization comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) @@ -1546,29 +1146,13 @@ def execution(self) -> bool: # st.error("ProteomicsLFQ failed: mzTab not created") # st.stop() -======= - # ====================================================== - # โš ๏ธ 5๏ธโƒฃ GO Enrichment Analysis (INLINE IN EXECUTION) - # ====================================================== - workspace_path = Path(self.workflow_dir).parent - res = get_abundance_data(workspace_path) - if res is not None: - pivot_df, _, _ = res - self.logger.log("โœ… pivot_df loaded, starting GO enrichment...") - self._run_go_enrichment(pivot_df, results_dir) - else: - st.warning("GO enrichment skipped: abundance data not available.") ->>>>>>> upstream/main - # ================================ # 5๏ธโƒฃ Final report # # ================================ st.success("๐ŸŽ‰ TOPP workflow completed successfully") st.write("๐Ÿ“ Results directory:") st.code(str(results_dir)) - -<<<<<<< HEAD - + st.write("๐Ÿ“„ Generated files:") st.write(f"- mzTab: {quant_mztab}") st.write(f"- consensusXML: {quant_cxml}") @@ -2024,160 +1608,6 @@ def execution(self) -> bool: self.logger.log("๐Ÿ“„ Generating protein table...") self.logger.log("๐ŸŽ‰ WORKFLOW FINISHED") -======= - return True - - def _run_go_enrichment(self, pivot_df: pd.DataFrame, results_dir: Path): - p_cutoff = 0.05 - fc_cutoff = 1.0 - - analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy() - - if analysis_df.empty: - st.error("No valid statistical data found for GO enrichment.") - self.logger.log("โ— analysis_df is empty") - else: - with st.spinner("Fetching GO terms from MyGene.info API..."): - mg = mygene.MyGeneInfo() - - def get_clean_uniprot(name): - parts = str(name).split("|") - return parts[1] if len(parts) >= 2 else parts[0] - - analysis_df["UniProt"] = analysis_df["ProteinName"].apply(get_clean_uniprot) - - bg_ids = analysis_df["UniProt"].dropna().astype(str).unique().tolist() - fg_ids = analysis_df[ - (analysis_df["p-value"] < p_cutoff) & - (analysis_df["log2FC"].abs() >= fc_cutoff) - ]["UniProt"].dropna().astype(str).unique().tolist() - self.logger.log("โœ… get_clean_uniprot applied") - - if len(fg_ids) < 3: - st.warning( - f"Not enough significant proteins " - f"(p < {p_cutoff}, |log2FC| โ‰ฅ {fc_cutoff}). " - f"Found: {len(fg_ids)}" - ) - self.logger.log("โ— Not enough significant proteins") - else: - res_list = mg.querymany( - bg_ids, scopes="uniprot", fields="go", as_dataframe=False - ) - res_go = pd.DataFrame(res_list) - if "notfound" in res_go.columns: - res_go = res_go[res_go["notfound"] != True] - - def extract_go_terms(go_data, go_type): - if not isinstance(go_data, dict) or go_type not in go_data: - return [] - terms = go_data[go_type] - if isinstance(terms, dict): - terms = [terms] - return list({t.get("term") for t in terms if "term" in t}) - - for go_type in ["BP", "CC", "MF"]: - res_go[f"{go_type}_terms"] = res_go["go"].apply( - lambda x: extract_go_terms(x, go_type) - ) - - annotated_ids = set(res_go["query"].astype(str)) - fg_set = annotated_ids.intersection(fg_ids) - bg_set = annotated_ids - self.logger.log(f"โœ… fg_set bg_set are set") - - def run_go(go_type): - go2fg = defaultdict(set) - go2bg = defaultdict(set) - - for _, row in res_go.iterrows(): - uid = str(row["query"]) - for term in row[f"{go_type}_terms"]: - go2bg[term].add(uid) - if uid in fg_set: - go2fg[term].add(uid) - - records = [] - N_fg = len(fg_set) - N_bg = len(bg_set) - - for term, fg_genes in go2fg.items(): - a = len(fg_genes) - if a == 0: - continue - b = N_fg - a - c = len(go2bg[term]) - a - d = N_bg - (a + b + c) - - _, p = fisher_exact([[a, b], [c, d]], alternative="greater") - records.append({ - "GO_Term": term, - "Count": a, - "GeneRatio": f"{a}/{N_fg}", - "p_value": p, - }) - - df = pd.DataFrame(records) - if df.empty: - return None, None - - df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10)) - df = df.sort_values("p_value").head(20) - - # โœ… Plotly Figure - fig = px.bar( - df, - x="-log10(p)", - y="GO_Term", - orientation="h", - title=f"GO Enrichment ({go_type})", - ) - - self.logger.log(f"โœ… Plotly Figure generated") - - fig.update_layout( - yaxis=dict(autorange="reversed"), - height=500, - margin=dict(l=10, r=10, t=40, b=10), - ) - - return fig, df - - go_results = {} - - for go_type in ["BP", "CC", "MF"]: - fig, df_go = run_go(go_type) - if fig is not None: - go_results[go_type] = { - "fig": fig, - "df": df_go - } - self.logger.log(f"โœ… go_type generated") - - go_dir = results_dir / "go-terms" - go_dir.mkdir(parents=True, exist_ok=True) - - import json - go_data = {} - - for go_type in ["BP", "CC", "MF"]: - if go_type in go_results: - fig = go_results[go_type]["fig"] - df = go_results[go_type]["df"] - - go_data[go_type] = { - "fig_json": fig.to_json(), # Figure โ†’ JSON string - "df_dict": df.to_dict(orient="records") # DataFrame โ†’ list of dicts - } - - go_json_file = go_dir / "go_results.json" - with open(go_json_file, "w") as f: - json.dump(go_data, f) - st.session_state["go_results"] = go_results - st.session_state["go_ready"] = True if go_data else False - self.logger.log("โœ… GO enrichment analysis complete") - ->>>>>>> upstream/main @st.fragment def results(self) -> None: diff --git a/src/workflow/CommandExecutor.py b/src/workflow/CommandExecutor.py index 86f265b..6a587cd 100644 --- a/src/workflow/CommandExecutor.py +++ b/src/workflow/CommandExecutor.py @@ -216,7 +216,7 @@ def read_stderr(): stdout_thread.join() stderr_thread.join() - def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}) -> bool: + def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}, tool_instance_name: str = None) -> bool: """ Constructs and executes commands for the specified tool OpenMS TOPP tool based on the given input and output configurations. Ensures that all input/output file lists @@ -234,6 +234,9 @@ def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}) -> b tool (str): The executable name or path of the tool. input_output (dict): A dictionary specifying the input/output parameter names (as key) and their corresponding file paths (as value). custom_params (dict): A dictionary of custom parameters to pass to the tool. + tool_instance_name (str, optional): A unique instance name for this tool + invocation, used for parameter lookup when multiple instances of the + same tool exist. If not provided, defaults to the tool name. Returns: bool: True if all commands succeeded, False if any failed. @@ -242,6 +245,8 @@ def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}) -> b ValueError: If the lengths of input/output file lists are inconsistent, except for single string inputs. """ + # Use tool_instance_name for parameter lookup, fall back to tool name + params_key = tool_instance_name if tool_instance_name else tool # check input: any input lists must be same length, other items can be a single string # e.g. input_mzML : [list of n mzML files], output_featureXML : [list of n featureXML files], input_database : database.tsv io_lengths = [len(v) for v in input_output.values() if len(v) > 1] @@ -261,8 +266,13 @@ def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}) -> b commands = [] - # Load parameters for non-defaults - params = self.parameter_manager.get_parameters_from_json() + # Load merged parameters (_defaults + user overrides) for this tool instance + merged_params = self.parameter_manager.get_merged_params(params_key) + flag_map = self.parameter_manager.get_parameters_from_json().get("_flag_params", {}) + if not flag_map: + flag_map = st.session_state.get("_topp_flag_params", {}) + flag_params = set(flag_map.get(params_key, [])) + # Construct commands for each process for i in range(n_processes): command = [tool] @@ -281,35 +291,56 @@ def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}) -> b # standard case, files was a list of strings, take the file name at index else: command += [value[i]] - # Add non-default TOPP tool parameters - if tool in params.keys(): - for k, v in params[tool].items(): - command += [f"-{k}"] - # Skip only empty strings (pass flag with no value) - # Note: 0 and 0.0 are valid values, so use explicit check - if v != "" and v is not None: - if isinstance(v, str) and "\n" in v: - command += v.split("\n") - else: - command += [str(v)] + # Add merged TOPP tool parameters (_defaults + user overrides) + for k, v in merged_params.items(): + if k in flag_params: + # CLI flag: include "-k" only when enabled + if isinstance(v, str): + is_enabled = v.lower() in {"true", "1", "yes", "on"} + else: + is_enabled = bool(v) + if is_enabled: + command += [f"-{k}"] + continue + # For non-flag parameters, skip entirely if empty. + # Note: 0 and 0.0 are valid values, so use explicit checks. + if v == "" or v is None: + continue + command += [f"-{k}"] + if isinstance(v, str) and "\n" in v: + command += v.split("\n") + elif isinstance(v, bool): + command += [str(v).lower()] + else: + command += [str(v)] # Add custom parameters for k, v in custom_params.items(): - command += [f"-{k}"] - # Skip only empty strings (pass flag with no value) - # Note: 0 and 0.0 are valid values, so use explicit check - if v != "" and v is not None: - if isinstance(v, list): - command += [str(x) for x in v] + if k in flag_params: + if isinstance(v, str): + is_enabled = v.lower() in {"true", "1", "yes", "on"} else: - command += [str(v)] + is_enabled = bool(v) + if is_enabled: + command += [f"-{k}"] + continue + if v == "" or v is None: + continue + command += [f"-{k}"] + if isinstance(v, list): + command += [str(x) for x in v] + elif isinstance(v, bool): + command += [str(v).lower()] + else: + command += [str(v)] # Add threads parameter for TOPP tools command += ["-threads", str(threads_per_command)] commands.append(command) - # check if a ini file has been written, if yes use it (contains custom defaults) - ini_path = Path(self.parameter_manager.ini_dir, tool + ".ini") - if ini_path.exists(): - command += ["-ini", str(ini_path)] + for idx, cmd in enumerate(commands): + # Print list-form command joined into a single string for readability + print(f" ๐Ÿ”น Command {idx + 1}: {' '.join(cmd)}") + print("==========================================================\n") + # Run command(s) if len(commands) == 1: