diff --git a/README.md b/README.md index 6ad7d84..e175d28 100644 --- a/README.md +++ b/README.md @@ -17,17 +17,18 @@ Scalable Ancestry Predictions from Genomic Data 3. [Installation](#install) 4. [Dependencies](#dependencies) 5. [Usage](#usage) -6. [Human ancestry predictions](#data) -7. [Demo](#demo) -8. [Documentation](#docs) -9. [Citing](#citing) -10. [License](#license) +6. [Interactive LAI HTML visualization](#lai) +7. [Human ancestry predictions](#data) +8. [Demo](#demo) +9. [Documentation](#docs) +10. [Citing](#citing) +11. [License](#license) ## Credit Written by René L Warren and Lauren Coombe ## Description -ntRoot is a framework for ancestry inference from genomic data, offering both Local Ancestry Inference (LAI) and Global Ancestry Inference (GAI). Leveraging integrated variant call sets from the 1000 Genomes Project (1kGP), ntRoot provides accurate predictions(1) of human super-population ancestry with speed and efficiency from Whole Genome Sequencing (WGS) datasets and complete or draft-stage Whole Genome Assemblies (WGA). Through streamlined processing and flexible genomic input, ntRoot holds promises for human ancestry inference of small-to-large patient/individual cohorts, enabling association studies with demographics and facilitating deeper insights into population genetics and disease risk factors. +ntRoot is a framework for ancestry inference from genomic data, offering both Local Ancestry Inference (LAI) and Global Ancestry Inference (GAI). Leveraging integrated variant call sets from the 1000 Genomes Project (1kGP), ntRoot provides accurate predictions(1) of human super-population ancestry with speed and efficiency from Whole Genome Sequencing (WGS) datasets and complete or draft-stage Whole Genome Assemblies (WGA). Through streamlined processing, flexible genomic input, and integrated local ancestry visualization, ntRoot holds promises for human ancestry inference of small-to-large patient/individual cohorts, enabling association studies with demographics and facilitating deeper insights into population genetics and disease risk factors. (1) Tested on base-accurate quality data, including Illumina short read and newer nanopore (ONT, KitV14) & PacBio CCS HiFi long read datasets, complete reference genomes and polished, Oxford Nanopore Technology long read GoldRush, Flye and Shasta draft genome assemblies @@ -93,6 +94,27 @@ Note: please specify --reads OR --genome (not both) If you have any questions about ntRoot, please open an issue at https://github.com/birollab/ntRoot ``` +## Interactive Local Ancestry (LAI) visualization + +When `--lai` is specified, ntRoot automatically generates an interactive HTML visualization of chromosomal local ancestry assignments as part of the Snakemake workflow. + +The visualization enables interactive exploration of chromosome-specific and genome-wide ancestry patterns through linked chromosome, ancestry, and summary views. + +The resulting visualization provides: + +- Chromosome-length-scaled local ancestry tracks +- Global ancestry composition summaries +- Interactive exploration of chromosome-specific ancestry patterns +- Interactive exploration of genome-wide ancestry patterns +- Hover and click highlighting across linked views +- Standalone HTML output viewable in any modern web browser + +Users can interactively explore ancestry patterns within individual chromosomes or across the entire genome using the generated standalone HTML report. + +An example visualization is available here: +[Interactive LAI demo (HTML)](ntroot-lai-interactive_tile5000000.html) + + ## Human ancestry predictions Using the 1kGP integrated variant call set. @@ -163,6 +185,9 @@ cd demo ``` Ensure that the ntRoot installation is available on your PATH. +For an example interactive Local Ancestry Inference (LAI) report generated by ntRoot, see: +[Interactive LAI demo (HTML)](ntroot-lai-interactive_tile5000000.html) + ## Documentation diff --git a/ntroot-lai-interactive_tile5000000.html b/ntroot-lai-interactive_tile5000000.html new file mode 100644 index 0000000..da38488 --- /dev/null +++ b/ntroot-lai-interactive_tile5000000.html @@ -0,0 +1,4014 @@ + + + +
+
+ + + + + \ No newline at end of file diff --git a/ntroot_run_pipeline.smk b/ntroot_run_pipeline.smk index 85ed11a..66afcb7 100644 --- a/ntroot_run_pipeline.smk +++ b/ntroot_run_pipeline.smk @@ -202,13 +202,25 @@ rule ancestry_prediction_lai: vcf = "{vcf}", ref_fai = f"{draft_base}.fai" output: - lai_output = "{vcf}_ancestry-predictions-tile-resolution_tile{tile_size}.tsv" + lai_output = "{vcf}_ancestry-predictions-tile-resolution_tile{tile_size}.tsv", + html_output = "{vcf}_ntroot-lai-interactive_tile{tile_size}.html" params: benchmark = f"{time_command} ancestry_prediction_tile{tile_size}.time" if input_vcf_basename else f"{time_command} ancestry_prediction_k{k}_tile{tile_size}.time", tile_size = tile_size, verbosity = v shell: - "{params.benchmark} ntRootAncestryPredictor.pl -f {input.vcf} -t {params.tile_size} -v {params.verbosity} -r 1 -i {input.ref_fai}" + """ + {params.benchmark} ntRootAncestryPredictor.pl \ + -f {input.vcf} \ + -t {params.tile_size} \ + -v {params.verbosity} \ + -r 1 \ + -i {input.ref_fai} + + plot_ntroot_lai.py \ + {output.lai_output} \ + {output.html_output} + """ rule sort_vcf_input: input: vcf = f"{input_vcf}" @@ -250,4 +262,4 @@ rule cross_reference_vcf: prefix=f"{input_vcf_basename}.cross-ref", strip = "--strip" if strip_info else "" shell: - "{params.benchmark} ntroot_cross_reference_vcf.py -b {input.bedtools} --vcf {input.vcf} --vcf_l {input.ref_vars} -p {params.prefix} {params.strip}" \ No newline at end of file + "{params.benchmark} ntroot_cross_reference_vcf.py -b {input.bedtools} --vcf {input.vcf} --vcf_l {input.ref_vars} -p {params.prefix} {params.strip}" diff --git a/plot_ntroot_lai.py b/plot_ntroot_lai.py new file mode 100755 index 0000000..b6a889a --- /dev/null +++ b/plot_ntroot_lai.py @@ -0,0 +1,668 @@ +#!/usr/bin/env python3 +""" +plot_ntroot_lai.py +RLW 06/2026 +""" + +import pandas as pd +import numpy as np +import plotly.graph_objects as go +import plotly.express as px +import os +import argparse + + +# Define parameters / assumes GRCh38 +CHR_LENGTHS = { + "1":248956422,"2":242193529,"3":198295559,"4":190214555, + "5":181538259,"6":170805979,"7":159345973,"8":145138636, + "9":138394717,"10":133797422,"11":135086622,"12":133275309, + "13":114364328,"14":107043718,"15":101991189,"16":90338345, + "17":83257441,"18":80373285,"19":58617616,"20":64444167, + "21":46709983,"22":50818468 +} + +# Define colours +COLOR_MAP = { + "AFR": "orange", + "EUR": "green", + "EAS": "red", + "AMR": "black", + "SAS": "blue", + "CAS": "violet", + "OCE": "lightgreen", + "NA": "#c000c0", + "REF": "#d3d3d3" +} + +# Define cluster colors +UNSUPERVISED_PALETTE = [ + "#4E79A7", # blue + "#F28E2B", # orange + "#E15759", # red + "#76B7B2", # teal + "#59A14F", # green + "#EDC948", # yellow + "#B07AA1", # purple + "#FF9DA7", # pink + "#9C755F", # brown + "#BAB0AC", # gray + "#8dd3c7", # pastels V + "#ffffb3", + "#bebada", + "#fb8072", + "#80b1d3", + "#fdb462", + "#b3de69", + "#fccde5", + "#d9d9d9", + "#bc80bd" +] + +ANCESTRY_LABELS = { + "AFR": "African", + "AMR": "Admixed American", + "EUR": "European", + "EAS": "East Asian", + "CAS": "Central Asian", + "OCE": "Oceania", + "NA": "Not available", + "SAS": "South Asian" +} + +# ---------------------------- +def get_cluster_color(cluster_id, unique_clusters): + """ + Define unsupervised color + """ + palette = UNSUPERVISED_PALETTE + #idx = list(unique_clusters).index(cluster_id) + cluster_to_idx = {c: i for i, c in enumerate(unique_clusters)} + idx = cluster_to_idx[cluster_id] + return palette[idx % len(palette)] + +# ---------------------------- +def build_dynamic_ancestry_colors(ancestries, color_map, palette): + """ + Define dynamic colour + """ + dynamic = {} + used = set(color_map.keys()) + + # stable ordering prevents nondeterministic coloring + unknowns = sorted([a for a in ancestries if a not in used]) + + for i, anc in enumerate(unknowns): + dynamic[anc] = palette[i % len(palette)] + + return dynamic + +# ----------------------------- +def parse_args(): + """ + Parse command-line arguments. + """ + + parser = argparse.ArgumentParser( + description="Generate an interactive Local Ancestry (LAI) visualization." + ) + + parser.add_argument( + "input_tsv", + help="Tile-resolution ancestry prediction TSV generated by ntRoot/korus." + ) + + parser.add_argument( + "output_html", + help="Output interactive HTML visualization." + ) + + parser.add_argument( + "predictor", + nargs="?", + default="ntRoot", + help="Predictor name displayed in the plot title (default: ntRoot)." + ) + + return parser.parse_args() + +# ----------------------------- +def extract_sample_name(path): + """ + Extract a sample identifier from an ntRoot/korus LAI TSV filename. + + The sample name is assumed to be the leading token before the first + underscore ('_'). If no underscore exists, the leading token before + the first period ('.') is used. This value is displayed beneath the + plot title as a subtitle. + + Examples + -------- + HG002_ntedit_k50_variants.vcf_ancestry.tsv -> HG002 + sample123.tsv -> sample123 + """ + + name = os.path.basename(path) + + if "_" in name: + return name.split("_", 1)[0] + + if "." in name: + return name.split(".", 1)[0] + + return name + +# ----------------------------- +def load_data(path): + """ + Load and validate an ntRoot/korus LAI tile-resolution TSV file. + + The input table is expected to contain: + - chrom + - start + - end + - ancestry_prediction + + Chromosome identifiers are converted to strings and restricted to + autosomes present in CHR_LENGTHS. Coordinate columns are converted + to numeric types to support downstream angular calculations. + + Returns + ------- + pandas.DataFrame + Cleaned dataframe ready for visualization. + """ + + df = pd.read_csv(path, sep="\t") + df["chrom"] = df["chrom"].astype(str) + df = df[df["chrom"].isin(CHR_LENGTHS.keys())].copy() + df["start"] = pd.to_numeric(df["start"]) + df["end"] = pd.to_numeric(df["end"]) + return df + +# ----------------------------- +def build_angles(): + """ + Calculate chromosome angular coordinates for the circular genome plot. + + Chromosomes are allocated angular spans proportional to their GRCh38 + lengths. A fixed inter-chromosomal gap is inserted between adjacent + chromosomes. Chromosome 1 begins at 0 degrees ("true north"), with + the final rotation applied later during Plotly layout configuration. + + Returns + ------- + dict + Chromosome -> (start_angle, end_angle) + """ + + total = sum(CHR_LENGTHS.values()) + gap = 2.0 + usable = 360 - gap * len(CHR_LENGTHS) + + angles = {} + current = 0.0 # chr1 starts at 0 + + for c, L in CHR_LENGTHS.items(): + span = usable * L / total + angles[c] = (current, current + span) + current += span + gap + + return angles + +# ----------------------------- +def build(df, out_html, input_tsv, predictor): + """ + Construct the interactive ntRoot/korus Local Ancestry (LAI) visualization. + + The figure contains four major components: + + 1. Chromosome backbone + Circular ideogram-like representation of chromosomes with lengths + scaled according to GRCh38 chromosome sizes. + + 2. Local ancestry tracks + Tile-resolution ancestry assignments displayed as colored radial + segments positioned according to genomic coordinates. + + 3. Global ancestry summary + Central donut chart summarizing ancestry proportions across all + analyzed genomic tiles. + + 4. Interactive chromosome labels + Hoverable chromosome labels that dynamically highlight ancestry + assignments from a selected chromosome. + + Additional JavaScript is injected into the exported HTML to support + interactive highlighting between ancestry tracks, chromosome labels, + and donut-chart ancestry categories. + + Parameters + ---------- + df : pandas.DataFrame + Parsed ntRoot/korus LAI tile-resolution table. + + out_html : str + Output HTML file path. + + input_tsv : str + Original TSV filename used to derive the displayed sample name. + """ + + sample_name = extract_sample_name(input_tsv) + + ancestries = sorted(df["ancestry_prediction"].dropna().unique()) + + dynamic_color_map = build_dynamic_ancestry_colors( + ancestries, + COLOR_MAP, + UNSUPERVISED_PALETTE + ) + + def anc_color(anc): + if anc in COLOR_MAP: + return COLOR_MAP[anc] + return dynamic_color_map.get(anc, "#999999") + + angles = build_angles() + + fig = go.Figure() + + # ============================= + # BACKBONE CHROMOSOMES + # ============================= + for c in CHR_LENGTHS: + + # ----------------------------- + # CHROMOSOME BOUNDARY SEPARATORS (dark radial lines) + # ----------------------------- + a0, _ = angles[c] + + fig.add_trace( + go.Scatterpolar( + #r=[0.01, 1.05], + r=[0.2, 1.05], + theta=[a0, a0], + mode="lines", + line=dict(color="rgba(80,80,80,0.35)", width=1), + hoverinfo="skip", + showlegend=False + ) + ) + + a0, a1 = angles[c] + + sub = df[df["chrom"] == c] + summary_raw = sub.groupby("ancestry_prediction")["end"].count() + summary_pct = (summary_raw / summary_raw.sum() * 100).sort_values(ascending=False) + + hover_txt = ( + f"chromosome{c}
" + f"{CHR_LENGTHS[c]/1e6:.1f} Mb

" + + "
".join([f"{k}: {v:.1f}%" for k, v in summary_pct.items()]) + ) + + fig.add_trace( + go.Barpolar( + r=[1.0], + #r=[0.2,1.0], + theta=[(a0 + a1) / 2], + width=[a1 - a0], + base=0.0, + marker=dict(color="#e0e0e0", line=dict(color="white", width=1)), + hoverinfo="skip", + #hovertemplate=hover_txt + "", + showlegend=False, + meta=f"BACKBONE|{c}" + ) + ) + + for anc in ancestries: + + if anc in ANCESTRY_LABELS: + legend_name = f"{ANCESTRY_LABELS[anc]} ({anc})" + else: + legend_name = anc + + fig.add_trace( + go.Scatterpolar( + r=[None], + theta=[None], + mode="markers", + marker=dict(color=anc_color(anc), size=12), + name=legend_name, + legendgroup=anc, + showlegend=True + ) + ) + + # ============================= + # LAI TRACKS + # ============================= + for anc in ancestries: + for c in CHR_LENGTHS: + + sub = df[ + (df["ancestry_prediction"] == anc) & + (df["chrom"] == c) + ] + + t, w, custom = [], [], [] + + for _, r in sub.iterrows(): + c = str(r["chrom"]) + a0, a1 = angles[c] + span = a1 - a0 + clen = CHR_LENGTHS[c] + + t0 = a0 + span * (r["start"] / clen) + t1 = a0 + span * (r["end"] / clen) + + t.append((t0 + t1) / 2) + w.append(t1 - t0) + custom.append([c, int(r["start"]), int(r["end"]), ANCESTRY_LABELS.get(anc, anc)]) + + fig.add_trace( + go.Barpolar( + r=[0.45]*len(t), + theta=t, + width=w, + base=0.55, + marker=dict( + color=anc_color(anc), + line=dict(width=0) + ), + legendgroup=anc, + showlegend=False, + customdata=custom, + hovertemplate= + "Chromosome %{customdata[0]}
" + "%{customdata[1]:,} - %{customdata[2]:,}
" + "%{customdata[3]}", + meta=f"{anc}|{c}" + ) + ) + + # ============================= + # PIE + # ============================= + df["tile_len"] = df["end"] - df["start"] + 1 + frac_bp = df.groupby("ancestry_prediction")["tile_len"].sum() + frac_mb = (frac_bp / 1e6).round(1) + frac_pct = frac_bp / frac_bp.sum() * 100 + + pie_text = [] + + for anc in frac_mb.index: + pct = frac_pct[anc] + + if pct >= 10: + pie_text.append(f"{anc}
{pct:.1f}%") + else: + pie_text.append("") + + fig.add_trace( + go.Pie( + labels=list(frac_mb.index), + values=list(frac_mb.values), + hole=0.55, + sort=False, + marker=dict(colors=[anc_color(a) for a in frac_mb.index]), + text=pie_text, + textinfo="text", + textposition="inside", + insidetextorientation="horizontal", + textfont=dict(size=11, color="white"), + showlegend=False, + domain=dict(x=[0.35, 0.65], y=[0.35, 0.65]), + customdata=[ANCESTRY_LABELS.get(a, a) for a in frac_mb.index], + hovertemplate="%{customdata}
%{percent:.1%}
%{value} Mb", + ) + ) + # ============================= + # CHR LABELS + # ============================= + label_theta = [] + label_text = [] + hover_texts = [] + + for c in CHR_LENGTHS: + a0, a1 = angles[c] + + sub = df[df["chrom"] == c] + summary_raw = sub.groupby("ancestry_prediction")["end"].count() + summary_pct = (summary_raw / summary_raw.sum() * 100).sort_values(ascending=False) + + hover_txt = ( + f"{CHR_LENGTHS[c]/1e6:.1f} Mb

" + + "
".join([f"{k}: {v:.1f}%" for k, v in summary_pct.items()]) + ) + + label_theta.append((a0 + a1) / 2) + label_text.append(f"{c}") + hover_texts.append(hover_txt) + + fig.add_trace( + go.Scatterpolar( + r=[1.12]*len(label_theta), + theta=label_theta, + mode="text", + text=label_text, + customdata=[ + [c, hover_texts[i]] for i, c in enumerate(CHR_LENGTHS) + ], + hovertemplate= + "Chromosome %{customdata[0]}
" + "%{customdata[1]}", + showlegend=False, + textfont=dict(size=14), + meta="CHR_LABELS" + ) + ) + + # ============================= + # LAYOUT + # ============================= + fig.update_layout( + + title=dict( + text=f"{predictor} — Chromosomal Local Ancestry (LAI)
{sample_name}", + x=0.41, + xanchor="center", + font=dict(size=20) + ), + legend=dict( + x=1.3, + y=1.0, + xanchor="right", + yanchor="top", + font=dict(size=14) + ), + polar=dict( + bgcolor="white", + radialaxis=dict(visible=False, range=[0, 1.2]), + angularaxis=dict( + rotation=90, # chr1 at top + direction="clockwise", + showgrid=False, + showticklabels=False + ) + ), + margin=dict(l=40, r=180, t=60, b=40), + height=750, #820, + width=950#1050 + ) + + # ============================= + # LINKING (LAI ↔ PIE ↔ BACKBONE) + # ============================= + js = """ + +""" + + html = fig.to_html(full_html=True, include_plotlyjs=True) + html = html.replace("", js + "\n") + + with open(out_html, "w") as f: + f.write(html) + +# ----------------------------- +def main(): + """ + Command-line entry point. + + Expected arguments + ------------------ + input.tsv + Tile-resolution local ancestry prediction file. + + output.html + Interactive Plotly visualization. + + [predictor] + Optional predictor name displayed in the title + (default: ntRoot). + + The TSV is loaded, converted into an interactive circular LAI plot, + and written as a standalone HTML document. + """ + + args = parse_args() + + df = load_data(args.input_tsv) + + build( + df, + args.output_html, + args.input_tsv, + args.predictor + ) + +if __name__ == "__main__": + main()