Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 55 additions & 24 deletions app/components/UserDataUpload.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"use client";

import { useState, useRef, createContext, useContext, useCallback, useEffect } from "react";
import pako from "pako";
import { GenotypeData, detectAndParseGenotypeFile, validateFileSize, validateFileFormat } from "@/lib/genotype-parser";
import { calculateFileHash } from "@/lib/file-hash";
import {
Expand Down Expand Up @@ -35,6 +36,55 @@ type GenotypeContextType = {

const GenotypeContext = createContext<GenotypeContextType | null>(null);

/** Upload size cap, in megabytes, applied to files that are not gzip-compressed. */
const MAX_TEXT_FILE_SIZE_MB = 100;
/** Upload size cap, in megabytes, applied to `.gz` files (measured before decompression). */
const MAX_GZIP_FILE_SIZE_MB = 150;
/** Size cap, in megabytes, on the output of decompressing a `.gz` upload. */
const MAX_DECOMPRESSED_FILE_SIZE_MB = 250;

/**
 * Selects the upload size limit that applies to a file.
 *
 * @param file - The file being uploaded. Not consulted at present; the decision
 *   is made from the extension alone.
 * @param fileExtension - The file's extension without a leading dot (compared
 *   case-sensitively against "gz").
 * @returns The gzip limit when the extension is exactly "gz", otherwise the
 *   plain-text limit. Both are expressed in megabytes.
 */
function getUploadSizeLimitMB(file: File, fileExtension: string): number {
  if (fileExtension === "gz") {
    return MAX_GZIP_FILE_SIZE_MB;
  }
  return MAX_TEXT_FILE_SIZE_MB;
}

/**
 * Tries to salvage a gzip payload that fails to decompress because extra bytes
 * follow the compressed stream.
 *
 * Bytes are dropped from the end one at a time — at most 16 KiB in total, and
 * never leaving fewer than 18 bytes — and the first (least-trimmed) prefix that
 * `pako.ungzip` accepts is returned. Note that every attempt performs a complete
 * decompression whose output is discarded, so the worst case is thousands of
 * `ungzip` calls on the input.
 *
 * @param bytes - Raw bytes of the gzip file.
 * @returns A view over the trimmed prefix that decompresses successfully, or
 *   `bytes` itself when no candidate within the trim window works.
 */
function trimTrailingGarbageFromGzip(bytes: Uint8Array): Uint8Array {
  const MIN_GZIP_BYTES = 18;
  const TRIM_WINDOW_BYTES = 16 * 1024;

  // Smallest prefix length we are willing to try.
  const shortestEnd = Math.max(bytes.length - TRIM_WINDOW_BYTES, MIN_GZIP_BYTES);

  for (let end = bytes.length - 1; end >= shortestEnd; end -= 1) {
    const candidate = bytes.subarray(0, end);
    try {
      pako.ungzip(candidate);
      return candidate;
    } catch {
      // This prefix is still not a valid gzip stream; drop one more byte.
    }
  }

  return bytes;
}

/**
 * Reads an uploaded file as text, transparently gunzipping `.gz` uploads.
 *
 * For any extension other than "gz" the file's text is returned as-is. For "gz"
 * the bytes are decompressed with `pako.ungzip`; if that throws, a second attempt
 * is made after `trimTrailingGarbageFromGzip` has stripped trailing bytes. The
 * decompressed size limit is enforced only after decompression has completed.
 *
 * @param file - The uploaded file.
 * @param fileExtension - The file's extension without a leading dot.
 * @returns The file content decoded by a default `TextDecoder`.
 * @throws Error when the decompressed payload exceeds
 *   `MAX_DECOMPRESSED_FILE_SIZE_MB`, or when decompression fails on both attempts.
 */
async function readTextFromFile(file: File, fileExtension: string): Promise<string> {
  if (fileExtension !== "gz") {
    return file.text();
  }

  const compressed = new Uint8Array(await file.arrayBuffer());

  let inflated: Uint8Array;
  try {
    inflated = pako.ungzip(compressed);
  } catch {
    // Retry once with trailing bytes removed; a second failure propagates.
    inflated = pako.ungzip(trimTrailingGarbageFromGzip(compressed));
  }

  const maxInflatedBytes = MAX_DECOMPRESSED_FILE_SIZE_MB * 1024 * 1024;
  if (inflated.byteLength > maxInflatedBytes) {
    throw new Error(`Expanded file is too large. Maximum decompressed size is ${MAX_DECOMPRESSED_FILE_SIZE_MB}MB.`);
  }

  return new TextDecoder().decode(inflated);
}

export function GenotypeProvider({ children }: { children: React.ReactNode }) {
const [genotypeData, setGenotypeData] = useState<Map<string, string> | null>(null);
const [isLoading, setIsLoading] = useState(false);
Expand All @@ -55,36 +105,17 @@ export function GenotypeProvider({ children }: { children: React.ReactNode }) {
trackGenotypeFileUploadStarted(source);

try {
if (!validateFileSize(file, 50)) {
throw new Error('File too large. Maximum size is 50MB.');
const maxSizeMB = getUploadSizeLimitMB(file, fileExtension);
if (!validateFileSize(file, maxSizeMB)) {
const sizeKind = fileExtension === "gz" ? "compressed" : "raw";
throw new Error(`File too large. Maximum ${sizeKind} size is ${maxSizeMB}MB.`);
}

if (!validateFileFormat(file)) {
throw new Error('Unsupported file type. Please upload a .txt, .tsv, .csv, or .gz file exported from 23andMe, AncestryDNA, MyHeritage, FTDNA, LivingDNA, or a compatible provider.');
}

let fileContent: string;
if (fileExtension === 'gz') {
const buffer = await file.arrayBuffer();
const ds = new DecompressionStream('gzip');
const writer = ds.writable.getWriter();
writer.write(buffer);
writer.close();
const chunks: Uint8Array[] = [];
const reader = ds.readable.getReader();
while (true) {
const { value, done } = await reader.read();
if (done) break;
if (value) chunks.push(value);
}
const total = chunks.reduce((n, c) => n + c.length, 0);
const combined = new Uint8Array(total);
let offset = 0;
for (const chunk of chunks) { combined.set(chunk, offset); offset += chunk.length; }
fileContent = new TextDecoder().decode(combined);
} else {
fileContent = await file.text();
}
const fileContent = await readTextFromFile(file, fileExtension);
const hash = calculateFileHash(fileContent);

trackGenotypeParseStarted(source, fileExtension);
Expand Down
180 changes: 163 additions & 17 deletions lib/genotype-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export type ParseResult = {
error?: string;
totalVariants?: number;
validVariants?: number;
detectedFormat?: 'monadic' | '23andme' | 'ancestrydna';
detectedFormat?: 'monadic' | '23andme' | 'ancestrydna' | 'myheritage' | 'ftdna' | 'livingdna' | 'mapmygenome';
};

// Chromosome 26 = mitochondrial in AncestryDNA exports.
Expand All @@ -28,13 +28,35 @@ function splitLines(content: string): string[] {
}

/**
 * Trims surrounding whitespace and removes every double-quote character.
 *
 * DNA field values (rsid, chromosome, position, allele) never contain literal
 * quotes, so all of them are dropped rather than only a wrapping pair.
 * Whitespace that was inside the quotes is kept.
 */
function stripQuotes(value: string): string {
  const trimmed = value.trim();
  return trimmed.replace(/"/g, '');
}

/**
 * Normalizes raw file text before parsing.
 *
 * Applied in this order: strip a leading byte-order mark, delete every NUL
 * character, then convert CRLF and lone CR line endings to LF.
 */
function preprocessContent(content: string): string {
  const withoutBom = content.replace(/^\uFEFF/, '');
  const withoutNuls = withoutBom.replace(/\u0000/g, '');
  const unixLineEndings = withoutNuls.replace(/\r\n?/g, '\n');
  return unixLineEndings;
}

/**
 * Guesses which provider produced a file by looking for brand names in its
 * first 4000 characters (case-insensitive).
 *
 * Providers are checked in a fixed priority order and the first match wins, so
 * a preview mentioning several brands resolves to the earliest entry below.
 *
 * @returns The matched provider key, or `undefined` when no marker is present.
 */
function inferProvider(content: string): ParseResult['detectedFormat'] {
  const head = content.slice(0, 4000).toLowerCase();

  // Order is significant: earlier rows take precedence over later ones.
  const signatures: Array<[NonNullable<ParseResult['detectedFormat']>, string[]]> = [
    ['livingdna', ['living dna']],
    ['mapmygenome', ['mapmygenome']],
    ['myheritage', ['myheritage']],
    ['ftdna', ['familytreedna', 'family tree dna', 'famfinder']],
    ['ancestrydna', ['ancestrydna', 'ancestry.com']],
    ['23andme', ['23andme']],
    ['monadic', ['monadic dna']],
  ];

  for (const [provider, markers] of signatures) {
    if (markers.some(marker => head.includes(marker))) {
      return provider;
    }
  }

  return undefined;
}

export function parse23andMeFile(content: string): ParseResult {
try {
const lines = splitLines(content);
const normalizedContent = preprocessContent(content);
const lines = splitLines(normalizedContent);
const genotypeData: GenotypeData[] = [];
let totalVariants = 0;
let validVariants = 0;
Expand Down Expand Up @@ -86,15 +108,22 @@ export function parse23andMeFile(content: string): ParseResult {
};
}

return { success: true, data: genotypeData, totalVariants, validVariants, detectedFormat: '23andme' };
return {
success: true,
data: genotypeData,
totalVariants,
validVariants,
detectedFormat: inferProvider(normalizedContent) || '23andme',
};
} catch (error) {
return { success: false, error: `Failed to parse file: ${error instanceof Error ? error.message : 'Unknown error'}` };
}
}

export function parseMonadicDNAFile(content: string): ParseResult {
try {
const lines = splitLines(content);
const normalizedContent = preprocessContent(content);
const lines = splitLines(normalizedContent);
const genotypeData: GenotypeData[] = [];
let totalVariants = 0;
let validVariants = 0;
Expand Down Expand Up @@ -160,15 +189,22 @@ export function parseMonadicDNAFile(content: string): ParseResult {
return { success: false, error: 'No valid genotype data found in file. Please ensure the file is in Monadic DNA format.' };
}

return { success: true, data: genotypeData, totalVariants, validVariants, detectedFormat: 'monadic' };
return {
success: true,
data: genotypeData,
totalVariants,
validVariants,
detectedFormat: inferProvider(normalizedContent) || 'monadic',
};
} catch (error) {
return { success: false, error: `Failed to parse file: ${error instanceof Error ? error.message : 'Unknown error'}` };
}
}

export function parseAncestryDNAFile(content: string): ParseResult {
try {
const lines = splitLines(content);
const normalizedContent = preprocessContent(content);
const lines = splitLines(normalizedContent);
const genotypeData: GenotypeData[] = [];
let totalVariants = 0;
let validVariants = 0;
Expand Down Expand Up @@ -279,15 +315,125 @@ export function parseAncestryDNAFile(content: string): ParseResult {
return { success: false, error: 'No valid genotype data found in file. Please ensure the file is in AncestryDNA format.' };
}

return { success: true, data: genotypeData, totalVariants, validVariants, detectedFormat: 'ancestrydna' };
return {
success: true,
data: genotypeData,
totalVariants,
validVariants,
detectedFormat: inferProvider(normalizedContent) || 'ancestrydna',
};
} catch (error) {
return { success: false, error: `Failed to parse file: ${error instanceof Error ? error.message : 'Unknown error'}` };
}
}

/** Zero-based column positions read from a Mapmygenome header row. */
type MapmygenomeColumns = {
  rsid: number;
  chromosome: number;
  position: number;
  allele1: number;
  allele2: number;
};

/**
 * Resolves the required columns from a tab-separated header row.
 * Returns null when any of the five required columns is absent, i.e. the row is
 * not a Mapmygenome header.
 */
function locateMapmygenomeColumns(headerLine: string): MapmygenomeColumns | null {
  const names = headerLine.split('\t').map(cell => stripQuotes(cell).toLowerCase());
  const columns: MapmygenomeColumns = {
    rsid: names.findIndex(name => name === 'rsid' || name === 'snp name' || name === 'snp.name'),
    chromosome: names.findIndex(name => name === 'chr' || name === 'chromosome'),
    position: names.indexOf('position'),
    allele1: names.indexOf('allele1...plus'),
    allele2: names.indexOf('allele2...plus'),
  };
  return Object.values(columns).includes(-1) ? null : columns;
}

/**
 * Parses a Mapmygenome export: a tab-separated SNP table whose header names an
 * rsid/SNP-name column, a chromosome column, a position column and the two
 * plus-strand allele columns (`Allele1...Plus`, `Allele2...Plus`).
 *
 * Every non-blank line before the header is ignored. Each non-blank line after
 * it counts towards `totalVariants`; a row is kept only if it has enough cells,
 * its id starts with "rs", its chromosome is in `VALID_CHROMOSOMES`, its
 * position parses to a positive integer and both alleles are in `VALID_BASES`.
 * Rows where both alleles are "--" or both are "-" are kept as a "--" no-call,
 * as are rows where either allele is "0" (an empty allele cell is read as "0").
 *
 * @param content - Full text of the uploaded file.
 * @returns A successful result with the parsed variants, or a failure result
 *   when no header is found, no row is valid, or an exception is thrown.
 */
export function parseMapmygenomeFile(content: string): ParseResult {
  try {
    const rows = splitLines(preprocessContent(content));
    const genotypeData: GenotypeData[] = [];
    let totalVariants = 0;
    let validVariants = 0;
    let columns: MapmygenomeColumns | null = null;
    // Highest column index a data row must reach; set once the header is located.
    let lastNeededIdx = -1;

    for (const rawRow of rows) {
      const row = rawRow.trim();
      if (row.length === 0) continue;

      if (columns === null) {
        // Still looking for the header; this line is consumed either way.
        columns = locateMapmygenomeColumns(row);
        if (columns !== null) {
          lastNeededIdx = Math.max(...Object.values(columns));
        }
        continue;
      }

      totalVariants += 1;
      const cells = row.split('\t').map(cell => stripQuotes(cell));
      if (cells.length <= lastNeededIdx) continue;

      const rsid = cells[columns.rsid];
      const chromosome = cells[columns.chromosome];
      const positionStr = cells[columns.position];
      const allele1 = cells[columns.allele1] || '0';
      const allele2 = cells[columns.allele2] || '0';

      if (!rsid.startsWith('rs')) continue;
      if (!VALID_CHROMOSOMES.has(chromosome)) continue;

      const position = parseInt(positionStr, 10);
      if (!Number.isInteger(position) || position <= 0) continue;

      const isExplicitNoCall =
        (allele1 === '--' && allele2 === '--') || (allele1 === '-' && allele2 === '-');

      let genotype: string;
      if (isExplicitNoCall) {
        genotype = '--';
      } else if (VALID_BASES.has(allele1) && VALID_BASES.has(allele2)) {
        genotype = allele1 === '0' || allele2 === '0' ? '--' : allele1 + allele2;
      } else {
        continue;
      }

      genotypeData.push({ rsid, chromosome, position, genotype });
      validVariants += 1;
    }

    if (columns === null) {
      return {
        success: false,
        error: 'No valid Mapmygenome header found. Expected Illumina-style SNP table with plus-strand allele columns.',
      };
    }

    if (validVariants === 0) {
      return { success: false, error: 'No valid genotype data found in file. Please ensure the file is in Mapmygenome format.' };
    }

    return {
      success: true,
      data: genotypeData,
      totalVariants,
      validVariants,
      detectedFormat: 'mapmygenome',
    };
  } catch (error) {
    return { success: false, error: `Failed to parse file: ${error instanceof Error ? error.message : 'Unknown error'}` };
  }
}

export function detectAndParseGenotypeFile(content: string): ParseResult {
const normalizedContent = preprocessContent(content);
// Scan first 50 lines — some files have long comment/metadata sections before the header.
const lines = splitLines(content).slice(0, 50);
const lines = splitLines(normalizedContent).slice(0, 50);

const hasMapmygenomeHeader = lines.some(line => {
const cols = line.trim().split('\t').map(c => stripQuotes(c).toLowerCase());
return (
(cols.includes('rsid') || cols.includes('snp name') || cols.includes('snp.name')) &&
(cols.includes('chr') || cols.includes('chromosome')) &&
cols.includes('position') &&
cols.includes('allele1...plus') &&
cols.includes('allele2...plus')
);
});
if (hasMapmygenomeHeader) {
return parseMapmygenomeFile(normalizedContent);
}

// Monadic DNA: CSV/TSV with specific header (also matches MyHeritage, FTDNA, generic 4-col formats).
// Require an explicit tab or comma so purely space-delimited files fall through to the
Expand All @@ -304,7 +450,7 @@ export function detectAndParseGenotypeFile(content: string): ParseResult {
);
});
if (hasMonadicHeader) {
return parseMonadicDNAFile(content);
return parseMonadicDNAFile(normalizedContent);
}

// AncestryDNA: non-comment header line with rsid + chromosome + position + allele columns.
Expand All @@ -318,7 +464,7 @@ export function detectAndParseGenotypeFile(content: string): ParseResult {
lower.includes('position');
});
if (hasAncestryHeader) {
return parseAncestryDNAFile(content);
return parseAncestryDNAFile(normalizedContent);
}

// 23andMe and compatible formats: comment lines starting with #.
Expand All @@ -334,25 +480,25 @@ export function detectAndParseGenotypeFile(content: string): ParseResult {
return (lower.includes('rsid') || lower.includes('name')) && lower.includes('chromosome') && lower.includes('allele');
});
if (commentHasAlleleHeader) {
const ancestryResult = parseAncestryDNAFile(content);
const ancestryResult = parseAncestryDNAFile(normalizedContent);
if (ancestryResult.success) return ancestryResult;
}
return parse23andMeFile(content);
return parse23andMeFile(normalizedContent);
}

// Blind fallback.
const result23andMe = parse23andMeFile(content);
const result23andMe = parse23andMeFile(normalizedContent);
if (result23andMe.success) return result23andMe;

const resultAncestry = parseAncestryDNAFile(content);
const resultAncestry = parseAncestryDNAFile(normalizedContent);
if (resultAncestry.success) return resultAncestry;

const resultMonadic = parseMonadicDNAFile(content);
const resultMonadic = parseMonadicDNAFile(normalizedContent);
if (resultMonadic.success) return resultMonadic;

return {
success: false,
error: 'Unable to detect file format. Supported formats: 23andMe (.txt), AncestryDNA (.txt), or Monadic DNA (.csv)',
error: 'Unable to detect file format. Supported formats include 23andMe, AncestryDNA, MyHeritage, FTDNA, LivingDNA, and compatible raw DNA exports.',
};
}

Expand Down
2 changes: 1 addition & 1 deletion next-env.d.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/// <reference types="next" />
/// <reference types="next/image-types/global" />
import "./.next/dev/types/routes.d.ts";
import "./.next/types/routes.d.ts";

// NOTE: This file should not be edited
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
Loading
Loading