diff --git a/app/components/UserDataUpload.tsx b/app/components/UserDataUpload.tsx index 40a710a..c8c2e71 100644 --- a/app/components/UserDataUpload.tsx +++ b/app/components/UserDataUpload.tsx @@ -1,6 +1,7 @@ "use client"; import { useState, useRef, createContext, useContext, useCallback, useEffect } from "react"; +import pako from "pako"; import { GenotypeData, detectAndParseGenotypeFile, validateFileSize, validateFileFormat } from "@/lib/genotype-parser"; import { calculateFileHash } from "@/lib/file-hash"; import { @@ -35,6 +36,55 @@ type GenotypeContextType = { const GenotypeContext = createContext(null); +const MAX_TEXT_FILE_SIZE_MB = 100; +const MAX_GZIP_FILE_SIZE_MB = 150; +const MAX_DECOMPRESSED_FILE_SIZE_MB = 250; + +function getUploadSizeLimitMB(file: File, fileExtension: string): number { + return fileExtension === "gz" ? MAX_GZIP_FILE_SIZE_MB : MAX_TEXT_FILE_SIZE_MB; +} + +function trimTrailingGarbageFromGzip(bytes: Uint8Array): Uint8Array { + const minimumGzipSize = 18; + const maxTrimBytes = Math.min(16 * 1024, bytes.length - minimumGzipSize); + + for (let trimBytes = 1; trimBytes <= maxTrimBytes; trimBytes += 1) { + const trimmed = bytes.subarray(0, bytes.length - trimBytes); + try { + pako.ungzip(trimmed); + return trimmed; + } catch { + continue; + } + } + + return bytes; +} + +function readTextFromFile(file: File, fileExtension: string): Promise { + if (fileExtension !== "gz") { + return file.text(); + } + + return file.arrayBuffer().then((buffer) => { + const compressed = new Uint8Array(buffer); + + let decompressed: Uint8Array; + try { + decompressed = pako.ungzip(compressed); + } catch { + const trimmed = trimTrailingGarbageFromGzip(compressed); + decompressed = pako.ungzip(trimmed); + } + + if (decompressed.byteLength > MAX_DECOMPRESSED_FILE_SIZE_MB * 1024 * 1024) { + throw new Error(`Expanded file is too large. Maximum decompressed size is ${MAX_DECOMPRESSED_FILE_SIZE_MB}MB.`); + } + + return new TextDecoder().decode(decompressed); + }); +} + export function GenotypeProvider({ children }: { children: React.ReactNode }) { const [genotypeData, setGenotypeData] = useState | null>(null); const [isLoading, setIsLoading] = useState(false); @@ -55,36 +105,17 @@ export function GenotypeProvider({ children }: { children: React.ReactNode }) { trackGenotypeFileUploadStarted(source); try { - if (!validateFileSize(file, 50)) { - throw new Error('File too large. Maximum size is 50MB.'); + const maxSizeMB = getUploadSizeLimitMB(file, fileExtension); + if (!validateFileSize(file, maxSizeMB)) { + const sizeKind = fileExtension === "gz" ? "compressed" : "raw"; + throw new Error(`File too large. Maximum ${sizeKind} size is ${maxSizeMB}MB.`); } if (!validateFileFormat(file)) { throw new Error('Unsupported file type. Please upload a .txt, .tsv, .csv, or .gz file exported from 23andMe, AncestryDNA, MyHeritage, FTDNA, LivingDNA, or a compatible provider.'); } - let fileContent: string; - if (fileExtension === 'gz') { - const buffer = await file.arrayBuffer(); - const ds = new DecompressionStream('gzip'); - const writer = ds.writable.getWriter(); - writer.write(buffer); - writer.close(); - const chunks: Uint8Array[] = []; - const reader = ds.readable.getReader(); - while (true) { - const { value, done } = await reader.read(); - if (done) break; - if (value) chunks.push(value); - } - const total = chunks.reduce((n, c) => n + c.length, 0); - const combined = new Uint8Array(total); - let offset = 0; - for (const chunk of chunks) { combined.set(chunk, offset); offset += chunk.length; } - fileContent = new TextDecoder().decode(combined); - } else { - fileContent = await file.text(); - } + const fileContent = await readTextFromFile(file, fileExtension); const hash = calculateFileHash(fileContent); trackGenotypeParseStarted(source, fileExtension); diff --git a/lib/genotype-parser.ts b/lib/genotype-parser.ts index e83a0d0..a38b6c5 100644 --- a/lib/genotype-parser.ts +++ b/lib/genotype-parser.ts @@ -11,7 +11,7 @@ export type ParseResult = { error?: string; totalVariants?: number; validVariants?: number; - detectedFormat?: 'monadic' | '23andme' | 'ancestrydna'; + detectedFormat?: 'monadic' | '23andme' | 'ancestrydna' | 'myheritage' | 'ftdna' | 'livingdna' | 'mapmygenome'; }; // Chromosome 26 = mitochondrial in AncestryDNA exports. @@ -28,13 +28,35 @@ function splitLines(content: string): string[] { } function stripQuotes(value: string): string { - // Remove all quote characters — DNA field values (rsid, chr, position, allele) never contain quotes. + // Remove all quote characters; DNA field values never contain literal quotes. return value.trim().replace(/"/g, ''); } +function preprocessContent(content: string): string { + return content + .replace(/^\uFEFF/, '') + .replace(/\u0000/g, '') + .replace(/\r\n?/g, '\n'); +} + +function inferProvider(content: string): ParseResult['detectedFormat'] { + const preview = content.slice(0, 4000).toLowerCase(); + + if (preview.includes('living dna')) return 'livingdna'; + if (preview.includes('mapmygenome')) return 'mapmygenome'; + if (preview.includes('myheritage')) return 'myheritage'; + if (preview.includes('familytreedna') || preview.includes('family tree dna') || preview.includes('famfinder')) return 'ftdna'; + if (preview.includes('ancestrydna') || preview.includes('ancestry.com')) return 'ancestrydna'; + if (preview.includes('23andme')) return '23andme'; + if (preview.includes('monadic dna')) return 'monadic'; + + return undefined; +} + export function parse23andMeFile(content: string): ParseResult { try { - const lines = splitLines(content); + const normalizedContent = preprocessContent(content); + const lines = splitLines(normalizedContent); const genotypeData: GenotypeData[] = []; let totalVariants = 0; let validVariants = 0; @@ -86,7 +108,13 @@ export function parse23andMeFile(content: string): ParseResult { }; } - return { success: true, data: genotypeData, totalVariants, validVariants, detectedFormat: '23andme' }; + return { + success: true, + data: genotypeData, + totalVariants, + validVariants, + detectedFormat: inferProvider(normalizedContent) || '23andme', + }; } catch (error) { return { success: false, error: `Failed to parse file: ${error instanceof Error ? error.message : 'Unknown error'}` }; } @@ -94,7 +122,8 @@ export function parse23andMeFile(content: string): ParseResult { export function parseMonadicDNAFile(content: string): ParseResult { try { - const lines = splitLines(content); + const normalizedContent = preprocessContent(content); + const lines = splitLines(normalizedContent); const genotypeData: GenotypeData[] = []; let totalVariants = 0; let validVariants = 0; @@ -160,7 +189,13 @@ export function parseMonadicDNAFile(content: string): ParseResult { return { success: false, error: 'No valid genotype data found in file. Please ensure the file is in Monadic DNA format.' }; } - return { success: true, data: genotypeData, totalVariants, validVariants, detectedFormat: 'monadic' }; + return { + success: true, + data: genotypeData, + totalVariants, + validVariants, + detectedFormat: inferProvider(normalizedContent) || 'monadic', + }; } catch (error) { return { success: false, error: `Failed to parse file: ${error instanceof Error ? error.message : 'Unknown error'}` }; } @@ -168,7 +203,8 @@ export function parseMonadicDNAFile(content: string): ParseResult { export function parseAncestryDNAFile(content: string): ParseResult { try { - const lines = splitLines(content); + const normalizedContent = preprocessContent(content); + const lines = splitLines(normalizedContent); const genotypeData: GenotypeData[] = []; let totalVariants = 0; let validVariants = 0; @@ -279,15 +315,125 @@ export function parseAncestryDNAFile(content: string): ParseResult { return { success: false, error: 'No valid genotype data found in file. Please ensure the file is in AncestryDNA format.' }; } - return { success: true, data: genotypeData, totalVariants, validVariants, detectedFormat: 'ancestrydna' }; + return { + success: true, + data: genotypeData, + totalVariants, + validVariants, + detectedFormat: inferProvider(normalizedContent) || 'ancestrydna', + }; + } catch (error) { + return { success: false, error: `Failed to parse file: ${error instanceof Error ? error.message : 'Unknown error'}` }; + } +} + +export function parseMapmygenomeFile(content: string): ParseResult { + try { + const normalizedContent = preprocessContent(content); + const lines = splitLines(normalizedContent); + const genotypeData: GenotypeData[] = []; + let totalVariants = 0; + let validVariants = 0; + let headerFound = false; + let rsidIdx = 0; + let chromosomeIdx = -1; + let positionIdx = -1; + let allele1Idx = -1; + let allele2Idx = -1; + + for (const line of lines) { + const trimmedLine = line.trim(); + if (!trimmedLine) continue; + + if (!headerFound) { + const cols = trimmedLine.split('\t').map(c => stripQuotes(c)); + const lower = cols.map(c => c.toLowerCase()); + const hasPlusAlleles = lower.includes('allele1...plus') && lower.includes('allele2...plus'); + const hasPosition = lower.includes('position'); + const hasChromosome = lower.includes('chr') || lower.includes('chromosome'); + const hasProbeId = lower.includes('rsid') || lower.includes('snp name') || lower.includes('snp.name'); + + if (hasPlusAlleles && hasPosition && hasChromosome && hasProbeId) { + headerFound = true; + rsidIdx = lower.findIndex(c => c === 'rsid' || c === 'snp name' || c === 'snp.name'); + chromosomeIdx = lower.findIndex(c => c === 'chr' || c === 'chromosome'); + positionIdx = lower.findIndex(c => c === 'position'); + allele1Idx = lower.indexOf('allele1...plus'); + allele2Idx = lower.indexOf('allele2...plus'); + } + continue; + } + + totalVariants++; + const parts = trimmedLine.split('\t').map(stripQuotes); + if (parts.length <= Math.max(rsidIdx, chromosomeIdx, positionIdx, allele1Idx, allele2Idx)) continue; + + const rsid = parts[rsidIdx]; + const chromosome = parts[chromosomeIdx]; + const positionStr = parts[positionIdx]; + const allele1 = parts[allele1Idx] || '0'; + const allele2 = parts[allele2Idx] || '0'; + + if (!rsid.startsWith('rs')) continue; + if (!VALID_CHROMOSOMES.has(chromosome)) continue; + + const position = parseInt(positionStr, 10); + if (!Number.isInteger(position) || position <= 0) continue; + + if ((allele1 === '--' && allele2 === '--') || (allele1 === '-' && allele2 === '-')) { + genotypeData.push({ rsid, chromosome, position, genotype: '--' }); + validVariants++; + continue; + } + + if (!VALID_BASES.has(allele1) || !VALID_BASES.has(allele2)) continue; + const genotype = (allele1 === '0' || allele2 === '0') ? '--' : allele1 + allele2; + + genotypeData.push({ rsid, chromosome, position, genotype }); + validVariants++; + } + + if (!headerFound) { + return { + success: false, + error: 'No valid Mapmygenome header found. Expected Illumina-style SNP table with plus-strand allele columns.', + }; + } + + if (validVariants === 0) { + return { success: false, error: 'No valid genotype data found in file. Please ensure the file is in Mapmygenome format.' }; + } + + return { + success: true, + data: genotypeData, + totalVariants, + validVariants, + detectedFormat: 'mapmygenome', + }; } catch (error) { return { success: false, error: `Failed to parse file: ${error instanceof Error ? error.message : 'Unknown error'}` }; } } export function detectAndParseGenotypeFile(content: string): ParseResult { + const normalizedContent = preprocessContent(content); // Scan first 50 lines — some files have long comment/metadata sections before the header. - const lines = splitLines(content).slice(0, 50); + const lines = splitLines(normalizedContent).slice(0, 50); + + const hasMapmygenomeHeader = lines.some(line => { + const cols = line.trim().split('\t').map(c => stripQuotes(c).toLowerCase()); + return ( + (cols.includes('rsid') || cols.includes('snp name') || cols.includes('snp.name')) && + (cols.includes('chr') || cols.includes('chromosome')) && + cols.includes('position') && + cols.includes('allele1...plus') && + cols.includes('allele2...plus') + ); + }); + if (hasMapmygenomeHeader) { + return parseMapmygenomeFile(normalizedContent); + } // Monadic DNA: CSV/TSV with specific header (also matches MyHeritage, FTDNA, generic 4-col formats). // Require an explicit tab or comma so purely space-delimited files fall through to the @@ -304,7 +450,7 @@ export function detectAndParseGenotypeFile(content: string): ParseResult { ); }); if (hasMonadicHeader) { - return parseMonadicDNAFile(content); + return parseMonadicDNAFile(normalizedContent); } // AncestryDNA: non-comment header line with rsid + chromosome + position + allele columns. @@ -318,7 +464,7 @@ export function detectAndParseGenotypeFile(content: string): ParseResult { lower.includes('position'); }); if (hasAncestryHeader) { - return parseAncestryDNAFile(content); + return parseAncestryDNAFile(normalizedContent); } // 23andMe and compatible formats: comment lines starting with #. @@ -334,25 +480,25 @@ export function detectAndParseGenotypeFile(content: string): ParseResult { return (lower.includes('rsid') || lower.includes('name')) && lower.includes('chromosome') && lower.includes('allele'); }); if (commentHasAlleleHeader) { - const ancestryResult = parseAncestryDNAFile(content); + const ancestryResult = parseAncestryDNAFile(normalizedContent); if (ancestryResult.success) return ancestryResult; } - return parse23andMeFile(content); + return parse23andMeFile(normalizedContent); } // Blind fallback. - const result23andMe = parse23andMeFile(content); + const result23andMe = parse23andMeFile(normalizedContent); if (result23andMe.success) return result23andMe; - const resultAncestry = parseAncestryDNAFile(content); + const resultAncestry = parseAncestryDNAFile(normalizedContent); if (resultAncestry.success) return resultAncestry; - const resultMonadic = parseMonadicDNAFile(content); + const resultMonadic = parseMonadicDNAFile(normalizedContent); if (resultMonadic.success) return resultMonadic; return { success: false, - error: 'Unable to detect file format. Supported formats: 23andMe (.txt), AncestryDNA (.txt), or Monadic DNA (.csv)', + error: 'Unable to detect file format. Supported formats include 23andMe, AncestryDNA, MyHeritage, FTDNA, LivingDNA, and compatible raw DNA exports.', }; } diff --git a/next-env.d.ts b/next-env.d.ts index c4b7818..9edff1c 100644 --- a/next-env.d.ts +++ b/next-env.d.ts @@ -1,6 +1,6 @@ /// /// -import "./.next/dev/types/routes.d.ts"; +import "./.next/types/routes.d.ts"; // NOTE: This file should not be edited // see https://nextjs.org/docs/app/api-reference/config/typescript for more information. diff --git a/scripts/download-ga-data.mjs b/scripts/download-ga-data.mjs index d71fb5a..e16d634 100644 --- a/scripts/download-ga-data.mjs +++ b/scripts/download-ga-data.mjs @@ -10,10 +10,13 @@ * overview.json - session/user/engagement summary * events.json - all app event counts * events_by_date.json - daily event time series + * events_by_country.json - event counts segmented by country * pages.json - page views by path * acquisition.json - sessions by source / medium * devices.json - sessions by device category * countries.json - sessions by country + * upload_failures.json - upload failure breakdown by country/source/file/reason + * upload_successes.json - upload success breakdown by country/source/file/format * onboarding_paths.json - onboarding_path_chosen breakdown * onboarding_steps.json - onboarding_step_viewed by step name and number * index.json - metadata about this download @@ -96,6 +99,14 @@ async function ga4Report(propertyId, token, body) { return res.json(); } +async function ga4Metadata(propertyId, token) { + const res = await fetch(`https://analyticsdata.googleapis.com/v1beta/properties/${propertyId}/metadata`, { + headers: { Authorization: `Bearer ${token}` }, + }); + if (!res.ok) throw new Error(`GA4 metadata error (${res.status}): ${await res.text()}`); + return res.json(); +} + const APP_EVENTS = [ 'terms_accepted', 'onboarding_started', 'onboarding_completed', 'onboarding_dismissed', 'onboarding_path_chosen', 'get_started_clicked', 'onboarding_action', @@ -172,6 +183,19 @@ async function fetchEventsByDate(propertyId, token, startDate, endDate) { }); } +async function fetchEventsByCountry(propertyId, token, startDate, endDate) { + return ga4Report(propertyId, token, { + dateRanges: [{ startDate, endDate }], + dimensions: [{ name: 'country' }, { name: 'eventName' }], + metrics: [{ name: 'eventCount' }, { name: 'totalUsers' }], + dimensionFilter: { + filter: { fieldName: 'eventName', inListFilter: { values: APP_EVENTS } }, + }, + orderBys: [{ dimension: { dimensionName: 'country' } }, { metric: { metricName: 'eventCount' }, desc: true }], + limit: 1000, + }); +} + async function fetchPages(propertyId, token, startDate, endDate) { return ga4Report(propertyId, token, { dateRanges: [{ startDate, endDate }], @@ -212,6 +236,54 @@ async function fetchCountries(propertyId, token, startDate, endDate) { }); } +function filterSupportedDimensions(availableDimensions, dimensions) { + return dimensions.filter(name => availableDimensions.has(name)).map(name => ({ name })); +} + +async function fetchUploadFailures(propertyId, token, startDate, endDate, availableDimensions) { + const dimensions = filterSupportedDimensions(availableDimensions, [ + 'country', + 'deviceCategory', + 'browser', + 'customEvent:source', + 'customEvent:file_extension', + 'customEvent:reason', + ]); + + return ga4Report(propertyId, token, { + dateRanges: [{ startDate, endDate }], + dimensions, + metrics: [{ name: 'eventCount' }, { name: 'totalUsers' }], + dimensionFilter: { + filter: { fieldName: 'eventName', stringFilter: { matchType: 'EXACT', value: 'genotype_file_upload_failed' } }, + }, + orderBys: [{ metric: { metricName: 'eventCount' }, desc: true }], + limit: 200, + }); +} + +async function fetchUploadSuccesses(propertyId, token, startDate, endDate, availableDimensions) { + const dimensions = filterSupportedDimensions(availableDimensions, [ + 'country', + 'deviceCategory', + 'browser', + 'customEvent:source', + 'customEvent:file_extension', + 'customEvent:detected_format', + ]); + + return ga4Report(propertyId, token, { + dateRanges: [{ startDate, endDate }], + dimensions, + metrics: [{ name: 'eventCount' }, { name: 'totalUsers' }], + dimensionFilter: { + filter: { fieldName: 'eventName', stringFilter: { matchType: 'EXACT', value: 'genotype_file_loaded' } }, + }, + orderBys: [{ metric: { metricName: 'eventCount' }, desc: true }], + limit: 200, + }); +} + async function fetchOnboardingSteps(propertyId, token, startDate, endDate) { return ga4Report(propertyId, token, { dateRanges: [{ startDate, endDate }], @@ -293,7 +365,7 @@ async function fetchOnboardingPaths(propertyId, token, startDate, endDate) { async function main() { const envPath = join(__dirname, 'ga-performance.env'); console.log(`Reading config: ${envPath}`); - const env = loadEnv(envPath); + const env = { ...loadEnv(envPath), ...process.env }; const propertyId = env.GA4_PROPERTY_ID; if (!propertyId || propertyId === '123456789') { @@ -312,20 +384,37 @@ async function main() { console.log(`Authenticating (${clientEmail})...`); const token = await getAccessToken(clientEmail, privateKey); + const metadata = await ga4Metadata(propertyId, token); + const availableDimensions = new Set((metadata.dimensions || []).map(d => d.apiName)); const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); const outDir = join('/tmp', 'ga-data', ts); mkdirSync(outDir, { recursive: true }); console.log(`Output directory: ${outDir}`); + writeFileSync( + join(outDir, 'metadata.json'), + JSON.stringify({ + propertyId, + dimensionCount: (metadata.dimensions || []).length, + metricCount: (metadata.metrics || []).length, + customEventDimensions: (metadata.dimensions || []) + .filter(d => d.apiName.startsWith('customEvent:')) + .map(d => d.apiName) + .sort(), + }, null, 2) + ); const reports = [ { name: 'overview', label: 'overview metrics', fn: fetchOverview }, { name: 'events', label: 'event counts', fn: fetchEvents }, { name: 'events_by_date', label: 'events by date', fn: fetchEventsByDate }, + { name: 'events_by_country', label: 'events by country', fn: fetchEventsByCountry }, { name: 'pages', label: 'page views', fn: fetchPages }, { name: 'acquisition', label: 'traffic acquisition', fn: fetchAcquisition }, { name: 'devices', label: 'device breakdown', fn: fetchDevices }, { name: 'countries', label: 'country breakdown', fn: fetchCountries }, + { name: 'upload_failures', label: 'upload failure breakdown', fn: (...args) => fetchUploadFailures(...args, availableDimensions) }, + { name: 'upload_successes', label: 'upload success breakdown', fn: (...args) => fetchUploadSuccesses(...args, availableDimensions) }, { name: 'onboarding_paths', label: 'onboarding path breakdown', fn: fetchOnboardingPaths }, { name: 'onboarding_steps', label: 'onboarding step breakdown', fn: fetchOnboardingSteps }, { name: 'paid_funnel', label: 'paid traffic funnel', fn: fetchPaidFunnel }, diff --git a/scripts/validate-genotype-samples.mjs b/scripts/validate-genotype-samples.mjs new file mode 100644 index 0000000..aeb2658 --- /dev/null +++ b/scripts/validate-genotype-samples.mjs @@ -0,0 +1,312 @@ +#!/usr/bin/env node + +/** + * Downloads public raw DNA sample files from multiple providers and validates + * that the local parser can detect and parse them. + * + * Run with: + * node --experimental-transform-types scripts/validate-genotype-samples.mjs + * + * Network access is required because the samples are fetched from GitHub. + */ + +import { mkdirSync, writeFileSync } from "fs"; +import { inflateRawSync } from "zlib"; +import { join } from "path"; + +const { detectAndParseGenotypeFile } = await import("../lib/genotype-parser.ts"); + +const SAMPLE_SPECS = [ + { + provider: "23andMe", + sample: "apriha/basic", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/23andme.txt", + acceptedFormats: ["23andme"], + }, + { + provider: "23andMe", + sample: "apriha/allele-columns", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/23andme_allele.txt", + acceptedFormats: ["23andme", "livingdna"], + }, + { + provider: "23andMe", + sample: "apriha/windows-newlines", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/23andme_win.txt", + acceptedFormats: ["23andme"], + }, + { + provider: "AncestryDNA", + sample: "apriha/basic", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/ancestry.txt", + acceptedFormats: ["ancestrydna"], + }, + { + provider: "AncestryDNA", + sample: "apriha/mt", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/ancestry_mt.txt", + acceptedFormats: ["ancestrydna"], + }, + { + provider: "AncestryDNA", + sample: "apriha/multi-separator", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/ancestry_multi_sep.txt", + acceptedFormats: ["ancestrydna"], + }, + { + provider: "FTDNA", + sample: "apriha/four-column", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/ftdna.csv", + acceptedFormats: ["ftdna", "monadic"], + }, + { + provider: "FTDNA", + sample: "apriha/famfinder", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/ftdna_famfinder.csv", + acceptedFormats: ["ftdna"], + }, + { + provider: "LivingDNA", + sample: "apriha/basic", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/livingdna.csv", + acceptedFormats: ["livingdna"], + }, + { + provider: "Mapmygenome", + sample: "apriha/basic", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/mapmygenome.txt", + acceptedFormats: ["mapmygenome"], + }, + { + provider: "Mapmygenome", + sample: "apriha/alt-header", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/mapmygenome_alt_header.txt", + acceptedFormats: ["mapmygenome"], + }, + { + provider: "Mapmygenome", + sample: "apriha/new-format", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/mapmygenome_new_format.txt", + acceptedFormats: ["mapmygenome", "monadic", "23andme"], + }, + { + provider: "MyHeritage", + sample: "apriha/basic", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/myheritage.csv", + acceptedFormats: ["myheritage"], + }, + { + provider: "MyHeritage extra quotes", + sample: "apriha/extra-quotes", + url: "https://raw.githubusercontent.com/apriha/snps/main/tests/input/myheritage_extra_quotes.csv", + acceptedFormats: ["myheritage"], + }, + { + provider: "23andMe", + sample: "OpenDNA/sample", + url: "https://raw.githubusercontent.com/corbett3000/OpenDNA/master/tests/fixtures/sample_23andme.txt", + acceptedFormats: ["23andme"], + }, + { + provider: "23andMe", + sample: "milaza/basic", + url: "https://raw.githubusercontent.com/milaza/dna-raw-data-converter-23andme-myheritage-ancestry/main/examples/sample_23andme.txt", + acceptedFormats: ["23andme"], + expectSuccess: false, + }, + { + provider: "AncestryDNA", + sample: "milaza/converter", + url: "https://raw.githubusercontent.com/milaza/dna-raw-data-converter-23andme-myheritage-ancestry/main/examples/sam%D0%B7le_AncestryDNA.txt", + acceptedFormats: ["ancestrydna"], + expectSuccess: false, + }, + { + provider: "FTDNA", + sample: "milaza/four-column", + url: "https://raw.githubusercontent.com/milaza/dna-raw-data-converter-23andme-myheritage-ancestry/main/examples/sample_FamilyTreeDNA.csv", + acceptedFormats: ["ftdna", "monadic"], + expectSuccess: false, + }, + { + provider: "MyHeritage", + sample: "milaza/basic", + url: "https://raw.githubusercontent.com/milaza/dna-raw-data-converter-23andme-myheritage-ancestry/main/examples/sample_MyHeritage.csv", + acceptedFormats: ["myheritage"], + expectSuccess: false, + }, + { + provider: "MyHeritage", + sample: "melvincarvalho/full-export", + url: "https://raw.githubusercontent.com/melvincarvalho/dna/master/dna.csv", + acceptedFormats: ["myheritage"], + }, + { + provider: "23andMe", + sample: "DeepImpute/full-export", + url: "https://raw.githubusercontent.com/aaronge-2020/DeepImpute/main/test_files/11576.23andme.9465.txt", + acceptedFormats: ["23andme"], + }, + { + provider: "Genes for Good", + sample: "PGP/plain-text", + url: "https://d58995d3742b2243a00f53567e7c31c5-95.collections.ac2it.arvadosapi.com/_/GFGFilteredUnphasedGenotypes23andMe.txt", + acceptedFormats: ["23andme"], + }, + { + provider: "23andMe", + sample: "PGP/zip-2019-03-27", + url: "https://2df8bd76617b789834be9a7f50e00477-100.collections.ac2it.arvadosapi.com/_/genome_dennis_gallo_v5_Full_20190327181104.zip", + acceptedFormats: ["23andme"], + kind: "zip", + }, + { + provider: "23andMe", + sample: "PGP/zip-2019-01-10", + url: "https://ac07513af7d69bf95f3f9b98e7914bed-91.collections.ac2it.arvadosapi.com/_/2019-01-10_23andMe-genome_v5_Full.zip", + acceptedFormats: ["23andme"], + kind: "zip", + }, +]; + +function sanitizeName(value) { + return value.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, ""); +} + +async function fetchText(url) { + const response = await fetch(url, { + headers: { "User-Agent": "codex" }, + }); + + if (!response.ok) { + throw new Error(`Download failed (${response.status}) for ${url}`); + } + + return response.text(); +} + +function findEndOfCentralDirectory(bytes) { + const minOffset = Math.max(0, bytes.length - 0xffff - 22); + + for (let offset = bytes.length - 22; offset >= minOffset; offset -= 1) { + if ( + bytes[offset] === 0x50 && + bytes[offset + 1] === 0x4b && + bytes[offset + 2] === 0x05 && + bytes[offset + 3] === 0x06 + ) { + return offset; + } + } + + return -1; +} + +function extractFirstZipEntry(bytes) { + const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength); + const eocdOffset = findEndOfCentralDirectory(bytes); + if (eocdOffset < 0) throw new Error("ZIP end-of-central-directory not found"); + + const centralDirectoryOffset = view.getUint32(eocdOffset + 16, true); + if (centralDirectoryOffset + 46 > bytes.length) throw new Error("ZIP central directory is truncated"); + if (view.getUint32(centralDirectoryOffset, true) !== 0x02014b50) throw new Error("ZIP central directory signature missing"); + + const compressionMethod = view.getUint16(centralDirectoryOffset + 10, true); + const compressedSize = view.getUint32(centralDirectoryOffset + 20, true); + const fileNameLength = view.getUint16(centralDirectoryOffset + 28, true); + const localHeaderOffset = view.getUint32(centralDirectoryOffset + 42, true); + const filenameStart = centralDirectoryOffset + 46; + const filenameEnd = filenameStart + fileNameLength; + const filename = new TextDecoder().decode(bytes.slice(filenameStart, filenameEnd)); + + if (localHeaderOffset + 30 > bytes.length) throw new Error("ZIP local header is truncated"); + if (view.getUint32(localHeaderOffset, true) !== 0x04034b50) throw new Error("ZIP local file header signature missing"); + + const localNameLength = view.getUint16(localHeaderOffset + 26, true); + const localExtraLength = view.getUint16(localHeaderOffset + 28, true); + const dataStart = localHeaderOffset + 30 + localNameLength + localExtraLength; + const dataEnd = dataStart + compressedSize; + if (dataEnd > bytes.length) throw new Error("ZIP file entry is truncated"); + + const compressed = bytes.slice(dataStart, dataEnd); + + if (compressionMethod === 0) { + return { filename, data: compressed }; + } + + if (compressionMethod === 8) { + return { filename, data: inflateRawSync(compressed) }; + } + + throw new Error(`Unsupported ZIP compression method: ${compressionMethod}`); +} + +async function fetchSample(spec) { + if (spec.kind === "zip") { + const response = await fetch(spec.url, { + headers: { "User-Agent": "codex" }, + }); + if (!response.ok) { + throw new Error(`Download failed (${response.status}) for ${spec.url}`); + } + const bytes = new Uint8Array(await response.arrayBuffer()); + const extracted = extractFirstZipEntry(bytes); + return { + content: new TextDecoder().decode(extracted.data), + filename: extracted.filename, + }; + } + + return { + content: await fetchText(spec.url), + filename: null, + }; +} + +async function main() { + const outputDir = join("/tmp", "genotype-samples", new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19)); + mkdirSync(outputDir, { recursive: true }); + + const results = []; + + for (const spec of SAMPLE_SPECS) { + const { content, filename } = await fetchSample(spec); + const parsed = detectAndParseGenotypeFile(content); + const baseName = sanitizeName(`${spec.provider}-${spec.sample}`); + writeFileSync(join(outputDir, `${baseName}.txt`), content); + + results.push({ + provider: spec.provider, + sample: spec.sample, + variant: spec.kind || "text", + acceptedFormats: spec.acceptedFormats, + expectSuccess: spec.expectSuccess ?? true, + detectedFormat: parsed.detectedFormat || null, + success: parsed.success, + variantCount: parsed.data?.length || 0, + error: parsed.error || null, + url: spec.url, + extractedFilename: filename, + }); + } + + const summary = { + generatedAt: new Date().toISOString(), + outputDir, + successCount: results.filter((entry) => { + if (!entry.expectSuccess) return !entry.success; + return entry.success && entry.acceptedFormats.includes(entry.detectedFormat); + }).length, + totalCount: results.length, + results, + }; + + writeFileSync(join(outputDir, "validation-results.json"), JSON.stringify(summary, null, 2)); + + console.log(JSON.stringify(summary, null, 2)); +} + +main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); +});