diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index aed3ee1e7d..3d35595f05 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -275,3 +275,19 @@ pcre libkleidicv thresholding binarization +autoregressive +dbnet +dewarp +dewarped +dewarps +doclayout +easyocr +letterboxed +nums +ocrv +onehot +ppocrv +redetections +svtr +unclip +unclips diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx index f269b27aae..9f3fadd51b 100644 --- a/apps/computer-vision/app/_layout.tsx +++ b/apps/computer-vision/app/_layout.tsx @@ -62,6 +62,20 @@ export default function Layout() { title: 'Instance Segmentation', }} /> + + b.platforms.includes(Platform.OS)); +const BACKEND_OPTIONS: ModelOption[] = AVAILABLE.map((b, i) => ({ label: b.label, value: i })); + +type Cell = { text: string; colspan: number; rowspan: number }; +type DocBlock = { + regionType: string; + text: string; + isTable: boolean; + tableHtml?: string; + bbox: { xmin: number; ymin: number; xmax: number; ymax: number }; +}; + +// Parse the SLANet structure HTML (filled) into rows of cells for rendering. +function parseTable(html: string): Cell[][] { + const rows: Cell[][] = []; + const trRe = /([\s\S]*?)<\/tr>/g; + let tr: RegExpExecArray | null; + while ((tr = trRe.exec(html))) { + const cells: Cell[] = []; + const tdRe = /]*)>([\s\S]*?)<\/td>/g; + let td: RegExpExecArray | null; + while ((td = tdRe.exec(tr[1]!))) { + const attrs = td[1] ?? ''; + cells.push({ + text: td[2] ?? '', + colspan: Number(/colspan="(\d+)"/.exec(attrs)?.[1] ?? 1), + rowspan: Number(/rowspan="(\d+)"/.exec(attrs)?.[1] ?? 1), + }); + } + rows.push(cells); + } + return rows; +} + +function TableView({ html }: { html: string }) { + const rows = parseTable(html); + if (rows.length === 0) { + return {html}; + } + // Fixed-width cells inside a horizontal scroll — wide tables scroll instead of + // squishing every column into the screen width. + return ( + + + {rows.map((cells, r) => ( + + {cells.map((c, i) => ( + + {c.text} + + ))} + + ))} + + + ); +} + +function DocumentContent() { + const [backendIdx, setBackendIdx] = useState(0); + const [layoutOn, setLayoutOn] = useState(true); + const [supportingOn, setSupportingOn] = useState(true); + const [orientation, setOrientation] = useState(true); + // Off by default: dewarp (UVDoc) corrects photographed, physically-warped pages; + // on a flat screenshot it has nothing to fix and visibly distorts clean text. + const [dewarp, setDewarp] = useState(false); + const [imageUri, setImageUri] = useState(null); + const [isProcessing, setIsProcessing] = useState(false); + const [blocks, setBlocks] = useState([]); + // The frame the result boxes are relative to (orientation/dewarp may move it + // away from the original), so the overlay lines up. + const [processed, setProcessed] = useState(null); + const [wallMs, setWallMs] = useState(null); + const [error, setError] = useState(null); + + const backend = AVAILABLE[backendIdx]!; + + const skiaImage = useImage(imageUri, (err) => setError(err.message || String(err))); + + // Hosted configs — `useDocumentOcr` downloads + caches each enabled model. + // orientation/dewarp are NOT baked here: they're passed per-run to + // `runDocumentOcr` below, so toggling them takes effect without a reload. + const config = { + ocr: models.ocr.PADDLE.PPOCRV6_SMALL[backend.key], + ...(layoutOn ? { layout: models.layoutDetection.PP_DOCLAYOUT[backend.key] } : {}), + ...(supportingOn ? { documentModels: models.documentModels.PP_HELPERS[backend.key] } : {}), + }; + + const { isReady, downloadProgress, error: loadError, runDocumentOcr } = useDocumentOcr(config); + + const handlePick = async (useCamera: boolean) => { + setError(null); + try { + const uri = await getImage(useCamera); + if (uri) { + setImageUri(uri); + setBlocks([]); + setProcessed(null); + setWallMs(null); + } + } catch (e: any) { + setError(e.message || String(e)); + } + }; + + const run = async () => { + if (!skiaImage || !runDocumentOcr) return; + setIsProcessing(true); + setError(null); + try { + const pixels = skiaImage.readPixels(); + if (!(pixels instanceof Uint8Array)) throw new Error('Expected Uint8Array from readPixels'); + const start = Date.now(); + const out = await runDocumentOcr( + { + data: pixels, + width: skiaImage.width(), + height: skiaImage.height(), + format: 'rgba' as const, + layout: 'hwc' as const, + }, + { orientation, dewarp } + ); + setWallMs(Date.now() - start); + setBlocks(out.blocks as DocBlock[]); + // Show the frame the boxes are relative to (orientation/dewarp may have + // rotated/warped it), so the overlaid boxes align. + const frame = out.image; + const skData = Skia.Data.fromBytes(frame.data); + const frameImage = Skia.Image.MakeImage( + { + width: frame.width, + height: frame.height, + colorType: ColorType.RGBA_8888, + alphaType: AlphaType.Unpremul, + }, + skData, + frame.width * 4 + ); + setProcessed(frameImage); + } catch (e: any) { + setError(e.message || String(e)); + } finally { + setIsProcessing(false); + } + }; + + const activeError = loadError ? String(loadError) : error; + const boxes = blocks.map((b) => [ + { x: b.bbox.xmin, y: b.bbox.ymin }, + { x: b.bbox.xmax, y: b.bbox.ymin }, + { x: b.bbox.xmax, y: b.bbox.ymax }, + { x: b.bbox.xmin, y: b.bbox.ymax }, + ]); + + return ( + + + Full document pipeline: layout → OCR grouped into reading-ordered blocks, with orientation, + table-structure recognition and (optional) dewarp. PaddleOCR is always on; dewarp is off by + default — it only helps photographed, warped pages. Orientation/dewarp are per-run, so + toggling them takes effect on the next run without reloading the models. + + + { + setBackendIdx(v); + setBlocks([]); + setProcessed(null); + setWallMs(null); + }} + /> + + + + + + + + + handlePick(false)} + /> + + +