From 44b45ab17aec08a8aa6528a5b7568e2ab600eb82 Mon Sep 17 00:00:00 2001 From: benITo47 Date: Mon, 29 Jun 2026 21:37:58 +0200 Subject: [PATCH 01/29] [RNE Rewrite] feat(ocr): unified on-device OCR + document pipeline Two-stage OCR (EasyOCR CRAFT+CRNN / PaddleOCR DBNet+SVTR) plus a document pipeline, on top of rne-rewrite. - One fused PTE per model with bucketed detect_/recognize_ methods and snap-to-closest sizing; a single baked contract, with only the box decoder (detectorKind: 'craft' | 'dbnet') and the drop score per architecture. - Document pipeline: layout via createObjectDetector, native dewarp/gridSample, SLANet_plus table-structure recognition, structure-guided table HTML. - Vertical reading (additive, opt-in): page-level column grouping for stacked signage + char-level second CRAFT pass + joint-hconcat recognition; tall lines are no longer flipped flat, and vertical reads skip the drop-score gate. - Native ops: extractTextBoxes (CRAFT + DBNet), warpQuad, ctcGreedyDecode, gridSample. - Models hosted on Hugging Face (EasyOCR, PP-OCRv6, PP-DocLayoutV3, PaddleHelpers), downloaded + cached on device; demo screens consume them directly. --- .cspell-wordlist.txt | 32 + apps/computer-vision/app/_layout.tsx | 14 + apps/computer-vision/app/document/index.tsx | 383 ++++++++ apps/computer-vision/app/index.tsx | 6 + apps/computer-vision/app/ocr/index.tsx | 307 +++++++ .../components/ImageViewport.tsx | 57 +- .../cpp/extensions/cv/box_ops.cpp | 2 + .../cpp/extensions/cv/image_ops.cpp | 105 +++ .../cpp/extensions/cv/image_ops.h | 1 + .../cpp/extensions/cv/install.cpp | 6 + .../cpp/extensions/cv/ocr_ops.cpp | 826 ++++++++++++++++++ .../cpp/extensions/cv/ocr_ops.h | 17 + .../react-native-executorch/src/constants.ts | 102 +++ .../src/extensions/cv/ops/boxes.ts | 24 +- .../src/extensions/cv/ops/image.ts | 192 ++++ .../src/extensions/cv/ops/points.ts | 13 + .../extensions/cv/tasks/documentHelpers.ts | 179 ++++ .../src/extensions/cv/tasks/documentOCR.ts | 214 +++++ .../src/extensions/cv/tasks/ocr.ts | 816 +++++++++++++++++ .../src/extensions/cv/tasks/ocrHelpers.ts | 263 ++++++ .../src/extensions/cv/tasks/supporting.ts | 261 ++++++ .../src/hooks/useDocumentOCR.ts | 59 ++ .../src/hooks/useOCR.ts | 39 + packages/react-native-executorch/src/index.ts | 7 + .../react-native-executorch/src/models.ts | 134 +++ .../react-native-executorch/src/ocrSymbols.ts | 161 ++++ 26 files changed, 4211 insertions(+), 9 deletions(-) create mode 100644 apps/computer-vision/app/document/index.tsx create mode 100644 apps/computer-vision/app/ocr/index.tsx create mode 100644 packages/react-native-executorch/cpp/extensions/cv/ocr_ops.cpp create mode 100644 packages/react-native-executorch/cpp/extensions/cv/ocr_ops.h create mode 100644 packages/react-native-executorch/src/extensions/cv/tasks/documentHelpers.ts create mode 100644 packages/react-native-executorch/src/extensions/cv/tasks/documentOCR.ts create mode 100644 packages/react-native-executorch/src/extensions/cv/tasks/ocr.ts create mode 100644 packages/react-native-executorch/src/extensions/cv/tasks/ocrHelpers.ts create mode 100644 packages/react-native-executorch/src/extensions/cv/tasks/supporting.ts create mode 100644 packages/react-native-executorch/src/hooks/useDocumentOCR.ts create mode 100644 packages/react-native-executorch/src/hooks/useOCR.ts create mode 100644 packages/react-native-executorch/src/ocrSymbols.ts diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index aed3ee1e7d..77307d75eb 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -275,3 +275,35 @@ pcre libkleidicv thresholding binarization +dbnet +svtr +softmaxed +softmax +unclip +cand +parameterizes +pyimagesearch +letterbox +CRNN +CRAFT +PaddleOCR +EasyOCR +cornerMean +ctc +Vatti +softmaxing +ppocrv +PPOCRV +ctcGreedyDecode +dewarp +vctx +onehot +slanet +letterboxed +redetect +redetections +eos +doclayout +dynint +softmaxes +hconcat diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx index f269b27aae..9f3fadd51b 100644 --- a/apps/computer-vision/app/_layout.tsx +++ b/apps/computer-vision/app/_layout.tsx @@ -62,6 +62,20 @@ export default function Layout() { title: 'Instance Segmentation', }} /> + + b.platforms.includes(Platform.OS)); +const BACKEND_OPTIONS: ModelOption[] = AVAILABLE.map((b, i) => ({ label: b.label, value: i })); + +type Cell = { text: string; colspan: number; rowspan: number }; +type DocBlock = { + regionType: string; + text: string; + isTable: boolean; + tableHtml?: string; + bbox: { xmin: number; ymin: number; xmax: number; ymax: number }; +}; + +// Parse the SLANet structure HTML (filled) into rows of cells for rendering. +function parseTable(html: string): Cell[][] { + const rows: Cell[][] = []; + const trRe = /([\s\S]*?)<\/tr>/g; + let tr: RegExpExecArray | null; + while ((tr = trRe.exec(html))) { + const cells: Cell[] = []; + const tdRe = /]*)>([\s\S]*?)<\/td>/g; + let td: RegExpExecArray | null; + while ((td = tdRe.exec(tr[1]!))) { + const attrs = td[1] ?? ''; + cells.push({ + text: td[2] ?? '', + colspan: Number(/colspan="(\d+)"/.exec(attrs)?.[1] ?? 1), + rowspan: Number(/rowspan="(\d+)"/.exec(attrs)?.[1] ?? 1), + }); + } + rows.push(cells); + } + return rows; +} + +function TableView({ html }: { html: string }) { + const rows = parseTable(html); + if (rows.length === 0) { + return {html}; + } + // Fixed-width cells inside a horizontal scroll — wide tables scroll instead of + // squishing every column into the screen width. + return ( + + + {rows.map((cells, r) => ( + + {cells.map((c, i) => ( + + {c.text} + + ))} + + ))} + + + ); +} + +function DocumentContent() { + const [backendIdx, setBackendIdx] = useState(0); + const [layoutOn, setLayoutOn] = useState(true); + const [supportingOn, setSupportingOn] = useState(true); + const [orientation, setOrientation] = useState(true); + const [dewarp, setDewarp] = useState(true); + const [imageUri, setImageUri] = useState(null); + const [isProcessing, setIsProcessing] = useState(false); + const [blocks, setBlocks] = useState([]); + // The frame the result boxes are relative to (orientation/dewarp may move it + // away from the original), so the overlay lines up. + const [processed, setProcessed] = useState(null); + const [wallMs, setWallMs] = useState(null); + const [error, setError] = useState(null); + + const backend = AVAILABLE[backendIdx]!; + + const skiaImage = useImage(imageUri, (err) => setError(err.message || String(err))); + + // Hosted configs — `useDocumentOCR` downloads + caches each enabled model. + const config = { + ocr: models.ocr.PADDLE.PPOCRV6_SMALL[backend.key], + ...(layoutOn ? { layout: models.layoutDetection.PP_DOCLAYOUT[backend.key] } : {}), + ...(supportingOn ? { supporting: models.supporting.PP_SUPPORTING[backend.key] } : {}), + orientation, + dewarp, + }; + + const { isReady, downloadProgress, error: loadError, runDocumentOCR } = useDocumentOCR(config); + + const handlePick = async (useCamera: boolean) => { + setError(null); + try { + const uri = await getImage(useCamera); + if (uri) { + setImageUri(uri); + setBlocks([]); + setProcessed(null); + setWallMs(null); + } + } catch (e: any) { + setError(e.message || String(e)); + } + }; + + const run = async () => { + if (!skiaImage || !runDocumentOCR) return; + setIsProcessing(true); + setError(null); + try { + const pixels = skiaImage.readPixels(); + if (!(pixels instanceof Uint8Array)) throw new Error('Expected Uint8Array from readPixels'); + const start = Date.now(); + const out = await runDocumentOCR({ + data: pixels, + width: skiaImage.width(), + height: skiaImage.height(), + format: 'rgba' as const, + layout: 'hwc' as const, + }); + setWallMs(Date.now() - start); + setBlocks(out.blocks as DocBlock[]); + // Show the frame the boxes are relative to (orientation/dewarp may have + // rotated/warped it), so the overlaid boxes align. + const frame = out.image; + const skData = Skia.Data.fromBytes(frame.data); + const frameImage = Skia.Image.MakeImage( + { + width: frame.width, + height: frame.height, + colorType: ColorType.RGBA_8888, + alphaType: AlphaType.Unpremul, + }, + skData, + frame.width * 4 + ); + setProcessed(frameImage); + } catch (e: any) { + setError(e.message || String(e)); + } finally { + setIsProcessing(false); + } + }; + + const activeError = loadError ? String(loadError) : error; + const boxes = blocks.map((b) => [ + { x: b.bbox.xmin, y: b.bbox.ymin }, + { x: b.bbox.xmax, y: b.bbox.ymin }, + { x: b.bbox.xmax, y: b.bbox.ymax }, + { x: b.bbox.xmin, y: b.bbox.ymax }, + ]); + + return ( + + + Full document pipeline: layout → OCR grouped into reading-ordered blocks, with + orientation/dewarp and table-structure recognition. PaddleOCR is always on; the rest are on + by default — toggle any off (toggling reloads the models). + + + { + setBackendIdx(v); + setBlocks([]); + setProcessed(null); + setWallMs(null); + }} + /> + + + + + + + + + handlePick(false)} + /> + + +