From 5e1902c12524230c62f8eed3eac2f421fbad74bb Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Fri, 15 Oct 2021 23:48:38 +0900 Subject: [PATCH 01/51] chore: add pdfjs-dist typings --- package.json | 1 + .../components/PdfViewer/PdfViewer.tsx | 20 +++++++++---------- .../components/PdfViewer/typings.d.ts | 1 - yarn.lock | 5 +++++ 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/package.json b/package.json index f558ed0cb..7ccfb06b0 100644 --- a/package.json +++ b/package.json @@ -38,6 +38,7 @@ "@types/lodash": "^4.14.141", "@types/mustache": "^0.8.32", "@types/node": "^12.7.3", + "@types/pdfjs-dist": "^2.7.5", "@types/react": "^16.9.2", "@types/react-dom": "^16.9.0", "@types/react-resize-detector": "^4.2.0", diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx index 753f263e1..9d4357de1 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx @@ -1,5 +1,5 @@ -import React, { SFC, useEffect, useRef, useState, useMemo } from 'react'; -import PdfjsLib from 'pdfjs-dist'; +import React, { FC, useEffect, useRef, useState, useMemo } from 'react'; +import PdfjsLib, { PDFDocumentProxy, PDFPageProxy, PDFPageViewport, PDFSource } from 'pdfjs-dist'; import PdfjsWorkerAsText from 'pdfjs-dist/build/pdf.worker.min.js'; import { settings } from 'carbon-components'; @@ -35,7 +35,7 @@ interface Props { setHideToolbarControls?: (disabled: boolean) => void; } -const PdfViewer: SFC = ({ +const PdfViewer: FC = ({ file, page, scale, @@ -46,8 +46,8 @@ const PdfViewer: SFC = ({ const canvasRef = useRef(null); // In order to prevent unnecessary re-loading, loaded file and page are stored in state - const [loadedFile, setLoadedFile] = useState(null); 
- const [loadedPage, setLoadedPage] = useState(null); + const [loadedFile, setLoadedFile] = useState(null); + const [loadedPage, setLoadedPage] = useState(null); useEffect(() => { let didCancel = false; @@ -95,7 +95,7 @@ const PdfViewer: SFC = ({ }, [loadedPage, scale]); useEffect(() => { - if (loadedPage && !loadedPage.then && viewport && canvasInfo) { + if (loadedPage && !(loadedPage as any).then && viewport && canvasInfo) { _renderPage(loadedPage, canvasRef.current!, viewport, canvasInfo); setLoading(false); } @@ -127,14 +127,14 @@ function _loadPdf(data: string): Promise { return PdfjsLib.getDocument({ data }).promise; } -function _loadPage(file: any, page: number): Promise { +function _loadPage(file: PDFDocumentProxy, page: number) { return file.getPage(page); } function _renderPage( - pdfPage: any, + pdfPage: PDFPageProxy, canvas: HTMLCanvasElement, - viewport: any, + viewport: PDFPageViewport, canvasInfo: CanvasInfo ): void { const canvasContext = canvas.getContext('2d'); @@ -148,7 +148,7 @@ function _renderPage( function setupPdfjs(): void { if (typeof Worker !== 'undefined') { const blob = new Blob([PdfjsWorkerAsText], { type: 'text/javascript' }); - const pdfjsWorker = new Worker(URL.createObjectURL(blob)); + const pdfjsWorker = new Worker(URL.createObjectURL(blob)) as any; // TODO is this usage correct? 
PdfjsLib.GlobalWorkerOptions.workerPort = pdfjsWorker; } else { PdfjsLib.GlobalWorkerOptions.workerSrc = PdfjsWorkerAsText; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/typings.d.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/typings.d.ts index fa31151b5..21a04c613 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/typings.d.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/typings.d.ts @@ -1,2 +1 @@ -declare module 'pdfjs-dist'; declare module 'pdfjs-dist/build/pdf.worker.min.js'; diff --git a/yarn.lock b/yarn.lock index b99dd926d..c29f30314 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4615,6 +4615,11 @@ resolved "https://registry.yarnpkg.com/@types/parse5/-/parse5-5.0.3.tgz#e7b5aebbac150f8b5fdd4a46e7f0bd8e65e19109" integrity sha512-kUNnecmtkunAoQ3CnjmMkzNU/gtxG8guhi+Fk2U/kOpIKjIMKnXGp4IJCgQJrXSgMsWYimYG4TGjz/UzbGEBTw== +"@types/pdfjs-dist@^2.7.5": + version "2.7.5" + resolved "https://registry.yarnpkg.com/@types/pdfjs-dist/-/pdfjs-dist-2.7.5.tgz#53fc13e30b6bd18acdb3617fb6332a43225e0ffa" + integrity sha512-QdKChstEcREV+imniONC3JvSSFOeYv0RL12QOpzSfTpGg7xRAZcJXpWdJEtuH6FAPXX5V+sp9i/sBHJ9bsU7JA== + "@types/prop-types@*": version "15.7.3" resolved "https://registry.yarnpkg.com/@types/prop-types/-/prop-types-15.7.3.tgz#2ab0d5da2e5815f94b0b9d4b95d1e5f243ab2ca7" From 824c2d7e16de6e333645921550be12c7f02a3679 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Tue, 9 Nov 2021 16:04:37 +0900 Subject: [PATCH 02/51] fix: control PDF rendering tasks properly --- .../components/PdfViewer/PdfViewer.tsx | 66 +++++++++++++++---- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx index 9d4357de1..5ab2ee420 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx @@ -1,8 +1,15 @@ import React, { FC, useEffect, useRef, useState, useMemo } from 'react'; -import PdfjsLib, { PDFDocumentProxy, PDFPageProxy, PDFPageViewport, PDFSource } from 'pdfjs-dist'; +import PdfjsLib, { + PDFDocumentProxy, + PDFPageProxy, + PDFPageViewport, + PDFRenderTask +} from 'pdfjs-dist'; import PdfjsWorkerAsText from 'pdfjs-dist/build/pdf.worker.min.js'; import { settings } from 'carbon-components'; +const { RenderingCancelledException } = PdfjsLib as any; + setupPdfjs(); interface Props { @@ -88,18 +95,45 @@ const PdfViewer: FC = ({ }; }, [loadedFile, page]); - const [viewport, canvasInfo] = useMemo(() => { - const viewport = loadedPage?.getViewport({ scale }); - const canvasInfo = viewport ? getCanvasInfo(viewport) : undefined; - return [viewport, canvasInfo]; - }, [loadedPage, scale]); + const currentPage = useMemo(() => { + const isPageValid = !!loadedPage && loadedPage.pageNumber === page; + if (isPageValid) { + const viewport = loadedPage?.getViewport({ scale }); + const canvasInfo = viewport ? 
getCanvasInfo(viewport) : undefined; + return { loadedPage, viewport, canvasInfo }; + } + return null; + }, [loadedPage, page, scale]); useEffect(() => { + let didCancel = false; + let task: PDFRenderTask | null = null; + + const { loadedPage, viewport, canvasInfo } = currentPage || {}; if (loadedPage && !(loadedPage as any).then && viewport && canvasInfo) { - _renderPage(loadedPage, canvasRef.current!, viewport, canvasInfo); - setLoading(false); + const render = async () => { + try { + task = _renderPage(loadedPage, canvasRef.current!, viewport, canvasInfo); + await task?.promise; + } catch (e) { + if (e instanceof RenderingCancelledException) { + // ignore + } else { + throw e; // rethrow unknown exception + } + } finally { + if (!didCancel) { + setLoading(false); + } + } + }; + render(); } - }, [loadedPage, viewport, canvasInfo, setLoading]); + return () => { + didCancel = true; + task?.cancel(); + }; + }, [loadedPage, currentPage, setLoading]); useEffect(() => { if (setHideToolbarControls) { @@ -107,6 +141,7 @@ const PdfViewer: FC = ({ } }, [setHideToolbarControls]); + const { canvasInfo } = currentPage || {}; return ( Date: Wed, 20 Oct 2021 13:47:13 +0900 Subject: [PATCH 03/51] feat: add pdf text layer support --- .../PdfViewer/PdfViewer.stories.tsx | 18 +- .../components/PdfViewer/PdfViewer.tsx | 54 ++++- .../PdfViewer/PdfViewerTextLayer.tsx | 207 ++++++++++++++++++ .../components/PdfViewer/typings.d.ts | 53 +++++ .../_document-preview-pdf-viewer.scss | 58 +++++ 5 files changed, 379 insertions(+), 11 deletions(-) create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx index e6cdc54b1..6e7b31d12 100644 --- 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx @@ -1,6 +1,7 @@ import React from 'react'; import { storiesOf } from '@storybook/react'; -import { withKnobs, radios, number } from '@storybook/addon-knobs'; +import { withKnobs, radios, number, boolean } from '@storybook/addon-knobs'; +import { action } from '@storybook/addon-actions'; import PdfViewer from './PdfViewer'; import { document as doc } from 'components/DocumentPreview/__fixtures__/Art Effects.pdf'; @@ -32,6 +33,19 @@ storiesOf('DocumentPreview/components/PdfViewer', module) const zoom = radios(zoomKnob.label, zoomKnob.options, zoomKnob.defaultValue); const scale = parseFloat(zoom); + const showTextLayer = boolean('Show text layer', false); - return {}} />; + const setLoadingAction = action('setLoading'); + const setTextLayerInfoAction = action('setTextLayerInfo'); + + return ( + + ); }); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx index 5ab2ee420..d0ee15607 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx @@ -1,4 +1,5 @@ import React, { FC, useEffect, useRef, useState, useMemo } from 'react'; +import cx from 'classnames'; import PdfjsLib, { PDFDocumentProxy, PDFPageProxy, @@ -7,12 +8,15 @@ import PdfjsLib, { } from 'pdfjs-dist'; import PdfjsWorkerAsText from 'pdfjs-dist/build/pdf.worker.min.js'; import { settings } from 'carbon-components'; +import PdfViewerTextLayer, { PdfTextLayerInfo } from './PdfViewerTextLayer'; const { RenderingCancelledException } = PdfjsLib as any; 
setupPdfjs(); interface Props { + className?: string; + /** * PDF file data as base64-encoded string */ @@ -28,6 +32,16 @@ interface Props { */ scale: number; + /** + * Render text layer + */ + showTextLayer?: boolean; + + /** + * Text layer class name. Only applicable when showTextLayer is true + */ + textLayerClassName?: string; + /** * Callback invoked with page count, once `file` has been parsed */ @@ -40,15 +54,24 @@ interface Props { * Callback which is invoked with whether to enable/disable toolbar controls */ setHideToolbarControls?: (disabled: boolean) => void; + /** + * Callback for text layer info + */ + setTextLayerInfo?: (info: PdfTextLayerInfo | null) => any; } const PdfViewer: FC = ({ + className, file, page, scale, + showTextLayer, + textLayerClassName, setPageCount, setLoading, - setHideToolbarControls + setHideToolbarControls, + setTextLayerInfo, + children }) => { const canvasRef = useRef(null); @@ -141,15 +164,28 @@ const PdfViewer: FC = ({ } }, [setHideToolbarControls]); - const { canvasInfo } = currentPage || {}; + const classNameBase = `${settings.prefix}--document-preview-pdf-viewer`; + const { loadedPage: currentLoadedPage, canvasInfo } = currentPage || {}; return ( - +
+ + {showTextLayer && ( + + )} + {children} +
); }; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx new file mode 100644 index 000000000..f6c38f08c --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx @@ -0,0 +1,207 @@ +import React, { FC, useEffect, useRef, useState } from 'react'; +import cx from 'classnames'; +import PDFJSLib, { PDFPageProxy, PDFPageViewport, TextContent, TextContentItem } from 'pdfjs-dist'; +import { EventBus } from 'pdfjs-dist/lib/web/ui_utils'; +import { TextLayerBuilder } from 'pdfjs-dist/lib/web/text_layer_builder'; + +const { RenderingCancelledException } = PDFJSLib as any; + +interface Props { + className?: string; + + /** + * PDF page from pdfjs + */ + loadedPage: PDFPageProxy | null | undefined; + + /** + * Page number, starting at 1 + */ + page: number; + + /** + * Zoom factor, where `1` is equal to 100% + */ + scale: number; + + /** + * Callback for text layer info + */ + setTextLayerInfo?: (info: PdfTextLayerInfo | null) => any; +} + +export type PdfTextLayerInfo = { + /** + * PDF text content + */ + textContent: TextContent & { + styles: { [styleName: string]: CSSStyleDeclaration }; + }; + + /** + * Text span DOM elements rendered on the text layer + */ + textDivs: HTMLElement[]; + + /** + * Pdf page viewport used to render text items + */ + viewport: PDFPageViewport; + + /** + * Page number, starting at 1 + */ + page: number; +}; + +const PdfViewerTextLayer: FC = ({ + className, + loadedPage, + page = 1, + scale = 1, + setTextLayerInfo: setTextLayerInfoCallback = () => {} +}) => { + const textLayerRef = useRef(null); + const textLayerDiv = textLayerRef.current; + + const [textRenderInfo, setTextRenderInfo] = useState<{ + page: number; + scale: number; + textContent: TextContent; + viewport: PDFPageViewport; + } | 
null>(null); + + useEffect(() => { + async function loadTextInfo() { + const isPageReady = !!loadedPage && loadedPage.pageNumber === page; + if (isPageReady) { + const viewport = loadedPage.getViewport({ scale }); + const textContent = await loadedPage.getTextContent(); + setTextRenderInfo({ textContent, viewport, page, scale }); + } + } + loadTextInfo(); + }, [loadedPage, page, scale]); + + const textLayerBuilderRef = useRef(null); // ref for debugging purpose + const [textLayerInfo, setTextLayerInfo] = useState(null); + useEffect(() => { + let textLayerBuilder: TextLayerBuilder | null = null; + async function loadTextLayer() { + let textLayerInfo: PdfTextLayerInfo | null = null; + + if (textLayerDiv && textRenderInfo) { + const { textContent, viewport, scale, page } = textRenderInfo; + // prepare text layer + textLayerBuilder = textLayerBuilderRef.current = new TextLayerBuilder({ + textLayerDiv, + viewport, + eventBus: new EventBus(), + pageIndex: page - 1 + }); + textLayerBuilder.setTextContent(textContent); + + // render + textLayerDiv.innerHTML = ''; + try { + const deferredRenderEnd = (() => { + let resolve: null | Function = null; + const promise = new Promise(resolvePromise => { + resolve = resolvePromise; + }); + + const listener = () => { + resolve!(); + textLayerBuilder?.eventBus.off('textlayerrendered', listener); + }; + textLayerBuilder.eventBus.on('textlayerrendered', listener); + + return { promise }; + })(); + + textLayerBuilder.render(); + await deferredRenderEnd.promise; + + // fix text divs + _adjustTextDivs(textLayerBuilder.textDivs, textContent.items, scale); + + textLayerInfo = { + textContent, + textDivs: textLayerBuilder.textDivs, + viewport, + page + }; + } catch (e) { + if (e instanceof RenderingCancelledException) { + // ignore + return; + } else { + throw e; + } + } + } + setTextLayerInfo(textLayerInfo); + } + loadTextLayer(); + + return () => { + textLayerBuilder?.cancel(); + // should we set text items?? 
+ }; + }, [textLayerDiv, textRenderInfo]); + + useEffect(() => { + setTextLayerInfoCallback(textLayerInfo); + }, [textLayerInfo, setTextLayerInfoCallback]); + + const rootClassName = cx(className, `textLayer`); + return ( +
+ ); +}; + +/** + * Adjust text span width + * @param textDivs + * @param textItems + * @param scale + */ +function _adjustTextDivs( + textDivs: HTMLElement[], + textItems: TextContentItem[] | null, + scale: number +): void { + const scaleXPattern = /scaleX\(([\d.]+)\)/; + (textDivs || []).forEach((textDivElm, index) => { + const textItem = textItems?.[index]; + if (!textItem) return; + + const expectedWidth = textItem.width * scale; + const actualWidth = textDivElm.getBoundingClientRect().width; + + function getScaleX(element: HTMLElement) { + const match = element.style.transform?.match(scaleXPattern); + if (match) { + return parseFloat(match[1]); + } + return null; + } + const currentScaleX = getScaleX(textDivElm); + if (currentScaleX && !isNaN(currentScaleX)) { + const newScale = `scaleX(${(expectedWidth / actualWidth) * currentScaleX})`; + textDivElm.style.transform = textDivElm.style.transform.replace(scaleXPattern, newScale); + } else { + const newScale = `scaleX(${expectedWidth / actualWidth})`; + textDivElm.style.transform = newScale; + } + }); +} + +export default PdfViewerTextLayer; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/typings.d.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/typings.d.ts index 21a04c613..44544e96f 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/typings.d.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/typings.d.ts @@ -1 +1,54 @@ declare module 'pdfjs-dist/build/pdf.worker.min.js'; + +// +// Declare modules and their types that is referred from PDF text layer rendering. 
+// Unused properties are commented out +// +declare module 'pdfjs-dist/lib/web/ui_utils' { + export class EventBus { + on(eventName: string, listener: any): void; + off(eventName: string, listener: any): void; + dispatch(eventName: string, args?: any): void; + } + // export function getGlobalEventBus(): EventBus; +} + +declare module 'pdfjs-dist/lib/web/text_layer_builder' { + import { EventBus } from 'pdfjs-dist/lib/web/ui_utils'; + import PdfjsLib from 'pdfjs-dist'; + + export class TextLayerBuilder { + constructor(options: TextLayerBuilder.Options); + + textLayerDiv: HTMLElement; + eventBus: EventBus; + textContent: PdfjsLib.TextContent | null; + // textContentItemsStr: any[]; + renderingDone: boolean; + // pageIdx: number; + pageNumber: number; + // matches: any[]; + // viewport: PdfjsLib.PDFPageViewport; + textDivs: HTMLElement[]; + // findController: any; + textLayerRenderTask: TextLayerRenderTask; + // enhanceTextSelection: any; + + render(timeout?: number): void; + cancel(): void; + // setTextContentStream(readableStream: any): void; + setTextContent(textContent: PdfjsLib.TextContent): void; + } + export const DefaultTextLayerFactory; + + declare namespace TextLayerBuilder { + export interface Options { + textLayerDiv: HTMLElement; + eventBus: EventBus; + pageIndex: number; + viewport: PdfjsLib.PDFPageViewport; + // findController?: any; + // enhanceTextSelection?: any; + } + } +} diff --git a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss index 19169fe5f..f3bcda870 100644 --- a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss +++ b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss @@ -1,3 +1,61 @@ .#{$prefix}--document-preview-pdf-viewer { + position: relative; +} + +.#{$prefix}--document-preview-pdf-viewer--canvas { + 
transform-origin: left top 0px; +} + +.#{$prefix}--document-preview-pdf-viewer--text { transform-origin: left top 0px; + + // + // NOTE: import textLayer styles from ~pdfjs-dist/web/pdf_viewer.css + // @import "~pdfjs-dist/web/pdf_viewer" doesn't work for loading image + // + &.textLayer { + position: absolute; + text-align: initial; + left: 0; + top: 0; + right: 0; + bottom: 0; + overflow: hidden; + opacity: 0.2; + line-height: 1; + } + + &.textLayer span, + &.textLayer br { + color: transparent; + position: absolute; + white-space: pre; + cursor: text; + transform-origin: 0% 0%; + } + + &.textLayer ::selection { + background: rgba(0, 0, 255, 1); + } + + /* Avoids https://github.com/mozilla/pdf.js/issues/13840 in Chrome */ + &.textLayer br::selection { + background: transparent; + } + + &.textLayer .endOfContent { + display: block; + position: absolute; + left: 0; + top: 100%; + right: 0; + bottom: 0; + z-index: -1; + cursor: default; + user-select: none; + } + + &.textLayer .endOfContent.active { + top: 0; + } } From 9f0dcd527061452bcbe2f73449c8855a6ef5e77b Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Tue, 16 Nov 2021 22:33:28 +0900 Subject: [PATCH 04/51] fix: apply review comments --- .../components/PdfViewer/PdfViewer.tsx | 28 ++++++------- .../PdfViewer/PdfViewerTextLayer.tsx | 41 +++++++------------ .../_document-preview-pdf-viewer.scss | 3 +- 3 files changed, 30 insertions(+), 42 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx index d0ee15607..209b371d1 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx @@ -79,6 +79,7 @@ const PdfViewer: FC = ({ const [loadedFile, setLoadedFile] = useState(null); const 
[loadedPage, setLoadedPage] = useState(null); + // load PDF file useEffect(() => { let didCancel = false; @@ -100,6 +101,7 @@ const PdfViewer: FC = ({ }; }, [file, setPageCount]); + // load page from PDF file useEffect(() => { let didCancel = false; @@ -118,21 +120,18 @@ const PdfViewer: FC = ({ }; }, [loadedFile, page]); - const currentPage = useMemo(() => { - const isPageValid = !!loadedPage && loadedPage.pageNumber === page; - if (isPageValid) { - const viewport = loadedPage?.getViewport({ scale }); - const canvasInfo = viewport ? getCanvasInfo(viewport) : undefined; - return { loadedPage, viewport, canvasInfo }; - } - return null; - }, [loadedPage, page, scale]); + // extract canvas size of the current page + const [viewport, canvasInfo] = useMemo(() => { + const viewport = loadedPage?.getViewport({ scale }); + const canvasInfo = viewport ? getCanvasInfo(viewport) : undefined; + return [viewport, canvasInfo]; + }, [loadedPage, scale]); + // render the current page useEffect(() => { let didCancel = false; let task: PDFRenderTask | null = null; - const { loadedPage, viewport, canvasInfo } = currentPage || {}; if (loadedPage && !(loadedPage as any).then && viewport && canvasInfo) { const render = async () => { try { @@ -140,7 +139,8 @@ const PdfViewer: FC = ({ await task?.promise; } catch (e) { if (e instanceof RenderingCancelledException) { - // ignore + // Ignore. 
Rendering is interrupted by the effect cleanup method + // and another rendering will be taken place soon } else { throw e; // rethrow unknown exception } @@ -156,7 +156,7 @@ const PdfViewer: FC = ({ didCancel = true; task?.cancel(); }; - }, [loadedPage, currentPage, setLoading]); + }, [loadedPage, viewport, canvasInfo, setLoading]); useEffect(() => { if (setHideToolbarControls) { @@ -165,7 +165,6 @@ const PdfViewer: FC = ({ }, [setHideToolbarControls]); const classNameBase = `${settings.prefix}--document-preview-pdf-viewer`; - const { loadedPage: currentLoadedPage, canvasInfo } = currentPage || {}; return (
= ({ {showTextLayer && ( diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx index f6c38f08c..e05a596ee 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx @@ -14,11 +14,6 @@ interface Props { */ loadedPage: PDFPageProxy | null | undefined; - /** - * Page number, starting at 1 - */ - page: number; - /** * Zoom factor, where `1` is equal to 100% */ @@ -57,7 +52,6 @@ export type PdfTextLayerInfo = { const PdfViewerTextLayer: FC = ({ className, loadedPage, - page = 1, scale = 1, setTextLayerInfo: setTextLayerInfoCallback = () => {} }) => { @@ -71,20 +65,21 @@ const PdfViewerTextLayer: FC = ({ viewport: PDFPageViewport; } | null>(null); + // load text content from the page useEffect(() => { async function loadTextInfo() { - const isPageReady = !!loadedPage && loadedPage.pageNumber === page; - if (isPageReady) { + if (loadedPage) { const viewport = loadedPage.getViewport({ scale }); const textContent = await loadedPage.getTextContent(); - setTextRenderInfo({ textContent, viewport, page, scale }); + setTextRenderInfo({ textContent, viewport, page: loadedPage.pageNumber, scale }); } } loadTextInfo(); - }, [loadedPage, page, scale]); + }, [loadedPage, scale]); const textLayerBuilderRef = useRef(null); // ref for debugging purpose const [textLayerInfo, setTextLayerInfo] = useState(null); + // render text content useEffect(() => { let textLayerBuilder: TextLayerBuilder | null = null; async function loadTextLayer() { @@ -104,23 +99,16 @@ const PdfViewerTextLayer: FC = ({ // render textLayerDiv.innerHTML = ''; try { - const deferredRenderEnd = (() => { - let resolve: null | Function = null; - const promise = 
new Promise(resolvePromise => { - resolve = resolvePromise; - }); - + const deferredRenderEndPromise = new Promise(resolve => { const listener = () => { - resolve!(); + resolve(undefined); textLayerBuilder?.eventBus.off('textlayerrendered', listener); }; - textLayerBuilder.eventBus.on('textlayerrendered', listener); - - return { promise }; - })(); + textLayerBuilder?.eventBus.on('textlayerrendered', listener); + }); textLayerBuilder.render(); - await deferredRenderEnd.promise; + await deferredRenderEndPromise; // fix text divs _adjustTextDivs(textLayerBuilder.textDivs, textContent.items, scale); @@ -133,20 +121,21 @@ const PdfViewerTextLayer: FC = ({ }; } catch (e) { if (e instanceof RenderingCancelledException) { - // ignore + // Ignore. Rendering is interrupted by useEffect cleanup method. + // Another rendering starts soon return; } else { - throw e; + throw e; // rethrow unknown exception } } } setTextLayerInfo(textLayerInfo); } + loadTextLayer(); return () => { textLayerBuilder?.cancel(); - // should we set text items?? 
}; }, [textLayerDiv, textRenderInfo]); @@ -168,7 +157,7 @@ const PdfViewerTextLayer: FC = ({ }; /** - * Adjust text span width + * Adjust text span width based on scale * @param textDivs * @param textItems * @param scale diff --git a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss index f3bcda870..bcce8911c 100644 --- a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss +++ b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss @@ -38,7 +38,8 @@ background: rgba(0, 0, 255, 1); } - /* Avoids https://github.com/mozilla/pdf.js/issues/13840 in Chrome */ + // Avoid unexpected text selection box in Chrome + // see https://github.com/mozilla/pdf.js/issues/13840 &.textLayer br::selection { background: transparent; } From cc0644610267820f1a210f2f89c172f6439108d1 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 18 Nov 2021 11:48:22 +0900 Subject: [PATCH 05/51] refactor: extract text rendering hook --- .../PdfViewer/PdfViewerTextLayer.tsx | 75 ++++++++++++------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx index e05a596ee..097a6d0ac 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx @@ -49,6 +49,20 @@ export type PdfTextLayerInfo = { page: number; }; +type PdfTextContentInfo = { + /** extracted PDF text content */ + textContent: TextContent; + + /** @see Props['scale'] */ + scale: number; + + /** @see 
PdfTextLayerInfo['viewport'] */ + viewport: PDFPageViewport; + + /** @see PdfTextLayerInfo['page'] */ + page: number; +}; + const PdfViewerTextLayer: FC = ({ className, loadedPage, @@ -58,27 +72,46 @@ const PdfViewerTextLayer: FC = ({ const textLayerRef = useRef(null); const textLayerDiv = textLayerRef.current; - const [textRenderInfo, setTextRenderInfo] = useState<{ - page: number; - scale: number; - textContent: TextContent; - viewport: PDFPageViewport; - } | null>(null); - // load text content from the page + const [textContentInfo, setTextContentInfo] = useState(null); useEffect(() => { async function loadTextInfo() { if (loadedPage) { const viewport = loadedPage.getViewport({ scale }); const textContent = await loadedPage.getTextContent(); - setTextRenderInfo({ textContent, viewport, page: loadedPage.pageNumber, scale }); + setTextContentInfo({ textContent, viewport, page: loadedPage.pageNumber, scale }); } } loadTextInfo(); }, [loadedPage, scale]); + // render text content + const [renderedTextInfo, setRenderedTextInfo] = useState(null); + useTextLayerRendering(textLayerDiv, textContentInfo, setRenderedTextInfo); + + useEffect(() => { + setTextLayerInfoCallback(renderedTextInfo); + }, [renderedTextInfo, setTextLayerInfoCallback]); + + const rootClassName = cx(className, `textLayer`); + return ( +
+ ); +}; + +function useTextLayerRendering( + textLayerDiv: HTMLDivElement | null, + textRenderInfo: PdfTextContentInfo | null, + setRenderedTextInfo?: (info: PdfTextLayerInfo | null) => any +) { const textLayerBuilderRef = useRef(null); // ref for debugging purpose - const [textLayerInfo, setTextLayerInfo] = useState(null); // render text content useEffect(() => { let textLayerBuilder: TextLayerBuilder | null = null; @@ -129,7 +162,9 @@ const PdfViewerTextLayer: FC = ({ } } } - setTextLayerInfo(textLayerInfo); + if (setRenderedTextInfo) { + setRenderedTextInfo(textLayerInfo); + } } loadTextLayer(); @@ -137,24 +172,8 @@ const PdfViewerTextLayer: FC = ({ return () => { textLayerBuilder?.cancel(); }; - }, [textLayerDiv, textRenderInfo]); - - useEffect(() => { - setTextLayerInfoCallback(textLayerInfo); - }, [textLayerInfo, setTextLayerInfoCallback]); - - const rootClassName = cx(className, `textLayer`); - return ( -
- ); -}; + }, [setRenderedTextInfo, textLayerDiv, textRenderInfo]); +} /** * Adjust text span width based on scale From de5731fad760918ea523ab3d6318bf5757d807da Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Fri, 19 Nov 2021 23:46:04 +0900 Subject: [PATCH 06/51] refactor: extract async func call hook --- .../PdfViewer/PdfViewer.stories.tsx | 4 +- .../components/PdfViewer/PdfViewer.tsx | 114 ++++--------- .../PdfViewer/PdfViewerTextLayer.tsx | 160 +++++++----------- .../PdfViewer/useAsyncFunctionCall.ts | 46 +++++ 4 files changed, 144 insertions(+), 180 deletions(-) create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/useAsyncFunctionCall.ts diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx index 6e7b31d12..11a3d0f08 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx @@ -36,7 +36,7 @@ storiesOf('DocumentPreview/components/PdfViewer', module) const showTextLayer = boolean('Show text layer', false); const setLoadingAction = action('setLoading'); - const setTextLayerInfoAction = action('setTextLayerInfo'); + const setRenderedTextAction = action('setRenderedText'); return ( ); }); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx index 209b371d1..4a0de76ef 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx @@ 
-1,4 +1,4 @@ -import React, { FC, useEffect, useRef, useState, useMemo } from 'react'; +import React, { FC, useEffect, useRef, useMemo, useCallback } from 'react'; import cx from 'classnames'; import PdfjsLib, { PDFDocumentProxy, @@ -8,9 +8,8 @@ import PdfjsLib, { } from 'pdfjs-dist'; import PdfjsWorkerAsText from 'pdfjs-dist/build/pdf.worker.min.js'; import { settings } from 'carbon-components'; -import PdfViewerTextLayer, { PdfTextLayerInfo } from './PdfViewerTextLayer'; - -const { RenderingCancelledException } = PdfjsLib as any; +import PdfViewerTextLayer, { PdfRenderedText } from './PdfViewerTextLayer'; +import useAsyncFunctionCall from './useAsyncFunctionCall'; setupPdfjs(); @@ -57,7 +56,7 @@ interface Props { /** * Callback for text layer info */ - setTextLayerInfo?: (info: PdfTextLayerInfo | null) => any; + setRenderedText?: (info: PdfRenderedText | null) => any; } const PdfViewer: FC = ({ @@ -70,93 +69,48 @@ const PdfViewer: FC = ({ setPageCount, setLoading, setHideToolbarControls, - setTextLayerInfo, + setRenderedText, children }) => { const canvasRef = useRef(null); - // In order to prevent unnecessary re-loading, loaded file and page are stored in state - const [loadedFile, setLoadedFile] = useState(null); - const [loadedPage, setLoadedPage] = useState(null); - - // load PDF file - useEffect(() => { - let didCancel = false; - - async function loadPdf(): Promise { - if (file) { - const newPdf = await _loadPdf(file); - if (!didCancel) { - setLoadedFile(newPdf); - if (setPageCount) { - setPageCount(newPdf.numPages); - } - } - } - } - loadPdf(); - - return (): void => { - didCancel = true; - }; - }, [file, setPageCount]); - - // load page from PDF file - useEffect(() => { - let didCancel = false; - - async function loadPage(): Promise { - if (loadedFile && page > 0) { - const newPage = await _loadPage(loadedFile, page); - if (!didCancel) { - setLoadedPage(newPage); - } - } - } - loadPage(); - - return (): void => { - didCancel = true; - }; - }, [loadedFile, 
page]); + const loadedFile = useAsyncFunctionCall( + useCallback(async () => (file ? await _loadPdf(file) : null), [file]) + ); + const loadedPage = useAsyncFunctionCall( + useCallback( + async () => (loadedFile && page > 0 ? await _loadPage(loadedFile, page) : null), + [loadedFile, page] + ) + ); - // extract canvas size of the current page const [viewport, canvasInfo] = useMemo(() => { const viewport = loadedPage?.getViewport({ scale }); const canvasInfo = viewport ? getCanvasInfo(viewport) : undefined; return [viewport, canvasInfo]; }, [loadedPage, scale]); - // render the current page - useEffect(() => { - let didCancel = false; - let task: PDFRenderTask | null = null; - - if (loadedPage && !(loadedPage as any).then && viewport && canvasInfo) { - const render = async () => { - try { - task = _renderPage(loadedPage, canvasRef.current!, viewport, canvasInfo); + // render page + useAsyncFunctionCall( + useCallback( + async (abortSignal: AbortSignal) => { + if (loadedPage && !(loadedPage as any).then && viewport && canvasInfo) { + const task = _renderPage(loadedPage, canvasRef.current!, viewport, canvasInfo); + abortSignal.addEventListener('abort', () => task?.cancel()); await task?.promise; - } catch (e) { - if (e instanceof RenderingCancelledException) { - // Ignore. 
Rendering is interrupted by the effect cleanup method - // and another rendering will be taken place soon - } else { - throw e; // rethrow unknown exception - } - } finally { - if (!didCancel) { - setLoading(false); - } + + setLoading(false); } - }; - render(); + }, + [canvasInfo, loadedPage, setLoading, viewport] + ) + ); + + useEffect(() => { + if (setPageCount && loadedFile) { + setPageCount(loadedFile.numPages); } - return () => { - didCancel = true; - task?.cancel(); - }; - }, [loadedPage, viewport, canvasInfo, setLoading]); + }, [loadedFile, setPageCount]); useEffect(() => { if (setHideToolbarControls) { @@ -179,7 +133,7 @@ const PdfViewer: FC = ({ className={cx(`${classNameBase}--text`, textLayerClassName)} loadedPage={loadedPage} scale={scale} - setTextLayerInfo={setTextLayerInfo} + setRenderedText={setRenderedText} /> )} {children} @@ -192,7 +146,7 @@ PdfViewer.defaultProps = { scale: 1 }; -function _loadPdf(data: string): Promise { +function _loadPdf(data: string): Promise { return PdfjsLib.getDocument({ data }).promise; } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx index 097a6d0ac..e6f536076 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx @@ -1,10 +1,9 @@ -import React, { FC, useEffect, useRef, useState } from 'react'; +import React, { FC, useEffect, useRef, useCallback } from 'react'; import cx from 'classnames'; -import PDFJSLib, { PDFPageProxy, PDFPageViewport, TextContent, TextContentItem } from 'pdfjs-dist'; +import { PDFPageProxy, PDFPageViewport, TextContent, TextContentItem } from 'pdfjs-dist'; import { EventBus } from 'pdfjs-dist/lib/web/ui_utils'; import { 
TextLayerBuilder } from 'pdfjs-dist/lib/web/text_layer_builder'; - -const { RenderingCancelledException } = PDFJSLib as any; +import useAsyncFunctionCall from './useAsyncFunctionCall'; interface Props { className?: string; @@ -22,10 +21,10 @@ interface Props { /** * Callback for text layer info */ - setTextLayerInfo?: (info: PdfTextLayerInfo | null) => any; + setRenderedText?: (info: PdfRenderedText | null) => any; } -export type PdfTextLayerInfo = { +export type PdfRenderedText = { /** * PDF text content */ @@ -49,49 +48,56 @@ export type PdfTextLayerInfo = { page: number; }; -type PdfTextContentInfo = { - /** extracted PDF text content */ - textContent: TextContent; - - /** @see Props['scale'] */ - scale: number; - - /** @see PdfTextLayerInfo['viewport'] */ - viewport: PDFPageViewport; - - /** @see PdfTextLayerInfo['page'] */ - page: number; -}; - const PdfViewerTextLayer: FC = ({ className, loadedPage, scale = 1, - setTextLayerInfo: setTextLayerInfoCallback = () => {} + setRenderedText = () => {} }) => { const textLayerRef = useRef(null); const textLayerDiv = textLayerRef.current; // load text content from the page - const [textContentInfo, setTextContentInfo] = useState(null); - useEffect(() => { - async function loadTextInfo() { + const loadedText = useAsyncFunctionCall( + useCallback(async () => { if (loadedPage) { const viewport = loadedPage.getViewport({ scale }); const textContent = await loadedPage.getTextContent(); - setTextContentInfo({ textContent, viewport, page: loadedPage.pageNumber, scale }); + return { textContent, viewport, page: loadedPage.pageNumber, scale }; } - } - loadTextInfo(); - }, [loadedPage, scale]); + return null; + }, [loadedPage, scale]) + ); // render text content - const [renderedTextInfo, setRenderedTextInfo] = useState(null); - useTextLayerRendering(textLayerDiv, textContentInfo, setRenderedTextInfo); + const renderedText = useAsyncFunctionCall( + useCallback( + async (signal: AbortSignal) => { + if (textLayerDiv && loadedText) 
{ + const { textContent, viewport, scale, page } = loadedText; + + const builder = new TextLayerBuilder({ + textLayerDiv, + viewport, + eventBus: new EventBus(), + pageIndex: page - 1 + }); + signal.addEventListener('abort', () => builder.cancel()); + + await _renderTextLayer(builder, textContent, textLayerDiv, scale); + return { textContent, viewport, page, textDivs: builder.textDivs }; + } + return undefined; + }, + [loadedText, textLayerDiv] + ) + ); useEffect(() => { - setTextLayerInfoCallback(renderedTextInfo); - }, [renderedTextInfo, setTextLayerInfoCallback]); + if (renderedText !== undefined) { + setRenderedText(renderedText); + } + }, [renderedText, setRenderedText]); const rootClassName = cx(className, `textLayer`); return ( @@ -99,80 +105,38 @@ const PdfViewerTextLayer: FC = ({ className={rootClassName} ref={textLayerRef} style={{ - width: `${textContentInfo?.viewport?.width ?? 0}px`, - height: `${textContentInfo?.viewport?.height ?? 0}px` + width: `${loadedText?.viewport?.width ?? 0}px`, + height: `${loadedText?.viewport?.height ?? 
0}px` }} /> ); }; -function useTextLayerRendering( - textLayerDiv: HTMLDivElement | null, - textRenderInfo: PdfTextContentInfo | null, - setRenderedTextInfo?: (info: PdfTextLayerInfo | null) => any +/** + * Render text into DOM using the text layer builder + */ +async function _renderTextLayer( + builder: TextLayerBuilder, + textContent: TextContent, + textLayerDiv: HTMLDivElement, + scale: number ) { - const textLayerBuilderRef = useRef(null); // ref for debugging purpose - // render text content - useEffect(() => { - let textLayerBuilder: TextLayerBuilder | null = null; - async function loadTextLayer() { - let textLayerInfo: PdfTextLayerInfo | null = null; - - if (textLayerDiv && textRenderInfo) { - const { textContent, viewport, scale, page } = textRenderInfo; - // prepare text layer - textLayerBuilder = textLayerBuilderRef.current = new TextLayerBuilder({ - textLayerDiv, - viewport, - eventBus: new EventBus(), - pageIndex: page - 1 - }); - textLayerBuilder.setTextContent(textContent); - - // render - textLayerDiv.innerHTML = ''; - try { - const deferredRenderEndPromise = new Promise(resolve => { - const listener = () => { - resolve(undefined); - textLayerBuilder?.eventBus.off('textlayerrendered', listener); - }; - textLayerBuilder?.eventBus.on('textlayerrendered', listener); - }); - - textLayerBuilder.render(); - await deferredRenderEndPromise; - - // fix text divs - _adjustTextDivs(textLayerBuilder.textDivs, textContent.items, scale); - - textLayerInfo = { - textContent, - textDivs: textLayerBuilder.textDivs, - viewport, - page - }; - } catch (e) { - if (e instanceof RenderingCancelledException) { - // Ignore. Rendering is interrupted by useEffect cleanup method. 
- // Another rendering starts soon - return; - } else { - throw e; // rethrow unknown exception - } - } - } - if (setRenderedTextInfo) { - setRenderedTextInfo(textLayerInfo); - } - } + builder.setTextContent(textContent); + + // render + textLayerDiv.innerHTML = ''; + const deferredRenderEndPromise = new Promise(resolve => { + const listener = () => { + resolve(undefined); + builder?.eventBus.off('textlayerrendered', listener); + }; + builder?.eventBus.on('textlayerrendered', listener); + }); - loadTextLayer(); + builder.render(); + await deferredRenderEndPromise; - return () => { - textLayerBuilder?.cancel(); - }; - }, [setRenderedTextInfo, textLayerDiv, textRenderInfo]); + _adjustTextDivs(builder.textDivs, textContent.items, scale); } /** diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/useAsyncFunctionCall.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/useAsyncFunctionCall.ts new file mode 100644 index 000000000..6f3d5f929 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/useAsyncFunctionCall.ts @@ -0,0 +1,46 @@ +import { useEffect, useState } from 'react'; + +type AsyncFunc = (signal: AbortSignal) => Promise; +type AsyncFuncReturnType = T extends AsyncFunc ? U : never; + +/** + * Call async function WRAPPED BY `useCallback` and return its result + * + * @param asyncFunction async function wrapped by `useCallback`. + * Take one parameter `setCancellable` to set _cancellable_ of the current async call. 
+ * @returns the result of the async function + */ +function useAsyncFunctionCall, ReturnType = AsyncFuncReturnType>( + asyncFunction: Func +): ReturnType | undefined { + const [result, setResult] = useState(); + + useEffect(() => { + let state: 'pending' | 'fulfilled' | 'rejected' = 'pending'; + const abortController = new AbortController(); + + asyncFunction(abortController.signal) + .then((promiseResult: ReturnType) => { + state = 'fulfilled'; + if (!abortController.signal.aborted && promiseResult !== undefined) { + setResult(promiseResult); + } + }) + .catch(err => { + state = 'rejected'; + if (!abortController.signal.aborted) { + throw err; + } + }); + + return (): void => { + if (state === 'pending') { + abortController.abort(); + } + }; + }, [asyncFunction]); + + return result; +} + +export default useAsyncFunctionCall; From f64fa0774851b7aa2bb7aa375321d1937b283941 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 24 Nov 2021 10:51:53 +0900 Subject: [PATCH 07/51] fix: revise how to import css from pdfjs --- .../_document-preview-pdf-viewer.scss | 54 +----------- .../document-preview/_pdfjs_web_mixins.scss | 83 +++++++++++++++++++ 2 files changed, 86 insertions(+), 51 deletions(-) create mode 100644 packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss diff --git a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss index bcce8911c..dce987167 100644 --- a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss +++ b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss @@ -1,5 +1,8 @@ +@import './pdfjs_web_mixins'; + .#{$prefix}--document-preview-pdf-viewer { position: relative; + @include pdfjsTextLayer; } .#{$prefix}--document-preview-pdf-viewer--canvas { @@ -8,55 +11,4 @@ 
.#{$prefix}--document-preview-pdf-viewer--text { transform-origin: left top 0px; - - // - // NOTE: import textLayer styles from ~pdfjs-dist/web/pdf_viewer.css - // @import "~pdfjs-dist/web/pdf_viewer" doesn't work for loading image - // - &.textLayer { - position: absolute; - text-align: initial; - left: 0; - top: 0; - right: 0; - bottom: 0; - overflow: hidden; - opacity: 0.2; - line-height: 1; - } - - &.textLayer span, - &.textLayer br { - color: transparent; - position: absolute; - white-space: pre; - cursor: text; - transform-origin: 0% 0%; - } - - &.textLayer ::selection { - background: rgba(0, 0, 255, 1); - } - - // Avoid unexpected text selection box in Chrome - // see https://github.com/mozilla/pdf.js/issues/13840 - &.textLayer br::selection { - background: transparent; - } - - &.textLayer .endOfContent { - display: block; - position: absolute; - left: 0; - top: 100%; - right: 0; - bottom: 0; - z-index: -1; - cursor: default; - user-select: none; - } - - &.textLayer .endOfContent.active { - top: 0; - } } diff --git a/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss b/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss new file mode 100644 index 000000000..abac06472 --- /dev/null +++ b/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss @@ -0,0 +1,83 @@ +@mixin pdfjsTextLayer { + // CSS from ~pdfjs-dist/web/pdf_viewer.css for scoped style + + // BEGIN-QUOTE --- awk '/^\/\*/,/\*\//' + /* Copyright 2014 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + // END-QUOTE + + // BEGIN-QUOTE --- awk '/^\.textLayer/,/}/' + .textLayer { + position: absolute; + left: 0; + top: 0; + right: 0; + bottom: 0; + overflow: hidden; + opacity: 0.2; + line-height: 1; + } + .textLayer > span { + color: transparent; + position: absolute; + white-space: pre; + cursor: text; + -webkit-transform-origin: 0% 0%; + transform-origin: 0% 0%; + } + .textLayer .highlight { + margin: -1px; + padding: 1px; + + background-color: rgb(180, 0, 170); + border-radius: 4px; + } + .textLayer .highlight.begin { + border-radius: 4px 0px 0px 4px; + } + .textLayer .highlight.end { + border-radius: 0px 4px 4px 0px; + } + .textLayer .highlight.middle { + border-radius: 0px; + } + .textLayer .highlight.selected { + background-color: rgb(0, 100, 0); + } + .textLayer ::-moz-selection { + background: rgb(0, 0, 255); + } + .textLayer ::selection { + background: rgb(0, 0, 255); + } + .textLayer .endOfContent { + display: block; + position: absolute; + left: 0px; + top: 100%; + right: 0px; + bottom: 0px; + z-index: -1; + cursor: default; + -webkit-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; + } + .textLayer .endOfContent.active { + top: 0px; + } + // END-QUOTE +} // end mixin From 3c08df91e4938c7a7483b7c26ee4c52bca94b9f6 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 24 Nov 2021 11:29:44 +0900 Subject: [PATCH 08/51] fix: install @types/pdfjs-dist to yarn2 --- .../discovery-react-components/package.json | 1 + yarn.lock | 30 ++++++++++++++++--- 2 files changed, 27 insertions(+), 4 
deletions(-) diff --git a/packages/discovery-react-components/package.json b/packages/discovery-react-components/package.json index 91f4d4191..49e64485a 100644 --- a/packages/discovery-react-components/package.json +++ b/packages/discovery-react-components/package.json @@ -43,6 +43,7 @@ "react-virtualized": "9.21.1" }, "devDependencies": { + "@types/pdfjs-dist": "^2.10.378", "cross-env": "^7.0.3", "css-loader": "^3.4.2", "madge": "^5.0.1", diff --git a/yarn.lock b/yarn.lock index f7460da53..513c03d4e 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2267,10 +2267,11 @@ __metadata: languageName: node linkType: hard -"@ibm-watson/discovery-react-components@^1.5.0-beta.1, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": +"@ibm-watson/discovery-react-components@^1.5.0-beta.2, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": version: 0.0.0-use.local resolution: "@ibm-watson/discovery-react-components@workspace:packages/discovery-react-components" dependencies: + "@types/pdfjs-dist": ^2.10.378 classnames: ^2.2.6 cross-env: ^7.0.3 css-loader: ^3.4.2 @@ -2298,7 +2299,7 @@ __metadata: languageName: unknown linkType: soft -"@ibm-watson/discovery-styles@^1.5.0-beta.1, @ibm-watson/discovery-styles@workspace:packages/discovery-styles": +"@ibm-watson/discovery-styles@^1.5.0-beta.2, @ibm-watson/discovery-styles@workspace:packages/discovery-styles": version: 0.0.0-use.local resolution: "@ibm-watson/discovery-styles@workspace:packages/discovery-styles" peerDependencies: @@ -4938,6 +4939,15 @@ __metadata: languageName: node linkType: hard +"@types/pdfjs-dist@npm:^2.10.378": + version: 2.10.378 + resolution: "@types/pdfjs-dist@npm:2.10.378" + dependencies: + pdfjs-dist: "*" + checksum: 36dd6010f7d23a995efdf11ea4ecb56f371f8bfb3e83a5c311666726e13238597ed1519701d0e2e6fb297270d01ad6aece9582b036fd4cb3aa301e61ea364978 + languageName: node + linkType: hard + "@types/prop-types@npm:*": version: 15.7.3 resolution: 
"@types/prop-types@npm:15.7.3" @@ -10236,8 +10246,8 @@ __metadata: resolution: "discovery-search-app@workspace:examples/discovery-search-app" dependencies: "@carbon/icons": ^10.5.0 - "@ibm-watson/discovery-react-components": ^1.5.0-beta.1 - "@ibm-watson/discovery-styles": ^1.5.0-beta.1 + "@ibm-watson/discovery-react-components": ^1.5.0-beta.2 + "@ibm-watson/discovery-styles": ^1.5.0-beta.2 body-parser: ^1.19.0 carbon-components: ^10.6.0 carbon-components-react: ^7.7.0 @@ -19518,6 +19528,18 @@ __metadata: languageName: node linkType: hard +"pdfjs-dist@npm:*": + version: 2.11.338 + resolution: "pdfjs-dist@npm:2.11.338" + peerDependencies: + worker-loader: ^3.0.8 + peerDependenciesMeta: + worker-loader: + optional: true + checksum: 1b946a3eeb3312a79e12b4e0aa066bb2b98487b9ee329666edc840a194602595cf84de9a3f6dbb023b808699a6ebb0cd06e751314fc4c0ffa56f7be12855d296 + languageName: node + linkType: hard + "pdfjs-dist@npm:^2.2.228": version: 2.2.228 resolution: "pdfjs-dist@npm:2.2.228" From c72222159d280202a567f2d33291f121d8b1973c Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 24 Nov 2021 12:07:45 +0900 Subject: [PATCH 09/51] feat: add script to update style --- packages/discovery-styles/package.json | 1 + .../discovery-styles/scripts/update-styles.sh | 19 +++++++++++++++++++ .../document-preview/_pdfjs_web_mixins.scss | 8 ++++---- 3 files changed, 24 insertions(+), 4 deletions(-) create mode 100755 packages/discovery-styles/scripts/update-styles.sh diff --git a/packages/discovery-styles/package.json b/packages/discovery-styles/package.json index 3bbdf2c64..7c6b9e05f 100644 --- a/packages/discovery-styles/package.json +++ b/packages/discovery-styles/package.json @@ -7,6 +7,7 @@ "repository": "https://github.com/watson-developer-cloud/discovery-components", "main": "scss/index.scss", "scripts": { + "prebuild": "scripts/update-styles.sh", "build": "node-sass --importer=../../node_modules/node-sass-tilde-importer --source-map=true scss/index.scss css/index.css", 
"prepublish": "yarn run build", "start": "yarn run build -- --watch", diff --git a/packages/discovery-styles/scripts/update-styles.sh b/packages/discovery-styles/scripts/update-styles.sh new file mode 100755 index 000000000..2c112101f --- /dev/null +++ b/packages/discovery-styles/scripts/update-styles.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +PDFJS_WEB_CSS=../../node_modules/pdfjs-dist/web/pdf_viewer.css +PDFJS_SCSS=scss/components/document-preview/_pdfjs_web_mixins.scss + +function update_pdfjs_scss() { + key=$1 + tmp=$PDFJS_SCSS.tmp + + sed -e "/BEGIN-QUOTE $key/q" $PDFJS_SCSS > $tmp + cat >> $tmp + sed -ne "/END-QUOTE $key/,\$p" $PDFJS_SCSS >> $tmp + cp $tmp $PDFJS_SCSS; + rm $tmp; +} + +cat $PDFJS_WEB_CSS | awk '/^\/\*/,/\*\//' | update_pdfjs_scss "COMMENT" +cat $PDFJS_WEB_CSS | awk '/^\.textLayer/,/}/' | update_pdfjs_scss "TEXT-LAYER" +../../node_modules/.bin/prettier --write $PDFJS_SCSS diff --git a/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss b/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss index abac06472..37e94e1c8 100644 --- a/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss +++ b/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss @@ -1,7 +1,7 @@ @mixin pdfjsTextLayer { // CSS from ~pdfjs-dist/web/pdf_viewer.css for scoped style - // BEGIN-QUOTE --- awk '/^\/\*/,/\*\//' + // BEGIN-QUOTE COMMENT /* Copyright 2014 Mozilla Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,9 +16,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // END-QUOTE + // END-QUOTE COMMENT - // BEGIN-QUOTE --- awk '/^\.textLayer/,/}/' + // BEGIN-QUOTE TEXT-LAYER .textLayer { position: absolute; left: 0; @@ -79,5 +79,5 @@ .textLayer .endOfContent.active { top: 0px; } - // END-QUOTE + // END-QUOTE TEXT-LAYER } // end mixin From 69ff0425f2a74f9b35ad7191cc50b945366fd319 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 24 Nov 2021 12:50:02 +0900 Subject: [PATCH 10/51] refactor: revise script for importing css --- .../discovery-styles/scripts/update-styles.sh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/packages/discovery-styles/scripts/update-styles.sh b/packages/discovery-styles/scripts/update-styles.sh index 2c112101f..49ab2c595 100755 --- a/packages/discovery-styles/scripts/update-styles.sh +++ b/packages/discovery-styles/scripts/update-styles.sh @@ -3,17 +3,18 @@ PDFJS_WEB_CSS=../../node_modules/pdfjs-dist/web/pdf_viewer.css PDFJS_SCSS=scss/components/document-preview/_pdfjs_web_mixins.scss -function update_pdfjs_scss() { - key=$1 - tmp=$PDFJS_SCSS.tmp +function replace_quote() { + file=$1 + key=$2 + tmp=$file.tmp - sed -e "/BEGIN-QUOTE $key/q" $PDFJS_SCSS > $tmp + sed -e "/BEGIN-QUOTE $key/q" $file > $tmp cat >> $tmp - sed -ne "/END-QUOTE $key/,\$p" $PDFJS_SCSS >> $tmp - cp $tmp $PDFJS_SCSS; + sed -ne "/END-QUOTE $key/,\$p" $file >> $tmp + cp $tmp $file; rm $tmp; } -cat $PDFJS_WEB_CSS | awk '/^\/\*/,/\*\//' | update_pdfjs_scss "COMMENT" -cat $PDFJS_WEB_CSS | awk '/^\.textLayer/,/}/' | update_pdfjs_scss "TEXT-LAYER" +cat $PDFJS_WEB_CSS | awk '/^\/\*/,/\*\//' | replace_quote $PDFJS_SCSS "COMMENT" +cat $PDFJS_WEB_CSS | awk '/^\.textLayer/,/}/' | replace_quote $PDFJS_SCSS "TEXT-LAYER" ../../node_modules/.bin/prettier --write $PDFJS_SCSS From c0771fc93a238dc3d18cebcefc656c301d32f7db Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 10 Nov 2021 15:19:39 +0900 Subject: [PATCH 11/51] feat: add types and common utilities --- .../components/PdfViewerHighlight/types.ts 
| 39 +++ .../utils/common/TextNormalizer.ts | 266 ++++++++++++++++++ .../common/__tests__/TextNormalizer.test.ts | 142 ++++++++++ .../utils/common/__tests__/bboxUtils.test.ts | 41 +++ .../common/__tests__/findLargestIndex.test.ts | 46 +++ .../common/__tests__/textSpanUtils.test.ts | 135 +++++++++ .../utils/common/bboxUtils.ts | 76 +++++ .../utils/common/documentUtils.ts | 53 ++++ .../utils/common/findLargestIndex.ts | 34 +++ .../utils/common/nonEmpty.ts | 3 + .../utils/common/textSpanUtils.ts | 58 ++++ 11 files changed, 893 insertions(+) create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/TextNormalizer.test.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/findLargestIndex.test.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/textSpanUtils.test.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/documentUtils.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/findLargestIndex.ts create mode 100644 
packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts new file mode 100644 index 000000000..5496895c1 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts @@ -0,0 +1,39 @@ +import { Bbox as DocumentPreviewBbox } from '../../../DocumentPreview/types'; +import { Location } from 'utils/document/processDoc'; + +// (re-)export useful types +export type Bbox = DocumentPreviewBbox; +export type TextSpan = [number, number]; + +/** + * A document. Same to QueryResult, but this more focuses on document fields + */ +export type DocumentFields = { [fieldName: string]: string[] | undefined }; + +/** + * Highlight on a document field + */ +export type DocumentFieldHighlight = { + field: string; + fieldIndex: number; + location: Location; + className?: string; +}; + +/** + * Highlight shape on a page, which consists of boundary boxes + */ +export interface HighlightShape { + boxes: HighlightShapeBox[]; + className?: string; +} + +/** + * Boundary box for a highlight + */ +export interface HighlightShapeBox { + bbox: Bbox; + dir?: string; // e.g. ltr, rtl. 
ltr by default + isStart?: boolean; + isEnd?: boolean; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts new file mode 100644 index 000000000..8eab9cf07 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts @@ -0,0 +1,266 @@ +import { TextSpan } from '../../types'; +import { END, spanLen, START } from './textSpanUtils'; + +type SpanMapping = { rawSpan: TextSpan; normalizedSpan: TextSpan }; + +const SPACES = { + normal: () => ' ', + regexString: '\\s+' +}; + +const DOUBLE_QUOTE = { + normal: () => '"', + regexString: `[${[ + '«', // U+00AB + '»', // U+00BB + '“', // U+201C + '”', // U+201D + '„', // U+201E + '‟', // U+201F + '❝', // U+275D + '❞', // U+275E + '⹂', // U+2E42 + '〝', // U+301D + '〞', // U+301E + '〟', // U+301F + '"' // U+FF02 + ].join('')}]` +}; + +const QUOTE = { + normal: () => "'", + regexString: `[${[ + '‹', // U+2039 + '›', // U+203A + '’', // U+2019 + '❮', // U+276E + '❯', // U+276F + '‘', // U+2018 + '‚', // U+201A + '‛', // U+201B + '❛', // U+275B + '❜', // U+275C + '❟' // U+275F + ].join('')}]` +}; + +const SURROGATE_PAIR = { + normal: (_: string) => '_', + regexString: '[\uD800-\uDBFF][\uDC00-\uDFFF]' +}; + +// remove "Combining Diacritical Marks" from the string +// NOTE: we may have to do this after conversion again +// str.normalize("NFD").replace(/[\u0300-\u036f]/g, "") +const DIACRITICAL_MARK = { + normal: () => '', + regexString: '[\u0300-\u036f]' +}; +const DIACRITICAL_MARK_REGEX = new RegExp(DIACRITICAL_MARK.regexString, 'g'); + +function normalizeDiacriticalMarks(text: string, keepLength = false) { + const r = text + .normalize('NFD') + .replace(DIACRITICAL_MARK_REGEX, DIACRITICAL_MARK.normal) + .normalize('NFC'); + if 
(keepLength && r.length !== text.length) { + // + // String.normalize may change length of a string. `keepLength` flag keeps string + // length after conversion by padding or truncating a string. + // + return r.substring(0, text.length).padEnd(text.length, ' '); + } + return r; +} + +const NORMALIZATIONS = [SPACES, DOUBLE_QUOTE, QUOTE, SURROGATE_PAIR, DIACRITICAL_MARK].map(n => ({ + ...n, + regex: new RegExp(n.regexString, 'g') +})); + +// regex to match all the chars to normalize. +// the regex is: /(\s+)|(["""])|(['''])|([\u8D..FF])|([\u03..6f])/g +const NORMALIZATIONS_REGEX = new RegExp( + NORMALIZATIONS.map(n => `(${n.regexString})`).join('|'), + 'g' +); + +/** + * Normalize text + * @param text text to normalize + * @returns normalized text @see TextNormalizer + */ +export function normalizeText(text: string) { + const r = NORMALIZATIONS.reduce((text, n) => { + return text.replace(n.regex, m => n.normal(m)); + }, text); + return normalizeDiacriticalMarks(r); +} + +/** + * Text normalizer with mapping between spans on original and normalized text + * + * Normalize the following in a text: + * - two or more consequent spaces + * - single or double quote + * - surrogate pairs + * - diacritical marks (accent) + */ +export class TextNormalizer { + readonly rawText: string; + readonly normalizedText: string; + private readonly normalizationMappings: SpanMapping[]; + + constructor(rawText: string) { + this.rawText = rawText; + + let normalizedText = ''; + const addNormalizedText = (text: string) => { + normalizedText += normalizeDiacriticalMarks(text, true); + }; + + const normalizationMappings: SpanMapping[] = []; + const re = NORMALIZATIONS_REGEX; + let cur = 0; + let match = re.exec(this.rawText); + while (match != null) { + const originalChar = match[0]; + let normalizedChar = match[0]; + for (let i = 0; i < match.length - 1; i += 1) { + if (match[i + 1] != null) { + normalizedChar = NORMALIZATIONS[i].normal(match[0]); + break; + } + } + const needNormalize = 
originalChar !== normalizedChar; + + if (match.index > cur) { + const newText = this.rawText.substring(cur, match.index); + if (needNormalize) { + const rawSpan: TextSpan = [cur, match.index]; + const normalizedSpan: TextSpan = [ + normalizedText.length, + normalizedText.length + newText.length + ]; + normalizationMappings.push({ rawSpan, normalizedSpan }); + addNormalizedText(newText); + cur += newText.length; + } + } + + if (needNormalize) { + const newText = normalizedChar; + const rawSpan: TextSpan = [match.index, match.index + match[0].length]; + const normalizedSpan: TextSpan = [ + normalizedText.length, + normalizedText.length + newText.length + ]; + normalizationMappings.push({ rawSpan, normalizedSpan }); + addNormalizedText(newText); + cur = re.lastIndex; + } + match = re.exec(this.rawText); + } + if (cur < this.rawText.length) { + const newText = this.rawText.substring(cur); + const rawSpan: TextSpan = [cur, cur + newText.length]; + const normalizedSpan: TextSpan = [ + normalizedText.length, + normalizedText.length + newText.length + ]; + normalizationMappings.push({ rawSpan, normalizedSpan }); + addNormalizedText(newText); + } + this.normalizedText = normalizedText; + this.normalizationMappings = optimizeSpanMappings(normalizationMappings); + } + + toNormalized(rawSpan: TextSpan): TextSpan { + const [rawBegin, rawEnd] = rawSpan; + + const normalizedIndex = (raw: number) => { + if (raw < 0) { + return raw; + } + const beginIndex = this.normalizationMappings.findIndex(({ rawSpan }) => raw < rawSpan[END]); + if (beginIndex >= 0) { + const { rawSpan, normalizedSpan } = this.normalizationMappings[beginIndex]; + return mapCharIndexOnSpans(raw, { from: rawSpan, to: normalizedSpan }); + } + const last = this.normalizationMappings[this.normalizationMappings.length - 1]; + return raw - last.rawSpan[END] + last.normalizedSpan[END]; + }; + return [normalizedIndex(rawBegin), normalizedIndex(rawEnd)]; + } + + toRaw(normalizedSpan: TextSpan): TextSpan { + const 
[normalizedBegin, normalizedEnd] = normalizedSpan; + + const rawIndex = (normalized: number) => { + if (normalized < 0) { + return normalized; + } + const beginIndex = this.normalizationMappings.findIndex( + ({ normalizedSpan }) => normalized < normalizedSpan[END] + ); + if (beginIndex >= 0) { + const { rawSpan, normalizedSpan } = this.normalizationMappings[beginIndex]; + return mapCharIndexOnSpans(normalized, { from: normalizedSpan, to: rawSpan }); + } + const last = this.normalizationMappings[this.normalizationMappings.length - 1]; + return normalized - last.normalizedSpan[END] + last.rawSpan[END]; + }; + return [rawIndex(normalizedBegin), rawIndex(normalizedEnd)]; + } + + normalize(text: string) { + return normalizeText(text); + } + + isBlank(text: string) { + return text.length === 0 || text.trim().length === 0 || text.match(/^\s*$/); + } +} + +/** + * Map charIndex on a 'from' span to index on 'to' span + * @param charIndex char index to map + * @param mapping {from: Span, to: Span} spans + * @returns + */ +function mapCharIndexOnSpans( + charIndex: number, + { from: fromSpan, to: toSpan }: { from: TextSpan; to: TextSpan } +): number { + if (spanLen(fromSpan) === spanLen(toSpan)) { + return toSpan[START] + (charIndex - fromSpan[START]); + } + return ( + toSpan[START] + + Math.round((charIndex - fromSpan[START]) * (spanLen(toSpan) / spanLen(fromSpan))) + ); +} + +function optimizeSpanMappings(mappings: SpanMapping[]) { + const sameLength = (mapping: SpanMapping) => + spanLen(mapping.normalizedSpan) === spanLen(mapping.rawSpan); + const isShifted = (a: SpanMapping, b: SpanMapping) => + b.normalizedSpan[START] - a.normalizedSpan[START] === b.rawSpan[START] - a.rawSpan[START]; + + return mappings.reduce((acc, mapping) => { + const lastMapping = acc.length > 0 ? 
acc[acc.length - 1] : null; + if ( + sameLength(mapping) && + lastMapping && + sameLength(lastMapping) && + isShifted(lastMapping, mapping) + ) { + // merge mappings + lastMapping.normalizedSpan[END] = mapping.normalizedSpan[END]; + lastMapping.rawSpan[END] = mapping.rawSpan[END]; + return acc; + } + acc.push(mapping); + return acc; + }, [] as SpanMapping[]); +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/TextNormalizer.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/TextNormalizer.test.ts new file mode 100644 index 000000000..029b1d7c4 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/TextNormalizer.test.ts @@ -0,0 +1,142 @@ +import { TextSpan } from '../../../types'; +import { TextNormalizer } from '../TextNormalizer'; + +describe('TextNormalizer', () => { + it('should do nothing with text that does not have any chars to normalize', () => { + const fieldText = 'This is a sample text content.'; + const expectedNormalizedText = fieldText; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.rawText).toEqual(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.normalizationMappings).toHaveLength(1); + + let spans: TextSpan[] = [ + [0, 10], // start from beginning + [3, 10], // start from one before space + [4, 10], // start from space + [5, 10], // start from one character after space + [10, 20], // end at one char before space + [10, 21], // end at space + [10, 22], // end at one char after space, + [10, fieldText.length] + ]; + for (const span of spans) { + expect(matcher.toNormalized(span)).toEqual(span); + expect(matcher.toRaw(span)).toEqual(span); + } + }); + + it('should normalize text with one long blank', () => { + const fieldText = 'This is a 
sample text content.'; + const expectedNormalizedText = 'This is a sample text content.'; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.rawText).toEqual(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.normalizationMappings).toHaveLength(3); + + // test begin + expect(matcher.toNormalized([0, 10])).toEqual([0, 7]); + expect(matcher.toNormalized([3, 10])).toEqual([3, 7]); // one before blank + expect(matcher.toNormalized([4, 10])).toEqual([4, 7]); + expect(matcher.toNormalized([5, 10])).toEqual([4, 7]); + expect(matcher.toNormalized([6, 10])).toEqual([5, 7]); + expect(matcher.toNormalized([7, 10])).toEqual([5, 7]); + expect(matcher.toNormalized([8, 10])).toEqual([5, 7]); + expect(matcher.toNormalized([9, 10])).toEqual([6, 7]); // one after blank + // test end + expect(matcher.toNormalized([0, 3])).toEqual([0, 3]); // one before blank + expect(matcher.toNormalized([0, 4])).toEqual([0, 4]); + expect(matcher.toNormalized([0, 5])).toEqual([0, 4]); + expect(matcher.toNormalized([0, 6])).toEqual([0, 5]); + expect(matcher.toNormalized([0, 7])).toEqual([0, 5]); + expect(matcher.toNormalized([0, 8])).toEqual([0, 5]); + expect(matcher.toNormalized([0, 9])).toEqual([0, 6]); // one after blank + // last + expect(matcher.toNormalized([20, fieldText.length])).toEqual([ + 17, + expectedNormalizedText.length + ]); + + // test begin + expect(matcher.toRaw([0, 7])).toEqual([0, 10]); + expect(matcher.toRaw([3, 7])).toEqual([3, 10]); // one before blank + expect(matcher.toRaw([4, 7])).toEqual([4, 10]); + expect(matcher.toRaw([5, 7])).toEqual([8, 10]); // one after blank + // test end + expect(matcher.toRaw([0, 3])).toEqual([0, 3]); // one before blank + expect(matcher.toRaw([0, 4])).toEqual([0, 4]); + expect(matcher.toRaw([0, 5])).toEqual([0, 8]); // one after blank + expect(matcher.toRaw([0, 6])).toEqual([0, 9]); // two after blank + // last + expect(matcher.toRaw([17, expectedNormalizedText.length])).toEqual([20, 
fieldText.length]); + }); + + it('should normalize text with multiple long blanks', () => { + const fieldText = 'This is a sample text content. '; + const expectedNormalizedText = 'This is a sample text content. '; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.toNormalized([9, 29] /* s a sample te */)).toEqual([6, 19]); + expect(matcher.toRaw([10, 16] /* sample */)).toEqual([17, 23]); + }); + + it('should normalize quotes', () => { + const fieldText = 'This is “double-quoted”. This is ‘single-quoted’.'; + const expectedNormalizedText = 'This is "double-quoted". This is \'single-quoted\'.'; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.toNormalized([9, 29])).toEqual([9, 29]); + expect(matcher.toRaw([10, 16])).toEqual([10, 16]); + for (let i = 0; i < fieldText.length; i += 1) { + expect(matcher.toNormalized([0, i + 1])).toEqual([0, i + 1]); + expect(matcher.toNormalized([i, fieldText.length])).toEqual([i, fieldText.length]); + expect(matcher.toRaw([0, i + 1])).toEqual([0, i + 1]); + expect(matcher.toRaw([i, fieldText.length])).toEqual([i, fieldText.length]); + } + }); + + it('should normalize surrogate pairs', () => { + const fieldText = 'This is emoji 😁.'; + const expectedNormalizedText = 'This is emoji _.'; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.toNormalized([14, 16])).toEqual([14, 15]); + expect(matcher.toRaw([14, 15])).toEqual([14, 16]); + }); + + it('should normalize diacritical marks', () => { + const fieldText = 'àáâãäåçèéêëìíîïñòóôõöùúûüýÿæœ'; + const expectedNormalizedText = 'aaaaaaceeeeiiiinooooouuuuyyæœ'; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + + const fieldText2 = fieldText.normalize('NFD'); // à: U+00E0 -> 
U+0061 U+0300 + expect(fieldText2.length).toBe(fieldText.length * 2 - 2 /* æœ are not changed */); + const matcher2 = new TextNormalizer(fieldText2); + expect(matcher2.normalizedText).toEqual(expectedNormalizedText); + }); + + describe('range conversion', () => { + it('should return mapped indices for negative indices and greater indices than text length', () => { + const matcher = new TextNormalizer('1234567890'); + expect(matcher.toNormalized([-10, 20])).toEqual([-10, 20]); + expect(matcher.toNormalized([20, 30])).toEqual([20, 30]); + expect(matcher.toRaw([-10, 20])).toEqual([-10, 20]); + expect(matcher.toRaw([20, 30])).toEqual([20, 30]); + }); + + it('should return mapped indices for negative indices and greater indices than normalized text length', () => { + const matcher = new TextNormalizer(' '); + expect(matcher.toNormalized([-10, 20])).toEqual([-10, 11]); + expect(matcher.toNormalized([20, 30])).toEqual([11, 21]); + expect(matcher.toRaw([-10, 20])).toEqual([-10, 29]); + expect(matcher.toRaw([20, 30])).toEqual([29, 39]); + }); + }); +}); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts new file mode 100644 index 000000000..f67fc87fd --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts @@ -0,0 +1,41 @@ +import { bboxGetSpanByRatio, bboxIntersects, isSideBySideOnLine } from '../bboxUtils'; + +describe('bboxIntersects', () => { + it('should return true when boxes intersect', () => { + expect(bboxIntersects([10, 10, 20, 20], [15, 15, 25, 25])).toBeTruthy(); + }); + + it("should return false when boxes don't intersect", () => { + expect(bboxIntersects([10, 10, 20, 20], [15, 25, 25, 35])).toBeFalsy(); + }); + + it('should 
return false when one box is on another', () => { + expect(bboxIntersects([10, 10, 20, 20], [20, 10, 30, 20])).toBeFalsy(); + expect(bboxIntersects([10, 10, 20, 20], [0, 10, 10, 20])).toBeFalsy(); + expect(bboxIntersects([10, 10, 20, 20], [10, 20, 20, 30])).toBeFalsy(); + expect(bboxIntersects([10, 10, 20, 20], [10, 0, 20, 10])).toBeFalsy(); + }); +}); + +describe('bboxGetSpanByRatio', () => { + it('should return proper bbox for spans on text', () => { + // text: '0123456789' -> highlight: '0123456789' + expect(bboxGetSpanByRatio([0, 0, 10, 2], 10, [0, 10])).toEqual([0, 0, 10, 2]); + // text: '0123456789' -> highlight: '23' + expect(bboxGetSpanByRatio([0, 0, 10, 2], 10, [2, 4])).toEqual([2, 0, 4, 2]); + // text: '012345' -> highlight: '23' + expect(bboxGetSpanByRatio([0, 0, 10, 2], 5, [2, 4])).toEqual([4, 0, 8, 2]); + }); +}); + +describe('isSideBySideOnLine', () => { + it('should return true for side-by-side boxes', () => { + expect(isSideBySideOnLine([0, 0, 5, 2], [5, 0, 10, 2])).toBeTruthy(); + }); + it('should return false when boxes are not vertically aligned', () => { + expect(isSideBySideOnLine([0, 0, 5, 2], [5, 1, 10, 3])).toBeFalsy(); + }); + it('should return false when two boxes are apart from each other', () => { + expect(isSideBySideOnLine([0, 0, 5, 2], [7, 0, 10, 2])).toBeFalsy(); + }); +}); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/findLargestIndex.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/findLargestIndex.test.ts new file mode 100644 index 000000000..3c4abf352 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/findLargestIndex.test.ts @@ -0,0 +1,46 @@ +import { findLargestIndex } from '../findLargestIndex'; + +describe('findLargestIndex', () => { + it('should find correct index', () => 
{ + expect(findLargestIndex(0, 100, index => (index <= 49 ? index : null))).toEqual({ + index: 49, + value: 49 + }); + expect(findLargestIndex(0, 100, index => (index <= 50 ? index : null))).toEqual({ + index: 50, + value: 50 + }); + expect(findLargestIndex(0, 100, index => (index <= 51 ? index : null))).toEqual({ + index: 51, + value: 51 + }); + }); + + it('should find correct index at the edge of the range', () => { + expect(findLargestIndex(0, 100, index => (index === 0 ? index : null))).toEqual({ + index: 0, + value: 0 + }); + expect(findLargestIndex(0, 100, index => (index <= 150 ? index : null))).toEqual({ + index: 99, + value: 99 + }); + }); + + it('should find correct index in a range of 1 width', () => { + expect(findLargestIndex(0, 1, _ => true)).toEqual({ + index: 0, + value: true + }); + }); + + it('should return null for empty ranges', () => { + expect(findLargestIndex(0, 0, _ => true)).toBeNull(); + }); + + it('should return null when no match in the range', () => { + expect(findLargestIndex(0, 100, _ => null)).toBeNull(); + expect(findLargestIndex(0, 100, index => (index <= -50 ? 
index : null))).toBeNull(); + expect(findLargestIndex(0, 1, _ => null)).toBeNull(); + }); +}); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/textSpanUtils.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/textSpanUtils.test.ts new file mode 100644 index 000000000..78c663ced --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/textSpanUtils.test.ts @@ -0,0 +1,135 @@ +import { TextSpan } from '../../../types'; +import { + spanCompare, + spanContains, + spanFromSubSpan, + spanGetSubSpan, + spanGetText, + spanIncludesIndex, + spanIntersection, + spanIntersects, + spanLen +} from '../textSpanUtils'; + +describe('spanGetText', () => { + it('should return valid span text', () => { + expect(spanGetText('0123456789', [3, 5])).toBe('34'); + expect(spanGetText('0123456789', [0, 10])).toBe('0123456789'); + }); + it('should return null for null text', () => { + expect(spanGetText(null, [3, 5])).toBe(null); + }); + it('should return empty text for empty or negative span', () => { + expect(spanGetText('0123456789', [0, 0])).toBe(''); + expect(spanGetText('0123456789', [5, 3])).toBe(''); + }); + it('should return text span for negative or large indices', () => { + expect(spanGetText('0123456789', [-10, 20])).toBe('0123456789'); + }); +}); + +describe('spanLen', () => { + it('should return span length', () => { + expect(spanLen([5, 10])).toBe(5); + expect(spanLen([10, 10])).toBe(0); + }); + it('should return zero for negative spans', () => { + expect(spanLen([10, 5])).toBe(0); + }); +}); + +describe('spanIntersects', () => { + it('should properly distinguish span intersection', () => { + expect(spanIntersects([10, 19], [20, 30])).toBeFalsy(); + expect(spanIntersects([10, 20], [20, 30])).toBeFalsy(); + expect(spanIntersects([10, 21], 
[20, 30])).toBeTruthy(); + expect(spanIntersects([29, 40], [20, 30])).toBeTruthy(); + expect(spanIntersects([30, 41], [20, 30])).toBeFalsy(); + expect(spanIntersects([31, 40], [20, 30])).toBeFalsy(); + + expect(spanIntersects([25, 26], [20, 30])).toBeTruthy(); + + expect(spanIntersects([20, 30], [10, 19])).toBeFalsy(); + expect(spanIntersects([20, 30], [10, 20])).toBeFalsy(); + expect(spanIntersects([20, 30], [10, 21])).toBeTruthy(); + expect(spanIntersects([20, 30], [29, 40])).toBeTruthy(); + expect(spanIntersects([20, 30], [30, 41])).toBeFalsy(); + expect(spanIntersects([20, 30], [31, 40])).toBeFalsy(); + }); +}); + +describe('spanIncludesIndex', () => { + it('should return true for indices inside a span', () => { + expect(spanIncludesIndex([10, 20], 10)).toBeTruthy(); + expect(spanIncludesIndex([10, 20], 15)).toBeTruthy(); + expect(spanIncludesIndex([10, 20], 19)).toBeTruthy(); + }); + it('should return false for indices outside a span', () => { + expect(spanIncludesIndex([10, 20], 9)).toBeFalsy(); + expect(spanIncludesIndex([10, 20], 20)).toBeFalsy(); + expect(spanIncludesIndex([10, 20], 21)).toBeFalsy(); + }); +}); + +describe('spanContains', () => { + it('should return true when a span contains other span', () => { + expect(spanContains([10, 20], [15, 18])).toBeTruthy(); + expect(spanContains([10, 20], [10, 18])).toBeTruthy(); + expect(spanContains([10, 20], [15, 20])).toBeTruthy(); + }); + it("should return true when a span doesn't contain other span", () => { + expect(spanContains([10, 20], [9, 10])).toBeFalsy(); + expect(spanContains([10, 20], [9, 18])).toBeFalsy(); + expect(spanContains([10, 20], [15, 21])).toBeFalsy(); + expect(spanContains([10, 20], [21, 30])).toBeFalsy(); + }); +}); + +describe('spanIntersection', () => { + it('should return span intersection', () => { + expect(spanIntersection([10, 20], [15, 18])).toEqual([15, 18]); + expect(spanIntersection([10, 20], [10, 18])).toEqual([10, 18]); + expect(spanIntersection([10, 20], [15, 
25])).toEqual([15, 20]); + }); + it('should return a span when the span is contained in another span', () => { + const a = [10, 20] as TextSpan; + expect(spanIntersection(a, [0, 30])).toBe(a); + expect(spanIntersection(a, [10, 21])).toBe(a); + expect(spanIntersection([0, 30], a)).toBe(a); + expect(spanIntersection([10, 20], a)).toBe(a); + }); +}); + +describe('spanFromSubSpan', () => { + it('should return a span that represents a sub-span (span in span) in a base span', () => { + expect(spanFromSubSpan([10, 20], [0, 5])).toEqual([10, 15]); + expect(spanFromSubSpan([10, 20], [5, 10])).toEqual([15, 20]); + expect(spanFromSubSpan([10, 20], [5, 20])).toEqual([15, 20]); + }); +}); + +describe('spanGetSubSpan', () => { + it('should return a span on a base span', () => { + expect(spanGetSubSpan([10, 20], [10, 15])).toEqual([0, 5]); + expect(spanGetSubSpan([10, 20], [15, 20])).toEqual([5, 10]); + }); + it('should return an empty span when given spans has no intersection', () => { + expect(spanLen(spanGetSubSpan([10, 20], [0, 5]))).toBe(0); + expect(spanLen(spanGetSubSpan([10, 20], [20, 25]))).toBe(0); + }); +}); + +describe('spanCompare', () => { + it('should return zero for same spans', () => { + expect(spanCompare([0, 0], [0, 0])).toBe(0); + expect(spanCompare([10, 20], [10, 20])).toBe(0); + }); + it('should return negative for spans before another', () => { + expect(spanCompare([10, 20], [11, 20]) < 0).toBeTruthy(); + expect(spanCompare([10, 20], [10, 21]) < 0).toBeTruthy(); + }); + it('should return positive for spans after another', () => { + expect(spanCompare([10, 20], [9, 20]) > 0).toBeTruthy(); + expect(spanCompare([10, 20], [10, 19]) > 0).toBeTruthy(); + }); +}); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts new file mode 100644 index 
000000000..2a3de4d31 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts @@ -0,0 +1,76 @@ +import { Bbox, TextSpan } from '../../types'; +import { spanIntersection, spanLen } from './textSpanUtils'; + +export const LEFT = 0; +export const TOP = 1; +export const RIGHT = 2; +export const BOTTOM = 3; + +/** + * Check whether two bbox intersect + * + * Same to `intersects` in DocumentPreview/utils/box.ts, + * but for type `Bbox`, which doesn't have page property + * @param boxA one bbox + * @param boxB another bbox + * @returns true iff boxA and boxB are overwrapped + */ +export function bboxIntersects(boxA: Bbox, boxB: Bbox) { + const [leftA, topA, rightA, bottomA] = boxA; + const [leftB, topB, rightB, bottomB] = boxB; + return !(leftB >= rightA || rightB <= leftA || topB >= bottomA || bottomB <= topA); +} + +/** + * Get bbox for a text span assuming each character takes horizontal spaces evenly + * @param bbox bbox occupied with a text + * @param origLength length of the text + * @returns bbox for the text + */ +export function bboxGetSpanByRatio(bbox: Bbox, origLength: number, span: TextSpan) { + const theSpan = spanIntersection([0, origLength], span); + if (origLength === 0 || spanLen(theSpan) <= 0) { + return [bbox[0], bbox[1], bbox[0], bbox[3]] as Bbox; + } + + const [spanStart, spanEnd] = span; + const [left, top, right, bottom] = bbox; + const width = right - left; + const resultLeft = left + (width / origLength) * spanStart; + const resultRight = left + (width / origLength) * spanEnd; + + return [resultLeft, top, resultRight, bottom] as Bbox; +} + +/** + * Check whether two bboxes seems to be side-by-side on a same line. 
+ * @param boxA + * @param boxB + * @returns + */ +export function isSideBySideOnLine(boxA: Bbox, boxB: Bbox) { + if (bboxIntersects(boxA, boxB)) { + return false; + } + + const [leftA, topA, rightA, bottomA] = boxA; + const [leftB, topB, rightB, bottomB] = boxB; + const heightA = bottomA - topA; + const heightB = bottomB - topB; + + // compare height ratio + const OVERWRAP_RATIO = 0.8; + if (!(heightA * OVERWRAP_RATIO < heightB || heightB * OVERWRAP_RATIO < heightA)) { + return false; + } + + const avgHeight = (heightA + heightB) / 2; + const overWrapHeight = Math.max(0, Math.min(bottomA, bottomB) - Math.max(topA, topB)); + if (overWrapHeight < avgHeight * OVERWRAP_RATIO) { + return false; + } + + // see if boxes can be neighborhoods + const verticalGap = Math.max(0, leftB - rightA, leftA - rightB); + return verticalGap < avgHeight; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/documentUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/documentUtils.ts new file mode 100644 index 000000000..655093698 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/documentUtils.ts @@ -0,0 +1,53 @@ +import { TextMappings } from 'components/DocumentPreview/types'; +import { getTextMappings } from 'components/DocumentPreview/utils/documentData'; +import { QueryResult } from 'ibm-watson/discovery/v2'; +import { processDoc, ProcessedDoc } from 'utils/document'; +import { Location } from 'utils/document/processDoc'; +import { DocumentFields, TextSpan } from '../../types'; + +export function getDocFieldValue( + document: DocumentFields, + field: string, + index?: number, + span?: Location | TextSpan +) { + let fieldText: string | undefined; + + const documentFieldArray = document[field]; + if (!Array.isArray(documentFieldArray) && !index) { + fieldText = 
documentFieldArray; + } else { + fieldText = documentFieldArray?.[index ?? 0]; + } + if (!fieldText || !span) { + return fieldText; + } + + if (Array.isArray(span)) { + return fieldText.substring(span[0], span[1]); + } else { + return fieldText.substring(span.begin, span.end); + } +} + +export type ExtractedDocumentInfo = { + processedDoc: ProcessedDoc; + textMappings?: TextMappings; +}; + +export async function extractDocumentInfo(document: QueryResult) { + const docHtml = document.html; + const textMappings = getTextMappings(document) ?? undefined; + + // HtmlView.tsx + const processedDoc = await processDoc( + { ...document, docHtml }, + { sections: true, bbox: true, bboxInnerText: true } + ); + + if (!processedDoc.bboxes) { + throw Error('Unexpected result from processDoc'); + } + + return { processedDoc, textMappings }; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/findLargestIndex.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/findLargestIndex.ts new file mode 100644 index 000000000..34a3f1a03 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/findLargestIndex.ts @@ -0,0 +1,34 @@ +/** + * Find the largest index that satisfies the matchFn and the value of matchFn then + * @param begin begin index of the range. inclusive + * @param end end index of the rage. exclusive + * @param matchFn + */ +export function findLargestIndex( + begin: number, + end: number, + matchFn: (index: number) => V | null, + splitMid?: boolean +): { index: number; value: V } | null { + if (end - begin < 1) return null; + + const midIndex = splitMid ? 
begin + Math.floor((end - begin) / 2) : end - 1; + const value = matchFn(midIndex); + if (!(value == null)) { + if (end - (midIndex + 1) > 0) { + const r = findLargestIndex(midIndex + 1, end, matchFn, true); + if (r) return r; + else return { index: midIndex, value }; + } else { + return { index: midIndex, value }; + } + } else { + if (midIndex - begin > 0) { + const r = findLargestIndex(begin, midIndex, matchFn, true); + if (r) return r; + else return null; + } else { + return null; + } + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts new file mode 100644 index 000000000..a511faa30 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts @@ -0,0 +1,3 @@ +export function nonEmpty(value: T | null | undefined): value is T { + return value !== null && value !== undefined; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts new file mode 100644 index 000000000..b0ae85939 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts @@ -0,0 +1,58 @@ +import { TextSpan } from '../../types'; + +export const START = 0; +export const END = 1; + +export function spanGetText(text: T, span: TextSpan) { + if (!text) return text; + if (spanLen(span) === 0) return ''; + return text.substring(span[START], span[END]); +} + +export function spanLen(span: TextSpan) { + return Math.max(0, span[END] - span[START]); +} + +export function spanIntersects([beginA, endA]: TextSpan, [beginB, 
endB]: TextSpan): boolean { + return beginA < endB && endA > beginB; +} + +export function spanIncludesIndex([begin, end]: TextSpan, index: number) { + return begin <= index && index < end; +} + +export function spanContains(span: TextSpan, other: TextSpan) { + return span[START] <= other[START] && other[END] <= span[END]; +} + +export function spanIntersection(a: TextSpan, b: TextSpan): TextSpan { + if (spanContains(a, b)) return b; + if (spanContains(b, a)) return a; + const start = Math.max(a[START], b[START]); + const end = Math.min(a[END], b[END]); + return [start, start <= end ? end : start]; +} + +export function spanUnion(a: TextSpan, b: TextSpan): TextSpan { + if (spanContains(a, b) || spanLen(b) === 0) return a; + if (spanContains(b, a) || spanLen(a) === 0) return b; + const start = Math.min(a[START], b[START]); + const end = Math.max(a[END], b[END]); + return [start, start <= end ? end : start]; +} + +export function spanOffset([start, end]: TextSpan, offset: number): TextSpan { + return [start + offset, end + offset]; +} + +export function spanFromSubSpan(base: TextSpan, subSpan: TextSpan) { + return spanIntersection(base, spanOffset(subSpan, base[START])); +} + +export function spanGetSubSpan(base: TextSpan, span: TextSpan) { + return spanOffset(spanIntersection(base, span), -base[START]); +} + +export function spanCompare([startA, endA]: TextSpan, [startB, endB]: TextSpan) { + return startA === startB ? 
endA - endB : startA - startB; +} From 66c7c56a59a517b88ddbf60539ca8e57553e6709 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Fri, 15 Oct 2021 15:49:38 +0900 Subject: [PATCH 12/51] feat: add option for bbox text to processDoc --- .../document/__tests__/processDoc.spec.tsx | 29 +++++++++++++++ .../src/utils/document/processDoc.ts | 36 +++++++++++++++---- 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/packages/discovery-react-components/src/utils/document/__tests__/processDoc.spec.tsx b/packages/discovery-react-components/src/utils/document/__tests__/processDoc.spec.tsx index ad894bfb8..15c424b87 100644 --- a/packages/discovery-react-components/src/utils/document/__tests__/processDoc.spec.tsx +++ b/packages/discovery-react-components/src/utils/document/__tests__/processDoc.spec.tsx @@ -219,3 +219,32 @@ describe('processDoc', () => { expect(doc.tables![2].bboxes[0]).toEqual(bboxData); }); }); + +describe('processDoc', () => { + let doc: ProcessedDoc; + + beforeAll(async () => { + // parse doc for use in tests + doc = await processDoc(contractData.results[0], { bbox: true, bboxInnerText: true }); + }); + + it('successfully picks up bboxes', () => { + expect(doc.bboxes).toHaveLength(1584); + }); + + it('successfully picks up bbox text source', () => { + expect(doc.bboxes).toHaveLength(1584); + + // + // On 22 December 2008 ART EFFECTS LIMITED and Customer entered into an Information Technology Procurement Framework Agreement ("the + // + expect(doc.bboxes[0].innerTextSource).toEqual( + 'On 22 December 2008 ART EFFECTS LIMITED and Customer entered into an Information Technology Procurement Framework Agreement ("the ' + ); + expect(doc.bboxes[0].innerTextLocation).toEqual({ begin: 2530, end: 2660 }); + + // <Enter Amendment Text> + expect(doc.bboxes[1490].innerTextSource).toEqual('<Enter Amendment Text> '); + expect(doc.bboxes[1490].innerTextLocation).toEqual({ begin: 442990, end: 443016 }); + }); +}); diff --git 
a/packages/discovery-react-components/src/utils/document/processDoc.ts b/packages/discovery-react-components/src/utils/document/processDoc.ts index a39b42879..aca49147d 100644 --- a/packages/discovery-react-components/src/utils/document/processDoc.ts +++ b/packages/discovery-react-components/src/utils/document/processDoc.ts @@ -28,6 +28,7 @@ interface Options { sections?: boolean; tables?: boolean; bbox?: boolean; + bboxInnerText?: boolean; itemMap?: boolean; } @@ -66,6 +67,8 @@ export interface ProcessedBbox { page: number; className: string; location: Location; + innerTextSource?: string; + innerTextLocation?: Location; } export interface Table { @@ -130,7 +133,7 @@ export async function processDoc( const parser = new SaxParser(); // setup initial parsing handling - setupDocParser(parser, doc); + setupDocParser(parser, doc, options); const htmlContent = Array.isArray(html) ? html[0] : html; @@ -145,7 +148,7 @@ export async function processDoc( return doc; } -function setupDocParser(parser: SaxParser, doc: ProcessedDoc): void { +function setupDocParser(parser: SaxParser, doc: ProcessedDoc, options: Options): void { parser.pushState({ onopentag: (_: Parser, tagName: string): void => { /* eslint-disable-next-line default-case */ @@ -155,7 +158,7 @@ function setupDocParser(parser: SaxParser, doc: ProcessedDoc): void { break; } case 'body': { - setupBodyParser(parser, doc); + setupBodyParser(parser, doc, options); break; } } @@ -189,11 +192,11 @@ function setupStyleParser(parser: SaxParser, doc: ProcessedDoc): void { }); } -function setupBodyParser(parser: SaxParser, doc: ProcessedDoc): void { +function setupBodyParser(parser: SaxParser, doc: ProcessedDoc, options: Options): void { parser.pushState({ onopentag: (p: Parser, tagName: string, attributes: Attributes): void => { if (SECTION_NAMES.includes(tagName)) { - setupSectionParser(parser, doc, tagName, attributes, p.startIndex, p); + setupSectionParser(parser, doc, tagName, attributes, p.startIndex, p, options); } } 
}); @@ -205,7 +208,8 @@ function setupSectionParser( sectionTagName: string, sectionTagAttrs: Attributes, sectionStartIndex: number, - sectionParser: Parser + sectionParser: Parser, + options: Options ): void { let lastClassName = ''; let currentTable: Table | null = null; @@ -283,6 +287,13 @@ function setupSectionParser( if (doc.bboxes) { doc.bboxes.push(currentBbox); } + if (options.bboxInnerText) { + currentBbox.innerTextSource = ''; + currentBbox.innerTextLocation = { + begin: p.endIndex != null ? p.endIndex + 1 : -1, + end: -1 + }; + } if (currentTable && doc.tables) { currentTable.bboxes.push(currentBbox); } @@ -309,6 +320,10 @@ function setupSectionParser( ); } + if (currentBbox && options.bboxInnerText) { + currentBbox.innerTextSource += text; + } + sectionHtml.push(text); }, @@ -335,6 +350,15 @@ function setupSectionParser( if (doc.bboxes && tagName === BBOX_TAG && currentBbox) { currentBbox.location.end = getChildEndFromCloseTag(p); + + if (options.bboxInnerText && currentBbox.innerTextLocation) { + currentBbox.innerTextLocation.end = getChildEndFromCloseTag(p); + if (currentBbox.innerTextLocation.begin < 0 && currentBbox.innerTextSource != null) { + currentBbox.innerTextLocation.begin = + currentBbox.innerTextLocation.end - currentBbox.innerTextSource.length; + } + } + currentBbox = null; } From 47855f400fc3211e487d630ab720d32e35121b7b Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 10 Nov 2021 15:55:10 +0900 Subject: [PATCH 13/51] feat: add text layer classes --- .../utils/textLayout/BaseTextLayout.ts | 75 +++++++++++++++ .../utils/textLayout/HtmlBboxTextLayout.ts | 56 +++++++++++ .../textLayout/PdfTextContentTextLayout.ts | 94 +++++++++++++++++++ .../textLayout/TextMappingsTextLayout.ts | 67 +++++++++++++ .../utils/textLayout/dom.ts | 67 +++++++++++++ .../utils/textLayout/index.ts | 3 + .../utils/textLayout/types.ts | 67 +++++++++++++ 7 files changed, 429 insertions(+) create mode 100644
packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/index.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts new file mode 100644 index 000000000..77e043328 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts @@ -0,0 +1,75 @@ +import { Bbox, TextSpan } from '../../types'; +import { TextLayout, TextLayoutCell, TextLayoutCellBase } from './types'; +import { spanGetText, spanIntersection, spanOffset, START } from '../common/textSpanUtils'; +import { bboxGetSpanByRatio } from '../common/bboxUtils'; + +/** + * Base implementation of text layout cell + */ +export class BaseTextLayoutCell> + implements TextLayoutCell +{ + readonly parent: Layout; + readonly id: number; + readonly pageNum: number; 
+ readonly bbox: Bbox; + readonly text: string; + + constructor({ + parent, + id, + pageNum, + bbox, + text + }: { + parent: Layout; + id: number; + pageNum: number; + bbox: Bbox; + text: string; + }) { + this.parent = parent; + this.id = id; + this.pageNum = pageNum; + this.bbox = bbox; + this.text = text; + } + + getPartial(span: TextSpan): TextLayoutCellBase { + return new PartialTextLayoutCell(this, span); + } + getNormalized(): { cell: TextLayoutCell; span?: TextSpan } { + return { cell: this }; + } + getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { + if (options?.useRatio) { + return bboxGetSpanByRatio(this.bbox, this.text.length, span); + } + return null; + } +} + +/** + * Text span on a base text layout cell + */ +export class PartialTextLayoutCell implements TextLayoutCellBase { + readonly base: TextLayoutCell; + readonly span: TextSpan; + + constructor(base: TextLayoutCell, span: TextSpan) { + this.base = base; + this.span = spanIntersection([0, base.text.length], span); + } + + get text() { + return spanGetText(this.base.text, this.span); + } + + getPartial(span: TextSpan): TextLayoutCellBase { + const newSpan = spanIntersection(this.span, spanOffset(span, this.span[START])); + return new PartialTextLayoutCell(this.base, newSpan); + } + getNormalized() { + return { cell: this.base, span: this.span }; + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts new file mode 100644 index 000000000..851ab6991 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts @@ -0,0 +1,56 @@ +import { decodeHTML } from 'entities'; +import { ProcessedBbox } from 'utils/document'; +import { Bbox, TextSpan } 
from '../../types'; +import { BaseTextLayoutCell } from './BaseTextLayout'; +import { HtmlBboxInfo, TextLayout } from './types'; + +export class HtmlBboxTextLayout implements TextLayout { + private readonly bboxInfo: HtmlBboxInfo; + readonly cells: HtmlBboxTextLayoutCell[]; + + constructor(bboxInfo: HtmlBboxInfo, pageNum: number) { + this.bboxInfo = bboxInfo; + this.cells = + bboxInfo.bboxes + ?.filter(bbox => bbox.page === pageNum) + .map((bbox, index) => { + return new HtmlBboxTextLayoutCell(this, index, bbox); + }) ?? []; + } + + cellAt(id: number) { + return this.cells[id]; + } + + installStyle() { + if (this.bboxInfo.styles) { + // TODO: install style to DOM if not yet. For getBboxForTextSpan in cell + } + } +} + +class HtmlBboxTextLayoutCell extends BaseTextLayoutCell { + private readonly processedBbox: ProcessedBbox; + + constructor(parent: HtmlBboxTextLayout, index: number, processedBbox: ProcessedBbox) { + const id = index; + const pageNum = processedBbox.page; + const bbox: Bbox = [ + processedBbox.left, + processedBbox.top, + processedBbox.right, + processedBbox.bottom + ]; + const text = decodeHTML(processedBbox.innerTextSource ?? 
''); + super({ parent, id, pageNum, bbox, text }); + + this.processedBbox = processedBbox; // keep this for later improvement + } + + getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { + if (this.processedBbox != null) { + // TODO: calculate bbox for text span using text on browser + } + return super.getBboxForTextSpan(span, options); + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts new file mode 100644 index 000000000..9a5c09e71 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts @@ -0,0 +1,94 @@ +import { PDFPageViewport, PDFPageViewportOptions, TextContentItem } from 'pdfjs-dist'; +import { Bbox, TextSpan } from '../../types'; +import { bboxIntersects } from '../common/bboxUtils'; +import { BaseTextLayoutCell } from './BaseTextLayout'; +import { getAdjustedCellByOffsetByDom } from './dom'; +import { HtmlBboxInfo, PdfTextContentInfo, TextLayout } from './types'; + +export class PdfTextContentTextLayout implements TextLayout { + private readonly textContentInfo: PdfTextContentInfo; + readonly cells: PdfTextContentTextLayoutCell[]; + private spans: HTMLElement[] | undefined; + + constructor(textContentInfo: PdfTextContentInfo, pageNum: number, htmlBboxInfo?: HtmlBboxInfo) { + this.textContentInfo = textContentInfo; + + const textContentItems = textContentInfo.textContent.items; + + this.cells = textContentItems + .map((item, index) => { + return new PdfTextContentTextLayoutCell(this, index, item, pageNum); + }) + .filter(cell => { + if (htmlBboxInfo?.bboxes?.length) { + return htmlBboxInfo.bboxes.some(bbox => { + return bboxIntersects(cell.bbox, [bbox.left, bbox.top, 
bbox.right, bbox.bottom]); + }); + } + return true; + }); + } + + get viewport() { + return this.textContentInfo.viewport; + } + + cellAt(id: number) { + return this.cells[id]; + } + + setSpans(spans: HTMLElement[] | undefined) { + this.spans = spans; + } + spanAt(id: number) { + return this.spans?.[id]; + } +} + +class PdfTextContentTextLayoutCell extends BaseTextLayoutCell { + // private readonly textItem: TextContentItem; + + constructor( + parent: PdfTextContentTextLayout, + index: number, + textItem: TextContentItem, + pageNum: number + ) { + const id = index; + const bbox = PdfTextContentTextLayoutCell.getBbox(textItem, parent.viewport); + const text = textItem.str; + super({ parent, id, pageNum, bbox, text }); + + // this.textItem = textItem; + } + + getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { + const spanElement = this.parent.spanAt(this.id); + if (spanElement && spanElement.parentNode) { + const scale = this.parent.viewport.scale; + const bbox = getAdjustedCellByOffsetByDom(this, span, spanElement, scale); + if (bbox) { + return bbox; + } + } + return super.getBboxForTextSpan(span, options); + } + + static getBbox(textItem: TextContentItem, viewport: PDFPageViewport): Bbox { + const { transform } = textItem; + + const patchedViewport = viewport as PDFPageViewportOptions & PDFPageViewport; + const defaultSideways = patchedViewport.rotation % 180 !== 0; + + // not sure this is true... + const [fontHeightPx, , offsetX, offsetY, x, y] = transform; + + const [xMin, yMin, , yMax] = patchedViewport.viewBox; + const top = defaultSideways ? x + offsetX + yMin : yMax - (y + offsetY); + const left = defaultSideways ? 
y - xMin : x - xMin; + const bottom = top + fontHeightPx; + const adjustHeight = fontHeightPx * 0.2; + + return [left, top + adjustHeight, left + textItem.width, bottom + adjustHeight]; + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts new file mode 100644 index 000000000..f3f51bdb6 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts @@ -0,0 +1,67 @@ +import { Cell, CellField } from 'components/DocumentPreview/types'; +import { DocumentFields, DocumentFieldHighlight, TextSpan } from '../../types'; +import { getDocFieldValue } from '../common/documentUtils'; +import { TextBoxMappingResult } from '../textBoxMapping/types'; +import { + spanGetSubSpan, + spanContains, + spanIntersection, + spanIntersects +} from '../common/textSpanUtils'; +import { BaseTextLayoutCell } from './BaseTextLayout'; +import { TextLayout, TextMappingInfo } from './types'; + +export class TextMappingsTextLayout implements TextLayout { + readonly cells: TextMappingsTextLayoutCell[]; + + constructor(textMappingInfo: TextMappingInfo, pageNum: number) { + const { textMappings, document } = textMappingInfo; + + this.cells = textMappings.text_mappings + .filter(cell => cell.page.page_number === pageNum) + .map((cell, index) => { + return new TextMappingsTextLayoutCell(this, index, document, cell); + }); + } + + cellAt(id: number) { + return this.cells[id]; + } + + getHighlight(highlight: DocumentFieldHighlight): TextBoxMappingResult { + const highlightSpan: TextSpan = [highlight.location.begin, highlight.location.end]; + const highlightCells = this.cells + .filter(cell => { + const { cellField } = cell; + return ( + cellField.name === 
highlight.field && + cellField.index === highlight.fieldIndex && + spanIntersects(cellField.span, highlightSpan) + ); + }) + .map(cell => { + const { cellField } = cell; + const currentSpan = spanIntersection(cellField.span, highlightSpan); + if (spanContains(highlightSpan, cellField.span)) { + return { cell, sourceSpan: currentSpan }; + } + const subSpan = spanGetSubSpan(cellField.span, currentSpan); + return { cell: cell.getPartial(subSpan), sourceSpan: currentSpan }; + }); + return highlightCells; + } +} + +class TextMappingsTextLayoutCell extends BaseTextLayoutCell { + readonly cellField: CellField; + + constructor(parent: TextMappingsTextLayout, index: number, document: DocumentFields, cell: Cell) { + const id = index; + const pageNum = cell.page.page_number; + const bbox = cell.page.bbox; + const text = + getDocFieldValue(document, cell.field.name, cell.field.index, cell.field.span) ?? ''; + super({ parent, id, pageNum, bbox, text }); + this.cellField = cell.field; + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts new file mode 100644 index 000000000..d2cfae28d --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts @@ -0,0 +1,67 @@ +import { getTextNodeAndOffset, uniqRects } from 'utils/document/documentUtils'; +import { Bbox, TextSpan } from '../../types'; +import { BOTTOM, LEFT, RIGHT, TOP } from '../common/bboxUtils'; +import { END, START } from '../common/textSpanUtils'; +import { TextLayoutCell } from './types'; + +const debugOut = require('debug')?.('pdf:textLayout:dom'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +export function getAdjustedCellByOffsetByDom( + cell: TextLayoutCell, + textSpan: TextSpan, + spanElement: HTMLElement, + 
scale: number +): Bbox | null { + if (!(spanElement.firstChild instanceof Text) || !(spanElement.lastChild instanceof Text)) { + debug('unexpected. span dont have text node'); + return null; + } + + const beginOffset = textSpan[START]; + const endOffset = Math.min(cell.text.length, textSpan[END]); + + let left = cell.bbox[LEFT]; + let right = cell.bbox[RIGHT]; + const top = cell.bbox[TOP]; + const bottom = cell.bbox[BOTTOM]; + + // convert offset + function getAdjustedOffset(orgOffset: number) { + return orgOffset; + } + try { + const { textNode: beginTextNode, textOffset: beginTextOffset } = + beginOffset > 0 + ? getTextNodeAndOffset(spanElement, getAdjustedOffset(beginOffset)) + : { textNode: spanElement.firstChild, textOffset: 0 }; + const { textNode: endTextNode, textOffset: endTextOffset } = + endOffset > 0 + ? getTextNodeAndOffset(spanElement, getAdjustedOffset(endOffset)) + : { textNode: spanElement.lastChild, textOffset: spanElement.lastChild.length }; + + debug('finding text node for: ', cell.text); + debug(' textContent: ', beginTextNode.textContent); + debug(' beginOffset: ', beginTextOffset); + debug(' textContent: ', endTextNode.textContent); + debug(' endOffset: ', endTextOffset); + + const range = document.createRange(); + range.setStart(beginTextNode, Math.min(beginTextOffset, beginTextNode.length)); + range.setEnd(endTextNode, Math.min(endTextOffset, endTextNode.length)); + + // create highlight rect(s) inside of a field + const parentRect = spanElement.parentElement?.getBoundingClientRect(); + Array.prototype.forEach.call(uniqRects(range.getClientRects() as DOMRectList), rect => { + left = (rect.left - parentRect!.left) / scale; + right = left + rect.width / scale; + }); + + return [left, top, right, bottom]; + } catch (e) { + debug('Caught exception on calculating bbox from DOM: ', e); + } + return null; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/index.ts 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/index.ts new file mode 100644 index 000000000..cfef239cf --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/index.ts @@ -0,0 +1,3 @@ +export { HtmlBboxTextLayout } from './HtmlBboxTextLayout'; +export { PdfTextContentTextLayout } from './PdfTextContentTextLayout'; +export { TextMappingsTextLayout } from './TextMappingsTextLayout'; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts new file mode 100644 index 000000000..7ed59012d --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts @@ -0,0 +1,67 @@ +import { TextMappings } from 'components/DocumentPreview/types'; +import { PDFPageViewport, TextContent } from 'pdfjs-dist'; +import { ProcessedDoc } from 'utils/document'; +import { Bbox, DocumentFields, TextSpan } from '../../types'; + +/** + * Text layout information + */ +export interface TextLayout { + /** cells, paris of bbox and text, of this text layout */ + readonly cells: CellType[]; + /** get cell by ID */ + cellAt(id: CellType['id']): CellType; +} + +/** + * Text layout cell. A text and its bbox. + */ +export interface TextLayoutCell extends TextLayoutCellBase { + readonly parent: TextLayout; + /** ID to identify this cell in */ + readonly id: IDType; + /** text of this cell */ + readonly text: string; + readonly pageNum: number; + readonly bbox: Bbox; + + /** + * get bbox for the given text span. + * @returns null when it's not available + */ + getBboxForTextSpan(span: TextSpan, options?: { useRatio?: boolean }): Bbox | null; +} + +/** + * Generic text layout cell. 
Bbox may not be directly available. + * Mainly for sub-string of a text layout cell. + */ +export interface TextLayoutCellBase { + /** text of this cell */ + readonly text: string; + /** get sub-span of this text layout */ + getPartial(span: TextSpan): TextLayoutCellBase; + /** get normalized form, the base text layout cell and a span on it */ + getNormalized(): { cell: TextLayoutCell; span?: TextSpan }; +} + +/** + * Information to create HtmlBboxTextLayout + */ +export type HtmlBboxInfo = Pick; + +/** + * Information to create PdfTextContentTextLayout + */ +export type PdfTextContentInfo = { + textContent: TextContent; + viewport: PDFPageViewport; +}; + +/** + * Information to create TextMappingsTextLayout + */ +export type TextMappingInfo = { + document: DocumentFields; + textMappings: TextMappings; +}; From 66b3e43399bd7261b4e3fe3a77229dbd08c7ce88 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 10 Nov 2021 16:06:25 +0900 Subject: [PATCH 14/51] feat: add highlighting logic and README --- .../PdfViewerHighlight/utils/Highlighter.ts | 158 ++++++++++++++++++ .../PdfViewerHighlight/utils/README.md | 37 ++++ .../utils/textBoxMapping/CellProvider.ts | 96 +++++++++++ .../MappingSourceTextProvider.ts | 61 +++++++ .../MappingTargetCellProvider.ts | 57 +++++++ .../utils/textBoxMapping/TextBoxMapping.ts | 87 ++++++++++ .../utils/textBoxMapping/TextProvider.ts | 92 ++++++++++ .../__tests__/TextProvider.test.ts | 56 +++++++ .../utils/textBoxMapping/getTextBoxMapping.ts | 133 +++++++++++++++ .../utils/textBoxMapping/index.ts | 1 + .../utils/textBoxMapping/types.ts | 16 ++ 11 files changed, 794 insertions(+) create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md create mode 100644 
packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/__tests__/TextProvider.test.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/index.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts new file mode 100644 index 000000000..8216cd532 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts @@ -0,0 +1,158 @@ +import { TextMappings } from 'components/DocumentPreview/types'; +import flatMap from 'lodash/flatMap'; +import { PDFPageViewport, TextContent } from 
'pdfjs-dist'; +import { + DocumentFields, + DocumentFieldHighlight, + HighlightShape, + HighlightShapeBox +} from '../types'; +import { getTextBoxMappings } from './textBoxMapping'; +import { TextBoxMapping, TextBoxMappingResult } from './textBoxMapping/types'; +import { HtmlBboxTextLayout, PdfTextContentTextLayout, TextMappingsTextLayout } from './textLayout'; +import { HtmlBboxInfo } from './textLayout/types'; +import { spanOffset, START } from './common/textSpanUtils'; +import { nonEmpty } from './common/nonEmpty'; + +const debugOut = require('debug')?.('pdf:Highlighter'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +export class Highlighter { + readonly pageNum: number; + private readonly textMappingsLayout: TextMappingsTextLayout; + private pdfTextContentLayout: PdfTextContentTextLayout | null = null; + private textToHtmlBboxMappings: TextBoxMapping | null = null; + private textToPdfTextItemMappings: TextBoxMapping | null = null; + + constructor({ + document, + textMappings, + pageNum, + htmlBboxInfo, + pdfTextContentInfo + }: { + document: DocumentFields; + textMappings: TextMappings; + pageNum: number; + htmlBboxInfo?: HtmlBboxInfo; + pdfTextContentInfo?: { + textContent: TextContent; + viewport: PDFPageViewport; + spans?: HTMLElement[]; + }; + }) { + this.pageNum = pageNum; + this.textMappingsLayout = new TextMappingsTextLayout({ document, textMappings }, pageNum); + if (htmlBboxInfo) { + this.setProcessedDoc(htmlBboxInfo); + } + if (pdfTextContentInfo) { + this.setTextContentItems( + pdfTextContentInfo.textContent, + pdfTextContentInfo.viewport, + pdfTextContentInfo.spans, + htmlBboxInfo + ); + } + } + + setProcessedDoc(htmlBoxInfo: HtmlBboxInfo) { + const htmlLayout = new HtmlBboxTextLayout(htmlBoxInfo, this.pageNum); + this.textToHtmlBboxMappings = getTextBoxMappings(this.textMappingsLayout, htmlLayout); + } + + setTextContentItems( + textContent: TextContent, + viewport: PDFPageViewport, + spans?: HTMLElement[], + htmlBoxInfo?:
HtmlBboxInfo + ) { + this.pdfTextContentLayout = new PdfTextContentTextLayout( + { textContent, viewport }, + this.pageNum, + htmlBoxInfo + ); + this.textToPdfTextItemMappings = getTextBoxMappings( + this.textMappingsLayout, + this.pdfTextContentLayout + ); + this.setTextContentDivs(spans); + } + + setTextContentDivs(spans?: HTMLElement[]) { + this.pdfTextContentLayout?.setSpans(spans); + } + + getHighlightTextMappingResult(highlight: DocumentFieldHighlight): TextBoxMappingResult { + let items = this.textMappingsLayout.getHighlight(highlight); + + const doMapping = (items: TextBoxMappingResult, textBoxMapping: TextBoxMapping, parent: any) => + flatMap(items, item => { + if (item.cell) { + const { cell: baseCell } = item.cell.getNormalized(); + if (baseCell.parent === parent) { + const newItems = textBoxMapping.apply(item.cell); + return newItems.map(({ cell, sourceSpan }) => { + return { + cell, + sourceSpan: spanOffset(sourceSpan, item.sourceSpan[START]) + }; + }); + } + return item; + } + return []; + }); + + const { textToPdfTextItemMappings, textToHtmlBboxMappings } = this; + if (textToPdfTextItemMappings) { + items = doMapping(items, textToPdfTextItemMappings, this.textMappingsLayout); + } + if (textToHtmlBboxMappings) { + items = doMapping(items, textToHtmlBboxMappings, this.textMappingsLayout); + } + return items; + } + + getHighlight( + highlight: T + ): HighlightShape & Omit { + debug('getHighlight: %o', highlight); + const { field, fieldIndex, location, className, ...rest } = highlight; + const items = this.getHighlightTextMappingResult({ field, fieldIndex, location }); + debug('getHighlight - items: %o', items); + + const boxShapes: HighlightShapeBox[] = items + .map((item, index) => { + const { cell: baseCell, span: baseSpan } = item.cell?.getNormalized() || {}; + if (baseCell) { + let bbox = baseCell.bbox; + if (baseSpan) { + bbox = + baseCell.getBboxForTextSpan(baseSpan) || + baseCell.getBboxForTextSpan(baseSpan, { useRatio: true }) || +
baseCell.bbox; + } + debug('getHighlight - cell(%i): %o', index, item.cell); + debug(' box: %o', bbox); + return { + bbox, + isStart: index === 0, + isEnd: index === items.length - 1 + }; + } else { + debug('getHighlight - cell(%i) missing. source span: %o', index, item.sourceSpan); + } + // drop something!! + return null; + }) + .filter(nonEmpty); + return { + boxes: boxShapes, + className, + ...rest + }; + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md new file mode 100644 index 000000000..94a34ae74 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md @@ -0,0 +1,37 @@ +## How highlighting works + +### TextLayout + +`TextLayout` shows that what text is placed where in a page. `TextLayout` has multiple `TextLayoutCells`, which shows a particular text is rendered in a particular boundary box. + +So, `metadata.text_mappings` is a kind of `TextLayout` because it bounds text to a boundary box. `bbox`es stored in `html` field can be `TextLayout`s. Also, text content items from PDF (i.e. PDF programmatic text) can be `TextLayout`s. + +Each type of text layout has each granularity, text length and the size of boundary box in a `TextLayoutCell` are different. For example, a cell from `text_mappings` typically has longer text (sometimes it's a paragraph) and large boundary box. A cell from PDF text content item has shorter text (say it's word or short phrase) and small boundary box. + +For highlighting, smaller boundary boxes allow more accurate highlight location. + +### Find smaller text layout cell using `TextBoxMappings` + +So, we build mappings from larger cells to smaller cells. More detail, map a span on a text of a large cell to a span on a text of a smaller cell.
+ +We typically start with cells from `text_mappings` because we can find a cell and a span on it from a span on a field. Then we can use the mappings to find smaller cells, which are typically from PDF text content items. + +However, calculation of the mapping is not straightforward. Cells can overlap, and the order of smaller cells may not be the same as the order of the text in a larger cell. So, `getTextBoxMappings` and its helpers `TextNormalizer`, `TextProvider`, `CellProvider` are for calculating the best mapping even in such situations. + +### Text layout cell to boundary box + +`TextLayout` shows what text is placed where in a page. `TextLayout` has multiple `TextLayoutCell`s, each of which shows that a particular text is rendered in a particular boundary box. + +So, `metadata.text_mappings` is a kind of `TextLayout` because it binds text to a boundary box. `bbox`es stored in the `html` field can be `TextLayout`s. Also, text content items from PDF (i.e. PDF programmatic text) can be `TextLayout`s. + +Each type of text layout has its own granularity: the text length and the size of the boundary box in a `TextLayoutCell` differ. For example, a cell from `text_mappings` typically has longer text (sometimes it's a paragraph) and a large boundary box. A cell from a PDF text content item has shorter text (say, a word or short phrase) and a small boundary box. + +For highlighting, smaller boundary boxes allow more accurate highlight location. + +Even with a small cell, the text to highlight may be a span on the cell text. In that case, we have to calculate the boundary box for the text. By default, cells approximate the boundary box by assigning width evenly to every character in the cell text. + +Some `TextLayoutCell`s have the capability of calculating the boundary box for a sub-span of their text. For example, cells for PDF text items, `PdfTextContentTextLayoutCell`, can calculate boundary boxes for given text spans. It internally uses DOM and DOM's `getBoundingClientRect` to get the result.
+ +### Highlighter + +`Highlighter` manages available information about a document and a page, and calculate boundary boxes for given spans on fields. diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts new file mode 100644 index 000000000..331277f79 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts @@ -0,0 +1,96 @@ +import { isSideBySideOnLine } from '../common/bboxUtils'; +import { TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; + +export class CellProvider { + private readonly skippedCells: TextLayoutCellBase[] = []; + private cells: TextLayoutCellBase[]; // make sure to handle this as immutable array + private cursor: number = 0; + + constructor(cells: TextLayoutCellBase[]) { + this.cells = [...cells]; + } + + hasNext() { + while (this.cursor < this.cells.length) { + const cell = this.cells[this.cursor]; + if (cell.text.trim().length !== 0) { + break; + } + this.skip(); + } + return this.cursor < this.cells.length; + } + + /** get cells on a line */ + private getNextCells = (() => { + let lastCells: TextLayoutCellBase[] | null = null; + let lastCursor: number | null = null; + let lastResult: TextLayoutCellBase[] | null = null; + + return () => { + if (lastResult && lastCells === this.cells && lastCursor === this.cursor) { + return lastResult; + } + + const result: TextLayoutCellBase[] = []; + let lastCell: TextLayoutCell | null = null; + for (let i = this.cursor; i < this.cells.length; i += 1) { + const currentBox = this.cells[i]; + // maybe we need to break this loop by big box change + const { cell: baseCurrentCell } = currentBox.getNormalized(); + if (lastCell && !isSideBySideOnLine(lastCell.bbox,
baseCurrentCell.bbox)) { + break; + } + result.push(currentBox); + lastCell = baseCurrentCell; + } + lastCells = this.cells; + lastCursor = this.cursor; + lastResult = result; + + return result; + }; + })(); + + /** get text from cells on a line */ + getNextText() { + const nextCells = this.getNextCells(); + const texts = nextCells.map(cell => cell.text); + return { texts, nextCellIndex: this.cursor }; + } + + /** consume first n chars */ + consume(length: number): TextLayoutCellBase[] { + const result: TextLayoutCellBase[] = []; + + let lengthToConsume = length; + while (lengthToConsume > 0 && this.cursor < this.cells.length) { + const current = this.cells[this.cursor]; + const bboxTextLength = current.text.length; + + if (lengthToConsume < bboxTextLength) { + // in this case, split bbox and consume matched part + // add prefix to the result + const consumed = current.getPartial([0, lengthToConsume]); + result.push(consumed); + + const remaining = current.getPartial([lengthToConsume, bboxTextLength]); + const newCells = [...this.cells]; + newCells[this.cursor] = remaining; + this.cells = newCells; + break; + } + + result.push(current); + lengthToConsume -= bboxTextLength; + this.cursor += 1; + } + return result; + } + + /** skip the current cell */ + skip() { + this.skippedCells.push(this.cells[this.cursor]); + this.cursor += 1; + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts new file mode 100644 index 000000000..9c1b12384 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts @@ -0,0 +1,61 @@ +import { TextSpan } from '../../types'; +import { TextProvider } from './TextProvider'; +import { TextNormalizer } from
'../common/TextNormalizer'; +import minBy from 'lodash/minBy'; +import { spanGetText, spanLen, START } from '../common/textSpanUtils'; +import { TextLayoutCell } from '../textLayout/types'; + +const debugOut = require('debug')?.('pdf:mapping:MappingSourceTextProvider'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +export class MappingSourceTextProvider { + private readonly cell: TextLayoutCell; + private readonly normalizer: TextNormalizer; + private readonly provider: TextProvider; + + constructor(cell: TextLayoutCell) { + this.cell = cell; + this.normalizer = new TextNormalizer(cell.text); + this.provider = new TextProvider(this.normalizer.normalizedText); + } + + getMatch(text: string) { + const normalizedText = this.normalizer.normalize(text); + debug('getMatch "%s", normalized "%s"', text, normalizedText); + const normalizedMatches = this.provider.getMatches(normalizedText); + debug('normalized matches: %o', normalizedMatches); + + // find best + const normalizedResult = minBy(normalizedMatches, m => m.minHistoryDistance); + if (!normalizedResult) { + debug('getMatch result: null'); + return null; + } + + const rawMatchedSpan = this.normalizer.toRaw(normalizedResult.span); + const rawSkipTextSpan = this.normalizer.toRaw([ + normalizedResult.span[START] - normalizedResult.skipText.length, + normalizedResult.span[START] + ]); + const r = { + span: rawMatchedSpan, + skipText: spanGetText(this.cell.text, rawSkipTextSpan), + score: spanLen(rawMatchedSpan) - normalizedResult.minHistoryDistance, + approxLenAfterEnd: normalizedResult.textAfterEnd.length + }; + debug('getMatch result: %o', r); + return r; + } + + consume(span: TextSpan) { + const normalizedSpan = this.normalizer.toNormalized(span); + this.provider.consume(normalizedSpan); + debug('text span consumed %o', span); + } + + isBlank(text: string) { + return this.normalizer.isBlank(text); + } +} diff --git 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts new file mode 100644 index 000000000..72990c7e2 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts @@ -0,0 +1,57 @@ +import { TextLayoutCellBase } from '../textLayout/types'; +import { TextNormalizer } from '../common/TextNormalizer'; +import { CellProvider } from './CellProvider'; +import { END } from '../common/textSpanUtils'; + +export class MappingTargetBoxProvider { + private readonly cellProvider: CellProvider; + private current: { + nextCellIndex: number; + normalizer: TextNormalizer; + leadingSpaces: number; + } | null = null; + + constructor(cells: TextLayoutCellBase[]) { + this.cellProvider = new CellProvider(cells); + } + + hasNext() { + while (this.cellProvider.hasNext()) { + const { texts, nextCellIndex } = this.cellProvider.getNextText(); + const text = texts.join(''); + const leadingSpaces = text.match(/^\s*/)?.[0].length ?? 
0; + const trimmedText = text.substring(leadingSpaces); + if (trimmedText.length > 0) { + const normalizer = new TextNormalizer(trimmedText); + this.current = { + nextCellIndex, + normalizer, + leadingSpaces + }; + return true; + } + this.cellProvider.skip(); // skip blank only + } + this.current = null; + return false; + } + + getNextInfo() { + return { + text: this.current!.normalizer.normalizedText, + index: this.current!.nextCellIndex + }; + } + + consume(length: number) { + const rawSpan = this.current!.normalizer.toRaw([0, length]); + const rawLength = this.current!.leadingSpaces + rawSpan[END]; + this.current = null; + return this.cellProvider.consume(rawLength); + } + + skip() { + this.current = null; + this.cellProvider.skip(); + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts new file mode 100644 index 000000000..15580c849 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts @@ -0,0 +1,87 @@ +import { TextSpan } from '../../types'; +import { TextBoxMapping, TextBoxMappingEntry, TextBoxMappingResult } from './types'; +import { Dictionary } from 'lodash'; +import groupBy from 'lodash/groupBy'; +import { + spanCompare, + spanFromSubSpan, + spanGetSubSpan, + spanIntersection, + spanIntersects +} from '../common/textSpanUtils'; +import { TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; +import { TextNormalizer } from '../common/TextNormalizer'; + +const debugOut = require('debug')?.('pdf:mapping:TextBoxMappingImpl'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +export class TextBoxMappingImpl implements TextBoxMapping { + private readonly mappingEntryMap: Dictionary; + + 
constructor(mappingEntries: TextBoxMappingEntry[]) { + this.mappingEntryMap = groupBy(mappingEntries, m => m.text.cell.id); + + // sort by span offset + Object.values(this.mappingEntryMap).forEach(value => { + value.sort((a, b) => spanCompare(a.text.span, b.text.span)); + }); + debug('TextBoxMapping created'); + debug(this); + } + + getEntries(sourceCell: TextLayoutCell, spanInSourceCell: TextSpan) { + return (this.mappingEntryMap[sourceCell.id] || []).filter(m => + spanIntersects(m.text.span, spanInSourceCell) + ); + } + + apply(source: TextLayoutCellBase, aSpan?: TextSpan): TextBoxMappingResult { + const span: TextSpan = aSpan || [0, source.text.length]; + + const { cell: sourceCell, span: sourceSpan } = source.getNormalized(); + const spanInSourceCell = sourceSpan ? spanFromSubSpan(sourceSpan, span) : span; + + debug('applying TextBoxMapping'); + debug(source, span); + const entries = this.getEntries(sourceCell, spanInSourceCell); + const result = entries.map(m => { + if (!m.box) { + return { cell: null, sourceSpan: m.text.span }; + } else { + let boxSpan; + if (hasSameText(m.text.cell, m.text.span, source, spanInSourceCell)) { + boxSpan = spanGetSubSpan(m.text.span, spanInSourceCell); + } else { + const n1 = new TextNormalizer(m.text.cell.text); + const normalizedBoxSpan = spanGetSubSpan( + n1.toNormalized(m.text.span), + n1.toNormalized(spanInSourceCell) + ); + const n2 = new TextNormalizer(m.box.cell.text); + boxSpan = n2.toRaw(normalizedBoxSpan); + } + + return { + cell: m.box.cell.getPartial(boxSpan), + sourceSpan: spanIntersection(m.text.span, spanInSourceCell) + }; + } + }); + debug('applying TextBoxMapping - result'); + debug(result); + return result; + } +} + +function hasSameText( + textCell: TextLayoutCellBase, + textSpan: TextSpan, + sourceCell: TextLayoutCellBase, + sourceSpan: TextSpan +) { + const left = textCell.text.substring(...textSpan); + const right = sourceCell.text.substring(...sourceSpan); + return left === right; +} diff --git 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts new file mode 100644 index 000000000..08f8ff4e9 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts @@ -0,0 +1,92 @@ +import { TextSpan } from '../../types'; +import { + END, + START, + spanIntersects, + spanIncludesIndex, + spanGetText, + spanIntersection +} from '../common/textSpanUtils'; +import { findLargestIndex } from '../common/findLargestIndex'; + +const MAX_HISTORY = 3; + +export type TextMatch = { + span: TextSpan; + skipText: string; + minHistoryDistance: number; + textAfterEnd: string; +}; + +export class TextProvider { + private readonly fieldText: string; + private remainingSpans: TextSpan[]; + private history: number[] = [0]; // Keep MAX_HISTORY last recently consumed + + constructor(fieldText: string) { + this.fieldText = fieldText; + this.remainingSpans = [[0, fieldText.length]]; + } + + getMatches(text: string, minLength = 1, maxLength = text.length): TextMatch[] { + const match = findLargestIndex(minLength, maxLength + 1, index => { + const lengthToMatch = index; + const textToMatch = text.substring(0, lengthToMatch); + + const result: TextMatch[] = []; + for (const aSpan of this.remainingSpans) { + const [spanBegin, spanEnd] = aSpan; + const spanText = this.fieldText.slice(spanBegin, spanEnd); + + const foundIndex = spanText.indexOf(textToMatch); + if (foundIndex >= 0) { + const foundSpanBegin = spanBegin + foundIndex; + const foundSpanEnd = foundSpanBegin + textToMatch.length; + const historyDistances = this.history.map(i => { + const v = foundSpanBegin - i; + return v >= 0 ? 
v : Number.MAX_SAFE_INTEGER; + }); + result.push({ + span: [foundSpanBegin, foundSpanEnd], + skipText: spanText.substring(0, foundIndex), + minHistoryDistance: Math.min(...historyDistances, this.fieldText.length), + textAfterEnd: this.remainingSpans + .map(span => { + const validSpan = spanIntersection([foundSpanEnd, this.fieldText.length], span); + return spanGetText(this.fieldText, validSpan); + }) + .join('') + }); + } + } + return result.length > 0 ? result : null; + }); + + return match ? match.value : []; + } + + consume(span: TextSpan) { + const remaining: TextSpan[] = []; + this.remainingSpans.forEach(remainingSpan => { + if (spanIntersects(span, remainingSpan)) { + if (remainingSpan[START] < span[START]) { + remaining.push([remainingSpan[START], span[START]]); + } + if (span[END] < remainingSpan[END]) { + remaining.push([span[END], remainingSpan[END]]); + } + } else { + remaining.push(remainingSpan); + } + }); + this.remainingSpans = remaining; + + // update history + const validSpans = [span[END], ...this.history].filter(index => { + if (spanIncludesIndex(span, index)) return false; + if (!this.remainingSpans.some(s => spanIncludesIndex(s, index))) return false; + return true; + }); + this.history = validSpans.slice(0, MAX_HISTORY); + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/__tests__/TextProvider.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/__tests__/TextProvider.test.ts new file mode 100644 index 000000000..928b4cabd --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/__tests__/TextProvider.test.ts @@ -0,0 +1,56 @@ +import { TextProvider } from '../TextProvider'; + +describe('TextProvider', () => { + it('should find correct span for a text', () => { + const fieldText = 'This is a sample sample 
text content.'; + const provider = new TextProvider(fieldText); + + const r = provider.getMatches('sample')[0]; + expect(r?.skipText).toBe('This is a '); + expect(r?.span).toEqual([10, 16]); + expect(r?.minHistoryDistance).toBe(10); + expect(r?.textAfterEnd).toBe(' sample text content.'); + }); + + it('should find correct spans for a text after consuming a span', () => { + const fieldText = 'This is a sample sample text content.'; + const matcher = new TextProvider(fieldText); + + // match and consumer a word + let match = matcher.getMatches('sample'); + let r = match[0]; + matcher.consume(r?.span); + + // find span in former of remaining spans + match = matcher.getMatches(' is'); + expect(match).toHaveLength(1); + r = match[0]; + expect(r?.skipText).toBe('This'); + expect(r?.span).toEqual([4, 7]); + expect(r?.minHistoryDistance).toBe(4); + expect(r?.textAfterEnd).toBe(' a sample text content.'); + + // find span in latter of remaining spans + match = matcher.getMatches('sample'); + expect(match).toHaveLength(1); + r = match[0]; + expect(r?.skipText).toBe(' '); + expect(r?.span).toEqual([17, 23]); + expect(r?.minHistoryDistance).toBe(1); + expect(r?.textAfterEnd).toBe(' text content.'); + + // find spans in both of remaining spans + match = matcher.getMatches('s'); + expect(match).toHaveLength(2); + r = match[0]; + expect(r?.skipText).toBe('Thi'); + expect(r?.span).toEqual([3, 4]); + expect(r?.minHistoryDistance).toBe(3); + expect(r?.textAfterEnd).toBe(' is a sample text content.'); + r = match[1]; + expect(r?.skipText).toBe(' '); + expect(r?.span).toEqual([17, 18]); + expect(r?.minHistoryDistance).toBe(1); + expect(r?.textAfterEnd).toBe('ample text content.'); + }); +}); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts new 
file mode 100644 index 000000000..dc618c6b7 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts @@ -0,0 +1,133 @@ +import minBy from 'lodash/minBy'; +import { TextSpan } from '../../types'; +import { bboxIntersects } from '../common/bboxUtils'; +import { nonEmpty } from '../common/nonEmpty'; +import { spanLen, spanUnion } from '../common/textSpanUtils'; +import { TextLayout, TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; +import { MappingSourceTextProvider } from './MappingSourceTextProvider'; +import { MappingTargetBoxProvider } from './MappingTargetCellProvider'; +import { TextBoxMappingImpl } from './TextBoxMapping'; +import { TextBoxMappingEntry } from './types'; + +const debugOut = require('debug')?.('pdf:mapping:getTextBoxMapping'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +function findMatchInSources( + sources: { + cell: TextLayoutCell; + provider: MappingSourceTextProvider; + }[], + textToMatch: string +) { + // find matches + const matches = sources.map(source => { + const match = source.provider.getMatch(textToMatch); + return { + cell: source.cell, + provider: source.provider, + match + }; + }); + + // calc cost for each match + let skipTextLen = 0; + const matchesWithCost = matches.map(aMatch => { + const { match: providerMatch } = aMatch; + const cost = !providerMatch + ? Number.MAX_SAFE_INTEGER + : skipTextLen + providerMatch.skipText.length - spanLen(providerMatch.span); + + skipTextLen += providerMatch?.approxLenAfterEnd ?? 
0; + + return { ...aMatch, cost }; + }); + + // find best match + const bestMatch = minBy(matchesWithCost, match => match.cost); + return bestMatch; +} + +export function getTextBoxMappings< + SourceCell extends TextLayoutCell, + TargetCell extends TextLayoutCell +>(source: TextLayout, target: TextLayout) { + const sourceProviders = source.cells.map(cell => new MappingSourceTextProvider(cell)); + const targetProvider = new MappingTargetBoxProvider(target.cells); + + const targetIndexToSources = target.cells.map(targetCell => { + return source.cells + .map((sourceCell, index) => { + if (!bboxIntersects(sourceCell.bbox, targetCell.bbox)) { + return null; + } + return { cell: sourceCell, provider: sourceProviders[index] }; + }) + .filter(nonEmpty); + }); + + const mappingEntries: TextBoxMappingEntry[] = []; + + debug('getTextBoxMapping'); + while (targetProvider.hasNext()) { + // find matches + const { index: targetCellIndex, text: targetText } = targetProvider.getNextInfo(); + debug('> find match at index %d, text: %s', targetCellIndex, targetText); + const matchInSource = findMatchInSources(targetIndexToSources[targetCellIndex], targetText); + debug('> source cell(s) matched: %o', matchInSource); + + // skip when no match found... 
+ if (!matchInSource?.match || spanLen(matchInSource.match.span) === 0) { + targetProvider.skip(); + continue; + } + + const matchedSourceSpan = matchInSource.match.span; + const matchedSourceProvider = matchInSource.provider; + const matchedLength = spanLen(matchedSourceSpan); + + const matchedTargetCells = targetProvider.consume(matchedLength); + debug('> target cells for matched length: %d', matchedLength); + debug(matchedTargetCells); + + let consumedSourceSpan: TextSpan = [0, 0]; + matchedTargetCells.forEach(mTargetCell => { + const trimmedCell = trimCell(mTargetCell); + if (trimmedCell.text.length > 0) { + const matchToTargetCell = matchedSourceProvider.getMatch(trimmedCell.text); + debug('>> target cell %o (%o) to source %o', mTargetCell, trimmedCell, matchToTargetCell); + if (matchToTargetCell) { + // consume source text which is just mapped to the target + matchedSourceProvider.consume(matchToTargetCell.span); + consumedSourceSpan = spanUnion(consumedSourceSpan, matchToTargetCell.span); + mappingEntries.push({ + text: { cell: matchInSource.cell, span: matchToTargetCell.span }, + box: { cell: trimmedCell } + }); + debug('>> added mapping entry %o', mappingEntries[mappingEntries.length - 1]); + } + } + }); + // consume entire the range that is matched to sources + if (spanLen(consumedSourceSpan) > 0) { + matchedSourceProvider.consume(consumedSourceSpan); + debug('> span consumed in source: ', consumedSourceSpan); + } + } + + return new TextBoxMappingImpl(mappingEntries); +} + +function trimCell(cell: TextLayoutCellBase) { + const text = cell.text; + const nLeadingSpaces = text.match(/^\s*/)![0].length; + const nTrailingSpaces = text.match(/\s*$/)![0].length; + if (nLeadingSpaces === 0 && nTrailingSpaces === 0) { + return cell; + } + if (text.length > nLeadingSpaces + nTrailingSpaces) { + return cell.getPartial([nLeadingSpaces, text.length - nTrailingSpaces]); + } + return cell.getPartial([0, 0]); +} diff --git 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/index.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/index.ts new file mode 100644 index 000000000..8e16507ac --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/index.ts @@ -0,0 +1 @@ +export { getTextBoxMappings } from './getTextBoxMapping'; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts new file mode 100644 index 000000000..132694a7d --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts @@ -0,0 +1,16 @@ +import { TextSpan } from '../../types'; +import { TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; + +export type TextBoxMappingResult = { + cell: TextLayoutCellBase | null; + sourceSpan: TextSpan; +}[]; + +export interface TextBoxMapping { + apply(source: TextLayoutCellBase, span?: TextSpan): TextBoxMappingResult; +} + +export interface TextBoxMappingEntry { + text: { cell: TextLayoutCell; span: TextSpan }; + box: { cell: TextLayoutCellBase } | null; +} From e43318a6f1afc9c9b044ce43ac886b500b055652 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 10 Nov 2021 16:43:20 +0900 Subject: [PATCH 15/51] feat: add PDF highlight component --- .../PdfViewerHighlight/PdfViewerHighlight.tsx | 109 +++++++++++ .../PdfViewerWithHighlight.stories.scss | 25 +++ .../PdfViewerWithHighlight.stories.tsx | 182 ++++++++++++++++++ .../PdfViewerWithHighlight.tsx | 105 ++++++++++ .../_document-preview-pdf-viewer.scss | 13 ++ 5 files changed, 434 insertions(+) create mode 100644 
packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx new file mode 100644 index 000000000..7991ddfb3 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx @@ -0,0 +1,109 @@ +import React, { FC, useMemo, useEffect } from 'react'; +import cx from 'classnames'; +import { DocumentFieldHighlight } from './types'; +import { QueryResult } from 'ibm-watson/discovery/v2'; +import { PdfTextLayerInfo } from '../PdfViewer/PdfViewerTextLayer'; +import { Highlighter } from './utils/Highlighter'; +import { ExtractedDocumentInfo } from './utils/common/documentUtils'; +import { settings } from 'carbon-components'; + +interface Props { + className?: string; + highlightClassName?: string; + + document: QueryResult; + documentInfo: ExtractedDocumentInfo | null; + pageNum: number; + highlights: DocumentFieldHighlight[]; + pdfTextLayerInfo?: PdfTextLayerInfo; + + scale?: number; + + useHtmlBbox?: boolean; + usePdfTextItem?: boolean; +} + +const PdfViewerHighlight: FC = ({ + className, + highlightClassName, + document, + documentInfo, + pageNum, + highlights, + pdfTextLayerInfo, + scale = 1.0, + useHtmlBbox = true, + usePdfTextItem = true +}) => { + 
const { + viewport: pdfViewport, + textContent: pdfTextContent, + textDivs: pdfTextDivs + } = pdfTextLayerInfo || {}; + const highlighter = useMemo(() => { + if (documentInfo && documentInfo.textMappings) { + return new Highlighter({ + document, + textMappings: documentInfo.textMappings, + pageNum, + htmlBboxInfo: useHtmlBbox + ? { + bboxes: documentInfo.processedDoc.bboxes, + styles: documentInfo.processedDoc.styles + } + : undefined, + pdfTextContentInfo: + usePdfTextItem && pdfTextContent && pdfViewport + ? { textContent: pdfTextContent, viewport: pdfViewport } + : undefined + }); + } + return null; + }, [document, documentInfo, pageNum, pdfTextContent, pdfViewport, useHtmlBbox, usePdfTextItem]); + + useEffect(() => { + if (highlighter) { + highlighter.setTextContentDivs(pdfTextDivs); + } + }, [highlighter, pdfTextDivs]); + + const highlightBoxes = useMemo(() => { + return highlights.map(highlight => { + return highlighter?.getHighlight(highlight); + }); + }, [highlighter, highlights]); + + return ( +
+ {highlightBoxes.map((hl, hlIndex) => { + return ( + + {hl?.boxes.map((item, index) => { + const padding = 0; + const [left, top, right, bottom] = item.bbox; + return ( +
+ ); + })} + + ); + })} +
+ ); +}; + +export default PdfViewerHighlight; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss new file mode 100644 index 000000000..6de187694 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss @@ -0,0 +1,25 @@ +.withTextSelection { + display: flex; + height: 800px; + + .rightPane { + flex: 1 1 auto; + width: 20%; + overflow-y: scroll; + + p { + margin-bottom: 0.5rem; + } + } + .text { + overflow-wrap: break-word; + white-space: pre-wrap; + font-size: 10pt; + font-family: 'Courier New', Courier, monospace; + } + + .highlight { + opacity: 0.4; + background: rgba(255, 64, 128, 1); + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx new file mode 100644 index 000000000..67bdd3ffa --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx @@ -0,0 +1,182 @@ +import React, { useCallback, useMemo, useRef, useState } from 'react'; +import { storiesOf } from '@storybook/react'; +import { withKnobs, radios, number } from '@storybook/addon-knobs'; +import { action } from '@storybook/addon-actions'; +import PdfViewerWithHighlight from './PdfViewerWithHighlight'; +import { flatten } from 'lodash'; +import { DocumentFieldHighlight } from './types'; + +import { document as doc } from 'components/DocumentPreview/__fixtures__/Art Effects.pdf'; +import document from 'components/DocumentPreview/__fixtures__/Art Effects Koya Creative 
Base TSA 2008.pdf.json'; + +import './PdfViewerWithHighlight.stories.scss'; + +const pageKnob = { + label: 'Page', + options: { + range: true, + min: 1, + max: 8, + step: 1 + }, + defaultValue: 1 +}; + +const zoomKnob = { + label: 'Zoom', + options: { + 'Zoom out (50%)': '0.5', + 'Default (100%)': '1', + 'Zoom in (150%)': '1.5' + }, + defaultValue: '1' +}; + +const EMPTY: never[] = []; + +const WithTextSelection: typeof PdfViewerWithHighlight = props => { + const [selectedField, setSelectedField] = useState('text|||0'); + const { document } = props; + + const handleOnChangeField = useCallback((e: React.ChangeEvent) => { + setSelectedField(e.target.value); + }, []); + const [selectedFieldName, selectedFieldIndex] = useMemo(() => { + const [n, i] = selectedField?.split('|||') || []; + return [n, Number(i)]; + }, [selectedField]); + const fieldOptions = useMemo(() => { + const fields = Object.keys(document).filter(field => { + return !field.match(/^(document_id|extracted_|enriched_)/) && document[field]?.length > 0; + }); + return flatten( + fields.map(field => { + return document[field] + .map((content: any, index: number) => { + if (typeof content === 'string') { + return { + value: `${field}|||${index}`, + label: `${field}[${index}]` + }; + } + return null; + }) + .filter((x: any) => !!x); + }) + ); + }, [document]); + + // text selection & highlights + const [highlights, setHighlights] = useState([]); + + const fieldTextNodeRef = useRef(null); + const getFieldTextSelection = () => { + const selection = window.getSelection(); + if (!fieldTextNodeRef.current) { + return null; + } + if (!selection || selection.rangeCount < 1 || selection.isCollapsed) { + return null; + } + + const { anchorNode, focusNode, anchorOffset, focusOffset } = selection; + const anchorParentNode = anchorNode?.parentNode as HTMLElement; + const focusParentNode = focusNode?.parentNode as HTMLElement; + if ( + anchorParentNode !== fieldTextNodeRef.current || + focusParentNode !== 
fieldTextNodeRef.current + ) { + return null; + } + + const text = selection.toString(); + return { text, begin: anchorOffset, end: focusOffset }; + }; + const handleOnMouseUp = (_: MouseEvent) => { + const textSelection = getFieldTextSelection(); + if (!textSelection) { + return; + } + + const { begin, end } = textSelection; + const fieldText = document[selectedFieldName][selectedFieldIndex]; + + const highlight: DocumentFieldHighlight = { + field: selectedFieldName, + fieldIndex: selectedFieldIndex, + location: { begin: Math.min(begin, end), end: Math.max(begin, end) }, + text: fieldText?.substring(begin, end) + } as DocumentFieldHighlight; + setHighlights([highlight]); + }; + + return ( +
+ +
+
+ +
+

+ {/* eslint-disable-next-line jsx-a11y/no-onchange*/} + +

+
Select text to highlight
+ {/* eslint-disable-next-line jsx-a11y/no-noninteractive-element-interactions */} +

+ {selectedField && + document[selectedFieldName][selectedFieldIndex] + .replace(/ /g, '\u00a0') // NBSP + .replaceAll('\n', '\\n')} +

+
+
+ ); +}; + +storiesOf('DocumentPreview/components/PdfViewerWithHighlight', module) + .addDecorator(withKnobs) + .add('default', () => { + const page = number(pageKnob.label, pageKnob.defaultValue, pageKnob.options); + const zoom = radios(zoomKnob.label, zoomKnob.options, zoomKnob.defaultValue); + const scale = parseFloat(zoom); + const setLoadingAction = action('setLoading'); + + return ( + + ); + }) + .add('with text selection', () => { + const page = number(pageKnob.label, pageKnob.defaultValue, pageKnob.options); + const zoom = radios(zoomKnob.label, zoomKnob.options, zoomKnob.defaultValue); + const scale = parseFloat(zoom); + const setLoadingAction = action('setLoading'); + + return ( + + ); + }); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx new file mode 100644 index 000000000..a1f6fef63 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx @@ -0,0 +1,105 @@ +import React, { FC, useState, useEffect } from 'react'; +import { PDFSource } from 'pdfjs-dist'; +import { QueryResult } from 'ibm-watson/discovery/v2'; +import { DocumentFieldHighlight } from './types'; +import PdfViewer from '../PdfViewer/PdfViewer'; +import PdfViewerHighlight from './PdfViewerHighlight'; +import { extractDocumentInfo, ExtractedDocumentInfo } from './utils/common/documentUtils'; + +interface Props { + className?: string; + highlightClassName?: string; + + /** + * PDF file data as base64-encoded string + */ + file: string; + + /** + * Page number, starting at 1 + */ + page: number; + + /** + * Zoom factor, where `1` is equal to 100% + */ + scale: number; + + /** + * Options passed to PdfJsLib.getDocument + */ + pdfLoadOptions?: PDFSource; + + /** + * Callback invoked with page count, 
once `file` has been parsed + */ + setPageCount?: (count: number) => void; + /** + * Check if document is loading + */ + setLoading?: (loading: boolean) => void; + /** + * Callback which is invoked with whether to enable/disable toolbar controls + */ + setHideToolbarControls?: (disabled: boolean) => void; + + /** + * A document + */ + document: QueryResult; + + /** + * Highlight + */ + highlights: DocumentFieldHighlight[]; + + /** + * Consider bboxes in HTML field to highlight (internal) + */ + useHtmlBbox?: boolean; +} + +const PdfViewerWithHighlight: FC = ({ + highlightClassName, + document, + highlights, + useHtmlBbox, + ...rest +}) => { + const { page, scale } = rest; + const [textLayerInfo, setTextLayerInfo] = useState(); + + const [documentInfo, setDocumentInfo] = useState(null); + useEffect(() => { + let cancelled = false; + const extractDocInfo = async () => { + const info = await extractDocumentInfo(document); + if (!cancelled) { + setDocumentInfo(info); + } + }; + extractDocInfo(); + return () => { + cancelled = true; + }; + }, [document]); + + const highlightReady = !!documentInfo && !!textLayerInfo; + return ( + + + + ); +}; + +export default PdfViewerWithHighlight; diff --git a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss index dce987167..fb66bb360 100644 --- a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss +++ b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss @@ -12,3 +12,16 @@ .#{$prefix}--document-preview-pdf-viewer--text { transform-origin: left top 0px; } + +.#{$prefix}--document-preview-pdf-viewer-highlight { + position: absolute; + transform-origin: left top 0px; + top: 0; + left: 0; +} + +.#{$prefix}--document-preview-pdf-viewer-highlight--item { + position: absolute; + opacity: 0.5; + background: rgba(0, 0, 255, 
1); +} From 1d46e110822fa91da9f87758cb6684519b6d16e2 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 10 Nov 2021 20:59:44 +0900 Subject: [PATCH 16/51] fix: fix readme --- .../PdfViewerHighlight/utils/README.md | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md index 94a34ae74..30a215684 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md @@ -2,33 +2,27 @@ ### TextLayout -`TextLayout` shows that what text is placed where in a page. `TextLayout` has multiple `TextLayoutCells`, which shows a particular text is rendered in a particular boundary box. +`TextLayout` shows that what text is placed where in a page. `TextLayout` has multiple `TextLayoutCells`. Each cell shows a particular text is rendered in a particular boundary box. -So, `metadata.text_mappings` is a kind of `TextLayout` because it bounds text to a boundary box. `bbox`es stored in `html` field can be `TextLayout`s. Also, text content items from PDF (i.e. PDF programmatic text) can be `TextLayout`s. +So, `metadata.text_mappings` is a kind of `TextLayout` because it bounds text to a boundary box. `bbox`es stored in `html` field can also be a `TextLayout`. Text content items from PDF (i.e. PDF programmatic text) as well. -Each type of text layout has each granularity, text length and the size of boundary box in a `TextLayoutCell` are different. For example, a cell from `text_mappings` typically has longer text (sometimes it's a paragraph) and large boundary box. 
A cell from PDF text content item has shorter text (say it's word or short phrase) and small boundary box. +Depending on its source, each type of text layout has each granularity, i.e. text length and the size of boundary box in a `TextLayoutCell` are different. For example, a cell from `text_mappings` typically has longer text (sometimes it's a paragraph) and large boundary box. A cell from PDF text content item has shorter text (say it's word or short phrase) and small boundary box. -For highlighting, smaller boundary boxes allow more accurate highlight location. +For highlighting, smaller boundary boxes produces more accurate highlight boundary box. ### Find smaller text layout cell using `TextBoxMappings` -So, we build mappings from larger cells to smaller cells. More detail, map a span on a text of a large cell to a span on a text of a smaller cell. +So, we build mappings from larger cells to smaller cells. More detail, mapping from a span on a text in a large cell to a span on a text in a smaller cell. -We typically starts with cells from `text_mapping` because we can find a cell and s span on it from a span on a field. Then we can use the mappings to find smaller cells, which are typically from PDF text content items. +To find highlight boundary box, we typically starts with cells from `text_mapping` because we can find a cell and s span on it from a span on a field. Then, use the mappings to find smaller cells, which are typically from PDF text content items. -However, calculation of the mapping is not straightforward. Cells can be over-wrapped, order of smaller cells may not same to the text in a larger cells. So, `getTextBoxMappings` and it helpers `TextNormalizer`, `TextProvider`, `CellProvider` are for calculating the best mapping even with the situation. +However, calculation of the mapping is not straightforward. Cells can be over-wrapped, order of smaller cells may not same to the text in a larger cells. 
`getTextBoxMappings` and it helpers `TextNormalizer`, `TextProvider`, `CellProvider` are used to calculate a good mapping even with the situation. ### Text layout cell to boundary box -`TextLayout` shows what text is placed where in a page. `TextLayout` has multiple `TextLayoutCells`, which shows a particular text is rendered in a particular boundary box. +Now, we have small cells for highlighting. -So, `metadata.text_mappings` is a kind of `TextLayout` because it bounds text to a boundary box. `bbox`es stored in `html` field can be `TextLayout`s. Also, text content items from PDF (i.e. PDF programmatic text) can be `TextLayout`s. - -Each type of text layout has each granularity, text length and the size of boundary box in a `TextLayoutCell` are different. For example, a cell from `text_mappings` typically has longer text (sometimes it's a paragraph) and large boundary box. A cell from PDF text content item has shorter text (say it's word or short phrase) and small boundary box. - -For highlighting, smaller boundary boxes allow more accurate highlight location. - -Even with a small cell, text to highlight may be a span on a cell text. In the case, we have to calculate boundary box for the text. By default, cells approximate the boundary box by assigning width evenly to every characters in the cell text. +Even with a small cell, text to highlight may be a span on a cell text. In the case, we have to calculate boundary box for the span. By default, cells approximate the boundary box by assigning width evenly to every characters in the cell text. Some `TextLayoutCalls` has capability of calculating boundary box for a sub-span of its text. For example, cells for PDF text items `PdfTextContentTextLayoutCell` can calculate boundary boxes for given text spans. It internally uses DOM and DOM's `getBoundingClientRect` to get the result. 
From 9b33674849553dd8082cf65511cc5c4d1737edb1 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 10 Nov 2021 21:54:35 +0900 Subject: [PATCH 17/51] fix: revise readme --- .../components/PdfViewerHighlight/utils/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md index 30a215684..4f4ad2966 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md @@ -18,6 +18,20 @@ To find highlight boundary box, we typically starts with cells from `text_mappin However, calculation of the mapping is not straightforward. Cells can be over-wrapped, order of smaller cells may not same to the text in a larger cells. `getTextBoxMappings` and it helpers `TextNormalizer`, `TextProvider`, `CellProvider` are used to calculate a good mapping even with the situation. +#### How to build mappings + +`CellProvider` denotes fine-grained text layout. It provides small text layout cells with the text. `MappingTargetBoxProvider` wraps `CellProvider` mainly for normalizing text. Normalization is important because the text in original PDF can be refined in field text. For example, two consequence spaces are normalized to one, and quotation marks can be normalized. + +`TextProvider` provides text from course-grained text layout cells. User can consume spans on the text (i.e. mark the text span used) and the class manages text which is yet to be consumed. The class can find `match` to a given text in the remaining text and returns score of match. `MappingSourceTextProvider` wraps `TextProvider` for text normalization. 
+ +With these classes, `getTextBoxMappings` builds mappings as follow: + +1. Load text from `CellProvider`. It may spans on multiple text layout cells +2. Find match in `TextProvider`, and then consume the matched text +3. For each text layout cells in the matched text, + 1. associate the text layout cell and a span on the matched text + 2. mark the text layout cell consumed + ### Text layout cell to boundary box Now, we have small cells for highlighting. From 73c2ff87295094f50bc0393e74b24b4114c27198 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 17 Nov 2021 15:45:37 +0900 Subject: [PATCH 18/51] fix: fix readme --- .../components/PdfViewerHighlight/utils/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md index 4f4ad2966..daec4a7a5 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/README.md @@ -4,7 +4,7 @@ `TextLayout` shows that what text is placed where in a page. `TextLayout` has multiple `TextLayoutCells`. Each cell shows a particular text is rendered in a particular boundary box. -So, `metadata.text_mappings` is a kind of `TextLayout` because it bounds text to a boundary box. `bbox`es stored in `html` field can also be a `TextLayout`. Text content items from PDF (i.e. PDF programmatic text) as well. +So, `metadata.text_mappings` is a kind of `TextLayout` because it bounds text to boundary boxes. `bbox`es stored in `html` field can also be a `TextLayout`. Text objects in a PDF page (`TextContentItem`s in `pdfjs-dist` npm package) can also be a `TextLayout`. 
Depending on its source, each type of text layout has each granularity, i.e. text length and the size of boundary box in a `TextLayoutCell` are different. For example, a cell from `text_mappings` typically has longer text (sometimes it's a paragraph) and large boundary box. A cell from PDF text content item has shorter text (say it's word or short phrase) and small boundary box. @@ -14,19 +14,19 @@ For highlighting, smaller boundary boxes produces more accurate highlight bounda So, we build mappings from larger cells to smaller cells. More detail, mapping from a span on a text in a large cell to a span on a text in a smaller cell. -To find highlight boundary box, we typically starts with cells from `text_mapping` because we can find a cell and s span on it from a span on a field. Then, use the mappings to find smaller cells, which are typically from PDF text content items. +To find highlight boundary box, we typically starts with cells from `text_mapping` because we can find a cell and a span on it from a span on a field. Then, use the mappings to find smaller cells, which are typically from PDF text content items. -However, calculation of the mapping is not straightforward. Cells can be over-wrapped, order of smaller cells may not same to the text in a larger cells. `getTextBoxMappings` and it helpers `TextNormalizer`, `TextProvider`, `CellProvider` are used to calculate a good mapping even with the situation. +However, calculation of the mapping is not straightforward. A smaller cell can be overlapped with two or more larger cells. The order of smaller cells may not be the same as the text in a larger cells. They make hard to find a smaller cell from a span on a text in a larger cell. `getTextBoxMappings` and it helpers `TextNormalizer`, `TextProvider`, `CellProvider` are used to calculate a good mapping even with the situation. #### How to build mappings -`CellProvider` denotes fine-grained text layout. It provides small text layout cells with the text. 
`MappingTargetBoxProvider` wraps `CellProvider` mainly for normalizing text. Normalization is important because the text in original PDF can be refined in field text. For example, two consequence spaces are normalized to one, and quotation marks can be normalized. +`CellProvider` denotes fine-grained text layout. It provides small text layout cells with the text. `MappingTargetBoxProvider` wraps `CellProvider` mainly for normalizing text. Normalization is important because the text in original PDF can be refined in field text. For example, two consecutive spaces are normalized to one, and quotation marks can be normalized. `TextProvider` provides text from course-grained text layout cells. User can consume spans on the text (i.e. mark the text span used) and the class manages text which is yet to be consumed. The class can find `match` to a given text in the remaining text and returns score of match. `MappingSourceTextProvider` wraps `TextProvider` for text normalization. With these classes, `getTextBoxMappings` builds mappings as follow: -1. Load text from `CellProvider`. It may spans on multiple text layout cells +1. Load text from `CellProvider`. It may span on multiple text layout cells 2. Find match in `TextProvider`, and then consume the matched text 3. For each text layout cells in the matched text, 1. 
associate the text layout cell and a span on the matched text From 050db2d91fd70707a66da42b61b22401bc7cc21a Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 17 Nov 2021 18:22:12 +0900 Subject: [PATCH 19/51] refactor: extract logic of iterating range rects --- .../utils/textLayout/dom.ts | 26 +++++-------- .../src/utils/document/documentUtils.ts | 37 ++++++++++++++----- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts index d2cfae28d..498879b72 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts @@ -1,4 +1,4 @@ -import { getTextNodeAndOffset, uniqRects } from 'utils/document/documentUtils'; +import { forEachRectInRange, getTextNodeAndOffset } from 'utils/document/documentUtils'; import { Bbox, TextSpan } from '../../types'; import { BOTTOM, LEFT, RIGHT, TOP } from '../common/bboxUtils'; import { END, START } from '../common/textSpanUtils'; @@ -23,23 +23,14 @@ export function getAdjustedCellByOffsetByDom( const beginOffset = textSpan[START]; const endOffset = Math.min(cell.text.length, textSpan[END]); - let left = cell.bbox[LEFT]; - let right = cell.bbox[RIGHT]; - const top = cell.bbox[TOP]; - const bottom = cell.bbox[BOTTOM]; - - // convert offset - function getAdjustedOffset(orgOffset: number) { - return orgOffset; - } try { const { textNode: beginTextNode, textOffset: beginTextOffset } = beginOffset > 0 - ? getTextNodeAndOffset(spanElement, getAdjustedOffset(beginOffset)) + ? 
getTextNodeAndOffset(spanElement, beginOffset) : { textNode: spanElement.firstChild, textOffset: 0 }; const { textNode: endTextNode, textOffset: endTextOffset } = endOffset > 0 - ? getTextNodeAndOffset(spanElement, getAdjustedOffset(endOffset)) + ? getTextNodeAndOffset(spanElement, endOffset) : { textNode: spanElement.lastChild, textOffset: spanElement.lastChild.length }; debug('finding text node for: ', cell.text); @@ -48,13 +39,14 @@ export function getAdjustedCellByOffsetByDom( debug(' textContent: ', endTextNode.textContent); debug(' endOffset: ', endTextOffset); - const range = document.createRange(); - range.setStart(beginTextNode, Math.min(beginTextOffset, beginTextNode.length)); - range.setEnd(endTextNode, Math.min(endTextOffset, endTextNode.length)); - // create highlight rect(s) inside of a field + let left = cell.bbox[LEFT]; + let right = cell.bbox[RIGHT]; + const top = cell.bbox[TOP]; + const bottom = cell.bbox[BOTTOM]; + const parentRect = spanElement.parentElement?.getBoundingClientRect(); - Array.prototype.forEach.call(uniqRects(range.getClientRects() as DOMRectList), rect => { + forEachRectInRange(beginTextNode, beginTextOffset, endTextNode, endTextOffset, rect => { left = (rect.left - parentRect!.left) / scale; right = left + rect.width / scale; }); diff --git a/packages/discovery-react-components/src/utils/document/documentUtils.ts b/packages/discovery-react-components/src/utils/document/documentUtils.ts index d82fa9b35..ebadd1cd0 100644 --- a/packages/discovery-react-components/src/utils/document/documentUtils.ts +++ b/packages/discovery-react-components/src/utils/document/documentUtils.ts @@ -144,11 +144,6 @@ export function createFieldRects({ endTextNode, endOffset }: CreateFieldRectsProps): void { - // create a Range for each field - const range = document.createRange(); - range.setStart(beginTextNode, Math.min(beginOffset, beginTextNode.length)); - range.setEnd(endTextNode, Math.min(endOffset, endTextNode.length)); - // create a field 
container const fieldNode = document.createElement('div'); fieldNode.className = 'field'; @@ -158,21 +153,45 @@ export function createFieldRects({ fragment.appendChild(fieldNode); // create highlight rect(s) inside of a field - Array.prototype.forEach.call(uniqRects(range.getClientRects() as DOMRectList), rect => { + forEachRectInRange(beginTextNode, beginOffset, endTextNode, endOffset, rect => { const div = document.createElement('div'); div.className = 'field--rect'; div.setAttribute('data-testid', 'field-rect'); div.setAttribute( 'style', `top: ${rect.top - parentRect.top}px; - left: ${rect.left - parentRect.left}px; - width: ${rect.width}px; - height: ${rect.height}px;` + left: ${rect.left - parentRect.left}px; + width: ${rect.width}px; + height: ${rect.height}px;` ); fieldNode.appendChild(div); }); } +/** + * Iterate over all the DOMRects for a range + * @param beginTextNode + * @param beginOffset + * @param endTextNode + * @param endOffset + * @param callback a callback invoked with each DOMRect in a range + */ +export function forEachRectInRange( + beginTextNode: Text, + beginOffset: number, + endTextNode: Text, + endOffset: number, + callback: (rect: DOMRect) => any +) { + // create a Range + const range = document.createRange(); + range.setStart(beginTextNode, Math.min(beginOffset, beginTextNode.length)); + range.setEnd(endTextNode, Math.min(endOffset, endTextNode.length)); + + // visit rects in the range + Array.prototype.forEach.call(uniqRects(range.getClientRects() as DOMRectList), callback); +} + // Some browsers (Chrome, Safari) return duplicate rects export function uniqRects(rects: DOMRectList): Partial { return uniqWith( From f62c4196f72c751f72498b29978cdce2d1598777 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 17 Nov 2021 22:01:24 +0900 Subject: [PATCH 20/51] fix: apply review comments - add return type - add document to methods/classes --- .../components/PdfViewer/PdfViewer.tsx | 1 + .../PdfViewerHighlight/PdfViewerHighlight.tsx | 117 
+++++++++++++----- .../PdfViewerWithHighlight.tsx | 57 +++------ .../PdfViewerHighlight/{utils => }/README.md | 0 .../PdfViewerHighlight/utils/Highlighter.ts | 109 ++++++++++------ .../utils/common/TextNormalizer.ts | 92 +++++++++++--- .../utils/common/__tests__/bboxUtils.test.ts | 8 +- .../utils/common/bboxUtils.ts | 29 ++--- .../utils/common/documentUtils.ts | 17 ++- .../utils/common/nonEmpty.ts | 6 + .../utils/common/textSpanUtils.ts | 63 ++++++++-- .../utils/textBoxMapping/CellProvider.ts | 73 ++++++----- .../MappingSourceTextProvider.ts | 15 ++- .../MappingTargetCellProvider.ts | 17 ++- .../utils/textBoxMapping/TextBoxMapping.ts | 19 ++- .../utils/textBoxMapping/TextProvider.ts | 15 +++ .../utils/textBoxMapping/getTextBoxMapping.ts | 26 +++- .../utils/textBoxMapping/types.ts | 12 ++ .../utils/textLayout/BaseTextLayout.ts | 9 ++ .../utils/textLayout/HtmlBboxTextLayout.ts | 15 ++- .../textLayout/PdfTextContentTextLayout.ts | 35 ++++-- .../textLayout/TextMappingsTextLayout.ts | 12 ++ .../utils/textLayout/dom.ts | 8 ++ .../utils/textLayout/types.ts | 2 +- .../components/DocumentPreview/utils/box.ts | 2 +- 25 files changed, 538 insertions(+), 221 deletions(-) rename packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/{utils => }/README.md (100%) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx index 4a0de76ef..3c90b0196 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx @@ -199,4 +199,5 @@ function getCanvasInfo(viewport: any): CanvasInfo { return { width, height, canvasWidth, canvasHeight, canvasScale }; } +export type PdfViewerProps = Props; export default PdfViewer; diff --git 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx index 7991ddfb3..44e626c33 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx @@ -6,28 +6,71 @@ import { PdfTextLayerInfo } from '../PdfViewer/PdfViewerTextLayer'; import { Highlighter } from './utils/Highlighter'; import { ExtractedDocumentInfo } from './utils/common/documentUtils'; import { settings } from 'carbon-components'; +import { TextMappings } from 'components/DocumentPreview/types'; +import { ProcessedDoc } from 'utils/document'; interface Props { + /** + * Class name to style highlight layer + */ className?: string; + + /** + * Class name to style each highlight + */ highlightClassName?: string; + /** + * Document data returned by query + */ document: QueryResult; - documentInfo: ExtractedDocumentInfo | null; + + /** + * Parsed document information + */ + parsedDocument: ExtractedDocumentInfo | null; + + /** + * Current page, starting at index 1 + */ pageNum: number; + + /** + * Highlight spans on fields in document + */ highlights: DocumentFieldHighlight[]; - pdfTextLayerInfo?: PdfTextLayerInfo; + /** + * PDF text content information in a page from parsed PDF + */ + pdfTextLayerInfo: PdfTextLayerInfo | null; + + /** + * Zoom factor, where `1` is equal to 100% + */ scale?: number; + /** + * Flag to whether or not to use bbox information from html field in the document. + * True by default. This is for testing and debugging purpose. + */ useHtmlBbox?: boolean; + + /** + * Flag to whether to use PDF text items for finding bbox for highlighting. + * True by default. This is for testing and debugging purpose. 
+ */ usePdfTextItem?: boolean; } +/** + * Text highlight layer for PdfViewer + */ const PdfViewerHighlight: FC = ({ className, highlightClassName, document, - documentInfo, + parsedDocument, pageNum, highlights, pdfTextLayerInfo, @@ -35,37 +78,20 @@ const PdfViewerHighlight: FC = ({ useHtmlBbox = true, usePdfTextItem = true }) => { - const { - viewport: pdfViewport, - textContent: pdfTextContent, - textDivs: pdfTextDivs - } = pdfTextLayerInfo || {}; - const highlighter = useMemo(() => { - if (documentInfo && documentInfo.textMappings) { - return new Highlighter({ - document, - textMappings: documentInfo.textMappings, - pageNum, - htmlBboxInfo: useHtmlBbox - ? { - bboxes: documentInfo.processedDoc.bboxes, - styles: documentInfo.processedDoc.styles - } - : undefined, - pdfTextContentInfo: - usePdfTextItem && pdfTextContent && pdfViewport - ? { textContent: pdfTextContent, viewport: pdfViewport } - : undefined - }); - } - return null; - }, [document, documentInfo, pageNum, pdfTextContent, pdfViewport, useHtmlBbox, usePdfTextItem]); + const highlighter = useHighlighter({ + document, + textMappings: parsedDocument?.textMappings, + processedDoc: useHtmlBbox ? 
parsedDocument?.processedDoc : undefined, + pdfTextLayerInfo: (usePdfTextItem && pdfTextLayerInfo) || undefined, + pageNum + }); + const { textDivs } = pdfTextLayerInfo || {}; useEffect(() => { if (highlighter) { - highlighter.setTextContentDivs(pdfTextDivs); + highlighter.setTextContentDivs(textDivs); } - }, [highlighter, pdfTextDivs]); + }, [highlighter, textDivs]); const highlightBoxes = useMemo(() => { return highlights.map(highlight => { @@ -106,4 +132,35 @@ const PdfViewerHighlight: FC = ({ ); }; +const useHighlighter = ({ + document, + textMappings, + processedDoc, + pdfTextLayerInfo, + pageNum +}: { + document: QueryResult; + textMappings?: TextMappings; + processedDoc?: ProcessedDoc; + pdfTextLayerInfo?: PdfTextLayerInfo; + pageNum: number; +}) => { + return useMemo(() => { + if (textMappings) { + return new Highlighter({ + document, + textMappings, + pageNum, + htmlBboxInfo: processedDoc && { + bboxes: processedDoc.bboxes, + styles: processedDoc.styles + }, + pdfTextContentInfo: + pdfTextLayerInfo?.textContent && pdfTextLayerInfo?.viewport ? 
pdfTextLayerInfo : undefined + }); + } + return null; + }, [document, pageNum, pdfTextLayerInfo, processedDoc, textMappings]); +}; + export default PdfViewerHighlight; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx index a1f6fef63..6036ff996 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx @@ -1,64 +1,37 @@ import React, { FC, useState, useEffect } from 'react'; -import { PDFSource } from 'pdfjs-dist'; -import { QueryResult } from 'ibm-watson/discovery/v2'; import { DocumentFieldHighlight } from './types'; -import PdfViewer from '../PdfViewer/PdfViewer'; +import PdfViewer, { PdfViewerProps } from '../PdfViewer/PdfViewer'; import PdfViewerHighlight from './PdfViewerHighlight'; import { extractDocumentInfo, ExtractedDocumentInfo } from './utils/common/documentUtils'; +import { QueryResult } from 'ibm-watson/discovery/v2'; +import { PdfTextLayerInfo } from '../PdfViewer/PdfViewerTextLayer'; -interface Props { - className?: string; - highlightClassName?: string; - - /** - * PDF file data as base64-encoded string - */ - file: string; - - /** - * Page number, starting at 1 - */ - page: number; - - /** - * Zoom factor, where `1` is equal to 100% - */ - scale: number; - - /** - * Options passed to PdfJsLib.getDocument - */ - pdfLoadOptions?: PDFSource; - +interface Props extends PdfViewerProps { /** - * Callback invoked with page count, once `file` has been parsed + * Class name to style each highlight */ - setPageCount?: (count: number) => void; - /** - * Check if document is loading - */ - setLoading?: (loading: boolean) => void; - /** - * 
Callback which is invoked with whether to enable/disable toolbar controls - */ - setHideToolbarControls?: (disabled: boolean) => void; + highlightClassName?: string; /** - * A document + * Document data returned by query */ document: QueryResult; /** - * Highlight + * Highlight spans on fields in document */ highlights: DocumentFieldHighlight[]; /** - * Consider bboxes in HTML field to highlight (internal) + * Consider bboxes in HTML field to highlight. + * True by default. This is for testing purpose. */ useHtmlBbox?: boolean; } +/** + * PDF viewer component with text highlighting capability + */ const PdfViewerWithHighlight: FC = ({ highlightClassName, document, @@ -67,7 +40,7 @@ const PdfViewerWithHighlight: FC = ({ ...rest }) => { const { page, scale } = rest; - const [textLayerInfo, setTextLayerInfo] = useState(); + const [textLayerInfo, setTextLayerInfo] = useState(null); const [documentInfo, setDocumentInfo] = useState(null); useEffect(() => { @@ -90,7 +63,7 @@ const PdfViewerWithHighlight: FC = ({ - flatMap(items, item => { - if (item.cell) { - const { cell: baseCell } = item.cell.getNormalized(); - if (baseCell.parent === parent) { - const newItems = textBoxMapping.apply(item.cell); - return newItems.map(({ cell, sourceSpan }) => { - return { - cell, - sourceSpan: spanOffset(sourceSpan, item.sourceSpan[START]) - }; - }); - } - return item; - } - return []; - }); - - const { textToPdfTextItemMappings, textToHtmlBboxMappings } = this; - if (textToPdfTextItemMappings) { - items = doMapping(items, textToPdfTextItemMappings, this.textMappingsLayout); - } - if (textToHtmlBboxMappings) { - items = doMapping(items, textToHtmlBboxMappings, this.textMappingsLayout); - } - return items; + /** + * Update text content HTML elements + * @param textContentDivs HTML elements where text content items are rendered + */ + setTextContentDivs(textContentDivs?: HTMLElement[]) { + this.pdfTextContentLayout?.setDivs(textContentDivs); } + /** + * Get highlight shape from a span on 
a field + * @param highlight a span on a document field to highlight + * @returns highlight shape + */ getHighlight( highlight: T ): HighlightShape & Omit { @@ -142,10 +131,8 @@ export class Highlighter { isStart: index === 0, isEnd: index === items.length - 1 }; - } else { - debug('getHighlight - cell(%i) missing. source span: %o', item.sourceSpan); } - // drop something!! + debug('getHighlight - cell(%i) is not mapped. source span: %o', item.sourceSpan); return null; }) .filter(nonEmpty); @@ -155,4 +142,44 @@ export class Highlighter { ...rest }; } + + /** + * Get text layout cells from a span on a field + * @param highlight a span on a document field to highlight + * @returns TextLayoutCells representing the given highlight + */ + private getHighlightTextMappingResult(highlight: DocumentFieldHighlight): TextBoxMappingResult { + let items = this.textMappingsLayout.getHighlight(highlight); + + const doMapping = ( + items: TextBoxMappingResult, + textBoxMapping: TextBoxMapping, + parent: TextLayout + ) => + flatMap(items, item => { + if (item.cell) { + const { cell: baseCell } = item.cell.getNormalized(); + if (baseCell.parent === parent) { + const newItems = textBoxMapping.apply(item.cell); + return newItems.map(({ cell, sourceSpan }) => { + return { + cell, + sourceSpan: spanOffset(sourceSpan, item.sourceSpan[START]) + }; + }); + } + return item; + } + return []; + }); + + const { textToPdfTextItemMappings, textToHtmlBboxMappings } = this; + if (textToPdfTextItemMappings) { + items = doMapping(items, textToPdfTextItemMappings, this.textMappingsLayout); + } + if (textToHtmlBboxMappings) { + items = doMapping(items, textToHtmlBboxMappings, this.textMappingsLayout); + } + return items; + } } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts index 
8eab9cf07..edfc7967f 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts @@ -3,13 +3,24 @@ import { END, spanLen, START } from './textSpanUtils'; type SpanMapping = { rawSpan: TextSpan; normalizedSpan: TextSpan }; -const SPACES = { +type CharNormalizer = { + /** + * Get normalized character of the original string + */ + normal: (original: string) => string; + /** + * String representation regex that matches to characters to be normalized + */ + regexString: string; +}; + +const SPACES: CharNormalizer = { normal: () => ' ', regexString: '\\s+' }; -const DOUBLE_QUOTE = { - normal: () => '"', +const DOUBLE_QUOTE: CharNormalizer = { + normal: (_: string) => '"', regexString: `[${[ '«', // U+00AB '»', // U+00BB @@ -27,7 +38,7 @@ const DOUBLE_QUOTE = { ].join('')}]` }; -const QUOTE = { +const QUOTE: CharNormalizer = { normal: () => "'", regexString: `[${[ '‹', // U+2039 @@ -44,7 +55,10 @@ const QUOTE = { ].join('')}]` }; -const SURROGATE_PAIR = { +// handle a character that is encoded as a surrogate pair +// in Javascript string (i.e. 
UTF-16), whose length is 2 +// as a single character +const SURROGATE_PAIR: CharNormalizer = { normal: (_: string) => '_', regexString: '[\uD800-\uDBFF][\uDC00-\uDFFF]' }; @@ -52,13 +66,13 @@ const SURROGATE_PAIR = { // remove "Combining Diacritical Marks" from the string // NOTE: we may have to do this after conversion again // str.normalize("NFD").replace(/[\u0300-\u036f]/g, "") -const DIACRITICAL_MARK = { +const DIACRITICAL_MARK: CharNormalizer = { normal: () => '', regexString: '[\u0300-\u036f]' }; const DIACRITICAL_MARK_REGEX = new RegExp(DIACRITICAL_MARK.regexString, 'g'); -function normalizeDiacriticalMarks(text: string, keepLength = false) { +function normalizeDiacriticalMarks(text: string, keepLength = false): string { const r = text .normalize('NFD') .replace(DIACRITICAL_MARK_REGEX, DIACRITICAL_MARK.normal) @@ -86,11 +100,20 @@ const NORMALIZATIONS_REGEX = new RegExp( ); /** - * Normalize text + * Normalize the following in text: + * - two or more consecutive spaces to a single space + * - variants of single quote to `'` + * - variants of double quote to `"` + * - surrogate pairs to a single character `_` + * - remove diacritical marks (accent) from characters + * + * This is used for preprocessing to compare texts to ignore minor + * text differences. 
+ * @param text text to normalize * @returns normalized text @see TextNormalizer */ -export function normalizeText(text: string) { +function normalizeText(text: string): string { const r = NORMALIZATIONS.reduce((text, n) => { return text.replace(n.regex, m => n.normal(m)); }, text); @@ -101,10 +124,11 @@ export function normalizeText(text: string) { * Text normalizer with mapping between spans on original and normalized text * * Normalize the following in a text: - * - two or more consequent spaces - * - single or double quote - * - surrogate pairs - * - diacritical marks (accent) + * - two or more consecutive spaces to a single space + * - variants of single quote to `'` + * - variants of double quote to `"` + * - surrogate pairs to a single character `_` + * - remove diacritical marks (accent) from characters */ export class TextNormalizer { readonly rawText: string; @@ -161,6 +185,7 @@ export class TextNormalizer { } match = re.exec(this.rawText); } + if (cur < this.rawText.length) { const newText = this.rawText.substring(cur); const rawSpan: TextSpan = [cur, cur + newText.length]; @@ -171,10 +196,16 @@ export class TextNormalizer { normalizationMappings.push({ rawSpan, normalizedSpan }); addNormalizedText(newText); } + this.normalizedText = normalizedText; this.normalizationMappings = optimizeSpanMappings(normalizationMappings); } + /** + * Convert a span on original text to a span on normalized text + * @param rawSpan span on original text + * @returns span on normalized text + */ toNormalized(rawSpan: TextSpan): TextSpan { const [rawBegin, rawEnd] = rawSpan; @@ -193,6 +224,11 @@ export class TextNormalizer { return [normalizedIndex(rawBegin), normalizedIndex(rawEnd)]; } + /** + * Convert a span on normalized text to a span on original text + * @param normalizedSpan span on normalized text + * @returns span on original text + */ toRaw(normalizedSpan: TextSpan): TextSpan { const [normalizedBegin, normalizedEnd] = normalizedSpan; @@ -213,12 +249,22 @@ export
class TextNormalizer { return [rawIndex(normalizedBegin), rawIndex(normalizedEnd)]; } - normalize(text: string) { + /** + * Normalize a text. @see TextNormalizer for the details of the normalization + * @param text text to be normalized + * @returns normalized text + */ + normalize(text: string): string { return normalizeText(text); } - isBlank(text: string) { - return text.length === 0 || text.trim().length === 0 || text.match(/^\s*$/); + /** + * Check whether a given text is blank or not + * @param text text to be tested + * @returns `true` when the text only contains spaces + */ + isBlank(text: string): boolean { + return text.length === 0 || text.trim().length === 0 || !!text.match(/^\s*$/); } } @@ -241,7 +287,19 @@ function mapCharIndexOnSpans( ); } -function optimizeSpanMappings(mappings: SpanMapping[]) { +/** + * Optimize the mappings between spans on original text and spans on normalized text + * by merging consecutive identical mappings + * + * Example: given mapping: + * (original: [0,10] -> normalized: [0,10]) + * (original: [10,20] -> normalized: [10,20]) + * (original: [20,25] -> normalized: [20,21]) + * The mapping above is optimized to: + * (original: [0,20] -> normalized: [0,20]) + * (original: [20,25] -> normalized: [20,21]) + */ +function optimizeSpanMappings(mappings: SpanMapping[]): SpanMapping[] { const sameLength = (mapping: SpanMapping) => spanLen(mapping.normalizedSpan) === spanLen(mapping.rawSpan); const isShifted = (a: SpanMapping, b: SpanMapping) => diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts index f67fc87fd..419f0836b 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts +++ 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts @@ -1,4 +1,4 @@ -import { bboxGetSpanByRatio, bboxIntersects, isSideBySideOnLine } from '../bboxUtils'; +import { bboxGetSpanByRatio, bboxIntersects, isNextToEachOther } from '../bboxUtils'; describe('bboxIntersects', () => { it('should return true when boxes intersect', () => { @@ -30,12 +30,12 @@ describe('bboxGetSpanByRatio', () => { describe('isSideBySideOnLine', () => { it('should return true for side-by-side boxes', () => { - expect(isSideBySideOnLine([0, 0, 5, 2], [5, 0, 10, 2])).toBeTruthy(); + expect(isNextToEachOther([0, 0, 5, 2], [5, 0, 10, 2])).toBeTruthy(); }); it('should return false when boxes are not vertically aligned', () => { - expect(isSideBySideOnLine([0, 0, 5, 2], [5, 1, 10, 3])).toBeFalsy(); + expect(isNextToEachOther([0, 0, 5, 2], [5, 1, 10, 3])).toBeFalsy(); }); it('should return false when two boxes are apart from each other', () => { - expect(isSideBySideOnLine([0, 0, 5, 2], [7, 0, 10, 2])).toBeFalsy(); + expect(isNextToEachOther([0, 0, 5, 2], [7, 0, 10, 2])).toBeFalsy(); }); }); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts index 2a3de4d31..a918f64f0 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts @@ -1,3 +1,4 @@ +import { intersects } from 'components/DocumentPreview/utils/box'; import { Bbox, TextSpan } from '../../types'; import { spanIntersection, spanLen } from './textSpanUtils'; @@ -13,12 +14,10 @@ export const BOTTOM = 3; * but for type `Bbox`, which doesn't have page 
property * @param boxA one bbox * @param boxB another bbox - * @returns true iff boxA and boxB are overwrapped + * @returns true iff boxA and boxB are overlapped */ -export function bboxIntersects(boxA: Bbox, boxB: Bbox) { - const [leftA, topA, rightA, bottomA] = boxA; - const [leftB, topB, rightB, bottomB] = boxB; - return !(leftB >= rightA || rightB <= leftA || topB >= bottomA || bottomB <= topA); +export function bboxIntersects(boxA: Bbox, boxB: Bbox): boolean { + return intersects(boxA, boxB); } /** @@ -27,7 +26,7 @@ export function bboxIntersects(boxA: Bbox, boxB: Bbox) { * @param origLength length of the text * @returns bbox for the text */ -export function bboxGetSpanByRatio(bbox: Bbox, origLength: number, span: TextSpan) { +export function bboxGetSpanByRatio(bbox: Bbox, origLength: number, span: TextSpan): Bbox { const theSpan = spanIntersection([0, origLength], span); if (origLength === 0 || spanLen(theSpan) <= 0) { return [bbox[0], bbox[1], bbox[0], bbox[3]] as Bbox; @@ -39,16 +38,14 @@ export function bboxGetSpanByRatio(bbox: Bbox, origLength: number, span: TextSpa const resultLeft = left + (width / origLength) * spanStart; const resultRight = left + (width / origLength) * spanEnd; - return [resultLeft, top, resultRight, bottom] as Bbox; + return [resultLeft, top, resultRight, bottom]; } /** - * Check whether two bboxes seems to be side-by-side on a same line. - * @param boxA - * @param boxB - * @returns + * Check whether the two bboxes are next to each other in a row. + * This is used to get a text of a line from a list of small text cells. 
*/ -export function isSideBySideOnLine(boxA: Bbox, boxB: Bbox) { +export function isNextToEachOther(boxA: Bbox, boxB: Bbox): boolean { if (bboxIntersects(boxA, boxB)) { return false; } @@ -59,14 +56,14 @@ export function isSideBySideOnLine(boxA: Bbox, boxB: Bbox) { const heightB = bottomB - topB; // compare height ratio - const OVERWRAP_RATIO = 0.8; - if (!(heightA * OVERWRAP_RATIO < heightB || heightB * OVERWRAP_RATIO < heightA)) { + const OVERLAP_RATIO = 0.8; + if (!(heightA * OVERLAP_RATIO < heightB || heightB * OVERLAP_RATIO < heightA)) { return false; } const avgHeight = (heightA + heightB) / 2; - const overWrapHeight = Math.max(0, Math.min(bottomA, bottomB) - Math.max(topA, topB)); - if (overWrapHeight < avgHeight * OVERWRAP_RATIO) { + const overlapHeight = Math.max(0, Math.min(bottomA, bottomB) - Math.max(topA, topB)); + if (overlapHeight < avgHeight * OVERLAP_RATIO) { return false; } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/documentUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/documentUtils.ts index 655093698..576fb6c49 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/documentUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/documentUtils.ts @@ -5,12 +5,21 @@ import { processDoc, ProcessedDoc } from 'utils/document'; import { Location } from 'utils/document/processDoc'; import { DocumentFields, TextSpan } from '../../types'; +/** + * Get value of the specified field from a search result document + * + * @param document search result document + * @param field field name + * @param index field index. 0 by default + * @param span (optional) span on the field value to return. 
Returns entire the field value by default + * @returns text + */ export function getDocFieldValue( document: DocumentFields, field: string, index?: number, span?: Location | TextSpan -) { +): string | undefined { let fieldText: string | undefined; const documentFieldArray = document[field]; @@ -35,11 +44,13 @@ export type ExtractedDocumentInfo = { textMappings?: TextMappings; }; -export async function extractDocumentInfo(document: QueryResult) { +/** + * Extract bboxes and text_mappings from a search result document + */ +export async function extractDocumentInfo(document: QueryResult): Promise { const docHtml = document.html; const textMappings = getTextMappings(document) ?? undefined; - // HtmlView.tsx const processedDoc = await processDoc( { ...document, docHtml }, { sections: true, bbox: true, bboxInnerText: true } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts index a511faa30..be6fb569f 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts @@ -1,3 +1,9 @@ +/** + * A filter to drop any non-null values from a list. + * Use with `Array.filter` method to get a list of non-null type. 
+ * + * `const list: number[] = [1, null, 2].filter(nonEmpty); // [1,2]` + */ export function nonEmpty(value: T | null | undefined): value is T { return value !== null && value !== undefined; } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts index b0ae85939..58b388e01 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts @@ -3,28 +3,52 @@ import { TextSpan } from '../../types'; export const START = 0; export const END = 1; -export function spanGetText(text: T, span: TextSpan) { +/** + * Get text for a given span + */ +export function spanGetText( + text: T, + span: TextSpan +): string | T { if (!text) return text; if (spanLen(span) === 0) return ''; return text.substring(span[START], span[END]); } -export function spanLen(span: TextSpan) { +/** + * Get span length + */ +export function spanLen(span: TextSpan): number { return Math.max(0, span[END] - span[START]); } +/** + * Check whether two spans have an intersection or not + */ export function spanIntersects([beginA, endA]: TextSpan, [beginB, endB]: TextSpan): boolean { + // TODO: integrate with spansIntersect in documentUtils.ts return beginA < endB && endA > beginB; } -export function spanIncludesIndex([begin, end]: TextSpan, index: number) { +/** + * Check whether a span includes a given character index or not + */ +export function spanIncludesIndex([begin, end]: TextSpan, index: number): boolean { return begin <= index && index < end; } -export function spanContains(span: TextSpan, other: TextSpan) { +/** + * Check whether a span contains another span + * (i.e.
for all index in `other` span, the index is in `span` span) + */ +export function spanContains(span: TextSpan, other: TextSpan): boolean { return span[START] <= other[START] && other[END] <= span[END]; } +/** + * Get the largest span that is contained by both of given spans + * @returns intersection of two spans when the two spans intersects. Zero-length span otherwise. + */ export function spanIntersection(a: TextSpan, b: TextSpan): TextSpan { if (spanContains(a, b)) return b; if (spanContains(b, a)) return a; @@ -33,7 +57,10 @@ export function spanIntersection(a: TextSpan, b: TextSpan): TextSpan { return [start, start <= end ? end : start]; } -export function spanUnion(a: TextSpan, b: TextSpan): TextSpan { +/** + * Get the smallest span that contains both of given spans + */ +export function spanMerge(a: TextSpan, b: TextSpan): TextSpan { if (spanContains(a, b) || spanLen(b) === 0) return a; if (spanContains(b, a) || spanLen(a) === 0) return b; const start = Math.min(a[START], b[START]); @@ -41,18 +68,38 @@ export function spanUnion(a: TextSpan, b: TextSpan): TextSpan { return [start, start <= end ? 
end : start]; } +/** + * Offset spans by given offset + */ export function spanOffset([start, end]: TextSpan, offset: number): TextSpan { return [start + offset, end + offset]; } -export function spanFromSubSpan(base: TextSpan, subSpan: TextSpan) { +/** + * Get a span from a `subSpan` on a given `base` span + * + * For example, `spanFromSubSpan([10, 20], [1, 2]) // [11, 12]` + */ +export function spanFromSubSpan(base: TextSpan, subSpan: TextSpan): TextSpan { return spanIntersection(base, spanOffset(subSpan, base[START])); } -export function spanGetSubSpan(base: TextSpan, span: TextSpan) { +/** + * Get a span within a given `base` span for a `span` + * + * For example, `spanGetSubSpan([10, 20], [11, 12]) // [1, 2]` + */ +export function spanGetSubSpan(base: TextSpan, span: TextSpan): TextSpan { return spanOffset(spanIntersection(base, span), -base[START]); } -export function spanCompare([startA, endA]: TextSpan, [startB, endB]: TextSpan) { +/** + * Compare method for spans + * + * @param spanA a span to compare + * @param spanB another span to compare + * @returns a positive number when spanA is after spanB, a negative number when spanA is before spanB, zero when spanA equals to spanB + */ +export function spanCompare([startA, endA]: TextSpan, [startB, endB]: TextSpan): number { return startA === startB ? 
endA - endB : startA - startB; } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts index 331277f79..73dd4ffe8 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts @@ -1,16 +1,16 @@ -import { isSideBySideOnLine } from '../common/bboxUtils'; +import { isNextToEachOther } from '../common/bboxUtils'; import { TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; export class CellProvider { private readonly skippedCells: TextLayoutCellBase[] = []; - private cells: TextLayoutCellBase[]; // make sure to handle this as immutable array + private cells: readonly TextLayoutCellBase[]; private cursor: number = 0; constructor(cells: TextLayoutCellBase[]) { - this.cells = [...cells]; + this.cells = Object.freeze([...cells]); } - hasNext() { + hasNext(): boolean { while (this.cursor < this.cells.length) { const cell = this.cells[this.cursor]; if (cell.text.trim().length !== 0) { @@ -22,44 +22,55 @@ export class CellProvider { } /** get cells on a line */ - private getNextCells = (() => { - let lastCells: TextLayoutCellBase[] | null = null; - let lastCursor: number | null = null; - let lastResult: TextLayoutCellBase[] | null = null; + private getNextCells(): TextLayoutCellBase[] { + const { + cells: lastCells, + cursor: lastCursor, + result: lastResult + } = this.getNextCellsCache || {}; - return () => { - if (lastResult && lastCells === this.cells && lastCursor === this.cursor) { - return lastResult; - } + if (lastResult && lastCells === this.cells && lastCursor === this.cursor) { + return lastResult; + } - 
const result: TextLayoutCellBase[] = []; - let lastCell: TextLayoutCell | null = null; - for (let i = this.cursor; i < this.cells.length; i += 1) { - const currentBox = this.cells[i]; - // maybe we need to break this loop by big box change - const { cell: baseCurrentCell } = currentBox.getNormalized(); - if (lastCell && !isSideBySideOnLine(lastCell.bbox, baseCurrentCell.bbox)) { - break; - } - result.push(currentBox); - lastCell = baseCurrentCell; + const result: TextLayoutCellBase[] = []; + let lastCell: TextLayoutCell | null = null; + for (let i = this.cursor; i < this.cells.length; i += 1) { + const currentBox = this.cells[i]; + // maybe we need to break this loop by big box change + const { cell: baseCurrentCell } = currentBox.getNormalized(); + if (lastCell && !isNextToEachOther(lastCell.bbox, baseCurrentCell.bbox)) { + break; } - lastCells = this.cells; - lastCursor = this.cursor; - lastResult = result; + result.push(currentBox); + lastCell = baseCurrentCell; + } - return result; + this.getNextCellsCache = { + cells: this.cells, + cursor: this.cursor, + result }; - })(); + return result; + } + + private getNextCellsCache: { + cells: readonly TextLayoutCellBase[]; + cursor: number; + result: TextLayoutCellBase[]; + } | null = null; /** get text from cells on a line */ - getNextText() { + getNextText(): { texts: string[]; nextCellIndex: number } { const nextCells = this.getNextCells(); const texts = nextCells.map(cell => cell.text); return { texts, nextCellIndex: this.cursor }; } - /** consume first n chars */ + /** + * consume (mark as used) first n chars from the cursor + * @return text layout cells on the consumed text + */ consume(length: number): TextLayoutCellBase[] { const result: TextLayoutCellBase[] = []; @@ -77,7 +88,7 @@ export class CellProvider { const remaining = current.getPartial([lengthToConsume, bboxTextLength]); const newCells = [...this.cells]; newCells[this.cursor] = remaining; - this.cells = newCells; + this.cells = 
Object.freeze(newCells); break; } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts index 9c1b12384..22dadc34a 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts @@ -10,6 +10,10 @@ function debug(...args: any) { debugOut?.apply(null, args); } +/** + * TextProvider with normalization + * @see TextProvider + */ export class MappingSourceTextProvider { private readonly cell: TextLayoutCell; private readonly normalizer: TextNormalizer; @@ -21,6 +25,9 @@ export class MappingSourceTextProvider { this.provider = new TextProvider(this.normalizer.normalizedText); } + /** + * Find the best span where the give text matches to the rest of the text + */ getMatch(text: string) { const normalizedText = this.normalizer.normalize(text); debug('getMatch "%s", normalized "%s"', text, normalizedText); @@ -49,13 +56,19 @@ export class MappingSourceTextProvider { return r; } + /** + * Mark the given `span` as used + */ consume(span: TextSpan) { const normalizedSpan = this.normalizer.toNormalized(span); this.provider.consume(normalizedSpan); debug('text span consumed %o', span); } - isBlank(text: string) { + /** + * Check whether a given text is blank or not + */ + isBlank(text: string): boolean { return this.normalizer.isBlank(text); } } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts index 72990c7e2..04bad0ac4 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts @@ -3,6 +3,10 @@ import { TextNormalizer } from '../common/TextNormalizer'; import { CellProvider } from './CellProvider'; import { END } from '../common/textSpanUtils'; +/** + * Cell provider with normalization + * @see CellProvider + */ export class MappingTargetBoxProvider { private readonly cellProvider: CellProvider; private current: { @@ -15,7 +19,8 @@ export class MappingTargetBoxProvider { this.cellProvider = new CellProvider(cells); } - hasNext() { + /** check whether this provider has another item to visit or not */ + hasNext(): boolean { while (this.cellProvider.hasNext()) { const { texts, nextCellIndex } = this.cellProvider.getNextText(); const text = texts.join(''); @@ -36,20 +41,26 @@ export class MappingTargetBoxProvider { return false; } - getNextInfo() { + /** get the next value */ + getNextInfo(): { text: string; index: number } { return { text: this.current!.normalizer.normalizedText, index: this.current!.nextCellIndex }; } - consume(length: number) { + /** + * consume (mark as used) first n chars from the cursor + * @return text layout cells on the consumed text + */ + consume(length: number): TextLayoutCellBase[] { const rawSpan = this.current!.normalizer.toRaw([0, length]); const rawLength = this.current!.leadingSpaces + rawSpan[END]; this.current = null; return this.cellProvider.consume(rawLength); } + /** mark the current cell skipped (when no match found in source) */ skip() { this.current = null; this.cellProvider.skip(); diff --git 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts index 15580c849..95c6cded0 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts @@ -17,6 +17,9 @@ function debug(...args: any) { debugOut?.apply(null, args); } +/** + * Text box mapping + */ export class TextBoxMappingImpl implements TextBoxMapping { private readonly mappingEntryMap: Dictionary; @@ -31,12 +34,17 @@ export class TextBoxMappingImpl implements TextBoxMapping { debug(this); } - getEntries(sourceCell: TextLayoutCell, spanInSourceCell: TextSpan) { + /** get text mapping entries for a given span `spanInSourceCell` on a given `sourceCell` */ + private getEntries( + sourceCell: TextLayoutCell, + spanOnSourceCell: TextSpan + ): TextBoxMappingEntry[] { return (this.mappingEntryMap[sourceCell.id] || []).filter(m => - spanIntersects(m.text.span, spanInSourceCell) + spanIntersects(m.text.span, spanOnSourceCell) ); } + /** @inheritdoc */ apply(source: TextLayoutCellBase, aSpan?: TextSpan): TextBoxMappingResult { const span: TextSpan = aSpan || [0, source.text.length]; @@ -51,7 +59,7 @@ export class TextBoxMappingImpl implements TextBoxMapping { return { cell: null, sourceSpan: m.text.span }; } else { let boxSpan; - if (hasSameText(m.text.cell, m.text.span, source, spanInSourceCell)) { + if (equalsSpanText(m.text.cell, m.text.span, source, spanInSourceCell)) { boxSpan = spanGetSubSpan(m.text.span, spanInSourceCell); } else { const n1 = new TextNormalizer(m.text.cell.text); @@ -75,7 +83,10 @@ export class TextBoxMappingImpl implements TextBoxMapping { } } 
-function hasSameText( +/** + * Check if text on spans on cells are the same or not + */ +function equalsSpanText( textCell: TextLayoutCellBase, textSpan: TextSpan, sourceCell: TextLayoutCellBase, diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts index 08f8ff4e9..2a5825240 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts @@ -12,12 +12,21 @@ import { findLargestIndex } from '../common/findLargestIndex'; const MAX_HISTORY = 3; export type TextMatch = { + /** matched text span */ span: TextSpan; + /** text before the matched text. i.e. text that will be skipped by using this match */ skipText: string; + /** distance from the nearest cursors */ minHistoryDistance: number; + /** text after the matched text */ textAfterEnd: string; }; +/** + * Manage text in a source (larger) cell. + * - Find text (in a target cell) from the _unused_ text + * - Once a span is mapped to a target (smaller) cell, mark the the correspondent span _used_ + */ export class TextProvider { private readonly fieldText: string; private remainingSpans: TextSpan[]; @@ -28,6 +37,9 @@ export class TextProvider { this.remainingSpans = [[0, fieldText.length]]; } + /** + * Get how the given `text` matches to the currently available text + */ getMatches(text: string, minLength = 1, maxLength = text.length): TextMatch[] { const match = findLargestIndex(minLength, maxLength + 1, index => { const lengthToMatch = index; @@ -65,6 +77,9 @@ export class TextProvider { return match ? 
match.value : []; } + /** + * Mark the `span` as used + */ consume(span: TextSpan) { const remaining: TextSpan[] = []; this.remainingSpans.forEach(remainingSpan => { diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts index dc618c6b7..f664757b3 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts @@ -2,18 +2,24 @@ import minBy from 'lodash/minBy'; import { TextSpan } from '../../types'; import { bboxIntersects } from '../common/bboxUtils'; import { nonEmpty } from '../common/nonEmpty'; -import { spanLen, spanUnion } from '../common/textSpanUtils'; +import { spanLen, spanMerge } from '../common/textSpanUtils'; import { TextLayout, TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; import { MappingSourceTextProvider } from './MappingSourceTextProvider'; import { MappingTargetBoxProvider } from './MappingTargetCellProvider'; import { TextBoxMappingImpl } from './TextBoxMapping'; -import { TextBoxMappingEntry } from './types'; +import { TextBoxMapping, TextBoxMappingEntry } from './types'; const debugOut = require('debug')?.('pdf:mapping:getTextBoxMapping'); function debug(...args: any) { debugOut?.apply(null, args); } +/** + * Find the best source (larger text layout cell) where text `textToMatch` is in + * @param sources source (larger) text layout cells overlapping the current target cell + * @param textToMatch text from target cell(s) + * @returns the best source where the `textToMatch` is matched and the text location in the source + */ function findMatchInSources( sources:
{ cell: TextLayoutCell; @@ -49,10 +55,16 @@ function findMatchInSources( return bestMatch; } +/** + * Calculate text box mapping from `source` text layout to `target` text layout + * @param source text layout with larger cells + * @param target text layout with smaller cells + * @returns a text box mapping instance + */ export function getTextBoxMappings< SourceCell extends TextLayoutCell, TargetCell extends TextLayoutCell ->(source: TextLayout, target: TextLayout) { +>(source: TextLayout, target: TextLayout): TextBoxMapping { const sourceProviders = source.cells.map(cell => new MappingSourceTextProvider(cell)); const targetProvider = new MappingTargetBoxProvider(target.cells); @@ -100,7 +112,7 @@ export function getTextBoxMappings< if (matchToTargetCell) { // consume source text which is just mapped to the target matchedSourceProvider.consume(matchToTargetCell.span); - consumedSourceSpan = spanUnion(consumedSourceSpan, matchToTargetCell.span); + consumedSourceSpan = spanMerge(consumedSourceSpan, matchToTargetCell.span); mappingEntries.push({ text: { cell: matchInSource.cell, span: matchToTargetCell.span }, box: { cell: trimmedCell } @@ -119,6 +131,10 @@ export function getTextBoxMappings< return new TextBoxMappingImpl(mappingEntries); } +/** + * Get a text layout cell that represents a trimmed text of a given `cell` + * @returns a new cell for the trimmed text. 
Zero-length cell when the text of the given `cell` is blank + */ function trimCell(cell: TextLayoutCellBase) { const text = cell.text; const nLeadingSpaces = text.match(/^\s*/)![0].length; @@ -129,5 +145,5 @@ function trimCell(cell: TextLayoutCellBase) { if (text.length > nLeadingSpaces + nTrailingSpaces) { return cell.getPartial([nLeadingSpaces, text.length - nTrailingSpaces]); } - return cell.getPartial([0, 0]); + return cell.getPartial([0, 0]); // return zero-length cell } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts index 132694a7d..2c7e0d666 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts @@ -6,10 +6,22 @@ export type TextBoxMappingResult = { sourceSpan: TextSpan; }[]; +/** + * Interface for text box mapping + */ export interface TextBoxMapping { + /** + * Get spans on target (smaller) cells for a given span on a source (larger) cell + * @param source source text layout cell + * @param span span on the source cell + */ apply(source: TextLayoutCellBase, span?: TextSpan): TextBoxMappingResult; } +/** + * Interface for text box mapping entries. + * Internal. 
Used only in text box mapping implementation + */ export interface TextBoxMappingEntry { text: { cell: TextLayoutCell; span: TextSpan }; box: { cell: TextLayoutCellBase } | null; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts index 77e043328..7e8f7f9e5 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts @@ -35,12 +35,17 @@ export class BaseTextLayoutCell> this.text = text; } + /** @inheritdoc */ getPartial(span: TextSpan): TextLayoutCellBase { return new PartialTextLayoutCell(this, span); } + + /** @inheritdoc */ getNormalized(): { cell: TextLayoutCell; span?: TextSpan } { return { cell: this }; } + + /** @inheritdoc */ getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { if (options?.useRatio) { return bboxGetSpanByRatio(this.bbox, this.text.length, span); @@ -61,14 +66,18 @@ export class PartialTextLayoutCell implements TextLayoutCellBase { this.span = spanIntersection([0, base.text.length], span); } + /* @inheritdoc */ get text() { return spanGetText(this.base.text, this.span); } + /** @inheritdoc */ getPartial(span: TextSpan): TextLayoutCellBase { const newSpan = spanIntersection(this.span, spanOffset(span, this.span[START])); return new PartialTextLayoutCell(this.base, newSpan); } + + /** @inheritdoc */ getNormalized() { return { cell: this.base, span: this.span }; } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts index 851ab6991..359120466 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts @@ -4,6 +4,9 @@ import { Bbox, TextSpan } from '../../types'; import { BaseTextLayoutCell } from './BaseTextLayout'; import { HtmlBboxInfo, TextLayout } from './types'; +/** + * Text layout based on bboxes in HTML field + */ export class HtmlBboxTextLayout implements TextLayout { private readonly bboxInfo: HtmlBboxInfo; readonly cells: HtmlBboxTextLayoutCell[]; @@ -18,17 +21,24 @@ export class HtmlBboxTextLayout implements TextLayout { }) ?? []; } + /** @inheritdoc */ cellAt(id: number) { return this.cells[id]; } + /** + * Install style to DOM if not yet. The style will be used to calculate bbox in `getBboxForTextSpan` + */ installStyle() { if (this.bboxInfo.styles) { - // TODO: install style to DOM if not yet. For getBboxForTextSpan in cell + // TODO: implement this } } } +/** + * Text layout cell based on bboxes in HTML field + */ class HtmlBboxTextLayoutCell extends BaseTextLayoutCell { private readonly processedBbox: ProcessedBbox; @@ -47,9 +57,10 @@ class HtmlBboxTextLayoutCell extends BaseTextLayoutCell { this.processedBbox = processedBbox; // keep this for later improvement } + /** @inheritdoc */ getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { if (this.processedBbox != null) { - // TODO: calculate bbox for text span using text on browser + // TODO: implement this. 
calculate bbox for text span using text on browser } return super.getBboxForTextSpan(span, options); } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts index 9a5c09e71..401dd7578 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts @@ -5,10 +5,13 @@ import { BaseTextLayoutCell } from './BaseTextLayout'; import { getAdjustedCellByOffsetByDom } from './dom'; import { HtmlBboxInfo, PdfTextContentInfo, TextLayout } from './types'; +/** + * Text layout based on PDF text objects + */ export class PdfTextContentTextLayout implements TextLayout { private readonly textContentInfo: PdfTextContentInfo; readonly cells: PdfTextContentTextLayoutCell[]; - private spans: HTMLElement[] | undefined; + private divs: HTMLElement[] | undefined; constructor(textContentInfo: PdfTextContentInfo, pageNum: number, htmlBboxInfo?: HtmlBboxInfo) { this.textContentInfo = textContentInfo; @@ -29,25 +32,31 @@ export class PdfTextContentTextLayout implements TextLayout { - // private readonly textItem: TextContentItem; - constructor( parent: PdfTextContentTextLayout, index: number, @@ -58,12 +67,11 @@ class PdfTextContentTextLayoutCell extends BaseTextLayoutCell { readonly cells: TextMappingsTextLayoutCell[]; @@ -24,10 +27,16 @@ export class TextMappingsTextLayout implements TextLayout { readonly cellField: CellField; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts index 498879b72..23dfe7d4a 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts @@ -9,6 +9,14 @@ function debug(...args: any) { debugOut?.apply(null, args); } +/** + * Get a bbox for a span on a text layout cell using DOM element rendered on browser + * @param cell text layout cell + * @param textSpan span on the text layout cell + * @param spanElement an DOM element where the text layout cell is rendered + * @param scale the current scale factor + * @returns bbox for the span on the cell + */ export function getAdjustedCellByOffsetByDom( cell: TextLayoutCell, textSpan: TextSpan, diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts index 7ed59012d..061910b6a 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts @@ -6,7 +6,7 @@ import { Bbox, DocumentFields, TextSpan } from '../../types'; /** * Text layout information */ -export interface TextLayout { +export interface TextLayout { /** cells, paris of bbox and text, of this text layout */ readonly cells: CellType[]; /** get cell by ID */ diff --git a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts index 99d01bdf7..986bd8b5f 100644 --- 
a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts @@ -7,7 +7,7 @@ import { ProcessedBbox } from '../../../utils/document/processDoc'; * @param boxB second bbox * @returns bool */ -function intersects(boxA: number[], boxB: number[]): boolean { +export function intersects(boxA: number[], boxB: number[]): boolean { const [leftA, topA, rightA, bottomA, pageA] = boxA; const [leftB, topB, rightB, bottomB, pageB] = boxB; return !(leftB > rightA || rightB < leftA || topB > bottomA || bottomB < topA || pageA !== pageB); From 3e06fdae114179bd74402b3f6a7c2d7f151cda17 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 17 Nov 2021 22:45:17 +0900 Subject: [PATCH 21/51] fix: remove unnecessary commets --- .../utils/textLayout/PdfTextContentTextLayout.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts index 401dd7578..dce5d916d 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts @@ -91,9 +91,7 @@ class PdfTextContentTextLayoutCell extends BaseTextLayoutCell Date: Wed, 17 Nov 2021 22:46:11 +0900 Subject: [PATCH 22/51] fix: highlighting on header and footer --- .../utils/textBoxMapping/getTextBoxMapping.ts | 7 +++- .../textLayout/PdfTextContentTextLayout.ts | 32 ++++++++++--------- .../utils/textLayout/types.ts | 3 ++ 3 files changed, 26 insertions(+), 16 deletions(-) diff --git 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts index f664757b3..d692bf114 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts @@ -69,7 +69,7 @@ export function getTextBoxMappings< const targetProvider = new MappingTargetBoxProvider(target.cells); const targetIndexToSources = target.cells.map(targetCell => { - return source.cells + const cells = source.cells .map((sourceCell, index) => { if (!bboxIntersects(sourceCell.bbox, targetCell.bbox)) { return null; @@ -77,6 +77,11 @@ export function getTextBoxMappings< return { cell: sourceCell, provider: sourceProviders[index] }; }) .filter(nonEmpty); + + if (cells.some(({ cell }) => cell.isInHtmlBbox)) { + return cells.filter(({ cell }) => cell.isInHtmlBbox); + } + return cells; }); const mappingEntries: TextBoxMappingEntry[] = []; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts index dce5d916d..b6a97afec 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts @@ -18,18 +18,16 @@ export class PdfTextContentTextLayout implements TextLayout { - return new 
PdfTextContentTextLayoutCell(this, index, item, pageNum); - }) - .filter(cell => { - if (htmlBboxInfo?.bboxes?.length) { - return htmlBboxInfo.bboxes.some(bbox => { - return bboxIntersects(cell.bbox, [bbox.left, bbox.top, bbox.right, bbox.bottom]); - }); - } - return true; - }); + this.cells = textContentItems.map((item, index) => { + const cellBbox = PdfTextContentTextLayoutCell.getBbox(item, this.viewport); + let isInHtmlBbox = false; + if (htmlBboxInfo?.bboxes?.length) { + isInHtmlBbox = htmlBboxInfo.bboxes.some(bbox => { + return bboxIntersects(cellBbox, [bbox.left, bbox.top, bbox.right, bbox.bottom]); + }); + } + return new PdfTextContentTextLayoutCell(this, index, item, pageNum, cellBbox, isInHtmlBbox); + }); } /** get viewport of the current page */ @@ -57,14 +55,18 @@ export class PdfTextContentTextLayout implements TextLayout { + /** @inheritdoc */ + readonly isInHtmlBbox?: boolean; + constructor( parent: PdfTextContentTextLayout, index: number, textItem: TextContentItem, - pageNum: number + pageNum: number, + bbox: Bbox, + isInHtmlBbox?: boolean ) { const id = index; - const bbox = PdfTextContentTextLayoutCell.getBbox(textItem, parent.viewport); const text = textItem.str; super({ parent, id, pageNum, bbox, text }); } @@ -85,7 +87,7 @@ class PdfTextContentTextLayoutCell extends BaseTextLayoutCell extends TextLayoutCellBase { * @returns null when it's not available */ getBboxForTextSpan(span: TextSpan, options?: { useRatio?: boolean }): Bbox | null; + + /** a special property for PDF text content item cell. 
True when this cell overlaps HTML cell */ + readonly isInHtmlBbox?: boolean; } /** From 7a6ef58d82e6a58c5691b9a611a4185f497dfb13 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 18 Nov 2021 12:34:17 +0900 Subject: [PATCH 23/51] fix: fix boxUtil test failure --- .../PdfViewerHighlight/utils/common/bboxUtils.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts index a918f64f0..8884a2d97 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts @@ -1,4 +1,3 @@ -import { intersects } from 'components/DocumentPreview/utils/box'; import { Bbox, TextSpan } from '../../types'; import { spanIntersection, spanLen } from './textSpanUtils'; @@ -10,14 +9,18 @@ export const BOTTOM = 3; /** * Check whether two bbox intersect * - * Same to `intersects` in DocumentPreview/utils/box.ts, - * but for type `Bbox`, which doesn't have page property + * Similar to `intersects` in DocumentPreview/utils/box.ts, differences are: + * - this is for type `Bbox`, which doesn't have page property + * - the `right` and `bottom` values are exclusive. 
So + * `bboxIntersects([0,1,0,1], [1,2,0,1])` returns `false` * @param boxA one bbox * @param boxB another bbox * @returns true iff boxA and boxB are overlapped */ export function bboxIntersects(boxA: Bbox, boxB: Bbox): boolean { - return intersects(boxA, boxB); + const [leftA, topA, rightA, bottomA] = boxA; + const [leftB, topB, rightB, bottomB] = boxB; + return !(leftB >= rightA || rightB <= leftA || topB >= bottomA || bottomB <= topA); } /** From 3d02caf212b1a22c2dc108125d6ca026e0edc407 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 18 Nov 2021 15:21:50 +0900 Subject: [PATCH 24/51] refactor: use one bbox intersection logic --- .../PdfViewerHighlight/utils/common/bboxUtils.ts | 10 ++-------- .../src/components/DocumentPreview/utils/box.ts | 8 +++++++- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts index 8884a2d97..f7e911f8e 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts @@ -1,3 +1,4 @@ +import { intersects } from 'components/DocumentPreview/utils/box'; import { Bbox, TextSpan } from '../../types'; import { spanIntersection, spanLen } from './textSpanUtils'; @@ -8,19 +9,12 @@ export const BOTTOM = 3; /** * Check whether two bbox intersect - * - * Similar to `intersects` in DocumentPreview/utils/box.ts, differences are: - * - this is for type `Bbox`, which doesn't have page property - * - the `right` and `bottom` values are exclusive. 
So - * `bboxIntersects([0,1,0,1], [1,2,0,1])` returns `false` * @param boxA one bbox * @param boxB another bbox * @returns true iff boxA and boxB are overlapped */ export function bboxIntersects(boxA: Bbox, boxB: Bbox): boolean { - const [leftA, topA, rightA, bottomA] = boxA; - const [leftB, topB, rightB, bottomB] = boxB; - return !(leftB >= rightA || rightB <= leftA || topB >= bottomA || bottomB <= topA); + return intersects(boxA, boxB); } /** diff --git a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts index 986bd8b5f..e5dd2643d 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts @@ -10,7 +10,13 @@ import { ProcessedBbox } from '../../../utils/document/processDoc'; export function intersects(boxA: number[], boxB: number[]): boolean { const [leftA, topA, rightA, bottomA, pageA] = boxA; const [leftB, topB, rightB, bottomB, pageB] = boxB; - return !(leftB > rightA || rightB < leftA || topB > bottomA || bottomB < topA || pageA !== pageB); + return !( + leftB >= rightA || + rightB <= leftA || + topB >= bottomA || + bottomB <= topA || + pageA !== pageB + ); } /** From 3b810afc7e4a12d0373b1c8193a25bd65e4b7c69 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 18 Nov 2021 21:08:09 +0900 Subject: [PATCH 25/51] feat: add Japanese PDF sample --- .../__fixtures__/DiscoComponent-ja.pdf.ts | 2 + .../DiscoComponents-ja_document.json | 55 +++++++++++++++++++ .../PdfViewerWithHighlight.stories.tsx | 36 ++++++++++-- 3 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponent-ja.pdf.ts create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponents-ja_document.json diff --git 
a/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponent-ja.pdf.ts b/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponent-ja.pdf.ts new file mode 100644 index 000000000..9d62e2f10 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponent-ja.pdf.ts @@ -0,0 +1,2 @@ +export const document = + ''; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponents-ja_document.json b/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponents-ja_document.json new file mode 100644 index 000000000..4789d596d --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponents-ja_document.json @@ -0,0 +1,55 @@ +{ + "document_id": "feab8705259090b89fbcbb15942cb10d", + "result_metadata": { + "collection_id": "b6cdf1cd-902c-8ea3-0000-017d32224d8f" + }, + "enriched_text": [ + { + "entities": [ + { + "model_name": "natural_language_understanding", + "mentions": [ + { + "confidence": 0.9950965, + "location": { + "end": 2, + "begin": 0 + }, + "text": "最初" + } + ], + "text": "最初", + "type": "Ordinal" + } + ] + } + ], + "metadata": { + "parent_document_id": "feab8705259090b89fbcbb15942cb10d", + "customer_id": "IBMid-270001M55T" + }, + "extracted_metadata": { + "sha1": "4FF2B41ED7A77975ABB21D9E4025DF31335E6451", + "numPages": "1", + "filename": "DiscoComponents-ja-updated.pdf", + "file_type": "pdf", + "text_mappings": 
"{\"text_mappings\":[{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,87.82411193847656,400.4930725097656,194.260009765625]},\"field\":{\"name\":\"title\",\"index\":0,\"span\":[0,20]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,411.83612060546875,262.9510192871094,425.62003993988037]},\"field\":{\"name\":\"subtitle\",\"index\":0,\"span\":[0,19]}},{\"page\":{\"page_number\":1,\"bbox\":[268.46466064453125,416.1183776855469,325.5726318359375,425.375319480896]},\"field\":{\"name\":\"subtitle\",\"index\":1,\"span\":[0,3]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,644.3582763671875,313.07745361328125,653.6152181625366]},\"field\":{\"name\":\"subtitle\",\"index\":2,\"span\":[0,15]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,456.12786865234375,95.6172866821289,463.06002855300903]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[0,4]}},{\"page\":{\"page_number\":1,\"bbox\":[100.0745620727539,452.9471435546875,257.0570983886719,463.06002855300903]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[4,27]}},{\"page\":{\"page_number\":1,\"bbox\":[261.5120849609375,452.9471435546875,408.1592712402344,463.0600233078003]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[27,49]}},{\"page\":{\"page_number\":1,\"bbox\":[412.5315856933594,456.12786865234375,464.3571472167969,463.06002855300903]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[49,54]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,452.9471435546875,534.0211791992188,596.2600049972534]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[54,234]}},{\"page\":{\"page_number\":1,\"bbox\":[54.519996643066406,679.4979858398438,535.1033325195312,723.2200269699097]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[234,353]}}],\"pages\":[{\"page_number\":0,\"height\":842.0,\"width\":595.0,\"origin\":\"TopLeft\"}]}", + "title": "Discovery Component README Japanese", + "publicationdate": "2021-11-18" + }, + "subtitle": ["Discovery Component", "の使用", 
"サンプルアプリケーションの実行"], + "html": "Discovery Component README Japanese

Discovery Components

Discovery Component

の使用

最初に

IBM Watson Discovery の

Improve and Customize

ページで

Document retrieval プロジェクトをカスタマイズする必要があります。たとえばファセットや検索 バーや検索結果を設定できます。その後 Discovery component を使ったアプリケ ーションを作成します。アプリケーションは指定したプロジェクトの設定をロードしま す。 必要なソフトウェア: git, nvm, yarn または npm

サンプルアプリケーションの実行

• サンプルアプリケーションはこのライブラリーが提供するコアコンポーネントのカタログです。実際のデ ータを使ってコンポーネントがどのように動くかを簡単に見ることができます。コードを変更して、カスタ マイズする方法を確認することもできます。

", + "text": [ + "最初に IBM Watson Discovery の Improve and Customize ページで Document retrieval プロジェクトをカスタマイズする必要があります。たとえばファセットや検索 バーや検索結果を設定できます。その後 Discovery component を使ったアプリケ ーションを作成します。アプリケーションは指定したプロジェクトの設定をロードしま す。 必要なソフトウェア: git, nvm, yarn または npm • サンプルアプリケーションはこのライブラリーが提供するコアコンポーネントのカタログです。実際のデ ータを使ってコンポーネントがどのように動くかを簡単に見ることができます。コードを変更して、カスタ マイズする方法を確認することもできます。" + ], + "title": "Discovery Components", + "document_passages": [ + { + "passage_text": "Discovery Components", + "start_offset": 0, + "end_offset": 20, + "field": "title" + } + ], + "table_results_references": [] +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx index 67bdd3ffa..fe2dde415 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx @@ -5,11 +5,18 @@ import { action } from '@storybook/addon-actions'; import PdfViewerWithHighlight from './PdfViewerWithHighlight'; import { flatten } from 'lodash'; import { DocumentFieldHighlight } from './types'; +import './PdfViewerWithHighlight.stories.scss'; import { document as doc } from 'components/DocumentPreview/__fixtures__/Art Effects.pdf'; import document from 'components/DocumentPreview/__fixtures__/Art Effects Koya Creative Base TSA 2008.pdf.json'; -import './PdfViewerWithHighlight.stories.scss'; +import { document as docJa } from 'components/DocumentPreview/__fixtures__/DiscoComponent-ja.pdf'; +import documentJa from 'components/DocumentPreview/__fixtures__/DiscoComponents-ja_document.json'; + +import PDFJS from 'pdfjs-dist'; +import { getDocFieldValue } from './utils/common/documentUtils'; 
+(PDFJS as any).cMapUrl = './node_modules/pdfjs-dist/cmaps/'; +(PDFJS as any).cMapPacked = true; const pageKnob = { label: 'Page', @@ -49,9 +56,11 @@ const WithTextSelection: typeof PdfViewerWithHighlight = props => { const fields = Object.keys(document).filter(field => { return !field.match(/^(document_id|extracted_|enriched_)/) && document[field]?.length > 0; }); + return flatten( fields.map(field => { - return document[field] + const documentFields = Array.isArray(document[field]) ? document[field] : [document[field]]; + return documentFields .map((content: any, index: number) => { if (typeof content === 'string') { return { @@ -99,7 +108,7 @@ const WithTextSelection: typeof PdfViewerWithHighlight = props => { } const { begin, end } = textSelection; - const fieldText = document[selectedFieldName][selectedFieldIndex]; + const fieldText = getDocFieldValue(document, selectedFieldName, selectedFieldIndex); const highlight: DocumentFieldHighlight = { field: selectedFieldName, @@ -135,9 +144,9 @@ const WithTextSelection: typeof PdfViewerWithHighlight = props => { {/* eslint-disable-next-line jsx-a11y/no-noninteractive-element-interactions */}

{selectedField && - document[selectedFieldName][selectedFieldIndex] + getDocFieldValue(document, selectedFieldName, selectedFieldIndex)! .replace(/ /g, '\u00a0') // NBSP - .replaceAll('\n', '\\n')} + .replace(/\n/g, '\\n')}

@@ -179,4 +188,21 @@ storiesOf('DocumentPreview/components/PdfViewerWithHighlight', module) highlights={EMPTY} /> ); + }) + .add('with PDF in Japanese', () => { + const page = number(pageKnob.label, pageKnob.defaultValue, pageKnob.options); + const zoom = radios(zoomKnob.label, zoomKnob.options, zoomKnob.defaultValue); + const scale = parseFloat(zoom); + const setLoadingAction = action('setLoading'); + + return ( + + ); }); From 488b160d2fdb7573cc22fe8fa3df04daac50c4cf Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Fri, 19 Nov 2021 13:57:36 +0900 Subject: [PATCH 26/51] refactor: move cell trim method --- .../utils/textBoxMapping/getTextBoxMapping.ts | 27 ++------------ .../utils/textLayout/BaseTextLayout.ts | 27 ++++++++++++++ .../textLayout/PdfTextContentTextLayout.ts | 35 ++++++++++--------- .../utils/textLayout/types.ts | 2 ++ 4 files changed, 50 insertions(+), 41 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts index d692bf114..0f280ff1e 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts @@ -3,7 +3,7 @@ import { TextSpan } from '../../types'; import { bboxIntersects } from '../common/bboxUtils'; import { nonEmpty } from '../common/nonEmpty'; import { spanLen, spanMerge } from '../common/textSpanUtils'; -import { TextLayout, TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; +import { TextLayout, TextLayoutCell } from '../textLayout/types'; import { MappingSourceTextProvider } from './MappingSourceTextProvider'; import { MappingTargetBoxProvider } 
from './MappingTargetCellProvider'; import { TextBoxMappingImpl } from './TextBoxMapping'; @@ -30,11 +30,7 @@ function findMatchInSources( // find matches const matches = sources.map(source => { const match = source.provider.getMatch(textToMatch); - return { - cell: source.cell, - provider: source.provider, - match - }; + return { ...source, match }; }); // calc cost for each match @@ -110,7 +106,7 @@ export function getTextBoxMappings< let consumedSourceSpan: TextSpan = [0, 0]; matchedTargetCells.forEach(mTargetCell => { - const trimmedCell = trimCell(mTargetCell); + const trimmedCell = mTargetCell.trim(); if (trimmedCell.text.length > 0) { const matchToTargetCell = matchedSourceProvider.getMatch(trimmedCell.text); debug('>> target cell %o (%o) to source %o', mTargetCell, trimmedCell, matchToTargetCell); @@ -135,20 +131,3 @@ export function getTextBoxMappings< return new TextBoxMappingImpl(mappingEntries); } - -/** - * Get a text layout cell that represents a trimmed text of a given `cell` - * @returns a new cell for the trimmed text. 
Zero-length cell when the text of the given `cell` is blank - */ -function trimCell(cell: TextLayoutCellBase) { - const text = cell.text; - const nLeadingSpaces = text.match(/^\s*/)![0].length; - const nTrailingSpaces = text.match(/\s*$/)![0].length; - if (nLeadingSpaces === 0 && nTrailingSpaces === 0) { - return cell; - } - if (text.length > nLeadingSpaces + nTrailingSpaces) { - return cell.getPartial([nLeadingSpaces, text.length - nTrailingSpaces]); - } - return cell.getPartial([0, 0]); // return zero-length cell -} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts index 7e8f7f9e5..099934ac2 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts @@ -52,6 +52,11 @@ export class BaseTextLayoutCell> } return null; } + + /** @inheritdoc */ + trim(): TextLayoutCellBase { + return trimCell(this); + } } /** @@ -81,4 +86,26 @@ export class PartialTextLayoutCell implements TextLayoutCellBase { getNormalized() { return { cell: this.base, span: this.span }; } + + /** @inheritdoc */ + trim(): TextLayoutCellBase { + return trimCell(this); + } +} + +/** + * Get a text layout cell that represents a trimmed text of a given `cell` + * @returns a new cell for the trimmed text. 
Zero-length cell when the text of the given `cell` is blank + */ +function trimCell(cell: TextLayoutCellBase) { + const text = cell.text; + const nLeadingSpaces = text.match(/^\s*/)![0].length; + const nTrailingSpaces = text.match(/\s*$/)![0].length; + if (nLeadingSpaces === 0 && nTrailingSpaces === 0) { + return cell; + } + if (text.length > nLeadingSpaces + nTrailingSpaces) { + return cell.getPartial([nLeadingSpaces, text.length - nTrailingSpaces]); + } + return cell.getPartial([0, 0]); // return zero-length cell } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts index b6a97afec..0b5e53007 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts @@ -19,7 +19,7 @@ export class PdfTextContentTextLayout implements TextLayout { - const cellBbox = PdfTextContentTextLayoutCell.getBbox(item, this.viewport); + const cellBbox = getBbox(item, this.viewport); let isInHtmlBbox = false; if (htmlBboxInfo?.bboxes?.length) { isInHtmlBbox = htmlBboxInfo.bboxes.some(bbox => { @@ -64,11 +64,12 @@ class PdfTextContentTextLayoutCell extends BaseTextLayoutCell Date: Wed, 24 Nov 2021 13:20:31 +0900 Subject: [PATCH 27/51] fix: pdfjs typings version --- package.json | 1 + .../discovery-react-components/package.json | 3 +-- yarn.lock | 24 ++++--------------- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/package.json b/package.json index 3be7ea346..a53e97d36 100644 --- a/package.json +++ b/package.json @@ -43,6 +43,7 @@ "@types/lodash": "^4.14.141", "@types/mustache": "^0.8.32", 
"@types/node": "^12.7.3", + "@types/pdfjs-dist": "2.7.5", "@types/react": "^16.9.2", "@types/react-dom": "^16.9.0", "@types/react-resize-detector": "^4.2.0", diff --git a/packages/discovery-react-components/package.json b/packages/discovery-react-components/package.json index 49e64485a..5c8961b2f 100644 --- a/packages/discovery-react-components/package.json +++ b/packages/discovery-react-components/package.json @@ -18,7 +18,7 @@ "eslint": "yarn run g:eslint --quiet '{src,.storybook}/**/*.{js,jsx,ts,tsx}'", "lint": "yarn run circular && yarn run eslint", "start": "rollup -c -w", - "storybook": "start-storybook --ci --port=9002", + "storybook": "../../node_modules/.bin/start-storybook --ci --port=9002", "storybook:build": "build-storybook", "storybook:build:release": "cross-env STORYBOOK_BUILD_MODE=production build-storybook -o ../../docs/storybook", "analyze": "yarn run g:analyze 'dist/index.js'", @@ -43,7 +43,6 @@ "react-virtualized": "9.21.1" }, "devDependencies": { - "@types/pdfjs-dist": "^2.10.378", "cross-env": "^7.0.3", "css-loader": "^3.4.2", "madge": "^5.0.1", diff --git a/yarn.lock b/yarn.lock index 513c03d4e..b91ed1353 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2271,7 +2271,6 @@ __metadata: version: 0.0.0-use.local resolution: "@ibm-watson/discovery-react-components@workspace:packages/discovery-react-components" dependencies: - "@types/pdfjs-dist": ^2.10.378 classnames: ^2.2.6 cross-env: ^7.0.3 css-loader: ^3.4.2 @@ -4939,12 +4938,10 @@ __metadata: languageName: node linkType: hard -"@types/pdfjs-dist@npm:^2.10.378": - version: 2.10.378 - resolution: "@types/pdfjs-dist@npm:2.10.378" - dependencies: - pdfjs-dist: "*" - checksum: 36dd6010f7d23a995efdf11ea4ecb56f371f8bfb3e83a5c311666726e13238597ed1519701d0e2e6fb297270d01ad6aece9582b036fd4cb3aa301e61ea364978 +"@types/pdfjs-dist@npm:2.7.5": + version: 2.7.5 + resolution: "@types/pdfjs-dist@npm:2.7.5" + checksum: 
a81d499327520f46cf25c683a1f56cfcf07e12d9ad0ef4d560e320a7072f7dd7e7f2a2b26966655120cf7aa084c90f5c732ba65fc835a6271d4ab9858b9fc2b4 languageName: node linkType: hard @@ -19528,18 +19525,6 @@ __metadata: languageName: node linkType: hard -"pdfjs-dist@npm:*": - version: 2.11.338 - resolution: "pdfjs-dist@npm:2.11.338" - peerDependencies: - worker-loader: ^3.0.8 - peerDependenciesMeta: - worker-loader: - optional: true - checksum: 1b946a3eeb3312a79e12b4e0aa066bb2b98487b9ee329666edc840a194602595cf84de9a3f6dbb023b808699a6ebb0cd06e751314fc4c0ffa56f7be12855d296 - languageName: node - linkType: hard - "pdfjs-dist@npm:^2.2.228": version: 2.2.228 resolution: "pdfjs-dist@npm:2.2.228" @@ -22919,6 +22904,7 @@ __metadata: "@types/lodash": ^4.14.141 "@types/mustache": ^0.8.32 "@types/node": ^12.7.3 + "@types/pdfjs-dist": 2.7.5 "@types/react": ^16.9.2 "@types/react-dom": ^16.9.0 "@types/react-resize-detector": ^4.2.0 From 5811f4f022b0ff640dbc1f1ad9fdf80ac79942e9 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Wed, 24 Nov 2021 13:21:03 +0900 Subject: [PATCH 28/51] fix: adapt to latest PdfViewer --- .../PdfViewerHighlight/PdfViewerHighlight.tsx | 18 +++++++++--------- .../PdfViewerWithHighlight.tsx | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx index 44e626c33..636e25ef4 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx @@ -2,7 +2,7 @@ import React, { FC, useMemo, useEffect } from 'react'; import cx from 'classnames'; import { DocumentFieldHighlight } from './types'; import { QueryResult } from 
'ibm-watson/discovery/v2'; -import { PdfTextLayerInfo } from '../PdfViewer/PdfViewerTextLayer'; +import { PdfRenderedText } from '../PdfViewer/PdfViewerTextLayer'; import { Highlighter } from './utils/Highlighter'; import { ExtractedDocumentInfo } from './utils/common/documentUtils'; import { settings } from 'carbon-components'; @@ -43,7 +43,7 @@ interface Props { /** * PDF text content information in a page from parsed PDF */ - pdfTextLayerInfo: PdfTextLayerInfo | null; + pdfRenderedText: PdfRenderedText | null; /** * Zoom factor, where `1` is equal to 100% @@ -73,7 +73,7 @@ const PdfViewerHighlight: FC = ({ parsedDocument, pageNum, highlights, - pdfTextLayerInfo, + pdfRenderedText, scale = 1.0, useHtmlBbox = true, usePdfTextItem = true @@ -82,11 +82,11 @@ const PdfViewerHighlight: FC = ({ document, textMappings: parsedDocument?.textMappings, processedDoc: useHtmlBbox ? parsedDocument?.processedDoc : undefined, - pdfTextLayerInfo: (usePdfTextItem && pdfTextLayerInfo) || undefined, + pdfRenderedText: (usePdfTextItem && pdfRenderedText) || undefined, pageNum }); - const { textDivs } = pdfTextLayerInfo || {}; + const { textDivs } = pdfRenderedText || {}; useEffect(() => { if (highlighter) { highlighter.setTextContentDivs(textDivs); @@ -136,13 +136,13 @@ const useHighlighter = ({ document, textMappings, processedDoc, - pdfTextLayerInfo, + pdfRenderedText, pageNum }: { document: QueryResult; textMappings?: TextMappings; processedDoc?: ProcessedDoc; - pdfTextLayerInfo?: PdfTextLayerInfo; + pdfRenderedText?: PdfRenderedText; pageNum: number; }) => { return useMemo(() => { @@ -156,11 +156,11 @@ const useHighlighter = ({ styles: processedDoc.styles }, pdfTextContentInfo: - pdfTextLayerInfo?.textContent && pdfTextLayerInfo?.viewport ? pdfTextLayerInfo : undefined + pdfRenderedText?.textContent && pdfRenderedText?.viewport ? 
pdfRenderedText : undefined }); } return null; - }, [document, pageNum, pdfTextLayerInfo, processedDoc, textMappings]); + }, [document, pageNum, pdfRenderedText, processedDoc, textMappings]); }; export default PdfViewerHighlight; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx index 6036ff996..cd32c1598 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx @@ -4,7 +4,7 @@ import PdfViewer, { PdfViewerProps } from '../PdfViewer/PdfViewer'; import PdfViewerHighlight from './PdfViewerHighlight'; import { extractDocumentInfo, ExtractedDocumentInfo } from './utils/common/documentUtils'; import { QueryResult } from 'ibm-watson/discovery/v2'; -import { PdfTextLayerInfo } from '../PdfViewer/PdfViewerTextLayer'; +import { PdfRenderedText } from '../PdfViewer/PdfViewerTextLayer'; interface Props extends PdfViewerProps { /** @@ -40,7 +40,7 @@ const PdfViewerWithHighlight: FC = ({ ...rest }) => { const { page, scale } = rest; - const [textLayerInfo, setTextLayerInfo] = useState(null); + const [renderedText, setRenderedText] = useState(null); const [documentInfo, setDocumentInfo] = useState(null); useEffect(() => { @@ -57,14 +57,14 @@ const PdfViewerWithHighlight: FC = ({ }; }, [document]); - const highlightReady = !!documentInfo && !!textLayerInfo; + const highlightReady = !!documentInfo && !!renderedText; return ( - + Date: Thu, 2 Dec 2021 20:43:19 +0900 Subject: [PATCH 29/51] fix: use postcss to manupulate pdfjs-web css --- .../scripts/generate-pdfjs_web_mixin.js | 41 +++++++++++++++++++ .../discovery-styles/scripts/update-styles.sh | 24 ++++------- 
.../document-preview/_pdfjs_web_mixins.scss | 18 +++++--- 3 files changed, 60 insertions(+), 23 deletions(-) create mode 100644 packages/discovery-styles/scripts/generate-pdfjs_web_mixin.js diff --git a/packages/discovery-styles/scripts/generate-pdfjs_web_mixin.js b/packages/discovery-styles/scripts/generate-pdfjs_web_mixin.js new file mode 100644 index 000000000..68265ccfc --- /dev/null +++ b/packages/discovery-styles/scripts/generate-pdfjs_web_mixin.js @@ -0,0 +1,41 @@ +/** + * Generate mixin SCSS for pdfjs web.css .textLayer styles + * + * Usage: node + */ +const postcss = require('postcss'); +const fs = require('fs'); + +const originalPdfjsWebCss = process.argv[2]; +const mixinPdfjsWebScss = process.argv[3]; + +// load teh original style +const cssText = fs.readFileSync(originalPdfjsWebCss, { encoding: 'utf-8' }); +const cssRoot = postcss.parse(cssText); + +// remove rules not related to .textLayer +cssRoot.walkRules(rule => { + if (rule.selector.includes('.textLayer')) { + return; + } + rule.remove(); +}); + +// keep copyright comment +cssRoot.walkComments(comment => { + if (comment.text.includes('Copyright')) { + return; + } + comment.remove(); +}); + +// write mixin scss +const generatedCss = ` +/* DO NOT EDIT. THIS FILE IS AUTOMATICALLY GENERATED FROM \`update-styles.sh\`. */ +@mixin pdfjsTextLayer { + // CSS from ~pdfjs-dist/web/pdf_viewer.css for scoped style + ${cssRoot.toString()} +} +`; + +fs.writeFileSync(mixinPdfjsWebScss, generatedCss, { encoding: 'utf-8' }); diff --git a/packages/discovery-styles/scripts/update-styles.sh b/packages/discovery-styles/scripts/update-styles.sh index 49ab2c595..423191ce1 100755 --- a/packages/discovery-styles/scripts/update-styles.sh +++ b/packages/discovery-styles/scripts/update-styles.sh @@ -1,20 +1,10 @@ #!/bin/sh +BASEDIR=$(dirname "$0")/.. 
+PDFJS_WEB_CSS=$BASEDIR/../../node_modules/pdfjs-dist/web/pdf_viewer.css +PDFJS_SCSS=$BASEDIR/scss/components/document-preview/_pdfjs_web_mixins.scss -PDFJS_WEB_CSS=../../node_modules/pdfjs-dist/web/pdf_viewer.css -PDFJS_SCSS=scss/components/document-preview/_pdfjs_web_mixins.scss +# generate PDFJS_SCSS +node $BASEDIR/scripts/generate-pdfjs_web_mixin.js "$PDFJS_WEB_CSS" "$PDFJS_SCSS" -function replace_quote() { - file=$1 - key=$2 - tmp=$file.tmp - - sed -e "/BEGIN-QUOTE $key/q" $file > $tmp - cat >> $tmp - sed -ne "/END-QUOTE $key/,\$p" $file >> $tmp - cp $tmp $file; - rm $tmp; -} - -cat $PDFJS_WEB_CSS | awk '/^\/\*/,/\*\//' | replace_quote $PDFJS_SCSS "COMMENT" -cat $PDFJS_WEB_CSS | awk '/^\.textLayer/,/}/' | replace_quote $PDFJS_SCSS "TEXT-LAYER" -../../node_modules/.bin/prettier --write $PDFJS_SCSS +# perttier +../../node_modules/.bin/prettier --write "$PDFJS_SCSS" diff --git a/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss b/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss index 37e94e1c8..22f31964b 100644 --- a/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss +++ b/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss @@ -1,7 +1,6 @@ +/* DO NOT EDIT. THIS FILE IS AUTOMATICALLY GENERATED FROM `update-styles.sh`. */ @mixin pdfjsTextLayer { // CSS from ~pdfjs-dist/web/pdf_viewer.css for scoped style - - // BEGIN-QUOTE COMMENT /* Copyright 2014 Mozilla Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,9 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - // END-QUOTE COMMENT - // BEGIN-QUOTE TEXT-LAYER .textLayer { position: absolute; left: 0; @@ -29,6 +26,7 @@ opacity: 0.2; line-height: 1; } + .textLayer > span { color: transparent; position: absolute; @@ -37,6 +35,7 @@ -webkit-transform-origin: 0% 0%; transform-origin: 0% 0%; } + .textLayer .highlight { margin: -1px; padding: 1px; @@ -44,24 +43,31 @@ background-color: rgb(180, 0, 170); border-radius: 4px; } + .textLayer .highlight.begin { border-radius: 4px 0px 0px 4px; } + .textLayer .highlight.end { border-radius: 0px 4px 4px 0px; } + .textLayer .highlight.middle { border-radius: 0px; } + .textLayer .highlight.selected { background-color: rgb(0, 100, 0); } + .textLayer ::-moz-selection { background: rgb(0, 0, 255); } + .textLayer ::selection { background: rgb(0, 0, 255); } + .textLayer .endOfContent { display: block; position: absolute; @@ -76,8 +82,8 @@ -ms-user-select: none; user-select: none; } + .textLayer .endOfContent.active { top: 0px; } - // END-QUOTE TEXT-LAYER -} // end mixin +} From 158b8a23aab35d607d40498d85da65f1ae60cb32 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 2 Dec 2021 20:49:45 +0900 Subject: [PATCH 30/51] fix: add comment to the style update script --- packages/discovery-styles/scripts/update-styles.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/packages/discovery-styles/scripts/update-styles.sh b/packages/discovery-styles/scripts/update-styles.sh index 423191ce1..7c0e47dbb 100755 --- a/packages/discovery-styles/scripts/update-styles.sh +++ b/packages/discovery-styles/scripts/update-styles.sh @@ -1,9 +1,18 @@ #!/bin/sh +# +# This script updates PDF text layer CSS from the `pdfjs-dist` npm package +# +# When you upgrade the `pdfjs-dist` package, you have to run this script +# to include the style changes. +# +# Usage: $ scripts/update-styles.sh +# - You must run `yarn` to install `pdfjs-dist` package before running +# BASEDIR=$(dirname "$0")/.. 
+ +# pdfjs textLayer styles PDFJS_WEB_CSS=$BASEDIR/../../node_modules/pdfjs-dist/web/pdf_viewer.css PDFJS_SCSS=$BASEDIR/scss/components/document-preview/_pdfjs_web_mixins.scss - -# generate PDFJS_SCSS node $BASEDIR/scripts/generate-pdfjs_web_mixin.js "$PDFJS_WEB_CSS" "$PDFJS_SCSS" # perttier From 0705d2ea2000e2b956df2affe57bedacd1d9eca0 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 2 Dec 2021 20:58:11 +0900 Subject: [PATCH 31/51] refactor: move useAsyncFunctionCall to utils --- .../src/components/DocumentPreview/DocumentPreview.tsx | 1 + .../DocumentPreview/components/PdfViewer/PdfViewer.tsx | 2 +- .../components/PdfViewer/PdfViewerTextLayer.tsx | 4 ++-- .../PdfViewer => utils}/useAsyncFunctionCall.ts | 8 ++++---- 4 files changed, 8 insertions(+), 7 deletions(-) rename packages/discovery-react-components/src/{components/DocumentPreview/components/PdfViewer => utils}/useAsyncFunctionCall.ts (88%) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx b/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx index b06e33612..13813ab7b 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx @@ -154,6 +154,7 @@ function PreviewDocument({ const ErrorBoundDocumentPreview: any = withErrorBoundary(DocumentPreview); ErrorBoundDocumentPreview.PreviewToolbar = PreviewToolbar; ErrorBoundDocumentPreview.PreviewDocument = PreviewDocument; +ErrorBoundDocumentPreview.PdfViewerHighlight = PdfViewerHighlight; export default ErrorBoundDocumentPreview; export { ErrorBoundDocumentPreview as DocumentPreview }; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx index 4a0de76ef..dc34badf7 100644 --- 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx @@ -9,7 +9,7 @@ import PdfjsLib, { import PdfjsWorkerAsText from 'pdfjs-dist/build/pdf.worker.min.js'; import { settings } from 'carbon-components'; import PdfViewerTextLayer, { PdfRenderedText } from './PdfViewerTextLayer'; -import useAsyncFunctionCall from './useAsyncFunctionCall'; +import useAsyncFunctionCall from 'utils/useAsyncFunctionCall'; setupPdfjs(); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx index e6f536076..21277ee2e 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx @@ -3,7 +3,7 @@ import cx from 'classnames'; import { PDFPageProxy, PDFPageViewport, TextContent, TextContentItem } from 'pdfjs-dist'; import { EventBus } from 'pdfjs-dist/lib/web/ui_utils'; import { TextLayerBuilder } from 'pdfjs-dist/lib/web/text_layer_builder'; -import useAsyncFunctionCall from './useAsyncFunctionCall'; +import useAsyncFunctionCall from 'utils/useAsyncFunctionCall'; interface Props { className?: string; @@ -127,7 +127,7 @@ async function _renderTextLayer( textLayerDiv.innerHTML = ''; const deferredRenderEndPromise = new Promise(resolve => { const listener = () => { - resolve(undefined); + resolve(); builder?.eventBus.off('textlayerrendered', listener); }; builder?.eventBus.on('textlayerrendered', listener); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/useAsyncFunctionCall.ts 
b/packages/discovery-react-components/src/utils/useAsyncFunctionCall.ts similarity index 88% rename from packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/useAsyncFunctionCall.ts rename to packages/discovery-react-components/src/utils/useAsyncFunctionCall.ts index 6f3d5f929..c5077d4d1 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/useAsyncFunctionCall.ts +++ b/packages/discovery-react-components/src/utils/useAsyncFunctionCall.ts @@ -16,25 +16,25 @@ function useAsyncFunctionCall, ReturnType = AsyncFun const [result, setResult] = useState(); useEffect(() => { - let state: 'pending' | 'fulfilled' | 'rejected' = 'pending'; + let resolved = false; const abortController = new AbortController(); asyncFunction(abortController.signal) .then((promiseResult: ReturnType) => { - state = 'fulfilled'; + resolved = false; if (!abortController.signal.aborted && promiseResult !== undefined) { setResult(promiseResult); } }) .catch(err => { - state = 'rejected'; + resolved = false; if (!abortController.signal.aborted) { throw err; } }); return (): void => { - if (state === 'pending') { + if (!resolved) { abortController.abort(); } }; From 9e9cad5058d02d827f27971ef3d80d80072790fa Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 2 Dec 2021 20:59:06 +0900 Subject: [PATCH 32/51] fix: name of package script --- packages/discovery-styles/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/discovery-styles/package.json b/packages/discovery-styles/package.json index 7c6b9e05f..ceeb66d32 100644 --- a/packages/discovery-styles/package.json +++ b/packages/discovery-styles/package.json @@ -7,7 +7,7 @@ "repository": "https://github.com/watson-developer-cloud/discovery-components", "main": "scss/index.scss", "scripts": { - "prebuild": "scripts/update-styles.sh", + "update-style": "scripts/update-styles.sh", "build": "node-sass 
--importer=../../node_modules/node-sass-tilde-importer --source-map=true scss/index.scss css/index.css", "prepublish": "yarn run build", "start": "yarn run build -- --watch", From 052336a81b92576ce11101ffc70766fdce6bda64 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 2 Dec 2021 21:44:14 +0900 Subject: [PATCH 33/51] fix: apply CI comment --- yarn.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarn.lock b/yarn.lock index 8f1c45781..6da07fa05 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2271,13 +2271,13 @@ __metadata: version: 0.0.0-use.local resolution: "@ibm-watson/discovery-react-components@workspace:packages/discovery-react-components" dependencies: - "@types/pdfjs-dist": ^2.10.378 "@storybook/addon-actions": ^5.3.21 "@storybook/addon-docs": ^5.3.21 "@storybook/addon-knobs": ^5.3.21 "@storybook/core": ^5.3.21 "@storybook/react": ^5.3.21 "@storybook/source-loader": ^5.3.21 + "@types/pdfjs-dist": ^2.10.378 classnames: ^2.2.6 cross-env: ^7.0.3 css-loader: ^3.4.2 From e081c92ee35042ab074f196f5e0218247c179ce9 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 2 Dec 2021 22:42:53 +0900 Subject: [PATCH 34/51] fix: fix broken logic --- .../src/utils/useAsyncFunctionCall.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/discovery-react-components/src/utils/useAsyncFunctionCall.ts b/packages/discovery-react-components/src/utils/useAsyncFunctionCall.ts index c5077d4d1..fe97a789e 100644 --- a/packages/discovery-react-components/src/utils/useAsyncFunctionCall.ts +++ b/packages/discovery-react-components/src/utils/useAsyncFunctionCall.ts @@ -21,13 +21,13 @@ function useAsyncFunctionCall, ReturnType = AsyncFun asyncFunction(abortController.signal) .then((promiseResult: ReturnType) => { - resolved = false; + resolved = true; if (!abortController.signal.aborted && promiseResult !== undefined) { setResult(promiseResult); } }) .catch(err => { - resolved = false; + resolved = true; if 
(!abortController.signal.aborted) { throw err; } From 68d895dbaa4952e502318ff2a56e969ba5195d44 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Thu, 2 Dec 2021 22:49:59 +0900 Subject: [PATCH 35/51] fix: remove unused code --- .../src/components/DocumentPreview/DocumentPreview.tsx | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx b/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx index 13813ab7b..b06e33612 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx @@ -154,7 +154,6 @@ function PreviewDocument({ const ErrorBoundDocumentPreview: any = withErrorBoundary(DocumentPreview); ErrorBoundDocumentPreview.PreviewToolbar = PreviewToolbar; ErrorBoundDocumentPreview.PreviewDocument = PreviewDocument; -ErrorBoundDocumentPreview.PdfViewerHighlight = PdfViewerHighlight; export default ErrorBoundDocumentPreview; export { ErrorBoundDocumentPreview as DocumentPreview }; From 8c82fae7f89a722606f042998db2097a37daed96 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Fri, 3 Dec 2021 13:57:50 +0900 Subject: [PATCH 36/51] fix: apply review comments around pdfjs css --- packages/discovery-styles/package.json | 8 ++++++-- .../scripts/generate-pdfjs_web_mixin.js | 4 ++-- .../{update-styles.sh => update-styles-from-pdfjs.sh} | 6 +++--- .../components/document-preview/_pdfjs_web_mixins.scss | 2 +- yarn.lock | 10 ++++++++++ 5 files changed, 22 insertions(+), 8 deletions(-) rename packages/discovery-styles/scripts/{update-styles.sh => update-styles-from-pdfjs.sh} (77%) diff --git a/packages/discovery-styles/package.json b/packages/discovery-styles/package.json index ceeb66d32..e8c84e651 100644 --- a/packages/discovery-styles/package.json +++ b/packages/discovery-styles/package.json @@ -7,11 +7,11 @@ "repository": 
"https://github.com/watson-developer-cloud/discovery-components", "main": "scss/index.scss", "scripts": { - "update-style": "scripts/update-styles.sh", "build": "node-sass --importer=../../node_modules/node-sass-tilde-importer --source-map=true scss/index.scss css/index.css", "prepublish": "yarn run build", "start": "yarn run build -- --watch", - "analyze": "yarn run g:analyze css/index.css" + "analyze": "yarn run g:analyze css/index.css", + "update-styles-from-pdfjs": "scripts/update-styles-from-pdfjs.sh" }, "files": [ "css/**/*", @@ -25,5 +25,9 @@ }, "publishConfig": { "access": "public" + }, + "devDependencies": { + "@types/prettier": "^2", + "prettier": "^2.4.1" } } diff --git a/packages/discovery-styles/scripts/generate-pdfjs_web_mixin.js b/packages/discovery-styles/scripts/generate-pdfjs_web_mixin.js index 68265ccfc..ef99bdf52 100644 --- a/packages/discovery-styles/scripts/generate-pdfjs_web_mixin.js +++ b/packages/discovery-styles/scripts/generate-pdfjs_web_mixin.js @@ -9,7 +9,7 @@ const fs = require('fs'); const originalPdfjsWebCss = process.argv[2]; const mixinPdfjsWebScss = process.argv[3]; -// load teh original style +// load the original style const cssText = fs.readFileSync(originalPdfjsWebCss, { encoding: 'utf-8' }); const cssRoot = postcss.parse(cssText); @@ -31,7 +31,7 @@ cssRoot.walkComments(comment => { // write mixin scss const generatedCss = ` -/* DO NOT EDIT. THIS FILE IS AUTOMATICALLY GENERATED FROM \`update-styles.sh\`. */ +/* DO NOT EDIT. THIS FILE IS AUTOMATICALLY GENERATED FROM \`update-styles-from-pdfjs.sh\`. 
*/ @mixin pdfjsTextLayer { // CSS from ~pdfjs-dist/web/pdf_viewer.css for scoped style ${cssRoot.toString()} diff --git a/packages/discovery-styles/scripts/update-styles.sh b/packages/discovery-styles/scripts/update-styles-from-pdfjs.sh similarity index 77% rename from packages/discovery-styles/scripts/update-styles.sh rename to packages/discovery-styles/scripts/update-styles-from-pdfjs.sh index 7c0e47dbb..d4cf33c89 100755 --- a/packages/discovery-styles/scripts/update-styles.sh +++ b/packages/discovery-styles/scripts/update-styles-from-pdfjs.sh @@ -13,7 +13,7 @@ BASEDIR=$(dirname "$0")/.. # pdfjs textLayer styles PDFJS_WEB_CSS=$BASEDIR/../../node_modules/pdfjs-dist/web/pdf_viewer.css PDFJS_SCSS=$BASEDIR/scss/components/document-preview/_pdfjs_web_mixins.scss -node $BASEDIR/scripts/generate-pdfjs_web_mixin.js "$PDFJS_WEB_CSS" "$PDFJS_SCSS" +yarn node $BASEDIR/scripts/generate-pdfjs_web_mixin.js "$PDFJS_WEB_CSS" "$PDFJS_SCSS" -# perttier -../../node_modules/.bin/prettier --write "$PDFJS_SCSS" +# prettier +yarn run prettier --write "$PDFJS_SCSS" diff --git a/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss b/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss index 22f31964b..f8d96b2f4 100644 --- a/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss +++ b/packages/discovery-styles/scss/components/document-preview/_pdfjs_web_mixins.scss @@ -1,4 +1,4 @@ -/* DO NOT EDIT. THIS FILE IS AUTOMATICALLY GENERATED FROM `update-styles.sh`. */ +/* DO NOT EDIT. THIS FILE IS AUTOMATICALLY GENERATED FROM `update-styles-from-pdfjs.sh`. 
*/ @mixin pdfjsTextLayer { // CSS from ~pdfjs-dist/web/pdf_viewer.css for scoped style /* Copyright 2014 Mozilla Foundation diff --git a/yarn.lock b/yarn.lock index 6da07fa05..f901dc1f1 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2308,6 +2308,9 @@ __metadata: "@ibm-watson/discovery-styles@^1.5.0-beta.2, @ibm-watson/discovery-styles@workspace:packages/discovery-styles": version: 0.0.0-use.local resolution: "@ibm-watson/discovery-styles@workspace:packages/discovery-styles" + dependencies: + "@types/prettier": ^2 + prettier: ^2.4.1 peerDependencies: carbon-components: ">= 10.6.0 < 11" languageName: unknown @@ -4954,6 +4957,13 @@ __metadata: languageName: node linkType: hard +"@types/prettier@npm:^2": + version: 2.4.2 + resolution: "@types/prettier@npm:2.4.2" + checksum: 76e230b2d11028af11fe12e09b2d5b10b03738e9abf819ae6ebb0f78cac13d39f860755ce05ac3855b608222518d956628f5d00322dc206cc6d1f2d8d1519f1e + languageName: node + linkType: hard + "@types/prop-types@npm:*": version: 15.7.3 resolution: "@types/prop-types@npm:15.7.3" From 2c28bd939b35ff4dd334115f425e2ee3b9bcf49f Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Fri, 3 Dec 2021 14:19:06 +0900 Subject: [PATCH 37/51] fix: pdfjs typings version --- packages/discovery-react-components/package.json | 2 +- yarn.lock | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/discovery-react-components/package.json b/packages/discovery-react-components/package.json index 59e406ec5..1fde74fe7 100644 --- a/packages/discovery-react-components/package.json +++ b/packages/discovery-react-components/package.json @@ -43,13 +43,13 @@ "react-virtualized": "9.21.1" }, "devDependencies": { - "@types/pdfjs-dist": "^2.10.378", "@storybook/addon-actions": "^5.3.21", "@storybook/addon-docs": "^5.3.21", "@storybook/addon-knobs": "^5.3.21", "@storybook/core": "^5.3.21", "@storybook/react": "^5.3.21", "@storybook/source-loader": "^5.3.21", + "@types/pdfjs-dist": "^2.1.7", "cross-env": "^7.0.3", "css-loader": 
"^3.4.2", "madge": "^5.0.1", diff --git a/yarn.lock b/yarn.lock index f901dc1f1..5fb35b26a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2277,7 +2277,7 @@ __metadata: "@storybook/core": ^5.3.21 "@storybook/react": ^5.3.21 "@storybook/source-loader": ^5.3.21 - "@types/pdfjs-dist": ^2.10.378 + "@types/pdfjs-dist": ^2.1.7 classnames: ^2.2.6 cross-env: ^7.0.3 css-loader: ^3.4.2 @@ -4948,12 +4948,12 @@ __metadata: languageName: node linkType: hard -"@types/pdfjs-dist@npm:^2.10.378": - version: 2.10.378 - resolution: "@types/pdfjs-dist@npm:2.10.378" +"@types/pdfjs-dist@npm:^2.1.7": + version: 2.10.377 + resolution: "@types/pdfjs-dist@npm:2.10.377" dependencies: pdfjs-dist: "*" - checksum: 36dd6010f7d23a995efdf11ea4ecb56f371f8bfb3e83a5c311666726e13238597ed1519701d0e2e6fb297270d01ad6aece9582b036fd4cb3aa301e61ea364978 + checksum: c4623b60e334dfcc50bc584d35977f13edd95646e139df8ae3cb52338e919dbdaf443454055368eb535cf3d1cc3f2f62463c3d883ab85c60ea9acf98e794aba1 languageName: node linkType: hard From 032842c30c71c263a72f37c62dfcdaa43b5da982 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Fri, 3 Dec 2021 14:57:28 +0900 Subject: [PATCH 38/51] fix: apply review comments --- .../discovery-react-components/package.json | 2 +- .../PdfViewer/PdfViewer.stories.tsx | 4 +- .../components/PdfViewer/PdfViewer.tsx | 44 +++++++------------ .../PdfViewer/PdfViewerTextLayer.tsx | 10 ++--- .../components/PdfViewer/types.ts | 11 +++++ yarn.lock | 24 +++------- 6 files changed, 36 insertions(+), 59 deletions(-) create mode 100644 packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/types.ts diff --git a/packages/discovery-react-components/package.json b/packages/discovery-react-components/package.json index 1fde74fe7..0f5baa81a 100644 --- a/packages/discovery-react-components/package.json +++ b/packages/discovery-react-components/package.json @@ -49,7 +49,7 @@ "@storybook/core": "^5.3.21", "@storybook/react": "^5.3.21", "@storybook/source-loader": "^5.3.21", - 
"@types/pdfjs-dist": "^2.1.7", + "@types/pdfjs-dist": "2.1.7", "cross-env": "^7.0.3", "css-loader": "^3.4.2", "madge": "^5.0.1", diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx index 11a3d0f08..9284c5c16 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.stories.tsx @@ -1,6 +1,6 @@ import React from 'react'; import { storiesOf } from '@storybook/react'; -import { withKnobs, radios, number, boolean } from '@storybook/addon-knobs'; +import { withKnobs, radios, number } from '@storybook/addon-knobs'; import { action } from '@storybook/addon-actions'; import PdfViewer from './PdfViewer'; import { document as doc } from 'components/DocumentPreview/__fixtures__/Art Effects.pdf'; @@ -33,7 +33,6 @@ storiesOf('DocumentPreview/components/PdfViewer', module) const zoom = radios(zoomKnob.label, zoomKnob.options, zoomKnob.defaultValue); const scale = parseFloat(zoom); - const showTextLayer = boolean('Show text layer', false); const setLoadingAction = action('setLoading'); const setRenderedTextAction = action('setRenderedText'); @@ -43,7 +42,6 @@ storiesOf('DocumentPreview/components/PdfViewer', module) file={atob(doc)} page={page} scale={scale} - showTextLayer={showTextLayer} setLoading={setLoadingAction} setRenderedText={setRenderedTextAction} /> diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx index dc34badf7..ea37114a6 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx +++ 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewer.tsx @@ -4,16 +4,18 @@ import PdfjsLib, { PDFDocumentProxy, PDFPageProxy, PDFPageViewport, + PDFPromise, PDFRenderTask } from 'pdfjs-dist'; import PdfjsWorkerAsText from 'pdfjs-dist/build/pdf.worker.min.js'; import { settings } from 'carbon-components'; -import PdfViewerTextLayer, { PdfRenderedText } from './PdfViewerTextLayer'; import useAsyncFunctionCall from 'utils/useAsyncFunctionCall'; +import PdfViewerTextLayer, { PdfRenderedText } from './PdfViewerTextLayer'; +import { PdfDisplayProps } from './types'; setupPdfjs(); -interface Props { +type Props = PdfDisplayProps & { className?: string; /** @@ -22,22 +24,7 @@ interface Props { file: string; /** - * Page number, starting at 1 - */ - page: number; - - /** - * Zoom factor, where `1` is equal to 100% - */ - scale: number; - - /** - * Render text layer - */ - showTextLayer?: boolean; - - /** - * Text layer class name. Only applicable when showTextLayer is true + * Text layer class name */ textLayerClassName?: string; @@ -57,14 +44,13 @@ interface Props { * Callback for text layer info */ setRenderedText?: (info: PdfRenderedText | null) => any; -} +}; const PdfViewer: FC = ({ className, file, page, scale, - showTextLayer, textLayerClassName, setPageCount, setLoading, @@ -128,14 +114,12 @@ const PdfViewer: FC = ({ width={canvasInfo?.canvasWidth} height={canvasInfo?.canvasHeight} /> - {showTextLayer && ( - - )} + {children}
); @@ -146,7 +130,7 @@ PdfViewer.defaultProps = { scale: 1 }; -function _loadPdf(data: string): Promise { +function _loadPdf(data: string): PDFPromise { return PdfjsLib.getDocument({ data }).promise; } @@ -175,6 +159,7 @@ function setupPdfjs(): void { if (typeof Worker !== 'undefined') { const blob = new Blob([PdfjsWorkerAsText], { type: 'text/javascript' }); const pdfjsWorker = new Worker(URL.createObjectURL(blob)) as any; + // @ts-expect-error Upgrading pdfjs-dist and its typings would resolve the issue PdfjsLib.GlobalWorkerOptions.workerPort = pdfjsWorker; } else { PdfjsLib.GlobalWorkerOptions.workerSrc = PdfjsWorkerAsText; @@ -199,4 +184,5 @@ function getCanvasInfo(viewport: any): CanvasInfo { return { width, height, canvasWidth, canvasHeight, canvasScale }; } +export type PdfViewerProps = Props; export default PdfViewer; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx index 21277ee2e..5c76fb515 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/PdfViewerTextLayer.tsx @@ -4,8 +4,9 @@ import { PDFPageProxy, PDFPageViewport, TextContent, TextContentItem } from 'pdf import { EventBus } from 'pdfjs-dist/lib/web/ui_utils'; import { TextLayerBuilder } from 'pdfjs-dist/lib/web/text_layer_builder'; import useAsyncFunctionCall from 'utils/useAsyncFunctionCall'; +import { PdfDisplayProps } from './types'; -interface Props { +type Props = Pick & { className?: string; /** @@ -13,16 +14,11 @@ interface Props { */ loadedPage: PDFPageProxy | null | undefined; - /** - * Zoom factor, where `1` is equal to 100% - */ - scale: number; - /** * Callback for text layer info */ setRenderedText?: (info: PdfRenderedText | null) => any; 
-} +}; export type PdfRenderedText = { /** diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/types.ts new file mode 100644 index 000000000..063a65818 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewer/types.ts @@ -0,0 +1,11 @@ +export type PdfDisplayProps = { + /** + * Page number, starting at 1 + */ + page: number; + + /** + * Zoom factor, where `1` is equal to 100% + */ + scale: number; +}; diff --git a/yarn.lock b/yarn.lock index 5fb35b26a..6ebd9e514 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2277,7 +2277,7 @@ __metadata: "@storybook/core": ^5.3.21 "@storybook/react": ^5.3.21 "@storybook/source-loader": ^5.3.21 - "@types/pdfjs-dist": ^2.1.7 + "@types/pdfjs-dist": 2.1.7 classnames: ^2.2.6 cross-env: ^7.0.3 css-loader: ^3.4.2 @@ -4948,12 +4948,10 @@ __metadata: languageName: node linkType: hard -"@types/pdfjs-dist@npm:^2.1.7": - version: 2.10.377 - resolution: "@types/pdfjs-dist@npm:2.10.377" - dependencies: - pdfjs-dist: "*" - checksum: c4623b60e334dfcc50bc584d35977f13edd95646e139df8ae3cb52338e919dbdaf443454055368eb535cf3d1cc3f2f62463c3d883ab85c60ea9acf98e794aba1 +"@types/pdfjs-dist@npm:2.1.7": + version: 2.1.7 + resolution: "@types/pdfjs-dist@npm:2.1.7" + checksum: 14ca335658a85ab5cab908f3ef3ec104cb62487acc2465bb74b6d0430cd720dca30a8804440e152967b1dd7fb384c65ef05da1702a1ec87b905c0f5c73bbe653 languageName: node linkType: hard @@ -19544,18 +19542,6 @@ __metadata: languageName: node linkType: hard -"pdfjs-dist@npm:*": - version: 2.11.338 - resolution: "pdfjs-dist@npm:2.11.338" - peerDependencies: - worker-loader: ^3.0.8 - peerDependenciesMeta: - worker-loader: - optional: true - checksum: 1b946a3eeb3312a79e12b4e0aa066bb2b98487b9ee329666edc840a194602595cf84de9a3f6dbb023b808699a6ebb0cd06e751314fc4c0ffa56f7be12855d296 - languageName: node - linkType: hard - 
"pdfjs-dist@npm:^2.2.228": version: 2.2.228 resolution: "pdfjs-dist@npm:2.2.228" From 3eedcf8d05f9db99c061d1fe3c39c3843c9a2d88 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Mon, 6 Dec 2021 18:54:13 +0900 Subject: [PATCH 39/51] refactor: refactor common utils --- .../components/PdfViewerHighlight/types.ts | 9 ++++++--- .../utils/common/textSpanUtils.ts | 17 +++++++++-------- .../src/components/DocumentPreview/types.ts | 6 ++++-- .../src/utils/document/documentUtils.ts | 2 +- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts index 5496895c1..151540c5f 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts @@ -1,9 +1,12 @@ -import { Bbox as DocumentPreviewBbox } from '../../../DocumentPreview/types'; +import { + Bbox as DocPreviewBbox, + TextSpan as DocPreviewTextSpan +} from '../../../DocumentPreview/types'; import { Location } from 'utils/document/processDoc'; // (re-)export useful types -export type Bbox = DocumentPreviewBbox; -export type TextSpan = [number, number]; +export type Bbox = DocPreviewBbox; +export type TextSpan = DocPreviewTextSpan; /** * A document. 
Same to QueryResult, but this more focuses on document fields diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts index 58b388e01..fe173649b 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts @@ -1,8 +1,17 @@ +import { spansIntersect } from 'utils/document/documentUtils'; import { TextSpan } from '../../types'; export const START = 0; export const END = 1; +/** + * Check whether two spans has intersection or not + * TextSpan version of spansIntersect in utils/document/documentUtil.ts + */ +export function spanIntersects([beginA, endA]: TextSpan, [beginB, endB]: TextSpan): boolean { + return spansIntersect({ begin: beginA, end: endA }, { begin: beginB, end: endB }); +} + /** * Get text for a given span */ @@ -22,14 +31,6 @@ export function spanLen(span: TextSpan): number { return Math.max(0, span[END] - span[START]); } -/** - * Check whether two spans has intersection or not - */ -export function spanIntersects([beginA, endA]: TextSpan, [beginB, endB]: TextSpan): boolean { - // TODO: integrate with spansIntersect in documentUtils.ts - return beginA < endB && endA > beginB; -} - /** * Check whether a span includes an given character index or not */ diff --git a/packages/discovery-react-components/src/components/DocumentPreview/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/types.ts index e72598074..70bdf5ffe 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/types.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/types.ts @@ -6,6 +6,9 @@ export 
interface TextMappings { // [ left, top, right, bottom ] export type Bbox = [number, number, number, number]; +/** [ start (inclusive), end (exclusive) ] */ +export type TextSpan = [number, number]; + export type Origin = 'TopLeft' | 'BottomLeft'; export interface Page { @@ -32,8 +35,7 @@ export interface CellPage { export interface CellField { name: string; index: number; - // [ START, END ] - span: [number, number]; + span: TextSpan; } export interface StyledCell extends CellPage { diff --git a/packages/discovery-react-components/src/utils/document/documentUtils.ts b/packages/discovery-react-components/src/utils/document/documentUtils.ts index ebadd1cd0..6ea03da37 100644 --- a/packages/discovery-react-components/src/utils/document/documentUtils.ts +++ b/packages/discovery-react-components/src/utils/document/documentUtils.ts @@ -217,5 +217,5 @@ export function spansIntersect( { begin: beginA, end: endA }: Span, { begin: beginB, end: endB }: Span ): boolean { - return beginA <= endB && endA > beginB; + return beginA < endB && endA > beginB; } From c13942b792eb899d645de81bf1f2b382c7e24cd1 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Mon, 6 Dec 2021 18:55:23 +0900 Subject: [PATCH 40/51] refactor: refacto getTextBoxMapping --- .../utils/textBoxMapping/TextBoxMapping.ts | 22 +- .../utils/textBoxMapping/getTextBoxMapping.ts | 268 +++++++++++------- 2 files changed, 190 insertions(+), 100 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts index 95c6cded0..a326f6c30 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts +++ 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts @@ -12,7 +12,7 @@ import { import { TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; import { TextNormalizer } from '../common/TextNormalizer'; -const debugOut = require('debug')?.('pdf:mapping:TextBoxMappingImpl'); +const debugOut = require('debug')?.('pdf:mapping:TextBoxMapping'); function debug(...args: any) { debugOut?.apply(null, args); } @@ -20,7 +20,7 @@ function debug(...args: any) { /** * Text box mapping */ -export class TextBoxMappingImpl implements TextBoxMapping { +class TextBoxMappingImpl implements TextBoxMapping { private readonly mappingEntryMap: Dictionary; constructor(mappingEntries: TextBoxMappingEntry[]) { @@ -83,6 +83,24 @@ export class TextBoxMappingImpl implements TextBoxMapping { } } +/** + * Text mapping builder + */ +export class TextBoxMappingBuilder { + mappingEntries: TextBoxMappingEntry[] = []; + + /** add new mapping data */ + addMapping(text: TextBoxMappingEntry['text'], box: TextBoxMappingEntry['box']) { + this.mappingEntries.push({ text, box }); + debug('>> added a new mapping entry (%o) => (cell: %o)', text, text, box?.cell); + } + + /** get TextBoxMapping */ + toTextBoxMapping() { + return new TextBoxMappingImpl(this.mappingEntries); + } +} + /** * Check if text on spans on cells are the same or not */ diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts index 0f280ff1e..fb0174ebc 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts +++ 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts @@ -3,54 +3,17 @@ import { TextSpan } from '../../types'; import { bboxIntersects } from '../common/bboxUtils'; import { nonEmpty } from '../common/nonEmpty'; import { spanLen, spanMerge } from '../common/textSpanUtils'; -import { TextLayout, TextLayoutCell } from '../textLayout/types'; +import { TextLayout, TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; import { MappingSourceTextProvider } from './MappingSourceTextProvider'; import { MappingTargetBoxProvider } from './MappingTargetCellProvider'; -import { TextBoxMappingImpl } from './TextBoxMapping'; -import { TextBoxMapping, TextBoxMappingEntry } from './types'; +import { TextBoxMappingBuilder } from './TextBoxMapping'; +import { TextBoxMapping } from './types'; const debugOut = require('debug')?.('pdf:mapping:getTextBoxMapping'); function debug(...args: any) { debugOut?.apply(null, args); } -/** - * Find the best source (larger text layout cell) where text `textToMatch` is in - * @param sources source (larger) text layout cells overlapping the current target cell - * @param textToMatch text form target cell(s) - * @returns the best source where the `textToMatch` is matched and the text location in the source - */ -function findMatchInSources( - sources: { - cell: TextLayoutCell; - provider: MappingSourceTextProvider; - }[], - textToMatch: string -) { - // find matches - const matches = sources.map(source => { - const match = source.provider.getMatch(textToMatch); - return { ...source, match }; - }); - - // calc cost for each match - let skipTextLen = 0; - const matchesWithCost = matches.map(aMatch => { - const { match: providerMatch } = aMatch; - const cost = !providerMatch - ? Number.MAX_SAFE_INTEGER - : skipTextLen + providerMatch.skipText.length - spanLen(providerMatch.span); - - skipTextLen += providerMatch?.approxLenAfterEnd ?? 
0; - - return { ...aMatch, cost }; - }); - - // find best match - const bestMatch = minBy(matchesWithCost, match => match.cost); - return bestMatch; -} - /** * Calculate text box mapping from `source` text layout to `target` text layout * @param source text layout with larger cells @@ -60,74 +23,183 @@ function findMatchInSources( export function getTextBoxMappings< SourceCell extends TextLayoutCell, TargetCell extends TextLayoutCell ->(source: TextLayout, target: TextLayout): TextBoxMapping { - const sourceProviders = source.cells.map(cell => new MappingSourceTextProvider(cell)); - const targetProvider = new MappingTargetBoxProvider(target.cells); - - const targetIndexToSources = target.cells.map(targetCell => { - const cells = source.cells - .map((sourceCell, index) => { - if (!bboxIntersects(sourceCell.bbox, targetCell.bbox)) { - return null; +>(sourceLayout: TextLayout, targetLayout: TextLayout): TextBoxMapping { + debug('getTextBoxMapping: enter'); + + const target = new Target(targetLayout); + const source = new Source(sourceLayout, targetLayout); + const builder = new TextBoxMappingBuilder(); + + target.processText((targetCellId, targetText, markTargetAsMapped) => { + const matchInSource = source.findMatch(targetCellId, targetText); + if (matchInSource) { + const mappedTargetCells = markTargetAsMapped(matchInSource.matchLength); + + let mappedSourceFullSpan: TextSpan = [0, 0]; + mappedTargetCells.forEach(targetCell => { + const mappedSourceSpan = matchInSource.markSourceAsMapped(targetCell.text); + if (mappedSourceSpan) { + builder.addMapping( + { cell: matchInSource.cell, span: mappedSourceSpan }, + { cell: targetCell } + ); + mappedSourceFullSpan = spanMerge(mappedSourceFullSpan, mappedSourceSpan); } - return { cell: sourceCell, provider: sourceProviders[index] }; - }) - .filter(nonEmpty); - - if (cells.some(({ cell }) => cell.isInHtmlBbox)) { - return cells.filter(({ cell }) => cell.isInHtmlBbox); + }); + if (spanLen(mappedSourceFullSpan) > 0) { + 
matchInSource.markSourceMappedBySpan(mappedSourceFullSpan); + } } - return cells; }); - const mappingEntries: TextBoxMappingEntry[] = []; + return builder.toTextBoxMapping(); +} - debug('getTextBoxMapping'); - while (targetProvider.hasNext()) { - // find matches - const { index: targetCellIndex, text: targetText } = targetProvider.getNextInfo(); - debug('> find match at index %d, text: %s', targetCellIndex, targetText); - const matchInSource = findMatchInSources(targetIndexToSources[targetCellIndex], targetText); - debug('> source cell(s) matched: %o', matchInSource); - - // skip when no match found... - if (!matchInSource?.match || spanLen(matchInSource.match.span) === 0) { - targetProvider.skip(); - continue; - } +/** + * Utility class for manipulating target text layout in getTextBoxMapping + */ +class Target { + targetProvider: MappingTargetBoxProvider; - const matchedSourceSpan = matchInSource.match.span; - const matchedSourceProvider = matchInSource.provider; - const matchedLength = spanLen(matchedSourceSpan); - - const matchedTargetCells = targetProvider.consume(matchedLength); - debug('> target cells for matched length: %d', matchedLength); - debug(matchedTargetCells); - - let consumedSourceSpan: TextSpan = [0, 0]; - matchedTargetCells.forEach(mTargetCell => { - const trimmedCell = mTargetCell.trim(); - if (trimmedCell.text.length > 0) { - const matchToTargetCell = matchedSourceProvider.getMatch(trimmedCell.text); - debug('>> target cell %o (%o) to source %o', mTargetCell, trimmedCell, matchToTargetCell); - if (matchToTargetCell) { - // consume source text which is just mapped to the target - matchedSourceProvider.consume(matchToTargetCell.span); - consumedSourceSpan = spanMerge(consumedSourceSpan, matchToTargetCell.span); - mappingEntries.push({ - text: { cell: matchInSource.cell, span: matchToTargetCell.span }, - box: { cell: trimmedCell } - }); - debug('>> added mapping entry %o', mappingEntries[mappingEntries.length - 1]); + constructor(targetLayout: 
TextLayout) { + this.targetProvider = new MappingTargetBoxProvider(targetLayout.cells); + } + + /** + * Try to map text fragments (`cellId` and `text` passed to `textMapper`) + * in target using a given `textMapper` + */ + processText( + textMapper: ( + cellId: number, + text: string, + markTargetMapped: (length: number) => TextLayoutCellBase[] + ) => void + ) { + while (this.targetProvider.hasNext()) { + const { index: cellId, text: nextText } = this.targetProvider.getNextInfo(); + debug('> find match at index %d, text: %s', cellId, nextText); + + let isMapped = false; + const markAsMapped = (matchedLength: number) => { + if (matchedLength > 0) { + isMapped = true; + const matchedTargetCells = this.targetProvider.consume(matchedLength); + debug('> raw target cells for matched length: %d', matchedLength); + debug(matchedTargetCells); + + return matchedTargetCells.map(cell => cell.trim()).filter(cell => cell.text.length > 0); } + return []; + }; + + textMapper(cellId, nextText, markAsMapped); + if (!isMapped) { + this.targetProvider.skip(); } + } + } +} + +/** + * Utility class for manipulating source text layout and its source text in getTextBoxMapping + */ +class Source { + sourceProviders: MappingSourceTextProvider[]; + targetIndexToSources: { + cell: SourceCell; + provider: MappingSourceTextProvider; + }[][]; + + constructor(sourceLayout: TextLayout, targetLayout: TextLayout) { + this.sourceProviders = sourceLayout.cells.map(cell => new MappingSourceTextProvider(cell)); + this.targetIndexToSources = targetLayout.cells.map(targetCell => { + const cells = sourceLayout.cells + .map((sourceCell, index) => { + if (!bboxIntersects(sourceCell.bbox, targetCell.bbox)) { + return null; + } + return { cell: sourceCell, provider: this.sourceProviders[index] }; + }) + .filter(nonEmpty); + + if (cells.some(({ cell }) => cell.isInHtmlBbox)) { + return cells.filter(({ cell }) => cell.isInHtmlBbox); + } + return cells; }); - // consume entire the range that is matched to sources 
- if (spanLen(consumedSourceSpan) > 0) { - matchedSourceProvider.consume(consumedSourceSpan); - debug('> span consumed in source: ', consumedSourceSpan); + } + + /** + * Find the best (i.e. longest length `text`) match in source which intersects + * with the target cell of given `targetCellId` + * @param targetCellId + * @param text + * @return matched source information and functions to mark the matched span as mapped + */ + findMatch(targetCellId: TargetCell['id'], text: string) { + const candidateSources = this.targetIndexToSources[targetCellId]; + const bestMatch = Source.findBestMatch(candidateSources, text); + debug('> source cell(s) matched: %o', bestMatch); + + if (!bestMatch?.match || spanLen(bestMatch.match.span) === 0) { + return null; } + + const matchedCell = bestMatch.cell; + const matchedSourceSpan = bestMatch.match.span; + const matchedSourceProvider = bestMatch.provider; + + return { + cell: matchedCell, + matchLength: spanLen(matchedSourceSpan), + markSourceAsMapped: (text: string) => { + const mappedSource = matchedSourceProvider.getMatch(text); + debug('>> target cell %o to source %o', text, mappedSource); + return mappedSource?.span; + }, + markSourceMappedBySpan: (span: TextSpan) => { + if (spanLen(span) > 0) { + matchedSourceProvider.consume(span); + } + } + }; } - return new TextBoxMappingImpl(mappingEntries); + /** + * Find the best source (larger text layout cell) where text `textToMatch` is in + * @param sources source (larger) text layout cells overlapping the current target cell + * @param textToMatch text form target cell(s) + * @returns the best source where the `textToMatch` is matched and the text location in the source + */ + private static findBestMatch( + sources: { + cell: TextLayoutCell; + provider: MappingSourceTextProvider; + }[], + textToMatch: string + ) { + // find matches + const matches = sources.map(source => { + const match = source.provider.getMatch(textToMatch); + return { ...source, match }; + }); + + // calc cost 
for each match + let skipTextLen = 0; + const matchesWithCost = matches.map(aMatch => { + const { match: providerMatch } = aMatch; + const cost = !providerMatch + ? Number.MAX_SAFE_INTEGER + : skipTextLen + providerMatch.skipText.length - spanLen(providerMatch.span); + + skipTextLen += providerMatch?.approxLenAfterEnd ?? 0; + + return { ...aMatch, cost }; + }); + + // find best match + const bestMatch = minBy(matchesWithCost, match => match.cost); + return bestMatch; + } } From a9dd38ec2b91b2f04f0e7c17a67f4cdd112916ca Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Mon, 6 Dec 2021 19:04:48 +0900 Subject: [PATCH 41/51] fix: fix yarn error --- yarn.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yarn.lock b/yarn.lock index 6ebd9e514..b047d1520 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2267,7 +2267,7 @@ __metadata: languageName: node linkType: hard -"@ibm-watson/discovery-react-components@^1.5.0-beta.2, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": +"@ibm-watson/discovery-react-components@^1.5.0-beta.3, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": version: 0.0.0-use.local resolution: "@ibm-watson/discovery-react-components@workspace:packages/discovery-react-components" dependencies: @@ -10260,7 +10260,7 @@ __metadata: resolution: "discovery-search-app@workspace:examples/discovery-search-app" dependencies: "@carbon/icons": ^10.5.0 - "@ibm-watson/discovery-react-components": ^1.5.0-beta.2 + "@ibm-watson/discovery-react-components": ^1.5.0-beta.3 "@ibm-watson/discovery-styles": ^1.5.0-beta.2 body-parser: ^1.19.0 carbon-components: ^10.6.0 From f6fbcd28bc2a403c635711d679582fc94b1b0fbe Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Mon, 6 Dec 2021 20:30:48 +0900 Subject: [PATCH 42/51] fix: fix test error --- .../PdfViewerHighlight/utils/common/textSpanUtils.ts | 7 +++++-- .../src/utils/document/documentUtils.ts | 2 +- 2 files changed, 6 insertions(+), 3 
deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts index fe173649b..e40acefce 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts @@ -1,4 +1,3 @@ -import { spansIntersect } from 'utils/document/documentUtils'; import { TextSpan } from '../../types'; export const START = 0; @@ -9,7 +8,11 @@ export const END = 1; * TextSpan version of spansIntersect in utils/document/documentUtil.ts */ export function spanIntersects([beginA, endA]: TextSpan, [beginB, endB]: TextSpan): boolean { - return spansIntersect({ begin: beginA, end: endA }, { begin: beginB, end: endB }); + // TODO: integrate with spansIntersect in documentUtils.ts + // currently, the function returns true to spansIntersect([1,2], [0,1]) + // which is expected to be false here. And fixing it results in test error + // We need further investigate if we can fix the spansIntersect. 
+ return beginA < endB && endA > beginB; } /** diff --git a/packages/discovery-react-components/src/utils/document/documentUtils.ts b/packages/discovery-react-components/src/utils/document/documentUtils.ts index 6ea03da37..ebadd1cd0 100644 --- a/packages/discovery-react-components/src/utils/document/documentUtils.ts +++ b/packages/discovery-react-components/src/utils/document/documentUtils.ts @@ -217,5 +217,5 @@ export function spansIntersect( { begin: beginA, end: endA }: Span, { begin: beginB, end: endB }: Span ): boolean { - return beginA < endB && endA > beginB; + return beginA <= endB && endA > beginB; } From dc09472f13c440607ec3d6862be5b566e183ff0d Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Mon, 6 Dec 2021 21:08:30 +0900 Subject: [PATCH 43/51] refactor: move utility methods --- .../PdfViewerHighlight/utils/Highlighter.ts | 2 +- .../utils/common/TextNormalizer.ts | 2 +- .../utils/common/__tests__/bboxUtils.test.ts | 19 +---------------- .../utils/common/bboxUtils.ts | 21 +++---------------- .../MappingSourceTextProvider.ts | 6 +++--- .../MappingTargetCellProvider.ts | 2 +- .../utils/textBoxMapping/TextBoxMapping.ts | 6 +++--- .../utils/textBoxMapping/TextProvider.ts | 2 +- .../utils/textBoxMapping/getTextBoxMapping.ts | 6 +++--- .../utils/textLayout/BaseTextLayout.ts | 4 ++-- .../textLayout/PdfTextContentTextLayout.ts | 4 ++-- .../textLayout/TextMappingsTextLayout.ts | 10 ++++----- .../utils/textLayout/dom.ts | 4 ++-- .../utils/__tests__/box.test.ts | 19 ++++++++++++++++- .../__tests__/textSpan.test.ts} | 4 ++-- .../components/DocumentPreview/utils/box.ts | 9 ++++++-- .../textSpanUtils.ts => utils/textSpan.ts} | 2 +- 17 files changed, 56 insertions(+), 66 deletions(-) rename packages/discovery-react-components/src/components/DocumentPreview/{components/PdfViewerHighlight/utils/common/__tests__/textSpanUtils.test.ts => utils/__tests__/textSpan.test.ts} (98%) rename 
packages/discovery-react-components/src/components/DocumentPreview/{components/PdfViewerHighlight/utils/common/textSpanUtils.ts => utils/textSpan.ts} (98%) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts index f6b0ed4f7..7afa139bf 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts @@ -7,11 +7,11 @@ import { HighlightShape, HighlightShapeBox } from '../types'; +import { spanOffset, START } from '../../../utils/textSpan'; import { getTextBoxMappings } from './textBoxMapping'; import { TextBoxMapping, TextBoxMappingResult } from './textBoxMapping/types'; import { HtmlBboxTextLayout, PdfTextContentTextLayout, TextMappingsTextLayout } from './textLayout'; import { HtmlBboxInfo, TextLayout, TextLayoutCell } from './textLayout/types'; -import { spanOffset, START } from './common/textSpanUtils'; import { nonEmpty } from './common/nonEmpty'; const debugOut = require('debug')?.('pdf:Highlighter'); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts index edfc7967f..bb4b0f67c 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts @@ -1,5 +1,5 @@ import { TextSpan } from '../../types'; -import { END, spanLen, START } from './textSpanUtils'; 
+import { END, spanLen, START } from '../../../../utils/textSpan'; type SpanMapping = { rawSpan: TextSpan; normalizedSpan: TextSpan }; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts index 419f0836b..153a2e1b3 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts @@ -1,21 +1,4 @@ -import { bboxGetSpanByRatio, bboxIntersects, isNextToEachOther } from '../bboxUtils'; - -describe('bboxIntersects', () => { - it('should return true when boxes intersect', () => { - expect(bboxIntersects([10, 10, 20, 20], [15, 15, 25, 25])).toBeTruthy(); - }); - - it("should return false when boxes don't intersect", () => { - expect(bboxIntersects([10, 10, 20, 20], [15, 25, 25, 35])).toBeFalsy(); - }); - - it('should return false when one box is on another', () => { - expect(bboxIntersects([10, 10, 20, 20], [20, 10, 30, 20])).toBeFalsy(); - expect(bboxIntersects([10, 10, 20, 20], [0, 10, 10, 20])).toBeFalsy(); - expect(bboxIntersects([10, 10, 20, 20], [10, 20, 20, 30])).toBeFalsy(); - expect(bboxIntersects([10, 10, 20, 20], [10, 0, 20, 10])).toBeFalsy(); - }); -}); +import { bboxGetSpanByRatio, isNextToEachOther } from '../bboxUtils'; describe('bboxGetSpanByRatio', () => { it('should return proper bbox for spans on text', () => { diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts index f7e911f8e..533f0918c 
100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts @@ -1,21 +1,6 @@ -import { intersects } from 'components/DocumentPreview/utils/box'; import { Bbox, TextSpan } from '../../types'; -import { spanIntersection, spanLen } from './textSpanUtils'; - -export const LEFT = 0; -export const TOP = 1; -export const RIGHT = 2; -export const BOTTOM = 3; - -/** - * Check whether two bbox intersect - * @param boxA one bbox - * @param boxB another bbox - * @returns true iff boxA and boxB are overlapped - */ -export function bboxIntersects(boxA: Bbox, boxB: Bbox): boolean { - return intersects(boxA, boxB); -} +import { bboxesIntersect } from '../../../../utils/box'; +import { spanIntersection, spanLen } from '../../../../utils/textSpan'; /** * Get bbox for a text span assuming each character takes horizontal spaces evenly @@ -43,7 +28,7 @@ export function bboxGetSpanByRatio(bbox: Bbox, origLength: number, span: TextSpa * This is used to get a text of a line from a list of small text cells. 
*/ export function isNextToEachOther(boxA: Bbox, boxB: Bbox): boolean { - if (bboxIntersects(boxA, boxB)) { + if (bboxesIntersect(boxA, boxB)) { return false; } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts index 22dadc34a..337fe72d9 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts @@ -1,9 +1,9 @@ +import minBy from 'lodash/minBy'; +import { spanGetText, spanLen, START } from '../../../../utils/textSpan'; import { TextSpan } from '../../types'; -import { TextProvider } from './TextProvider'; import { TextNormalizer } from '../common/TextNormalizer'; -import minBy from 'lodash/minBy'; -import { spanGetText, spanLen, START } from '../common/textSpanUtils'; import { TextLayoutCell } from '../textLayout/types'; +import { TextProvider } from './TextProvider'; const debugOut = require('debug')?.('pdf:mapping:MappingSourceTextProvider'); function debug(...args: any) { diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts index 04bad0ac4..fe4434991 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts +++ 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts @@ -1,7 +1,7 @@ +import { END } from '../../../../utils/textSpan'; import { TextLayoutCellBase } from '../textLayout/types'; import { TextNormalizer } from '../common/TextNormalizer'; import { CellProvider } from './CellProvider'; -import { END } from '../common/textSpanUtils'; /** * Cell provider with normalization diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts index a326f6c30..68e0c33f6 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts @@ -1,16 +1,16 @@ -import { TextSpan } from '../../types'; -import { TextBoxMapping, TextBoxMappingEntry, TextBoxMappingResult } from './types'; import { Dictionary } from 'lodash'; import groupBy from 'lodash/groupBy'; +import { TextSpan } from '../../types'; import { spanCompare, spanFromSubSpan, spanGetSubSpan, spanIntersection, spanIntersects -} from '../common/textSpanUtils'; +} from '../../../../utils/textSpan'; import { TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; import { TextNormalizer } from '../common/TextNormalizer'; +import { TextBoxMapping, TextBoxMappingEntry, TextBoxMappingResult } from './types'; const debugOut = require('debug')?.('pdf:mapping:TextBoxMapping'); function debug(...args: any) { diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts index 2a5825240..9528b7669 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts @@ -6,7 +6,7 @@ import { spanIncludesIndex, spanGetText, spanIntersection -} from '../common/textSpanUtils'; +} from '../../../../utils/textSpan'; import { findLargestIndex } from '../common/findLargestIndex'; const MAX_HISTORY = 3; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts index fb0174ebc..1ca5d1329 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts @@ -1,8 +1,8 @@ import minBy from 'lodash/minBy'; import { TextSpan } from '../../types'; -import { bboxIntersects } from '../common/bboxUtils'; import { nonEmpty } from '../common/nonEmpty'; -import { spanLen, spanMerge } from '../common/textSpanUtils'; +import { bboxesIntersect } from '../../../../utils/box'; +import { spanLen, spanMerge } from '../../../../utils/textSpan'; import { TextLayout, TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; import { MappingSourceTextProvider } from './MappingSourceTextProvider'; import { MappingTargetBoxProvider } from './MappingTargetCellProvider'; @@ -116,7 +116,7 @@ class Source { const cells = sourceLayout.cells .map((sourceCell, 
index) => { - if (!bboxIntersects(sourceCell.bbox, targetCell.bbox)) { + if (!bboxesIntersect(sourceCell.bbox, targetCell.bbox)) { return null; } return { cell: sourceCell, provider: this.sourceProviders[index] }; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts index 099934ac2..7ad9ac173 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts @@ -1,7 +1,7 @@ +import { spanGetText, spanIntersection, spanOffset, START } from '../../../../utils/textSpan'; import { Bbox, TextSpan } from '../../types'; -import { TextLayout, TextLayoutCell, TextLayoutCellBase } from './types'; -import { spanGetText, spanIntersection, spanOffset, START } from '../common/textSpanUtils'; import { bboxGetSpanByRatio } from '../common/bboxUtils'; +import { TextLayout, TextLayoutCell, TextLayoutCellBase } from './types'; /** * Base implementation of text layout cell diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts index 0b5e53007..3399b6d33 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts @@ -1,6 +1,6 @@ +import { bboxesIntersect } from 
'components/DocumentPreview/utils/box'; import { PDFPageViewport, PDFPageViewportOptions, TextContentItem } from 'pdfjs-dist'; import { Bbox, TextSpan } from '../../types'; -import { bboxIntersects } from '../common/bboxUtils'; import { BaseTextLayoutCell } from './BaseTextLayout'; import { getAdjustedCellByOffsetByDom } from './dom'; import { HtmlBboxInfo, PdfTextContentInfo, TextLayout } from './types'; @@ -23,7 +23,7 @@ export class PdfTextContentTextLayout implements TextLayout { - return bboxIntersects(cellBbox, [bbox.left, bbox.top, bbox.right, bbox.bottom]); + return bboxesIntersect(cellBbox, [bbox.left, bbox.top, bbox.right, bbox.bottom]); }); } return new PdfTextContentTextLayoutCell(this, index, item, pageNum, cellBbox, isInHtmlBbox); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts index 53c7e7c81..dddb92c58 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts @@ -1,13 +1,13 @@ -import { Cell, CellField } from 'components/DocumentPreview/types'; -import { DocumentFields, DocumentFieldHighlight, TextSpan } from '../../types'; -import { getDocFieldValue } from '../common/documentUtils'; -import { TextBoxMappingResult } from '../textBoxMapping/types'; +import { Cell, CellField } from '../../../../types'; import { spanGetSubSpan, spanContains, spanIntersection, spanIntersects -} from '../common/textSpanUtils'; +} from '../../../../utils/textSpan'; +import { DocumentFields, DocumentFieldHighlight, TextSpan } from '../../types'; +import { getDocFieldValue } from 
'../common/documentUtils'; +import { TextBoxMappingResult } from '../textBoxMapping/types'; import { BaseTextLayoutCell } from './BaseTextLayout'; import { TextLayout, TextMappingInfo } from './types'; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts index 23dfe7d4a..0d461433e 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts @@ -1,7 +1,7 @@ import { forEachRectInRange, getTextNodeAndOffset } from 'utils/document/documentUtils'; import { Bbox, TextSpan } from '../../types'; -import { BOTTOM, LEFT, RIGHT, TOP } from '../common/bboxUtils'; -import { END, START } from '../common/textSpanUtils'; +import { BOTTOM, LEFT, RIGHT, TOP } from '../../../../utils/box'; +import { END, START } from '../../../../utils/textSpan'; import { TextLayoutCell } from './types'; const debugOut = require('debug')?.('pdf:textLayout:dom'); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/box.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/box.test.ts index 4c3bfeb78..e76b5f828 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/box.test.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/box.test.ts @@ -1,4 +1,4 @@ -import { findMatchingBbox } from '../box'; +import { findMatchingBbox, bboxesIntersect } from '../box'; import { CellPage } from '../../types'; const originalDocBbox = [ { @@ -329,4 +329,21 @@ describe('box', () => { ]; expect(findMatchingBbox(originalDocBbox[1] as CellPage, 
processedDocBbox)).toEqual(result); }); + + describe('bboxesIntersect', () => { + it('should return true when boxes intersect', () => { + expect(bboxesIntersect([10, 10, 20, 20], [15, 15, 25, 25])).toBeTruthy(); + }); + + it("should return false when boxes don't intersect", () => { + expect(bboxesIntersect([10, 10, 20, 20], [15, 25, 25, 35])).toBeFalsy(); + }); + + it('should return false when one box is on another', () => { + expect(bboxesIntersect([10, 10, 20, 20], [20, 10, 30, 20])).toBeFalsy(); + expect(bboxesIntersect([10, 10, 20, 20], [0, 10, 10, 20])).toBeFalsy(); + expect(bboxesIntersect([10, 10, 20, 20], [10, 20, 20, 30])).toBeFalsy(); + expect(bboxesIntersect([10, 10, 20, 20], [10, 0, 20, 10])).toBeFalsy(); + }); + }); }); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/textSpanUtils.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/textSpan.test.ts similarity index 98% rename from packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/textSpanUtils.test.ts rename to packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/textSpan.test.ts index 78c663ced..a1f7d18ad 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/textSpanUtils.test.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/textSpan.test.ts @@ -1,4 +1,3 @@ -import { TextSpan } from '../../../types'; import { spanCompare, spanContains, @@ -9,7 +8,8 @@ import { spanIntersection, spanIntersects, spanLen -} from '../textSpanUtils'; +} from '../textSpan'; +import { TextSpan } from '../../types'; describe('spanGetText', () => { it('should return valid span text', () => { diff --git a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts 
b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts index e5dd2643d..448eadf82 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts @@ -1,13 +1,18 @@ import { CellPage } from '../types'; import { ProcessedBbox } from '../../../utils/document/processDoc'; +export const LEFT = 0; +export const TOP = 1; +export const RIGHT = 2; +export const BOTTOM = 3; + /** * Check whether two bbox intersect * @param boxA first bbox * @param boxB second bbox * @returns bool */ -export function intersects(boxA: number[], boxB: number[]): boolean { +export function bboxesIntersect(boxA: number[], boxB: number[]): boolean { const [leftA, topA, rightA, bottomA, pageA] = boxA; const [leftB, topB, rightB, bottomB, pageB] = boxB; return !( @@ -28,7 +33,7 @@ export const findMatchingBbox = (docBox: CellPage, htmlBox: ProcessedBbox[]) => return htmlBox.filter(pBbox => { const { left, top, right, bottom, page } = pBbox; const [left2, top2, right2, bottom2] = docBox.bbox; - return intersects( + return bboxesIntersect( [left2, top2, right2, bottom2, docBox.page_number], [left, top, right, bottom, page] ); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/textSpan.ts similarity index 98% rename from packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts rename to packages/discovery-react-components/src/components/DocumentPreview/utils/textSpan.ts index e40acefce..2b0f543bf 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/textSpanUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/textSpan.ts @@ -1,4 
+1,4 @@ -import { TextSpan } from '../../types'; +import { TextSpan } from '../types'; export const START = 0; export const END = 1; From 0967b0511e94c57e3ba7cf5b1ab7d9261b1f1e3e Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Mon, 6 Dec 2021 21:29:27 +0900 Subject: [PATCH 44/51] fix: apply review comments --- .../PdfViewerHighlight/PdfViewerHighlight.tsx | 31 +++++++----------- .../PdfViewerWithHighlight.tsx | 32 +++++++------------ .../components/PdfViewerHighlight/types.ts | 5 +-- .../utils/common/TextNormalizer.ts | 6 ++-- 4 files changed, 26 insertions(+), 48 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx index 636e25ef4..e0e7c886a 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx @@ -1,15 +1,16 @@ import React, { FC, useMemo, useEffect } from 'react'; import cx from 'classnames'; -import { DocumentFieldHighlight } from './types'; +import { settings } from 'carbon-components'; import { QueryResult } from 'ibm-watson/discovery/v2'; +import { ProcessedDoc } from 'utils/document'; +import { TextMappings } from '../../types'; +import { PdfDisplayProps } from '../PdfViewer/types'; import { PdfRenderedText } from '../PdfViewer/PdfViewerTextLayer'; -import { Highlighter } from './utils/Highlighter'; +import { DocumentFieldHighlight } from './types'; import { ExtractedDocumentInfo } from './utils/common/documentUtils'; -import { settings } from 'carbon-components'; -import { TextMappings } from 'components/DocumentPreview/types'; -import { ProcessedDoc } from 'utils/document'; +import { Highlighter } from './utils/Highlighter'; -interface 
Props { +type Props = PdfDisplayProps & { /** * Class name to style highlight layer */ @@ -30,11 +31,6 @@ interface Props { */ parsedDocument: ExtractedDocumentInfo | null; - /** - * Current page, starting at index 1 - */ - pageNum: number; - /** * Highlight spans on fields in document */ @@ -45,11 +41,6 @@ interface Props { */ pdfRenderedText: PdfRenderedText | null; - /** - * Zoom factor, where `1` is equal to 100% - */ - scale?: number; - /** * Flag to whether or not to use bbox information from html field in the document. * True by default. This is for testing and debugging purpose. @@ -61,7 +52,7 @@ interface Props { * True by default. This is for testing and debugging purpose. */ usePdfTextItem?: boolean; -} +}; /** * Text highlight layer for PdfViewer @@ -71,10 +62,10 @@ const PdfViewerHighlight: FC = ({ highlightClassName, document, parsedDocument, - pageNum, + page, highlights, pdfRenderedText, - scale = 1.0, + scale, useHtmlBbox = true, usePdfTextItem = true }) => { @@ -83,7 +74,7 @@ const PdfViewerHighlight: FC = ({ textMappings: parsedDocument?.textMappings, processedDoc: useHtmlBbox ? 
parsedDocument?.processedDoc : undefined, pdfRenderedText: (usePdfTextItem && pdfRenderedText) || undefined, - pageNum + pageNum: page }); const { textDivs } = pdfRenderedText || {}; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx index cd32c1598..5960059d4 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx @@ -1,10 +1,11 @@ -import React, { FC, useState, useEffect } from 'react'; -import { DocumentFieldHighlight } from './types'; -import PdfViewer, { PdfViewerProps } from '../PdfViewer/PdfViewer'; -import PdfViewerHighlight from './PdfViewerHighlight'; -import { extractDocumentInfo, ExtractedDocumentInfo } from './utils/common/documentUtils'; +import React, { FC, useState, useCallback } from 'react'; import { QueryResult } from 'ibm-watson/discovery/v2'; +import useAsyncFunctionCall from 'utils/useAsyncFunctionCall'; +import PdfViewer, { PdfViewerProps } from '../PdfViewer/PdfViewer'; import { PdfRenderedText } from '../PdfViewer/PdfViewerTextLayer'; +import { DocumentFieldHighlight } from './types'; +import PdfViewerHighlight from './PdfViewerHighlight'; +import { extractDocumentInfo } from './utils/common/documentUtils'; interface Props extends PdfViewerProps { /** @@ -42,30 +43,19 @@ const PdfViewerWithHighlight: FC = ({ const { page, scale } = rest; const [renderedText, setRenderedText] = useState(null); - const [documentInfo, setDocumentInfo] = useState(null); - useEffect(() => { - let cancelled = false; - const extractDocInfo = async () => { - const info = await extractDocumentInfo(document); - if (!cancelled) { - 
setDocumentInfo(info); - } - }; - extractDocInfo(); - return () => { - cancelled = true; - }; - }, [document]); + const documentInfo = useAsyncFunctionCall( + useCallback(async () => await extractDocumentInfo(document), [document]) + ); const highlightReady = !!documentInfo && !!renderedText; return ( - + ' ', + normal: (_: string) => ' ', regexString: '\\s+' }; @@ -39,7 +39,7 @@ const DOUBLE_QUOTE: CharNormalizer = { }; const QUOTE: CharNormalizer = { - normal: () => "'", + normal: (_: string) => "'", regexString: `[${[ '‹', // U+2039 '›', // U+203A @@ -67,7 +67,7 @@ const SURROGATE_PAIR: CharNormalizer = { // NOTE: we may have to do this after conversion again // str.normalize("NFD").replace(/[\u0300-\u036f]/g, "") const DIACRITICAL_MARK: CharNormalizer = { - normal: () => '', + normal: (_: string) => '', regexString: '[\u0300-\u036f]' }; const DIACRITICAL_MARK_REGEX = new RegExp(DIACRITICAL_MARK.regexString, 'g'); From bb448d5b37cbbc70a2354bd32813368c696718f9 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Mon, 6 Dec 2021 22:00:15 +0900 Subject: [PATCH 45/51] fix: remove unnecessary change --- packages/discovery-react-components/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/discovery-react-components/package.json b/packages/discovery-react-components/package.json index f4c8bd4e6..6bd4f8468 100644 --- a/packages/discovery-react-components/package.json +++ b/packages/discovery-react-components/package.json @@ -18,7 +18,7 @@ "eslint": "yarn run g:eslint --quiet '{src,.storybook}/**/*.{js,jsx,ts,tsx}'", "lint": "yarn run circular && yarn run eslint", "start": "rollup -c -w", - "storybook": "../../node_modules/.bin/start-storybook --ci --port=9002", + "storybook": "start-storybook --ci --port=9002", "storybook:build": "build-storybook", "storybook:build:release": "cross-env STORYBOOK_BUILD_MODE=production build-storybook -o ../../docs/storybook", "analyze": "yarn run g:analyze 'dist/index.js'", From 
d651ea81863f7f09b044296c4cef9a477ab19c7e Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Mon, 6 Dec 2021 22:00:27 +0900 Subject: [PATCH 46/51] chore: add comment --- .../components/PdfViewerHighlight/utils/Highlighter.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts index 7afa139bf..b977bbda0 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts @@ -19,6 +19,9 @@ function debug(...args: any) { debugOut?.apply(null, args); } +/** + * Highlighter - calculate highlight bbox from spans on text fields + */ export class Highlighter { readonly pageNum: number; private readonly textMappingsLayout: TextMappingsTextLayout; From e1fe864bed333975123024e82165ba73f158955b Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Tue, 7 Dec 2021 15:36:26 +0900 Subject: [PATCH 47/51] chore: update yarn lock file --- yarn.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yarn.lock b/yarn.lock index b047d1520..59fa173ae 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2267,7 +2267,7 @@ __metadata: languageName: node linkType: hard -"@ibm-watson/discovery-react-components@^1.5.0-beta.3, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": +"@ibm-watson/discovery-react-components@^1.5.0-beta.4, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": version: 0.0.0-use.local resolution: "@ibm-watson/discovery-react-components@workspace:packages/discovery-react-components" dependencies: @@ -2305,7 +2305,7 @@ __metadata: languageName: unknown linkType: soft 
-"@ibm-watson/discovery-styles@^1.5.0-beta.2, @ibm-watson/discovery-styles@workspace:packages/discovery-styles": +"@ibm-watson/discovery-styles@^1.5.0-beta.4, @ibm-watson/discovery-styles@workspace:packages/discovery-styles": version: 0.0.0-use.local resolution: "@ibm-watson/discovery-styles@workspace:packages/discovery-styles" dependencies: @@ -10260,8 +10260,8 @@ __metadata: resolution: "discovery-search-app@workspace:examples/discovery-search-app" dependencies: "@carbon/icons": ^10.5.0 - "@ibm-watson/discovery-react-components": ^1.5.0-beta.3 - "@ibm-watson/discovery-styles": ^1.5.0-beta.2 + "@ibm-watson/discovery-react-components": ^1.5.0-beta.4 + "@ibm-watson/discovery-styles": ^1.5.0-beta.4 body-parser: ^1.19.0 carbon-components: ^10.6.0 carbon-components-react: ^7.7.0 From 0911538d0cf89a11a81b40795ca0c863465091de Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Tue, 7 Dec 2021 19:04:08 +0900 Subject: [PATCH 48/51] fix: apply review comments --- .../PdfViewerHighlight/PdfViewerHighlight.tsx | 12 +++--- .../PdfViewerWithHighlight.stories.scss | 13 +++--- .../PdfViewerWithHighlight.tsx | 8 ++-- .../PdfViewerHighlight/utils/Highlighter.ts | 2 +- .../utils/common/bboxUtils.ts | 8 +++- .../utils/textBoxMapping/CellProvider.ts | 12 ++++-- .../MappingTargetCellProvider.ts | 12 ++++-- .../utils/textBoxMapping/TextBoxMapping.ts | 18 +++++--- .../utils/textBoxMapping/TextProvider.ts | 19 ++++++-- .../utils/textBoxMapping/getTextBoxMapping.ts | 2 +- .../utils/textLayout/BaseTextLayout.ts | 28 +++++++++--- .../utils/textLayout/HtmlBboxTextLayout.ts | 8 +++- .../textLayout/PdfTextContentTextLayout.ts | 24 ++++++++--- .../textLayout/TextMappingsTextLayout.ts | 4 +- .../utils/textLayout/dom.ts | 6 +-- .../utils/textLayout/types.ts | 43 +++++++++++++++---- .../src/components/DocumentPreview/types.ts | 2 +- .../components/DocumentPreview/utils/box.ts | 5 --- .../utils/common => utils}/nonEmpty.ts | 0 .../_document-preview-pdf-viewer.scss | 6 ++- 20 files changed, 160 
insertions(+), 72 deletions(-) rename packages/discovery-react-components/src/{components/DocumentPreview/components/PdfViewerHighlight/utils/common => utils}/nonEmpty.ts (100%) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx index e0e7c886a..5ab03b7ac 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx @@ -45,13 +45,13 @@ type Props = PdfDisplayProps & { * Flag to whether or not to use bbox information from html field in the document. * True by default. This is for testing and debugging purpose. */ - useHtmlBbox?: boolean; + _useHtmlBbox?: boolean; /** * Flag to whether to use PDF text items for finding bbox for highlighting. * True by default. This is for testing and debugging purpose. */ - usePdfTextItem?: boolean; + _usePdfTextItem?: boolean; }; /** @@ -66,14 +66,14 @@ const PdfViewerHighlight: FC = ({ highlights, pdfRenderedText, scale, - useHtmlBbox = true, - usePdfTextItem = true + _useHtmlBbox = true, + _usePdfTextItem = true }) => { const highlighter = useHighlighter({ document, textMappings: parsedDocument?.textMappings, - processedDoc: useHtmlBbox ? parsedDocument?.processedDoc : undefined, - pdfRenderedText: (usePdfTextItem && pdfRenderedText) || undefined, + processedDoc: _useHtmlBbox ? 
parsedDocument?.processedDoc : undefined, + pdfRenderedText: (_usePdfTextItem && pdfRenderedText) || undefined, pageNum: page }); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss index 6de187694..5703cca25 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss @@ -1,10 +1,13 @@ +// Carbon highlight color for white theme +// https://www.carbondesignsystem.com/guidelines/color/usage/ +$highlight: #d0e2ff; + .withTextSelection { display: flex; - height: 800px; .rightPane { - flex: 1 1 auto; - width: 20%; + flex: 1 1 30%; + height: 100vh; overflow-y: scroll; p { @@ -19,7 +22,7 @@ } .highlight { - opacity: 0.4; - background: rgba(255, 64, 128, 1); + opacity: 0.3; + background: darken($highlight, 30%); } } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx index 5960059d4..1a2955b1a 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx @@ -27,7 +27,7 @@ interface Props extends PdfViewerProps { * Consider bboxes in HTML field to highlight. * True by default. This is for testing purpose. 
*/ - useHtmlBbox?: boolean; + _useHtmlBbox?: boolean; } /** @@ -37,7 +37,7 @@ const PdfViewerWithHighlight: FC = ({ highlightClassName, document, highlights, - useHtmlBbox, + _useHtmlBbox, ...rest }) => { const { page, scale } = rest; @@ -58,8 +58,8 @@ const PdfViewerWithHighlight: FC = ({ page={page} highlights={highlights} scale={scale} - useHtmlBbox={useHtmlBbox} - usePdfTextItem={true} + _useHtmlBbox={_useHtmlBbox} + _usePdfTextItem={true} /> ); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts index b977bbda0..0dc14f75b 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts @@ -1,6 +1,7 @@ import { TextMappings } from 'components/DocumentPreview/types'; import flatMap from 'lodash/flatMap'; import { PDFPageViewport, TextContent } from 'pdfjs-dist'; +import { nonEmpty } from 'utils/nonEmpty'; import { DocumentFields, DocumentFieldHighlight, @@ -12,7 +13,6 @@ import { getTextBoxMappings } from './textBoxMapping'; import { TextBoxMapping, TextBoxMappingResult } from './textBoxMapping/types'; import { HtmlBboxTextLayout, PdfTextContentTextLayout, TextMappingsTextLayout } from './textLayout'; import { HtmlBboxInfo, TextLayout, TextLayoutCell } from './textLayout/types'; -import { nonEmpty } from './common/nonEmpty'; const debugOut = require('debug')?.('pdf:Highlighter'); function debug(...args: any) { diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts index 
533f0918c..cf37e0f34 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/bboxUtils.ts @@ -28,6 +28,13 @@ export function bboxGetSpanByRatio(bbox: Bbox, origLength: number, span: TextSpa * This is used to get a text of a line from a list of small text cells. */ export function isNextToEachOther(boxA: Bbox, boxB: Bbox): boolean { + // + // The ratio of height used to check whether two bboxes are on the same line or not. + // With the value 0.8, when more than 80% of range of height of each bbox overlaps + // one of another, they are considered on the same line. + // + const OVERLAP_RATIO = 0.8; + if (bboxesIntersect(boxA, boxB)) { return false; } @@ -38,7 +45,6 @@ export function isNextToEachOther(boxA: Bbox, boxB: Bbox): boolean { const heightB = bottomB - topB; // compare height ratio - const OVERLAP_RATIO = 0.8; if (!(heightA * OVERLAP_RATIO < heightB || heightB * OVERLAP_RATIO < heightA)) { return false; } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts index 73dd4ffe8..dd8e2f932 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/CellProvider.ts @@ -21,7 +21,9 @@ export class CellProvider { return this.cursor < this.cells.length; } - /** get cells on a line */ + /** + * get cells on a line + */ private getNextCells(): TextLayoutCellBase[] { const { cells: lastCells, @@ -60,7 +62,9 @@ export class CellProvider { result: 
TextLayoutCellBase[]; } | null = null; - /** get text from cells on a line */ + /** + * get text from cells on a line + */ getNextText(): { texts: string[]; nextCellIndex: number } { const nextCells = this.getNextCells(); const texts = nextCells.map(cell => cell.text); @@ -99,7 +103,9 @@ export class CellProvider { return result; } - /** skip the current cell */ + /** + * skip the current cell + */ skip() { this.skippedCells.push(this.cells[this.cursor]); this.cursor += 1; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts index fe4434991..ff6984f6e 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingTargetCellProvider.ts @@ -19,7 +19,9 @@ export class MappingTargetBoxProvider { this.cellProvider = new CellProvider(cells); } - /** check whether this provider has another item to visit or not */ + /** + * check whether this provider has another item to visit or not + */ hasNext(): boolean { while (this.cellProvider.hasNext()) { const { texts, nextCellIndex } = this.cellProvider.getNextText(); @@ -41,7 +43,9 @@ export class MappingTargetBoxProvider { return false; } - /** get the next value */ + /** + * get the next value + */ getNextInfo(): { text: string; index: number } { return { text: this.current!.normalizer.normalizedText, @@ -60,7 +64,9 @@ export class MappingTargetBoxProvider { return this.cellProvider.consume(rawLength); } - /** mark the current cell skipped (when no match found in source) */ + /** + * mark the current cell skipped (when no match found in source) + */ 
skip() { this.current = null; this.cellProvider.skip(); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts index 68e0c33f6..00aed9e40 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextBoxMapping.ts @@ -18,7 +18,8 @@ function debug(...args: any) { } /** - * Text box mapping + * Text box mapping. Mapping between cells (i.e. text box) in a TextLayout + * to ones in another TextLayout. */ class TextBoxMappingImpl implements TextBoxMapping { private readonly mappingEntryMap: Dictionary; @@ -34,7 +35,9 @@ class TextBoxMappingImpl implements TextBoxMapping { debug(this); } - /** get text mapping entries for a given span `spanInSourceCell` on a given `sourceCell` */ + /** + * get text mapping entries for a given span `spanInSourceCell` on a given `sourceCell` + */ private getEntries( sourceCell: TextLayoutCell, spanOnSourceCell: TextSpan @@ -44,7 +47,9 @@ class TextBoxMappingImpl implements TextBoxMapping { ); } - /** @inheritdoc */ + /** + * @inheritdoc + */ apply(source: TextLayoutCellBase, aSpan?: TextSpan): TextBoxMappingResult { const span: TextSpan = aSpan || [0, source.text.length]; @@ -84,18 +89,19 @@ class TextBoxMappingImpl implements TextBoxMapping { } /** - * Text mapping builder + * Builder for the TextMapping */ export class TextBoxMappingBuilder { mappingEntries: TextBoxMappingEntry[] = []; - /** add new mapping data */ + /** + * add new mapping data + */ addMapping(text: TextBoxMappingEntry['text'], box: TextBoxMappingEntry['box']) { this.mappingEntries.push({ text, box }); debug('>> added a new mapping 
entry (%o) => (cell: %o)', text, text, box?.cell); } - /** get TextBoxMapping */ toTextBoxMapping() { return new TextBoxMappingImpl(this.mappingEntries); } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts index 9528b7669..700be90ce 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts @@ -12,13 +12,24 @@ import { findLargestIndex } from '../common/findLargestIndex'; const MAX_HISTORY = 3; export type TextMatch = { - /** matched text span */ + /** + * matched text span + */ span: TextSpan; - /** text before the matched text. i.e. text that will be skipped by using this match */ + + /** + * text before the matched text. i.e. 
text that will be skipped by using this match + */ skipText: string; - /** distance from the nearest cursors */ + + /** + * distance from the nearest cursors + */ minHistoryDistance: number; - /** text after the matched text */ + + /** + * text after the matched text + */ textAfterEnd: string; }; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts index 1ca5d1329..44f0f0b1d 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts @@ -1,6 +1,6 @@ import minBy from 'lodash/minBy'; +import { nonEmpty } from 'utils/nonEmpty'; import { TextSpan } from '../../types'; -import { nonEmpty } from '../common/nonEmpty'; import { bboxesIntersect } from '../../../../utils/box'; import { spanLen, spanMerge } from '../../../../utils/textSpan'; import { TextLayout, TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts index 7ad9ac173..8732f4571 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts @@ -35,17 +35,23 @@ export class BaseTextLayoutCell> this.text = text; } - /** @inheritdoc */ + /** + * 
@inheritdoc + */ getPartial(span: TextSpan): TextLayoutCellBase { return new PartialTextLayoutCell(this, span); } - /** @inheritdoc */ + /** + * @inheritdoc + */ getNormalized(): { cell: TextLayoutCell; span?: TextSpan } { return { cell: this }; } - /** @inheritdoc */ + /** + * @inheritdoc + */ getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { if (options?.useRatio) { return bboxGetSpanByRatio(this.bbox, this.text.length, span); @@ -53,7 +59,9 @@ export class BaseTextLayoutCell> return null; } - /** @inheritdoc */ + /** + * @inheritdoc + */ trim(): TextLayoutCellBase { return trimCell(this); } @@ -76,18 +84,24 @@ export class PartialTextLayoutCell implements TextLayoutCellBase { return spanGetText(this.base.text, this.span); } - /** @inheritdoc */ + /** + * @inheritdoc + */ getPartial(span: TextSpan): TextLayoutCellBase { const newSpan = spanIntersection(this.span, spanOffset(span, this.span[START])); return new PartialTextLayoutCell(this.base, newSpan); } - /** @inheritdoc */ + /** + * @inheritdoc + */ getNormalized() { return { cell: this.base, span: this.span }; } - /** @inheritdoc */ + /** + * @inheritdoc + */ trim(): TextLayoutCellBase { return trimCell(this); } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts index 359120466..db60ce738 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts @@ -21,7 +21,9 @@ export class HtmlBboxTextLayout implements TextLayout { }) ?? 
[]; } - /** @inheritdoc */ + /** + * @inheritdoc + */ cellAt(id: number) { return this.cells[id]; } @@ -57,7 +59,9 @@ class HtmlBboxTextLayoutCell extends BaseTextLayoutCell { this.processedBbox = processedBbox; // keep this for later improvement } - /** @inheritdoc */ + /** + * @inheritdoc + */ getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { if (this.processedBbox != null) { // TODO: implement this. calculate bbox for text span using text on browser diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts index 3399b6d33..a57dd011c 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts @@ -30,22 +30,30 @@ export class PdfTextContentTextLayout implements TextLayout { - /** @inheritdoc */ + /** + * @inheritdoc + */ readonly isInHtmlBbox?: boolean; constructor( @@ -72,7 +82,9 @@ class PdfTextContentTextLayoutCell extends BaseTextLayoutCell { diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts index a568253df..3dc3e9776 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts @@ -7,9 +7,14 @@ import { Bbox, DocumentFields, TextSpan } from 
'../../types'; * Text layout information */ export interface TextLayout { - /** cells, paris of bbox and text, of this text layout */ + /** + * cells, paris of bbox and text, of this text layout + */ readonly cells: CellType[]; - /** get cell by ID */ + + /** + * get cell by ID + */ cellAt(id: CellType['id']): CellType; } @@ -18,10 +23,17 @@ export interface TextLayout { */ export interface TextLayoutCell extends TextLayoutCellBase { readonly parent: TextLayout; - /** ID to identify this cell in */ + + /** + * ID to identify this cell in + */ readonly id: IDType; - /** text of this cell */ + + /** + * text of this cell + */ readonly text: string; + readonly pageNum: number; readonly bbox: Bbox; @@ -31,7 +43,9 @@ export interface TextLayoutCell extends TextLayoutCellBase { */ getBboxForTextSpan(span: TextSpan, options?: { useRatio?: boolean }): Bbox | null; - /** a special property for PDF text content item cell. True when this cell overlaps HTML cell */ + /** + * a special property for PDF text content item cell. True when this cell overlaps HTML cell + */ readonly isInHtmlBbox?: boolean; } @@ -40,13 +54,24 @@ export interface TextLayoutCell extends TextLayoutCellBase { * Mainly for sub-string of a text layout cell. 
*/ export interface TextLayoutCellBase { - /** text of this cell */ + /** + * text of this cell + */ readonly text: string; - /** get sub-span of this text layout */ + + /** + * get sub-span of this text layout + */ getPartial(span: TextSpan): TextLayoutCellBase; - /** get normalized form, the base text layout cell and a span on it */ + + /** + * get normalized form, the base text layout cell and a span on it + */ getNormalized(): { cell: TextLayoutCell; span?: TextSpan }; - /** get cell for the trimmed text */ + + /** + * get cell for the trimmed text + */ trim(): TextLayoutCellBase; } diff --git a/packages/discovery-react-components/src/components/DocumentPreview/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/types.ts index 70bdf5ffe..686203d42 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/types.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/types.ts @@ -6,7 +6,7 @@ export interface TextMappings { // [ left, top, right, bottom ] export type Bbox = [number, number, number, number]; -/** [ start (inclusive), end (exclusive) ] */ +// [ start (inclusive), end (exclusive) ] export type TextSpan = [number, number]; export type Origin = 'TopLeft' | 'BottomLeft'; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts index 448eadf82..33005c151 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts @@ -1,11 +1,6 @@ import { CellPage } from '../types'; import { ProcessedBbox } from '../../../utils/document/processDoc'; -export const LEFT = 0; -export const TOP = 1; -export const RIGHT = 2; -export const BOTTOM = 3; - /** * Check whether two bbox intersect * @param boxA first bbox diff --git 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts b/packages/discovery-react-components/src/utils/nonEmpty.ts similarity index 100% rename from packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/nonEmpty.ts rename to packages/discovery-react-components/src/utils/nonEmpty.ts diff --git a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss index fb66bb360..28a1d3544 100644 --- a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss +++ b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss @@ -1,4 +1,6 @@ @import './pdfjs_web_mixins'; +@import '../../vars'; +@import './mixins'; .#{$prefix}--document-preview-pdf-viewer { position: relative; @@ -22,6 +24,6 @@ .#{$prefix}--document-preview-pdf-viewer-highlight--item { position: absolute; - opacity: 0.5; - background: rgba(0, 0, 255, 1); + opacity: 0.3; + background: darken($highlight, 30%); } From 457f2ec67d55a5a817be4b3f7448d1e8637eb2a0 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Tue, 7 Dec 2021 19:42:18 +0900 Subject: [PATCH 49/51] refactor: extract common props --- .../PdfViewerHighlight/PdfViewerHighlight.tsx | 58 +++++-------------- .../PdfViewerWithHighlight.tsx | 29 ++-------- .../components/PdfViewerHighlight/types.ts | 30 ++++++++++ 3 files changed, 50 insertions(+), 67 deletions(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx index 5ab03b7ac..326b33afa 100644 --- 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx @@ -6,53 +6,27 @@ import { ProcessedDoc } from 'utils/document'; import { TextMappings } from '../../types'; import { PdfDisplayProps } from '../PdfViewer/types'; import { PdfRenderedText } from '../PdfViewer/PdfViewerTextLayer'; -import { DocumentFieldHighlight } from './types'; import { ExtractedDocumentInfo } from './utils/common/documentUtils'; import { Highlighter } from './utils/Highlighter'; +import { HighlightProps } from './types'; -type Props = PdfDisplayProps & { - /** - * Class name to style highlight layer - */ - className?: string; +type Props = PdfDisplayProps & + HighlightProps & { + /** + * Class name to style highlight layer + */ + className?: string; - /** - * Class name to style each highlight - */ - highlightClassName?: string; + /** + * Parsed document information + */ + parsedDocument: ExtractedDocumentInfo | null; - /** - * Document data returned by query - */ - document: QueryResult; - - /** - * Parsed document information - */ - parsedDocument: ExtractedDocumentInfo | null; - - /** - * Highlight spans on fields in document - */ - highlights: DocumentFieldHighlight[]; - - /** - * PDF text content information in a page from parsed PDF - */ - pdfRenderedText: PdfRenderedText | null; - - /** - * Flag to whether or not to use bbox information from html field in the document. - * True by default. This is for testing and debugging purpose. - */ - _useHtmlBbox?: boolean; - - /** - * Flag to whether to use PDF text items for finding bbox for highlighting. - * True by default. This is for testing and debugging purpose. 
- */ - _usePdfTextItem?: boolean; -}; + /** + * PDF text content information in a page from parsed PDF + */ + pdfRenderedText: PdfRenderedText | null; + }; /** * Text highlight layer for PdfViewer diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx index 1a2955b1a..b0c4a3661 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx @@ -1,34 +1,12 @@ import React, { FC, useState, useCallback } from 'react'; -import { QueryResult } from 'ibm-watson/discovery/v2'; import useAsyncFunctionCall from 'utils/useAsyncFunctionCall'; import PdfViewer, { PdfViewerProps } from '../PdfViewer/PdfViewer'; import { PdfRenderedText } from '../PdfViewer/PdfViewerTextLayer'; -import { DocumentFieldHighlight } from './types'; import PdfViewerHighlight from './PdfViewerHighlight'; import { extractDocumentInfo } from './utils/common/documentUtils'; +import { HighlightProps } from './types'; -interface Props extends PdfViewerProps { - /** - * Class name to style each highlight - */ - highlightClassName?: string; - - /** - * Document data returned by query - */ - document: QueryResult; - - /** - * Highlight spans on fields in document - */ - highlights: DocumentFieldHighlight[]; - - /** - * Consider bboxes in HTML field to highlight. - * True by default. This is for testing purpose. 
- */ - _useHtmlBbox?: boolean; -} +type Props = PdfViewerProps & HighlightProps; /** * PDF viewer component with text highlighting capability @@ -38,6 +16,7 @@ const PdfViewerWithHighlight: FC = ({ document, highlights, _useHtmlBbox, + _usePdfTextItem, ...rest }) => { const { page, scale } = rest; @@ -59,7 +38,7 @@ const PdfViewerWithHighlight: FC = ({ highlights={highlights} scale={scale} _useHtmlBbox={_useHtmlBbox} - _usePdfTextItem={true} + _usePdfTextItem={_usePdfTextItem} /> ); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts index 962b2b9eb..a6936c80b 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts @@ -1,5 +1,6 @@ import { Bbox as DocPreviewBbox, TextSpan as DocPreviewTextSpan } from '../../types'; import { Location } from 'utils/document/processDoc'; +import { QueryResult } from 'ibm-watson/discovery/v2'; // (re-)export useful types export type Bbox = DocPreviewBbox; @@ -37,3 +38,32 @@ export interface HighlightShapeBox { isStart?: boolean; isEnd?: boolean; } + +export interface HighlightProps { + /** + * Class name to style each highlight + */ + highlightClassName?: string; + + /** + * Document data returned by query + */ + document: QueryResult; + + /** + * Highlight spans on fields in document + */ + highlights: DocumentFieldHighlight[]; + + /** + * Consider bboxes in HTML field to highlight. + * True by default. This is for testing purpose. + */ + _useHtmlBbox?: boolean; + + /** + * Flag to whether to use PDF text items for finding bbox for highlighting. + * True by default. This is for testing and debugging purpose. 
+ */ + _usePdfTextItem?: boolean; +} From 89e4f753d42e3e77feee2be264c5a31d22540750 Mon Sep 17 00:00:00 2001 From: Susumu Fukuda Date: Tue, 7 Dec 2021 20:01:10 +0900 Subject: [PATCH 50/51] feat: export PdfViewerWithHighlight via DocPreview --- .../src/components/DocumentPreview/DocumentPreview.tsx | 2 ++ .../components/PdfViewerHighlight/PdfViewerWithHighlight.tsx | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx b/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx index b06e33612..342cce130 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx @@ -8,6 +8,7 @@ import SimpleDocument from './components/SimpleDocument/SimpleDocument'; import withErrorBoundary, { WithErrorBoundaryProps } from 'utils/hoc/withErrorBoundary'; import { defaultMessages, Messages } from './messages'; import HtmlView from './components/HtmlView/HtmlView'; +import PdfViewerWithHighlight from './components/PdfViewerHighlight/PdfViewerWithHighlight'; import { isCsvFile, isJsonFile } from './utils/documentData'; const { ZOOM_IN, ZOOM_OUT } = PreviewToolbar; @@ -154,6 +155,7 @@ function PreviewDocument({ const ErrorBoundDocumentPreview: any = withErrorBoundary(DocumentPreview); ErrorBoundDocumentPreview.PreviewToolbar = PreviewToolbar; ErrorBoundDocumentPreview.PreviewDocument = PreviewDocument; +ErrorBoundDocumentPreview.PdfViewerWithHighlight = PdfViewerWithHighlight; export default ErrorBoundDocumentPreview; export { ErrorBoundDocumentPreview as DocumentPreview }; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx index b0c4a3661..e706cb321 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx @@ -32,7 +32,7 @@ const PdfViewerWithHighlight: FC = ({ Date: Wed, 8 Dec 2021 21:55:57 +0900 Subject: [PATCH 51/51] fix: fix app build --- examples/discovery-search-app/package.json | 1 + yarn.lock | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/discovery-search-app/package.json b/examples/discovery-search-app/package.json index 283978d2a..0e67e055f 100644 --- a/examples/discovery-search-app/package.json +++ b/examples/discovery-search-app/package.json @@ -28,6 +28,7 @@ "carbon-components": "^10.6.0", "carbon-components-react": "^7.7.0", "classnames": "^2.2.6", + "core-js": "^2.6.12", "cors": "^2.8.5", "dotenv": "^8.1.0", "express": "^4.17.1", diff --git a/yarn.lock b/yarn.lock index 59fa173ae..cf9abdb5f 100644 --- a/yarn.lock +++ b/yarn.lock @@ -9102,7 +9102,7 @@ __metadata: languageName: node linkType: hard -"core-js@npm:^2.4.0": +"core-js@npm:^2.4.0, core-js@npm:^2.6.12": version: 2.6.12 resolution: "core-js@npm:2.6.12" checksum: 44fa9934a85f8c78d61e0c8b7b22436330471ffe59ec5076fe7f324d6e8cf7f824b14b1c81ca73608b13bdb0fef035bd820989bf059767ad6fa13123bb8bd016 @@ -10266,6 +10266,7 @@ __metadata: carbon-components: ^10.6.0 carbon-components-react: ^7.7.0 classnames: ^2.2.6 + core-js: ^2.6.12 cors: ^2.8.5 cross-env: ^7.0.3 dotenv: ^8.1.0