Skip to content

Commit dbce1f2

Browse files
authored
Merge branch 'cppdoc-cc:main' into main
2 parents e9a541b + 186f8f1 commit dbce1f2

File tree

2 files changed

+177
-10
lines changed

2 files changed

+177
-10
lines changed

migrate/migrate-bot.ts

Lines changed: 34 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import path, { join } from "path";
55
import { fileURLToPath } from "url";
66
import { execSync, spawnSync } from "child_process";
77
import { visualizeTextDiff } from "./text-diff-visualizer";
8+
import { getTextFromDOM } from "./text-from-element";
89

910
const __dirname = path.dirname(fileURLToPath(import.meta.url));
1011

@@ -69,7 +70,17 @@ async function fetchPageContent(
6970
const html = await response.text();
7071
const dom = new JSDOM(html);
7172
const contentElement = dom.window.document.querySelector("#mw-content-text");
72-
contentElement?.querySelector(".t-navbar")?.remove();
73+
74+
const selectorsToRemove = [
75+
".t-navbar",
76+
".t-example-live-link",
77+
".editsection",
78+
"#toc",
79+
];
80+
for (const selector of selectorsToRemove) {
81+
const elements = contentElement?.querySelectorAll(selector);
82+
elements?.forEach((el) => el.remove());
83+
}
7384
const headingElement = dom.window.document.querySelector("#firstHeading");
7485
if (!contentElement) {
7586
throw new Error("Could not find #mw-content-text");
@@ -78,7 +89,7 @@ async function fetchPageContent(
7889
html: contentElement.innerHTML,
7990
title: headingElement?.textContent?.trim() || "",
8091
url,
81-
innerText: contentElement.textContent?.trim() || "",
92+
innerText: getTextFromDOM(contentElement),
8293
};
8394
}
8495

@@ -91,7 +102,7 @@ async function convertToMDX(
91102
"{{LLM_DOCS}}",
92103
await readFile(
93104
__dirname +
94-
"/../src/content/docs/development/guide/component-docs-for-llm.mdx",
105+
"/../src/content/docs/development/guide/component-docs-for-llm.mdx",
95106
"utf8"
96107
)
97108
);
@@ -295,7 +306,14 @@ async function createPullRequest(
295306
.then((data) => {
296307
const dom = new JSDOM(data);
297308
const contentElement = dom.window.document.querySelector("main");
298-
return contentElement?.textContent?.trim() || "";
309+
const selectorsToRemove = [".sl-anchor-link"];
310+
for (const selector of selectorsToRemove) {
311+
const elements = contentElement?.querySelectorAll(selector);
312+
elements?.forEach((el) => el.remove());
313+
}
314+
315+
if (!contentElement) return "";
316+
return getTextFromDOM(contentElement);
299317
})
300318
.catch(() => "");
301319

@@ -423,19 +441,25 @@ async function main() {
423441
console.log(` 写入 ${filePath}`);
424442
await writeMDXFile(filePath, mdx, title);
425443

426-
console.log(` 尝试构建...`);
444+
console.log(` 重新格式化...`);
445+
spawnSync(`npm`, ["run", "format"], {
446+
stdio: "inherit",
447+
shell: true,
448+
});
449+
450+
console.log(` 构建...`);
427451
const res = spawnSync(`npm`, ["run", "build"], {
428452
stdio: "inherit",
429453
shell: true,
430454
});
431455
if (res.status !== 0) {
432456
throw new Error(
433457
"构建失败,可能生成的MDX有问题:" +
434-
res.stderr?.toString() +
435-
res.stdout?.toString() +
436-
res.error?.toString() +
437-
" exit code " +
438-
res.status
458+
res.stderr?.toString() +
459+
res.stdout?.toString() +
460+
res.error?.toString() +
461+
" exit code " +
462+
res.status
439463
);
440464
}
441465

migrate/text-from-element.ts

Lines changed: 143 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,143 @@
1+
import { JSDOM } from "jsdom"
2+
3+
/**
 * Options accepted by `getTextFromDOM`.
 *
 * Every field is optional; `getTextFromDOM` treats an omitted field as `true`.
 * The fields are read-only because the options object is only ever read.
 */
interface GetTextOptions {
  /** Separate block-level elements with a newline (otherwise with a space). */
  readonly treatBlockAsNewline?: boolean;
  /** Collapse runs of spaces/tabs to one space and 3+ newlines to two. */
  readonly collapseSpaces?: boolean;
  /** Trim leading and trailing whitespace from the final string. */
  readonly trimResult?: boolean;
}
8+
9+
const BLOCK_ELEMENTS = [
10+
'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
11+
'ul', 'ol', 'li', 'table', 'tr', 'td', 'th',
12+
'section', 'article', 'header', 'footer', 'nav',
13+
'aside', 'main', 'figure', 'figcaption', 'blockquote',
14+
'pre', 'form', 'fieldset', 'legend', 'dl', 'dt', 'dd',
15+
'hr', 'br'
16+
];
17+
18+
// Take the DOM `Node` constructor from a throwaway, empty jsdom window so that
// `Node.ELEMENT_NODE` / `Node.TEXT_NODE` can be referenced in this module.
const Node = new JSDOM('').window.Node;
19+
/**
 * Reports whether `node` should be treated as a block-level element.
 *
 * An element qualifies when its tag is listed in `BLOCK_ELEMENTS`, or when its
 * computed `display` is `block`, `flex`, `grid`, or any `table*` value.
 *
 * @param node - Any DOM node; non-element nodes are never blocks.
 * @returns `true` for block-level elements, `false` otherwise.
 */
function isBlockElement(node: Node): boolean {
  if (node.nodeType !== Node.ELEMENT_NODE) return false;
  const element = node as HTMLElement;

  if (BLOCK_ELEMENTS.includes(element.tagName.toLowerCase())) {
    return true;
  }

  // No `window` is defined in this module, so a bare `window` reference throws
  // outside a browser. Use the window that owns this element's document.
  const view = element.ownerDocument.defaultView;
  if (!view) {
    // Document without a window: no styles can be computed, so rely on the
    // tag list alone.
    return false;
  }

  const display = view.getComputedStyle(element).display;
  return (
    display === 'block' ||
    display === 'flex' ||
    display === 'grid' ||
    display.startsWith('table')
  );
}
33+
34+
/**
 * Reports whether `element` is visible according to its computed style.
 *
 * An element is considered hidden when `display` is `none`, `visibility` is
 * `hidden`, or `opacity` is the string `'0'`.
 *
 * @param element - The element to inspect.
 * @returns `false` only when one of the hiding styles above applies.
 */
function isElementVisible(element: HTMLElement): boolean {
  // No `window` is defined in this module, so a bare `window` reference throws
  // outside a browser. Use the window that owns this element's document.
  const view = element.ownerDocument.defaultView;
  if (!view) {
    // Styles cannot be computed; keep the element rather than silently
    // dropping its text.
    return true;
  }

  const style = view.getComputedStyle(element);
  return (
    style.display !== 'none' &&
    style.visibility !== 'hidden' &&
    style.opacity !== '0'
  );
}
40+
41+
/**
 * Flattens a DOM subtree into plain text.
 *
 * Rules applied while walking the tree in document order:
 * - elements for which `isElementVisible` is false are skipped entirely;
 * - `<br>` becomes a newline and `<hr>` becomes `\n---\n`;
 * - block-level elements (per `isBlockElement`) are separated from their
 *   surroundings by a newline, or by a space when `treatBlockAsNewline` is off;
 * - `<pre>` contributes its raw `textContent` and is delimited like any other
 *   block, so it no longer fuses with adjacent inline text;
 * - whitespace inside text nodes is collapsed to single spaces, and a
 *   whitespace-only text node between inline content still yields one
 *   separating space (previously it was dropped, gluing words together).
 *
 * NOTE: with `collapseSpaces` enabled the final pass also collapses runs of
 * spaces/tabs that came from `<pre>` content.
 *
 * @param node - Root of the subtree to convert.
 * @param options - See `GetTextOptions`; every option defaults to `true`.
 * @returns The extracted text.
 */
export function getTextFromDOM(
  node: Node,
  options: GetTextOptions = {}
): string {
  const {
    treatBlockAsNewline = true,
    collapseSpaces = true,
    trimResult = true,
  } = options;

  let result = '';
  let lastChar = '';

  // Appends `text` verbatim and remembers its final character.
  const append = (text: string): void => {
    if (text === '') return;
    result += text;
    lastChar = text.charAt(text.length - 1);
  };

  // Appends one space unless the output is empty or already ends in whitespace.
  const appendSpace = (): void => {
    if (result.length > 0 && lastChar !== ' ' && lastChar !== '\n') {
      append(' ');
    }
  };

  // Emits the block separator unless the output is empty or already on a
  // fresh line. A dangling space before a newline separator is removed so
  // lines do not end in invisible whitespace.
  const appendBoundary = (separator: string): void => {
    if (result.length === 0 || lastChar === '\n') return;
    if (separator === '\n' && lastChar === ' ') {
      result = result.slice(0, -1);
    }
    append(separator);
  };

  function processNode(currentNode: Node): void {
    if (!currentNode) return;

    if (currentNode.nodeType === Node.TEXT_NODE) {
      const text = (currentNode.textContent || '').replace(/\s+/g, ' ');
      if (text === '') return;

      if (text.startsWith(' ')) appendSpace();

      const core = text.trim();
      // Whitespace-only node: the single separating space (if any was
      // needed) has already been appended above.
      if (core === '') return;

      append(core);
      if (text.endsWith(' ')) appendSpace();
      return;
    }

    // Comments, processing instructions, etc. contribute no text.
    if (currentNode.nodeType !== Node.ELEMENT_NODE) return;

    const element = currentNode as HTMLElement;
    if (!isElementVisible(element)) return;

    const tagName = element.tagName.toLowerCase();

    if (tagName === 'br') {
      append('\n');
      return;
    }

    if (tagName === 'hr') {
      append('\n---\n');
      return;
    }

    const isBlock = isBlockElement(element);
    const separator = treatBlockAsNewline ? '\n' : ' ';

    if (isBlock) appendBoundary(separator);

    if (tagName === 'pre') {
      // Preformatted content is taken verbatim instead of being walked.
      append(element.textContent || '');
    } else {
      for (const childNode of Array.from(element.childNodes)) {
        processNode(childNode);
      }
    }

    if (isBlock) appendBoundary(separator);
  }

  processNode(node);

  if (collapseSpaces) {
    result = result.replace(/[ \t]+/g, ' ');
    result = result.replace(/\n{3,}/g, '\n\n');
  }

  if (trimResult) {
    result = result.trim();
  }

  return result;
}

0 commit comments

Comments
 (0)