Skip to content

Commit f43bbab

Browse files
fix: improve migrate
1 parent f576d20 commit f43bbab

File tree

2 files changed

+29
-4
lines changed

2 files changed

+29
-4
lines changed

migrate/PROMPT.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
You are a professional C++ documentation writer. You are now migrating cppreference.com documentation from HTML format to MDX format. During this process, you must adhere to the following rules:
22
1. Only migrate the format, ensuring that the text of the migrated result is **exactly the same** as the original. Of course, you don't need to process text that was originally invisible.
3-
2. Do not try to write your own component. Do not try to write your own component. Do not try to write your own component. DO NOT USE NORMAL HTML ELEMENTS. DO NOT USE <table>, <tr> <td>. Replace them with our MDX component or markdown grammer.
3+
2. Do not try to write your own component. Do not try to write your own component. Do not try to write your own component. DO NOT USE NORMAL HTML ELEMENTS. DO NOT USE <table>, <tr> <td>. Replace them with our MDX component or markdown table.
44
3. For links, take the URL part, remove `/w/` and the latter part `.html`, and then wrap it with `DocLink`. For example:
55
If the current path is: `/w/cpp/language/basics.html`
66
Link: `<a href="declarations.html" title="cpp/language/declarations">declarations</a>`

migrate/migrate-bot.js

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,19 @@ if (!OPENROUTER_API_KEY) {
2525

2626
const octokit = new Octokit({ auth: GITHUB_TOKEN });
2727

28+
/**
 * Call `fn` and, if it fails, call it again after a fixed pause.
 *
 * `fn` is invoked once up front and then up to `retries` more times, so it
 * runs at most `retries + 1` times. A failure is either a rejected promise
 * or an exception thrown synchronously by `fn`; both are retried.
 *
 * @param {() => any} fn - Operation to attempt; may return a promise or a plain value.
 * @param {number} [retries=3] - Additional attempts allowed after the first failure.
 * @param {number} [delay=1000] - Milliseconds to wait before each new attempt.
 * @returns {Promise<any>} Resolves with the first successful result of `fn`;
 *   rejects with the error from the last attempt once retries are exhausted.
 */
async function retry(fn, retries = 3, delay = 1000) {
  try {
    // Awaiting inside `try` catches a synchronous throw from `fn` as well as
    // a rejected promise, and also tolerates `fn` returning a non-promise.
    return await fn();
  } catch (err) {
    if (retries > 0) {
      console.log(`Retrying... (${retries} retries left)`, err.message, err.stack);
      // Pause before the next attempt.
      await new Promise((resolve) => setTimeout(resolve, delay));
      return retry(fn, retries - 1, delay);
    }
    // Out of retries: surface the most recent failure unchanged.
    throw err;
  }
}
40+
2841
function extractLink(title) {
2942
const urlRegex = /https?:\/\/en\.cppreference\.com\/w\/[^\s]+/g;
3043
const match = title.match(urlRegex);
@@ -133,7 +146,7 @@ ${html}
133146
]
134147

135148
const usedComponents = components.filter((comp) => content.includes(`<${comp} `) || content.includes(`<${comp}>`));
136-
149+
137150
// Remove all existing import statements
138151
content = content.split('\n').filter(line => !line.startsWith('import ')).join('\n');
139152

@@ -145,6 +158,18 @@ ${html}
145158
content = importStatements + content;
146159
}
147160

161+
// Verify content
162+
let normalElements = ["<div", "<section", "<span", "<table", "<thead", "<tbody", "<tr", "<td", "<th"], normalElementsCount = 0;
163+
for (const elem of normalElements) {
164+
normalElementsCount += (content.match(new RegExp(elem, 'g')) || []).length;
165+
}
166+
167+
console.log(`Normal HTML elements count: ${normalElementsCount}`);
168+
169+
if (normalElementsCount > 4) {
170+
throw new Error("生成的内容中包含过多原生HTML元素,可能转换失败。");
171+
}
172+
148173
return content;
149174
}
150175

@@ -267,10 +292,10 @@ async function main() {
267292
}
268293

269294
console.log(` 获取 ${url}`);
270-
const { html, title } = await fetchPageContent(url);
295+
const { html, title } = await retry(() => fetchPageContent(url), 3, 2000);
271296

272297
console.log(` 转换HTML为MDX...`);
273-
const mdx = await convertToMDX(html, title, url);
298+
const mdx = await retry(() => convertToMDX(html, title, url), 3, 2000);
274299

275300
const filePath = getLocalPath(url);
276301
console.log(` 写入 ${filePath}`);

0 commit comments

Comments
 (0)