/**
 * Empties `el` and returns the concatenated text of its direct text-node
 * children.
 *
 * Every direct child is taken out of `el`: text nodes are moved into a
 * temporary fragment (keeping their order), all other node types are simply
 * discarded. Text nested inside child elements is therefore not part of the
 * result.
 *
 * @param el Element whose children are consumed; it has no children afterwards.
 * @returns The `textContent` of the fragment holding the collected text nodes.
 */
const pickTextNode = (el: HTMLElement) => {
  const collected = document.createDocumentFragment();
  // Always look at the current first child: each iteration detaches it from
  // `el`, so the loop ends once `el` has been drained.
  for (let child = el.firstChild; child !== null; child = el.firstChild) {
    if (child.nodeType !== Node.TEXT_NODE) {
      // Not a text node: drop it.
      el.removeChild(child);
      continue;
    }
    // Text node: move it into the fragment.
    collected.appendChild(child);
  }
  return collected.textContent;
}
/**
 * Removes every attribute from `el`.
 *
 * @param el Element to strip; it is modified in place.
 */
const removeAttributes = (el: HTMLElement) => {
  // Copy the names up front so removal never runs against the collection
  // that is being iterated.
  const attributeNames = Array.from(el.attributes, (attribute) => attribute.name);
  for (const attributeName of attributeNames) {
    el.removeAttribute(attributeName);
  }
}
/** Optional settings accepted by `formatHTML`. */
interface FormatHTMLOptions {
/**
 * Hook called with the cleaned-up document after all built-in processing and
 * before it is serialized; changes it makes to the document end up in the
 * returned HTML.
 */
formatter?: (doc: Document) => void;
/** Base URL used to resolve image `src` values into absolute URLs. */
baseURL?: string;
}
/**
 * Cleans up an HTML string and returns the simplified markup.
 *
 * The input is parsed into a separate document created with
 * `createHTMLDocument`, and the following passes run in order:
 *
 * 1. Non-breaking spaces (`&nbsp;` entities and raw U+00A0) are deleted.
 * 2. `h2`–`h6`: all attributes are removed and the content is reduced to the
 *    text of the heading's direct text nodes.
 * 3. `p`: paragraphs with neither text nor an `img` are removed; the rest lose
 *    their attributes and have every `span` replaced by its plain text.
 * 4. `img`: each image is replaced by a fresh one carrying only `src`
 *    (made absolute), `alt`, and non-zero `width` / `height`.
 * 5. `figure`: a figure containing an image is reduced to its first image.
 * 6. `style` and `script` elements are removed.
 * 7. `pre`: attributes are removed and, when a `code` child exists, the
 *    content is replaced by one clean `code` element holding only the text
 *    (drops syntax-highlighter markup such as hljs / prism spans).
 * 8. `options.formatter`, if given, is called with the document.
 *
 * @param html Raw HTML to clean.
 * @param options Optional `formatter` hook and `baseURL` for image URLs.
 * @returns The inner HTML of the processed document's root element.
 */
export const formatHTML = (html: string, options: FormatHTMLOptions = {}): string => {
  const { formatter, baseURL } = options;

  // Strip non-breaking spaces. Both spellings are written out explicitly so
  // the pattern cannot be confused with an ordinary space, which must stay
  // (removing it would glue tag names to their attributes).
  const cleanedHTML = html.replace(/&nbsp;|\u00a0/g, "");

  const nextDocument = document.implementation.createHTMLDocument();
  nextDocument.documentElement.innerHTML = cleanedHTML;

  // Resolves an image's source to an absolute URL.
  const resolveImageSrc = (img: HTMLImageElement): string => {
    // Without a baseURL, let an anchor in the current page do the resolving.
    const anchor = document.createElement("a");
    anchor.href = img.src;
    if (!baseURL) {
      return anchor.href;
    }
    try {
      // Resolve the raw attribute against the caller's base. An already
      // absolute src is returned unchanged by the URL constructor, so no
      // "is it relative?" test is needed (the previous test looked at the
      // anchor-resolved href, which is already absolute on http(s) pages and
      // so never let baseURL take effect there).
      return new URL(img.getAttribute("src") ?? "", baseURL).href;
    } catch {
      // Unparseable src/baseURL combination: fall back to page-relative.
      return anchor.href;
    }
  };

  // Headings: keep only their own text.
  nextDocument.querySelectorAll<HTMLElement>("h2, h3, h4, h5, h6").forEach((heading) => {
    removeAttributes(heading);
    // Assign as text, not as HTML: the extracted string is already decoded,
    // so a heading like "Using <div>" must not be parsed back into markup.
    heading.textContent = pickTextNode(heading) ?? "";
  });

  // Paragraphs.
  nextDocument.querySelectorAll<HTMLElement>("p").forEach((paragraph) => {
    // Drop paragraphs that have neither text nor an image.
    const hasText = (paragraph.textContent ?? "").trim() !== "";
    if (!hasText && !paragraph.querySelector("img")) {
      paragraph.remove();
      return; // nothing left to clean on a removed paragraph
    }
    removeAttributes(paragraph);
    // Flatten spans into plain text nodes.
    paragraph.querySelectorAll("span").forEach((span) => {
      span.replaceWith(nextDocument.createTextNode(span.textContent ?? ""));
    });
  });

  // Images: rebuild each one with only the attributes worth keeping.
  nextDocument.querySelectorAll<HTMLImageElement>("img").forEach((img) => {
    // Created in the detached document rather than the live page, so that
    // assigning `src` does not make the browser start fetching the image.
    const cleanImg = nextDocument.createElement("img");
    cleanImg.src = resolveImageSrc(img);
    cleanImg.alt = img.alt;
    // Copy width and height only when they are non-zero.
    if (img.width) {
      cleanImg.width = img.width;
    }
    if (img.height) {
      cleanImg.height = img.height;
    }
    img.replaceWith(cleanImg);
  });

  // Figures: keep just the first image, discarding captions and wrappers.
  nextDocument.querySelectorAll<HTMLElement>("figure").forEach((figure) => {
    const firstImg = figure.querySelector("img");
    if (firstImg) {
      figure.innerHTML = "";
      figure.appendChild(firstImg);
    }
  });

  // Remove style and script elements.
  nextDocument.querySelectorAll("style, script").forEach((node) => {
    node.remove();
  });

  // Code blocks: replace highlighted markup (hljs / prism) with plain text.
  nextDocument.querySelectorAll<HTMLElement>("pre").forEach((pre) => {
    removeAttributes(pre);
    const highlighted = pre.querySelector("code");
    if (highlighted) {
      const plainCode = nextDocument.createElement("code");
      // textContent keeps newlines as characters; the innerText setter would
      // convert each line break into a <br> element inside the code block.
      plainCode.textContent = highlighted.textContent;
      pre.innerHTML = "";
      pre.appendChild(plainCode);
    }
  });

  // Let the caller apply any extra processing before serialization.
  if (formatter) {
    formatter(nextDocument);
  }

  return nextDocument.documentElement.innerHTML;
}