Switch from happy-dom to cheerio

* Refine code
This commit is contained in:
Philipinho
2025-05-29 17:49:34 -07:00
parent 8143452a21
commit 95f24d9ba5
7 changed files with 538 additions and 803 deletions

View File

@ -31,7 +31,7 @@
},
"dependencies": {
"@aws-sdk/client-s3": "3.701.0",
"@aws-sdk/lib-storage": "^3.701.0",
"@aws-sdk/lib-storage": "3.701.0",
"@aws-sdk/s3-request-presigner": "3.701.0",
"@casl/ability": "^6.7.3",
"@fastify/cookie": "^11.0.2",
@ -57,6 +57,7 @@
"bcrypt": "^5.1.1",
"bullmq": "^5.41.3",
"cache-manager": "^6.4.0",
"cheerio": "^1.0.0",
"class-transformer": "^0.5.1",
"class-validator": "^0.14.1",
"cookie": "^1.0.2",

View File

@ -18,17 +18,19 @@ import {
import { v7 } from 'uuid';
import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';
import { FileTask, InsertablePage } from '@docmost/db/types/entity.types';
import {
DOMParser,
Node as HDNode,
Element as HDElement,
Window,
} from 'happy-dom';
import { markdownToHtml } from '@docmost/editor-ext';
import { getAttachmentFolderPath } from '../../core/attachment/attachment.utils';
import { AttachmentType } from '../../core/attachment/attachment.constants';
import { getProsemirrorContent } from '../../common/helpers/prosemirror/utils';
import { formatImportHtml, notionFormatter } from './import-formatter';
import { formatImportHtml, unwrapFromParagraph } from './import-formatter';
import {
buildAttachmentCandidates,
collectMarkdownAndHtmlFiles,
resolveRelativeAttachmentPath,
} from './import.utils';
import { executeTx } from '@docmost/db/utils';
import { BacklinkRepo } from '@docmost/db/repos/backlink/backlink.repo';
import { load } from 'cheerio';
@Injectable()
export class FileTaskService {
@ -37,6 +39,7 @@ export class FileTaskService {
constructor(
private readonly storageService: StorageService,
private readonly importService: ImportService,
private readonly backlinkRepo: BacklinkRepo,
@InjectKysely() private readonly db: KyselyDB,
) {}
@ -48,7 +51,12 @@ export class FileTaskService {
.executeTakeFirst();
if (!fileTask) {
this.logger.log(`File task with ID ${fileTaskId} not found`);
this.logger.log(`Import file task with ID ${fileTaskId} not found`);
return;
}
if (fileTask.status === FileTaskStatus.Success) {
this.logger.log('Imported task already processed.');
return;
}
@ -68,15 +76,27 @@ export class FileTaskService {
await extractZip(tmpZipPath, tmpExtractDir);
// TODO: backlinks
try {
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Processing);
// if type == generic
await this.processGenericImport({ extractDir: tmpExtractDir, fileTask });
if (fileTask.source === 'generic') {
await this.processGenericImport({
extractDir: tmpExtractDir,
fileTask,
});
}
/*
if (fileTask.source === 'confluence') {
await this.processConfluenceImport({
extractDir: tmpExtractDir,
fileTask,
});
}*/
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Success);
} catch (error) {
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Failed);
console.error(error);
this.logger.error(error);
} finally {
await cleanupTmpFile();
await cleanupTmpDir();
@ -88,12 +108,8 @@ export class FileTaskService {
fileTask: FileTask;
}): Promise<void> {
const { extractDir, fileTask } = opts;
const allFiles = await this.collectMarkdownAndHtmlFiles(extractDir);
const attachmentCandidates =
await this.buildAttachmentCandidates(extractDir);
console.log('attachment count: ', attachmentCandidates.size);
const allFiles = await collectMarkdownAndHtmlFiles(extractDir);
const attachmentCandidates = await buildAttachmentCandidates(extractDir);
const pagesMap = new Map<
string,
@ -117,22 +133,8 @@ export class FileTaskService {
const ext = path.extname(relPath).toLowerCase();
let content = await fs.readFile(absPath, 'utf-8');
console.log('relative path: ', relPath, ' abs path: ', absPath);
if (ext.toLowerCase() === '.html' || ext.toLowerCase() === '.md') {
// we want to process all inputs as markr
if (ext === '.md') {
content = await markdownToHtml(content);
}
content = await this.rewriteLocalFilesInHtml({
html: content,
pageRelativePath: relPath,
extractDir,
pageId: v7(),
fileTask,
attachmentCandidates,
});
if (ext.toLowerCase() === '.md') {
content = await markdownToHtml(content);
}
pagesMap.set(relPath, {
@ -195,23 +197,34 @@ export class FileTaskService {
});
});
const insertablePages: InsertablePage[] = await Promise.all(
const pageResults = await Promise.all(
Array.from(pagesMap.values()).map(async (page) => {
const htmlContent = await this.rewriteInternalLinksToMentionHtml(
page.content,
page.filePath,
filePathToPageMetaMap,
fileTask.creatorId,
);
const htmlContent = await this.rewriteLocalFilesInHtml({
html: page.content,
pageRelativePath: page.filePath,
extractDir,
pageId: page.id,
fileTask,
attachmentCandidates,
});
const { html, backlinks } = await formatImportHtml({
html: htmlContent,
currentFilePath: page.filePath,
filePathToPageMetaMap: filePathToPageMetaMap,
creatorId: fileTask.creatorId,
sourcePageId: page.id,
workspaceId: fileTask.workspaceId,
});
const pmState = getProsemirrorContent(
await this.importService.processHTML(formatImportHtml(htmlContent)),
await this.importService.processHTML(html),
);
const { title, prosemirrorJson } =
this.importService.extractTitleAndRemoveHeading(pmState);
return {
const insertablePage: InsertablePage = {
id: page.id,
slugId: page.slugId,
title: title || page.name,
@ -225,18 +238,28 @@ export class FileTaskService {
lastUpdatedById: fileTask.creatorId,
parentPageId: page.parentPageId,
};
return { insertablePage, backlinks };
}),
);
try {
await this.db.insertInto('pages').values(insertablePages).execute();
//todo: avoid duplicates
// log success
// backlinks mapping
// handle svg diagram nodes
} catch (e) {
console.error(e);
}
const insertablePages = pageResults.map((r) => r.insertablePage);
const insertableBacklinks = pageResults.flatMap((r) => r.backlinks);
if (insertablePages.length < 1) return;
const validPageIds = new Set(insertablePages.map((row) => row.id));
const filteredBacklinks = insertableBacklinks.filter(
({ sourcePageId, targetPageId }) =>
validPageIds.has(sourcePageId) && validPageIds.has(targetPageId),
);
await executeTx(this.db, async (trx) => {
await trx.insertInto('pages').values(insertablePages).execute();
if (filteredBacklinks.length > 0) {
await this.backlinkRepo.insertBacklink(filteredBacklinks, trx);
}
});
}
async rewriteLocalFilesInHtml(opts: {
@ -256,11 +279,7 @@ export class FileTaskService {
attachmentCandidates,
} = opts;
const window = new Window();
const doc = window.document;
doc.body.innerHTML = html;
const tasks: Promise<void>[] = [];
const attachmentTasks: Promise<void>[] = [];
const processFile = (relPath: string) => {
const abs = attachmentCandidates.get(relPath)!;
@ -274,13 +293,13 @@ export class FileTaskService {
const apiFilePath = `/api/files/${attachmentId}/${fileNameWithExt}`;
tasks.push(
attachmentTasks.push(
(async () => {
const fileStream = createReadStream(abs);
await this.storageService.uploadStream(storageFilePath, fileStream);
const stat = await fs.stat(abs);
const uploaded = await this.db
await this.db
.insertInto('attachments')
.values({
id: attachmentId,
@ -295,9 +314,7 @@ export class FileTaskService {
pageId,
spaceId: fileTask.spaceId,
})
.returningAll()
.execute();
console.log(uploaded);
})(),
);
@ -311,12 +328,15 @@ export class FileTaskService {
};
const pageDir = path.dirname(pageRelativePath);
const $ = load(html);
for (const img of Array.from(doc.getElementsByTagName('img'))) {
const src = cleanUrlString(img.getAttribute('src')) ?? '';
// image
for (const imgEl of $('img').toArray()) {
const $img = $(imgEl);
const src = cleanUrlString($img.attr('src') ?? '')!;
if (!src || src.startsWith('http')) continue;
const relPath = this.resolveRelativeAttachmentPath(
const relPath = resolveRelativeAttachmentPath(
src,
pageDir,
attachmentCandidates,
@ -326,24 +346,26 @@ export class FileTaskService {
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const width = img.getAttribute('width') || '100%';
const align = img.getAttribute('data-align') || 'center';
const width = $img.attr('width') ?? '100%';
const align = $img.attr('data-align') ?? 'center';
img.setAttribute('src', apiFilePath);
img.setAttribute('data-attachment-id', attachmentId);
img.setAttribute('data-size', stat.size.toString());
img.setAttribute('width', width);
img.setAttribute('data-align', align);
$img
.attr('src', apiFilePath)
.attr('data-attachment-id', attachmentId)
.attr('data-size', stat.size.toString())
.attr('width', width)
.attr('data-align', align);
this.unwrapFromParagraph(img);
unwrapFromParagraph($, $img);
}
// rewrite <video>
for (const vid of Array.from(doc.getElementsByTagName('video'))) {
const src = cleanUrlString(vid.getAttribute('src')) ?? '';
// video
for (const vidEl of $('video').toArray()) {
const $vid = $(vidEl);
const src = cleanUrlString($vid.attr('src') ?? '')!;
if (!src || src.startsWith('http')) continue;
const relPath = this.resolveRelativeAttachmentPath(
const relPath = resolveRelativeAttachmentPath(
src,
pageDir,
attachmentCandidates,
@ -353,69 +375,25 @@ export class FileTaskService {
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const width = vid.getAttribute('width') || '100%';
const align = vid.getAttribute('data-align') || 'center';
const width = $vid.attr('width') ?? '100%';
const align = $vid.attr('data-align') ?? 'center';
vid.setAttribute('src', apiFilePath);
vid.setAttribute('data-attachment-id', attachmentId);
vid.setAttribute('data-size', stat.size.toString());
vid.setAttribute('width', width);
vid.setAttribute('data-align', align);
$vid
.attr('src', apiFilePath)
.attr('data-attachment-id', attachmentId)
.attr('data-size', stat.size.toString())
.attr('width', width)
.attr('data-align', align);
// @ts-ignore
this.unwrapFromParagraph(vid);
unwrapFromParagraph($, $vid);
}
// rewrite other attachments via <a>
for (const a of Array.from(doc.getElementsByTagName('a'))) {
const href = cleanUrlString(a.getAttribute('href')) ?? '';
if (!href || href.startsWith('http')) continue;
const relPath = this.resolveRelativeAttachmentPath(
href,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const ext = path.extname(relPath).toLowerCase();
if (ext === '.mp4') {
const video = doc.createElement('video');
video.setAttribute('src', apiFilePath);
video.setAttribute('data-attachment-id', attachmentId);
video.setAttribute('data-size', stat.size.toString());
video.setAttribute('width', '100%');
video.setAttribute('data-align', 'center');
a.replaceWith(video);
// @ts-ignore
this.unwrapFromParagraph(video);
} else {
const div = doc.createElement('div') as HDElement;
div.setAttribute('data-type', 'attachment');
div.setAttribute('data-attachment-url', apiFilePath);
div.setAttribute('data-attachment-name', path.basename(abs));
div.setAttribute('data-attachment-mime', getMimeType(abs));
div.setAttribute('data-attachment-size', stat.size.toString());
div.setAttribute('data-attachment-id', attachmentId);
a.replaceWith(div);
this.unwrapFromParagraph(div);
}
}
const attachmentDivs = Array.from(
doc.querySelectorAll('div[data-type="attachment"]'),
);
for (const oldDiv of attachmentDivs) {
const rawUrl =
cleanUrlString(oldDiv.getAttribute('data-attachment-url')) ?? '';
for (const el of $('div[data-type="attachment"]').toArray()) {
const $oldDiv = $(el);
const rawUrl = cleanUrlString($oldDiv.attr('data-attachment-url') ?? '')!;
if (!rawUrl || rawUrl.startsWith('http')) continue;
const relPath = this.resolveRelativeAttachmentPath(
const relPath = resolveRelativeAttachmentPath(
rawUrl,
pageDir,
attachmentCandidates,
@ -427,27 +405,71 @@ export class FileTaskService {
const fileName = path.basename(abs);
const mime = getMimeType(abs);
const div = doc.createElement('div') as HDElement;
div.setAttribute('data-type', 'attachment');
div.setAttribute('data-attachment-url', apiFilePath);
div.setAttribute('data-attachment-name', fileName);
div.setAttribute('data-attachment-mime', mime);
div.setAttribute('data-attachment-size', stat.size.toString());
div.setAttribute('data-attachment-id', attachmentId);
const $newDiv = $('<div>')
.attr('data-type', 'attachment')
.attr('data-attachment-url', apiFilePath)
.attr('data-attachment-name', fileName)
.attr('data-attachment-mime', mime)
.attr('data-attachment-size', stat.size.toString())
.attr('data-attachment-id', attachmentId);
oldDiv.replaceWith(div);
this.unwrapFromParagraph(div);
$oldDiv.replaceWith($newDiv);
unwrapFromParagraph($, $newDiv);
}
for (const type of ['excalidraw', 'drawio'] as const) {
const selector = `div[data-type="${type}"]`;
const oldDivs = Array.from(doc.querySelectorAll(selector));
// rewrite other attachments via <a>
for (const aEl of $('a').toArray()) {
const $a = $(aEl);
const href = cleanUrlString($a.attr('href') ?? '')!;
if (!href || href.startsWith('http')) continue;
for (const oldDiv of oldDivs) {
const rawSrc = cleanUrlString(oldDiv.getAttribute('data-src')) ?? '';
const relPath = resolveRelativeAttachmentPath(
href,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const ext = path.extname(relPath).toLowerCase();
if (ext === '.mp4') {
const $video = $('<video>')
.attr('src', apiFilePath)
.attr('data-attachment-id', attachmentId)
.attr('data-size', stat.size.toString())
.attr('width', '100%')
.attr('data-align', 'center');
$a.replaceWith($video);
unwrapFromParagraph($, $video);
} else {
// build attachment <div>
const confAliasName = $a.attr('data-linked-resource-default-alias');
let attachmentName = path.basename(abs);
if (confAliasName) attachmentName = confAliasName;
const $div = $('<div>')
.attr('data-type', 'attachment')
.attr('data-attachment-url', apiFilePath)
.attr('data-attachment-name', attachmentName)
.attr('data-attachment-mime', getMimeType(abs))
.attr('data-attachment-size', stat.size.toString())
.attr('data-attachment-id', attachmentId);
$a.replaceWith($div);
unwrapFromParagraph($, $div);
}
}
// excalidraw and drawio
for (const type of ['excalidraw', 'drawio'] as const) {
for (const el of $(`div[data-type="${type}"]`).toArray()) {
const $oldDiv = $(el);
const rawSrc = cleanUrlString($oldDiv.attr('data-src') ?? '')!;
if (!rawSrc || rawSrc.startsWith('http')) continue;
const relPath = this.resolveRelativeAttachmentPath(
const relPath = resolveRelativeAttachmentPath(
rawSrc,
pageDir,
attachmentCandidates,
@ -458,155 +480,27 @@ export class FileTaskService {
const stat = await fs.stat(abs);
const fileName = path.basename(abs);
const width = oldDiv.getAttribute('data-width') || '100%';
const align = oldDiv.getAttribute('data-align') || 'center';
const width = $oldDiv.attr('data-width') || '100%';
const align = $oldDiv.attr('data-align') || 'center';
const newDiv = doc.createElement('div') as HDElement;
newDiv.setAttribute('data-type', type);
newDiv.setAttribute('data-src', apiFilePath);
newDiv.setAttribute('data-title', fileName);
newDiv.setAttribute('data-width', width);
newDiv.setAttribute('data-size', stat.size.toString());
newDiv.setAttribute('data-align', align);
newDiv.setAttribute('data-attachment-id', attachmentId);
const $newDiv = $('<div>')
.attr('data-type', type)
.attr('data-src', apiFilePath)
.attr('data-title', fileName)
.attr('data-width', width)
.attr('data-size', stat.size.toString())
.attr('data-align', align)
.attr('data-attachment-id', attachmentId);
oldDiv.replaceWith(newDiv);
this.unwrapFromParagraph(newDiv);
$oldDiv.replaceWith($newDiv);
unwrapFromParagraph($, $newDiv);
}
}
// wait for all uploads & DB inserts
await Promise.all(tasks);
await Promise.all(attachmentTasks);
return doc.documentElement.outerHTML;
}
async rewriteInternalLinksToMentionHtml(
html: string,
currentFilePath: string,
filePathToPageMetaMap: Map<
string,
{ id: string; title: string; slugId: string }
>,
creatorId: string,
): Promise<string> {
const window = new Window();
const doc = window.document;
doc.body.innerHTML = html;
// normalize helper
const normalize = (p: string) => p.replace(/\\/g, '/');
for (const a of Array.from(doc.getElementsByTagName('a'))) {
const rawHref = a.getAttribute('href');
if (!rawHref) continue;
// skip absolute/external URLs
if (rawHref.startsWith('http') || rawHref.startsWith('/api/')) {
continue;
}
const decodedRef = decodeURIComponent(rawHref);
const parentDir = path.dirname(currentFilePath);
const joined = path.join(parentDir, decodedRef);
const resolved = normalize(joined);
const pageMeta = filePathToPageMetaMap.get(resolved);
if (!pageMeta) {
// not an internal link we know about
continue;
}
const mentionEl = doc.createElement('span') as HDElement;
mentionEl.setAttribute('data-type', 'mention');
mentionEl.setAttribute('data-id', v7());
mentionEl.setAttribute('data-entity-type', 'page');
mentionEl.setAttribute('data-entity-id', pageMeta.id);
mentionEl.setAttribute('data-label', pageMeta.title);
mentionEl.setAttribute('data-slug-id', pageMeta.slugId);
mentionEl.setAttribute('data-creator-id', creatorId);
mentionEl.textContent = pageMeta.title;
a.replaceWith(mentionEl);
}
return doc.body.innerHTML;
}
unwrapFromParagraph(node: HDElement) {
let wrapper = node.closest('p, a') as HDElement | null;
while (wrapper) {
if (wrapper.childNodes.length === 1) {
// e.g. <p><node/></p> or <a><node/></a> → <node/>
wrapper.replaceWith(node);
} else {
wrapper.parentNode!.insertBefore(node, wrapper);
}
wrapper = node.closest('p, a') as HDElement | null;
}
}
async buildAttachmentCandidates(
extractDir: string,
): Promise<Map<string, string>> {
const map = new Map<string, string>();
async function walk(dir: string) {
for (const ent of await fs.readdir(dir, { withFileTypes: true })) {
const abs = path.join(dir, ent.name);
if (ent.isDirectory()) {
await walk(abs);
} else {
if (['.md', '.html'].includes(path.extname(ent.name).toLowerCase())) {
continue;
}
const rel = path.relative(extractDir, abs).split(path.sep).join('/');
map.set(rel, abs);
}
}
}
await walk(extractDir);
return map;
}
async collectMarkdownAndHtmlFiles(dir: string): Promise<string[]> {
const results: string[] = [];
async function walk(current: string) {
const entries = await fs.readdir(current, { withFileTypes: true });
for (const ent of entries) {
const fullPath = path.join(current, ent.name);
if (ent.isDirectory()) {
await walk(fullPath);
} else if (
['.md', '.html'].includes(path.extname(ent.name).toLowerCase())
) {
results.push(fullPath);
}
}
}
await walk(dir);
return results;
}
resolveRelativeAttachmentPath(
raw: string,
pageDir: string,
attachmentCandidates: Map<string, string>,
): string | null {
const mainRel = decodeURIComponent(raw.replace(/^\.?\/+/, ''));
const fallback = path.normalize(path.join(pageDir, mainRel));
if (attachmentCandidates.has(mainRel)) {
return mainRel;
}
if (attachmentCandidates.has(fallback)) {
return fallback;
}
return null;
return $.root().html() || '';
}
async updateTaskStatus(fileTaskId: string, status: FileTaskStatus) {

View File

@ -39,7 +39,7 @@ export async function extractZip(
source: string,
target: string,
): Promise<void> {
await extractZipInternal(source, target, true);
return extractZipInternal(source, target, true);
}
/**
@ -69,6 +69,7 @@ function extractZipInternal(
!/\/$/.test(entry.fileName) &&
name.toLowerCase().endsWith('.zip');
if (isZip) {
// temporary name to avoid overwriting file
const nestedPath = source.endsWith('.zip')
? source.slice(0, -4) + '.inner.zip'
: source + '.inner.zip';

View File

@ -1,215 +1,218 @@
import {
Window,
HTMLAnchorElement,
HTMLIFrameElement,
Element as HDElement,
} from 'happy-dom';
import { getEmbedUrlAndProvider } from '@docmost/editor-ext';
import * as path from 'path';
import { v7 } from 'uuid';
import { InsertableBacklink } from '@docmost/db/types/entity.types';
import { Cheerio, CheerioAPI, load } from 'cheerio';
export function formatImportHtml(html: string) {
const pmHtml = notionFormatter(html);
return defaultHtmlFormatter(pmHtml);
/**
 * Normalizes one imported page's HTML for the editor and collects the
 * backlinks produced while doing so.
 *
 * The HTML is parsed once with cheerio and the same document is mutated by
 * each pass in order: Notion-specific cleanup, generic embed conversion, then
 * rewriting links to other imported pages into mention spans.
 *
 * @param opts.html - HTML of the page being imported.
 * @param opts.currentFilePath - Path of the page's file; relative links are
 *   resolved against its directory.
 * @param opts.filePathToPageMetaMap - Lookup from file path to the page
 *   metadata (id, title, slugId) created for that file.
 * @param opts.creatorId - Stored on generated mention elements.
 * @param opts.sourcePageId - Recorded as the source of every backlink.
 * @param opts.workspaceId - Recorded on every backlink.
 * @param opts.pageDir - Accepted but not read by this function.
 * @param opts.attachmentCandidates - Accepted but not read by this function.
 * @returns The serialized document (empty string if serialization yields
 *   nothing) together with the collected backlinks.
 */
export async function formatImportHtml(opts: {
  html: string;
  currentFilePath: string;
  filePathToPageMetaMap: Map<
    string,
    { id: string; title: string; slugId: string }
  >;
  creatorId: string;
  sourcePageId: string;
  workspaceId: string;
  pageDir?: string;
  attachmentCandidates?: string[];
}): Promise<{ html: string; backlinks: InsertableBacklink[] }> {
  const $: CheerioAPI = load(opts.html);
  const $root: Cheerio<any> = $.root();

  // Each pass mutates the shared document in place; order matters because the
  // mention rewrite must see the anchors left over by the earlier passes.
  notionFormatter($, $root);
  defaultHtmlFormatter($, $root);

  const backlinks = await rewriteInternalLinksToMentionHtml(
    $,
    $root,
    opts.currentFilePath,
    opts.filePathToPageMetaMap,
    opts.creatorId,
    opts.sourcePageId,
    opts.workspaceId,
  );

  const serialized = $root.html();
  return { html: serialized || '', backlinks };
}
export function defaultHtmlFormatter(html: string): string {
const window = new Window();
const doc = window.document;
doc.body.innerHTML = html;
export function defaultHtmlFormatter($: CheerioAPI, $root: Cheerio<any>) {
$root.find('a[href]').each((_, el) => {
const $el = $(el);
const url = $el.attr('href')!;
const { provider } = getEmbedUrlAndProvider(url);
if (provider === 'iframe') return;
// embed providers
const anchors = Array.from(doc.getElementsByTagName('a'));
for (const node of anchors) {
const url = (node as HTMLAnchorElement).href;
if (!url) continue;
const embedProvider = getEmbedUrlAndProvider(url);
// we only want to embed valid matches
if (embedProvider.provider === 'iframe') continue;
const embed = doc.createElement('div');
embed.setAttribute('data-type', 'embed');
embed.setAttribute('data-src', url);
embed.setAttribute('data-provider', embedProvider.provider);
embed.setAttribute('data-align', 'center');
embed.setAttribute('data-width', '640');
embed.setAttribute('data-height', '480');
node.replaceWith(embed);
}
// embed providers
const iframes = Array.from(doc.getElementsByTagName('iframe'));
for (const iframe of iframes) {
const url = (iframe as HTMLIFrameElement).src;
if (!url) continue;
const embedProvider = getEmbedUrlAndProvider(url);
const embed = doc.createElement('div');
embed.setAttribute('data-type', 'embed');
embed.setAttribute('data-src', url);
embed.setAttribute('data-provider', embedProvider.provider);
embed.setAttribute('data-align', 'center');
embed.setAttribute('data-width', '640');
embed.setAttribute('data-height', '480');
iframe.replaceWith(embed);
}
return doc.body.innerHTML;
}
export function notionFormatter(html: string): string {
const window = new Window();
const doc = window.document;
doc.body.innerHTML = html;
// remove empty description paragraph
doc.querySelectorAll('p.page-description').forEach((p) => {
if (p.textContent?.trim() === '') {
p.remove();
}
const embed = `<div data-type=\"embed\" data-src=\"${url}\" data-provider=\"${provider}\" data-align=\"center\" data-width=\"640\" data-height=\"480\"></div>`;
$el.replaceWith(embed);
});
// Block math
for (const fig of Array.from(doc.querySelectorAll('figure.equation'))) {
// get TeX source from the MathML <annotation>
const annotation = fig.querySelector(
'annotation[encoding="application/x-tex"]',
);
const tex = annotation?.textContent?.trim() ?? '';
$root.find('iframe[src]').each((_, el) => {
const $el = $(el);
const url = $el.attr('src')!;
const { provider } = getEmbedUrlAndProvider(url);
const mathBlock = doc.createElement('div');
mathBlock.setAttribute('data-type', 'mathBlock');
mathBlock.setAttribute('data-katex', 'true');
mathBlock.textContent = tex;
fig.replaceWith(mathBlock);
}
// Inline math
for (const token of Array.from(
doc.querySelectorAll('span.notion-text-equation-token'),
)) {
// remove the preceding <style> if its that KaTeX import
const prev = token.previousElementSibling;
if (prev?.tagName === 'STYLE') prev.remove();
const annotation = token.querySelector(
'annotation[encoding="application/x-tex"]',
);
const tex = annotation?.textContent?.trim() ?? '';
const mathInline = doc.createElement('span');
mathInline.setAttribute('data-type', 'mathInline');
mathInline.setAttribute('data-katex', 'true');
mathInline.textContent = tex;
token.replaceWith(mathInline);
}
// Callouts
const figs = Array.from(doc.querySelectorAll('figure.callout')).reverse();
for (const fig of figs) {
// find the content <div> (always the 2nd child in a Notion callout)
const contentDiv = fig.querySelector(
'div:nth-of-type(2)',
) as unknown as HTMLElement | null;
if (!contentDiv) continue;
// pull out every block inside (tables, p, nested callouts, lists…)
const blocks = Array.from(contentDiv.childNodes);
const wrapper = fig.ownerDocument.createElement('div');
wrapper.setAttribute('data-type', 'callout');
wrapper.setAttribute('data-callout-type', 'info');
// move each real node into the wrapper (preserves nested structure)
// @ts-ignore
wrapper.append(...blocks);
fig.replaceWith(wrapper);
}
// Todolist
const todoLists = Array.from(doc.querySelectorAll('ul.to-do-list'));
for (const oldList of todoLists) {
const newList = doc.createElement('ul');
newList.setAttribute('data-type', 'taskList');
// for each old <li>, create a <li data-type="taskItem" data-checked="…">
for (const li of oldList.querySelectorAll('li')) {
const isChecked = li.querySelector('.checkbox.checkbox-on') != null;
const textSpan = li.querySelector(
'span.to-do-children-unchecked, span.to-do-children-checked',
);
const text = textSpan?.textContent?.trim() ?? '';
// <li data-type="taskItem" data-checked="true|false">
const taskItem = doc.createElement('li');
taskItem.setAttribute('data-type', 'taskItem');
taskItem.setAttribute('data-checked', String(isChecked));
// <label><input type="checkbox" [checked]><span></span></label>
const label = doc.createElement('label');
const input = doc.createElement('input');
input.type = 'checkbox';
if (isChecked) input.checked = true;
const spacer = doc.createElement('span');
label.append(input, spacer);
const container = doc.createElement('div');
const p = doc.createElement('p');
p.textContent = text;
container.appendChild(p);
taskItem.append(label, container);
newList.appendChild(taskItem);
}
oldList.replaceWith(newList);
}
// Fix toggle blocks
const detailsList = Array.from(
doc.querySelectorAll('ul.toggle details'),
).reverse();
// unwrap from ul and li tags
for (const details of detailsList) {
const li = details.closest('li');
if (li) {
li.parentNode!.insertBefore(details, li);
if (li.childNodes.length === 0) li.remove();
}
const ul = details.closest('ul.toggle');
if (ul) {
ul.parentNode!.insertBefore(details, ul);
if (ul.childNodes.length === 0) ul.remove();
}
}
return doc.body.innerHTML;
const embed = `<div data-type=\"embed\" data-src=\"${url}\" data-provider=\"${provider}\" data-align=\"center\" data-width=\"640\" data-height=\"480\"></div>`;
$el.replaceWith(embed);
});
}
export function unwrapFromParagraph(node: HDElement) {
let wrapper = node.closest('p, a') as HDElement | null;
export function notionFormatter($: CheerioAPI, $root: Cheerio<any>) {
// remove empty description paragraphs
$root.find('p.page-description').each((_, el) => {
if (!$(el).text().trim()) $(el).remove();
});
while (wrapper) {
if (wrapper.childNodes.length === 1) {
// e.g. <p><node/></p> or <a><node/></a> → <node/>
wrapper.replaceWith(node);
// block math → mathBlock
$root.find('figure.equation').each((_: any, fig: any) => {
const $fig = $(fig);
const tex = $fig
.find('annotation[encoding="application/x-tex"]')
.text()
.trim();
const $math = $('<div>')
.attr('data-type', 'mathBlock')
.attr('data-katex', 'true')
.text(tex);
$fig.replaceWith($math);
});
// inline math → mathInline
$root.find('span.notion-text-equation-token').each((_, tok) => {
const $tok = $(tok);
const $prev = $tok.prev('style');
if ($prev.length) $prev.remove();
const tex = $tok
.find('annotation[encoding="application/x-tex"]')
.text()
.trim();
const $inline = $('<span>')
.attr('data-type', 'mathInline')
.attr('data-katex', 'true')
.text(tex);
$tok.replaceWith($inline);
});
// callouts
$root
.find('figure.callout')
.get()
.reverse()
.forEach((fig) => {
const $fig = $(fig);
const $content = $fig.find('div').eq(1);
if (!$content.length) return;
const $wrapper = $('<div>')
.attr('data-type', 'callout')
.attr('data-callout-type', 'info');
// @ts-ignore
$content.children().each((_, child) => $wrapper.append(child));
$fig.replaceWith($wrapper);
});
// to-do lists
$root.find('ul.to-do-list').each((_, list) => {
const $old = $(list);
const $new = $('<ul>').attr('data-type', 'taskList');
$old.find('li').each((_, li) => {
const $li = $(li);
const isChecked = $li.find('.checkbox.checkbox-on').length > 0;
const text =
$li
.find('span.to-do-children-unchecked, span.to-do-children-checked')
.first()
.text()
.trim() || '';
const $taskItem = $('<li>')
.attr('data-type', 'taskItem')
.attr('data-checked', String(isChecked));
const $label = $('<label>');
const $input = $('<input>').attr('type', 'checkbox');
if (isChecked) $input.attr('checked', '');
$label.append($input, $('<span>'));
const $container = $('<div>').append($('<p>').text(text));
$taskItem.append($label, $container);
$new.append($taskItem);
});
$old.replaceWith($new);
});
// toggle blocks
$root
.find('ul.toggle details')
.get()
.reverse()
.forEach((det) => {
const $det = $(det);
const $li = $det.closest('li');
if ($li.length) {
$li.before($det);
if (!$li.children().length) $li.remove();
}
const $ul = $det.closest('ul.toggle');
if ($ul.length) {
$ul.before($det);
if (!$ul.children().length) $ul.remove();
}
});
// bookmarks
$root
.find('figure')
.filter((_, fig) => $(fig).find('a.bookmark.source').length > 0)
.get()
.reverse()
.forEach((fig) => {
const $fig = $(fig);
const $link = $fig.find('a.bookmark.source').first();
if (!$link.length) return;
const href = $link.attr('href')!;
const title = $link.find('.bookmark-title').text().trim() || href;
const $newAnchor = $('<a>')
.addClass('bookmark source')
.attr('href', href)
.append($('<div>').addClass('bookmark-info').text(title));
$fig.replaceWith($newAnchor);
});
// remove toc
$root.find('nav.table_of_contents').remove();
}
export function unwrapFromParagraph($: CheerioAPI, $node: Cheerio<any>) {
// find the nearest <p> or <a> ancestor
let $wrapper = $node.closest('p, a');
while ($wrapper.length) {
// if the wrapper has only our node inside, replace it entirely
if ($wrapper.contents().length === 1) {
$wrapper.replaceWith($node);
} else {
wrapper.parentNode!.insertBefore(node, wrapper);
// otherwise just move the node to before the wrapper
$wrapper.before($node);
}
wrapper = node.closest('p, a') as HDElement | null;
// look again for any new wrapper around $node
$wrapper = $node.closest('p, a');
}
}
export async function rewriteInternalLinksToMentionHtml(
html: string,
$: CheerioAPI,
$root: Cheerio<any>,
currentFilePath: string,
filePathToPageMetaMap: Map<
string,
@ -218,53 +221,34 @@ export async function rewriteInternalLinksToMentionHtml(
creatorId: string,
sourcePageId: string,
workspaceId: string,
): Promise<{ html: string; backlinks: InsertableBacklink[] }> {
const window = new Window();
const doc = window.document;
doc.body.innerHTML = html;
// normalize helper
): Promise<InsertableBacklink[]> {
const normalize = (p: string) => p.replace(/\\/g, '/');
const backlinks: InsertableBacklink[] = [];
for (const a of Array.from(doc.getElementsByTagName('a'))) {
const rawHref = a.getAttribute('href');
if (!rawHref) continue;
$root.find('a[href]').each((_, el) => {
const $a = $(el);
const raw = $a.attr('href')!;
if (raw.startsWith('http') || raw.startsWith('/api/')) return;
const resolved = normalize(
path.join(path.dirname(currentFilePath), decodeURIComponent(raw)),
);
const meta = filePathToPageMetaMap.get(resolved);
if (!meta) return;
const mentionId = v7();
const $mention = $('<span>')
.attr({
'data-type': 'mention',
'data-id': mentionId,
'data-entity-type': 'page',
'data-entity-id': meta.id,
'data-label': meta.title,
'data-slug-id': meta.slugId,
'data-creator-id': creatorId,
})
.text(meta.title);
$a.replaceWith($mention);
backlinks.push({ sourcePageId, targetPageId: meta.id, workspaceId });
});
// skip absolute/external URLs
if (rawHref.startsWith('http') || rawHref.startsWith('/api/')) {
continue;
}
const decodedRef = decodeURIComponent(rawHref);
const parentDir = path.dirname(currentFilePath);
const joined = path.join(parentDir, decodedRef);
const resolved = normalize(joined);
const pageMeta = filePathToPageMetaMap.get(resolved);
if (!pageMeta) {
continue;
}
const mentionEl = doc.createElement('span') as HDElement;
mentionEl.setAttribute('data-type', 'mention');
mentionEl.setAttribute('data-id', v7());
mentionEl.setAttribute('data-entity-type', 'page');
mentionEl.setAttribute('data-entity-id', pageMeta.id);
mentionEl.setAttribute('data-label', pageMeta.title);
mentionEl.setAttribute('data-slug-id', pageMeta.slugId);
mentionEl.setAttribute('data-creator-id', creatorId);
mentionEl.textContent = pageMeta.title;
a.replaceWith(mentionEl);
backlinks.push({
sourcePageId,
targetPageId: pageMeta.id,
workspaceId: workspaceId,
});
}
return { html: doc.body.innerHTML, backlinks };
return backlinks;
}

View File

@ -15,7 +15,6 @@ export class FileTaskProcessor extends WorkerHost implements OnModuleDestroy {
try {
switch (job.name) {
case QueueJob.IMPORT_TASK:
console.log('import task', job.data.fileTaskId);
await this.fileTaskService.processZIpImport(job.data.fileTaskId);
break;
case QueueJob.EXPORT_TASK: