From 95f24d9ba5b0775c42b8d358c6e9c856eace838e Mon Sep 17 00:00:00 2001 From: Philipinho <16838612+Philipinho@users.noreply.github.com> Date: Thu, 29 May 2025 17:49:34 -0700 Subject: [PATCH] Switch from happy-dom to cheerio * Refine code --- apps/server/package.json | 3 +- .../integrations/import/file-task.service.ts | 472 +++++++----------- .../src/integrations/import/file.utils.ts | 3 +- .../integrations/import/import-formatter.ts | 458 ++++++++--------- .../import/processors/file-task.processor.ts | 1 - packages/editor-ext/src/lib/video/video.ts | 8 + pnpm-lock.yaml | 396 +++++---------- 7 files changed, 538 insertions(+), 803 deletions(-) diff --git a/apps/server/package.json b/apps/server/package.json index 8e85bef6..d105f20c 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -31,7 +31,7 @@ }, "dependencies": { "@aws-sdk/client-s3": "3.701.0", - "@aws-sdk/lib-storage": "^3.701.0", + "@aws-sdk/lib-storage": "3.701.0", "@aws-sdk/s3-request-presigner": "3.701.0", "@casl/ability": "^6.7.3", "@fastify/cookie": "^11.0.2", @@ -57,6 +57,7 @@ "bcrypt": "^5.1.1", "bullmq": "^5.41.3", "cache-manager": "^6.4.0", + "cheerio": "^1.0.0", "class-transformer": "^0.5.1", "class-validator": "^0.14.1", "cookie": "^1.0.2", diff --git a/apps/server/src/integrations/import/file-task.service.ts b/apps/server/src/integrations/import/file-task.service.ts index 3ad92d3c..8b55e0f5 100644 --- a/apps/server/src/integrations/import/file-task.service.ts +++ b/apps/server/src/integrations/import/file-task.service.ts @@ -18,17 +18,19 @@ import { import { v7 } from 'uuid'; import { generateJitteredKeyBetween } from 'fractional-indexing-jittered'; import { FileTask, InsertablePage } from '@docmost/db/types/entity.types'; -import { - DOMParser, - Node as HDNode, - Element as HDElement, - Window, -} from 'happy-dom'; import { markdownToHtml } from '@docmost/editor-ext'; import { getAttachmentFolderPath } from '../../core/attachment/attachment.utils'; import { AttachmentType } from '../../core/attachment/attachment.constants'; import { getProsemirrorContent } from '../../common/helpers/prosemirror/utils'; -import { formatImportHtml, notionFormatter } from './import-formatter'; +import { formatImportHtml, unwrapFromParagraph } from './import-formatter'; +import { + buildAttachmentCandidates, + collectMarkdownAndHtmlFiles, + resolveRelativeAttachmentPath, +} from './import.utils'; +import { executeTx } from '@docmost/db/utils'; +import { BacklinkRepo } from '@docmost/db/repos/backlink/backlink.repo'; +import { load } from 'cheerio'; @Injectable() export class FileTaskService { @@ -37,6 +39,7 @@ export class FileTaskService { constructor( private readonly storageService: StorageService, private readonly importService: ImportService, + private readonly backlinkRepo: BacklinkRepo, @InjectKysely() private readonly db: KyselyDB, ) {} @@ -48,7 +51,12 @@ export class FileTaskService { .executeTakeFirst(); if (!fileTask) { - this.logger.log(`File task with ID ${fileTaskId} not found`); + this.logger.log(`Import file task with ID ${fileTaskId} not found`); + return; + } + + if (fileTask.status === FileTaskStatus.Success) { + this.logger.log('Imported task already processed.'); return; } @@ -68,15 +76,27 @@ export class FileTaskService { await extractZip(tmpZipPath, tmpExtractDir); - // TODO: backlinks try { await this.updateTaskStatus(fileTaskId, FileTaskStatus.Processing); // if type == generic - await this.processGenericImport({ extractDir: tmpExtractDir, fileTask }); + if (fileTask.source === 'generic') { + await this.processGenericImport({ + extractDir: tmpExtractDir, + fileTask, + }); + } + + /* + if (fileTask.source === 'confluence') { + await this.processConfluenceImport({ + extractDir: tmpExtractDir, + fileTask, + }); + }*/ await this.updateTaskStatus(fileTaskId, FileTaskStatus.Success); } catch (error) { await this.updateTaskStatus(fileTaskId, FileTaskStatus.Failed); - console.error(error); + this.logger.error(error); } finally { await cleanupTmpFile(); await cleanupTmpDir(); @@ -88,12 +108,8 @@ export class FileTaskService { fileTask: FileTask; }): Promise { const { extractDir, fileTask } = opts; - - const allFiles = await this.collectMarkdownAndHtmlFiles(extractDir); - const attachmentCandidates = - await this.buildAttachmentCandidates(extractDir); - - console.log('attachment count: ', attachmentCandidates.size); + const allFiles = await collectMarkdownAndHtmlFiles(extractDir); + const attachmentCandidates = await buildAttachmentCandidates(extractDir); const pagesMap = new Map< string, @@ -117,22 +133,8 @@ export class FileTaskService { const ext = path.extname(relPath).toLowerCase(); let content = await fs.readFile(absPath, 'utf-8'); - console.log('relative path: ', relPath, ' abs path: ', absPath); - - if (ext.toLowerCase() === '.html' || ext.toLowerCase() === '.md') { - // we want to process all inputs as markr - if (ext === '.md') { - content = await markdownToHtml(content); - } - - content = await this.rewriteLocalFilesInHtml({ - html: content, - pageRelativePath: relPath, - extractDir, - pageId: v7(), - fileTask, - attachmentCandidates, - }); + if (ext.toLowerCase() === '.md') { + content = await markdownToHtml(content); } pagesMap.set(relPath, { @@ -195,23 +197,34 @@ export class FileTaskService { }); }); - const insertablePages: InsertablePage[] = await Promise.all( + const pageResults = await Promise.all( Array.from(pagesMap.values()).map(async (page) => { - const htmlContent = await this.rewriteInternalLinksToMentionHtml( - page.content, - page.filePath, - filePathToPageMetaMap, - fileTask.creatorId, - ); + const htmlContent = await this.rewriteLocalFilesInHtml({ + html: page.content, + pageRelativePath: page.filePath, + extractDir, + pageId: page.id, + fileTask, + attachmentCandidates, + }); + + const { html, backlinks } = await formatImportHtml({ + html: htmlContent, + currentFilePath: page.filePath, + filePathToPageMetaMap: filePathToPageMetaMap, + creatorId: fileTask.creatorId, + sourcePageId: page.id, + workspaceId: fileTask.workspaceId, + }); const pmState = getProsemirrorContent( - await this.importService.processHTML(formatImportHtml(htmlContent)), + await this.importService.processHTML(html), ); const { title, prosemirrorJson } = this.importService.extractTitleAndRemoveHeading(pmState); - return { + const insertablePage: InsertablePage = { id: page.id, slugId: page.slugId, title: title || page.name, @@ -225,18 +238,28 @@ export class FileTaskService { lastUpdatedById: fileTask.creatorId, parentPageId: page.parentPageId, }; + + return { insertablePage, backlinks }; }), ); - try { - await this.db.insertInto('pages').values(insertablePages).execute(); - //todo: avoid duplicates - // log success - // backlinks mapping - // handle svg diagram nodes - } catch (e) { - console.error(e); - } + const insertablePages = pageResults.map((r) => r.insertablePage); + const insertableBacklinks = pageResults.flatMap((r) => r.backlinks); + + if (insertablePages.length < 1) return; + const validPageIds = new Set(insertablePages.map((row) => row.id)); + const filteredBacklinks = insertableBacklinks.filter( + ({ sourcePageId, targetPageId }) => + validPageIds.has(sourcePageId) && validPageIds.has(targetPageId), + ); + + await executeTx(this.db, async (trx) => { + await trx.insertInto('pages').values(insertablePages).execute(); + + if (filteredBacklinks.length > 0) { + await this.backlinkRepo.insertBacklink(filteredBacklinks, trx); + } + }); } async rewriteLocalFilesInHtml(opts: { @@ -256,11 +279,7 @@ export class FileTaskService { attachmentCandidates, } = opts; - const window = new Window(); - const doc = window.document; - doc.body.innerHTML = html; - - const tasks: Promise[] = []; + const attachmentTasks: Promise[] = []; const processFile = (relPath: string) => { const abs = attachmentCandidates.get(relPath)!; @@ -274,13 +293,13 @@ export class FileTaskService { const apiFilePath = `/api/files/${attachmentId}/${fileNameWithExt}`; - tasks.push( + attachmentTasks.push( (async () => { const fileStream = createReadStream(abs); await this.storageService.uploadStream(storageFilePath, fileStream); const stat = await fs.stat(abs); - const uploaded = await this.db + await this.db .insertInto('attachments') .values({ id: attachmentId, @@ -295,9 +314,7 @@ export class FileTaskService { pageId, spaceId: fileTask.spaceId, }) - .returningAll() .execute(); - console.log(uploaded); })(), ); @@ -311,12 +328,15 @@ export class FileTaskService { }; const pageDir = path.dirname(pageRelativePath); + const $ = load(html); - for (const img of Array.from(doc.getElementsByTagName('img'))) { - const src = cleanUrlString(img.getAttribute('src')) ?? ''; + // image + for (const imgEl of $('img').toArray()) { + const $img = $(imgEl); + const src = cleanUrlString($img.attr('src') ?? '')!; if (!src || src.startsWith('http')) continue; - const relPath = this.resolveRelativeAttachmentPath( + const relPath = resolveRelativeAttachmentPath( src, pageDir, attachmentCandidates, @@ -326,24 +346,26 @@ export class FileTaskService { const { attachmentId, apiFilePath, abs } = processFile(relPath); const stat = await fs.stat(abs); - const width = img.getAttribute('width') || '100%'; - const align = img.getAttribute('data-align') || 'center'; + const width = $img.attr('width') ?? '100%'; + const align = $img.attr('data-align') ?? 'center'; - img.setAttribute('src', apiFilePath); - img.setAttribute('data-attachment-id', attachmentId); - img.setAttribute('data-size', stat.size.toString()); - img.setAttribute('width', width); - img.setAttribute('data-align', align); + $img + .attr('src', apiFilePath) + .attr('data-attachment-id', attachmentId) + .attr('data-size', stat.size.toString()) + .attr('width', width) + .attr('data-align', align); - this.unwrapFromParagraph(img); + unwrapFromParagraph($, $img); } - // rewrite