From 9ac180f719f0010f508f2ff3be3b0ffa7ffde904 Mon Sep 17 00:00:00 2001 From: Philip Okugbe <16838612+Philipinho@users.noreply.github.com> Date: Wed, 17 Sep 2025 23:50:27 +0100 Subject: [PATCH] fix: enhance page import (#1570) * change import process * fix processor * fix page name in notion import * preserve confluence table bg color * sync --- .../import/processors/file-task.processor.ts | 41 ++-- .../services/file-import-task.service.ts | 219 ++++++++++++------ .../services/import-attachment.service.ts | 2 +- .../integrations/import/utils/import.utils.ts | 6 + packages/editor-ext/src/lib/table/cell.ts | 18 +- packages/editor-ext/src/lib/table/header.ts | 20 +- 6 files changed, 199 insertions(+), 107 deletions(-) diff --git a/apps/server/src/integrations/import/processors/file-task.processor.ts b/apps/server/src/integrations/import/processors/file-task.processor.ts index 7b65f5ab..20001dd7 100644 --- a/apps/server/src/integrations/import/processors/file-task.processor.ts +++ b/apps/server/src/integrations/import/processors/file-task.processor.ts @@ -47,15 +47,23 @@ export class FileTaskProcessor extends WorkerHost implements OnModuleDestroy { await this.handleFailedJob(job); } - @OnWorkerEvent('stalled') - async onStalled(job: Job) { - this.logger.error( - `Job ${job.name} stalled. . Import Task ID: ${job.data.fileTaskId}.. Job ID: ${job.id}`, + @OnWorkerEvent('completed') + async onCompleted(job: Job) { + this.logger.log( + `Completed ${job.name} job for File task ID ${job.data.fileTaskId}`, ); - // Set failedReason for stalled jobs since it's not automatically set - job.failedReason = 'Job stalled and was marked as failed'; - await this.handleFailedJob(job); + try { + const fileTask = await this.fileTaskService.getFileTask( + job.data.fileTaskId, + ); + if (fileTask) { + await this.storageService.delete(fileTask.filePath); + this.logger.debug(`Deleted imported zip file: ${fileTask.filePath}`); + } + } catch (err) { + this.logger.error(`Failed to delete imported zip file:`, err); + } } private async handleFailedJob(job: Job) { @@ -78,25 +86,6 @@ export class FileTaskProcessor extends WorkerHost implements OnModuleDestroy { } } - @OnWorkerEvent('completed') - async onCompleted(job: Job) { - this.logger.log( - `Completed ${job.name} job for File task ID ${job.data.fileTaskId}`, - ); - - try { - const fileTask = await this.fileTaskService.getFileTask( - job.data.fileTaskId, - ); - if (fileTask) { - await this.storageService.delete(fileTask.filePath); - this.logger.debug(`Deleted imported zip file: ${fileTask.filePath}`); - } - } catch (err) { - this.logger.error(`Failed to delete imported zip file:`, err); - } - } - async onModuleDestroy(): Promise { if (this.worker) { await this.worker.close(); diff --git a/apps/server/src/integrations/import/services/file-import-task.service.ts b/apps/server/src/integrations/import/services/file-import-task.service.ts index 30338568..f7d93ec0 100644 --- a/apps/server/src/integrations/import/services/file-import-task.service.ts +++ b/apps/server/src/integrations/import/services/file-import-task.service.ts @@ -24,6 +24,7 @@ import { formatImportHtml } from '../utils/import-formatter'; import { buildAttachmentCandidates, collectMarkdownAndHtmlFiles, + stripNotionID, } from '../utils/import.utils'; import { executeTx } from '@docmost/db/utils'; import { BacklinkRepo } from '@docmost/db/repos/backlink/backlink.repo'; @@ -159,17 +160,12 @@ export class FileImportTaskService { .split(path.sep) .join('/'); // normalize to forward-slashes const ext = path.extname(relPath).toLowerCase(); - let content = await fs.readFile(absPath, 'utf-8'); - - if (ext.toLowerCase() === '.md') { - content = await markdownToHtml(content); - } pagesMap.set(relPath, { id: v7(), slugId: generateSlugId(), - name: path.basename(relPath, ext), - content, + name: stripNotionID(path.basename(relPath, ext)), + content: '', parentPageId: null, fileExtension: ext, filePath: relPath, @@ -254,71 +250,160 @@ export class FileImportTaskService { }); }); - const pageResults = await Promise.all( - Array.from(pagesMap.values()).map(async (page) => { - const htmlContent = - await this.importAttachmentService.processAttachments({ - html: page.content, - pageRelativePath: page.filePath, - extractDir, - pageId: page.id, - fileTask, - attachmentCandidates, - }); + // Group pages by level (topological sort for parent-child relationships) + const pagesByLevel = new Map>(); + const pageLevel = new Map(); - const { html, backlinks, pageIcon } = await formatImportHtml({ - html: htmlContent, - currentFilePath: page.filePath, - filePathToPageMetaMap: filePathToPageMetaMap, - creatorId: fileTask.creatorId, - sourcePageId: page.id, - workspaceId: fileTask.workspaceId, - }); + // Calculate levels using BFS + const calculateLevels = () => { + const queue: Array<{ filePath: string; level: number }> = []; - const pmState = getProsemirrorContent( - await this.importService.processHTML(html), + // Start with root pages (no parent) + for (const [filePath, page] of pagesMap.entries()) { + if (!page.parentPageId) { + queue.push({ filePath, level: 0 }); + pageLevel.set(filePath, 0); + } + } + + // BFS to assign levels + while (queue.length > 0) { + const { filePath, level } = queue.shift()!; + const currentPage = pagesMap.get(filePath)!; + + // Find children of current page + for (const [childFilePath, childPage] of pagesMap.entries()) { + if ( + childPage.parentPageId === currentPage.id && + !pageLevel.has(childFilePath) + ) { + pageLevel.set(childFilePath, level + 1); + queue.push({ filePath: childFilePath, level: level + 1 }); + } + } + } + + // Group pages by level + for (const [filePath, page] of pagesMap.entries()) { + const level = pageLevel.get(filePath) || 0; + if (!pagesByLevel.has(level)) { + pagesByLevel.set(level, []); + } + pagesByLevel.get(level)!.push([filePath, page]); + } + }; + + calculateLevels(); + + if (pagesMap.size < 1) return; + + // Process pages level by level sequentially to respect foreign key constraints + const allBacklinks: any[] = []; + const validPageIds = new Set(); + let totalPagesProcessed = 0; + + // Sort levels to process in order + const sortedLevels = Array.from(pagesByLevel.keys()).sort((a, b) => a - b); + + try { + await executeTx(this.db, async (trx) => { + // Process pages level by level sequentially within the transaction + for (const level of sortedLevels) { + const levelPages = pagesByLevel.get(level)!; + + for (const [filePath, page] of levelPages) { + const absPath = path.join(extractDir, filePath); + let content = await fs.readFile(absPath, 'utf-8'); + + if (page.fileExtension.toLowerCase() === '.md') { + content = await markdownToHtml(content); + } + + const htmlContent = + await this.importAttachmentService.processAttachments({ + html: content, + pageRelativePath: page.filePath, + extractDir, + pageId: page.id, + fileTask, + attachmentCandidates, + }); + + const { html, backlinks, pageIcon } = await formatImportHtml({ + html: htmlContent, + currentFilePath: page.filePath, + filePathToPageMetaMap: filePathToPageMetaMap, + creatorId: fileTask.creatorId, + sourcePageId: page.id, + workspaceId: fileTask.workspaceId, + }); + + const pmState = getProsemirrorContent( + await this.importService.processHTML(html), + ); + + const { title, prosemirrorJson } = + this.importService.extractTitleAndRemoveHeading(pmState); + + const insertablePage: InsertablePage = { + id: page.id, + slugId: page.slugId, + title: title || page.name, + icon: pageIcon || null, + content: prosemirrorJson, + textContent: jsonToText(prosemirrorJson), + ydoc: await this.importService.createYdoc(prosemirrorJson), + position: page.position!, + spaceId: fileTask.spaceId, + workspaceId: fileTask.workspaceId, + creatorId: fileTask.creatorId, + lastUpdatedById: fileTask.creatorId, + parentPageId: page.parentPageId, + }; + + await trx.insertInto('pages').values(insertablePage).execute(); + + // Track valid page IDs and collect backlinks + validPageIds.add(insertablePage.id); + allBacklinks.push(...backlinks); + totalPagesProcessed++; + + // Log progress periodically + if (totalPagesProcessed % 50 === 0) { + this.logger.debug(`Processed ${totalPagesProcessed} pages...`); + } + } + } + + const filteredBacklinks = allBacklinks.filter( + ({ sourcePageId, targetPageId }) => + validPageIds.has(sourcePageId) && validPageIds.has(targetPageId), ); - const { title, prosemirrorJson } = - this.importService.extractTitleAndRemoveHeading(pmState); + // Insert backlinks in batches + if (filteredBacklinks.length > 0) { + const BACKLINK_BATCH_SIZE = 100; + for ( + let i = 0; + i < filteredBacklinks.length; + i += BACKLINK_BATCH_SIZE + ) { + const backlinkChunk = filteredBacklinks.slice( + i, + Math.min(i + BACKLINK_BATCH_SIZE, filteredBacklinks.length), + ); + await this.backlinkRepo.insertBacklink(backlinkChunk, trx); + } + } - const insertablePage: InsertablePage = { - id: page.id, - slugId: page.slugId, - title: title || page.name, - icon: pageIcon || null, - content: prosemirrorJson, - textContent: jsonToText(prosemirrorJson), - ydoc: await this.importService.createYdoc(prosemirrorJson), - position: page.position!, - spaceId: fileTask.spaceId, - workspaceId: fileTask.workspaceId, - creatorId: fileTask.creatorId, - lastUpdatedById: fileTask.creatorId, - parentPageId: page.parentPageId, - }; - - return { insertablePage, backlinks }; - }), - ); - - const insertablePages = pageResults.map((r) => r.insertablePage); - const insertableBacklinks = pageResults.flatMap((r) => r.backlinks); - - if (insertablePages.length < 1) return; - const validPageIds = new Set(insertablePages.map((row) => row.id)); - const filteredBacklinks = insertableBacklinks.filter( - ({ sourcePageId, targetPageId }) => - validPageIds.has(sourcePageId) && validPageIds.has(targetPageId), - ); - - await executeTx(this.db, async (trx) => { - await trx.insertInto('pages').values(insertablePages).execute(); - - if (filteredBacklinks.length > 0) { - await this.backlinkRepo.insertBacklink(filteredBacklinks, trx); - } - }); + this.logger.log( + `Successfully imported ${totalPagesProcessed} pages with ${filteredBacklinks.length} backlinks`, + ); + }); + } catch (error) { + this.logger.error('Failed to import files:', error); + throw new Error(`File import failed: ${error?.['message']}`); + } } async getFileTask(fileTaskId: string) { diff --git a/apps/server/src/integrations/import/services/import-attachment.service.ts b/apps/server/src/integrations/import/services/import-attachment.service.ts index 04a18fda..43f4d3fa 100644 --- a/apps/server/src/integrations/import/services/import-attachment.service.ts +++ b/apps/server/src/integrations/import/services/import-attachment.service.ts @@ -35,7 +35,7 @@ interface DrawioPair { @Injectable() export class ImportAttachmentService { private readonly logger = new Logger(ImportAttachmentService.name); - private readonly CONCURRENT_UPLOADS = 1; + private readonly CONCURRENT_UPLOADS = 3; private readonly MAX_RETRIES = 2; private readonly RETRY_DELAY = 2000; diff --git a/apps/server/src/integrations/import/utils/import.utils.ts b/apps/server/src/integrations/import/utils/import.utils.ts index 19cc66bc..1fa10d7a 100644 --- a/apps/server/src/integrations/import/utils/import.utils.ts +++ b/apps/server/src/integrations/import/utils/import.utils.ts @@ -64,3 +64,9 @@ export async function collectMarkdownAndHtmlFiles( await walk(dir); return results; } + +export function stripNotionID(fileName: string): string { + // Handle optional separator (space or dash) + 32 alphanumeric chars at end + const notionIdPattern = /[ -]?[a-z0-9]{32}$/i; + return fileName.replace(notionIdPattern, '').trim(); +} diff --git a/packages/editor-ext/src/lib/table/cell.ts b/packages/editor-ext/src/lib/table/cell.ts index 25a311b9..63df7dcf 100644 --- a/packages/editor-ext/src/lib/table/cell.ts +++ b/packages/editor-ext/src/lib/table/cell.ts @@ -2,33 +2,39 @@ import { TableCell as TiptapTableCell } from "@tiptap/extension-table-cell"; export const TableCell = TiptapTableCell.extend({ name: "tableCell", - content: "(paragraph | heading | bulletList | orderedList | taskList | blockquote | callout | image | video | attachment | mathBlock | details | codeBlock)+", - + content: + "(paragraph | heading | bulletList | orderedList | taskList | blockquote | callout | image | video | attachment | mathBlock | details | codeBlock)+", + addAttributes() { return { ...this.parent?.(), backgroundColor: { default: null, - parseHTML: (element) => element.style.backgroundColor || null, + parseHTML: (element) => + element.style.backgroundColor || + element.getAttribute("data-background-color") || + null, renderHTML: (attributes) => { if (!attributes.backgroundColor) { return {}; } return { style: `background-color: ${attributes.backgroundColor}`, - 'data-background-color': attributes.backgroundColor, + "data-background-color": attributes.backgroundColor, }; }, }, backgroundColorName: { default: null, - parseHTML: (element) => element.getAttribute('data-background-color-name') || null, + parseHTML: (element) => + element.getAttribute("data-background-color-name") || null, renderHTML: (attributes) => { if (!attributes.backgroundColorName) { return {}; } return { - 'data-background-color-name': attributes.backgroundColorName.toLowerCase(), + "data-background-color-name": + attributes.backgroundColorName.toLowerCase(), }; }, }, diff --git a/packages/editor-ext/src/lib/table/header.ts b/packages/editor-ext/src/lib/table/header.ts index 399a8cf0..501f089d 100644 --- a/packages/editor-ext/src/lib/table/header.ts +++ b/packages/editor-ext/src/lib/table/header.ts @@ -2,36 +2,42 @@ import { TableHeader as TiptapTableHeader } from "@tiptap/extension-table-header export const TableHeader = TiptapTableHeader.extend({ name: "tableHeader", - content: "(paragraph | heading | bulletList | orderedList | taskList | blockquote | callout | image | video | attachment | mathBlock | details | codeBlock)+", - + content: + "(paragraph | heading | bulletList | orderedList | taskList | blockquote | callout | image | video | attachment | mathBlock | details | codeBlock)+", + addAttributes() { return { ...this.parent?.(), backgroundColor: { default: null, - parseHTML: (element) => element.style.backgroundColor || null, + parseHTML: (element) => + element.style.backgroundColor || + element.getAttribute("data-background-color") || + null, renderHTML: (attributes) => { if (!attributes.backgroundColor) { return {}; } return { style: `background-color: ${attributes.backgroundColor}`, - 'data-background-color': attributes.backgroundColor, + "data-background-color": attributes.backgroundColor, }; }, }, backgroundColorName: { default: null, - parseHTML: (element) => element.getAttribute('data-background-color-name') || null, + parseHTML: (element) => + element.getAttribute("data-background-color-name") || null, renderHTML: (attributes) => { if (!attributes.backgroundColorName) { return {}; } return { - 'data-background-color-name': attributes.backgroundColorName.toLowerCase(), + "data-background-color-name": + attributes.backgroundColorName.toLowerCase(), }; }, }, }; }, -}); \ No newline at end of file +});