fix: enhance page import (#1570)

* change import process

* fix processor

* fix page name in notion import

* preserve confluence table bg color

* sync
This commit is contained in:
Philip Okugbe
2025-09-17 23:50:27 +01:00
committed by GitHub
parent 46669fea56
commit 9ac180f719
6 changed files with 199 additions and 107 deletions

View File

@ -47,15 +47,23 @@ export class FileTaskProcessor extends WorkerHost implements OnModuleDestroy {
await this.handleFailedJob(job); await this.handleFailedJob(job);
} }
@OnWorkerEvent('stalled') @OnWorkerEvent('completed')
async onStalled(job: Job) { async onCompleted(job: Job) {
this.logger.error( this.logger.log(
`Job ${job.name} stalled. . Import Task ID: ${job.data.fileTaskId}.. Job ID: ${job.id}`, `Completed ${job.name} job for File task ID ${job.data.fileTaskId}`,
); );
// Set failedReason for stalled jobs since it's not automatically set try {
job.failedReason = 'Job stalled and was marked as failed'; const fileTask = await this.fileTaskService.getFileTask(
await this.handleFailedJob(job); job.data.fileTaskId,
);
if (fileTask) {
await this.storageService.delete(fileTask.filePath);
this.logger.debug(`Deleted imported zip file: ${fileTask.filePath}`);
}
} catch (err) {
this.logger.error(`Failed to delete imported zip file:`, err);
}
} }
private async handleFailedJob(job: Job) { private async handleFailedJob(job: Job) {
@ -78,25 +86,6 @@ export class FileTaskProcessor extends WorkerHost implements OnModuleDestroy {
} }
} }
@OnWorkerEvent('completed')
async onCompleted(job: Job) {
this.logger.log(
`Completed ${job.name} job for File task ID ${job.data.fileTaskId}`,
);
try {
const fileTask = await this.fileTaskService.getFileTask(
job.data.fileTaskId,
);
if (fileTask) {
await this.storageService.delete(fileTask.filePath);
this.logger.debug(`Deleted imported zip file: ${fileTask.filePath}`);
}
} catch (err) {
this.logger.error(`Failed to delete imported zip file:`, err);
}
}
async onModuleDestroy(): Promise<void> { async onModuleDestroy(): Promise<void> {
if (this.worker) { if (this.worker) {
await this.worker.close(); await this.worker.close();

View File

@ -24,6 +24,7 @@ import { formatImportHtml } from '../utils/import-formatter';
import { import {
buildAttachmentCandidates, buildAttachmentCandidates,
collectMarkdownAndHtmlFiles, collectMarkdownAndHtmlFiles,
stripNotionID,
} from '../utils/import.utils'; } from '../utils/import.utils';
import { executeTx } from '@docmost/db/utils'; import { executeTx } from '@docmost/db/utils';
import { BacklinkRepo } from '@docmost/db/repos/backlink/backlink.repo'; import { BacklinkRepo } from '@docmost/db/repos/backlink/backlink.repo';
@ -159,17 +160,12 @@ export class FileImportTaskService {
.split(path.sep) .split(path.sep)
.join('/'); // normalize to forward-slashes .join('/'); // normalize to forward-slashes
const ext = path.extname(relPath).toLowerCase(); const ext = path.extname(relPath).toLowerCase();
let content = await fs.readFile(absPath, 'utf-8');
if (ext.toLowerCase() === '.md') {
content = await markdownToHtml(content);
}
pagesMap.set(relPath, { pagesMap.set(relPath, {
id: v7(), id: v7(),
slugId: generateSlugId(), slugId: generateSlugId(),
name: path.basename(relPath, ext), name: stripNotionID(path.basename(relPath, ext)),
content, content: '',
parentPageId: null, parentPageId: null,
fileExtension: ext, fileExtension: ext,
filePath: relPath, filePath: relPath,
@ -254,71 +250,160 @@ export class FileImportTaskService {
}); });
}); });
const pageResults = await Promise.all( // Group pages by level (topological sort for parent-child relationships)
Array.from(pagesMap.values()).map(async (page) => { const pagesByLevel = new Map<number, Array<[string, ImportPageNode]>>();
const htmlContent = const pageLevel = new Map<string, number>();
await this.importAttachmentService.processAttachments({
html: page.content,
pageRelativePath: page.filePath,
extractDir,
pageId: page.id,
fileTask,
attachmentCandidates,
});
const { html, backlinks, pageIcon } = await formatImportHtml({ // Calculate levels using BFS
html: htmlContent, const calculateLevels = () => {
currentFilePath: page.filePath, const queue: Array<{ filePath: string; level: number }> = [];
filePathToPageMetaMap: filePathToPageMetaMap,
creatorId: fileTask.creatorId,
sourcePageId: page.id,
workspaceId: fileTask.workspaceId,
});
const pmState = getProsemirrorContent( // Start with root pages (no parent)
await this.importService.processHTML(html), for (const [filePath, page] of pagesMap.entries()) {
if (!page.parentPageId) {
queue.push({ filePath, level: 0 });
pageLevel.set(filePath, 0);
}
}
// BFS to assign levels
while (queue.length > 0) {
const { filePath, level } = queue.shift()!;
const currentPage = pagesMap.get(filePath)!;
// Find children of current page
for (const [childFilePath, childPage] of pagesMap.entries()) {
if (
childPage.parentPageId === currentPage.id &&
!pageLevel.has(childFilePath)
) {
pageLevel.set(childFilePath, level + 1);
queue.push({ filePath: childFilePath, level: level + 1 });
}
}
}
// Group pages by level
for (const [filePath, page] of pagesMap.entries()) {
const level = pageLevel.get(filePath) || 0;
if (!pagesByLevel.has(level)) {
pagesByLevel.set(level, []);
}
pagesByLevel.get(level)!.push([filePath, page]);
}
};
calculateLevels();
if (pagesMap.size < 1) return;
// Process pages level by level sequentially to respect foreign key constraints
const allBacklinks: any[] = [];
const validPageIds = new Set<string>();
let totalPagesProcessed = 0;
// Sort levels to process in order
const sortedLevels = Array.from(pagesByLevel.keys()).sort((a, b) => a - b);
try {
await executeTx(this.db, async (trx) => {
// Process pages level by level sequentially within the transaction
for (const level of sortedLevels) {
const levelPages = pagesByLevel.get(level)!;
for (const [filePath, page] of levelPages) {
const absPath = path.join(extractDir, filePath);
let content = await fs.readFile(absPath, 'utf-8');
if (page.fileExtension.toLowerCase() === '.md') {
content = await markdownToHtml(content);
}
const htmlContent =
await this.importAttachmentService.processAttachments({
html: content,
pageRelativePath: page.filePath,
extractDir,
pageId: page.id,
fileTask,
attachmentCandidates,
});
const { html, backlinks, pageIcon } = await formatImportHtml({
html: htmlContent,
currentFilePath: page.filePath,
filePathToPageMetaMap: filePathToPageMetaMap,
creatorId: fileTask.creatorId,
sourcePageId: page.id,
workspaceId: fileTask.workspaceId,
});
const pmState = getProsemirrorContent(
await this.importService.processHTML(html),
);
const { title, prosemirrorJson } =
this.importService.extractTitleAndRemoveHeading(pmState);
const insertablePage: InsertablePage = {
id: page.id,
slugId: page.slugId,
title: title || page.name,
icon: pageIcon || null,
content: prosemirrorJson,
textContent: jsonToText(prosemirrorJson),
ydoc: await this.importService.createYdoc(prosemirrorJson),
position: page.position!,
spaceId: fileTask.spaceId,
workspaceId: fileTask.workspaceId,
creatorId: fileTask.creatorId,
lastUpdatedById: fileTask.creatorId,
parentPageId: page.parentPageId,
};
await trx.insertInto('pages').values(insertablePage).execute();
// Track valid page IDs and collect backlinks
validPageIds.add(insertablePage.id);
allBacklinks.push(...backlinks);
totalPagesProcessed++;
// Log progress periodically
if (totalPagesProcessed % 50 === 0) {
this.logger.debug(`Processed ${totalPagesProcessed} pages...`);
}
}
}
const filteredBacklinks = allBacklinks.filter(
({ sourcePageId, targetPageId }) =>
validPageIds.has(sourcePageId) && validPageIds.has(targetPageId),
); );
const { title, prosemirrorJson } = // Insert backlinks in batches
this.importService.extractTitleAndRemoveHeading(pmState); if (filteredBacklinks.length > 0) {
const BACKLINK_BATCH_SIZE = 100;
for (
let i = 0;
i < filteredBacklinks.length;
i += BACKLINK_BATCH_SIZE
) {
const backlinkChunk = filteredBacklinks.slice(
i,
Math.min(i + BACKLINK_BATCH_SIZE, filteredBacklinks.length),
);
await this.backlinkRepo.insertBacklink(backlinkChunk, trx);
}
}
const insertablePage: InsertablePage = { this.logger.log(
id: page.id, `Successfully imported ${totalPagesProcessed} pages with ${filteredBacklinks.length} backlinks`,
slugId: page.slugId, );
title: title || page.name, });
icon: pageIcon || null, } catch (error) {
content: prosemirrorJson, this.logger.error('Failed to import files:', error);
textContent: jsonToText(prosemirrorJson), throw new Error(`File import failed: ${error?.['message']}`);
ydoc: await this.importService.createYdoc(prosemirrorJson), }
position: page.position!,
spaceId: fileTask.spaceId,
workspaceId: fileTask.workspaceId,
creatorId: fileTask.creatorId,
lastUpdatedById: fileTask.creatorId,
parentPageId: page.parentPageId,
};
return { insertablePage, backlinks };
}),
);
const insertablePages = pageResults.map((r) => r.insertablePage);
const insertableBacklinks = pageResults.flatMap((r) => r.backlinks);
if (insertablePages.length < 1) return;
const validPageIds = new Set(insertablePages.map((row) => row.id));
const filteredBacklinks = insertableBacklinks.filter(
({ sourcePageId, targetPageId }) =>
validPageIds.has(sourcePageId) && validPageIds.has(targetPageId),
);
await executeTx(this.db, async (trx) => {
await trx.insertInto('pages').values(insertablePages).execute();
if (filteredBacklinks.length > 0) {
await this.backlinkRepo.insertBacklink(filteredBacklinks, trx);
}
});
} }
async getFileTask(fileTaskId: string) { async getFileTask(fileTaskId: string) {

View File

@ -35,7 +35,7 @@ interface DrawioPair {
@Injectable() @Injectable()
export class ImportAttachmentService { export class ImportAttachmentService {
private readonly logger = new Logger(ImportAttachmentService.name); private readonly logger = new Logger(ImportAttachmentService.name);
private readonly CONCURRENT_UPLOADS = 1; private readonly CONCURRENT_UPLOADS = 3;
private readonly MAX_RETRIES = 2; private readonly MAX_RETRIES = 2;
private readonly RETRY_DELAY = 2000; private readonly RETRY_DELAY = 2000;

View File

@ -64,3 +64,9 @@ export async function collectMarkdownAndHtmlFiles(
await walk(dir); await walk(dir);
return results; return results;
} }
/**
 * Removes the trailing Notion page ID from an exported file name.
 *
 * Notion exports append the page's UUID with the dashes removed (32
 * hexadecimal characters) to each file name, optionally preceded by a single
 * space or dash, e.g. `My Page 0123456789abcdef0123456789abcdef`.
 *
 * Only hexadecimal suffixes are stripped. Matching any 32 alphanumeric
 * characters would also truncate ordinary long names that contain no spaces
 * or dashes, which matters because this helper runs on every imported file,
 * not just Notion exports.
 *
 * @param fileName - File base name without its extension.
 * @returns The name with the ID suffix (and its separator) removed and
 *   surrounding whitespace trimmed; the trimmed input when no ID is present.
 */
export function stripNotionID(fileName: string): string {
  // Optional separator (space or dash) + 32 hex chars anchored at the end.
  const notionIdPattern = /[ -]?[0-9a-f]{32}$/i;
  return fileName.replace(notionIdPattern, '').trim();
}

View File

@ -2,33 +2,39 @@ import { TableCell as TiptapTableCell } from "@tiptap/extension-table-cell";
export const TableCell = TiptapTableCell.extend({ export const TableCell = TiptapTableCell.extend({
name: "tableCell", name: "tableCell",
content: "(paragraph | heading | bulletList | orderedList | taskList | blockquote | callout | image | video | attachment | mathBlock | details | codeBlock)+", content:
"(paragraph | heading | bulletList | orderedList | taskList | blockquote | callout | image | video | attachment | mathBlock | details | codeBlock)+",
addAttributes() { addAttributes() {
return { return {
...this.parent?.(), ...this.parent?.(),
backgroundColor: { backgroundColor: {
default: null, default: null,
parseHTML: (element) => element.style.backgroundColor || null, parseHTML: (element) =>
element.style.backgroundColor ||
element.getAttribute("data-background-color") ||
null,
renderHTML: (attributes) => { renderHTML: (attributes) => {
if (!attributes.backgroundColor) { if (!attributes.backgroundColor) {
return {}; return {};
} }
return { return {
style: `background-color: ${attributes.backgroundColor}`, style: `background-color: ${attributes.backgroundColor}`,
'data-background-color': attributes.backgroundColor, "data-background-color": attributes.backgroundColor,
}; };
}, },
}, },
backgroundColorName: { backgroundColorName: {
default: null, default: null,
parseHTML: (element) => element.getAttribute('data-background-color-name') || null, parseHTML: (element) =>
element.getAttribute("data-background-color-name") || null,
renderHTML: (attributes) => { renderHTML: (attributes) => {
if (!attributes.backgroundColorName) { if (!attributes.backgroundColorName) {
return {}; return {};
} }
return { return {
'data-background-color-name': attributes.backgroundColorName.toLowerCase(), "data-background-color-name":
attributes.backgroundColorName.toLowerCase(),
}; };
}, },
}, },

View File

@ -2,36 +2,42 @@ import { TableHeader as TiptapTableHeader } from "@tiptap/extension-table-header
export const TableHeader = TiptapTableHeader.extend({ export const TableHeader = TiptapTableHeader.extend({
name: "tableHeader", name: "tableHeader",
content: "(paragraph | heading | bulletList | orderedList | taskList | blockquote | callout | image | video | attachment | mathBlock | details | codeBlock)+", content:
"(paragraph | heading | bulletList | orderedList | taskList | blockquote | callout | image | video | attachment | mathBlock | details | codeBlock)+",
addAttributes() { addAttributes() {
return { return {
...this.parent?.(), ...this.parent?.(),
backgroundColor: { backgroundColor: {
default: null, default: null,
parseHTML: (element) => element.style.backgroundColor || null, parseHTML: (element) =>
element.style.backgroundColor ||
element.getAttribute("data-background-color") ||
null,
renderHTML: (attributes) => { renderHTML: (attributes) => {
if (!attributes.backgroundColor) { if (!attributes.backgroundColor) {
return {}; return {};
} }
return { return {
style: `background-color: ${attributes.backgroundColor}`, style: `background-color: ${attributes.backgroundColor}`,
'data-background-color': attributes.backgroundColor, "data-background-color": attributes.backgroundColor,
}; };
}, },
}, },
backgroundColorName: { backgroundColorName: {
default: null, default: null,
parseHTML: (element) => element.getAttribute('data-background-color-name') || null, parseHTML: (element) =>
element.getAttribute("data-background-color-name") || null,
renderHTML: (attributes) => { renderHTML: (attributes) => {
if (!attributes.backgroundColorName) { if (!attributes.backgroundColorName) {
return {}; return {};
} }
return { return {
'data-background-color-name': attributes.backgroundColorName.toLowerCase(), "data-background-color-name":
attributes.backgroundColorName.toLowerCase(),
}; };
}, },
}, },
}; };
}, },
}); });