Philipinho
2025-05-29 19:51:45 -07:00
parent 95f24d9ba5
commit 9ae81b9817
11 changed files with 582 additions and 538 deletions

View File

@@ -1,513 +0,0 @@
import { Injectable, Logger } from '@nestjs/common';
import * as path from 'path';
import { jsonToText } from '../../collaboration/collaboration.util';
import { InjectKysely } from 'nestjs-kysely';
import { KyselyDB } from '@docmost/db/types/kysely.types';
import { cleanUrlString, extractZip, FileTaskStatus } from './file.utils';
import { StorageService } from '../storage/storage.service';
import * as tmp from 'tmp-promise';
import { pipeline } from 'node:stream/promises';
import { createReadStream, createWriteStream } from 'node:fs';
import { ImportService } from './import.service';
import { promises as fs } from 'fs';
import {
generateSlugId,
getMimeType,
sanitizeFileName,
} from '../../common/helpers';
import { v7 } from 'uuid';
import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';
import { FileTask, InsertablePage } from '@docmost/db/types/entity.types';
import { markdownToHtml } from '@docmost/editor-ext';
import { getAttachmentFolderPath } from '../../core/attachment/attachment.utils';
import { AttachmentType } from '../../core/attachment/attachment.constants';
import { getProsemirrorContent } from '../../common/helpers/prosemirror/utils';
import { formatImportHtml, unwrapFromParagraph } from './import-formatter';
import {
buildAttachmentCandidates,
collectMarkdownAndHtmlFiles,
resolveRelativeAttachmentPath,
} from './import.utils';
import { executeTx } from '@docmost/db/utils';
import { BacklinkRepo } from '@docmost/db/repos/backlink/backlink.repo';
import { load } from 'cheerio';
@Injectable()
export class FileTaskService {
private readonly logger = new Logger(FileTaskService.name);
constructor(
private readonly storageService: StorageService,
private readonly importService: ImportService,
private readonly backlinkRepo: BacklinkRepo,
@InjectKysely() private readonly db: KyselyDB,
) {}
async processZipImport(fileTaskId: string): Promise<void> {
const fileTask = await this.db
.selectFrom('fileTasks')
.selectAll()
.where('id', '=', fileTaskId)
.executeTakeFirst();
if (!fileTask) {
this.logger.log(`Import file task with ID ${fileTaskId} not found`);
return;
}
if (fileTask.status === FileTaskStatus.Success) {
this.logger.log('Import task already processed.');
return;
}
const { path: tmpZipPath, cleanup: cleanupTmpFile } = await tmp.file({
prefix: 'docmost-import',
postfix: '.zip',
discardDescriptor: true,
});
const { path: tmpExtractDir, cleanup: cleanupTmpDir } = await tmp.dir({
prefix: 'docmost-extract-',
unsafeCleanup: true,
});
const fileStream = await this.storageService.readStream(fileTask.filePath);
await pipeline(fileStream, createWriteStream(tmpZipPath));
await extractZip(tmpZipPath, tmpExtractDir);
try {
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Processing);
// if type == generic
if (fileTask.source === 'generic') {
await this.processGenericImport({
extractDir: tmpExtractDir,
fileTask,
});
}
/*
if (fileTask.source === 'confluence') {
await this.processConfluenceImport({
extractDir: tmpExtractDir,
fileTask,
});
}*/
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Success);
} catch (error) {
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Failed);
this.logger.error(error);
} finally {
await cleanupTmpFile();
await cleanupTmpDir();
}
}
async processGenericImport(opts: {
extractDir: string;
fileTask: FileTask;
}): Promise<void> {
const { extractDir, fileTask } = opts;
const allFiles = await collectMarkdownAndHtmlFiles(extractDir);
const attachmentCandidates = await buildAttachmentCandidates(extractDir);
const pagesMap = new Map<
string,
{
id: string;
slugId: string;
name: string;
content: string;
position?: string | null;
parentPageId: string | null;
fileExtension: string;
filePath: string;
}
>();
for (const absPath of allFiles) {
const relPath = path
.relative(extractDir, absPath)
.split(path.sep)
.join('/'); // normalize to forward-slashes
const ext = path.extname(relPath).toLowerCase();
let content = await fs.readFile(absPath, 'utf-8');
if (ext === '.md') {
content = await markdownToHtml(content);
}
pagesMap.set(relPath, {
id: v7(),
slugId: generateSlugId(),
name: path.basename(relPath, ext),
content,
parentPageId: null,
fileExtension: ext,
filePath: relPath,
});
}
// parent/child linking
pagesMap.forEach((page, filePath) => {
const segments = filePath.split('/');
segments.pop();
let parentPage = null;
while (segments.length) {
const tryMd = segments.join('/') + '.md';
const tryHtml = segments.join('/') + '.html';
if (pagesMap.has(tryMd)) {
parentPage = pagesMap.get(tryMd)!;
break;
}
if (pagesMap.has(tryHtml)) {
parentPage = pagesMap.get(tryHtml)!;
break;
}
segments.pop();
}
if (parentPage) page.parentPageId = parentPage.id;
});
// generate position keys
const siblingsMap = new Map<string | null, any[]>();
pagesMap.forEach((page) => {
const sibs = siblingsMap.get(page.parentPageId) || [];
sibs.push(page);
siblingsMap.set(page.parentPageId, sibs);
});
siblingsMap.forEach((sibs) => {
sibs.sort((a, b) => a.name.localeCompare(b.name));
let prevPos: string | null = null;
for (const page of sibs) {
page.position = generateJitteredKeyBetween(prevPos, null);
prevPos = page.position;
}
});
const filePathToPageMetaMap = new Map<
string,
{ id: string; title: string; slugId: string }
>();
pagesMap.forEach((page) => {
filePathToPageMetaMap.set(page.filePath, {
id: page.id,
title: page.name,
slugId: page.slugId,
});
});
const pageResults = await Promise.all(
Array.from(pagesMap.values()).map(async (page) => {
const htmlContent = await this.rewriteLocalFilesInHtml({
html: page.content,
pageRelativePath: page.filePath,
extractDir,
pageId: page.id,
fileTask,
attachmentCandidates,
});
const { html, backlinks } = await formatImportHtml({
html: htmlContent,
currentFilePath: page.filePath,
filePathToPageMetaMap: filePathToPageMetaMap,
creatorId: fileTask.creatorId,
sourcePageId: page.id,
workspaceId: fileTask.workspaceId,
});
const pmState = getProsemirrorContent(
await this.importService.processHTML(html),
);
const { title, prosemirrorJson } =
this.importService.extractTitleAndRemoveHeading(pmState);
const insertablePage: InsertablePage = {
id: page.id,
slugId: page.slugId,
title: title || page.name,
content: prosemirrorJson,
textContent: jsonToText(prosemirrorJson),
ydoc: await this.importService.createYdoc(prosemirrorJson),
position: page.position!,
spaceId: fileTask.spaceId,
workspaceId: fileTask.workspaceId,
creatorId: fileTask.creatorId,
lastUpdatedById: fileTask.creatorId,
parentPageId: page.parentPageId,
};
return { insertablePage, backlinks };
}),
);
const insertablePages = pageResults.map((r) => r.insertablePage);
const insertableBacklinks = pageResults.flatMap((r) => r.backlinks);
if (insertablePages.length < 1) return;
const validPageIds = new Set(insertablePages.map((row) => row.id));
const filteredBacklinks = insertableBacklinks.filter(
({ sourcePageId, targetPageId }) =>
validPageIds.has(sourcePageId) && validPageIds.has(targetPageId),
);
await executeTx(this.db, async (trx) => {
await trx.insertInto('pages').values(insertablePages).execute();
if (filteredBacklinks.length > 0) {
await this.backlinkRepo.insertBacklink(filteredBacklinks, trx);
}
});
}
async rewriteLocalFilesInHtml(opts: {
html: string;
pageRelativePath: string;
extractDir: string;
pageId: string;
fileTask: FileTask;
attachmentCandidates: Map<string, string>;
}): Promise<string> {
const {
html,
pageRelativePath,
extractDir,
pageId,
fileTask,
attachmentCandidates,
} = opts;
const attachmentTasks: Promise<void>[] = [];
const processFile = (relPath: string) => {
const abs = attachmentCandidates.get(relPath)!;
const attachmentId = v7();
const ext = path.extname(abs);
const fileNameWithExt =
sanitizeFileName(path.basename(abs, ext)) + ext.toLowerCase();
const storageFilePath = `${getAttachmentFolderPath(AttachmentType.File, fileTask.workspaceId)}/${attachmentId}/${fileNameWithExt}`;
const apiFilePath = `/api/files/${attachmentId}/${fileNameWithExt}`;
attachmentTasks.push(
(async () => {
const fileStream = createReadStream(abs);
await this.storageService.uploadStream(storageFilePath, fileStream);
const stat = await fs.stat(abs);
await this.db
.insertInto('attachments')
.values({
id: attachmentId,
filePath: storageFilePath,
fileName: fileNameWithExt,
fileSize: stat.size,
mimeType: getMimeType(fileNameWithExt),
type: 'file',
fileExt: ext,
creatorId: fileTask.creatorId,
workspaceId: fileTask.workspaceId,
pageId,
spaceId: fileTask.spaceId,
})
.execute();
})(),
);
return {
attachmentId,
storageFilePath,
apiFilePath,
fileNameWithExt,
abs,
};
};
const pageDir = path.dirname(pageRelativePath);
const $ = load(html);
// image
for (const imgEl of $('img').toArray()) {
const $img = $(imgEl);
const src = cleanUrlString($img.attr('src') ?? '')!;
if (!src || src.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
src,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const width = $img.attr('width') ?? '100%';
const align = $img.attr('data-align') ?? 'center';
$img
.attr('src', apiFilePath)
.attr('data-attachment-id', attachmentId)
.attr('data-size', stat.size.toString())
.attr('width', width)
.attr('data-align', align);
unwrapFromParagraph($, $img);
}
// video
for (const vidEl of $('video').toArray()) {
const $vid = $(vidEl);
const src = cleanUrlString($vid.attr('src') ?? '')!;
if (!src || src.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
src,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const width = $vid.attr('width') ?? '100%';
const align = $vid.attr('data-align') ?? 'center';
$vid
.attr('src', apiFilePath)
.attr('data-attachment-id', attachmentId)
.attr('data-size', stat.size.toString())
.attr('width', width)
.attr('data-align', align);
unwrapFromParagraph($, $vid);
}
for (const el of $('div[data-type="attachment"]').toArray()) {
const $oldDiv = $(el);
const rawUrl = cleanUrlString($oldDiv.attr('data-attachment-url') ?? '')!;
if (!rawUrl || rawUrl.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
rawUrl,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const fileName = path.basename(abs);
const mime = getMimeType(abs);
const $newDiv = $('<div>')
.attr('data-type', 'attachment')
.attr('data-attachment-url', apiFilePath)
.attr('data-attachment-name', fileName)
.attr('data-attachment-mime', mime)
.attr('data-attachment-size', stat.size.toString())
.attr('data-attachment-id', attachmentId);
$oldDiv.replaceWith($newDiv);
unwrapFromParagraph($, $newDiv);
}
// rewrite other attachments via <a>
for (const aEl of $('a').toArray()) {
const $a = $(aEl);
const href = cleanUrlString($a.attr('href') ?? '')!;
if (!href || href.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
href,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const ext = path.extname(relPath).toLowerCase();
if (ext === '.mp4') {
const $video = $('<video>')
.attr('src', apiFilePath)
.attr('data-attachment-id', attachmentId)
.attr('data-size', stat.size.toString())
.attr('width', '100%')
.attr('data-align', 'center');
$a.replaceWith($video);
unwrapFromParagraph($, $video);
} else {
// build attachment <div>
const confAliasName = $a.attr('data-linked-resource-default-alias');
let attachmentName = path.basename(abs);
if (confAliasName) attachmentName = confAliasName;
const $div = $('<div>')
.attr('data-type', 'attachment')
.attr('data-attachment-url', apiFilePath)
.attr('data-attachment-name', attachmentName)
.attr('data-attachment-mime', getMimeType(abs))
.attr('data-attachment-size', stat.size.toString())
.attr('data-attachment-id', attachmentId);
$a.replaceWith($div);
unwrapFromParagraph($, $div);
}
}
// excalidraw and drawio
for (const type of ['excalidraw', 'drawio'] as const) {
for (const el of $(`div[data-type="${type}"]`).toArray()) {
const $oldDiv = $(el);
const rawSrc = cleanUrlString($oldDiv.attr('data-src') ?? '')!;
if (!rawSrc || rawSrc.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
rawSrc,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const fileName = path.basename(abs);
const width = $oldDiv.attr('data-width') || '100%';
const align = $oldDiv.attr('data-align') || 'center';
const $newDiv = $('<div>')
.attr('data-type', type)
.attr('data-src', apiFilePath)
.attr('data-title', fileName)
.attr('data-width', width)
.attr('data-size', stat.size.toString())
.attr('data-align', align)
.attr('data-attachment-id', attachmentId);
$oldDiv.replaceWith($newDiv);
unwrapFromParagraph($, $newDiv);
}
}
// wait for all uploads & DB inserts
await Promise.all(attachmentTasks);
return $.root().html() || '';
}
async updateTaskStatus(fileTaskId: string, status: FileTaskStatus) {
await this.db
.updateTable('fileTasks')
.set({ status })
.where('id', '=', fileTaskId)
.execute();
}
}
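
For context, the sibling-ordering pass above (carried over unchanged into the new service further down) relies on fractional index keys, so inserting a page between two siblings never forces renumbering. A minimal sketch of that property, using the same generateJitteredKeyBetween(prev, next) API the service calls:

import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';

// Appending: each new key sorts lexicographically after the previous one.
const first = generateJitteredKeyBetween(null, null);
const second = generateJitteredKeyBetween(first, null);
const third = generateJitteredKeyBetween(second, null);

// Inserting between two existing siblings touches no other keys:
const between = generateJitteredKeyBetween(first, second);
// Lexicographic sort order is now: first, between, second, third.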

View File

@@ -21,7 +21,7 @@ import {
import { FileInterceptor } from '../../common/interceptors/file.interceptor';
import * as bytes from 'bytes';
import * as path from 'path';
import { ImportService } from './import.service';
import { ImportService } from './services/import.service';
import { AuthWorkspace } from '../../common/decorators/auth-workspace.decorator';
@Controller()

View File

@@ -1,12 +1,19 @@
import { Module } from '@nestjs/common';
import { ImportService } from './import.service';
import { ImportService } from './services/import.service';
import { ImportController } from './import.controller';
import { StorageModule } from '../storage/storage.module';
import { FileTaskService } from './file-task.service';
import { FileTaskService } from './services/file-task.service';
import { FileTaskProcessor } from './processors/file-task.processor';
import { ImportAttachmentService } from './services/import-attachment.service';
@Module({
providers: [ImportService, FileTaskService, FileTaskProcessor],
providers: [
ImportService,
FileTaskService,
FileTaskProcessor,
ImportAttachmentService,
],
exports: [ImportService, ImportAttachmentService],
controllers: [ImportController],
imports: [StorageModule],
})
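
Since ImportAttachmentService is now both provided and exported, any module that imports this one can inject it directly. A minimal sketch of a hypothetical consumer; the ImportModule class name and the consumer module are assumptions for illustration, as the export statement sits outside this hunk:

import { Injectable, Module } from '@nestjs/common';
import { ImportModule } from '../import/import.module';
import { ImportAttachmentService } from '../import/services/import-attachment.service';

@Injectable()
class SomeFeatureService {
  // Resolved from ImportModule's exports.
  constructor(private readonly importAttachments: ImportAttachmentService) {}
}

@Module({
  imports: [ImportModule],
  providers: [SomeFeatureService],
})
export class SomeFeatureModule {}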

View File

@@ -2,7 +2,7 @@ import { Logger, OnModuleDestroy } from '@nestjs/common';
import { OnWorkerEvent, Processor, WorkerHost } from '@nestjs/bullmq';
import { Job } from 'bullmq';
import { QueueJob, QueueName } from 'src/integrations/queue/constants';
import { FileTaskService } from '../file-task.service';
import { FileTaskService } from '../services/file-task.service';
@Processor(QueueName.FILE_TASK_QUEUE)
export class FileTaskProcessor extends WorkerHost implements OnModuleDestroy {

View File

@@ -0,0 +1,289 @@
import { Injectable, Logger } from '@nestjs/common';
import * as path from 'path';
import { jsonToText } from '../../../collaboration/collaboration.util';
import { InjectKysely } from 'nestjs-kysely';
import { KyselyDB } from '@docmost/db/types/kysely.types';
import {
extractZip,
FileImportType,
FileTaskStatus,
} from '../utils/file.utils';
import { StorageService } from '../../storage/storage.service';
import * as tmp from 'tmp-promise';
import { pipeline } from 'node:stream/promises';
import { createWriteStream } from 'node:fs';
import { ImportService } from './import.service';
import { promises as fs } from 'fs';
import { generateSlugId } from '../../../common/helpers';
import { v7 } from 'uuid';
import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';
import { FileTask, InsertablePage } from '@docmost/db/types/entity.types';
import { markdownToHtml } from '@docmost/editor-ext';
import { getProsemirrorContent } from '../../../common/helpers/prosemirror/utils';
import { formatImportHtml } from '../utils/import-formatter';
import {
buildAttachmentCandidates,
collectMarkdownAndHtmlFiles,
} from '../utils/import.utils';
import { executeTx } from '@docmost/db/utils';
import { BacklinkRepo } from '@docmost/db/repos/backlink/backlink.repo';
import { ImportAttachmentService } from './import-attachment.service';
import { ModuleRef } from '@nestjs/core';
@Injectable()
export class FileTaskService {
private readonly logger = new Logger(FileTaskService.name);
constructor(
private readonly storageService: StorageService,
private readonly importService: ImportService,
private readonly backlinkRepo: BacklinkRepo,
@InjectKysely() private readonly db: KyselyDB,
private readonly importAttachmentService: ImportAttachmentService,
// private readonly confluenceTaskService: ConfluenceImportService,
private moduleRef: ModuleRef,
) {}
async processZipImport(fileTaskId: string): Promise<void> {
const fileTask = await this.db
.selectFrom('fileTasks')
.selectAll()
.where('id', '=', fileTaskId)
.executeTakeFirst();
if (!fileTask) {
this.logger.log(`Import file task with ID ${fileTaskId} not found`);
return;
}
if (fileTask.status === FileTaskStatus.Success) {
this.logger.log('Import task already processed.');
return;
}
const { path: tmpZipPath, cleanup: cleanupTmpFile } = await tmp.file({
prefix: 'docmost-import',
postfix: '.zip',
discardDescriptor: true,
});
const { path: tmpExtractDir, cleanup: cleanupTmpDir } = await tmp.dir({
prefix: 'docmost-extract-',
unsafeCleanup: true,
});
const fileStream = await this.storageService.readStream(fileTask.filePath);
await pipeline(fileStream, createWriteStream(tmpZipPath));
await extractZip(tmpZipPath, tmpExtractDir);
try {
if (
fileTask.source === FileImportType.Generic ||
fileTask.source === FileImportType.Notion
) {
await this.processGenericImport({
extractDir: tmpExtractDir,
fileTask,
});
}
if (fileTask.source === FileImportType.Confluence) {
let ConfluenceModule: any;
try {
// eslint-disable-next-line @typescript-eslint/no-require-imports
ConfluenceModule = require('./../../../ee/confluence-import/confluence-import.service');
} catch (err) {
this.logger.error(
'Confluence import requested but EE module not bundled in this build',
);
// mark the task failed instead of leaving it stuck in processing
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Failed);
return;
}
const confluenceImportService = this.moduleRef.get(
ConfluenceModule.ConfluenceImportService,
{ strict: false },
);
await confluenceImportService.processConfluenceImport({
extractDir: tmpExtractDir,
fileTask,
});
}
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Success);
} catch (error) {
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Failed);
this.logger.error(error);
} finally {
await cleanupTmpFile();
await cleanupTmpDir();
}
}
async processGenericImport(opts: {
extractDir: string;
fileTask: FileTask;
}): Promise<void> {
const { extractDir, fileTask } = opts;
const allFiles = await collectMarkdownAndHtmlFiles(extractDir);
const attachmentCandidates = await buildAttachmentCandidates(extractDir);
const pagesMap = new Map<
string,
{
id: string;
slugId: string;
name: string;
content: string;
position?: string | null;
parentPageId: string | null;
fileExtension: string;
filePath: string;
}
>();
for (const absPath of allFiles) {
const relPath = path
.relative(extractDir, absPath)
.split(path.sep)
.join('/'); // normalize to forward-slashes
const ext = path.extname(relPath).toLowerCase();
let content = await fs.readFile(absPath, 'utf-8');
if (ext === '.md') {
content = await markdownToHtml(content);
}
pagesMap.set(relPath, {
id: v7(),
slugId: generateSlugId(),
name: path.basename(relPath, ext),
content,
parentPageId: null,
fileExtension: ext,
filePath: relPath,
});
}
// parent/child linking
pagesMap.forEach((page, filePath) => {
const segments = filePath.split('/');
segments.pop();
let parentPage = null;
while (segments.length) {
const tryMd = segments.join('/') + '.md';
const tryHtml = segments.join('/') + '.html';
if (pagesMap.has(tryMd)) {
parentPage = pagesMap.get(tryMd)!;
break;
}
if (pagesMap.has(tryHtml)) {
parentPage = pagesMap.get(tryHtml)!;
break;
}
segments.pop();
}
if (parentPage) page.parentPageId = parentPage.id;
});
// generate position keys
const siblingsMap = new Map<string | null, any[]>();
pagesMap.forEach((page) => {
const sibs = siblingsMap.get(page.parentPageId) || [];
sibs.push(page);
siblingsMap.set(page.parentPageId, sibs);
});
siblingsMap.forEach((sibs) => {
sibs.sort((a, b) => a.name.localeCompare(b.name));
let prevPos: string | null = null;
for (const page of sibs) {
page.position = generateJitteredKeyBetween(prevPos, null);
prevPos = page.position;
}
});
const filePathToPageMetaMap = new Map<
string,
{ id: string; title: string; slugId: string }
>();
pagesMap.forEach((page) => {
filePathToPageMetaMap.set(page.filePath, {
id: page.id,
title: page.name,
slugId: page.slugId,
});
});
const pageResults = await Promise.all(
Array.from(pagesMap.values()).map(async (page) => {
const htmlContent =
await this.importAttachmentService.processAttachments({
html: page.content,
pageRelativePath: page.filePath,
extractDir,
pageId: page.id,
fileTask,
attachmentCandidates,
});
const { html, backlinks } = await formatImportHtml({
html: htmlContent,
currentFilePath: page.filePath,
filePathToPageMetaMap: filePathToPageMetaMap,
creatorId: fileTask.creatorId,
sourcePageId: page.id,
workspaceId: fileTask.workspaceId,
});
const pmState = getProsemirrorContent(
await this.importService.processHTML(html),
);
const { title, prosemirrorJson } =
this.importService.extractTitleAndRemoveHeading(pmState);
const insertablePage: InsertablePage = {
id: page.id,
slugId: page.slugId,
title: title || page.name,
content: prosemirrorJson,
textContent: jsonToText(prosemirrorJson),
ydoc: await this.importService.createYdoc(prosemirrorJson),
position: page.position!,
spaceId: fileTask.spaceId,
workspaceId: fileTask.workspaceId,
creatorId: fileTask.creatorId,
lastUpdatedById: fileTask.creatorId,
parentPageId: page.parentPageId,
};
return { insertablePage, backlinks };
}),
);
const insertablePages = pageResults.map((r) => r.insertablePage);
const insertableBacklinks = pageResults.flatMap((r) => r.backlinks);
if (insertablePages.length < 1) return;
const validPageIds = new Set(insertablePages.map((row) => row.id));
const filteredBacklinks = insertableBacklinks.filter(
({ sourcePageId, targetPageId }) =>
validPageIds.has(sourcePageId) && validPageIds.has(targetPageId),
);
await executeTx(this.db, async (trx) => {
await trx.insertInto('pages').values(insertablePages).execute();
if (filteredBacklinks.length > 0) {
await this.backlinkRepo.insertBacklink(filteredBacklinks, trx);
}
});
}
async updateTaskStatus(fileTaskId: string, status: FileTaskStatus) {
await this.db
.updateTable('fileTasks')
.set({ status })
.where('id', '=', fileTaskId)
.execute();
}
}
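
The require-inside-try/catch plus ModuleRef.get(..., { strict: false }) combination in processZipImport keeps the EE Confluence importer out of the compile-time dependency graph: the CE build compiles and boots without it, and the provider is only looked up at runtime when the bundle is present. The same pattern in isolation, with illustrative names rather than the real EE paths:

import { Injectable, Logger } from '@nestjs/common';
import { ModuleRef } from '@nestjs/core';

@Injectable()
export class OptionalModuleResolver {
  private readonly logger = new Logger(OptionalModuleResolver.name);

  constructor(private readonly moduleRef: ModuleRef) {}

  /** Returns the provider instance, or null when the optional bundle is absent. */
  resolve<T>(modulePath: string, className: string): T | null {
    let mod: any;
    try {
      // eslint-disable-next-line @typescript-eslint/no-require-imports
      mod = require(modulePath); // resolved at runtime, so a missing file throws here, not at build time
    } catch {
      this.logger.warn(`${className} requested but not bundled in this build`);
      return null;
    }
    // strict: false searches the whole injector tree, not just this module's scope.
    return this.moduleRef.get<T>(mod[className], { strict: false });
  }
}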

View File

@@ -0,0 +1,267 @@
import { Injectable, Logger } from '@nestjs/common';
import * as path from 'path';
import { InjectKysely } from 'nestjs-kysely';
import { KyselyDB } from '@docmost/db/types/kysely.types';
import { cleanUrlString } from '../utils/file.utils';
import { StorageService } from '../../storage/storage.service';
import { createReadStream } from 'node:fs';
import { promises as fs } from 'fs';
import { getMimeType, sanitizeFileName } from '../../../common/helpers';
import { v7 } from 'uuid';
import { FileTask } from '@docmost/db/types/entity.types';
import { getAttachmentFolderPath } from '../../../core/attachment/attachment.utils';
import { AttachmentType } from '../../../core/attachment/attachment.constants';
import { unwrapFromParagraph } from '../utils/import-formatter';
import { resolveRelativeAttachmentPath } from '../utils/import.utils';
import { load } from 'cheerio';
@Injectable()
export class ImportAttachmentService {
private readonly logger = new Logger(ImportAttachmentService.name);
constructor(
private readonly storageService: StorageService,
@InjectKysely() private readonly db: KyselyDB,
) {}
async processAttachments(opts: {
html: string;
pageRelativePath: string;
extractDir: string;
pageId: string;
fileTask: FileTask;
attachmentCandidates: Map<string, string>;
}): Promise<string> {
const {
html,
pageRelativePath,
extractDir,
pageId,
fileTask,
attachmentCandidates,
} = opts;
const attachmentTasks: Promise<void>[] = [];
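// Note: processFile starts the storage upload and the attachments-row
// insert immediately and only records the promise here; everything is
// awaited in one batch after the DOM rewrites (see Promise.all below).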
const processFile = (relPath: string) => {
const abs = attachmentCandidates.get(relPath)!;
const attachmentId = v7();
const ext = path.extname(abs);
const fileNameWithExt =
sanitizeFileName(path.basename(abs, ext)) + ext.toLowerCase();
const storageFilePath = `${getAttachmentFolderPath(AttachmentType.File, fileTask.workspaceId)}/${attachmentId}/${fileNameWithExt}`;
const apiFilePath = `/api/files/${attachmentId}/${fileNameWithExt}`;
attachmentTasks.push(
(async () => {
const fileStream = createReadStream(abs);
await this.storageService.uploadStream(storageFilePath, fileStream);
const stat = await fs.stat(abs);
await this.db
.insertInto('attachments')
.values({
id: attachmentId,
filePath: storageFilePath,
fileName: fileNameWithExt,
fileSize: stat.size,
mimeType: getMimeType(fileNameWithExt),
type: 'file',
fileExt: ext,
creatorId: fileTask.creatorId,
workspaceId: fileTask.workspaceId,
pageId,
spaceId: fileTask.spaceId,
})
.execute();
})(),
);
return {
attachmentId,
storageFilePath,
apiFilePath,
fileNameWithExt,
abs,
};
};
const pageDir = path.dirname(pageRelativePath);
const $ = load(html);
// image
for (const imgEl of $('img').toArray()) {
const $img = $(imgEl);
const src = cleanUrlString($img.attr('src') ?? '')!;
if (!src || src.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
src,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const width = $img.attr('width') ?? '100%';
const align = $img.attr('data-align') ?? 'center';
$img
.attr('src', apiFilePath)
.attr('data-attachment-id', attachmentId)
.attr('data-size', stat.size.toString())
.attr('width', width)
.attr('data-align', align);
unwrapFromParagraph($, $img);
}
// video
for (const vidEl of $('video').toArray()) {
const $vid = $(vidEl);
const src = cleanUrlString($vid.attr('src') ?? '')!;
if (!src || src.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
src,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const width = $vid.attr('width') ?? '100%';
const align = $vid.attr('data-align') ?? 'center';
$vid
.attr('src', apiFilePath)
.attr('data-attachment-id', attachmentId)
.attr('data-size', stat.size.toString())
.attr('width', width)
.attr('data-align', align);
unwrapFromParagraph($, $vid);
}
for (const el of $('div[data-type="attachment"]').toArray()) {
const $oldDiv = $(el);
const rawUrl = cleanUrlString($oldDiv.attr('data-attachment-url') ?? '')!;
if (!rawUrl || rawUrl.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
rawUrl,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const fileName = path.basename(abs);
const mime = getMimeType(abs);
const $newDiv = $('<div>')
.attr('data-type', 'attachment')
.attr('data-attachment-url', apiFilePath)
.attr('data-attachment-name', fileName)
.attr('data-attachment-mime', mime)
.attr('data-attachment-size', stat.size.toString())
.attr('data-attachment-id', attachmentId);
$oldDiv.replaceWith($newDiv);
unwrapFromParagraph($, $newDiv);
}
// rewrite other attachments via <a>
for (const aEl of $('a').toArray()) {
const $a = $(aEl);
const href = cleanUrlString($a.attr('href') ?? '')!;
if (!href || href.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
href,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const ext = path.extname(relPath).toLowerCase();
if (ext === '.mp4') {
const $video = $('<video>')
.attr('src', apiFilePath)
.attr('data-attachment-id', attachmentId)
.attr('data-size', stat.size.toString())
.attr('width', '100%')
.attr('data-align', 'center');
$a.replaceWith($video);
unwrapFromParagraph($, $video);
} else {
// build attachment <div>
const confAliasName = $a.attr('data-linked-resource-default-alias');
let attachmentName = path.basename(abs);
if (confAliasName) attachmentName = confAliasName;
const $div = $('<div>')
.attr('data-type', 'attachment')
.attr('data-attachment-url', apiFilePath)
.attr('data-attachment-name', attachmentName)
.attr('data-attachment-mime', getMimeType(abs))
.attr('data-attachment-size', stat.size.toString())
.attr('data-attachment-id', attachmentId);
$a.replaceWith($div);
unwrapFromParagraph($, $div);
}
}
// excalidraw and drawio
for (const type of ['excalidraw', 'drawio'] as const) {
for (const el of $(`div[data-type="${type}"]`).toArray()) {
const $oldDiv = $(el);
const rawSrc = cleanUrlString($oldDiv.attr('data-src') ?? '')!;
if (!rawSrc || rawSrc.startsWith('http')) continue;
const relPath = resolveRelativeAttachmentPath(
rawSrc,
pageDir,
attachmentCandidates,
);
if (!relPath) continue;
const { attachmentId, apiFilePath, abs } = processFile(relPath);
const stat = await fs.stat(abs);
const fileName = path.basename(abs);
const width = $oldDiv.attr('data-width') || '100%';
const align = $oldDiv.attr('data-align') || 'center';
const $newDiv = $('<div>')
.attr('data-type', type)
.attr('data-src', apiFilePath)
.attr('data-title', fileName)
.attr('data-width', width)
.attr('data-size', stat.size.toString())
.attr('data-align', align)
.attr('data-attachment-id', attachmentId);
$oldDiv.replaceWith($newDiv);
unwrapFromParagraph($, $newDiv);
}
}
// wait for all uploads & DB inserts
await Promise.all(attachmentTasks);
return $.root().html() || '';
}
}
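
For orientation, a hedged sketch of a call into processAttachments; the literal values are stand-ins, and in the real flow attachmentCandidates comes from buildAttachmentCandidates(extractDir) rather than being assembled by hand:

// Illustrative values only.
const attachmentCandidates = new Map<string, string>([
  ['assets/diagram.png', '/tmp/docmost-extract-x/assets/diagram.png'],
]);

const rewritten = await importAttachmentService.processAttachments({
  html: '<p><img src="assets/diagram.png"></p>',
  pageRelativePath: 'intro.md', // page sits at the extract root
  extractDir: '/tmp/docmost-extract-x',
  pageId, // v7 UUID of the page being imported
  fileTask, // the fileTasks row driving this import
  attachmentCandidates,
});
// The <img> src now points at /api/files/<attachmentId>/diagram.png and
// carries data-attachment-id and data-size attributes.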

View File

@@ -7,10 +7,10 @@ import {
htmlToJson,
jsonToText,
tiptapExtensions,
} from '../../collaboration/collaboration.util';
} from '../../../collaboration/collaboration.util';
import { InjectKysely } from 'nestjs-kysely';
import { KyselyDB } from '@docmost/db/types/kysely.types';
import { generateSlugId } from '../../common/helpers';
import { generateSlugId } from '../../../common/helpers';
import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';
import { TiptapTransformer } from '@hocuspocus/transformer';
import * as Y from 'yjs';
@@ -19,15 +19,16 @@ import {
FileTaskStatus,
FileTaskType,
getFileTaskFolderPath,
} from './file.utils';
} from '../utils/file.utils';
import { v7, v7 as uuid7 } from 'uuid';
import { StorageService } from '../storage/storage.service';
import { StorageService } from '../../storage/storage.service';
import { InjectQueue } from '@nestjs/bullmq';
import { Queue } from 'bullmq';
import { QueueJob, QueueName } from '../queue/constants';
import { QueueJob, QueueName } from '../../queue/constants';
import { Node as PMNode } from '@tiptap/pm/model';
import { EditorState, Transaction } from '@tiptap/pm/state';
import { getSchema } from '@tiptap/core';
import { FileTask } from '@docmost/db/types/entity.types';
@Injectable()
export class ImportService {
@@ -199,13 +200,14 @@ export class ImportService {
userId: string,
spaceId: string,
workspaceId: string,
): Promise<void> {
) {
const file = await filePromise;
const fileBuffer = await file.toBuffer();
const fileExtension = path.extname(file.filename).toLowerCase();
const fileName = sanitize(
path.basename(file.filename, fileExtension).slice(0, 255),
);
const fileSize = fileBuffer.length;
const fileTaskId = uuid7();
const filePath = `${getFileTaskFolderPath(FileTaskType.Import, workspaceId)}/${fileTaskId}/${fileName}`;
@@ -213,36 +215,29 @@ export class ImportService {
// upload file
await this.storageService.upload(filePath, fileBuffer);
// store in fileTasks table
await this.db
const fileTask = await this.db
.insertInto('fileTasks')
.values({
id: fileTaskId,
type: FileTaskType.Import,
source: source,
status: FileTaskStatus.Pending,
status: FileTaskStatus.Processing,
fileName: fileName,
filePath: filePath,
fileSize: 0,
fileSize: fileSize,
fileExt: 'zip',
creatorId: userId,
spaceId: spaceId,
workspaceId: workspaceId,
})
.returningAll()
.executeTakeFirst();
// what to send to queue
// pass the task ID
await this.fileTaskQueue.add(QueueJob.IMPORT_TASK, {
fileTaskId: fileTaskId,
});
// return tasks info
// when the processor picks it up
// we change the status to processing
// if it gets processed successfully,
// we change the status to success
// else failed
return fileTask;
}
async markdownOrHtmlToProsemirror(

View File

@@ -14,7 +14,6 @@ export enum FileImportType {
}
export enum FileTaskStatus {
Pending = 'pending',
Processing = 'processing',
Success = 'success',
Failed = 'failed',
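
Paired with the import.service.ts change above, where tasks are now inserted with status Processing and their real fileSize up front, dropping Pending leaves three observable states. A sketch of the resulting lifecycle, assuming no other writer touches fileTasks.status:

type ObservableFileTaskStatus =
  | FileTaskStatus.Processing // set at creation by ImportService, before the job is queued
  | FileTaskStatus.Success // set by FileTaskService after pages and backlinks are inserted
  | FileTaskStatus.Failed; // set by FileTaskService when processing throws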