From 065f888c321deba83d25c635acc51c968e045c72 Mon Sep 17 00:00:00 2001 From: Philipinho <16838612+Philipinho@users.noreply.github.com> Date: Fri, 23 May 2025 22:31:37 -0700 Subject: [PATCH] feat: add stream upload support and improve file handling - Add stream upload functionality to storage drivers - Improve ZIP file extraction with better encoding handling - Fix attachment ID rendering issues - Add AWS S3 upload stream support - Update dependencies for better compatibility --- apps/server/package.json | 1 + apps/server/src/common/helpers/utils.ts | 6 + .../integrations/import/file-task.service.ts | 304 ++++++++++++++++-- .../src/integrations/import/file.utils.ts | 76 +++-- .../src/integrations/import/import.service.ts | 34 +- .../import/processors/file-task.processor.ts | 1 + .../storage/drivers/local.driver.ts | 15 +- .../integrations/storage/drivers/s3.driver.ts | 21 ++ .../interfaces/storage-driver.interface.ts | 3 +- .../integrations/storage/storage.service.ts | 5 + packages/editor-ext/src/lib/trailing-node.ts | 9 +- pnpm-lock.yaml | 291 ++++++++++++++++- 12 files changed, 697 insertions(+), 69 deletions(-) diff --git a/apps/server/package.json b/apps/server/package.json index 4da33b60..8e85bef6 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -31,6 +31,7 @@ }, "dependencies": { "@aws-sdk/client-s3": "3.701.0", + "@aws-sdk/lib-storage": "^3.701.0", "@aws-sdk/s3-request-presigner": "3.701.0", "@casl/ability": "^6.7.3", "@fastify/cookie": "^11.0.2", diff --git a/apps/server/src/common/helpers/utils.ts b/apps/server/src/common/helpers/utils.ts index e2a4d5eb..d1748850 100644 --- a/apps/server/src/common/helpers/utils.ts +++ b/apps/server/src/common/helpers/utils.ts @@ -1,5 +1,6 @@ import * as path from 'path'; import * as bcrypt from 'bcrypt'; +import { sanitize } from 'sanitize-filename-ts'; export const envPath = path.resolve(process.cwd(), '..', '..', '.env'); @@ -62,3 +63,8 @@ export function extractDateFromUuid7(uuid7: string) { 
return new Date(timestamp); } + +export function sanitizeFileName(fileName: string): string { + const sanitizedFilename = sanitize(fileName).replace(/ /g, '_'); + return sanitizedFilename.slice(0, 255); +} diff --git a/apps/server/src/integrations/import/file-task.service.ts b/apps/server/src/integrations/import/file-task.service.ts index a171fb2f..84740fc4 100644 --- a/apps/server/src/integrations/import/file-task.service.ts +++ b/apps/server/src/integrations/import/file-task.service.ts @@ -7,13 +7,27 @@ import { extractZip, FileTaskStatus } from './file.utils'; import { StorageService } from '../storage/storage.service'; import * as tmp from 'tmp-promise'; import { pipeline } from 'node:stream/promises'; -import { createWriteStream } from 'node:fs'; +import { createReadStream, createWriteStream } from 'node:fs'; import { ImportService } from './import.service'; import { promises as fs } from 'fs'; -import { generateSlugId } from '../../common/helpers'; +import { + generateSlugId, + getMimeType, + sanitizeFileName, +} from '../../common/helpers'; import { v7 } from 'uuid'; import { generateJitteredKeyBetween } from 'fractional-indexing-jittered'; import { FileTask, InsertablePage } from '@docmost/db/types/entity.types'; +import { + DOMParser, + Node as HDNode, + Element as HDElement, + Window, +} from 'happy-dom'; +import { markdownToHtml } from '@docmost/editor-ext'; +import { getAttachmentFolderPath } from '../../core/attachment/attachment.utils'; +import { AttachmentType } from '../../core/attachment/attachment.constants'; +import { getProsemirrorContent } from '../../common/helpers/prosemirror/utils'; @Injectable() export class FileTaskService { @@ -52,15 +66,17 @@ export class FileTaskService { await pipeline(fileStream, createWriteStream(tmpZipPath)); await extractZip(tmpZipPath, tmpExtractDir); + console.log('extract here'); // TODO: internal link mentions, backlinks, attachments try { await this.updateTaskStatus(fileTaskId, FileTaskStatus.Processing); - + 
// if type == generic await this.processGenericImport({ extractDir: tmpExtractDir, fileTask }); await this.updateTaskStatus(fileTaskId, FileTaskStatus.Success); } catch (error) { await this.updateTaskStatus(fileTaskId, FileTaskStatus.Failed); + console.error(error); } finally { await cleanupTmpFile(); await cleanupTmpDir(); @@ -74,6 +90,10 @@ export class FileTaskService { const { extractDir, fileTask } = opts; const allFiles = await this.collectMarkdownAndHtmlFiles(extractDir); + const attachmentCandidates = + await this.buildAttachmentCandidates(extractDir); + + console.log('attachment count: ', attachmentCandidates.size); const pagesMap = new Map< string, @@ -95,7 +115,22 @@ export class FileTaskService { .split(path.sep) .join('/'); // normalize to forward-slashes const ext = path.extname(relPath).toLowerCase(); - const content = await fs.readFile(absPath, 'utf-8'); + let content = await fs.readFile(absPath, 'utf-8'); + + if (ext.toLowerCase() === '.html' || ext.toLowerCase() === '.md') { + // rewrite local asset references + if (ext === '.md') { + content = await markdownToHtml(content); + } + + content = await this.rewriteLocalFilesInHtml({ + html: content, + extractDir, + pageId: v7(), + fileTask, + attachmentCandidates, + }); + } pagesMap.set(relPath, { id: v7(), @@ -159,28 +194,27 @@ export class FileTaskService { const insertablePages: InsertablePage[] = await Promise.all( Array.from(pagesMap.values()).map(async (page) => { - const pmState = await this.importService.markdownOrHtmlToProsemirror( + const htmlContent = await this.rewriteInternalLinksToMentionHtml( page.content, - page.fileExtension, + page.filePath, + filePathToPageMetaMap, + fileTask.creatorId, ); + + const pmState = getProsemirrorContent( + await this.importService.processHTML(htmlContent), + ); + const { title, prosemirrorJson } = this.importService.extractTitleAndRemoveHeading(pmState); - /*const rewDoc = - await this.importService.convertInternalLinksToMentionsPM( - 
jsonToNode(prosemirrorJson), - page.filePath, - filePathToPageMetaMap, - );*/ - const proseJson = prosemirrorJson; //rewDoc.toJSON(); - return { id: page.id, slugId: page.slugId, title: title || page.name, - content: proseJson, - textContent: jsonToText(proseJson), - ydoc: await this.importService.createYdoc(proseJson), + content: prosemirrorJson, + textContent: jsonToText(prosemirrorJson), + ydoc: await this.importService.createYdoc(prosemirrorJson), position: page.position!, spaceId: fileTask.spaceId, workspaceId: fileTask.workspaceId, @@ -191,7 +225,241 @@ export class FileTaskService { }), ); - await this.db.insertInto('pages').values(insertablePages).execute(); + try { + await this.db.insertInto('pages').values(insertablePages).execute(); + } catch (e) { + console.error(e); + } + } + + async rewriteLocalFilesInHtml(opts: { + html: string; + extractDir: string; + pageId: string; + fileTask: FileTask; + attachmentCandidates: Map; + }): Promise { + const { html, extractDir, pageId, fileTask, attachmentCandidates } = opts; + + const window = new Window(); + const doc = window.document; + doc.body.innerHTML = html; + + const tasks: Promise[] = []; + + const processFile = (relPath: string) => { + const abs = attachmentCandidates.get(relPath)!; + const attachmentId = v7(); + const ext = path.extname(abs); + + const fileName = sanitizeFileName(path.basename(abs, ext)); + const fileNameWithExt = + sanitizeFileName(path.basename(abs, ext)) + ext.toLowerCase(); + + const storageFilePath = `${getAttachmentFolderPath(AttachmentType.File, fileTask.workspaceId)}/${attachmentId}/${fileNameWithExt}`; + + const apiFilePath = `/api/files/${attachmentId}/${fileNameWithExt}`; + // console.log('file Path:', apiFilePath, ' and ', storageFilePath); + + console.log(storageFilePath); + + tasks.push( + (async () => { + const fileStream = createReadStream(abs); + await this.storageService.uploadStream(storageFilePath, fileStream); + const stat = await fs.stat(abs); + const uploaded = 
await this.db + .insertInto('attachments') + .values({ + id: attachmentId, + filePath: storageFilePath, + fileName: fileNameWithExt, + fileSize: stat.size, + mimeType: getMimeType(ext), + fileExt: ext, + creatorId: fileTask.creatorId, + workspaceId: fileTask.workspaceId, + pageId, + spaceId: fileTask.spaceId, + }) + .returningAll() + .execute(); + console.log(uploaded); + })(), + ); + + console.log('upload file'); + return { + attachmentId, + storageFilePath, + apiFilePath, + fileNameWithExt, + abs, + }; + }; + + // rewrite + for (const img of Array.from(doc.getElementsByTagName('img'))) { + const src = img.getAttribute('src') ?? ''; + if (!src || src.startsWith('http') || src.startsWith('/api/files/')) + continue; + const rel = src.replace(/^\.\/?/, ''); + if (!attachmentCandidates.has(rel)) continue; + + const { + attachmentId, + storageFilePath, + apiFilePath, + fileNameWithExt, + abs, + } = processFile(rel); + + const stat = await fs.stat(abs); + img.setAttribute('src', apiFilePath); + img.setAttribute('data-attachment-id', attachmentId); + img.setAttribute('data-size', stat.size.toString()); + img.setAttribute('width', '100%'); + img.setAttribute('data-align', 'center'); + + this.unwrapFromParagraph(img); + } + + // rewrite