This commit is contained in:
Philipinho
2025-05-21 23:40:43 -07:00
parent f6e3230eec
commit e2b8899569
5 changed files with 378 additions and 40 deletions

View File

@@ -1,30 +1,19 @@
import { Injectable, Logger } from '@nestjs/common';
import * as path from 'path';
import { jsonToText } from '../../collaboration/collaboration.util';
import { InjectKysely } from 'nestjs-kysely';
import { KyselyDB } from '@docmost/db/types/kysely.types';
import { extractZip, FileTaskStatus } from './file.utils';
import { StorageService } from '../storage/storage.service';
import { InjectQueue } from '@nestjs/bullmq';
import { Queue } from 'bullmq';
import { QueueJob, QueueName } from '../queue/constants';
import * as tmp from 'tmp-promise';
import { pipeline } from 'node:stream/promises';
import { createWriteStream } from 'node:fs';
import { ImportService } from './import.service';
import { promises as fs } from 'fs';
import { generateSlugId } from '../../common/helpers';
import { v7 } from 'uuid';
import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';
import { FileTask, InsertablePage } from '@docmost/db/types/entity.types';
@Injectable()
export class FileTaskService {
@@ -32,12 +21,11 @@ export class FileTaskService {
constructor(
private readonly storageService: StorageService,
private readonly importService: ImportService,
@InjectKysely() private readonly db: KyselyDB,
) {}
async processZipImport(fileTaskId: string): Promise<void> {
this.logger.log(`Processing zip import: ${fileTaskId}`);
const fileTask = await this.db
.selectFrom('fileTasks')
.selectAll()
@@ -49,20 +37,189 @@ export class FileTaskService {
return;
}
// stage the uploaded zip in a temp file, then extract it for processing
const { path: tmpZipPath, cleanup: cleanupTmpFile } = await tmp.file({
prefix: 'docmost-import',
postfix: '.zip',
discardDescriptor: true,
});
const { path: tmpExtractDir, cleanup: cleanupTmpDir } = await tmp.dir({
prefix: 'docmost-extract-',
unsafeCleanup: true,
});
const fileStream = await this.storageService.readStream(fileTask.filePath);
await pipeline(fileStream, createWriteStream(tmpZipPath));
await extractZip(tmpZipPath, tmpExtractDir);
// TODO: internal link mentions, backlinks, attachments
try {
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Processing);
await this.processGenericImport({ extractDir: tmpExtractDir, fileTask });
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Success);
} catch (error) {
this.logger.error(`Zip import failed for task ${fileTaskId}`, error);
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Failed);
} finally {
await cleanupTmpFile();
await cleanupTmpDir();
}
}
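// Usage sketch (illustrative; the worker wiring is an assumption, not shown in
// this commit): a BullMQ processor for the import queue would hand the task id
// to this method, e.g.
//   await this.fileTaskService.processZipImport(job.data.fileTaskId);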
async processGenericImport(opts: {
extractDir: string;
fileTask: FileTask;
}): Promise<void> {
const { extractDir, fileTask } = opts;
const allFiles = await this.collectMarkdownAndHtmlFiles(extractDir);
type ImportPageMeta = {
id: string;
slugId: string;
name: string;
content: string;
position?: string | null;
parentPageId: string | null;
fileExtension: string;
filePath: string;
};
const pagesMap = new Map<string, ImportPageMeta>();
for (const absPath of allFiles) {
const relPath = path
.relative(extractDir, absPath)
.split(path.sep)
.join('/'); // normalize to forward-slashes
const ext = path.extname(relPath).toLowerCase();
const content = await fs.readFile(absPath, 'utf-8');
pagesMap.set(relPath, {
id: v7(),
slugId: generateSlugId(),
name: path.basename(relPath, ext),
content,
parentPageId: null,
fileExtension: ext,
filePath: relPath,
});
}
// parent/child linking
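// A page's parent is the note whose file path matches the page's directory:
// e.g. 'guide/setup.md' becomes a child of 'guide.md' (or 'guide.html'),
// walking upward until a match is found or the tree root is reached.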
pagesMap.forEach((page, filePath) => {
const segments = filePath.split('/');
segments.pop();
let parentPage = null;
while (segments.length) {
const tryMd = segments.join('/') + '.md';
const tryHtml = segments.join('/') + '.html';
if (pagesMap.has(tryMd)) {
parentPage = pagesMap.get(tryMd)!;
break;
}
if (pagesMap.has(tryHtml)) {
parentPage = pagesMap.get(tryHtml)!;
break;
}
segments.pop();
}
if (parentPage) page.parentPageId = parentPage.id;
});
// generate position keys
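// Siblings are sorted by name, then each page gets a fractional-index key
// after its predecessor; jittered keys let later inserts land between any
// two siblings without renumbering the rest.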
const siblingsMap = new Map<string | null, ImportPageMeta[]>();
pagesMap.forEach((page) => {
const sibs = siblingsMap.get(page.parentPageId) || [];
sibs.push(page);
siblingsMap.set(page.parentPageId, sibs);
});
siblingsMap.forEach((sibs) => {
sibs.sort((a, b) => a.name.localeCompare(b.name));
let prevPos: string | null = null;
for (const page of sibs) {
page.position = generateJitteredKeyBetween(prevPos, null);
prevPos = page.position;
}
});
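// Map each source file path to its new page's identity so relative links
// between imported files can be rewritten as page mentions (see the
// commented-out convertInternalLinksToMentionsPM call below).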
const filePathToPageMetaMap = new Map<
string,
{ id: string; title: string; slugId: string }
>();
pagesMap.forEach((page) => {
filePathToPageMetaMap.set(page.filePath, {
id: page.id,
title: page.name,
slugId: page.slugId,
});
});
const insertablePages: InsertablePage[] = await Promise.all(
Array.from(pagesMap.values()).map(async (page) => {
const pmState = await this.importService.markdownOrHtmlToProsemirror(
page.content,
page.fileExtension,
);
const { title, prosemirrorJson } =
this.importService.extractTitleAndRemoveHeading(pmState);
// TODO: enable once internal-link rewriting is wired up:
// const rewDoc =
//   await this.importService.convertInternalLinksToMentionsPM(
//     jsonToNode(prosemirrorJson),
//     page.filePath,
//     filePathToPageMetaMap,
//   );
const proseJson = prosemirrorJson; // becomes rewDoc.toJSON() once enabled
return {
id: page.id,
slugId: page.slugId,
title: title || page.name,
content: proseJson,
textContent: jsonToText(proseJson),
ydoc: await this.importService.createYdoc(proseJson),
position: page.position!,
spaceId: fileTask.spaceId,
workspaceId: fileTask.workspaceId,
creatorId: fileTask.creatorId,
lastUpdatedById: fileTask.creatorId,
parentPageId: page.parentPageId,
};
}),
);
await this.db.insertInto('pages').values(insertablePages).execute();
}
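// Example (illustrative): an archive containing guide.md, guide/intro.md and
// guide/setup.md imports 'guide' as the parent page of 'intro' and 'setup',
// with the two children ordered alphabetically via their position keys.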
async collectMarkdownAndHtmlFiles(dir: string): Promise<string[]> {
const results: string[] = [];
async function walk(current: string) {
const entries = await fs.readdir(current, { withFileTypes: true });
for (const ent of entries) {
const fullPath = path.join(current, ent.name);
if (ent.isDirectory()) {
await walk(fullPath);
} else if (
['.md', '.html'].includes(path.extname(ent.name).toLowerCase())
) {
results.push(fullPath);
}
}
}
await walk(dir);
return results;
}
async updateTaskStatus(fileTaskId: string, status: FileTaskStatus) {
await this.db
.updateTable('fileTasks')
.set({ status })
.where('id', '=', fileTaskId)
.execute();
}
}

View File

@@ -1,3 +1,7 @@
import * as yauzl from 'yauzl';
import * as path from 'path';
import * as fs from 'node:fs';
export enum FileTaskType {
Import = 'import',
Export = 'export',
@@ -27,3 +31,47 @@ export function getFileTaskFolderPath(
return `${workspaceId}/exports`;
}
}
export function extractZip(source: string, target: string): Promise<void> {
// https://github.com/Surfer-Org
return new Promise<void>((resolve, reject) => {
yauzl.open(source, { lazyEntries: true }, (err, zipfile) => {
if (err) return reject(err);
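// With lazyEntries, yauzl emits one 'entry' per readEntry() call, so every
// branch below must call readEntry() again to keep draining the archive.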
zipfile.readEntry();
zipfile.on('entry', (entry) => {
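// Note: entry.fileName is joined into target unsanitized; a crafted archive
// with '../' segments could escape target (zip-slip), so callers should pass
// a dedicated temp directory, as processZipImport does.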
const fullPath = path.join(target, entry.fileName);
const directory = path.dirname(fullPath);
if (/\/$/.test(entry.fileName)) {
// Directory entry
try {
fs.mkdirSync(fullPath, { recursive: true });
zipfile.readEntry();
} catch (err) {
reject(err);
}
} else {
// File entry
try {
fs.mkdirSync(directory, { recursive: true });
zipfile.openReadStream(entry, (err, readStream) => {
if (err) return reject(err);
const writeStream = fs.createWriteStream(fullPath);
readStream.on('end', () => {
writeStream.end();
zipfile.readEntry();
});
readStream.pipe(writeStream);
});
} catch (err) {
reject(err);
}
}
});
zipfile.on('end', resolve);
zipfile.on('error', reject);
});
});
}

View File

@@ -20,11 +20,14 @@ import {
FileTaskType,
getFileTaskFolderPath,
} from './file.utils';
import { v7, v7 as uuid7 } from 'uuid';
import { StorageService } from '../storage/storage.service';
import { InjectQueue } from '@nestjs/bullmq';
import { Queue } from 'bullmq';
import { QueueJob, QueueName } from '../queue/constants';
import { Node as PMNode } from '@tiptap/pm/model';
import { EditorState, Transaction } from '@tiptap/pm/state';
import { getSchema } from '@tiptap/core';
@Injectable()
export class ImportService {
@@ -127,7 +130,7 @@ export class ImportService {
async createYdoc(prosemirrorJson: any): Promise<Buffer | null> {
if (prosemirrorJson) {
this.logger.debug(`Converting prosemirror json state to ydoc`);
const ydoc = TiptapTransformer.toYdoc(
prosemirrorJson,
@@ -227,4 +230,89 @@ export class ImportService {
// we change the status to success
// else failed
}
async markdownOrHtmlToProsemirror(
fileContent: string,
fileExtension: string,
): Promise<any> {
let prosemirrorState: any = null;
if (fileExtension === '.md') {
prosemirrorState = await this.processMarkdown(fileContent);
} else if (fileExtension === '.html') {
prosemirrorState = await this.processHTML(fileContent);
}
return prosemirrorState;
}
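// processMarkdown and processHTML (defined elsewhere in this service) parse
// the raw file content into a ProseMirror-compatible JSON document.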
async convertInternalLinksToMentionsPM(
doc: PMNode,
currentFilePath: string,
filePathToPageMetaMap: Map<
string,
{ id: string; title: string; slugId: string }
>,
): Promise<PMNode> {
const schema = getSchema(tiptapExtensions);
const state = EditorState.create({ doc, schema });
let tr: Transaction = state.tr;
const normalizePath = (p: string) => p.replace(/\\/g, '/');
// Collect replacements from the original doc.
const replacements: Array<{
from: number;
to: number;
mentionNode: PMNode;
}> = [];
doc.descendants((node, pos) => {
if (!node.isText || !node.marks?.length) return;
// Look for the link mark
const linkMark = node.marks.find(
(mark) => mark.type.name === 'link' && mark.attrs?.href,
);
if (!linkMark) return;
// Compute the range for the entire text node.
const from = pos;
const to = pos + node.nodeSize;
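// (A ProseMirror text node carries a single mark set, so this range covers
// exactly the linked text.)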
// Resolve the path and get page meta.
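// e.g. an href of '../intro.md' inside 'guide/setup.md' resolves to 'intro.md'.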
const resolvedPath = normalizePath(
path.join(path.dirname(currentFilePath), linkMark.attrs.href),
);
const pageMeta = filePathToPageMetaMap.get(resolvedPath);
if (!pageMeta) return;
// Create the mention node with all required attributes.
const mentionNode = schema.nodes.mention.create({
id: v7(),
entityType: 'page',
entityId: pageMeta.id,
label: node.text || pageMeta.title,
slugId: pageMeta.slugId,
creatorId: 'not available', // the mention schema requires creatorId; the original author is unknown at import time
});
replacements.push({ from, to, mentionNode });
});
// Apply replacements in reverse order.
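// Working backwards keeps the earlier from/to offsets valid, since each
// replacement only shifts positions after it; no position mapping is needed.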
for (let i = replacements.length - 1; i >= 0; i--) {
const { from, to, mentionNode } = replacements[i];
try {
tr = tr.replaceWith(from, to, mentionNode);
} catch (err) {
this.logger.error('Failed to replace internal link with mention', err);
}
}
if (tr.docChanged) {
this.logger.debug('Rewrote internal links to page mentions');
}
// Return the updated document if any change was made.
return tr.docChanged ? state.apply(tr).doc : doc;
}
}