Compare commits

...

2 Commits

SHA1 Message Date
e2b8899569 WIP 2025-05-21 23:40:43 -07:00
f6e3230eec Add readstream 2025-05-21 10:58:40 -07:00
9 changed files with 413 additions and 40 deletions

View File

@@ -80,7 +80,9 @@
"sanitize-filename-ts": "^1.0.2",
"socket.io": "^4.8.1",
"stripe": "^17.5.0",
"ws": "^8.18.0"
"tmp-promise": "^3.0.3",
"ws": "^8.18.0",
"yauzl": "^3.2.0"
},
"devDependencies": {
"@eslint/js": "^9.20.0",
@@ -99,6 +101,7 @@
"@types/pg": "^8.11.11",
"@types/supertest": "^6.0.2",
"@types/ws": "^8.5.14",
"@types/yauzl": "^2.10.3",
"eslint": "^9.20.1",
"eslint-config-prettier": "^10.0.1",
"globals": "^15.15.0",

View File

@@ -1,30 +1,19 @@
import { BadRequestException, Injectable, Logger } from '@nestjs/common';
import { PageRepo } from '@docmost/db/repos/page/page.repo';
import { MultipartFile } from '@fastify/multipart';
import { sanitize } from 'sanitize-filename-ts';
import { Injectable, Logger } from '@nestjs/common';
import * as path from 'path';
import {
htmlToJson,
jsonToText,
tiptapExtensions,
} from '../../collaboration/collaboration.util';
import { jsonToText } from '../../collaboration/collaboration.util';
import { InjectKysely } from 'nestjs-kysely';
import { KyselyDB } from '@docmost/db/types/kysely.types';
import { generateSlugId } from '../../common/helpers';
import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';
import { TiptapTransformer } from '@hocuspocus/transformer';
import * as Y from 'yjs';
import { markdownToHtml } from '@docmost/editor-ext';
import {
FileTaskStatus,
FileTaskType,
getFileTaskFolderPath,
} from './file.utils';
import { v7 as uuid7 } from 'uuid';
import { extractZip, FileTaskStatus } from './file.utils';
import { StorageService } from '../storage/storage.service';
import { InjectQueue } from '@nestjs/bullmq';
import { Queue } from 'bullmq';
import { QueueJob, QueueName } from '../queue/constants';
import * as tmp from 'tmp-promise';
import { pipeline } from 'node:stream/promises';
import { createWriteStream } from 'node:fs';
import { ImportService } from './import.service';
import { promises as fs } from 'fs';
import { generateSlugId } from '../../common/helpers';
import { v7 } from 'uuid';
import { generateJitteredKeyBetween } from 'fractional-indexing-jittered';
import { FileTask, InsertablePage } from '@docmost/db/types/entity.types';
@Injectable()
export class FileTaskService {
@@ -32,12 +21,11 @@ export class FileTaskService {
constructor(
private readonly storageService: StorageService,
private readonly importService: ImportService,
@InjectKysely() private readonly db: KyselyDB,
) {}
async processZipImport(fileTaskId: string): Promise<void> {
console.log(`Processing zip import: ${fileTaskId}`);
const fileTask = await this.db
.selectFrom('fileTasks')
.selectAll()
@@ -49,20 +37,189 @@ export class FileTaskService {
return;
}
// update status to processing
const { path: tmpZipPath, cleanup: cleanupTmpFile } = await tmp.file({
prefix: 'docmost-import',
postfix: '.zip',
discardDescriptor: true,
});
const { path: tmpExtractDir, cleanup: cleanupTmpDir } = await tmp.dir({
prefix: 'docmost-extract-',
unsafeCleanup: true,
});
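// Stream the uploaded zip out of storage into the temp file, then unpack it into the scratch directory.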
const fileStream = await this.storageService.readStream(fileTask.filePath);
await pipeline(fileStream, createWriteStream(tmpZipPath));
await extractZip(tmpZipPath, tmpExtractDir);
// TODO: internal link mentions, backlinks, attachments
try {
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Processing);
await this.processGenericImport({ extractDir: tmpExtractDir, fileTask });
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Success);
} catch (error) {
await this.updateTaskStatus(fileTaskId, FileTaskStatus.Failed);
} finally {
await cleanupTmpFile();
await cleanupTmpDir();
}
}
async processGenericImport(opts: {
extractDir: string;
fileTask: FileTask;
}): Promise<void> {
const { extractDir, fileTask } = opts;
const allFiles = await this.collectMarkdownAndHtmlFiles(extractDir);
const pagesMap = new Map<
string,
{
id: string;
slugId: string;
name: string;
content: string;
position?: string | null;
parentPageId: string | null;
fileExtension: string;
filePath: string;
}
>();
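// First pass: register every markdown/HTML file as a prospective page, keyed by its forward-slash relative path inside the archive.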
for (const absPath of allFiles) {
const relPath = path
.relative(extractDir, absPath)
.split(path.sep)
.join('/'); // normalize to forward-slashes
const ext = path.extname(relPath).toLowerCase();
const content = await fs.readFile(absPath, 'utf-8');
pagesMap.set(relPath, {
id: v7(),
slugId: generateSlugId(),
name: path.basename(relPath, ext),
content,
parentPageId: null,
fileExtension: ext,
filePath: relPath,
});
}
// parent/child linking
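// e.g. "docs/intro.md" becomes a child of "docs.md" (or "docs.html") when that file also exists in the archive.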
pagesMap.forEach((page, filePath) => {
const segments = filePath.split('/');
segments.pop();
let parentPage = null;
while (segments.length) {
const tryMd = segments.join('/') + '.md';
const tryHtml = segments.join('/') + '.html';
if (pagesMap.has(tryMd)) {
parentPage = pagesMap.get(tryMd)!;
break;
}
if (pagesMap.has(tryHtml)) {
parentPage = pagesMap.get(tryHtml)!;
break;
}
segments.pop();
}
if (parentPage) page.parentPageId = parentPage.id;
});
// generate position keys
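// Group pages by parentPageId, sort each group by name, and assign ascending fractional-index keys so siblings keep a stable order.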
const siblingsMap = new Map<string | null, any[]>();
pagesMap.forEach((page) => {
const sibs = siblingsMap.get(page.parentPageId) || [];
sibs.push(page);
siblingsMap.set(page.parentPageId, sibs);
});
siblingsMap.forEach((sibs) => {
sibs.sort((a, b) => a.name.localeCompare(b.name));
let prevPos: string | null = null;
for (const page of sibs) {
page.position = generateJitteredKeyBetween(prevPos, null);
prevPos = page.position;
}
});
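// Map each source file path to the page metadata needed for rewriting internal links into mentions (used by the convertInternalLinksToMentionsPM call, currently commented out below).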
const filePathToPageMetaMap = new Map<
string,
{ id: string; title: string; slugId: string }
>();
pagesMap.forEach((page) => {
filePathToPageMetaMap.set(page.filePath, {
id: page.id,
title: page.name,
slugId: page.slugId,
});
});
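// Second pass: convert each file to ProseMirror JSON, lift the title out of the leading heading, and build the collaborative Yjs state.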
const insertablePages: InsertablePage[] = await Promise.all(
Array.from(pagesMap.values()).map(async (page) => {
const pmState = await this.importService.markdownOrHtmlToProsemirror(
page.content,
page.fileExtension,
);
const { title, prosemirrorJson } =
this.importService.extractTitleAndRemoveHeading(pmState);
/*const rewDoc =
await this.importService.convertInternalLinksToMentionsPM(
jsonToNode(prosemirrorJson),
page.filePath,
filePathToPageMetaMap,
);*/
const proseJson = prosemirrorJson; //rewDoc.toJSON();
return {
id: page.id,
slugId: page.slugId,
title: title || page.name,
content: proseJson,
textContent: jsonToText(proseJson),
ydoc: await this.importService.createYdoc(proseJson),
position: page.position!,
spaceId: fileTask.spaceId,
workspaceId: fileTask.workspaceId,
creatorId: fileTask.creatorId,
lastUpdatedById: fileTask.creatorId,
parentPageId: page.parentPageId,
};
}),
);
await this.db.insertInto('pages').values(insertablePages).execute();
}
async collectMarkdownAndHtmlFiles(dir: string): Promise<string[]> {
const results: string[] = [];
async function walk(current: string) {
const entries = await fs.readdir(current, { withFileTypes: true });
for (const ent of entries) {
const fullPath = path.join(current, ent.name);
if (ent.isDirectory()) {
await walk(fullPath);
} else if (
['.md', '.html'].includes(path.extname(ent.name).toLowerCase())
) {
results.push(fullPath);
}
}
}
await walk(dir);
return results;
}
async updateTaskStatus(fileTaskId: string, status: FileTaskStatus) {
await this.db
.updateTable('fileTasks')
.set({ status: FileTaskStatus.Processing })
.set({ status: status })
.where('id', '=', fileTaskId)
.execute();
// it did, what next?
const file = await this.storageService.read(fileTask.filePath);
}
// receive the file
async processGenericImport(fileTaskId: string): Promise<void> {
}
}

View File

@@ -1,3 +1,7 @@
import * as yauzl from 'yauzl';
import * as path from 'path';
import * as fs from 'node:fs';
export enum FileTaskType {
Import = 'import',
Export = 'export',
@@ -27,3 +31,47 @@ export function getFileTaskFolderPath(
return `${workspaceId}/exports`;
}
}
export function extractZip(source: string, target: string) {
//https://github.com/Surfer-Org
return new Promise((resolve, reject) => {
yauzl.open(source, { lazyEntries: true }, (err, zipfile) => {
if (err) return reject(err);
zipfile.readEntry();
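// With lazyEntries, the next 'entry' event fires only after readEntry() is called, so each branch below re-arms the loop.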
zipfile.on('entry', (entry) => {
const fullPath = path.join(target, entry.fileName);
const directory = path.dirname(fullPath);
if (/\/$/.test(entry.fileName)) {
// Directory entry
try {
fs.mkdirSync(fullPath, { recursive: true });
zipfile.readEntry();
} catch (err) {
reject(err);
}
} else {
// File entry
try {
fs.mkdirSync(directory, { recursive: true });
zipfile.openReadStream(entry, (err, readStream) => {
if (err) return reject(err);
const writeStream = fs.createWriteStream(fullPath);
readStream.on('end', () => {
writeStream.end();
zipfile.readEntry();
});
readStream.pipe(writeStream);
});
} catch (err) {
reject(err);
}
}
});
zipfile.on('end', resolve);
zipfile.on('error', reject);
});
});
}

View File

@@ -20,11 +20,14 @@ import {
FileTaskType,
getFileTaskFolderPath,
} from './file.utils';
import { v7 as uuid7 } from 'uuid';
import { v7, v7 as uuid7 } from 'uuid';
import { StorageService } from '../storage/storage.service';
import { InjectQueue } from '@nestjs/bullmq';
import { Queue } from 'bullmq';
import { QueueJob, QueueName } from '../queue/constants';
import { Node as PMNode } from '@tiptap/pm/model';
import { EditorState, Transaction } from '@tiptap/pm/state';
import { getSchema } from '@tiptap/core';
@Injectable()
export class ImportService {
@@ -127,7 +130,7 @@ export class ImportService {
async createYdoc(prosemirrorJson: any): Promise<Buffer | null> {
if (prosemirrorJson) {
this.logger.debug(`Converting prosemirror json state to ydoc`);
// this.logger.debug(`Converting prosemirror json state to ydoc`);
const ydoc = TiptapTransformer.toYdoc(
prosemirrorJson,
@@ -227,4 +230,89 @@ export class ImportService {
// we change the status to success
// else failed
}
async markdownOrHtmlToProsemirror(
fileContent: string,
fileExtension: string,
): Promise<any> {
let prosemirrorState = '';
if (fileExtension === '.md') {
prosemirrorState = await this.processMarkdown(fileContent);
} else if (fileExtension.endsWith('.html')) {
prosemirrorState = await this.processHTML(fileContent);
}
return prosemirrorState;
}
async convertInternalLinksToMentionsPM(
doc: PMNode,
currentFilePath: string,
filePathToPageMetaMap: Map<
string,
{ id: string; title: string; slugId: string }
>,
): Promise<PMNode> {
const schema = getSchema(tiptapExtensions);
const state = EditorState.create({ doc, schema });
let tr: Transaction = state.tr;
const normalizePath = (p: string) => p.replace(/\\/g, '/');
// Collect replacements from the original doc.
const replacements: Array<{
from: number;
to: number;
mentionNode: PMNode;
}> = [];
doc.descendants((node, pos) => {
if (!node.isText || !node.marks?.length) return;
// Look for the link mark
const linkMark = node.marks.find(
(mark) => mark.type.name === 'link' && mark.attrs?.href,
);
if (!linkMark) return;
// Compute the range for the entire text node.
const from = pos;
const to = pos + node.nodeSize;
// Resolve the path and get page meta.
const resolvedPath = normalizePath(
path.join(path.dirname(currentFilePath), linkMark.attrs.href),
);
const pageMeta = filePathToPageMetaMap.get(resolvedPath);
if (!pageMeta) return;
// Create the mention node with all required attributes.
const mentionNode = schema.nodes.mention.create({
id: v7(),
entityType: 'page',
entityId: pageMeta.id,
label: node.text || pageMeta.title,
slugId: pageMeta.slugId,
creatorId: 'not available', // the mention node schema requires this attribute
});
replacements.push({ from, to, mentionNode });
});
// Apply replacements in reverse order.
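// Replacing from the end keeps the earlier from/to positions valid while the document length changes.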
for (let i = replacements.length - 1; i >= 0; i--) {
const { from, to, mentionNode } = replacements[i];
try {
tr = tr.replaceWith(from, to, mentionNode);
} catch (err) {
console.error('❌ Failed to insert mention:', err);
}
}
if (tr.docChanged) {
console.log('doc changed');
console.log(JSON.stringify(state.apply(tr).doc.toJSON()));
}
// Return the updated document if any change was made.
return tr.docChanged ? state.apply(tr).doc : doc;
}
}

View File

@@ -5,6 +5,8 @@ import {
} from '../interfaces';
import { join } from 'path';
import * as fs from 'fs-extra';
import { Readable } from 'stream';
import { createReadStream } from 'node:fs';
export class LocalDriver implements StorageDriver {
private readonly config: LocalStorageConfig;
@@ -43,6 +45,14 @@ export class LocalDriver implements StorageDriver {
}
}
async readStream(filePath: string): Promise<Readable> {
try {
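// Note: createReadStream opens the file lazily, so open failures surface as 'error' events on the stream rather than being caught here.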
return createReadStream(this._fullPath(filePath));
} catch (err) {
throw new Error(`Failed to read file: ${(err as Error).message}`);
}
}
async exists(filePath: string): Promise<boolean> {
try {
return await fs.pathExists(this._fullPath(filePath));

View File

@@ -71,6 +71,21 @@ export class S3Driver implements StorageDriver {
}
}
async readStream(filePath: string): Promise<Readable> {
try {
const command = new GetObjectCommand({
Bucket: this.config.bucket,
Key: filePath,
});
const response = await this.s3Client.send(command);
return response.Body as Readable;
} catch (err) {
throw new Error(`Failed to read file from S3: ${(err as Error).message}`);
}
}
async exists(filePath: string): Promise<boolean> {
try {
const command = new HeadObjectCommand({

View File

@@ -1,3 +1,5 @@
import { Readable } from 'stream';
export interface StorageDriver {
upload(filePath: string, file: Buffer): Promise<void>;
@@ -5,6 +7,9 @@ export interface StorageDriver {
read(filePath: string): Promise<Buffer>;
readStream(filePath: string): Promise<Readable>;
exists(filePath: string): Promise<boolean>;
getUrl(filePath: string): string;

View File

@@ -1,6 +1,7 @@
import { Inject, Injectable, Logger } from '@nestjs/common';
import { STORAGE_DRIVER_TOKEN } from './constants/storage.constants';
import { StorageDriver } from './interfaces';
import { Readable } from 'stream';
@Injectable()
export class StorageService {
@@ -23,6 +24,10 @@ export class StorageService {
return this.storageDriver.read(filePath);
}
async readStream(filePath: string): Promise<Readable> {
return this.storageDriver.readStream(filePath);
}
async exists(filePath: string): Promise<boolean> {
return this.storageDriver.exists(filePath);
}

pnpm-lock.yaml (generated, 42 lines changed)
View File

@@ -552,9 +552,15 @@ importers:
stripe:
specifier: ^17.5.0
version: 17.5.0
tmp-promise:
specifier: ^3.0.3
version: 3.0.3
ws:
specifier: ^8.18.0
version: 8.18.0
yauzl:
specifier: ^3.2.0
version: 3.2.0
devDependencies:
'@eslint/js':
specifier: ^9.20.0
@@ -604,6 +610,9 @@ importers:
'@types/ws':
specifier: ^8.5.14
version: 8.5.14
'@types/yauzl':
specifier: ^2.10.3
version: 2.10.3
eslint:
specifier: ^9.20.1
version: 9.20.1(jiti@1.21.0)
@@ -4093,6 +4102,9 @@ packages:
'@types/yargs@17.0.32':
resolution: {integrity: sha512-xQ67Yc/laOG5uMfX/093MRlGGCIBzZMarVa+gfNKJxWAIgykYpVGkBdbqEzGDDfCrVUj6Hiff4mTZ5BA6TmAog==}
'@types/yauzl@2.10.3':
resolution: {integrity: sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==}
'@typescript-eslint/eslint-plugin@8.17.0':
resolution: {integrity: sha512-HU1KAdW3Tt8zQkdvNoIijfWDMvdSweFYm4hWh+KwhPstv+sCmWb89hCIP8msFm9N1R/ooh9honpSuvqKWlYy3w==}
engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0}
@@ -4610,6 +4622,9 @@ packages:
bser@2.1.1:
resolution: {integrity: sha512-gQxTNE/GAfIIrmHLUE3oJyp5FO6HRBfhjnw4/wMmA63ZGDJnWBmgY/lyQBpnDUkGmAhbSe39tx2d/iTOAfglwQ==}
buffer-crc32@0.2.13:
resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==}
buffer-equal-constant-time@1.0.1:
resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==}
@@ -7196,6 +7211,9 @@ packages:
resolution: {integrity: sha512-nri2TO5JE3/mRryik9LlHFT53cgHfRK0Lt0BAZQXku/AW3E6XLt2GaY8siWi7dvW/m1z0ecn+J+bpDa9ZN3IsQ==}
engines: {node: '>=18'}
pend@1.2.0:
resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==}
pg-cloudflare@1.1.1:
resolution: {integrity: sha512-xWPagP/4B6BgFO+EKz3JONXv3YDgvkbVrGw2mTo3D6tVDQRh1e7cqVGvyR3BE+eQgAvx1XhW/iEASj4/jCWl3Q==}
@@ -8248,6 +8266,9 @@ packages:
resolution: {integrity: sha512-QNtgIqSUb9o2CoUjX9T5TwaIvUUJFU1+12PJkgt42DFV2yf9J6549yTF2uGloQsJ/JOC8X+gIB81ind97hRiIQ==}
hasBin: true
tmp-promise@3.0.3:
resolution: {integrity: sha512-RwM7MoPojPxsOBYnyd2hy0bxtIlVrihNs9pj5SUvY8Zz1sQcQG2tG1hSr8PDxfgEB8RNKDhqbIlroIarSNDNsQ==}
tmp@0.0.33:
resolution: {integrity: sha512-jRCJlojKnZ3addtTOjdIqoRuPEKBvNXcGYqzO6zWZX8KfKEpnGY5jfggJQ3EjKuu8D4bJRr0y+cYJFmYbImXGw==}
engines: {node: '>=0.6.0'}
@@ -8904,6 +8925,10 @@ packages:
resolution: {integrity: sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==}
engines: {node: '>=12'}
yauzl@3.2.0:
resolution: {integrity: sha512-Ow9nuGZE+qp1u4JIPvg+uCiUr7xGQWdff7JQSk5VGYTAZMDe2q8lxJ10ygv10qmSj031Ty/6FNJpLO4o1Sgc+w==}
engines: {node: '>=12'}
yjs@13.6.20:
resolution: {integrity: sha512-Z2YZI+SYqK7XdWlloI3lhMiKnCdFCVC4PchpdO+mCYwtiTwncjUbnRK9R1JmkNfdmHyDXuWN3ibJAt0wsqTbLQ==}
engines: {node: '>=16.0.0', npm: '>=8.0.0'}
@@ -13261,6 +13286,10 @@ snapshots:
dependencies:
'@types/yargs-parser': 21.0.3
'@types/yauzl@2.10.3':
dependencies:
'@types/node': 22.13.4
'@typescript-eslint/eslint-plugin@8.17.0(@typescript-eslint/parser@8.17.0(eslint@9.15.0(jiti@1.21.0))(typescript@5.7.2))(eslint@9.15.0(jiti@1.21.0))(typescript@5.7.2)':
dependencies:
'@eslint-community/regexpp': 4.12.1
@@ -13959,6 +13988,8 @@ snapshots:
dependencies:
node-int64: 0.4.0
buffer-crc32@0.2.13: {}
buffer-equal-constant-time@1.0.1: {}
buffer-from@1.1.2: {}
@@ -16994,6 +17025,8 @@ snapshots:
peek-readable@7.0.0: {}
pend@1.2.0: {}
pg-cloudflare@1.1.1:
optional: true
@@ -18163,6 +18196,10 @@ snapshots:
dependencies:
tldts-core: 6.1.72
tmp-promise@3.0.3:
dependencies:
tmp: 0.2.1
tmp@0.0.33:
dependencies:
os-tmpdir: 1.0.2
@@ -18759,6 +18796,11 @@ snapshots:
y18n: 5.0.8
yargs-parser: 21.1.1
yauzl@3.2.0:
dependencies:
buffer-crc32: 0.2.13
pend: 1.2.0
yjs@13.6.20:
dependencies:
lib0: 0.2.98