Compare commits

...

2 Commits

Author SHA1 Message Date
8143452a21 cleanup 2025-05-28 19:36:24 -07:00
003b8f5515 utility files 2025-05-28 19:35:12 -07:00
4 changed files with 312 additions and 53 deletions

View File

@ -32,8 +32,27 @@ export function getFileTaskFolderPath(
}
}
export function extractZip(source: string, target: string) {
//https://github.com/Surfer-Org
/**
 * Unpacks the ZIP archive at `source` into the `target` directory.
 *
 * Thin public wrapper: delegates to `extractZipInternal` with nested-archive
 * handling switched on.
 *
 * @param source Path to the ZIP file.
 * @param target Directory to extract into.
 */
export async function extractZip(
  source: string,
  target: string,
): Promise<void> {
  const allowNested = true;
  await extractZipInternal(source, target, allowNested);
}
/**
* Internal helper to extract a ZIP, with optional single-nested-ZIP handling.
*
* When `allowNested` is true and the archive holds exactly one entry that is
* itself a `.zip` file, that inner archive is written next to `source`,
* extracted into `target` with nesting disabled, and then deleted.
* Entries under `__MACOSX/` are skipped.
*
* @param source Path to the ZIP file
* @param target Directory to extract into
* @param allowNested Whether to check and unwrap one level of nested ZIP
* @returns Promise that resolves when the archive emits `end` and rejects on
* open, stream or filesystem errors other than the skipped `ENAMETOOLONG` cases
*/
function extractZipInternal(
source: string,
target: string,
allowNested: boolean,
): Promise<void> {
return new Promise((resolve, reject) => {
yauzl.open(
source,
@ -41,54 +60,128 @@ export function extractZip(source: string, target: string) {
(err, zipfile) => {
// Opening the archive failed: nothing to extract.
if (err) return reject(err);
// Handle one level of nested ZIP if allowed
// Only an archive with exactly one entry is inspected for a wrapped inner ZIP.
if (allowNested && zipfile.entryCount === 1) {
zipfile.readEntry();
zipfile.once('entry', (entry) => {
const name = entry.fileName.toString('utf8').replace(/^\/+/, '');
// A nested archive is a non-directory entry (no trailing slash) whose
// name ends in ".zip", compared case-insensitively.
const isZip =
!/\/$/.test(entry.fileName) &&
name.toLowerCase().endsWith('.zip');
if (isZip) {
// Temporary file for the inner archive, derived from the source path:
// "x.zip" becomes "x.inner.zip", anything else gets ".inner.zip" appended.
const nestedPath = source.endsWith('.zip')
? source.slice(0, -4) + '.inner.zip'
: source + '.inner.zip';
zipfile.openReadStream(entry, (openErr, rs) => {
if (openErr) return reject(openErr);
const ws = fs.createWriteStream(nestedPath);
rs.on('error', reject);
ws.on('error', reject);
// Once the inner archive is fully written: close the outer archive,
// extract the inner one with nesting disabled (so at most one level is
// unwrapped), then delete the temporary file.
ws.on('finish', () => {
zipfile.close();
extractZipInternal(nestedPath, target, false)
.then(() => {
fs.unlinkSync(nestedPath);
resolve();
})
.catch(reject);
});
rs.pipe(ws);
});
} else {
// The sole entry is not a ZIP: close this handle and run a plain
// extraction of the same archive.
zipfile.close();
extractZipInternal(source, target, false).then(resolve, reject);
}
});
zipfile.once('error', reject);
// The single-entry case is fully handled above; skip normal extraction.
return;
}
// Normal extraction
zipfile.readEntry();
zipfile.on('entry', (entry) => {
const name = entry.fileName.toString('utf8'); // or 'cp437' if you need the original DOS charset
const safeName = name.replace(/^\/+/, ''); // strip any leading slashes
const fullPath = path.join(target, safeName);
const directory = path.dirname(fullPath);
// <-- skip all macOS metadata
if (safeName.startsWith('__MACOSX/')) {
return zipfile.readEntry();
const name = entry.fileName.toString('utf8');
const safe = name.replace(/^\/+/, '');
if (safe.startsWith('__MACOSX/')) {
zipfile.readEntry();
return;
}
if (/\/$/.test(entry.fileName)) {
// Directory entry
const fullPath = path.join(target, safe);
// Handle directories
if (/\/$/.test(name)) {
try {
fs.mkdirSync(fullPath, { recursive: true });
zipfile.readEntry();
} catch (err) {
reject(err);
}
} else {
// File entry
try {
fs.mkdirSync(directory, { recursive: true });
zipfile.openReadStream(entry, (err, readStream) => {
if (err) return reject(err);
const writeStream = fs.createWriteStream(fullPath);
readStream.on('end', () => {
writeStream.end();
zipfile.readEntry();
});
readStream.pipe(writeStream);
});
} catch (err) {
reject(err);
} catch (mkdirErr: any) {
if (mkdirErr.code === 'ENAMETOOLONG') {
console.warn(`Skipping directory (path too long): ${fullPath}`);
zipfile.readEntry();
return;
}
return reject(mkdirErr);
}
zipfile.readEntry();
return;
}
// Handle files
// Create the parent directory first; a too-long path is not fatal, the
// entry is skipped with a warning and extraction moves on.
try {
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
} catch (mkdirErr: any) {
if (mkdirErr.code === 'ENAMETOOLONG') {
console.warn(
`Skipping file directory creation (path too long): ${fullPath}`,
);
zipfile.readEntry();
return;
}
return reject(mkdirErr);
}
zipfile.openReadStream(entry, (openErr, rs) => {
if (openErr) return reject(openErr);
let ws: fs.WriteStream;
try {
ws = fs.createWriteStream(fullPath);
} catch (openWsErr: any) {
if (openWsErr.code === 'ENAMETOOLONG') {
console.warn(
`Skipping file write (path too long): ${fullPath}`,
);
zipfile.readEntry();
return;
}
return reject(openWsErr);
}
rs.on('error', (err) => reject(err));
// Same ENAMETOOLONG policy for errors reported by the write stream itself:
// warn and continue with the next entry, reject on anything else.
ws.on('error', (err) => {
if ((err as any).code === 'ENAMETOOLONG') {
console.warn(
`Skipping file write on stream (path too long): ${fullPath}`,
);
zipfile.readEntry();
} else {
reject(err);
}
});
// Request the next entry only after the write stream has emitted 'finish'.
ws.on('finish', () => zipfile.readEntry());
rs.pipe(ws);
});
});
zipfile.on('end', resolve);
zipfile.on('error', reject);
// Settle the promise: resolve when the archive emits 'end', reject on an
// archive-level error.
zipfile.on('end', () => resolve());
zipfile.on('error', (err) => reject(err));
},
);
});
}
/**
 * Strips the query string from a URL or path.
 *
 * @param url Value that may carry a `?query` suffix; may be missing, e.g. an
 *   absent `href` attribute.
 * @returns Everything before the first `?`, or `null` when `url` is empty,
 *   `null` or `undefined`.
 */
export function cleanUrlString(
  url: string | null | undefined,
): string | null {
  // The guard has to run before any string method is called: a missing href
  // would otherwise throw instead of yielding null.
  if (!url) return null;
  const queryStart = url.indexOf('?');
  return queryStart === -1 ? url : url.slice(0, queryStart);
}

View File

@ -1,6 +1,13 @@
import { Window } from 'happy-dom';
import { cleanUrlString } from './file.utils';
import {
Window,
HTMLAnchorElement,
HTMLIFrameElement,
Element as HDElement,
} from 'happy-dom';
import { getEmbedUrlAndProvider } from '@docmost/editor-ext';
import * as path from 'path';
import { v7 } from 'uuid';
import { InsertableBacklink } from '@docmost/db/types/entity.types';
export function formatImportHtml(html: string) {
const pmHtml = notionFormatter(html);
@ -14,23 +21,41 @@ export function defaultHtmlFormatter(html: string): string {
// embed providers
const anchors = Array.from(doc.getElementsByTagName('a'));
for (const a of anchors) {
const href = cleanUrlString(a.getAttribute('href')) ?? '';
if (!href) continue;
for (const node of anchors) {
const url = (node as HTMLAnchorElement).href;
if (!url) continue;
const embedProvider = getEmbedUrlAndProvider(href);
const embedProvider = getEmbedUrlAndProvider(url);
// we only want to embed valid matches
if (embedProvider.provider === 'iframe') continue;
if (embedProvider) {
const embed = doc.createElement('div');
embed.setAttribute('data-type', 'embed');
embed.setAttribute('data-src', href);
embed.setAttribute('data-provider', embedProvider.provider);
embed.setAttribute('data-align', 'center');
embed.setAttribute('data-width', '640');
embed.setAttribute('data-height', '480');
const embed = doc.createElement('div');
embed.setAttribute('data-type', 'embed');
embed.setAttribute('data-src', url);
embed.setAttribute('data-provider', embedProvider.provider);
embed.setAttribute('data-align', 'center');
embed.setAttribute('data-width', '640');
embed.setAttribute('data-height', '480');
a.replaceWith(embed);
}
node.replaceWith(embed);
}
// embed providers
const iframes = Array.from(doc.getElementsByTagName('iframe'));
for (const iframe of iframes) {
const url = (iframe as HTMLIFrameElement).src;
if (!url) continue;
const embedProvider = getEmbedUrlAndProvider(url);
const embed = doc.createElement('div');
embed.setAttribute('data-type', 'embed');
embed.setAttribute('data-src', url);
embed.setAttribute('data-provider', embedProvider.provider);
embed.setAttribute('data-align', 'center');
embed.setAttribute('data-width', '640');
embed.setAttribute('data-height', '480');
iframe.replaceWith(embed);
}
return doc.body.innerHTML;
@ -168,3 +193,78 @@ export function notionFormatter(html: string): string {
}
return doc.body.innerHTML;
}
/**
 * Lifts `node` out of every enclosing `<p>` or `<a>` element.
 *
 * While `node` still has a `<p>`/`<a>` ancestor: if that ancestor has exactly
 * one child node it is replaced by `node`; otherwise `node` is moved so that it
 * sits immediately before the ancestor. The DOM is modified in place.
 *
 * @param node Element to hoist out of its paragraph/anchor wrappers.
 */
export function unwrapFromParagraph(node: HDElement): void {
  // `closest` also tests the element it is called on. Searching from the parent
  // keeps a `<p>`/`<a>` argument from matching itself, which would never let
  // the loop below make progress.
  const findWrapper = (): HDElement | null =>
    (node.parentElement?.closest('p, a') ?? null) as HDElement | null;

  let wrapper = findWrapper();
  while (wrapper) {
    const parent = wrapper.parentNode;
    // A detached wrapper can be neither replaced nor used as an insertion
    // point, so there is nothing further to unwrap.
    if (!parent) break;

    if (wrapper.childNodes.length === 1) {
      // e.g. <p><node/></p> or <a><node/></a> → <node/>
      wrapper.replaceWith(node);
    } else {
      parent.insertBefore(node, wrapper);
    }
    wrapper = findWrapper();
  }
}
/**
 * Replaces links to other imported pages with page-mention elements.
 *
 * Every `<a href>` whose target, resolved relative to the directory of
 * `currentFilePath`, is a key of `filePathToPageMetaMap` is swapped for a
 * `<span data-type="mention">` describing that page, and a backlink record is
 * collected for it. All other links are left untouched.
 *
 * @param html HTML of the page being imported.
 * @param currentFilePath Path of the file the HTML came from; relative hrefs
 *   are resolved against its directory.
 * @param filePathToPageMetaMap Lookup from `/`-separated file path to the
 *   metadata of the page created for that file.
 * @param creatorId Written to the mention's `data-creator-id` attribute.
 * @param sourcePageId Id of the page that contains the links.
 * @param workspaceId Workspace recorded on each backlink.
 * @returns The rewritten HTML and one backlink per replaced link.
 */
export async function rewriteInternalLinksToMentionHtml(
  html: string,
  currentFilePath: string,
  filePathToPageMetaMap: Map<
    string,
    { id: string; title: string; slugId: string }
  >,
  creatorId: string,
  sourcePageId: string,
  workspaceId: string,
): Promise<{ html: string; backlinks: InsertableBacklink[] }> {
  const window = new Window();
  const doc = window.document;
  doc.body.innerHTML = html;

  // Map keys use forward slashes, so backslashes from path.join are converted.
  const normalize = (p: string) => p.replace(/\\/g, '/');
  // Match a real http(s) scheme only; a bare "http" prefix would also skip
  // relative links to files such as "http-notes.md".
  const externalUrl = /^https?:\/\//i;
  // Invariant across all links of this page.
  const parentDir = path.dirname(currentFilePath);

  const backlinks: InsertableBacklink[] = [];

  for (const a of Array.from(doc.getElementsByTagName('a'))) {
    const rawHref = a.getAttribute('href');
    if (!rawHref) continue;

    // skip absolute/external URLs
    if (externalUrl.test(rawHref) || rawHref.startsWith('/api/')) {
      continue;
    }

    let decodedRef: string;
    try {
      decodedRef = decodeURIComponent(rawHref);
    } catch {
      // Malformed percent-escape (e.g. a literal "%" in a file name): use the
      // href as written rather than aborting the whole rewrite.
      decodedRef = rawHref;
    }

    const resolved = normalize(path.join(parentDir, decodedRef));
    const pageMeta = filePathToPageMetaMap.get(resolved);
    if (!pageMeta) {
      continue;
    }

    const mentionEl = doc.createElement('span') as HDElement;
    mentionEl.setAttribute('data-type', 'mention');
    mentionEl.setAttribute('data-id', v7());
    mentionEl.setAttribute('data-entity-type', 'page');
    mentionEl.setAttribute('data-entity-id', pageMeta.id);
    mentionEl.setAttribute('data-label', pageMeta.title);
    mentionEl.setAttribute('data-slug-id', pageMeta.slugId);
    mentionEl.setAttribute('data-creator-id', creatorId);
    mentionEl.textContent = pageMeta.title;

    a.replaceWith(mentionEl);

    backlinks.push({
      sourcePageId,
      targetPageId: pageMeta.id,
      workspaceId: workspaceId,
    });
  }

  return { html: doc.body.innerHTML, backlinks };
}

View File

@ -0,0 +1,66 @@
import { promises as fs } from 'fs';
import * as path from 'path';
/**
 * Indexes every attachment file found under `extractDir`.
 *
 * The tree is walked recursively, depth first. Files whose extension is `.md`
 * or `.html` (case-insensitive) are treated as pages and left out.
 *
 * @param extractDir Root directory to scan.
 * @returns Map from each file's path relative to `extractDir`, written with
 *   `/` separators, to the path obtained by joining it onto `extractDir`.
 */
export async function buildAttachmentCandidates(
  extractDir: string,
): Promise<Map<string, string>> {
  const candidates = new Map<string, string>();

  const isPageFile = (fileName: string): boolean => {
    const extension = path.extname(fileName).toLowerCase();
    return extension === '.md' || extension === '.html';
  };

  const visit = async (folder: string): Promise<void> => {
    const entries = await fs.readdir(folder, { withFileTypes: true });
    for (const entry of entries) {
      const entryPath = path.join(folder, entry.name);
      if (entry.isDirectory()) {
        await visit(entryPath);
        continue;
      }
      if (isPageFile(entry.name)) {
        continue;
      }
      const key = path
        .relative(extractDir, entryPath)
        .split(path.sep)
        .join('/');
      candidates.set(key, entryPath);
    }
  };

  await visit(extractDir);
  return candidates;
}
/**
 * Resolves an attachment reference from a page to a key of
 * `attachmentCandidates`.
 *
 * The reference is tried first as a path relative to the extraction root, then
 * as a path relative to the page's own directory.
 *
 * @param raw Reference as written in the page; may be percent-encoded and may
 *   start with `./` or `/`.
 * @param pageDir Directory of the referencing page, relative to the extraction
 *   root.
 * @param attachmentCandidates Known attachments keyed by `/`-separated relative
 *   path (the shape produced by `buildAttachmentCandidates`).
 * @returns The matching key, or `null` when neither form is a known attachment.
 */
export function resolveRelativeAttachmentPath(
  raw: string,
  pageDir: string,
  attachmentCandidates: Map<string, string>,
): string | null {
  const stripped = raw.replace(/^\.?\/+/, '');

  let mainRel: string;
  try {
    mainRel = decodeURIComponent(stripped);
  } catch {
    // Malformed percent-escape (e.g. a literal "%" in a file name): fall back
    // to the reference as written instead of throwing URIError.
    mainRel = stripped;
  }

  if (attachmentCandidates.has(mainRel)) {
    return mainRel;
  }

  // Candidate keys are '/'-separated, so the OS-specific result of path.join
  // is converted before the lookup; on POSIX this is a no-op.
  const fallback = path
    .normalize(path.join(pageDir, mainRel))
    .split(path.sep)
    .join('/');
  if (attachmentCandidates.has(fallback)) {
    return fallback;
  }

  return null;
}
/**
 * Lists every Markdown and HTML file beneath `dir`.
 *
 * The tree is walked recursively, depth first; a file qualifies when its
 * extension is `.md` or `.html`, compared case-insensitively.
 *
 * @param dir Root directory to scan.
 * @returns Paths of the matching files, each formed by joining onto `dir`, in
 *   the order they were encountered.
 */
export async function collectMarkdownAndHtmlFiles(
  dir: string,
): Promise<string[]> {
  const found: string[] = [];
  const pageExtensions = ['.md', '.html'];

  const visit = async (folder: string): Promise<void> => {
    for (const entry of await fs.readdir(folder, { withFileTypes: true })) {
      const entryPath = path.join(folder, entry.name);
      if (entry.isDirectory()) {
        await visit(entryPath);
        continue;
      }
      const extension = path.extname(entry.name).toLowerCase();
      if (pageExtensions.includes(extension)) {
        found.push(entryPath);
      }
    }
  };

  await visit(dir);
  return found;
}

View File

@ -88,7 +88,7 @@ async function bootstrap() {
const logger = new Logger('NestApplication');
process.on('unhandledRejection', (reason, promise) => {
logger.error(`UnhandledRejection: ${promise}, reason: ${reason}`);
logger.error(`UnhandledRejection, reason: ${reason}`, promise);
});
process.on('uncaughtException', (error) => {