mirror of
https://github.com/documenso/documenso.git
synced 2025-11-13 08:13:56 +10:00
feat: chat with pdf
This commit is contained in:
35
packages/lib/server-only/context.ts
Normal file
35
packages/lib/server-only/context.ts
Normal file
@ -0,0 +1,35 @@
|
||||
import { Pinecone } from '@pinecone-database/pinecone';
|
||||
|
||||
import { getEmbeddings } from './embeddings';
|
||||
|
||||
/**
 * Looks up the top 5 matches for an embedding vector in the
 * `documenso-chat-with-pdf-test` Pinecone index, including each match's metadata.
 *
 * @param embeddings - The query vector.
 * @returns The matches reported by Pinecone, or an empty array when none are returned.
 * @throws Error with a generic message when the query fails (the underlying error is logged).
 */
export async function getMatchesFromEmbeddings(embeddings: number[]) {
  // A client is built per call from the PINECONE_API_KEY / PINECONE_ENV environment variables.
  const client = new Pinecone({
    apiKey: process.env.PINECONE_API_KEY!,
    environment: process.env.PINECONE_ENV!,
  });
  const index = client.index('documenso-chat-with-pdf-test');

  try {
    const { matches } = await index.query({
      vector: embeddings,
      topK: 5,
      includeMetadata: true,
    });

    return matches ?? [];
  } catch (err) {
    console.error('There was an error getting matches from embeddings: ', err);
    throw new Error('There was an error getting matches from embeddings');
  }
}
|
||||
|
||||
/**
 * Builds a plain-text context string for a user query.
 *
 * The query is embedded, matched against the vector index, and the `text`
 * metadata of every sufficiently similar match is concatenated (one entry per
 * line) and capped at a fixed length.
 *
 * @param query - The user's question.
 * @returns Up to 3000 characters of newline-separated matched text; an empty
 *   string when nothing qualifies.
 * @throws Whatever `getEmbeddings` or `getMatchesFromEmbeddings` throw.
 */
export async function getContext(query: string) {
  // Matches scoring at or below this value are ignored.
  const minScore = 0.7;
  // Upper bound (in UTF-16 code units) on the returned context.
  const maxContextLength = 3000;

  const queryEmbeddings = await getEmbeddings(query);
  const matches = await getMatchesFromEmbeddings(queryEmbeddings);

  const docs = matches
    .filter((match) => (match.score ?? 0) > minScore)
    .map((match) => match.metadata?.text)
    // Skip matches without usable text: `join` would otherwise turn them into
    // blank lines (or stringified non-text values) that eat into the budget.
    .filter((text): text is string => typeof text === 'string' && text.length > 0);

  return docs.join('\n').substring(0, maxContextLength);
}
|
||||
23
packages/lib/server-only/embeddings.ts
Normal file
23
packages/lib/server-only/embeddings.ts
Normal file
@ -0,0 +1,23 @@
|
||||
import { Configuration, OpenAIApi } from 'openai-edge';
|
||||
|
||||
// OpenAI client configuration; the key is read from the OPENAI_API_KEY environment variable.
const config = new Configuration({ apiKey: process.env.OPENAI_API_KEY! });

// Module-level OpenAI client used by getEmbeddings.
const openai = new OpenAIApi(config);
|
||||
|
||||
/**
 * Requests an embedding vector for `text` from the `text-embedding-ada-002` model.
 *
 * Newlines in the input are replaced with spaces before the request is sent.
 *
 * @param text - The text to embed.
 * @returns The embedding of the first (only) input as an array of numbers.
 * @throws Error with a generic message when the request fails, the API answers
 *   with a non-success status, or the payload has no numeric embedding. The
 *   specific cause is logged before rethrowing.
 */
export async function getEmbeddings(text: string): Promise<number[]> {
  try {
    const response = await openai.createEmbedding({
      model: 'text-embedding-ada-002',
      input: text.replace(/\n/g, ' '),
    });

    // Surface API-level failures explicitly instead of letting them show up
    // later as an opaque "cannot read properties of undefined" TypeError.
    if (!response.ok) {
      throw new Error(`Embedding request failed with status ${response.status}`);
    }

    const payload: { data?: Array<{ embedding?: unknown }> } | null = await response.json();
    const embedding = payload?.data?.[0]?.embedding;

    if (
      !Array.isArray(embedding) ||
      !embedding.every((value): value is number => typeof value === 'number')
    ) {
      throw new Error('Embedding response did not contain a numeric embedding');
    }

    return embedding;
  } catch (error) {
    console.error('There was an error getting embeddings: ', error);
    throw new Error('There was an error getting embeddings');
  }
}
|
||||
113
packages/lib/server-only/pinecone.ts
Normal file
113
packages/lib/server-only/pinecone.ts
Normal file
@ -0,0 +1,113 @@
|
||||
import { Pinecone } from '@pinecone-database/pinecone';
|
||||
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
|
||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
import md5 from 'md5';
|
||||
|
||||
import { getEmbeddings } from './embeddings';
|
||||
|
||||
// Module-level cache for the Pinecone client; starts as null and is filled in by getPineconeClient().
let pc: Pinecone | null = null;
|
||||
|
||||
/**
 * A single page as consumed by the PDF ingestion helpers in this module.
 *
 * Only `pageContent` and `metadata.loc.pageNumber` are read by the code here.
 * NOTE(review): the loader is expected to also populate `metadata.source` and
 * `metadata.pdf` (version, info, totalPages) — confirm against the PDFLoader
 * output before relying on them.
 */
export type PDFPage = {
  /** Raw text extracted from the page. */
  pageContent: string;
  /**
   * Loader-provided metadata. Kept loose so the loader's result stays
   * assignable; callers should validate the fields they read.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- mirrors the loader's untyped metadata bag
  metadata: Record<string, any>;
};
|
||||
/**
 * Returns the shared Pinecone client, creating it on first use from the
 * PINECONE_API_KEY and PINECONE_ENV environment variables.
 */
export const getPineconeClient = () => {
  if (pc) {
    return pc;
  }

  pc = new Pinecone({
    apiKey: process.env.PINECONE_API_KEY!,
    environment: process.env.PINECONE_ENV!,
  });

  return pc;
};
|
||||
|
||||
/**
 * Loads a PDF, splits its pages into documents, embeds them, and upserts the
 * resulting vectors into the `documenso-chat-with-pdf-test` Pinecone index.
 *
 * @param file - Value handed to `PDFLoader` (presumably a file path — confirm with callers).
 * @throws Error when `file` is empty, when preparing or embedding a document
 *   fails, or when the upsert fails (the underlying upsert error is logged).
 */
export async function loadFileIntoPinecone(file: string) {
  if (!file) {
    throw new Error('No file provided');
  }

  const loader = new PDFLoader(file);
  const pages: PDFPage[] = await loader.load();

  // Each page may be split into several documents; flatten before embedding.
  const documents = await Promise.all(pages.map(prepareDocument));
  const vectors = await Promise.all(documents.flat().map(embedDocuments));

  // Nothing was extracted from the file, so there is nothing to index.
  if (vectors.length === 0) {
    return;
  }

  const client = getPineconeClient();
  const pineconeIndex = client.index('documenso-chat-with-pdf-test');

  try {
    await pineconeIndex.upsert(vectors);
  } catch (error) {
    console.error('There was an error upserting vectors: ', error);
    // Rethrow so callers do not mistake a failed ingestion for a successful
    // one; this mirrors how the other helpers report failures.
    throw new Error('There was an error upserting vectors');
  }
}
|
||||
|
||||
/**
 * Turns one split document into a vector record ready for upserting.
 *
 * The record id is the MD5 hash of the document's content, so identical
 * content always maps to the same id.
 *
 * @param doc - A document carrying `pageContent` plus `text` and `pageNumber` metadata.
 * @returns `{ id, values, metadata: { text, pageNumber } }`.
 * @throws Error with a generic message when the metadata is malformed or the
 *   embedding request fails (the specific cause is logged).
 */
async function embedDocuments(doc: { pageContent: string; metadata: Record<string, unknown> }) {
  try {
    const { text, pageNumber } = doc.metadata;

    // Validate before spending an embedding request on a record that could
    // not be stored with the metadata this module relies on.
    if (typeof text !== 'string' || typeof pageNumber !== 'number') {
      throw new Error('Document metadata must contain a string `text` and a numeric `pageNumber`');
    }

    const embeddings = await getEmbeddings(doc.pageContent);
    const hash = md5(doc.pageContent);

    return {
      id: hash,
      values: embeddings,
      metadata: {
        text,
        pageNumber,
      },
    };
  } catch (error) {
    console.error('There was an error embedding documents: ', error);
    throw new Error('There was an error embedding documents');
  }
}
|
||||
|
||||
/**
 * Truncates a string so that its UTF-8 encoding is at most `numBytes` bytes.
 *
 * The cut is moved back to the nearest character boundary so a multi-byte
 * character is never split; a split sequence would decode to U+FFFD.
 *
 * @param str - The string to truncate.
 * @param numBytes - Maximum size of the result in UTF-8 bytes; values of zero
 *   or less (or NaN) yield an empty string.
 * @returns The longest prefix of `str` that fits within the byte budget.
 */
export const truncateStringByBytes = (str: string, numBytes: number) => {
  const bytes = new TextEncoder().encode(str);

  let end = bytes.length;

  if (!(bytes.length <= numBytes)) {
    end = Math.max(0, Math.floor(numBytes)) || 0;

    // UTF-8 continuation bytes look like 10xxxxxx. If the byte right after the
    // cut is one, the cut is inside a character: back up to its lead byte.
    while (end > 0 && ((bytes[end] ?? 0) & 0xc0) === 0x80) {
      end -= 1;
    }
  }

  return new TextDecoder('utf-8').decode(bytes.slice(0, end));
};
|
||||
|
||||
/**
 * Splits one loaded PDF page into smaller documents suitable for embedding.
 *
 * Newlines are stripped from the page text, and every resulting document is
 * given `pageNumber` and `text` metadata (the page text capped at 36000 bytes).
 *
 * NOTE(review): newlines are removed without inserting a space, which can fuse
 * the last word of a line with the first word of the next — confirm intended.
 * NOTE(review): `text` is derived from the whole page before splitting, so
 * each chunk presumably carries the full page text rather than its own slice —
 * verify against the splitter's metadata handling.
 *
 * @param page - A page as returned by the PDF loader.
 * @returns The documents produced by the text splitter.
 * @throws Error when the page lacks string content or a numeric page number.
 */
async function prepareDocument(page: PDFPage) {
  // The page type may be opaque, so check the shape at runtime before use.
  const candidate: unknown = page;

  if (typeof candidate !== 'object' || candidate === null) {
    throw new Error('PDF page must be an object');
  }

  const { pageContent: rawContent, metadata } = candidate as {
    pageContent?: unknown;
    metadata?: { loc?: { pageNumber?: unknown } } | null;
  };
  const pageNumber = metadata?.loc?.pageNumber;

  if (typeof rawContent !== 'string') {
    throw new Error('PDF page is missing its text content');
  }

  if (typeof pageNumber !== 'number') {
    throw new Error('PDF page is missing its page number');
  }

  const pageContent = rawContent.replace(/\n/g, '');

  const splitter = new RecursiveCharacterTextSplitter();
  const docs = await splitter.splitDocuments([
    {
      pageContent,
      metadata: {
        pageNumber,
        text: truncateStringByBytes(pageContent, 36000),
      },
    },
  ]);

  return docs;
}
|
||||
|
||||
/**
 * Strips every UTF-16 code unit above 0x7F from `input`, leaving only ASCII.
 *
 * @param input - The string to filter.
 * @returns `input` with all non-ASCII code units removed.
 */
function convertToAscii(input: string) {
  let ascii = '';

  for (let i = 0; i < input.length; i += 1) {
    if (input.charCodeAt(i) <= 0x7f) {
      ascii += input.charAt(i);
    }
  }

  return ascii;
}
|
||||
Reference in New Issue
Block a user