feat: chat with pdf

This commit is contained in:
pit
2023-10-25 09:29:34 +03:00
parent 1c34eddd10
commit a03e74d660
10 changed files with 2030 additions and 45 deletions

View File

@ -0,0 +1,35 @@
import { Pinecone } from '@pinecone-database/pinecone';
import { getEmbeddings } from './embeddings';
/**
 * Queries the Pinecone index for the vectors nearest to the given
 * embedding and returns up to the top 5 matches (with metadata).
 *
 * @param embeddings - the query embedding vector
 * @returns the matching records, or an empty array when none are found
 * @throws Error when the Pinecone query fails
 */
export async function getMatchesFromEmbeddings(embeddings: number[]) {
  // Client is configured from the environment on every call.
  const client = new Pinecone({
    apiKey: process.env.PINECONE_API_KEY!,
    environment: process.env.PINECONE_ENV!,
  });
  const index = client.index('documenso-chat-with-pdf-test');
  try {
    const result = await index.query({
      topK: 5,
      vector: embeddings,
      includeMetadata: true,
    });
    return result.matches || [];
  } catch (error) {
    console.error('There was an error getting matches from embeddings: ', error);
    throw new Error('There was an error getting matches from embeddings');
  }
}
export async function getContext(query: string) {
const queryEmbeddings = await getEmbeddings(query);
const matches = await getMatchesFromEmbeddings(queryEmbeddings);
const qualifyingMatches = matches.filter((match) => match.score && match.score > 0.7);
const docs = qualifyingMatches.map((match) => match.metadata?.text);
return docs.join('\n').substring(0, 3000);
}

View File

@ -0,0 +1,23 @@
import { Configuration, OpenAIApi } from 'openai-edge';
// Edge-compatible OpenAI client, configured from the environment.
const config = new Configuration({
  apiKey: process.env.OPENAI_API_KEY!,
});
const openai = new OpenAIApi(config);
/**
 * Embeds a single text with OpenAI's text-embedding-ada-002 model.
 * Newlines are collapsed to spaces before embedding.
 *
 * @param text - raw text to embed
 * @returns the embedding vector for the whitespace-normalized input
 * @throws Error when the API call fails or returns no embedding data
 */
export async function getEmbeddings(text: string) {
  try {
    const response = await openai.createEmbedding({
      model: 'text-embedding-ada-002',
      input: text.replace(/\n/g, ' '),
    });
    // openai-edge returns a raw fetch Response; surface HTTP failures
    // explicitly instead of letting the shape check below fail with a
    // confusing TypeError.
    if (!response.ok) {
      throw new Error(`OpenAI embeddings request failed with status ${response.status}`);
    }
    const result = await response.json();
    if (!result?.data?.[0]?.embedding) {
      throw new Error('OpenAI embeddings response contained no embedding data');
    }
    return result.data[0].embedding;
  } catch (error) {
    console.error('There was an error getting embeddings: ', error);
    throw new Error('There was an error getting embeddings');
  }
}

View File

@ -0,0 +1,113 @@
import { Pinecone } from '@pinecone-database/pinecone';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import md5 from 'md5';
import { getEmbeddings } from './embeddings';
// Lazily-initialized module-wide Pinecone client (see getPineconeClient).
let pc: Pinecone | null = null;

/**
 * Shape of a page produced by langchain's PDFLoader, narrowed to the
 * fields this module actually reads: `prepareDocument` uses
 * `pageContent` and `metadata.loc.pageNumber`. Extra loader-provided
 * metadata fields are tolerated via the index signature.
 *
 * Replaces the previous `unknown` placeholder, which made the
 * destructuring in `prepareDocument` a strict-mode type error.
 */
export type PDFPage = {
  pageContent: string;
  metadata: {
    loc: { pageNumber: number };
    [key: string]: unknown;
  };
};
/**
 * Returns the shared Pinecone client, creating it on first use.
 * Credentials are read from PINECONE_API_KEY / PINECONE_ENV.
 */
export const getPineconeClient = () => {
  pc ??= new Pinecone({
    apiKey: process.env.PINECONE_API_KEY!,
    environment: process.env.PINECONE_ENV!,
  });
  return pc;
};
/**
 * Loads a PDF from disk, splits each page into chunks, embeds every
 * chunk, and upserts the resulting vectors into the Pinecone index.
 *
 * @param file - path to the PDF file on disk
 * @throws Error when no file is given, when embedding fails, or when
 *   the Pinecone upsert fails
 */
export async function loadFileIntoPinecone(file: string) {
  if (!file) {
    throw new Error('No file provided');
  }
  const loader = new PDFLoader(file);
  const pages: PDFPage[] = await loader.load();
  // Split every page into chunks, then embed all chunks in parallel.
  const documents = await Promise.all(pages.map(prepareDocument));
  const vectors = await Promise.all(documents.flat().map(embedDocuments));
  const client = getPineconeClient();
  const pineconeIndex = client.index('documenso-chat-with-pdf-test');
  try {
    await pineconeIndex.upsert(vectors);
  } catch (error) {
    console.error('There was an error upserting vectors: ', error);
    // Re-throw so callers don't mistake a failed ingest for success;
    // previously this error was swallowed after logging.
    throw new Error('There was an error upserting vectors');
  }
}
/**
 * Turns one prepared document chunk into a Pinecone vector record.
 * The record id is the md5 hash of the chunk text, so re-ingesting
 * identical content overwrites rather than duplicates.
 *
 * @param doc - a chunk produced by prepareDocument (pageContent plus
 *   metadata.text / metadata.pageNumber); typed structurally here to
 *   avoid the previous implicit-any parameter
 * @throws Error when the embedding call fails
 */
async function embedDocuments(doc: {
  pageContent: string;
  metadata: { text: string; pageNumber: number };
}) {
  try {
    const embeddings = await getEmbeddings(doc.pageContent);
    const hash = md5(doc.pageContent);
    return {
      id: hash,
      values: embeddings,
      metadata: {
        text: doc.metadata.text,
        pageNumber: doc.metadata.pageNumber,
      },
    };
  } catch (error) {
    console.error('There was an error embedding documents: ', error);
    throw new Error('There was an error embedding documents');
  }
}
/**
 * Truncates a string to at most `numBytes` bytes of its UTF-8
 * encoding. A multi-byte character cut in half decodes to the
 * replacement character (U+FFFD), matching non-fatal UTF-8 decoding.
 *
 * @param str - the string to truncate
 * @param numBytes - maximum UTF-8 byte length of the result
 */
export const truncateStringByBytes = (str: string, numBytes: number) => {
  const bytes = new TextEncoder().encode(str);
  const truncated = bytes.slice(0, numBytes);
  return new TextDecoder('utf-8').decode(truncated);
};
/**
 * Prepares one PDF page for embedding: strips newlines, then splits
 * the page into chunks, tagging each chunk with its page number and a
 * byte-truncated copy of the page text (Pinecone metadata size limit).
 *
 * @param page - a page from langchain's PDFLoader
 * @returns the split document chunks for this page
 */
async function prepareDocument(page: PDFPage) {
  const { pageContent: rawContent, metadata } = page;
  const flattened = rawContent.replace(/\n/g, '');
  const splitter = new RecursiveCharacterTextSplitter();
  return splitter.splitDocuments([
    {
      pageContent: flattened,
      metadata: {
        pageNumber: metadata.loc.pageNumber,
        text: truncateStringByBytes(flattened, 36000),
      },
    },
  ]);
}
/**
 * Removes every character outside the 7-bit ASCII range (U+0000-U+007F).
 */
function convertToAscii(input: string) {
  let result = '';
  for (let i = 0; i < input.length; i++) {
    const code = input.charCodeAt(i);
    if (code <= 0x7f) {
      result += input[i];
    }
  }
  return result;
}