Merge remote-tracking branch 'origin/master' into ollama-auth

This commit is contained in:
projectmoon 2024-08-02 10:17:08 +02:00
commit c3dac38b6a
3 changed files with 41 additions and 30 deletions

View File

@ -157,35 +157,43 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
question = 'Summarize'; question = 'Summarize';
} }
let docs = [] let docs = [];
const linkDocs = await getDocumentsFromLinks({ links }); const linkDocs = await getDocumentsFromLinks({ links });
const docGroups: Document[] = []; const docGroups: Document[] = [];
linkDocs.map((doc) => { linkDocs.map((doc) => {
const URLDocExists = docGroups.find((d) => d.metadata.url === doc.metadata.url && d.metadata.totalDocs < 10); const URLDocExists = docGroups.find(
(d) =>
d.metadata.url === doc.metadata.url && d.metadata.totalDocs < 10,
);
if (!URLDocExists) { if (!URLDocExists) {
docGroups.push({ docGroups.push({
...doc, ...doc,
metadata: { metadata: {
...doc.metadata, ...doc.metadata,
totalDocs: 1 totalDocs: 1,
} },
}); });
} }
const docIndex = docGroups.findIndex((d) => d.metadata.url === doc.metadata.url && d.metadata.totalDocs < 10); const docIndex = docGroups.findIndex(
(d) =>
d.metadata.url === doc.metadata.url && d.metadata.totalDocs < 10,
);
if (docIndex !== -1) { if (docIndex !== -1) {
docGroups[docIndex].pageContent = docGroups[docIndex].pageContent + `\n\n` + doc.pageContent; docGroups[docIndex].pageContent =
docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
docGroups[docIndex].metadata.totalDocs += 1; docGroups[docIndex].metadata.totalDocs += 1;
} }
}) });
await Promise.all(docGroups.map(async (doc) => { await Promise.all(
const res = await llm.invoke(` docGroups.map(async (doc) => {
const res = await llm.invoke(`
You are a text summarizer. You need to summarize the text provided inside the \`text\` XML block. You are a text summarizer. You need to summarize the text provided inside the \`text\` XML block.
You need to summarize the text into 1 or 2 sentences capturing the main idea of the text. You need to summarize the text into 1 or 2 sentences capturing the main idea of the text.
You need to make sure that you don't miss any point while summarizing the text. You need to make sure that you don't miss any point while summarizing the text.
@ -204,16 +212,17 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
Make sure to answer the query in the summary. Make sure to answer the query in the summary.
`); `);
const document = new Document({ const document = new Document({
pageContent: res.content as string, pageContent: res.content as string,
metadata: { metadata: {
title: doc.metadata.title, title: doc.metadata.title,
url: doc.metadata.url, url: doc.metadata.url,
}, },
}) });
docs.push(document) docs.push(document);
})) }),
);
return { query: question, docs: docs }; return { query: question, docs: docs };
} else { } else {

View File

@ -30,9 +30,9 @@ server.listen(port, () => {
startWebSocketServer(server); startWebSocketServer(server);
process.on('uncaughtException', (err, origin) => { process.on('uncaughtException', (err, origin) => {
logger.error(`Uncaught Exception at ${origin}: ${err}`) logger.error(`Uncaught Exception at ${origin}: ${err}`);
}) });
process.on('unhandledRejection', (reason, promise) => { process.on('unhandledRejection', (reason, promise) => {
logger.error(`Unhandled Rejection at: ${promise}, reason: ${reason}`) logger.error(`Unhandled Rejection at: ${promise}, reason: ${reason}`);
}) });

View File

@ -1,8 +1,8 @@
import axios from 'axios'; import axios from 'axios';
import { htmlToText } from 'html-to-text' import { htmlToText } from 'html-to-text';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents'; import { Document } from '@langchain/core/documents';
import pdfParse from 'pdf-parse' import pdfParse from 'pdf-parse';
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => { export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
const splitter = new RecursiveCharacterTextSplitter(); const splitter = new RecursiveCharacterTextSplitter();
@ -23,14 +23,14 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
const isPdf = res.headers['content-type'] === 'application/pdf'; const isPdf = res.headers['content-type'] === 'application/pdf';
if (isPdf) { if (isPdf) {
const pdfText = await pdfParse(res.data) const pdfText = await pdfParse(res.data);
const parsedText = pdfText.text const parsedText = pdfText.text
.replace(/(\r\n|\n|\r)/gm, ' ') .replace(/(\r\n|\n|\r)/gm, ' ')
.replace(/\s+/g, ' ') .replace(/\s+/g, ' ')
.trim(); .trim();
const splittedText = await splitter.splitText(parsedText); const splittedText = await splitter.splitText(parsedText);
const title = 'PDF Document' const title = 'PDF Document';
const linkDocs = splittedText.map((text) => { const linkDocs = splittedText.map((text) => {
return new Document({ return new Document({
@ -52,16 +52,18 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
selector: 'a', selector: 'a',
options: { options: {
ignoreHref: true, ignoreHref: true,
} },
}, },
] ],
}) })
.replace(/(\r\n|\n|\r)/gm, ' ') .replace(/(\r\n|\n|\r)/gm, ' ')
.replace(/\s+/g, ' ') .replace(/\s+/g, ' ')
.trim(); .trim();
const splittedText = await splitter.splitText(parsedText); const splittedText = await splitter.splitText(parsedText);
const title = res.data.toString('utf8').match(/<title>(.*?)<\/title>/)?.[1]; const title = res.data
.toString('utf8')
.match(/<title>(.*?)<\/title>/)?.[1];
const linkDocs = splittedText.map((text) => { const linkDocs = splittedText.map((text) => {
return new Document({ return new Document({