Compare commits
No commits in common. "c3dac38b6a0bcdbc801a73580953dfd6148c8ccf" and "df8d924a89c5373855a3d78ec0a37711a1774d85" have entirely different histories.
c3dac38b6a
...
df8d924a89
|
@ -157,43 +157,35 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
|
||||||
question = 'Summarize';
|
question = 'Summarize';
|
||||||
}
|
}
|
||||||
|
|
||||||
let docs = [];
|
let docs = []
|
||||||
|
|
||||||
const linkDocs = await getDocumentsFromLinks({ links });
|
const linkDocs = await getDocumentsFromLinks({ links });
|
||||||
|
|
||||||
const docGroups: Document[] = [];
|
const docGroups: Document[] = [];
|
||||||
|
|
||||||
linkDocs.map((doc) => {
|
linkDocs.map((doc) => {
|
||||||
const URLDocExists = docGroups.find(
|
const URLDocExists = docGroups.find((d) => d.metadata.url === doc.metadata.url && d.metadata.totalDocs < 10);
|
||||||
(d) =>
|
|
||||||
d.metadata.url === doc.metadata.url && d.metadata.totalDocs < 10,
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!URLDocExists) {
|
if (!URLDocExists) {
|
||||||
docGroups.push({
|
docGroups.push({
|
||||||
...doc,
|
...doc,
|
||||||
metadata: {
|
metadata: {
|
||||||
...doc.metadata,
|
...doc.metadata,
|
||||||
totalDocs: 1,
|
totalDocs: 1
|
||||||
},
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const docIndex = docGroups.findIndex(
|
const docIndex = docGroups.findIndex((d) => d.metadata.url === doc.metadata.url && d.metadata.totalDocs < 10);
|
||||||
(d) =>
|
|
||||||
d.metadata.url === doc.metadata.url && d.metadata.totalDocs < 10,
|
|
||||||
);
|
|
||||||
|
|
||||||
if (docIndex !== -1) {
|
if (docIndex !== -1) {
|
||||||
docGroups[docIndex].pageContent =
|
docGroups[docIndex].pageContent = docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
|
||||||
docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
|
|
||||||
docGroups[docIndex].metadata.totalDocs += 1;
|
docGroups[docIndex].metadata.totalDocs += 1;
|
||||||
}
|
}
|
||||||
});
|
})
|
||||||
|
|
||||||
await Promise.all(
|
await Promise.all(docGroups.map(async (doc) => {
|
||||||
docGroups.map(async (doc) => {
|
const res = await llm.invoke(`
|
||||||
const res = await llm.invoke(`
|
|
||||||
You are a text summarizer. You need to summarize the text provided inside the \`text\` XML block.
|
You are a text summarizer. You need to summarize the text provided inside the \`text\` XML block.
|
||||||
You need to summarize the text into 1 or 2 sentences capturing the main idea of the text.
|
You need to summarize the text into 1 or 2 sentences capturing the main idea of the text.
|
||||||
You need to make sure that you don't miss any point while summarizing the text.
|
You need to make sure that you don't miss any point while summarizing the text.
|
||||||
|
@ -212,17 +204,16 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
|
||||||
Make sure to answer the query in the summary.
|
Make sure to answer the query in the summary.
|
||||||
`);
|
`);
|
||||||
|
|
||||||
const document = new Document({
|
const document = new Document({
|
||||||
pageContent: res.content as string,
|
pageContent: res.content as string,
|
||||||
metadata: {
|
metadata: {
|
||||||
title: doc.metadata.title,
|
title: doc.metadata.title,
|
||||||
url: doc.metadata.url,
|
url: doc.metadata.url,
|
||||||
},
|
},
|
||||||
});
|
})
|
||||||
|
|
||||||
docs.push(document);
|
docs.push(document)
|
||||||
}),
|
}))
|
||||||
);
|
|
||||||
|
|
||||||
return { query: question, docs: docs };
|
return { query: question, docs: docs };
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -30,9 +30,9 @@ server.listen(port, () => {
|
||||||
startWebSocketServer(server);
|
startWebSocketServer(server);
|
||||||
|
|
||||||
process.on('uncaughtException', (err, origin) => {
|
process.on('uncaughtException', (err, origin) => {
|
||||||
logger.error(`Uncaught Exception at ${origin}: ${err}`);
|
logger.error(`Uncaught Exception at ${origin}: ${err}`)
|
||||||
});
|
})
|
||||||
|
|
||||||
process.on('unhandledRejection', (reason, promise) => {
|
process.on('unhandledRejection', (reason, promise) => {
|
||||||
logger.error(`Unhandled Rejection at: ${promise}, reason: ${reason}`);
|
logger.error(`Unhandled Rejection at: ${promise}, reason: ${reason}`)
|
||||||
});
|
})
|
|
@ -1,8 +1,8 @@
|
||||||
import axios from 'axios';
|
import axios from 'axios';
|
||||||
import { htmlToText } from 'html-to-text';
|
import { htmlToText } from 'html-to-text'
|
||||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||||
import { Document } from '@langchain/core/documents';
|
import { Document } from '@langchain/core/documents';
|
||||||
import pdfParse from 'pdf-parse';
|
import pdfParse from 'pdf-parse'
|
||||||
|
|
||||||
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
|
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
|
||||||
const splitter = new RecursiveCharacterTextSplitter();
|
const splitter = new RecursiveCharacterTextSplitter();
|
||||||
|
@ -23,14 +23,14 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
|
||||||
const isPdf = res.headers['content-type'] === 'application/pdf';
|
const isPdf = res.headers['content-type'] === 'application/pdf';
|
||||||
|
|
||||||
if (isPdf) {
|
if (isPdf) {
|
||||||
const pdfText = await pdfParse(res.data);
|
const pdfText = await pdfParse(res.data)
|
||||||
const parsedText = pdfText.text
|
const parsedText = pdfText.text
|
||||||
.replace(/(\r\n|\n|\r)/gm, ' ')
|
.replace(/(\r\n|\n|\r)/gm, ' ')
|
||||||
.replace(/\s+/g, ' ')
|
.replace(/\s+/g, ' ')
|
||||||
.trim();
|
.trim();
|
||||||
|
|
||||||
const splittedText = await splitter.splitText(parsedText);
|
const splittedText = await splitter.splitText(parsedText);
|
||||||
const title = 'PDF Document';
|
const title = 'PDF Document'
|
||||||
|
|
||||||
const linkDocs = splittedText.map((text) => {
|
const linkDocs = splittedText.map((text) => {
|
||||||
return new Document({
|
return new Document({
|
||||||
|
@ -52,18 +52,16 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
|
||||||
selector: 'a',
|
selector: 'a',
|
||||||
options: {
|
options: {
|
||||||
ignoreHref: true,
|
ignoreHref: true,
|
||||||
},
|
}
|
||||||
},
|
},
|
||||||
],
|
]
|
||||||
})
|
})
|
||||||
.replace(/(\r\n|\n|\r)/gm, ' ')
|
.replace(/(\r\n|\n|\r)/gm, ' ')
|
||||||
.replace(/\s+/g, ' ')
|
.replace(/\s+/g, ' ')
|
||||||
.trim();
|
.trim();
|
||||||
|
|
||||||
const splittedText = await splitter.splitText(parsedText);
|
const splittedText = await splitter.splitText(parsedText);
|
||||||
const title = res.data
|
const title = res.data.toString('utf8').match(/<title>(.*?)<\/title>/)?.[1];
|
||||||
.toString('utf8')
|
|
||||||
.match(/<title>(.*?)<\/title>/)?.[1];
|
|
||||||
|
|
||||||
const linkDocs = splittedText.map((text) => {
|
const linkDocs = splittedText.map((text) => {
|
||||||
return new Document({
|
return new Document({
|
||||||
|
|
Loading…
Reference in New Issue