feat(linkDocument): add error handling

This commit is contained in:
ItzCrazyKns 2024-08-29 16:51:12 +05:30
parent e8ed4df31a
commit f620252406
1 changed file with 57 additions and 41 deletions

View File

@@ -3,6 +3,7 @@ import { htmlToText } from 'html-to-text';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents'; import { Document } from '@langchain/core/documents';
import pdfParse from 'pdf-parse'; import pdfParse from 'pdf-parse';
import logger from '../utils/logger';
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => { export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
const splitter = new RecursiveCharacterTextSplitter(); const splitter = new RecursiveCharacterTextSplitter();
@@ -16,66 +17,81 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
? link ? link
: `https://${link}`; : `https://${link}`;
const res = await axios.get(link, { try {
responseType: 'arraybuffer', const res = await axios.get(link, {
}); responseType: 'arraybuffer',
});
const isPdf = res.headers['content-type'] === 'application/pdf'; const isPdf = res.headers['content-type'] === 'application/pdf';
if (isPdf) { if (isPdf) {
const pdfText = await pdfParse(res.data); const pdfText = await pdfParse(res.data);
const parsedText = pdfText.text const parsedText = pdfText.text
.replace(/(\r\n|\n|\r)/gm, ' ')
.replace(/\s+/g, ' ')
.trim();
const splittedText = await splitter.splitText(parsedText);
const title = 'PDF Document';
const linkDocs = splittedText.map((text) => {
return new Document({
pageContent: text,
metadata: {
title: title,
url: link,
},
});
});
docs.push(...linkDocs);
return;
}
const parsedText = htmlToText(res.data.toString('utf8'), {
selectors: [
{
selector: 'a',
options: {
ignoreHref: true,
},
},
],
})
.replace(/(\r\n|\n|\r)/gm, ' ') .replace(/(\r\n|\n|\r)/gm, ' ')
.replace(/\s+/g, ' ') .replace(/\s+/g, ' ')
.trim(); .trim();
const splittedText = await splitter.splitText(parsedText); const splittedText = await splitter.splitText(parsedText);
const title = 'PDF Document'; const title = res.data
.toString('utf8')
.match(/<title>(.*?)<\/title>/)?.[1];
const linkDocs = splittedText.map((text) => { const linkDocs = splittedText.map((text) => {
return new Document({ return new Document({
pageContent: text, pageContent: text,
metadata: { metadata: {
title: title, title: title || link,
url: link, url: link,
}, },
}); });
}); });
docs.push(...linkDocs); docs.push(...linkDocs);
return; } catch (err) {
} logger.error(
`Error at generating documents from links: ${err.message}`,
const parsedText = htmlToText(res.data.toString('utf8'), { );
selectors: [ docs.push(
{ new Document({
selector: 'a', pageContent: `Failed to retrieve content from the link: ${err.message}`,
options: { metadata: {
ignoreHref: true, title: 'Failed to retrieve content',
url: link,
}, },
}, }),
], );
}) }
.replace(/(\r\n|\n|\r)/gm, ' ')
.replace(/\s+/g, ' ')
.trim();
const splittedText = await splitter.splitText(parsedText);
const title = res.data
.toString('utf8')
.match(/<title>(.*?)<\/title>/)?.[1];
const linkDocs = splittedText.map((text) => {
return new Document({
pageContent: text,
metadata: {
title: title || link,
url: link,
},
});
});
docs.push(...linkDocs);
}), }),
); );