feat(linkDocument): add error handling

2024-08-29 16:51:12 +05:30 · 2024-08-29 16:51:12 +05:30 · f620252406
parent e8ed4df31a
commit f620252406
1 changed file with 57 additions and 41 deletions
--- a/src/lib/linkDocument.ts
+++ b/src/lib/linkDocument.ts
@@ -3,6 +3,7 @@ import { htmlToText } from 'html-to-text';
 import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { Document } from '@langchain/core/documents';
 import pdfParse from 'pdf-parse';
 import logger from '../utils/logger';
 export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  const splitter = new RecursiveCharacterTextSplitter();
@@ -16,66 +17,81 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
          ? link
          : `https://${link}`;
-      const res = await axios.get(link, {
+      try {
-        responseType: 'arraybuffer',
+        const res = await axios.get(link, {
-      });
+          responseType: 'arraybuffer',
        });
-      const isPdf = res.headers['content-type'] === 'application/pdf';
+        const isPdf = res.headers['content-type'] === 'application/pdf';
-      if (isPdf) {
+        if (isPdf) {
-        const pdfText = await pdfParse(res.data);
+          const pdfText = await pdfParse(res.data);
-        const parsedText = pdfText.text
+          const parsedText = pdfText.text
            .replace(/(\r\n|\n|\r)/gm, ' ')
            .replace(/\s+/g, ' ')
            .trim();
          const splittedText = await splitter.splitText(parsedText);
          const title = 'PDF Document';
          const linkDocs = splittedText.map((text) => {
            return new Document({
              pageContent: text,
              metadata: {
                title: title,
                url: link,
              },
            });
          });
          docs.push(...linkDocs);
          return;
        }
        const parsedText = htmlToText(res.data.toString('utf8'), {
          selectors: [
            {
              selector: 'a',
              options: {
                ignoreHref: true,
              },
            },
          ],
        })
          .replace(/(\r\n|\n|\r)/gm, ' ')
          .replace(/\s+/g, ' ')
          .trim();
        const splittedText = await splitter.splitText(parsedText);
-        const title = 'PDF Document';
+        const title = res.data
          .toString('utf8')
          .match(/<title>(.*?)<\/title>/)?.[1];
        const linkDocs = splittedText.map((text) => {
          return new Document({
            pageContent: text,
            metadata: {
-              title: title,
+              title: title || link,
              url: link,
            },
          });
        });
        docs.push(...linkDocs);
-        return;
+      } catch (err) {
-      }
+        logger.error(
-
+          `Error at generating documents from links: ${err.message}`,
-      const parsedText = htmlToText(res.data.toString('utf8'), {
+        );
-        selectors: [
+        docs.push(
-          {
+          new Document({
-            selector: 'a',
+            pageContent: `Failed to retrieve content from the link: ${err.message}`,
-            options: {
+            metadata: {
-              ignoreHref: true,
+              title: 'Failed to retrieve content',
              url: link,
            },
-          },
+          }),
-        ],
+        );
-      })
+      }
        .replace(/(\r\n|\n|\r)/gm, ' ')
        .replace(/\s+/g, ' ')
        .trim();
      const splittedText = await splitter.splitText(parsedText);
      const title = res.data
        .toString('utf8')
        .match(/<title>(.*?)<\/title>/)?.[1];
      const linkDocs = splittedText.map((text) => {
        return new Document({
          pageContent: text,
          metadata: {
            title: title || link,
            url: link,
          },
        });
      });
      docs.push(...linkDocs);
    }),
  );