Merge remote-tracking branch 'origin/master' into ollama-auth

2024-09-01 22:10:57 +02:00 · 2024-09-01 22:10:57 +02:00 · 4447cf7b5f
parent fb6ec2fc8a 92abbc5b98
commit 4447cf7b5f
11 changed files with 129 additions and 2421 deletions
--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "perplexica-backend",
-  "version": "1.9.0-rc1",
+  "version": "1.9.0-rc2",
  "license": "MIT",
  "author": "ItzCrazyKns",
  "scripts": {
--- a/searxng/settings.yml
+++ b/searxng/settings.yml
--- a/src/agents/academicSearchAgent.ts
+++ b/src/agents/academicSearchAgent.ts
@ -19,6 +19,7 @@ import formatChatHistoryAsString from '../utils/formatHistory';
 import eventEmitter from 'events';
 import computeSimilarity from '../utils/computeSimilarity';
 import logger from '../utils/logger';
+import { IterableReadableStream } from '@langchain/core/utils/stream';

 const basicAcademicSearchRetrieverPrompt = `
 You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
@ -66,7 +67,7 @@ const basicAcademicSearchResponsePrompt = `
 const strParser = new StringOutputParser();

 const handleStream = async (
-  stream: AsyncGenerator<StreamEvent, any, unknown>,
+  stream: IterableReadableStream<StreamEvent>,
  emitter: eventEmitter,
 ) => {
  for await (const event of stream) {
--- a/src/agents/redditSearchAgent.ts
+++ b/src/agents/redditSearchAgent.ts
@ -19,6 +19,7 @@ import formatChatHistoryAsString from '../utils/formatHistory';
 import eventEmitter from 'events';
 import computeSimilarity from '../utils/computeSimilarity';
 import logger from '../utils/logger';
+import { IterableReadableStream } from '@langchain/core/utils/stream';

 const basicRedditSearchRetrieverPrompt = `
 You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
@ -66,7 +67,7 @@ const basicRedditSearchResponsePrompt = `
 const strParser = new StringOutputParser();

 const handleStream = async (
-  stream: AsyncGenerator<StreamEvent, any, unknown>,
+  stream: IterableReadableStream<StreamEvent>,
  emitter: eventEmitter,
 ) => {
  for await (const event of stream) {
--- a/src/agents/webSearchAgent.ts
+++ b/src/agents/webSearchAgent.ts
@ -22,22 +22,38 @@ import logger from '../utils/logger';
 import LineListOutputParser from '../lib/outputParsers/listLineOutputParser';
 import { getDocumentsFromLinks } from '../lib/linkDocument';
 import LineOutputParser from '../lib/outputParsers/lineOutputParser';
+import { IterableReadableStream } from '@langchain/core/utils/stream';
+import { ChatOpenAI } from '@langchain/openai';

 const basicSearchRetrieverPrompt = `
-You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
-If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.
-If the question contains some links and asks to answer from those links or even if they don't you need to return the links inside 'links' XML block and the question inside 'question' XML block. If there are no links then you need to return the question without any XML block.
-If the user asks to summarrize the content from some links you need to return \`Summarize\` as the question inside the 'question' XML block and the links inside the 'links' XML block.
+You are an AI question rephraser. You will be given a conversation and a follow-up question,  you will have to rephrase the follow up question so it is a standalone question and can be used by another LLM to search the web for information to answer it.
+If it is a smple writing task or a greeting (unless the greeting contains a question after it) like Hi, Hello, How are you, etc. than a question then you need to return \`not_needed\` as the response (This is because the LLM won't need to search the web for finding information on this topic).
+If the user asks some question from some URL or wants you to summarize a PDF or a webpage (via URL) you need to return the links inside the \`links\` XML block and the question inside the \`question\` XML block. If the user wants to you to summarize the webpage or the PDF you need to return \`summarize\` inside the \`question\` XML block in place of a question and the link to summarize in the \`links\` XML block.
+You must always return the rephrased question inside the \`question\` XML block, if there are no links in the follow-up question then don't insert a \`links\` XML block in your response.

-Example:
-1. Follow up question: What is the capital of France?
-Rephrased question: \`Capital of france\`
+There are several examples attached for your reference inside the below \`examples\` XML block

-2. Follow up question: What is the population of New York City?
-Rephrased question: \`Population of New York City\`
+<examples>
+1. Follow up question: What is the capital of France
+Rephrased question:\`
+<question>
+Capital of france
+</question>
+\`
+
+2. Hi, how are you?
+Rephrased question\`
+<question>
+not_needed
+</question>
+\`

 3. Follow up question: What is Docker?
-Rephrased question: \`What is Docker\`
+Rephrased question: \`
+<question>
+What is Docker
+</question>
+\`

 4. Follow up question: Can you tell me what is X from https://example.com
 Rephrased question: \`
@ -53,16 +69,20 @@ https://example.com
 5. Follow up question: Summarize the content from https://example.com
 Rephrased question: \`
 <question>
-Summarize
+summarize
 </question>

 <links>
 https://example.com
 </links>
 \`
+</examples>

-Conversation:
+Anything below is the part of the actual conversation and you need to use conversation and the follow-up question to rephrase the follow-up question as a standalone question based on the guidelines shared above.
+
+<conversation>
 {chat_history}
+</conversation>

 Follow up question: {query}
 Rephrased question:
@ -95,7 +115,7 @@ const basicWebSearchResponsePrompt = `
 const strParser = new StringOutputParser();

 const handleStream = async (
-  stream: AsyncGenerator<StreamEvent, any, unknown>,
+  stream: IterableReadableStream<StreamEvent>,
  emitter: eventEmitter,
 ) => {
  for await (const event of stream) {
@ -132,15 +152,13 @@ type BasicChainInput = {
 };

 const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
+  (llm as unknown as ChatOpenAI).temperature = 0;
+
  return RunnableSequence.from([
    PromptTemplate.fromTemplate(basicSearchRetrieverPrompt),
    llm,
    strParser,
    RunnableLambda.from(async (input: string) => {
-      if (input === 'not_needed') {
-        return { query: '', docs: [] };
-      }
-
      const linksOutputParser = new LineListOutputParser({
        key: 'links',
      });
@ -152,9 +170,13 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
      const links = await linksOutputParser.parse(input);
      let question = await questionOutputParser.parse(input);

+      if (question === 'not_needed') {
+        return { query: '', docs: [] };
+      }
+
      if (links.length > 0) {
        if (question.length === 0) {
-          question = 'Summarize';
+          question = 'summarize';
        }

        let docs = [];
@ -226,7 +248,7 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {

        return { query: question, docs: docs };
      } else {
-        const res = await searchSearxng(input, {
+        const res = await searchSearxng(question, {
          language: 'en',
        });

@ -242,7 +264,7 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
            }),
        );

-        return { query: input, docs: documents };
+        return { query: question, docs: documents };
      }
    }),
  ]);
@ -271,7 +293,7 @@ const createBasicWebSearchAnsweringChain = (
      return docs;
    }

-    if (query === 'Summarize') {
+    if (query.toLocaleLowerCase() === 'summarize') {
      return docs;
    }

@ -294,7 +316,7 @@ const createBasicWebSearchAnsweringChain = (
    });

    const sortedDocs = similarity
-      .filter((sim) => sim.similarity > 0.5)
+      .filter((sim) => sim.similarity > 0.3)
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, 15)
      .map((sim) => docsWithContent[sim.index]);
--- a/src/agents/wolframAlphaSearchAgent.ts
+++ b/src/agents/wolframAlphaSearchAgent.ts
@ -18,6 +18,7 @@ import type { Embeddings } from '@langchain/core/embeddings';
 import formatChatHistoryAsString from '../utils/formatHistory';
 import eventEmitter from 'events';
 import logger from '../utils/logger';
+import { IterableReadableStream } from '@langchain/core/utils/stream';

 const basicWolframAlphaSearchRetrieverPrompt = `
 You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
@ -65,7 +66,7 @@ const basicWolframAlphaSearchResponsePrompt = `
 const strParser = new StringOutputParser();

 const handleStream = async (
-  stream: AsyncGenerator<StreamEvent, any, unknown>,
+  stream: IterableReadableStream<StreamEvent>,
  emitter: eventEmitter,
 ) => {
  for await (const event of stream) {
--- a/src/agents/writingAssistant.ts
+++ b/src/agents/writingAssistant.ts
@ -10,6 +10,7 @@ import eventEmitter from 'events';
 import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
 import type { Embeddings } from '@langchain/core/embeddings';
 import logger from '../utils/logger';
+import { IterableReadableStream } from '@langchain/core/utils/stream';

 const writingAssistantPrompt = `
 You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are currently set on focus mode 'Writing Assistant', this means you will be helping the user write a response to a given query. 
@ -19,7 +20,7 @@ Since you are a writing assistant, you would not perform web searches. If you th
 const strParser = new StringOutputParser();

 const handleStream = async (
-  stream: AsyncGenerator<StreamEvent, any, unknown>,
+  stream: IterableReadableStream<StreamEvent>,
  emitter: eventEmitter,
 ) => {
  for await (const event of stream) {
--- a/src/agents/youtubeSearchAgent.ts
+++ b/src/agents/youtubeSearchAgent.ts
@ -19,6 +19,7 @@ import formatChatHistoryAsString from '../utils/formatHistory';
 import eventEmitter from 'events';
 import computeSimilarity from '../utils/computeSimilarity';
 import logger from '../utils/logger';
+import { IterableReadableStream } from '@langchain/core/utils/stream';

 const basicYoutubeSearchRetrieverPrompt = `
 You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
@ -66,7 +67,7 @@ const basicYoutubeSearchResponsePrompt = `
 const strParser = new StringOutputParser();

 const handleStream = async (
-  stream: AsyncGenerator<StreamEvent, any, unknown>,
+  stream: IterableReadableStream<StreamEvent>,
  emitter: eventEmitter,
 ) => {
  for await (const event of stream) {
--- a/src/lib/linkDocument.ts
+++ b/src/lib/linkDocument.ts
@ -3,6 +3,7 @@ import { htmlToText } from 'html-to-text';
 import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { Document } from '@langchain/core/documents';
 import pdfParse from 'pdf-parse';
+import logger from '../utils/logger';

 export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  const splitter = new RecursiveCharacterTextSplitter();
@ -16,66 +17,81 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
          ? link
          : `https://${link}`;

-      const res = await axios.get(link, {
-        responseType: 'arraybuffer',
-      });
+      try {
+        const res = await axios.get(link, {
+          responseType: 'arraybuffer',
+        });

-      const isPdf = res.headers['content-type'] === 'application/pdf';
+        const isPdf = res.headers['content-type'] === 'application/pdf';

-      if (isPdf) {
-        const pdfText = await pdfParse(res.data);
-        const parsedText = pdfText.text
+        if (isPdf) {
+          const pdfText = await pdfParse(res.data);
+          const parsedText = pdfText.text
+            .replace(/(\r\n|\n|\r)/gm, ' ')
+            .replace(/\s+/g, ' ')
+            .trim();
+
+          const splittedText = await splitter.splitText(parsedText);
+          const title = 'PDF Document';
+
+          const linkDocs = splittedText.map((text) => {
+            return new Document({
+              pageContent: text,
+              metadata: {
+                title: title,
+                url: link,
+              },
+            });
+          });
+
+          docs.push(...linkDocs);
+          return;
+        }
+
+        const parsedText = htmlToText(res.data.toString('utf8'), {
+          selectors: [
+            {
+              selector: 'a',
+              options: {
+                ignoreHref: true,
+              },
+            },
+          ],
+        })
          .replace(/(\r\n|\n|\r)/gm, ' ')
          .replace(/\s+/g, ' ')
          .trim();

        const splittedText = await splitter.splitText(parsedText);
-        const title = 'PDF Document';
+        const title = res.data
+          .toString('utf8')
+          .match(/<title>(.*?)<\/title>/)?.[1];

        const linkDocs = splittedText.map((text) => {
          return new Document({
            pageContent: text,
            metadata: {
-              title: title,
+              title: title || link,
              url: link,
            },
          });
        });

        docs.push(...linkDocs);
-        return;
-      }
-
-      const parsedText = htmlToText(res.data.toString('utf8'), {
-        selectors: [
-          {
-            selector: 'a',
-            options: {
-              ignoreHref: true,
+      } catch (err) {
+        logger.error(
+          `Error at generating documents from links: ${err.message}`,
+        );
+        docs.push(
+          new Document({
+            pageContent: `Failed to retrieve content from the link: ${err.message}`,
+            metadata: {
+              title: 'Failed to retrieve content',
+              url: link,
            },
-          },
-        ],
-      })
-        .replace(/(\r\n|\n|\r)/gm, ' ')
-        .replace(/\s+/g, ' ')
-        .trim();
-
-      const splittedText = await splitter.splitText(parsedText);
-      const title = res.data
-        .toString('utf8')
-        .match(/<title>(.*?)<\/title>/)?.[1];
-
-      const linkDocs = splittedText.map((text) => {
-        return new Document({
-          pageContent: text,
-          metadata: {
-            title: title || link,
-            url: link,
-          },
-        });
-      });
-
-      docs.push(...linkDocs);
+          }),
+        );
+      }
    }),
  );

--- a/ui/components/ChatWindow.tsx
+++ b/ui/components/ChatWindow.tsx
@ -83,7 +83,9 @@ const useSocket = (
            chatModelProvider = Object.keys(chatModelProviders)[0];

            if (chatModelProvider === 'custom_openai') {
-              toast.error('Seems like you are using the custom OpenAI provider, please open the settings and configure the API key and base URL');
+              toast.error(
+                'Seems like you are using the custom OpenAI provider, please open the settings and configure the API key and base URL',
+              );
              setError(true);
              return;
            } else {
@ -220,7 +222,7 @@ const useSocket = (
          if (data.type === 'error') {
            toast.error(data.data);
          }
-        })
+        });

        setWs(ws);
      };
@ -235,13 +237,6 @@ const useSocket = (
 	setError(true);
      }
    }
-
-    return () => {
-      if (ws?.readyState === 1) {
-        ws?.close();
-        console.log('[DEBUG] closed');
-      }
-    };
  }, [ws, url, setIsWSReady, setError]);

  return ws;
@ -348,6 +343,15 @@ const ChatWindow = ({ id }: { id?: string }) => {
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, []);

+  useEffect(() => {
+    return () => {
+      if (ws?.readyState === 1) {
+        ws.close();
+        console.log('[DEBUG] closed');
+      }
+    };
+  }, []);
+
  const messagesRef = useRef<Message[]>([]);

  useEffect(() => {
--- a/ui/package.json
+++ b/ui/package.json
@ -1,6 +1,6 @@
 {
  "name": "perplexica-frontend",
-  "version": "1.9.0-rc1",
+  "version": "1.9.0-rc2",
  "license": "MIT",
  "author": "ItzCrazyKns",
  "scripts": {