/*
 * src/search/metaSearchAgent.ts
 *
 * Provenance (from the patch header this file was extracted from):
 *   commit  92f66266b0a0fafd153ad653708105511c10e88d
 *   author  ItzCrazyKns <95534749+ItzCrazyKns@users.noreply.github.com>
 *   date    Fri, 29 Nov 2024 18:05:28 +0530
 *   subject feat(agents): add a unified agent
 */
import { ChatOpenAI } from '@langchain/openai';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import {
  ChatPromptTemplate,
  MessagesPlaceholder,
  PromptTemplate,
} from '@langchain/core/prompts';
import {
  RunnableLambda,
  RunnableMap,
  RunnableSequence,
} from '@langchain/core/runnables';
import { BaseMessage } from '@langchain/core/messages';
import { StringOutputParser } from '@langchain/core/output_parsers';
import LineListOutputParser from '../lib/outputParsers/listLineOutputParser';
import LineOutputParser from '../lib/outputParsers/lineOutputParser';
import { getDocumentsFromLinks } from '../utils/documents';
import { Document } from 'langchain/document';
import { searchSearxng } from '../lib/searxng';
import path from 'path';
import fs from 'fs';
import computeSimilarity from '../utils/computeSimilarity';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import { StreamEvent } from '@langchain/core/tracers/log_stream';
import { IterableReadableStream } from '@langchain/core/utils/stream';

/** Retrieval effort requested by the caller. */
type OptimizationMode = 'speed' | 'balanced' | 'quality';

/** Upper bound on the number of source documents handed to the answering prompt. */
const MAX_SOURCES = 15;

/** When web results are also present, at most this many file chunks are kept (speed path). */
const MAX_FILE_SOURCES_WITH_WEB = 8;

/** A URL group stops accepting chunks once it holds this many; later chunks open a new group. */
const MAX_CHUNKS_PER_GROUP = 10;

/** Similarity cut-off used when `Config.rerankThreshold` is nullish. */
const DEFAULT_RERANK_THRESHOLD = 0.3;

/** Public contract of a search agent: run a query and stream the result over an emitter. */
export interface MetaSearchAgentType {
  searchAndAnswer: (
    message: string,
    history: BaseMessage[],
    llm: BaseChatModel,
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
    fileIds: string[],
  ) => Promise<eventEmitter>;
}

interface Config {
  /** When false the search-retriever chain is skipped and only uploaded files are used. */
  searchWeb: boolean;
  /** When false, web documents are never reranked by embedding similarity. */
  rerank: boolean;
  /** When true the query is read from the `question` key of the query-generator output. */
  summarizer: boolean;
  /** Minimum similarity a document must exceed to be kept while reranking. */
  rerankThreshold: number;
  /** Prompt template fed to the LLM to produce the search query (and optional links). */
  queryGeneratorPrompt: string;
  /** System prompt of the final answering step. */
  responsePrompt: string;
  /** Engine list forwarded to `searchSearxng`. */
  activeEngines: string[];
}

type BasicChainInput = {
  chat_history: BaseMessage[];
  query: string;
};

/** Output of the search-retriever chain: the effective query plus the documents found. */
type SearchRetrieverResult = {
  query: string;
  docs: Document[];
};

/** One chunk of an uploaded file together with its stored embedding vector. */
type FileChunk = {
  fileName: string;
  content: string;
  embeddings: number[];
};

/**
 * Builds the summarizer prompt for one group of page text.
 *
 * NOTE(review): the `text`/`query` tags are named by the prompt itself; the
 * `example` wrapper tag was reconstructed from a source in which all
 * angle-bracket spans had been stripped - confirm against the repository.
 * Leading whitespace inside the prompt was likewise not recoverable.
 */
function buildSummarizerPrompt(question: string, text: string): string {
  return `
            You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the
            text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query.
            If the query is "summarize", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary.

            - **Journalistic tone**: The summary should sound professional and journalistic, not too casual or vague.
            - **Thorough and detailed**: Ensure that every key point from the text is captured and that the summary directly answers the query.
            - **Not too lengthy, but detailed**: The summary should be informative but not excessively long. Focus on providing detailed information in a concise format.

            The text will be shared inside the \`text\` XML tag, and the query inside the \`query\` XML tag.

            <example>
            1. \`<text>
            Docker is a set of platform-as-a-service products that use OS-level virtualization to deliver software in packages called containers.
            It was first released in 2013 and is developed by Docker, Inc. Docker is designed to make it easier to create, deploy, and run applications
            by using containers.
            </text>

            <query>
            What is Docker and how does it work?
            </query>

            Response:
            Docker is a revolutionary platform-as-a-service product developed by Docker, Inc., that uses container technology to make application
            deployment more efficient. It allows developers to package their software with all necessary dependencies, making it easier to run in
            any environment. Released in 2013, Docker has transformed the way applications are built, deployed, and managed.
            \`
            2. \`<text>
            The theory of relativity, or simply relativity, encompasses two interrelated theories of Albert Einstein: special relativity and general
            relativity. However, the word "relativity" is sometimes used in reference to Galilean invariance. The term "theory of relativity" was based
            on the expression "relative theory" used by Max Planck in 1906. The theory of relativity usually encompasses two interrelated theories by
            Albert Einstein: special relativity and general relativity. Special relativity applies to all physical phenomena in the absence of gravity.
            General relativity explains the law of gravitation and its relation to other forces of nature. It applies to the cosmological and astrophysical
            realm, including astronomy.
            </text>

            <query>
            summarize
            </query>

            Response:
            The theory of relativity, developed by Albert Einstein, encompasses two main theories: special relativity and general relativity. Special
            relativity applies to all physical phenomena in the absence of gravity, while general relativity explains the law of gravitation and its
            relation to other forces of nature. The theory of relativity is based on the concept of "relative theory," as introduced by Max Planck in
            1906. It is a fundamental theory in physics that has revolutionized our understanding of the universe.
            \`
            </example>

            Everything below is the actual data you will be working with. Good luck!

            <query>
            ${question}
            </query>

            <text>
            ${text}
            </text>

            Make sure to answer the query in the summary.
          `;
}

/**
 * Agent that optionally searches the web, optionally mixes in uploaded files,
 * reranks the collected documents and streams an LLM answer over an emitter.
 */
class MetaSearchAgent implements MetaSearchAgentType {
  private config: Config;
  private strParser = new StringOutputParser();

  constructor(config: Config) {
    this.config = config;
  }

  /**
   * Builds the chain: query-generator prompt -> LLM -> string -> document retrieval.
   *
   * Note: this sets `temperature = 0` on the caller's `llm` instance in place.
   */
  private async createSearchRetrieverChain(llm: BaseChatModel) {
    (llm as unknown as ChatOpenAI).temperature = 0;

    return RunnableSequence.from([
      PromptTemplate.fromTemplate(this.config.queryGeneratorPrompt),
      llm,
      this.strParser,
      RunnableLambda.from(
        (input: string): Promise<SearchRetrieverResult> =>
          this.retrieveDocuments(input, llm),
      ),
    ]);
  }

  /**
   * Interprets the query-generator output and fetches the matching documents.
   *
   * - `not_needed` as the question yields an empty query and no documents.
   * - If links were produced, each linked page is summarized by the LLM.
   * - Otherwise the question is sent to SearXNG.
   */
  private async retrieveDocuments(
    llmOutput: string,
    llm: BaseChatModel,
  ): Promise<SearchRetrieverResult> {
    const links = await new LineListOutputParser({ key: 'links' }).parse(
      llmOutput,
    );
    let question = this.config.summarizer
      ? await new LineOutputParser({ key: 'question' }).parse(llmOutput)
      : llmOutput;

    if (question === 'not_needed') {
      return { query: '', docs: [] };
    }

    if (links.length === 0) {
      return this.fetchSearchResults(question);
    }

    // Links without an accompanying question mean "just summarize these pages".
    if (question.length === 0) {
      question = 'summarize';
    }

    const docs = await this.summarizeLinks(links, question, llm);
    return { query: question, docs };
  }

  /** Runs the question through SearXNG and wraps every hit in a `Document`. */
  private async fetchSearchResults(
    question: string,
  ): Promise<SearchRetrieverResult> {
    const res = await searchSearxng(question, {
      language: 'en',
      engines: this.config.activeEngines,
    });

    const docs = res.results.map(
      (result) =>
        new Document({
          // presumably some hits carry no snippet; empty ones are filtered out during reranking
          pageContent: result.content ?? '',
          metadata: {
            title: result.title,
            url: result.url,
            ...(result.img_src && { img_src: result.img_src }),
          },
        }),
    );

    return { query: question, docs };
  }

  /**
   * Downloads the linked pages, groups their chunks per URL and asks the LLM
   * for one summary per group. Output order follows group order.
   */
  private async summarizeLinks(
    links: string[],
    question: string,
    llm: BaseChatModel,
  ): Promise<Document[]> {
    const linkDocs = await getDocumentsFromLinks({ links });
    const groups = this.groupChunksByUrl(linkDocs);

    return Promise.all(
      groups.map(async (group) => {
        const res = await llm.invoke(
          buildSummarizerPrompt(question, group.pageContent),
        );

        // Message content may be a list of content parts rather than a plain string.
        const summary =
          typeof res.content === 'string'
            ? res.content
            : JSON.stringify(res.content);

        return new Document({
          pageContent: summary,
          metadata: {
            title: group.metadata.title,
            url: group.metadata.url,
          },
        });
      }),
    );
  }

  /**
   * Concatenates chunks that share a URL into groups of at most
   * `MAX_CHUNKS_PER_GROUP` chunks. Each chunk is added exactly once.
   */
  private groupChunksByUrl(chunks: Document[]): Document[] {
    const groups: Document[] = [];

    for (const chunk of chunks) {
      const openGroup = groups.find(
        (group) =>
          group.metadata.url === chunk.metadata.url &&
          group.metadata.totalDocs < MAX_CHUNKS_PER_GROUP,
      );

      if (openGroup === undefined) {
        groups.push(
          new Document({
            pageContent: chunk.pageContent,
            metadata: { ...chunk.metadata, totalDocs: 1 },
          }),
        );
      } else {
        openGroup.pageContent =
          openGroup.pageContent + `\n\n` + chunk.pageContent;
        openGroup.metadata.totalDocs += 1;
      }
    }

    return groups;
  }

  /**
   * Builds the answering chain. The `context` branch runs under the name
   * `FinalSourceRetriever` and the whole chain under `FinalResponseGenerator`;
   * `handleStream` matches on both names.
   */
  private async createAnsweringChain(
    llm: BaseChatModel,
    fileIds: string[],
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
  ) {
    return RunnableSequence.from([
      RunnableMap.from({
        query: (input: BasicChainInput) => input.query,
        chat_history: (input: BasicChainInput) => input.chat_history,
        context: RunnableLambda.from(async (input: BasicChainInput) => {
          const processedHistory = formatChatHistoryAsString(
            input.chat_history,
          );

          let docs: Document[] = [];
          let query = input.query;

          if (this.config.searchWeb) {
            const searchRetrieverChain =
              await this.createSearchRetrieverChain(llm);

            const searchRetrieverResult = await searchRetrieverChain.invoke({
              chat_history: processedHistory,
              query,
            });

            query = searchRetrieverResult.query;
            docs = searchRetrieverResult.docs;
          }

          return this.rerankDocs(
            query,
            docs,
            fileIds,
            embeddings,
            optimizationMode,
          );
        })
          .withConfig({
            runName: 'FinalSourceRetriever',
          })
          // Passed unbound on purpose: processDocs does not read `this`.
          .pipe(this.processDocs),
      }),
      ChatPromptTemplate.fromMessages([
        ['system', this.config.responsePrompt],
        new MessagesPlaceholder('chat_history'),
        ['user', '{query}'],
      ]),
      llm,
      this.strParser,
    ]).withConfig({
      runName: 'FinalResponseGenerator',
    });
  }

  /**
   * Reads the extracted text and stored embeddings of one uploaded file.
   *
   * NOTE(review): `fileId` is joined into the path unchecked - confirm callers
   * only pass server-generated ids. The JSON shape is assumed, not validated.
   */
  private loadFileChunks(fileId: string): FileChunk[] {
    const filePath = path.join(process.cwd(), 'uploads', fileId);

    const content = JSON.parse(
      fs.readFileSync(filePath + '-extracted.json', 'utf8'),
    ) as { title: string; contents: string[] };
    const stored = JSON.parse(
      fs.readFileSync(filePath + '-embeddings.json', 'utf8'),
    ) as { embeddings: number[][] };

    return content.contents.map((chunk: string, i: number) => ({
      fileName: content.title,
      content: chunk,
      embeddings: stored.embeddings[i],
    }));
  }

  /** Wraps a file chunk in a `Document` whose `url` metadata is the literal `File`. */
  private toFileDocument(chunk: FileChunk): Document {
    return new Document({
      pageContent: chunk.content,
      metadata: {
        title: chunk.fileName,
        url: `File`,
      },
    });
  }

  /**
   * Keeps the candidates whose similarity to the query exceeds `threshold`,
   * best first, capped at `MAX_SOURCES`. `vectors[i]` belongs to `candidates[i]`.
   */
  private rankBySimilarity(
    queryEmbedding: number[],
    candidates: Document[],
    vectors: number[][],
    threshold: number,
  ): Document[] {
    return vectors
      .map((vector, index) => ({
        index,
        similarity: computeSimilarity(queryEmbedding, vector),
      }))
      .filter((entry) => entry.similarity > threshold)
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, MAX_SOURCES)
      .map((entry) => candidates[entry.index]);
  }

  /**
   * Selects the documents that go into the answering prompt.
   *
   * - Nothing to rank (no docs, no files): returns `docs` unchanged.
   * - Query `summarize` (case-insensitive): returns the first 15 docs untouched.
   * - `speed` mode or `config.rerank === false`: web docs keep their order;
   *   file chunks, if any, are ranked by similarity and placed first.
   * - Otherwise (`balanced` and `quality`): web docs and file chunks are ranked
   *   together by embedding similarity.
   *
   * @returns at most 15 documents; always an array.
   */
  private async rerankDocs(
    query: string,
    docs: Document[],
    fileIds: string[],
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
  ): Promise<Document[]> {
    if (docs.length === 0 && fileIds.length === 0) {
      return docs;
    }

    // Checked before touching the disk: this path never uses the file data.
    if (query.toLocaleLowerCase() === 'summarize') {
      return docs.slice(0, MAX_SOURCES);
    }

    const fileChunks = fileIds.flatMap((fileId) => this.loadFileChunks(fileId));
    const fileDocs = fileChunks.map((chunk) => this.toFileDocument(chunk));
    const fileVectors = fileChunks.map((chunk) => chunk.embeddings);

    const docsWithContent = docs.filter(
      (doc) => doc.pageContent && doc.pageContent.length > 0,
    );
    const threshold = this.config.rerankThreshold ?? DEFAULT_RERANK_THRESHOLD;

    if (optimizationMode === 'speed' || this.config.rerank === false) {
      if (fileChunks.length === 0) {
        return docsWithContent.slice(0, MAX_SOURCES);
      }

      const queryEmbedding = await embeddings.embedQuery(query);
      const rankedFileDocs = this.rankBySimilarity(
        queryEmbedding,
        fileDocs,
        fileVectors,
        threshold,
      );

      // Leave room for web results when there are any.
      const keptFileDocs =
        docsWithContent.length > 0
          ? rankedFileDocs.slice(0, MAX_FILE_SOURCES_WITH_WEB)
          : rankedFileDocs;

      return [
        ...keptFileDocs,
        ...docsWithContent.slice(0, MAX_SOURCES - keptFileDocs.length),
      ];
    }

    // 'balanced' and 'quality' both land here. Previously 'quality' matched no
    // branch and the method returned undefined, which crashed processDocs.
    const [docVectors, queryEmbedding] = await Promise.all([
      embeddings.embedDocuments(docsWithContent.map((doc) => doc.pageContent)),
      embeddings.embedQuery(query),
    ]);

    return this.rankBySimilarity(
      queryEmbedding,
      [...docsWithContent, ...fileDocs],
      [...docVectors, ...fileVectors],
      threshold,
    );
  }

  /** Renders the documents as a numbered list, one `N. content` entry per line. */
  private processDocs(docs: Document[]) {
    return docs
      .map((doc, index) => `${index + 1}. ${doc.pageContent}`)
      .join('\n');
  }

  /**
   * Forwards chain events to `emitter`:
   * - `data` with `{ type: 'sources' }` when `FinalSourceRetriever` ends,
   * - `data` with `{ type: 'response' }` for each streamed answer chunk,
   * - `end` when `FinalResponseGenerator` ends,
   * - `error` with `{ type: 'error' }` if the stream throws and a listener exists.
   */
  private async handleStream(
    stream: IterableReadableStream<StreamEvent>,
    emitter: eventEmitter,
  ) {
    try {
      for await (const event of stream) {
        if (
          event.event === 'on_chain_end' &&
          event.name === 'FinalSourceRetriever'
        ) {
          emitter.emit(
            'data',
            JSON.stringify({ type: 'sources', data: event.data.output }),
          );
        }
        if (
          event.event === 'on_chain_stream' &&
          event.name === 'FinalResponseGenerator'
        ) {
          emitter.emit(
            'data',
            JSON.stringify({ type: 'response', data: event.data.chunk }),
          );
        }
        if (
          event.event === 'on_chain_end' &&
          event.name === 'FinalResponseGenerator'
        ) {
          emitter.emit('end');
        }
      }
    } catch (err: unknown) {
      // Emitting 'error' without a listener would itself throw, so only emit
      // when someone is listening; otherwise surface the failure as before.
      if (emitter.listenerCount('error') === 0) {
        throw err;
      }
      const message = err instanceof Error ? err.message : String(err);
      emitter.emit('error', JSON.stringify({ type: 'error', data: message }));
    }
  }

  /**
   * Starts answering `message` and returns immediately with an emitter on
   * which sources, answer chunks and the end signal are published.
   *
   * @param message the user's query
   * @param history prior conversation, passed to both retrieval and answering
   * @param llm chat model used for query generation, summarizing and answering
   * @param embeddings embedding model used for reranking
   * @param optimizationMode retrieval effort; see `rerankDocs`
   * @param fileIds ids of uploaded files to include as sources
   */
  async searchAndAnswer(
    message: string,
    history: BaseMessage[],
    llm: BaseChatModel,
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
    fileIds: string[],
  ): Promise<eventEmitter> {
    const emitter = new eventEmitter();

    const answeringChain = await this.createAnsweringChain(
      llm,
      fileIds,
      embeddings,
      optimizationMode,
    );

    const stream = answeringChain.streamEvents(
      {
        chat_history: history,
        query: message,
      },
      {
        version: 'v1',
      },
    );

    // Deliberately not awaited: the caller subscribes to the returned emitter
    // while the stream is being consumed in the background.
    void this.handleStream(stream, emitter);

    return emitter;
  }
}

export default MetaSearchAgent;