commit 84e82bcb5d385be5c242fe7420b103d0b8eb6b1f Author: Christopher Arraya Date: Wed Jun 14 00:12:43 2023 -0400 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..74ef9d7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,139 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# JetBrains +.idea + +*.db + +.DS_Store + +vectorstore.pkl +langchain.readthedocs.io/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..aef13d9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Harrison Chase + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..62c80d6 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +.PHONY: start +start: + uvicorn main:app --reload --port 9000 + +.PHONY: format +format: + black . + isort . \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fd2de7e --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +# 🦜️🔗 ChatLangChain + +This repo is an implementation of a locally hosted chatbot focused on question answering over the [LangChain documentation](https://langchain.readthedocs.io/en/latest/). +Built with [LangChain](https://github.com/hwchase17/langchain/) and [FastAPI](https://fastapi.tiangolo.com/). + +The app leverages LangChain's streaming support and async API to update the page in real time for multiple users. + +## ✅ Running locally +1. Install dependencies: `pip install -r requirements.txt` +1. Run `ingest.sh` to ingest the LangChain docs into the vectorstore (only needs to be done once). + 1. You can use other [Document Loaders](https://langchain.readthedocs.io/en/latest/modules/document_loaders.html) to load your own data into the vectorstore. +1. Run the app: `make start` + 1. To enable tracing, make sure `langchain-server` is running locally and pass `tracing=True` to `get_chain` in `main.py`. You can find more documentation [here](https://langchain.readthedocs.io/en/latest/tracing.html). +1. Open [localhost:9000](http://localhost:9000) in your browser. + +## 🚀 Important Links + +Deployed version (to be updated soon): [chat.langchain.dev](https://chat.langchain.dev) + +Hugging Face Space (to be updated soon): [huggingface.co/spaces/hwchase17/chat-langchain](https://huggingface.co/spaces/hwchase17/chat-langchain) + +Blog Posts: +* [Initial Launch](https://blog.langchain.dev/langchain-chat/) +* [Streaming Support](https://blog.langchain.dev/streaming-support-in-langchain/) + +## 📚 Technical description + +There are two components: ingestion and question-answering. + +Ingestion has the following steps: + +1. Pull the HTML from the documentation site. +2. Load the HTML with LangChain's [ReadTheDocs Loader](https://langchain.readthedocs.io/en/latest/modules/document_loaders/examples/readthedocs_documentation.html). +3. Split the documents with LangChain's [TextSplitter](https://langchain.readthedocs.io/en/latest/reference/modules/text_splitter.html). +4. Create a vectorstore of embeddings using LangChain's [vectorstore wrapper](https://python.langchain.com/en/latest/modules/indexes/vectorstores.html) (OpenAI embeddings and the FAISS vectorstore). + +Question-Answering has the following steps, all handled by [ConversationalRetrievalChain](https://langchain.readthedocs.io/en/latest/modules/indexes/chain_examples/chat_vector_db.html) (see `query_data.py`): + +1. Given the chat history and new user input, determine what a standalone question would be (using GPT-3). +2. Given that standalone question, look up relevant documents from the vectorstore. +3. Pass the standalone question and relevant documents to GPT-3 to generate a final answer.
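As a rough illustration, the sketch below condenses the two components into one script using the same calls that `ingest.py` and `query_data.py` in this repo make (ReadTheDocsLoader, RecursiveCharacterTextSplitter, OpenAI embeddings, FAISS, and `ConversationalRetrievalChain`). It is only a sketch: it assumes `OPENAI_API_KEY` is set in the environment, that the docs have already been downloaded by `ingest.sh`, and the sample question is a placeholder.

```python
"""Minimal sketch of the ingestion + question-answering flow."""
import pickle

from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Ingestion: load the scraped HTML, split it into chunks, embed, and persist.
raw_documents = ReadTheDocsLoader("langchain.readthedocs.io/en/latest/").load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = splitter.split_documents(raw_documents)
vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())
with open("vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)

# Question-answering: condense the follow-up into a standalone question,
# retrieve relevant chunks, and generate the final answer.
qa = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0), retriever=vectorstore.as_retriever()
)
result = qa({"question": "What is an agent?", "chat_history": []})
print(result["answer"])
```

The web app in `main.py` does the same thing, but with streaming callbacks sent over a WebSocket instead of a single blocking call.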
diff --git a/archive/app.py b/archive/app.py new file mode 100644 index 0000000..d1e3b9c --- /dev/null +++ b/archive/app.py @@ -0,0 +1,98 @@ +import datetime +import os + +import gradio as gr +import langchain +import weaviate +from chain import get_new_chain1 +from langchain.vectorstores import Weaviate + +WEAVIATE_URL = os.environ["WEAVIATE_URL"] + + +def get_weaviate_store(): + client = weaviate.Client( + url=WEAVIATE_URL, + additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}, + ) + return Weaviate(client, "Paragraph", "content", attributes=["source"]) + + +def set_openai_api_key(api_key, agent): + if api_key: + os.environ["OPENAI_API_KEY"] = api_key + vectorstore = get_weaviate_store() + qa_chain = get_new_chain1(vectorstore) + os.environ["OPENAI_API_KEY"] = "" + return qa_chain + + +def chat(inp, history, agent): + history = history or [] + if agent is None: + history.append((inp, "Please paste your OpenAI key to use")) + return history, history + print("\n==== date/time: " + str(datetime.datetime.now()) + " ====") + print("inp: " + inp) + history = history or [] + output = agent({"question": inp, "chat_history": history}) + answer = output["answer"] + history.append((inp, answer)) + print(history) + return history, history + + +block = gr.Blocks(css=".gradio-container {background-color: lightgray}") + +with block: + with gr.Row(): + gr.Markdown("

<h3><center>LangChain AI</center></h3>

") + + openai_api_key_textbox = gr.Textbox( + placeholder="Paste your OpenAI API key (sk-...)", + show_label=False, + lines=1, + type="password", + ) + + chatbot = gr.Chatbot() + + with gr.Row(): + message = gr.Textbox( + label="What's your question?", + placeholder="What's the answer to life, the universe, and everything?", + lines=1, + ) + submit = gr.Button(value="Send", variant="secondary").style(full_width=False) + + gr.Examples( + examples=[ + "What are agents?", + "How do I summarize a long document?", + "What types of memory exist?", + ], + inputs=message, + ) + + gr.HTML( + """ + This simple application is an implementation of ChatGPT but over an external dataset (in this case, the LangChain documentation).""" + ) + + gr.HTML( + "
<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>
" + ) + + state = gr.State() + agent_state = gr.State() + + submit.click(chat, inputs=[message, state, agent_state], outputs=[chatbot, state]) + message.submit(chat, inputs=[message, state, agent_state], outputs=[chatbot, state]) + + openai_api_key_textbox.change( + set_openai_api_key, + inputs=[openai_api_key_textbox, agent_state], + outputs=[agent_state], + ) + +block.launch(debug=True) diff --git a/archive/chain.py b/archive/chain.py new file mode 100644 index 0000000..0d03b28 --- /dev/null +++ b/archive/chain.py @@ -0,0 +1,126 @@ +import json +import os +import pathlib +from typing import Dict, List, Tuple + +import weaviate +from langchain import OpenAI, PromptTemplate +from langchain.chains import LLMChain +from langchain.chains.base import Chain +from langchain.chains.combine_documents.base import BaseCombineDocumentsChain +from langchain.chains.conversation.memory import ConversationBufferMemory +from langchain.chains.question_answering import load_qa_chain +from langchain.embeddings import OpenAIEmbeddings +from langchain.prompts import FewShotPromptTemplate, PromptTemplate +from langchain.prompts.example_selector import \ + SemanticSimilarityExampleSelector +from langchain.vectorstores import FAISS, Weaviate +from pydantic import BaseModel + + +class CustomChain(Chain, BaseModel): + vstore: Weaviate + chain: BaseCombineDocumentsChain + key_word_extractor: Chain + + @property + def input_keys(self) -> List[str]: + return ["question"] + + @property + def output_keys(self) -> List[str]: + return ["answer"] + + def _call(self, inputs: Dict[str, str]) -> Dict[str, str]: + question = inputs["question"] + chat_history_str = _get_chat_history(inputs["chat_history"]) + if chat_history_str: + new_question = self.key_word_extractor.run( + question=question, chat_history=chat_history_str + ) + else: + new_question = question + print(new_question) + docs = self.vstore.similarity_search(new_question, k=4) + new_inputs = inputs.copy() + new_inputs["question"] = new_question + new_inputs["chat_history"] = chat_history_str + answer, _ = self.chain.combine_docs(docs, **new_inputs) + return {"answer": answer} + + +def get_new_chain1(vectorstore) -> Chain: + WEAVIATE_URL = os.environ["WEAVIATE_URL"] + client = weaviate.Client( + url=WEAVIATE_URL, + additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}, + ) + + _eg_template = """## Example: + + Chat History: + {chat_history} + Follow Up Input: {question} + Standalone question: {answer}""" + _eg_prompt = PromptTemplate( + template=_eg_template, + input_variables=["chat_history", "question", "answer"], + ) + + _prefix = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. 
You should assume that the question is related to LangChain.""" + _suffix = """## Example: + + Chat History: + {chat_history} + Follow Up Input: {question} + Standalone question:""" + eg_store = Weaviate( + client, + "Rephrase", + "content", + attributes=["question", "answer", "chat_history"], + ) + example_selector = SemanticSimilarityExampleSelector(vectorstore=eg_store, k=4) + prompt = FewShotPromptTemplate( + prefix=_prefix, + suffix=_suffix, + example_selector=example_selector, + example_prompt=_eg_prompt, + input_variables=["question", "chat_history"], + ) + llm = OpenAI(temperature=0.8, model_name="gpt-3.5-turbo") + key_word_extractor = LLMChain(llm=llm, prompt=prompt) + + EXAMPLE_PROMPT = PromptTemplate( + template=">Example:\nContent:\n---------\n{page_content}\n----------\nSource: {source}", + input_variables=["page_content", "source"], + ) + template = """You are an AI assistant for the open source library LangChain. The documentation is located at https://langchain.readthedocs.io. +You are given the following extracted parts of a long document and a question. Provide a conversational answer with a hyperlink to the documentation. +You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed. +If the question includes a request for code, provide a code block directly from the documentation. +If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer. +Question: {question} +========= +{context} +========= +Answer in Markdown:""" + PROMPT = PromptTemplate(template=template, input_variables=["question", "context"]) + doc_chain = load_qa_chain( + OpenAI(temperature=0.8, model_name="gpt-3.5-turbo", max_tokens=-1), + chain_type="stuff", + prompt=PROMPT, + document_prompt=EXAMPLE_PROMPT, + ) + return CustomChain( + chain=doc_chain, vstore=vectorstore, key_word_extractor=key_word_extractor + ) + + +def _get_chat_history(chat_history: List[Tuple[str, str]]): + buffer = "" + for human_s, ai_s in chat_history: + human = f"Human: " + human_s + ai = f"Assistant: " + ai_s + buffer += "\n" + "\n".join([human, ai]) + return buffer diff --git a/archive/ingest.py b/archive/ingest.py new file mode 100644 index 0000000..c3e86cb --- /dev/null +++ b/archive/ingest.py @@ -0,0 +1,92 @@ +"""Load html from files, clean up, split, ingest into Weaviate.""" +import os +from pathlib import Path + +import weaviate +from bs4 import BeautifulSoup +from langchain.text_splitter import CharacterTextSplitter + + +def clean_data(data): + soup = BeautifulSoup(data) + text = soup.find_all("main", {"id": "main-content"})[0].get_text() + return "\n".join([t for t in text.split("\n") if t]) + + +docs = [] +metadatas = [] +for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"): + if p.is_dir(): + continue + with open(p) as f: + docs.append(clean_data(f.read())) + metadatas.append({"source": p}) + + +text_splitter = CharacterTextSplitter( + separator="\n", + chunk_size=1000, + chunk_overlap=200, + length_function=len, +) + +documents = text_splitter.create_documents(docs, metadatas=metadatas) + + +WEAVIATE_URL = os.environ["WEAVIATE_URL"] +client = weaviate.Client( + url=WEAVIATE_URL, + additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}, +) + +client.schema.delete_class("Paragraph") +client.schema.get() +schema = { + "classes": [ + { + "class": "Paragraph", + "description": "A written paragraph", + "vectorizer": "text2vec-openai", + "moduleConfig": { + "text2vec-openai": { + "model": "ada", + 
"modelVersion": "002", + "type": "text", + } + }, + "properties": [ + { + "dataType": ["text"], + "description": "The content of the paragraph", + "moduleConfig": { + "text2vec-openai": { + "skip": False, + "vectorizePropertyName": False, + } + }, + "name": "content", + }, + { + "dataType": ["text"], + "description": "The link", + "moduleConfig": { + "text2vec-openai": { + "skip": True, + "vectorizePropertyName": False, + } + }, + "name": "source", + }, + ], + }, + ] +} + +client.schema.create(schema) + +with client.batch as batch: + for text in documents: + batch.add_data_object( + {"content": text.page_content, "source": str(text.metadata["source"])}, + "Paragraph", + ) diff --git a/archive/ingest.sh b/archive/ingest.sh new file mode 100755 index 0000000..aa5c68d --- /dev/null +++ b/archive/ingest.sh @@ -0,0 +1,6 @@ +# Bash script to ingest data +# This involves scraping the data from the web and then cleaning up and putting in Weaviate. +!set -eu +wget -r -A.html https://langchain.readthedocs.io/en/latest/ +python3 ingest.py +python3 ingest_examples.py diff --git a/archive/ingest_examples.py b/archive/ingest_examples.py new file mode 100644 index 0000000..d2a1e7a --- /dev/null +++ b/archive/ingest_examples.py @@ -0,0 +1,219 @@ +"""Ingest examples into Weaviate.""" +import os +from pathlib import Path + +import weaviate + +WEAVIATE_URL = os.environ["WEAVIATE_URL"] +client = weaviate.Client( + url=WEAVIATE_URL, + additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}, +) + +client.schema.delete_class("Rephrase") +client.schema.delete_class("QA") +client.schema.get() +schema = { + "classes": [ + { + "class": "Rephrase", + "description": "Rephrase Examples", + "vectorizer": "text2vec-openai", + "moduleConfig": { + "text2vec-openai": { + "model": "ada", + "modelVersion": "002", + "type": "text", + } + }, + "properties": [ + { + "dataType": ["text"], + "moduleConfig": { + "text2vec-openai": { + "skip": False, + "vectorizePropertyName": False, + } + }, + "name": "content", + }, + { + "dataType": ["text"], + "description": "The link", + "moduleConfig": { + "text2vec-openai": { + "skip": True, + "vectorizePropertyName": False, + } + }, + "name": "question", + }, + { + "dataType": ["text"], + "description": "The link", + "moduleConfig": { + "text2vec-openai": { + "skip": True, + "vectorizePropertyName": False, + } + }, + "name": "answer", + }, + { + "dataType": ["text"], + "description": "The link", + "moduleConfig": { + "text2vec-openai": { + "skip": True, + "vectorizePropertyName": False, + } + }, + "name": "chat_history", + }, + ], + }, + ] +} + +client.schema.create(schema) + +documents = [ + { + "question": "how do i load those?", + "chat_history": "Human: What types of memory exist?\nAssistant: \n\nThere are a few different types of memory: Buffer, Summary, and Conversational Memory.", + "answer": "How do I load Buffer, Summary, and Conversational Memory", + }, + { + "question": "how do i install this package?", + "chat_history": "", + "answer": "How do I install langchain?", + }, + { + "question": "how do I set serpapi_api_key?", + "chat_history": "Human: can you write me a code snippet for that?\nAssistant: \n\nYes, you can create an Agent with a custom LLMChain in LangChain. 
Here is a [link](https://langchain.readthedocs.io/en/latest/modules/agents/examples/custom_agent.html) to the documentation that provides a code snippet for creating a custom Agent.", + "answer": "How do I set the serpapi_api_key?", + }, + { + "question": "What are some methods for data augmented generation?", + "chat_history": "Human: List all methods of an Agent class please\nAssistant: \n\nTo answer your question, you can find a list of all the methods of the Agent class in the [API reference documentation](https://langchain.readthedocs.io/en/latest/modules/agents/reference.html).", + "answer": "What are some methods for data augmented generation?", + }, + { + "question": "can you write me a code snippet for that?", + "chat_history": "Human: how do I create an agent with custom LLMChain?\nAssistant: \n\nTo create an Agent with a custom LLMChain in LangChain, you can use the [Custom Agent example](https://langchain.readthedocs.io/en/latest/modules/agents/examples/custom_agent.html). This example shows how to create a custom LLMChain and use an existing Agent class to parse the output. For more information on Agents and Tools, check out the [Key Concepts](https://langchain.readthedocs.io/en/latest/modules/agents/key_concepts.html) documentation.", + "answer": "Can you provide a code snippet for creating an Agent with a custom LLMChain?", + }, +] +from langchain.prompts.example_selector.semantic_similarity import \ + sorted_values + +for d in documents: + d["content"] = " ".join(sorted_values(d)) +with client.batch as batch: + for text in documents: + batch.add_data_object( + text, + "Rephrase", + ) + +client.schema.get() +schema = { + "classes": [ + { + "class": "QA", + "description": "Rephrase Examples", + "vectorizer": "text2vec-openai", + "moduleConfig": { + "text2vec-openai": { + "model": "ada", + "modelVersion": "002", + "type": "text", + } + }, + "properties": [ + { + "dataType": ["text"], + "moduleConfig": { + "text2vec-openai": { + "skip": False, + "vectorizePropertyName": False, + } + }, + "name": "content", + }, + { + "dataType": ["text"], + "description": "The link", + "moduleConfig": { + "text2vec-openai": { + "skip": True, + "vectorizePropertyName": False, + } + }, + "name": "question", + }, + { + "dataType": ["text"], + "description": "The link", + "moduleConfig": { + "text2vec-openai": { + "skip": True, + "vectorizePropertyName": False, + } + }, + "name": "answer", + }, + { + "dataType": ["text"], + "description": "The link", + "moduleConfig": { + "text2vec-openai": { + "skip": True, + "vectorizePropertyName": False, + } + }, + "name": "summaries", + }, + { + "dataType": ["text"], + "description": "The link", + "moduleConfig": { + "text2vec-openai": { + "skip": True, + "vectorizePropertyName": False, + } + }, + "name": "sources", + }, + ], + }, + ] +} + +client.schema.create(schema) + +documents = [ + { + "question": "how do i install langchain?", + "answer": "```pip install langchain```", + "summaries": ">Example:\nContent:\n---------\nYou can pip install langchain package by running 'pip install langchain'\n----------\nSource: foo.html", + "sources": "foo.html", + }, + { + "question": "how do i import an openai LLM?", + "answer": "```from langchain.llm import OpenAI```", + "summaries": ">Example:\nContent:\n---------\nyou can import the open ai wrapper (OpenAI) from the langchain.llm module\n----------\nSource: bar.html", + "sources": "bar.html", + }, +] +from langchain.prompts.example_selector.semantic_similarity import \ + sorted_values + +for d in documents: + 
d["content"] = " ".join(sorted_values(d)) +with client.batch as batch: + for text in documents: + batch.add_data_object( + text, + "QA", + ) diff --git a/archive/requirements.txt b/archive/requirements.txt new file mode 100644 index 0000000..d0c82e2 --- /dev/null +++ b/archive/requirements.txt @@ -0,0 +1,9 @@ +langchain==0.0.64 +beautifulsoup4 +weaviate-client +openai +black +isort +Flask +transformers +gradio diff --git a/assets/images/Chat_Your_Data.gif b/assets/images/Chat_Your_Data.gif new file mode 100644 index 0000000..14b85ac Binary files /dev/null and b/assets/images/Chat_Your_Data.gif differ diff --git a/callback.py b/callback.py new file mode 100644 index 0000000..d55ad49 --- /dev/null +++ b/callback.py @@ -0,0 +1,33 @@ +"""Callback handlers used in the app.""" +from typing import Any, Dict, List + +from langchain.callbacks.base import AsyncCallbackHandler + +from schemas import ChatResponse + + +class StreamingLLMCallbackHandler(AsyncCallbackHandler): + """Callback handler for streaming LLM responses.""" + + def __init__(self, websocket): + self.websocket = websocket + + async def on_llm_new_token(self, token: str, **kwargs: Any) -> None: + resp = ChatResponse(sender="bot", message=token, type="stream") + await self.websocket.send_json(resp.dict()) + + +class QuestionGenCallbackHandler(AsyncCallbackHandler): + """Callback handler for question generation.""" + + def __init__(self, websocket): + self.websocket = websocket + + async def on_llm_start( + self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any + ) -> None: + """Run when LLM starts running.""" + resp = ChatResponse( + sender="bot", message="Synthesizing question...", type="info" + ) + await self.websocket.send_json(resp.dict()) diff --git a/ingest.py b/ingest.py new file mode 100644 index 0000000..a45d41f --- /dev/null +++ b/ingest.py @@ -0,0 +1,28 @@ +"""Load html from files, clean up, split, ingest into Weaviate.""" +import pickle + +from langchain.document_loaders import ReadTheDocsLoader +from langchain.embeddings import OpenAIEmbeddings +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.vectorstores.faiss import FAISS + + +def ingest_docs(): + """Get documents from web pages.""" + loader = ReadTheDocsLoader("langchain.readthedocs.io/en/latest/") + raw_documents = loader.load() + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=1000, + chunk_overlap=200, + ) + documents = text_splitter.split_documents(raw_documents) + embeddings = OpenAIEmbeddings(openai_api_key="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") + vectorstore = FAISS.from_documents(documents, embeddings) + + # Save vectorstore + with open("vectorstore.pkl", "wb") as f: + pickle.dump(vectorstore, f) + + +if __name__ == "__main__": + ingest_docs() diff --git a/ingest.sh b/ingest.sh new file mode 100755 index 0000000..73b75a8 --- /dev/null +++ b/ingest.sh @@ -0,0 +1,6 @@ +# Bash script to ingest data +# This involves scraping the data from the web and then cleaning up and putting in Weaviate. 
+# Error if any command fails +set -e +wget -r -A.html https://langchain.readthedocs.io/en/latest/ +python3 ingest.py diff --git a/main.py b/main.py new file mode 100644 index 0000000..63b7157 --- /dev/null +++ b/main.py @@ -0,0 +1,81 @@ +"""Main entrypoint for the app.""" +import logging +import pickle +from pathlib import Path +from typing import Optional + +from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect +from fastapi.templating import Jinja2Templates +from langchain.vectorstores import VectorStore + +from callback import QuestionGenCallbackHandler, StreamingLLMCallbackHandler +from query_data import get_chain +from schemas import ChatResponse + +app = FastAPI() +templates = Jinja2Templates(directory="templates") +vectorstore: Optional[VectorStore] = None +# OPENAI_API_KEY is read from the environment; do not hardcode a real key here. + +@app.on_event("startup") +async def startup_event(): + logging.info("loading vectorstore") + if not Path("vectorstore.pkl").exists(): + raise ValueError("vectorstore.pkl does not exist, please run ingest.py first") + with open("vectorstore.pkl", "rb") as f: + global vectorstore + vectorstore = pickle.load(f) + + +@app.get("/") +async def get(request: Request): + return templates.TemplateResponse("index.html", {"request": request}) + + +@app.websocket("/chat") +async def websocket_endpoint(websocket: WebSocket): + await websocket.accept() + question_handler = QuestionGenCallbackHandler(websocket) + stream_handler = StreamingLLMCallbackHandler(websocket) + chat_history = [] + qa_chain = get_chain(vectorstore, question_handler, stream_handler) + # Use the below line instead of the above line to enable tracing + # Ensure `langchain-server` is running + # qa_chain = get_chain(vectorstore, question_handler, stream_handler, tracing=True) + + while True: + try: + # Receive and send back the client message + question = await websocket.receive_text() + resp = ChatResponse(sender="you", message=question, type="stream") + await websocket.send_json(resp.dict()) + + # Construct a response + start_resp = ChatResponse(sender="bot", message="", type="start") + await websocket.send_json(start_resp.dict()) + + result = await qa_chain.acall( + {"question": question, "chat_history": chat_history} + ) + chat_history.append((question, result["answer"])) + + end_resp = ChatResponse(sender="bot", message="", type="end") + await websocket.send_json(end_resp.dict()) + except WebSocketDisconnect: + logging.info("websocket disconnect") + break + except Exception as e: + logging.error(e) + resp = ChatResponse( + sender="bot", + message="Sorry, something went wrong. 
Try again.", + type="error", + ) + await websocket.send_json(resp.dict()) + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=9000) diff --git a/query_data.py b/query_data.py new file mode 100644 index 0000000..492f1e8 --- /dev/null +++ b/query_data.py @@ -0,0 +1,54 @@ +"""Create a ConversationalRetrievalChain for question/answering.""" +from langchain.callbacks.manager import AsyncCallbackManager +from langchain.callbacks.tracers import LangChainTracer +from langchain.chains import ConversationalRetrievalChain +from langchain.chains.chat_vector_db.prompts import (CONDENSE_QUESTION_PROMPT, + QA_PROMPT) +from langchain.chains.llm import LLMChain +from langchain.chains.question_answering import load_qa_chain +from langchain.llms import OpenAI +from langchain.vectorstores.base import VectorStore + + +def get_chain( + vectorstore: VectorStore, question_handler, stream_handler, tracing: bool = False
) -> ConversationalRetrievalChain: + """Create a ConversationalRetrievalChain for question/answering.""" + # Construct a ConversationalRetrievalChain with a streaming llm for combine docs + manager = AsyncCallbackManager([]) + question_manager = AsyncCallbackManager([question_handler]) + stream_manager = AsyncCallbackManager([stream_handler]) + if tracing: + tracer = LangChainTracer() + tracer.load_default_session() + manager.add_handler(tracer) + question_manager.add_handler(tracer) + stream_manager.add_handler(tracer) + + question_gen_llm = OpenAI( + temperature=0.8, + verbose=True, + callback_manager=question_manager, + ) + streaming_llm = OpenAI( + streaming=True, + callback_manager=stream_manager, + verbose=True, + temperature=0.8, + ) + + question_generator = LLMChain( + llm=question_gen_llm, prompt=CONDENSE_QUESTION_PROMPT, callback_manager=manager + ) + doc_chain = load_qa_chain( + streaming_llm, chain_type="stuff", prompt=QA_PROMPT, callback_manager=manager + ) + + qa = ConversationalRetrievalChain( + retriever=vectorstore.as_retriever(), + combine_docs_chain=doc_chain, + question_generator=question_generator, + callback_manager=manager, + verbose=True + ) + return qa diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1b7831d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +openai +fastapi +black +isort +websockets +pydantic +langchain +uvicorn +jinja2 +faiss-cpu +bs4 +unstructured +libmagic diff --git a/schemas.py b/schemas.py new file mode 100644 index 0000000..f4d83a0 --- /dev/null +++ b/schemas.py @@ -0,0 +1,22 @@ +"""Schemas for the chat app.""" +from pydantic import BaseModel, validator + + +class ChatResponse(BaseModel): + """Chat response schema.""" + + sender: str + message: str + type: str + + @validator("sender") + def sender_must_be_bot_or_you(cls, v): + if v not in ["bot", "you"]: + raise ValueError("sender must be bot or you") + return v + + @validator("type") + def validate_message_type(cls, v): + if v not in ["start", "stream", "end", "error", "info"]: + raise ValueError("type must be start, stream, end, error, or info") + return v diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..c123510 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,141 @@
+
+

+<!-- templates/index.html: "Chat Your Data" chat page (141-line HTML UI; markup not reproduced here) -->
\ No newline at end of file