mirror of https://github.com/SkalaraAI/langchain-chatbot.git
synced 2025-04-03 20:10:17 -04:00

initial commit
This commit is contained in commit 84e82bcb5d.
139 .gitignore vendored Normal file
@@ -0,0 +1,139 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# JetBrains
.idea

*.db

.DS_Store

vectorstore.pkl
langchain.readthedocs.io/
21 LICENSE Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Harrison Chase

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
8 Makefile Normal file
@@ -0,0 +1,8 @@
.PHONY: start
start:
	uvicorn main:app --reload --port 9000

.PHONY: format
format:
	black .
	isort .
41 README.md Normal file
@@ -0,0 +1,41 @@
# 🦜️🔗 ChatLangChain

This repo is an implementation of a locally hosted chatbot specifically focused on question answering over the [LangChain documentation](https://langchain.readthedocs.io/en/latest/).
Built with [LangChain](https://github.com/hwchase17/langchain/) and [FastAPI](https://fastapi.tiangolo.com/).

The app leverages LangChain's streaming support and async API to update the page in real time for multiple users.

## ✅ Running locally
1. Install dependencies: `pip install -r requirements.txt`
1. Run `ingest.sh` to ingest LangChain docs data into the vectorstore (only needs to be done once). You can use other [Document Loaders](https://langchain.readthedocs.io/en/latest/modules/document_loaders.html) to load your own data into the vectorstore.
1. Run the app: `make start`
1. To enable tracing, make sure `langchain-server` is running locally and pass `tracing=True` to `get_chain` in `main.py`. You can find more documentation [here](https://langchain.readthedocs.io/en/latest/tracing.html).
1. Open [localhost:9000](http://localhost:9000) in your browser.

## 🚀 Important Links

Deployed version (to be updated soon): [chat.langchain.dev](https://chat.langchain.dev)

Hugging Face Space (to be updated soon): [huggingface.co/spaces/hwchase17/chat-langchain](https://huggingface.co/spaces/hwchase17/chat-langchain)

Blog Posts:
* [Initial Launch](https://blog.langchain.dev/langchain-chat/)
* [Streaming Support](https://blog.langchain.dev/streaming-support-in-langchain/)

## 📚 Technical description

There are two components: ingestion and question-answering.

Ingestion has the following steps (a condensed sketch follows the list):

1. Pull HTML from the documentation site.
2. Load the HTML with LangChain's [ReadTheDocs Loader](https://langchain.readthedocs.io/en/latest/modules/document_loaders/examples/readthedocs_documentation.html).
3. Split the documents with LangChain's [TextSplitter](https://langchain.readthedocs.io/en/latest/reference/modules/text_splitter.html).
4. Create a vectorstore of embeddings, using LangChain's [vectorstore wrapper](https://python.langchain.com/en/latest/modules/indexes/vectorstores.html) (with OpenAI's embeddings and the FAISS vectorstore).
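For orientation, here is a condensed sketch of these four steps. It mirrors `ingest.py` in this commit and assumes the same langchain 0.0.x APIs plus an `OPENAI_API_KEY` in the environment:

```python
"""Condensed ingestion sketch: load mirrored docs, split, embed, persist."""
import pickle

from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

# Step 1 happens outside Python: ingest.sh wget-mirrors the docs site.
# Step 2: load the mirrored HTML files as Documents.
raw_documents = ReadTheDocsLoader("langchain.readthedocs.io/en/latest/").load()

# Step 3: split into ~1000-character chunks with overlap for retrieval.
documents = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_documents(raw_documents)

# Step 4: embed with OpenAI, index in FAISS, and persist to disk.
vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())
with open("vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)
```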
Question-Answering has the following steps, all handled by [ChatVectorDBChain](https://langchain.readthedocs.io/en/latest/modules/indexes/chain_examples/chat_vector_db.html) and sketched below:

1. Given the chat history and new user input, determine what a standalone question would be (using GPT-3).
2. Given that standalone question, look up relevant documents from the vectorstore.
3. Pass the standalone question and relevant documents to GPT-3 to generate a final answer.
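A minimal, non-streaming sketch of these three steps, assuming `ingest.py` has already produced `vectorstore.pkl`. The production path in `query_data.py` below wires in streaming callbacks instead, using `ConversationalRetrievalChain` (the successor to `ChatVectorDBChain`):

```python
"""Minimal question-answering sketch over the pickled vectorstore."""
import pickle

from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI

with open("vectorstore.pkl", "rb") as f:
    vectorstore = pickle.load(f)

# Step 1 (question condensing) and step 3 (answer generation) are handled
# internally by the chain; step 2 uses the FAISS-backed retriever.
qa_chain = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0), retriever=vectorstore.as_retriever()
)

chat_history = []
result = qa_chain({"question": "What are agents?", "chat_history": chat_history})
print(result["answer"])
```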
98 archive/app.py Normal file
@@ -0,0 +1,98 @@
import datetime
import os

import gradio as gr
import langchain
import weaviate
from chain import get_new_chain1
from langchain.vectorstores import Weaviate

WEAVIATE_URL = os.environ["WEAVIATE_URL"]


def get_weaviate_store():
    client = weaviate.Client(
        url=WEAVIATE_URL,
        additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
    )
    return Weaviate(client, "Paragraph", "content", attributes=["source"])


def set_openai_api_key(api_key, agent):
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key
        vectorstore = get_weaviate_store()
        qa_chain = get_new_chain1(vectorstore)
        os.environ["OPENAI_API_KEY"] = ""
        return qa_chain


def chat(inp, history, agent):
    history = history or []
    if agent is None:
        history.append((inp, "Please paste your OpenAI key to use"))
        return history, history
    print("\n==== date/time: " + str(datetime.datetime.now()) + " ====")
    print("inp: " + inp)
    history = history or []
    output = agent({"question": inp, "chat_history": history})
    answer = output["answer"]
    history.append((inp, answer))
    print(history)
    return history, history


block = gr.Blocks(css=".gradio-container {background-color: lightgray}")

with block:
    with gr.Row():
        gr.Markdown("<h3><center>LangChain AI</center></h3>")

        openai_api_key_textbox = gr.Textbox(
            placeholder="Paste your OpenAI API key (sk-...)",
            show_label=False,
            lines=1,
            type="password",
        )

    chatbot = gr.Chatbot()

    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="What's the answer to life, the universe, and everything?",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)

    gr.Examples(
        examples=[
            "What are agents?",
            "How do I summarize a long document?",
            "What types of memory exist?",
        ],
        inputs=message,
    )

    gr.HTML(
        """This simple application is an implementation of ChatGPT but over an external dataset (in this case, the LangChain documentation)."""
    )

    gr.HTML(
        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
    )

    state = gr.State()
    agent_state = gr.State()

    submit.click(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])
    message.submit(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])

    openai_api_key_textbox.change(
        set_openai_api_key,
        inputs=[openai_api_key_textbox, agent_state],
        outputs=[agent_state],
    )

block.launch(debug=True)
126 archive/chain.py Normal file
@@ -0,0 +1,126 @@
import json
import os
import pathlib
from typing import Dict, List, Tuple

import weaviate
from langchain import OpenAI, PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.vectorstores import FAISS, Weaviate
from pydantic import BaseModel


class CustomChain(Chain, BaseModel):
    vstore: Weaviate
    chain: BaseCombineDocumentsChain
    key_word_extractor: Chain

    @property
    def input_keys(self) -> List[str]:
        return ["question"]

    @property
    def output_keys(self) -> List[str]:
        return ["answer"]

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        question = inputs["question"]
        chat_history_str = _get_chat_history(inputs["chat_history"])
        # Only condense the question when there is prior conversation.
        if chat_history_str:
            new_question = self.key_word_extractor.run(
                question=question, chat_history=chat_history_str
            )
        else:
            new_question = question
        print(new_question)
        docs = self.vstore.similarity_search(new_question, k=4)
        new_inputs = inputs.copy()
        new_inputs["question"] = new_question
        new_inputs["chat_history"] = chat_history_str
        answer, _ = self.chain.combine_docs(docs, **new_inputs)
        return {"answer": answer}


def get_new_chain1(vectorstore) -> Chain:
    WEAVIATE_URL = os.environ["WEAVIATE_URL"]
    client = weaviate.Client(
        url=WEAVIATE_URL,
        additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
    )

    _eg_template = """## Example:

    Chat History:
    {chat_history}
    Follow Up Input: {question}
    Standalone question: {answer}"""
    _eg_prompt = PromptTemplate(
        template=_eg_template,
        input_variables=["chat_history", "question", "answer"],
    )

    _prefix = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. You should assume that the question is related to LangChain."""
    _suffix = """## Example:

    Chat History:
    {chat_history}
    Follow Up Input: {question}
    Standalone question:"""
    eg_store = Weaviate(
        client,
        "Rephrase",
        "content",
        attributes=["question", "answer", "chat_history"],
    )
    example_selector = SemanticSimilarityExampleSelector(vectorstore=eg_store, k=4)
    prompt = FewShotPromptTemplate(
        prefix=_prefix,
        suffix=_suffix,
        example_selector=example_selector,
        example_prompt=_eg_prompt,
        input_variables=["question", "chat_history"],
    )
    llm = OpenAI(temperature=0.8, model_name="gpt-3.5-turbo")
    key_word_extractor = LLMChain(llm=llm, prompt=prompt)

    EXAMPLE_PROMPT = PromptTemplate(
        template=">Example:\nContent:\n---------\n{page_content}\n----------\nSource: {source}",
        input_variables=["page_content", "source"],
    )
    template = """You are an AI assistant for the open source library LangChain. The documentation is located at https://langchain.readthedocs.io.
You are given the following extracted parts of a long document and a question. Provide a conversational answer with a hyperlink to the documentation.
You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed.
If the question includes a request for code, provide a code block directly from the documentation.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
    PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
    doc_chain = load_qa_chain(
        OpenAI(temperature=0.8, model_name="gpt-3.5-turbo", max_tokens=-1),
        chain_type="stuff",
        prompt=PROMPT,
        document_prompt=EXAMPLE_PROMPT,
    )
    return CustomChain(
        chain=doc_chain, vstore=vectorstore, key_word_extractor=key_word_extractor
    )


def _get_chat_history(chat_history: List[Tuple[str, str]]):
    buffer = ""
    for human_s, ai_s in chat_history:
        human = "Human: " + human_s
        ai = "Assistant: " + ai_s
        buffer += "\n" + "\n".join([human, ai])
    return buffer
92 archive/ingest.py Normal file
@@ -0,0 +1,92 @@
"""Load html from files, clean up, split, ingest into Weaviate."""
import os
from pathlib import Path

import weaviate
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter


def clean_data(data):
    soup = BeautifulSoup(data, "html.parser")
    text = soup.find_all("main", {"id": "main-content"})[0].get_text()
    return "\n".join([t for t in text.split("\n") if t])


docs = []
metadatas = []
for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
    if p.is_dir():
        continue
    with open(p) as f:
        docs.append(clean_data(f.read()))
        metadatas.append({"source": p})


text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

documents = text_splitter.create_documents(docs, metadatas=metadatas)


WEAVIATE_URL = os.environ["WEAVIATE_URL"]
client = weaviate.Client(
    url=WEAVIATE_URL,
    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)

client.schema.delete_class("Paragraph")
client.schema.get()
schema = {
    "classes": [
        {
            "class": "Paragraph",
            "description": "A written paragraph",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
                {
                    "dataType": ["text"],
                    "description": "The link",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "source",
                },
            ],
        },
    ]
}

client.schema.create(schema)

with client.batch as batch:
    for text in documents:
        batch.add_data_object(
            {"content": text.page_content, "source": str(text.metadata["source"])},
            "Paragraph",
        )
6 archive/ingest.sh Executable file
@@ -0,0 +1,6 @@
# Bash script to ingest data
# This involves scraping the data from the web, cleaning it up, and loading it into Weaviate.
set -eu
wget -r -A.html https://langchain.readthedocs.io/en/latest/
python3 ingest.py
python3 ingest_examples.py
219 archive/ingest_examples.py Normal file
@@ -0,0 +1,219 @@
"""Ingest examples into Weaviate."""
import os

import weaviate
from langchain.prompts.example_selector.semantic_similarity import sorted_values

WEAVIATE_URL = os.environ["WEAVIATE_URL"]
client = weaviate.Client(
    url=WEAVIATE_URL,
    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)

client.schema.delete_class("Rephrase")
client.schema.delete_class("QA")
client.schema.get()
schema = {
    "classes": [
        {
            "class": "Rephrase",
            "description": "Rephrase Examples",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
                {
                    "dataType": ["text"],
                    "description": "The question",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "question",
                },
                {
                    "dataType": ["text"],
                    "description": "The answer",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "answer",
                },
                {
                    "dataType": ["text"],
                    "description": "The chat history",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "chat_history",
                },
            ],
        },
    ]
}

client.schema.create(schema)

documents = [
    {
        "question": "how do i load those?",
        "chat_history": "Human: What types of memory exist?\nAssistant: \n\nThere are a few different types of memory: Buffer, Summary, and Conversational Memory.",
        "answer": "How do I load Buffer, Summary, and Conversational Memory",
    },
    {
        "question": "how do i install this package?",
        "chat_history": "",
        "answer": "How do I install langchain?",
    },
    {
        "question": "how do I set serpapi_api_key?",
        "chat_history": "Human: can you write me a code snippet for that?\nAssistant: \n\nYes, you can create an Agent with a custom LLMChain in LangChain. Here is a [link](https://langchain.readthedocs.io/en/latest/modules/agents/examples/custom_agent.html) to the documentation that provides a code snippet for creating a custom Agent.",
        "answer": "How do I set the serpapi_api_key?",
    },
    {
        "question": "What are some methods for data augmented generation?",
        "chat_history": "Human: List all methods of an Agent class please\nAssistant: \n\nTo answer your question, you can find a list of all the methods of the Agent class in the [API reference documentation](https://langchain.readthedocs.io/en/latest/modules/agents/reference.html).",
        "answer": "What are some methods for data augmented generation?",
    },
    {
        "question": "can you write me a code snippet for that?",
        "chat_history": "Human: how do I create an agent with custom LLMChain?\nAssistant: \n\nTo create an Agent with a custom LLMChain in LangChain, you can use the [Custom Agent example](https://langchain.readthedocs.io/en/latest/modules/agents/examples/custom_agent.html). This example shows how to create a custom LLMChain and use an existing Agent class to parse the output. For more information on Agents and Tools, check out the [Key Concepts](https://langchain.readthedocs.io/en/latest/modules/agents/key_concepts.html) documentation.",
        "answer": "Can you provide a code snippet for creating an Agent with a custom LLMChain?",
    },
]

for d in documents:
    d["content"] = " ".join(sorted_values(d))
with client.batch as batch:
    for text in documents:
        batch.add_data_object(
            text,
            "Rephrase",
        )

client.schema.get()
schema = {
    "classes": [
        {
            "class": "QA",
            "description": "QA Examples",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
                {
                    "dataType": ["text"],
                    "description": "The question",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "question",
                },
                {
                    "dataType": ["text"],
                    "description": "The answer",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "answer",
                },
                {
                    "dataType": ["text"],
                    "description": "The summaries",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "summaries",
                },
                {
                    "dataType": ["text"],
                    "description": "The sources",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "sources",
                },
            ],
        },
    ]
}

client.schema.create(schema)

documents = [
    {
        "question": "how do i install langchain?",
        "answer": "```pip install langchain```",
        "summaries": ">Example:\nContent:\n---------\nYou can pip install langchain package by running 'pip install langchain'\n----------\nSource: foo.html",
        "sources": "foo.html",
    },
    {
        "question": "how do i import an openai LLM?",
        "answer": "```from langchain.llm import OpenAI```",
        "summaries": ">Example:\nContent:\n---------\nyou can import the open ai wrapper (OpenAI) from the langchain.llm module\n----------\nSource: bar.html",
        "sources": "bar.html",
    },
]

for d in documents:
    d["content"] = " ".join(sorted_values(d))
with client.batch as batch:
    for text in documents:
        batch.add_data_object(
            text,
            "QA",
        )
9 archive/requirements.txt Normal file
@@ -0,0 +1,9 @@
langchain==0.0.64
beautifulsoup4
weaviate-client
openai
black
isort
Flask
transformers
gradio
BIN assets/images/Chat_Your_Data.gif Normal file
Binary file not shown. (Added; size: 274 KiB)
33 callback.py Normal file
@@ -0,0 +1,33 @@
"""Callback handlers used in the app."""
from typing import Any, Dict, List

from langchain.callbacks.base import AsyncCallbackHandler

from schemas import ChatResponse


class StreamingLLMCallbackHandler(AsyncCallbackHandler):
    """Callback handler for streaming LLM responses."""

    def __init__(self, websocket):
        self.websocket = websocket

    async def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        resp = ChatResponse(sender="bot", message=token, type="stream")
        await self.websocket.send_json(resp.dict())


class QuestionGenCallbackHandler(AsyncCallbackHandler):
    """Callback handler for question generation."""

    def __init__(self, websocket):
        self.websocket = websocket

    async def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Run when LLM starts running."""
        resp = ChatResponse(
            sender="bot", message="Synthesizing question...", type="info"
        )
        await self.websocket.send_json(resp.dict())
28 ingest.py Normal file
@@ -0,0 +1,28 @@
"""Load html from files, clean up, split, ingest into a FAISS vectorstore."""
import pickle

from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS


def ingest_docs():
    """Get documents from web pages."""
    loader = ReadTheDocsLoader("langchain.readthedocs.io/en/latest/")
    raw_documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(raw_documents)
    # Read OPENAI_API_KEY from the environment rather than hardcoding a secret.
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Save vectorstore
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(vectorstore, f)


if __name__ == "__main__":
    ingest_docs()
6 ingest.sh Executable file
@@ -0,0 +1,6 @@
# Bash script to ingest data
# This involves scraping the data from the web, cleaning it up, and ingesting it into the vectorstore.
# Error if any command fails
set -e
wget -r -A.html https://langchain.readthedocs.io/en/latest/
python3 ingest.py
81 main.py Normal file
@@ -0,0 +1,81 @@
"""Main entrypoint for the app."""
import logging
import pickle
from pathlib import Path
from typing import Optional

from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
from fastapi.templating import Jinja2Templates
from langchain.vectorstores import VectorStore

from callback import QuestionGenCallbackHandler, StreamingLLMCallbackHandler
from query_data import get_chain
from schemas import ChatResponse

app = FastAPI()
templates = Jinja2Templates(directory="templates")
vectorstore: Optional[VectorStore] = None
# OPENAI_API_KEY must be set in the environment before starting the app;
# never hardcode the secret here.


@app.on_event("startup")
async def startup_event():
    logging.info("loading vectorstore")
    if not Path("vectorstore.pkl").exists():
        raise ValueError("vectorstore.pkl does not exist, please run ingest.py first")
    with open("vectorstore.pkl", "rb") as f:
        global vectorstore
        vectorstore = pickle.load(f)


@app.get("/")
async def get(request: Request):
    return templates.TemplateResponse("index.html", {"request": request})


@app.websocket("/chat")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    question_handler = QuestionGenCallbackHandler(websocket)
    stream_handler = StreamingLLMCallbackHandler(websocket)
    chat_history = []
    qa_chain = get_chain(vectorstore, question_handler, stream_handler)
    # Use the below line instead of the above line to enable tracing
    # Ensure `langchain-server` is running
    # qa_chain = get_chain(vectorstore, question_handler, stream_handler, tracing=True)

    while True:
        try:
            # Receive and send back the client message
            question = await websocket.receive_text()
            resp = ChatResponse(sender="you", message=question, type="stream")
            await websocket.send_json(resp.dict())

            # Construct a response
            start_resp = ChatResponse(sender="bot", message="", type="start")
            await websocket.send_json(start_resp.dict())

            result = await qa_chain.acall(
                {"question": question, "chat_history": chat_history}
            )
            chat_history.append((question, result["answer"]))

            end_resp = ChatResponse(sender="bot", message="", type="end")
            await websocket.send_json(end_resp.dict())
        except WebSocketDisconnect:
            logging.info("websocket disconnect")
            break
        except Exception as e:
            logging.error(e)
            resp = ChatResponse(
                sender="bot",
                message="Sorry, something went wrong. Try again.",
                type="error",
            )
            await websocket.send_json(resp.dict())


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=9000)
54 query_data.py Normal file
@@ -0,0 +1,54 @@
"""Create a ConversationalRetrievalChain for question/answering."""
from langchain.callbacks.manager import AsyncCallbackManager
from langchain.callbacks.tracers import LangChainTracer
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.chat_vector_db.prompts import (CONDENSE_QUESTION_PROMPT,
                                                     QA_PROMPT)
from langchain.chains.llm import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.vectorstores.base import VectorStore


def get_chain(
    vectorstore: VectorStore, question_handler, stream_handler, tracing: bool = False
) -> ConversationalRetrievalChain:
    """Create a ConversationalRetrievalChain for question/answering."""
    # Construct a ConversationalRetrievalChain with a streaming llm for combine docs
    manager = AsyncCallbackManager([])
    question_manager = AsyncCallbackManager([question_handler])
    stream_manager = AsyncCallbackManager([stream_handler])
    if tracing:
        tracer = LangChainTracer()
        tracer.load_default_session()
        manager.add_handler(tracer)
        question_manager.add_handler(tracer)
        stream_manager.add_handler(tracer)

    question_gen_llm = OpenAI(
        temperature=0.8,
        verbose=True,
        callback_manager=question_manager,
    )
    streaming_llm = OpenAI(
        streaming=True,
        callback_manager=stream_manager,
        verbose=True,
        temperature=0.8,
    )

    question_generator = LLMChain(
        llm=question_gen_llm, prompt=CONDENSE_QUESTION_PROMPT, callback_manager=manager
    )
    doc_chain = load_qa_chain(
        streaming_llm, chain_type="stuff", prompt=QA_PROMPT, callback_manager=manager
    )

    qa = ConversationalRetrievalChain(
        retriever=vectorstore.as_retriever(),
        combine_docs_chain=doc_chain,
        question_generator=question_generator,
        callback_manager=manager,
        verbose=True,
    )
    return qa
13 requirements.txt Normal file
@@ -0,0 +1,13 @@
openai
fastapi
black
isort
websockets
pydantic
langchain
uvicorn
jinja2
faiss-cpu
bs4
unstructured
libmagic
22 schemas.py Normal file
@@ -0,0 +1,22 @@
"""Schemas for the chat app."""
from pydantic import BaseModel, validator


class ChatResponse(BaseModel):
    """Chat response schema."""

    sender: str
    message: str
    type: str

    @validator("sender")
    def sender_must_be_bot_or_you(cls, v):
        if v not in ["bot", "you"]:
            raise ValueError("sender must be bot or you")
        return v

    @validator("type")
    def validate_message_type(cls, v):
        if v not in ["start", "stream", "end", "error", "info"]:
            raise ValueError("type must be start, stream, end, error, or info")
        return v
141 templates/index.html Normal file
@@ -0,0 +1,141 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Chat Your Data</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet">
    <style>
        .chat-body {
            width: 550px;
            margin: 50px auto;
        }
        .card-body {
            background-color: #333;
            color: #fff;
            border-radius: 10px;
        }
        .server-message {
            background-color: #444;
            padding: 10px;
            margin: 10px;
            border-radius: 10px;
        }
        .client-message {
            background-color: #555;
            padding: 10px;
            margin: 10px;
            border-radius: 10px;
        }
        .form-inline {
            display: flex;
            justify-content: space-between;
        }
        .form-control {
            width: 80%;
            background-color: #333;
            color: #fff;
            border: none;
            border-radius: 5px;
            padding: 10px;
            margin-right: 10px;
        }
        #send {
            background-color: #4C4CFF;
            color: #fff;
            border: none;
            border-radius: 5px;
            padding: 10px 20px;
        }
        .form-message {
            margin-top: 10px;
        }
    </style>
    <script>
        var endpoint = "ws://localhost:9000/chat";
        var ws = new WebSocket(endpoint);
        // Receive message from server word by word. Display the words as they are received.
        ws.onmessage = function (event) {
            var messages = document.getElementById('messages');
            var data = JSON.parse(event.data);
            if (data.sender === "bot") {
                if (data.type === "start") {
                    var header = document.getElementById('header');
                    header.innerHTML = "Computing answer...";
                    var div = document.createElement('div');
                    div.className = 'server-message';
                    var p = document.createElement('p');
                    p.innerHTML = "<strong>" + "Chatbot: " + "</strong>";
                    div.appendChild(p);
                    messages.appendChild(div);
                } else if (data.type === "stream") {
                    var header = document.getElementById('header');
                    header.innerHTML = "Chatbot is typing...";
                    var p = messages.lastChild.lastChild;
                    if (data.message === "\n") {
                        p.innerHTML += "<br>";
                    } else {
                        p.innerHTML += data.message;
                    }
                } else if (data.type === "info") {
                    var header = document.getElementById('header');
                    header.innerHTML = data.message;
                } else if (data.type === "end") {
                    var header = document.getElementById('header');
                    header.innerHTML = "Ask a question";
                    var button = document.getElementById('send');
                    button.innerHTML = "Send";
                    button.disabled = false;
                } else if (data.type === "error") {
                    var header = document.getElementById('header');
                    header.innerHTML = "Ask a question";
                    var button = document.getElementById('send');
                    button.innerHTML = "Send";
                    button.disabled = false;
                    var p = messages.lastChild.lastChild;
                    p.innerHTML += data.message;
                }
            } else {
                var div = document.createElement('div');
                div.className = 'client-message';
                var p = document.createElement('p');
                p.innerHTML = "<strong>" + "You: " + "</strong>";
                p.innerHTML += data.message;
                div.appendChild(p);
                messages.appendChild(div);
            }
            // Scroll to the bottom of the chat
            messages.scrollTop = messages.scrollHeight;
        };
        // Send message to server
        function sendMessage(event) {
            event.preventDefault();
            var message = document.getElementById('messageText').value;
            if (message === "") {
                return;
            }
            ws.send(message);
            document.getElementById('messageText').value = "";

            // Turn the button into a loading button
            var button = document.getElementById('send');
            button.innerHTML = "Loading...";
            button.disabled = true;
        }
    </script>
</head>
<body class="bg-black">
    <div class="chat-body card">
        <div class="card-body p-5">
            <h4 class="card-title text-center text-xl font-medium"> Chat Your Data </h4>
            <p class="card-text text-center text-sm" id="header"> Ask a question </p>
            <hr class="border-gray-500 mb-5" style="margin-top: 20px;">
            <div id="messages" class="overflow-auto" style="max-height: 500px;">
            </div>
            <form action="" class="form-inline mt-5" id="chat-form" onsubmit="sendMessage(event)">
                <input type="text" class="form-control" placeholder="Write your question" id="messageText">
                <button id="send" type="submit" class="btn btn-primary">Send</button>
            </form>
        </div>
    </div>
</body>
</html>