mirror of
https://github.com/SkalaraAI/langchain-chatbot.git
synced 2025-04-09 23:10:16 -04:00
93 lines
2.5 KiB
Python
93 lines
2.5 KiB
Python
"""Load html from files, clean up, split, ingest into Weaviate."""
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import weaviate
|
|
from bs4 import BeautifulSoup
|
|
from langchain.text_splitter import CharacterTextSplitter
|
|
|
|
|
|
def clean_data(data):
    """Extract the readable text of one documentation page.

    Parses ``data`` as HTML, takes the first ``<main id="main-content">``
    element and returns its text with empty lines removed.

    Args:
        data: HTML markup of a single page.

    Returns:
        The text of the main-content element; its non-empty lines joined
        with newline characters.

    Raises:
        IndexError: If the page has no ``<main id="main-content">`` element.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits GuessedAtParserWarning, so the
    # extracted text could differ from one environment to another.
    soup = BeautifulSoup(data, "html.parser")
    main = soup.find("main", {"id": "main-content"})
    if main is None:
        # Same exception type the former ``find_all(...)[0]`` lookup raised,
        # but with a message that says what is actually missing.
        raise IndexError('no <main id="main-content"> element found in page')
    return "\n".join(line for line in main.get_text().split("\n") if line)
|
|
|
|
|
|
# Walk the local mirror of the documentation and collect the cleaned text of
# every file together with the path it was read from. The two lists stay
# index-aligned: metadatas[i] describes docs[i].
docs = []
metadatas = []
for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
    if p.is_dir():
        continue
    # Pin the encoding: a bare open() decodes with the locale's default,
    # which is not UTF-8 on every platform and can corrupt or reject
    # non-ASCII characters. Assumes the mirrored pages are UTF-8 -- confirm.
    with open(p, encoding="utf-8") as f:
        docs.append(clean_data(f.read()))
    # Stored as a Path; it is converted with str() when the objects are
    # uploaded.
    metadatas.append({"source": p})
|
|
|
|
|
|
# Splitter settings: newline separator, chunk_size 1000, chunk_overlap 200,
# with lengths measured by the built-in len().
_splitter_options = {
    "separator": "\n",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**_splitter_options)

# Split every cleaned page, passing the per-page metadata alongside it.
documents = text_splitter.create_documents(docs, metadatas=metadatas)
|
|
|
|
|
|
# Both settings are read from the environment; a missing variable raises
# KeyError right here.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
_openai_headers = {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}

# The OpenAI key is sent as an extra header on the Weaviate connection.
client = weaviate.Client(url=WEAVIATE_URL, additional_headers=_openai_headers)
|
|
|
|
def _text_property(name, description, skip):
    """Build one ``text`` property entry for the "Paragraph" class.

    ``skip`` is passed through as the property's text2vec-openai ``skip``
    setting; ``vectorizePropertyName`` is always ``False``.
    """
    return {
        "dataType": ["text"],
        "description": description,
        "moduleConfig": {
            "text2vec-openai": {
                "skip": skip,
                "vectorizePropertyName": False,
            }
        },
        "name": name,
    }


# Remove the "Paragraph" class before it is created again below.
client.schema.delete_class("Paragraph")
# NOTE(review): the returned schema is discarded; this call looks like a
# leftover sanity check -- confirm before removing it.
client.schema.get()

# A single "Paragraph" class using the text2vec-openai vectorizer, with a
# "content" property (skip=False) and a "source" property (skip=True).
schema = {
    "classes": [
        {
            "class": "Paragraph",
            "description": "A written paragraph",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                _text_property("content", "The content of the paragraph", False),
                _text_property("source", "The link", True),
            ],
        },
    ]
}

client.schema.create(schema)
|
|
|
|
# Queue every chunk as a "Paragraph" object through the client's batch
# context manager.
with client.batch as batch:
    for document in documents:
        properties = {
            "content": document.page_content,
            # The metadata value is a path object; store it as plain text.
            "source": str(document.metadata["source"]),
        }
        batch.add_data_object(properties, "Paragraph")
|