mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-01 19:12:42 +00:00
Redis langserve template (#12443)
Add Redis langserve template! Eventually will add semantic caching to this too. But I was struggling to get that to work for some reason with the LCEL implementation here. - **Description:** Introduces the Redis LangServe template. A simple RAG based app built on top of Redis that allows you to chat with company's public financial data (Edgar 10k filings) - **Issue:** None - **Dependencies:** The template contains the poetry project requirements to run this template - **Tag maintainer:** @baskaryan @Spartee - **Twitter handle:** @tchutch94 **Note**: this requires the commit here that deletes the `_aget_relevant_documents()` method from the Redis retriever class that wasn't implemented. That was breaking the langserve app. --------- Co-authored-by: Sam Partee <sam.partee@redis.com>
This commit is contained in:
49
templates/rag-redis/ingest.py
Normal file
49
templates/rag-redis/ingest.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import os
|
||||
|
||||
from langchain.document_loaders import UnstructuredFileLoader
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain.vectorstores import Redis
|
||||
from rag_redis.config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL
|
||||
|
||||
|
||||
def ingest_documents():
|
||||
"""
|
||||
Ingest PDF to Redis from the data/ directory that
|
||||
contains Edgar 10k filings data for Nike.
|
||||
"""
|
||||
# Load list of pdfs
|
||||
company_name = "Nike"
|
||||
data_path = "data/"
|
||||
doc = [
|
||||
os.path.join(data_path, file) for file in os.listdir(data_path)
|
||||
][0]
|
||||
|
||||
print("Parsing 10k filing doc for NIKE", doc)
|
||||
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=1500, chunk_overlap=100, add_start_index=True
|
||||
)
|
||||
loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
|
||||
chunks = loader.load_and_split(text_splitter)
|
||||
|
||||
print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")
|
||||
# Create vectorstore
|
||||
embedder = HuggingFaceEmbeddings(
|
||||
model_name=EMBED_MODEL
|
||||
)
|
||||
|
||||
_ = Redis.from_texts(
|
||||
# appending this little bit can sometimes help with semantic retrieval
|
||||
# especially with multiple companies
|
||||
texts=[f"Company: {company_name}. " + chunk.page_content for chunk in chunks],
|
||||
metadatas=[chunk.metadata for chunk in chunks],
|
||||
embedding=embedder,
|
||||
index_name=INDEX_NAME,
|
||||
index_schema=INDEX_SCHEMA,
|
||||
redis_url=REDIS_URL
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ingest_documents()
|
Reference in New Issue
Block a user