Mirror of https://github.com/hwchase17/langchain.git (synced 2026-02-13 06:16:26 +00:00)

Compare commits: 19 commits, `langchain-…` ... `cc/oai_str`
| SHA1 |
|---|
| 3c8328e9b7 |
| ea0d921e72 |
| d18e1c67ff |
| c318ab6152 |
| d9c51b71c4 |
| 2921597c71 |
| a49448a7c9 |
| 0d226de25c |
| 55677e31f7 |
| 187131c55c |
| 3d7ae8b5d2 |
| b9db8e9921 |
| 1f78d4faf4 |
| 6a152ce245 |
| 20a715a103 |
| c8d6f9d52b |
| acddfc772e |
| 3e618b16cd |
| 18eb9c249d |
.github/scripts/check_diff.py (vendored), 3 lines changed

@@ -30,6 +30,9 @@ IGNORED_PARTNERS = [
    # specifically in huggingface jobs
    # https://github.com/langchain-ai/langchain/issues/25558
    "huggingface",
    # prompty exhibiting issues with numpy for Python 3.13
    # https://github.com/langchain-ai/langchain/actions/runs/12651104685/job/35251034969?pr=29065
    "prompty",
]

PY_312_MAX_PACKAGES = [
@@ -61,7 +61,7 @@
    " * document addition by id (`add_documents` method with `ids` argument)\n",
    " * delete by id (`delete` method with `ids` argument)\n",
    "\n",
    "Compatible Vectorstores: `Aerospike`, `AnalyticDB`, `AstraDB`, `AwaDB`, `AzureCosmosDBNoSqlVectorSearch`, `AzureCosmosDBVectorSearch`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MongoDBAtlasVectorSearch`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SingleStoreDB`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `Yellowbrick`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n",
    "Compatible Vectorstores: `Aerospike`, `AnalyticDB`, `AstraDB`, `AwaDB`, `AzureCosmosDBNoSqlVectorSearch`, `AzureCosmosDBVectorSearch`, `AzureSearch`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MongoDBAtlasVectorSearch`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SingleStoreDB`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `Yellowbrick`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n",
    " \n",
    "## Caution\n",
    "\n",
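The indexing hunk above leans on two vector-store methods: `add_documents(..., ids=...)` and `delete(ids=...)`. A minimal sketch of that contract is below; the choice of `Chroma` from the compatible list and the fake embedding model are assumptions made purely for illustration, not part of the diff.

```python
from langchain_chroma import Chroma  # any store in the compatible list exposes the same contract
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding

store = Chroma(collection_name="demo", embedding_function=DeterministicFakeEmbedding(size=16))

docs = [Document(page_content="hello world", metadata={"source": "a.txt"})]
store.add_documents(docs, ids=["doc-1"])  # document addition by id
store.delete(ids=["doc-1"])               # delete by id
```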
@@ -45,7 +45,7 @@ The below document loaders allow you to load documents from your favorite cloud

## Social Platforms

The below document loaders allow you to load documents from differnt social media platforms.
The below document loaders allow you to load documents from different social media platforms.

<CategoryTable category="social_loaders"/>
docs/docs/integrations/document_loaders/pull_md.ipynb (new file, 140 lines)

@@ -0,0 +1,140 @@
{
 "cells": [
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "---\n",
    "sidebar_label: PullMdLoader\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PullMdLoader\n",
    "\n",
    "Loader for converting URLs into Markdown using the pull.md service.\n",
    "\n",
    "This package implements a [document loader](/docs/concepts/document_loaders/) for web content. Unlike traditional web scrapers, PullMdLoader can handle web pages built with dynamic JavaScript frameworks like React, Angular, or Vue.js, converting them into Markdown without local rendering.\n",
    "\n",
    "## Overview\n",
    "### Integration details\n",
    "\n",
    "| Class | Package | Local | Serializable | JS Support |\n",
    "| :--- | :--- | :---: | :---: | :---: |\n",
    "| PullMdLoader | langchain-pull-md | ✅ | ✅ | ❌ |\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup\n",
    "\n",
    "### Installation\n",
    "\n",
    "```bash\n",
    "pip install langchain-pull-md\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Initialization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_pull_md.markdown_loader import PullMdLoader\n",
    "\n",
    "# Instantiate the loader with a URL\n",
    "loader = PullMdLoader(url=\"https://example.com\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'source': 'https://example.com',\n",
       " 'page_content': '# Example Domain\\nThis domain is used for illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.'}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents[0].metadata"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lazy Load\n",
    "\n",
    "No lazy loading is implemented. `PullMdLoader` performs a real-time conversion of the provided URL into Markdown format whenever the `load` method is called."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## API reference:\n",
    "\n",
    "- [GitHub](https://github.com/chigwell/langchain-pull-md)\n",
    "- [PyPi](https://pypi.org/project/langchain-pull-md/)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
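Since the notebook above notes that `PullMdLoader` has no lazy loading, a minimal sketch of streaming results across many URLs anyway is shown below; the helper name and the one-loader-per-URL pattern are assumptions, not part of the package.

```python
from typing import Iterator

from langchain_core.documents import Document
from langchain_pull_md.markdown_loader import PullMdLoader


def iter_markdown_documents(urls: list[str]) -> Iterator[Document]:
    """Yield documents one URL at a time; each load() call triggers one pull.md conversion."""
    for url in urls:
        loader = PullMdLoader(url=url)
        yield from loader.load()
```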
docs/docs/integrations/providers/pull-md.mdx (new file, 42 lines)

@@ -0,0 +1,42 @@
# PullMd Loader

>[PullMd](https://pull.md/) is a service that converts web pages into Markdown format. The `langchain-pull-md` package utilizes this service to convert URLs, especially those rendered with JavaScript frameworks like React, Angular, or Vue.js, into Markdown without the need for local rendering.

## Installation and Setup

To get started with `langchain-pull-md`, you need to install the package via pip:

```bash
pip install langchain-pull-md
```

See the [usage example](/docs/integrations/document_loaders/pull_md) for detailed integration and usage instructions.

## Document Loader

The `PullMdLoader` class in `langchain-pull-md` provides an easy way to convert URLs to Markdown. It's particularly useful for loading content from modern web applications for use within LangChain's processing capabilities.

```python
from langchain_pull_md import PullMdLoader

# Initialize the loader with a URL of a JavaScript-rendered webpage
loader = PullMdLoader(url='https://example.com')

# Load the content as a Document
documents = loader.load()

# Access the Markdown content
for document in documents:
    print(document.page_content)
```

This loader supports any URL and is particularly adept at handling sites built with dynamic JavaScript, making it a versatile tool for markdown extraction in data processing workflows.

## API Reference

For a comprehensive guide to all available functions and their parameters, visit the [API reference](https://github.com/chigwell/langchain-pull-md).

## Additional Resources

- [GitHub Repository](https://github.com/chigwell/langchain-pull-md)
- [PyPi Package](https://pypi.org/project/langchain-pull-md/)
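Because the loader returns Markdown, a natural follow-up is header-based chunking. A small sketch under the assumption that the output is fed to `MarkdownHeaderTextSplitter`; this pairing is an illustration, not something the provider page prescribes.

```python
from langchain_pull_md import PullMdLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter

loader = PullMdLoader(url="https://example.com")
documents = loader.load()

# Split the converted Markdown on H1/H2 headers into smaller documents
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "h1"), ("##", "h2")])
chunks = splitter.split_text(documents[0].page_content)
```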
@@ -70,7 +70,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 2,
    "id": "dfa92a08",
    "metadata": {},
    "outputs": [],
@@ -91,12 +91,12 @@
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 2,
    "id": "c795913e",
    "metadata": {},
    "outputs": [],
    "source": [
     "embedding_model = VertexAIEmbeddings(model_name=\"textembedding-gecko@003\")"
     "embedding_model = VertexAIEmbeddings(model_name=\"text-embedding-005\")"
    ]
   },
   {
@@ -722,7 +722,139 @@
    "cell_type": "markdown",
    "id": "31222b03",
    "metadata": {},
    "source": []
    "source": [
     "## Hybrid Search"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "b8a308f2",
    "metadata": {},
    "source": [
     "Vector Search supports hybrid search, a popular architecture pattern in information retrieval (IR) that combines both semantic search and keyword search (also called token-based search). With hybrid search, developers can take advantage of the best of the two approaches, effectively providing higher search quality.\n",
     "Click [here](https://cloud.google.com/vertex-ai/docs/vector-search/about-hybrid-search) to learn more.\n",
     "\n",
     "In order to use hybrid search, we need to fit a sparse embedding vectorizer and handle the embeddings outside of the Vector Search integration.\n",
     "An example of sparse embedding vectorizer is sklearn TfidfVectorizer but other techniques can be used, for instance BM25."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 16,
    "id": "e319402d",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Define some sample data\n",
     "texts = [\n",
     "    \"The cat sat on\",\n",
     "    \"the mat.\",\n",
     "    \"I like to\",\n",
     "    \"eat pizza for\",\n",
     "    \"dinner.\",\n",
     "    \"The sun sets\",\n",
     "    \"in the west.\",\n",
     "]\n",
     "\n",
     "# optional IDs\n",
     "ids = [\"i_\" + str(i + 1) for i in range(len(texts))]\n",
     "\n",
     "# optional metadata\n",
     "metadatas = [{\"my_metadata\": i} for i in range(len(texts))]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "14efefc1",
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "\n",
     "# Fit the TFIDF Vectorizer (This is usually done on a very large corpus of data to make sure that word statistics generalize well on new data)\n",
     "vectorizer = TfidfVectorizer()\n",
     "vectorizer.fit(texts)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "2c7206c2",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Utility function to transform text into a TF-IDF Sparse Vector\n",
     "def get_sparse_embedding(tfidf_vectorizer, text):\n",
     "    tfidf_vector = tfidf_vectorizer.transform([text])\n",
     "    values = []\n",
     "    dims = []\n",
     "    for i, tfidf_value in enumerate(tfidf_vector.data):\n",
     "        values.append(float(tfidf_value))\n",
     "        dims.append(int(tfidf_vector.indices[i]))\n",
     "    return {\"values\": values, \"dimensions\": dims}"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 19,
    "id": "0dc5b782",
    "metadata": {},
    "outputs": [],
    "source": [
     "# semantic (dense) embeddings\n",
     "embeddings = embedding_model.embed_documents(texts)\n",
     "# tfidf (sparse) embeddings\n",
     "sparse_embeddings = [get_sparse_embedding(vectorizer, x) for x in texts]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "3a353679",
    "metadata": {},
    "outputs": [],
    "source": [
     "sparse_embeddings[0]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "2623cad9",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Add the dense and sparse embeddings in Vector Search\n",
     "\n",
     "vector_store.add_texts_with_embeddings(\n",
     "    texts=texts,\n",
     "    embeddings=embeddings,\n",
     "    sparse_embeddings=sparse_embeddings,\n",
     "    ids=ids,\n",
     "    metadatas=metadatas,\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "29885e38",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Run hybrid search\n",
     "query = \"the cat\"\n",
     "embedding = embedding_model.embed_query(query)\n",
     "sparse_embedding = get_sparse_embedding(vectorizer, query)\n",
     "\n",
     "vector_store.similarity_search_by_vector_with_score(\n",
     "    embedding=embedding,\n",
     "    sparse_embedding=sparse_embedding,\n",
     "    k=5,\n",
     "    rrf_ranking_alpha=0.7,  # 0.7 weight to dense and 0.3 weight to sparse\n",
     ")"
    ]
   }
  ],
  "metadata": {
@@ -733,7 +865,7 @@
   "uri": "gcr.io/deeplearning-platform-release/base-cpu:m107"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "display_name": "langchain-google-community-3Os9yvMd-py3.10",
   "language": "python",
   "name": "python3"
  },
@@ -747,7 +879,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
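The hybrid-search hunk above notes that any sparse vectorizer can stand in for TF-IDF, citing BM25. A self-contained sketch of that alternative follows; the whitespace tokenizer, parameter values, and vocabulary-to-dimension mapping are assumptions, and only the output format (a dict with "values" and "dimensions") is dictated by the notebook's `get_sparse_embedding` convention.

```python
import math
from collections import Counter


def fit_bm25_embedder(corpus: list[str], k1: float = 1.5, b: float = 0.75):
    """Return a text -> {"values": [...], "dimensions": [...]} function using BM25 term weights."""
    tokenized = [doc.lower().split() for doc in corpus]
    vocab = {tok: dim for dim, tok in enumerate(sorted({t for d in tokenized for t in d}))}
    avgdl = sum(len(d) for d in tokenized) / len(tokenized)
    df = Counter(tok for d in tokenized for tok in set(d))
    idf = {t: math.log(1 + (len(tokenized) - df[t] + 0.5) / (df[t] + 0.5)) for t in vocab}

    def embed(text: str) -> dict:
        tokens = text.lower().split()
        counts = Counter(tokens)
        values, dims = [], []
        for tok, freq in counts.items():
            if tok not in vocab:
                continue  # out-of-vocabulary tokens have no dimension to map to
            norm = freq + k1 * (1 - b + b * len(tokens) / avgdl)
            values.append(float(idf[tok] * freq * (k1 + 1) / norm))
            dims.append(vocab[tok])
        return {"values": values, "dimensions": dims}

    return embed


# Usage mirrors the TF-IDF cells above:
# embed = fit_bm25_embedder(texts)
# sparse_embeddings = [embed(t) for t in texts]
# sparse_embedding = embed("the cat")
```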
@@ -87,6 +87,7 @@ if TYPE_CHECKING:
|
||||
from langchain_community.document_loaders.blob_loaders import (
|
||||
Blob,
|
||||
BlobLoader,
|
||||
CloudBlobLoader,
|
||||
FileSystemBlobLoader,
|
||||
YoutubeAudioLoader,
|
||||
)
|
||||
@@ -574,6 +575,7 @@ _module_lookup = {
|
||||
"CSVLoader": "langchain_community.document_loaders.csv_loader",
|
||||
"CassandraLoader": "langchain_community.document_loaders.cassandra",
|
||||
"ChatGPTLoader": "langchain_community.document_loaders.chatgpt",
|
||||
"CloudBlobLoader": "langchain_community.document_loaders.blob_loaders",
|
||||
"CoNLLULoader": "langchain_community.document_loaders.conllu",
|
||||
"CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential", # noqa: E501
|
||||
"ConcurrentLoader": "langchain_community.document_loaders.concurrent",
|
||||
@@ -781,6 +783,7 @@ __all__ = [
|
||||
"CSVLoader",
|
||||
"CassandraLoader",
|
||||
"ChatGPTLoader",
|
||||
"CloudBlobLoader",
|
||||
"CoNLLULoader",
|
||||
"CollegeConfidentialLoader",
|
||||
"ConcurrentLoader",
|
||||
|
||||
@@ -6,7 +6,6 @@ import warnings
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
Mapping,
|
||||
@@ -23,15 +22,13 @@ from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import fitz.fitz
|
||||
import pdfminer.layout
|
||||
import pdfplumber.page
|
||||
import pypdf._page
|
||||
import pypdfium2._helpers.page
|
||||
from pypdf import PageObject
|
||||
import fitz
|
||||
import pdfminer
|
||||
import pdfplumber
|
||||
import pypdf
|
||||
import pypdfium2
|
||||
from textractor.data.text_linearization_config import TextLinearizationConfig
|
||||
|
||||
|
||||
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
|
||||
_PDF_FILTER_WITHOUT_LOSS = [
|
||||
"LZWDecode",
|
||||
@@ -90,7 +87,7 @@ class PyPDFParser(BaseBlobParser):
|
||||
extract_images: bool = False,
|
||||
*,
|
||||
extraction_mode: str = "plain",
|
||||
extraction_kwargs: Optional[Dict[str, Any]] = None,
|
||||
extraction_kwargs: Optional[dict[str, Any]] = None,
|
||||
):
|
||||
self.password = password
|
||||
self.extract_images = extract_images
|
||||
@@ -107,7 +104,7 @@ class PyPDFParser(BaseBlobParser):
|
||||
"`pip install pypdf`"
|
||||
)
|
||||
|
||||
def _extract_text_from_page(page: "PageObject") -> str:
|
||||
def _extract_text_from_page(page: pypdf.PageObject) -> str:
|
||||
"""
|
||||
Extract text from image given the version of pypdf.
|
||||
"""
|
||||
@@ -126,12 +123,13 @@ class PyPDFParser(BaseBlobParser):
|
||||
Document(
|
||||
page_content=_extract_text_from_page(page=page)
|
||||
+ self._extract_images_from_page(page),
|
||||
metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined]
|
||||
metadata={"source": blob.source, "page": page_number},
|
||||
# type: ignore[attr-defined]
|
||||
)
|
||||
for page_number, page in enumerate(pdf_reader.pages)
|
||||
]
|
||||
|
||||
def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
|
||||
def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
|
||||
"""Extract images from page and get the text with RapidOCR."""
|
||||
if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined]
|
||||
return ""
|
||||
@@ -307,9 +305,7 @@ class PyMuPDFParser(BaseBlobParser):
|
||||
for page in doc
|
||||
]
|
||||
|
||||
def _get_page_content(
|
||||
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
|
||||
) -> str:
|
||||
def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str:
|
||||
"""
|
||||
Get the text of the page using PyMuPDF and RapidOCR and issue a warning
|
||||
if it is empty.
|
||||
@@ -327,7 +323,7 @@ class PyMuPDFParser(BaseBlobParser):
|
||||
return content
|
||||
|
||||
def _extract_metadata(
|
||||
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
|
||||
self, doc: fitz.Document, page: fitz.Page, blob: Blob
|
||||
) -> dict:
|
||||
"""Extract metadata from the document and page."""
|
||||
return dict(
|
||||
@@ -344,9 +340,7 @@ class PyMuPDFParser(BaseBlobParser):
|
||||
},
|
||||
)
|
||||
|
||||
def _extract_images_from_page(
|
||||
self, doc: fitz.fitz.Document, page: fitz.fitz.Page
|
||||
) -> str:
|
||||
def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str:
|
||||
"""Extract images from page and get the text with RapidOCR."""
|
||||
if not self.extract_images:
|
||||
return ""
|
||||
@@ -558,7 +552,7 @@ class AmazonTextractPDFParser(BaseBlobParser):
|
||||
textract_features: Optional[Sequence[int]] = None,
|
||||
client: Optional[Any] = None,
|
||||
*,
|
||||
linearization_config: Optional["TextLinearizationConfig"] = None,
|
||||
linearization_config: Optional[TextLinearizationConfig] = None,
|
||||
) -> None:
|
||||
"""Initializes the parser.
|
||||
|
||||
|
||||
@@ -6,17 +6,17 @@ import tempfile
|
||||
import time
|
||||
from abc import ABC
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from pathlib import Path, PurePath
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
BinaryIO,
|
||||
Iterator,
|
||||
List,
|
||||
Mapping,
|
||||
Optional,
|
||||
Sequence,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@@ -68,7 +68,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
|
||||
"""
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> list:
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
|
||||
return partition_pdf(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]
|
||||
@@ -81,7 +81,9 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
clean up the temporary file after completion.
|
||||
"""
|
||||
|
||||
def __init__(self, file_path: Union[str, Path], *, headers: Optional[Dict] = None):
|
||||
def __init__(
|
||||
self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None
|
||||
):
|
||||
"""Initialize with a file path.
|
||||
|
||||
Args:
|
||||
@@ -154,7 +156,7 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
class OnlinePDFLoader(BasePDFLoader):
|
||||
"""Load online `PDF`."""
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
"""Load documents."""
|
||||
loader = UnstructuredPDFLoader(str(self.file_path))
|
||||
return loader.load()
|
||||
@@ -223,13 +225,13 @@ class PyPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
password: Optional[Union[str, bytes]] = None,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
*,
|
||||
extraction_mode: str = "plain",
|
||||
extraction_kwargs: Optional[Dict] = None,
|
||||
extraction_kwargs: Optional[dict] = None,
|
||||
) -> None:
|
||||
"""Initialize with a file path."""
|
||||
try:
|
||||
@@ -262,9 +264,9 @@ class PyPDFium2Loader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
*,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
):
|
||||
"""Initialize with a file path."""
|
||||
@@ -290,7 +292,7 @@ class PyPDFDirectoryLoader(BaseLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: Union[str, Path],
|
||||
path: Union[str, PurePath],
|
||||
glob: str = "**/[!.]*.pdf",
|
||||
silent_errors: bool = False,
|
||||
load_hidden: bool = False,
|
||||
@@ -308,7 +310,7 @@ class PyPDFDirectoryLoader(BaseLoader):
|
||||
def _is_visible(path: Path) -> bool:
|
||||
return not any(part.startswith(".") for part in path.parts)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
p = Path(self.path)
|
||||
docs = []
|
||||
items = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
|
||||
@@ -334,9 +336,9 @@ class PDFMinerLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
*,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
concatenate_pages: bool = True,
|
||||
) -> None:
|
||||
@@ -374,7 +376,9 @@ class PDFMinerLoader(BasePDFLoader):
|
||||
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
||||
"""Load `PDF` files as HTML content using `PDFMiner`."""
|
||||
|
||||
def __init__(self, file_path: str, *, headers: Optional[Dict] = None):
|
||||
def __init__(
|
||||
self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None
|
||||
):
|
||||
"""Initialize with a file path."""
|
||||
try:
|
||||
from pdfminer.high_level import extract_text_to_fp # noqa:F401
|
||||
@@ -395,14 +399,14 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
||||
output_string = StringIO()
|
||||
with open_filename(self.file_path, "rb") as fp:
|
||||
extract_text_to_fp(
|
||||
fp,
|
||||
cast(BinaryIO, fp),
|
||||
output_string,
|
||||
codec="",
|
||||
laparams=LAParams(),
|
||||
output_type="html",
|
||||
)
|
||||
metadata = {
|
||||
"source": self.file_path if self.web_path is None else self.web_path
|
||||
"source": str(self.file_path) if self.web_path is None else self.web_path
|
||||
}
|
||||
yield Document(page_content=output_string.getvalue(), metadata=metadata)
|
||||
|
||||
@@ -412,9 +416,9 @@ class PyMuPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
*,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
@@ -447,7 +451,7 @@ class PyMuPDFLoader(BasePDFLoader):
|
||||
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
||||
yield from parser.lazy_parse(blob)
|
||||
|
||||
def load(self, **kwargs: Any) -> List[Document]:
|
||||
def load(self, **kwargs: Any) -> list[Document]:
|
||||
return list(self._lazy_load(**kwargs))
|
||||
|
||||
def lazy_load(self) -> Iterator[Document]:
|
||||
@@ -461,11 +465,11 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
processed_file_format: str = "md",
|
||||
max_wait_time_seconds: int = 500,
|
||||
should_clean_pdf: bool = False,
|
||||
extra_request_data: Optional[Dict[str, Any]] = None,
|
||||
extra_request_data: Optional[dict[str, Any]] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize with a file path.
|
||||
@@ -499,7 +503,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
self.should_clean_pdf = should_clean_pdf
|
||||
|
||||
@property
|
||||
def _mathpix_headers(self) -> Dict[str, str]:
|
||||
def _mathpix_headers(self) -> dict[str, str]:
|
||||
return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}
|
||||
|
||||
@property
|
||||
@@ -515,7 +519,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
return {"options_json": json.dumps(options)}
|
||||
|
||||
def send_pdf(self) -> str:
|
||||
with open(self.file_path, "rb") as f:
|
||||
with open(str(self.file_path), "rb") as f:
|
||||
files = {"file": f}
|
||||
response = requests.post(
|
||||
self.url, headers=self._mathpix_headers, files=files, data=self.data
|
||||
@@ -562,7 +566,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
# This indicates an error with the PDF processing
|
||||
raise ValueError("Unable to retrieve PDF from Mathpix")
|
||||
else:
|
||||
print(f"Status: {status}, waiting for processing to complete") # noqa: T201
|
||||
logger.info("Status: %s, waiting for processing to complete", status)
|
||||
time.sleep(5)
|
||||
raise TimeoutError
|
||||
|
||||
@@ -572,8 +576,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
response = requests.get(url, headers=self._mathpix_headers)
|
||||
return response.content.decode("utf-8")
|
||||
|
||||
@staticmethod
|
||||
def clean_pdf(contents: str) -> str:
|
||||
def clean_pdf(self, contents: str) -> str:
|
||||
"""Clean the PDF file.
|
||||
|
||||
Args:
|
||||
@@ -596,7 +599,7 @@ class MathpixPDFLoader(BasePDFLoader):
|
||||
)
|
||||
return contents
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
pdf_id = self.send_pdf()
|
||||
contents = self.get_processed_pdf(pdf_id)
|
||||
if self.should_clean_pdf:
|
||||
@@ -610,10 +613,10 @@ class PDFPlumberLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
text_kwargs: Optional[Mapping[str, Any]] = None,
|
||||
dedupe: bool = False,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
) -> None:
|
||||
"""Initialize with a file path."""
|
||||
@@ -630,7 +633,7 @@ class PDFPlumberLoader(BasePDFLoader):
|
||||
self.dedupe = dedupe
|
||||
self.extract_images = extract_images
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
"""Load file."""
|
||||
|
||||
parser = PDFPlumberParser(
|
||||
@@ -669,13 +672,13 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
textract_features: Optional[Sequence[str]] = None,
|
||||
client: Optional[Any] = None,
|
||||
credentials_profile_name: Optional[str] = None,
|
||||
region_name: Optional[str] = None,
|
||||
endpoint_url: Optional[str] = None,
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
*,
|
||||
linearization_config: Optional["TextLinearizationConfig"] = None,
|
||||
) -> None:
|
||||
@@ -743,7 +746,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
linearization_config=linearization_config,
|
||||
)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
"""Load given path as pages."""
|
||||
return list(self.lazy_load())
|
||||
|
||||
@@ -758,7 +761,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
if self.web_path and self._is_s3_url(self.web_path):
|
||||
blob = Blob(path=self.web_path) # type: ignore[call-arg] # type: ignore[misc]
|
||||
else:
|
||||
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
||||
blob = Blob.from_path(self.file_path)
|
||||
if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
|
||||
raise ValueError(
|
||||
f"the file {blob.path} is a multi-page document, \
|
||||
@@ -792,7 +795,9 @@ class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
elif blob.mimetype in ["image/png", "image/jpeg"]: # type: ignore[attr-defined]
|
||||
return 1
|
||||
else:
|
||||
raise ValueError(f"unsupported mime type: {blob.mimetype}") # type: ignore[attr-defined]
|
||||
raise ValueError( # type: ignore[attr-defined]
|
||||
f"unsupported mime type: {blob.mimetype}"
|
||||
)
|
||||
|
||||
|
||||
class DedocPDFLoader(DedocBaseLoader):
|
||||
@@ -887,7 +892,7 @@ class DedocPDFLoader(DedocBaseLoader):
|
||||
from dedoc.utils.langchain import make_manager_pdf_config
|
||||
|
||||
return make_manager_pdf_config(
|
||||
file_path=self.file_path,
|
||||
file_path=str(self.file_path),
|
||||
parsing_params=self.parsing_parameters,
|
||||
split=self.split,
|
||||
)
|
||||
@@ -898,10 +903,10 @@ class DocumentIntelligenceLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: str,
|
||||
file_path: Union[str, PurePath],
|
||||
client: Any,
|
||||
model: str = "prebuilt-document",
|
||||
headers: Optional[Dict] = None,
|
||||
headers: Optional[dict] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize the object for file processing with Azure Document Intelligence
|
||||
@@ -930,10 +935,10 @@ class DocumentIntelligenceLoader(BasePDFLoader):
|
||||
... )
|
||||
"""
|
||||
|
||||
self.parser = DocumentIntelligenceParser(client=client, model=model)
|
||||
super().__init__(file_path, headers=headers)
|
||||
self.parser = DocumentIntelligenceParser(client=client, model=model)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
def load(self) -> list[Document]:
|
||||
"""Load given path as pages."""
|
||||
return list(self.lazy_load())
|
||||
|
||||
@@ -964,7 +969,7 @@ class ZeroxPDFLoader(BasePDFLoader):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
file_path: Union[str, PurePath],
|
||||
model: str = "gpt-4o-mini",
|
||||
**zerox_kwargs: Any,
|
||||
) -> None:
|
||||
@@ -1005,7 +1010,7 @@ class ZeroxPDFLoader(BasePDFLoader):
|
||||
|
||||
# Directly call asyncio.run to execute zerox synchronously
|
||||
zerox_output = asyncio.run(
|
||||
zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs)
|
||||
zerox(file_path=str(self.file_path), model=self.model, **self.zerox_kwargs)
|
||||
)
|
||||
|
||||
# Convert zerox output to Document instances and yield them
|
||||
|
||||
@@ -434,7 +434,7 @@ class AzureMLBaseEndpoint(BaseModel):
|
||||
raise ValueError(
|
||||
"`endpoint_url` should contain the full invocation URL including "
|
||||
"`/score` for `endpoint_api_type='dedicated'` or `/completions` "
|
||||
"or `/chat/completions` or `/models/chat/completions` "
|
||||
"or `/models/chat/completions` "
|
||||
"for `endpoint_api_type='serverless'`"
|
||||
)
|
||||
return field_value
|
||||
@@ -456,18 +456,16 @@ class AzureMLBaseEndpoint(BaseModel):
|
||||
"Endpoints of type `dedicated` should follow the format "
|
||||
"`https://<your-endpoint>.<your_region>.inference.ml.azure.com/score`."
|
||||
" If your endpoint URL ends with `/completions` or"
|
||||
"`/chat/completions` or `/models/chat/completions`,"
|
||||
"`/models/chat/completions`,"
|
||||
"use `endpoint_api_type='serverless'` instead."
|
||||
)
|
||||
if field_value == AzureMLEndpointApiType.serverless and not (
|
||||
endpoint_url.endswith("/completions") # type: ignore[union-attr]
|
||||
or endpoint_url.endswith("/chat/completions") # type: ignore[union-attr]
|
||||
or endpoint_url.endswith("/models/chat/completions") # type: ignore[union-attr]
|
||||
):
|
||||
raise ValueError(
|
||||
"Endpoints of type `serverless` should follow the format "
|
||||
"`https://<your-endpoint>.<your_region>.inference.ml.azure.com/completions`"
|
||||
" or `https://<your-endpoint>.<your_region>.inference.ml.azure.com/chat/completions`"
|
||||
" or `https://<your-endpoint>.<your_region>.inference.ml.azure.com/models/chat/completions`"
|
||||
)
|
||||
|
||||
|
||||
21
libs/community/poetry.lock
generated
21
libs/community/poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohappyeyeballs"
|
||||
@@ -1829,7 +1829,7 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "langchain"
|
||||
version = "0.3.13"
|
||||
version = "0.3.14"
|
||||
description = "Building applications with LLMs through composability"
|
||||
optional = false
|
||||
python-versions = ">=3.9,<4.0"
|
||||
@@ -1839,7 +1839,7 @@ develop = true
|
||||
[package.dependencies]
|
||||
aiohttp = "^3.8.3"
|
||||
async-timeout = {version = "^4.0.0", markers = "python_version < \"3.11\""}
|
||||
langchain-core = "^0.3.26"
|
||||
langchain-core = "^0.3.29"
|
||||
langchain-text-splitters = "^0.3.3"
|
||||
langsmith = ">=0.1.17,<0.3"
|
||||
numpy = [
|
||||
@@ -1858,7 +1858,7 @@ url = "../langchain"
|
||||
|
||||
[[package]]
|
||||
name = "langchain-core"
|
||||
version = "0.3.27"
|
||||
version = "0.3.29"
|
||||
description = "Building applications with LLMs through composability"
|
||||
optional = false
|
||||
python-versions = ">=3.9,<4.0"
|
||||
@@ -2618,41 +2618,30 @@ files = [
|
||||
{file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"},
|
||||
{file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"},
|
||||
{file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"},
|
||||
{file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"},
|
||||
{file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"},
|
||||
{file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"},
|
||||
{file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"},
|
||||
{file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"},
|
||||
@@ -4715,4 +4704,4 @@ type = ["pytest-mypy"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.9,<4.0"
|
||||
content-hash = "92c00e0689d9f6cec3122bf6faf8e4fac3829f1cdd33b21a39ec92e1b5aa8585"
|
||||
content-hash = "285e19251cdc78fc8dbde4ccf887f0ca35a0800e757223bb4abefa471a04a33a"
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry]
|
||||
name = "langchain-community"
|
||||
version = "0.3.13"
|
||||
version = "0.3.14"
|
||||
description = "Community contributed LangChain integrations."
|
||||
authors = []
|
||||
license = "MIT"
|
||||
@@ -33,8 +33,8 @@ ignore-words-list = "momento,collison,ned,foor,reworkd,parth,whats,aapply,mysogy
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.9,<4.0"
|
||||
langchain-core = "^0.3.27"
|
||||
langchain = "^0.3.13"
|
||||
langchain-core = "^0.3.29"
|
||||
langchain = "^0.3.14"
|
||||
SQLAlchemy = ">=1.4,<3"
|
||||
requests = "^2"
|
||||
PyYAML = ">=5.3"
|
||||
|
||||
@@ -61,7 +61,7 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
|
||||
assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF)
|
||||
|
||||
if splits_by_page:
|
||||
assert metadata["page"] == 0
|
||||
assert int(metadata["page"]) == 0
|
||||
|
||||
|
||||
def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None:
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Sequence, Union
|
||||
|
||||
@@ -17,7 +18,7 @@ from langchain_community.document_loaders import (
|
||||
def test_unstructured_pdf_loader_elements_mode() -> None:
|
||||
"""Test unstructured loader with various modes."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path), mode="elements")
|
||||
loader = UnstructuredPDFLoader(file_path, mode="elements")
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 2
|
||||
@@ -26,7 +27,7 @@ def test_unstructured_pdf_loader_elements_mode() -> None:
|
||||
def test_unstructured_pdf_loader_paged_mode() -> None:
|
||||
"""Test unstructured loader with various modes."""
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path), mode="paged")
|
||||
loader = UnstructuredPDFLoader(file_path, mode="paged")
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 16
|
||||
@@ -35,7 +36,7 @@ def test_unstructured_pdf_loader_paged_mode() -> None:
|
||||
def test_unstructured_pdf_loader_default_mode() -> None:
|
||||
"""Test unstructured loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = UnstructuredPDFLoader(str(file_path))
|
||||
loader = UnstructuredPDFLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
@@ -44,26 +45,26 @@ def test_unstructured_pdf_loader_default_mode() -> None:
|
||||
def test_pdfminer_loader() -> None:
|
||||
"""Test PDFMiner loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(str(file_path))
|
||||
loader = PDFMinerLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(str(file_path))
|
||||
loader = PDFMinerLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
# Verify that concatenating pages parameter works
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerLoader(str(file_path), concatenate_pages=True)
|
||||
loader = PDFMinerLoader(file_path, concatenate_pages=True)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerLoader(str(file_path), concatenate_pages=False)
|
||||
loader = PDFMinerLoader(file_path, concatenate_pages=False)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
@@ -72,13 +73,13 @@ def test_pdfminer_loader() -> None:
|
||||
def test_pdfminer_pdf_as_html_loader() -> None:
|
||||
"""Test PDFMinerPDFasHTMLLoader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PDFMinerPDFasHTMLLoader(str(file_path))
|
||||
loader = PDFMinerPDFasHTMLLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PDFMinerPDFasHTMLLoader(str(file_path))
|
||||
loader = PDFMinerPDFasHTMLLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
@@ -87,13 +88,13 @@ def test_pdfminer_pdf_as_html_loader() -> None:
|
||||
def test_pypdfium2_loader() -> None:
|
||||
"""Test PyPDFium2Loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PyPDFium2Loader(str(file_path))
|
||||
loader = PyPDFium2Loader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PyPDFium2Loader(str(file_path))
|
||||
loader = PyPDFium2Loader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
@@ -102,13 +103,13 @@ def test_pypdfium2_loader() -> None:
|
||||
def test_pymupdf_loader() -> None:
|
||||
"""Test PyMuPDF loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = PyMuPDFLoader(str(file_path))
|
||||
loader = PyMuPDFLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = PyMuPDFLoader(str(file_path))
|
||||
loader = PyMuPDFLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
@@ -123,20 +124,21 @@ def test_pymupdf_loader() -> None:
|
||||
assert len(docs) == 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
|
||||
)
|
||||
def test_mathpix_loader() -> None:
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = MathpixPDFLoader(str(file_path))
|
||||
loader = MathpixPDFLoader(file_path)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
print(docs[0].page_content) # noqa: T201
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
|
||||
loader = MathpixPDFLoader(str(file_path))
|
||||
loader = MathpixPDFLoader(file_path)
|
||||
|
||||
docs = loader.load()
|
||||
assert len(docs) == 1
|
||||
print(docs[0].page_content) # noqa: T201
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -187,8 +189,8 @@ def test_mathpix_loader() -> None:
|
||||
1,
|
||||
False,
|
||||
),
|
||||
(str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False),
|
||||
(str(Path(__file__).parent.parent / "examples/hello.pdf"), [], 1, False),
|
||||
(Path(__file__).parent.parent / "examples/hello.pdf", ["FORMS"], 1, False),
|
||||
(Path(__file__).parent.parent / "examples/hello.pdf", [], 1, False),
|
||||
(
|
||||
"s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf",
|
||||
["FORMS", "TABLES", "LAYOUT"],
|
||||
@@ -222,7 +224,7 @@ def test_amazontextract_loader(
|
||||
@pytest.mark.skip(reason="Requires AWS credentials to run")
|
||||
def test_amazontextract_loader_failures() -> None:
|
||||
# 2-page PDF local file system
|
||||
two_page_pdf = str(
|
||||
two_page_pdf = (
|
||||
Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
|
||||
)
|
||||
loader = AmazonTextractPDFLoader(two_page_pdf)
|
||||
|
||||
@@ -43,6 +43,7 @@ EXPECTED_ALL = [
|
||||
"CassandraLoader",
|
||||
"CSVLoader",
|
||||
"ChatGPTLoader",
|
||||
"CloudBlobLoader",
|
||||
"CoNLLULoader",
|
||||
"CollegeConfidentialLoader",
|
||||
"ConcurrentLoader",
|
||||
|
||||
@@ -282,7 +282,7 @@ class FewShotChatMessagePromptTemplate(
            ]

            example_prompt = ChatPromptTemplate.from_messages(
                [('human', '{input}'), ('ai', '{output}')]
                [('human', 'What is {input}?'), ('ai', '{output}')]
            )

            few_shot_prompt = FewShotChatMessagePromptTemplate(
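For context, a hedged, self-contained version of the docstring example this hunk touches; the example rows and the final assembled prompt are illustrative, not copied from the diff.

```python
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

examples = [
    {"input": "2+2", "output": "4"},
    {"input": "2+3", "output": "5"},
]

# Each example is rendered as a human/AI message pair
example_prompt = ChatPromptTemplate.from_messages(
    [("human", "What is {input}?"), ("ai", "{output}")]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

# The few-shot block slots into a larger chat prompt before the real question
final_prompt = ChatPromptTemplate.from_messages(
    [("system", "You are a helpful assistant."), few_shot_prompt, ("human", "What is {input}?")]
)
print(final_prompt.invoke({"input": "4+4"}).to_messages())
```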
@@ -1,11 +1,12 @@
from __future__ import annotations

from functools import partial
from typing import Optional
from typing import Literal, Optional, Union

from pydantic import BaseModel, Field

from langchain_core.callbacks import Callbacks
from langchain_core.documents import Document
from langchain_core.prompts import (
    BasePromptTemplate,
    PromptTemplate,
@@ -28,11 +29,16 @@ def _get_relevant_documents(
    document_prompt: BasePromptTemplate,
    document_separator: str,
    callbacks: Callbacks = None,
) -> str:
    response_format: Literal["content", "content_and_artifact"] = "content",
) -> Union[str, tuple[str, list[Document]]]:
    docs = retriever.invoke(query, config={"callbacks": callbacks})
    return document_separator.join(
    content = document_separator.join(
        format_document(doc, document_prompt) for doc in docs
    )
    if response_format == "content_and_artifact":
        return (content, docs)

    return content


async def _aget_relevant_documents(
@@ -41,12 +47,18 @@ async def _aget_relevant_documents(
    document_prompt: BasePromptTemplate,
    document_separator: str,
    callbacks: Callbacks = None,
) -> str:
    response_format: Literal["content", "content_and_artifact"] = "content",
) -> Union[str, tuple[str, list[Document]]]:
    docs = await retriever.ainvoke(query, config={"callbacks": callbacks})
    return document_separator.join(
    content = document_separator.join(
        [await aformat_document(doc, document_prompt) for doc in docs]
    )

    if response_format == "content_and_artifact":
        return (content, docs)

    return content


def create_retriever_tool(
    retriever: BaseRetriever,
@@ -55,6 +67,7 @@ def create_retriever_tool(
    *,
    document_prompt: Optional[BasePromptTemplate] = None,
    document_separator: str = "\n\n",
    response_format: Literal["content", "content_and_artifact"] = "content",
) -> Tool:
    """Create a tool to do retrieval of documents.

@@ -66,6 +79,11 @@
            model, so should be descriptive.
        document_prompt: The prompt to use for the document. Defaults to None.
        document_separator: The separator to use between documents. Defaults to "\n\n".
        response_format: The tool response format. If "content" then the output of
            the tool is interpreted as the contents of a ToolMessage. If
            "content_and_artifact" then the output is expected to be a two-tuple
            corresponding to the (content, artifact) of a ToolMessage (artifact
            being a list of documents in this case). Defaults to "content".

    Returns:
        Tool class to pass to an agent.
@@ -76,12 +94,14 @@
        retriever=retriever,
        document_prompt=document_prompt,
        document_separator=document_separator,
        response_format=response_format,
    )
    afunc = partial(
        _aget_relevant_documents,
        retriever=retriever,
        document_prompt=document_prompt,
        document_separator=document_separator,
        response_format=response_format,
    )
    return Tool(
        name=name,
@@ -89,4 +109,5 @@
        func=func,
        coroutine=afunc,
        args_schema=RetrieverInput,
        response_format=response_format,
    )
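A minimal sketch of the new `response_format` option in use; the toy retriever and tool names are invented for illustration, and the import path for `create_retriever_tool` is assumed to be the module shown in this diff.

```python
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.tools.retriever import create_retriever_tool


class TinyRetriever(BaseRetriever):
    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> list[Document]:
        return [Document(page_content=f"note about {query}")]


tool = create_retriever_tool(
    TinyRetriever(),
    "search_notes",
    "Search the user's notes.",
    response_format="content_and_artifact",  # ToolMessage carries the docs as its artifact
)

# Invoking with a tool call returns a ToolMessage whose artifact is the list of Documents
result = tool.invoke(
    {"name": "search_notes", "args": {"query": "cats"}, "id": "1", "type": "tool_call"}
)
print(result.content, result.artifact)
```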
@@ -30,8 +30,13 @@ from langchain_core.callbacks import (
|
||||
AsyncCallbackManagerForToolRun,
|
||||
CallbackManagerForToolRun,
|
||||
)
|
||||
from langchain_core.messages import ToolMessage
|
||||
from langchain_core.callbacks.manager import (
|
||||
CallbackManagerForRetrieverRun,
|
||||
)
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.messages import ToolCall, ToolMessage
|
||||
from langchain_core.messages.tool import ToolOutputMixin
|
||||
from langchain_core.retrievers import BaseRetriever
|
||||
from langchain_core.runnables import (
|
||||
Runnable,
|
||||
RunnableConfig,
|
||||
@@ -2118,6 +2123,57 @@ def test_tool_annotations_preserved() -> None:
|
||||
assert schema.__annotations__ == expected_type_hints
|
||||
|
||||
|
||||
def test_create_retriever_tool() -> None:
|
||||
class MyRetriever(BaseRetriever):
|
||||
def _get_relevant_documents(
|
||||
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
|
||||
) -> list[Document]:
|
||||
return [Document(page_content=f"foo {query}"), Document(page_content="bar")]
|
||||
|
||||
retriever = MyRetriever()
|
||||
retriever_tool = tools.create_retriever_tool(
|
||||
retriever, "retriever_tool_content", "Retriever Tool Content"
|
||||
)
|
||||
assert isinstance(retriever_tool, BaseTool)
|
||||
assert retriever_tool.name == "retriever_tool_content"
|
||||
assert retriever_tool.description == "Retriever Tool Content"
|
||||
assert retriever_tool.invoke("bar") == "foo bar\n\nbar"
|
||||
assert retriever_tool.invoke(
|
||||
ToolCall(
|
||||
name="retriever_tool_content",
|
||||
args={"query": "bar"},
|
||||
id="123",
|
||||
type="tool_call",
|
||||
)
|
||||
) == ToolMessage(
|
||||
"foo bar\n\nbar", tool_call_id="123", name="retriever_tool_content"
|
||||
)
|
||||
|
||||
retriever_tool_artifact = tools.create_retriever_tool(
|
||||
retriever,
|
||||
"retriever_tool_artifact",
|
||||
"Retriever Tool Artifact",
|
||||
response_format="content_and_artifact",
|
||||
)
|
||||
assert isinstance(retriever_tool_artifact, BaseTool)
|
||||
assert retriever_tool_artifact.name == "retriever_tool_artifact"
|
||||
assert retriever_tool_artifact.description == "Retriever Tool Artifact"
|
||||
assert retriever_tool_artifact.invoke("bar") == "foo bar\n\nbar"
|
||||
assert retriever_tool_artifact.invoke(
|
||||
ToolCall(
|
||||
name="retriever_tool_artifact",
|
||||
args={"query": "bar"},
|
||||
id="123",
|
||||
type="tool_call",
|
||||
)
|
||||
) == ToolMessage(
|
||||
"foo bar\n\nbar",
|
||||
artifact=[Document(page_content="foo bar"), Document(page_content="bar")],
|
||||
tool_call_id="123",
|
||||
name="retriever_tool_artifact",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(PYDANTIC_MAJOR_VERSION != 2, reason="Testing pydantic v2.")
|
||||
def test_tool_args_schema_pydantic_v2_with_metadata() -> None:
|
||||
from pydantic import BaseModel as BaseModelV2
|
||||
|
||||
23
libs/langchain/poetry.lock
generated
23
libs/langchain/poetry.lock
generated
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.

[[package]]
name = "aiohappyeyeballs"
@@ -1990,7 +1990,7 @@ files = [

[[package]]
name = "langchain-core"
version = "0.3.26"
version = "0.3.29"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.9,<4.0"
@@ -2015,7 +2015,7 @@ url = "../core"

[[package]]
name = "langchain-openai"
version = "0.2.12"
version = "0.2.14"
description = "An integration package connecting OpenAI and LangChain"
optional = true
python-versions = ">=3.9,<4.0"
@@ -2023,8 +2023,8 @@ files = []
develop = true

[package.dependencies]
langchain-core = "^0.3.21"
openai = "^1.55.3"
langchain-core = "^0.3.27"
openai = "^1.58.1"
tiktoken = ">=0.7,<1"

[package.source]
@@ -2790,41 +2790,30 @@ files = [
    {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"},
    {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"},
    {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"},
    {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"},
    {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"},
    {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"},
    {file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"},
    {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"},
    {file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"},
    {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"},
    {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"},
    {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"},
    {file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"},
    {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"},
    {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"},
    {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"},
    {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"},
    {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"},
    {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"},
    {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"},
    {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"},
    {file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"},
    {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"},
    {file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"},
    {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"},
    {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"},
    {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"},
    {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"},
    {file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"},
    {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"},
    {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"},
    {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"},
    {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"},
    {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"},
    {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"},
    {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"},
    {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"},
    {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"},
    {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"},
    {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"},
    {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"},
@@ -5033,4 +5022,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<4.0"
content-hash = "1113adf90d5867bd2c173e9022b6eee5ebfa5f77176d0eba67326d38ea5ca1f9"
content-hash = "8263b9b9697f3251c51de95c653f97704713cef0c14dbc4f537dbdc8a901fd12"

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "langchain"
version = "0.3.13"
version = "0.3.14"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
@@ -33,7 +33,7 @@ langchain-server = "langchain.server:main"

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
langchain-core = "^0.3.26"
langchain-core = "^0.3.29"
langchain-text-splitters = "^0.3.3"
langsmith = ">=0.1.17,<0.3"
pydantic = "^2.7.4"

@@ -320,4 +320,8 @@ packages:
  - name: langchain-dappier
    path: .
    repo: DappierAI/langchain-dappier
    downloads: 0
  - name: langchain-pull-md
    path: .
    repo: chigwell/langchain-pull-md
    downloads: 0
@@ -2195,6 +2195,39 @@ def _resize(width: int, height: int) -> Tuple[int, int]:
    return width, height


def _update_schema_with_optional_fields(input_dict: dict) -> dict:
    """Convert optional fields to required fields allowing 'null' type."""

    def _update_properties(schema: dict) -> None:
        if schema.get("type") != "object":
            return

        properties = schema.get("properties", {})
        required_fields = schema.get("required", [])

        for field, field_schema in properties.items():
            field_schema.pop("default", None)

            if field_schema.get("type") == "object":
                _update_properties(field_schema)

            if field not in required_fields:
                original_type = field_schema.get("type")
                if isinstance(original_type, str):
                    field_schema["type"] = [original_type, "null"]
                elif isinstance(original_type, list) and "null" not in original_type:
                    field_schema["type"].append("null")

                required_fields.append(field)

        schema["required"] = required_fields

    schema = input_dict.get("json_schema", {}).get("schema", {})
    _update_properties(schema)

    return input_dict


def _convert_to_openai_response_format(
    schema: Union[Dict[str, Any], Type], *, strict: Optional[bool] = None
) -> Union[Dict, TypeBaseModel]:
@@ -2225,6 +2258,8 @@ def _convert_to_openai_response_format(
            f"'strict' is only specified in one place."
        )
        raise ValueError(msg)
    if strict:
        _update_schema_with_optional_fields(response_format)
    return response_format

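Rough illustration of the helper above (the sample schema is invented, not from the diff): an optional property is rewritten to a required-but-nullable one, its default is dropped, and it is added to "required", which is the shape OpenAI strict structured output expects.

from langchain_openai.chat_models.base import _update_schema_with_optional_fields

# Invented sample: "color" has a default and is absent from "required", so it is optional.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "Entity",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "animal": {"type": "string"},
                "color": {"type": "string", "default": None},
            },
            "required": ["animal"],
        },
    },
}

_update_schema_with_optional_fields(response_format)

schema = response_format["json_schema"]["schema"]
assert schema["properties"]["color"]["type"] == ["string", "null"]  # now nullable
assert "default" not in schema["properties"]["color"]               # default removed
assert schema["required"] == ["animal", "color"]                    # every field required
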
@@ -18,7 +18,7 @@ from langchain_core.messages import (
)
from langchain_core.messages.ai import UsageMetadata
from pydantic import BaseModel, Field
from typing_extensions import TypedDict
from typing_extensions import Annotated, TypedDict

from langchain_openai import ChatOpenAI
from langchain_openai.chat_models.base import (
@@ -822,6 +822,71 @@ def test__convert_to_openai_response_format() -> None:
    with pytest.raises(ValueError):
        _convert_to_openai_response_format(response_format, strict=False)

    # Test handling of optional fields
    ## TypedDict
    class Entity(TypedDict):
        """Extracted entity."""

        animal: Annotated[str, ..., "The animal"]
        color: Annotated[Optional[str], None, "The color"]

    actual = _convert_to_openai_response_format(Entity, strict=True)
    expected = {
        "type": "json_schema",
        "json_schema": {
            "name": "Entity",
            "description": "Extracted entity.",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "animal": {"description": "The animal", "type": "string"},
                    "color": {"description": "The color", "type": ["string", "null"]},
                },
                "required": ["animal", "color"],
                "additionalProperties": False,
            },
        },
    }
    assert expected == actual

    ## JSON Schema
    class EntityModel(BaseModel):
        """Extracted entity."""

        animal: str = Field(description="The animal")
        color: Optional[str] = Field(default=None, description="The color")

    actual = _convert_to_openai_response_format(
        EntityModel.model_json_schema(), strict=True
    )
    expected = {
        "type": "json_schema",
        "json_schema": {
            "name": "EntityModel",
            "description": "Extracted entity.",
            "strict": True,
            "schema": {
                "properties": {
                    "animal": {
                        "description": "The animal",
                        "title": "Animal",
                        "type": "string",
                    },
                    "color": {
                        "anyOf": [{"type": "string"}, {"type": "null"}],
                        "description": "The color",
                        "title": "Color",
                    },
                },
                "required": ["animal", "color"],
                "type": "object",
                "additionalProperties": False,
            },
        },
    }
    assert expected == actual


@pytest.mark.parametrize("method", ["function_calling", "json_schema"])
@pytest.mark.parametrize("strict", [True, None])

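User-facing sketch of the behavior tested above (the model name is only an example): with method="json_schema" and strict=True, the optional Pydantic field is sent to the API as required-but-nullable thanks to the conversion in this change.

from typing import Optional

from pydantic import BaseModel, Field

from langchain_openai import ChatOpenAI


class Entity(BaseModel):
    """Extracted entity."""

    animal: str = Field(description="The animal")
    color: Optional[str] = Field(default=None, description="The color")


# strict json_schema mode surfaces `color` as required-but-nullable in the request.
llm = ChatOpenAI(model="gpt-4o-mini").with_structured_output(
    Entity, method="json_schema", strict=True
)
entity = llm.invoke("A black cat crossed the street.")  # e.g. Entity(animal="cat", color="black")
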
6
libs/partners/voyageai/poetry.lock
generated
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

[[package]]
name = "aiohappyeyeballs"
@@ -676,7 +676,7 @@ files = [

[[package]]
name = "langchain-core"
version = "0.3.25"
version = "0.3.29"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.9,<4.0"
@@ -1905,4 +1905,4 @@ propcache = ">=0.2.0"
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.13"
content-hash = "e554f41a0c54d62654ba52624e92749d684ac6ef6483b5488c2c8810009e6fa9"
content-hash = "e1582b69124ba27255c851d955e40ebc8c7483543240843d276a933f10fe71c7"

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "langchain-voyageai"
version = "0.1.3"
version = "0.1.4"
description = "An integration package connecting VoyageAI and LangChain"
authors = []
readme = "README.md"
@@ -20,7 +20,7 @@ disallow_untyped_defs = "True"

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
langchain-core = "^0.3.15"
langchain-core = "^0.3.29"
voyageai = ">=0.3.2,<1"
pydantic = ">=2,<3"

@@ -21,6 +21,7 @@ from langchain_core.utils.function_calling import tool_example_to_messages
from pydantic import BaseModel, Field
from pydantic.v1 import BaseModel as BaseModelV1
from pydantic.v1 import Field as FieldV1
from typing_extensions import Annotated, TypedDict

from langchain_tests.unit_tests.chat_models import (
    ChatModelTests,
@@ -1293,6 +1294,7 @@ class ChatModelIntegrationTests(ChatModelTests):
        if not self.has_tool_calling:
            pytest.skip("Test requires tool calling.")

        # Pydantic
        class Joke(BaseModel):
            """Joke to tell user."""

@@ -1310,6 +1312,22 @@ class ChatModelIntegrationTests(ChatModelTests):
        joke_result = chat.invoke("Give me a joke about cats, include the punchline.")
        assert isinstance(joke_result, Joke)

        # Schema
        chat = model.with_structured_output(Joke.model_json_schema())
        result = chat.invoke("Tell me a joke about cats.")
        assert isinstance(result, dict)

        # TypedDict
        class JokeDict(TypedDict):
            """Joke to tell user."""

            setup: Annotated[str, ..., "question to set up a joke"]
            punchline: Annotated[Optional[str], None, "answer to resolve the joke"]

        chat = model.with_structured_output(JokeDict)
        result = chat.invoke("Tell me a joke about cats.")
        assert isinstance(result, dict)

    def test_json_mode(self, model: BaseChatModel) -> None:
        """Test structured output via `JSON mode. <https://python.langchain.com/docs/concepts/structured_outputs/#json-mode>`_

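Standalone sketch of the TypedDict case exercised above (the model choice is an example; any chat model with structured output support should work): the Annotated metadata is read as (type, default, description), so punchline is optional while setup is required.

from typing import Optional

from typing_extensions import Annotated, TypedDict

from langchain_openai import ChatOpenAI


class JokeDict(TypedDict):
    """Joke to tell user."""

    setup: Annotated[str, ..., "question to set up a joke"]
    punchline: Annotated[Optional[str], None, "answer to resolve the joke"]


# `punchline` carries a default of None, so the model may omit it.
model = ChatOpenAI(model="gpt-4o-mini")
structured = model.with_structured_output(JokeDict)
result = structured.invoke("Tell me a joke about cats.")
assert isinstance(result, dict)  # e.g. {"setup": "...", "punchline": "..."}
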
7
libs/text-splitters/poetry.lock
generated
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

[[package]]
name = "annotated-types"
@@ -1426,7 +1426,7 @@ files = [

[[package]]
name = "langchain-core"
version = "0.3.26"
version = "0.3.29"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.9,<4.0"
@@ -2227,7 +2227,6 @@ description = "Nvidia JIT LTO Library"
optional = false
python-versions = ">=3"
files = [
    {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"},
    {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"},
    {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"},
]
@@ -4803,4 +4802,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<4.0"
content-hash = "b7eb1002788ae30d0aaa8872266ee1fa12bd6f845ba3fbf76a8785f6425da25c"
content-hash = "cdc4edb327b2a72ce35cda46d12dcd77403aa32ddf210e6ae96b32d98deba9b3"

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "langchain-text-splitters"
version = "0.3.4"
version = "0.3.5"
description = "LangChain text splitting utilities"
authors = []
license = "MIT"
@@ -36,7 +36,7 @@ ignore_missing_imports = "True"

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
langchain-core = "^0.3.26"
langchain-core = "^0.3.29"

[tool.ruff.lint]
select = ["E", "F", "I", "T201", "D"]