mirror of
https://github.com/hwchase17/langchain.git
synced 2025-11-29 18:17:10 +00:00
```python
"""python scripts/update_mypy_ruff.py"""
import glob
import tomllib
from pathlib import Path
import toml
import subprocess
import re
ROOT_DIR = Path(__file__).parents[1]

# Matches mypy error lines such as "path/to/file.py:123: error: ... [error-code]".
# Raw string: the original used "\:" / "\d" in a plain string, which are invalid
# escape sequences (DeprecationWarning, SyntaxWarning on newer Pythons).
_MYPY_ERROR_RE = re.compile(r"^(.*):(\d+): error:.*\[(.*)\]")


def _collect_errors(stdout: str) -> dict:
    """Group mypy error codes by (file path, line number).

    Returns a dict mapping ``(path, line_no)`` -> list of error-code strings,
    so all errors on one line can be silenced with a single ignore comment.
    """
    to_ignore: dict = {}
    for log_line in stdout.split("\n"):
        # Match once per line (the original matched the same regex twice).
        match = _MYPY_ERROR_RE.match(log_line)
        if match:
            err_path, line_no, error_type = match.groups()
            to_ignore.setdefault((err_path, line_no), []).append(error_type)
    return to_ignore


def main() -> None:
    """Bump the mypy/ruff pins in every ``libs/**/pyproject.toml``, re-lock and
    re-install each package, append ``# type: ignore[...]`` comments for any
    new mypy errors, then re-format with ruff.
    """
    for path in glob.glob(str(ROOT_DIR / "libs/**/pyproject.toml"), recursive=True):
        print(path)
        with open(path, "rb") as f:
            pyproject = tomllib.load(f)
        try:
            pyproject["tool"]["poetry"]["group"]["typing"]["dependencies"]["mypy"] = (
                "^1.10"
            )
            pyproject["tool"]["poetry"]["group"]["lint"]["dependencies"]["ruff"] = (
                "^0.5"
            )
        except KeyError:
            # Package doesn't declare both dependency groups -- skip it.
            continue
        with open(path, "w") as f:
            toml.dump(pyproject, f)

        # Package directory containing the pyproject.toml.
        cwd = str(Path(path).parent)
        completed = subprocess.run(
            "poetry lock --no-update; poetry install --with typing; poetry run mypy . --no-color",
            cwd=cwd,
            shell=True,
            capture_output=True,
            text=True,
        )
        # NOTE: the error path from mypy output must not overwrite the outer
        # loop's `path` variable (the original code shadowed it here).
        to_ignore = _collect_errors(completed.stdout)
        print(len(to_ignore))
        for (error_path, line_no), error_types in to_ignore.items():
            all_errors = ", ".join(error_types)
            full_path = f"{cwd}/{error_path}"
            try:
                with open(full_path, "r") as f:
                    file_lines = f.readlines()
            except FileNotFoundError:
                continue
            # Append the ignore comment to the offending line (drop its newline first).
            file_lines[int(line_no) - 1] = (
                file_lines[int(line_no) - 1][:-1] + f" # type: ignore[{all_errors}]\n"
            )
            with open(full_path, "w") as f:
                f.write("".join(file_lines))
        subprocess.run(
            "poetry run ruff format .; poetry run ruff --select I --fix .",
            cwd=cwd,
            shell=True,
            capture_output=True,
            text=True,
        )


if __name__ == "__main__":
    main()
```
143 lines
6.1 KiB
Python
"""Document transformers that use OpenAI Functions models"""
|
|
|
|
from typing import Any, Dict, Optional, Sequence, Type, Union
|
|
|
|
from langchain_core.documents import BaseDocumentTransformer, Document
|
|
from langchain_core.language_models import BaseLanguageModel
|
|
from langchain_core.prompts import ChatPromptTemplate
|
|
from langchain_core.pydantic_v1 import BaseModel
|
|
|
|
|
|
class OpenAIMetadataTagger(BaseDocumentTransformer, BaseModel):
    """Extract metadata tags from document contents using OpenAI functions.

    Example:
        .. code-block:: python

            from langchain_community.chat_models import ChatOpenAI
            from langchain_community.document_transformers import OpenAIMetadataTagger
            from langchain_core.documents import Document

            schema = {
                "properties": {
                    "movie_title": { "type": "string" },
                    "critic": { "type": "string" },
                    "tone": {
                        "type": "string",
                        "enum": ["positive", "negative"]
                    },
                    "rating": {
                        "type": "integer",
                        "description": "The number of stars the critic rated the movie"
                    }
                },
                "required": ["movie_title", "critic", "tone"]
            }

            # Must be an OpenAI model that supports functions
            llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
            tagging_chain = create_tagging_chain(schema, llm)
            document_transformer = OpenAIMetadataTagger(tagging_chain=tagging_chain)
            original_documents = [
                Document(page_content="Review of The Bee Movie\nBy Roger Ebert\n\nThis is the greatest movie ever made. 4 out of 5 stars."),
                Document(page_content="Review of The Godfather\nBy Anonymous\n\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}),
            ]

            enhanced_documents = document_transformer.transform_documents(original_documents)
    """  # noqa: E501

    tagging_chain: Any
    """The chain used to extract metadata from each document."""

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Automatically extract and populate metadata
        for each document according to the provided schema."""

        # For every document, run the tagging chain over its text and merge
        # the extracted tags with any pre-existing metadata; existing keys
        # take precedence over extracted ones.
        return [
            Document(
                page_content=doc.page_content,
                metadata={
                    **self.tagging_chain.run(doc.page_content),  # type: ignore[assignment]
                    **doc.metadata,
                },
            )
            for doc in documents
        ]

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        # Async transformation is not supported by this transformer.
        raise NotImplementedError
|
|
|
|
|
|
def create_metadata_tagger(
    metadata_schema: Union[Dict[str, Any], Type[BaseModel]],
    llm: BaseLanguageModel,
    prompt: Optional[ChatPromptTemplate] = None,
    *,
    tagging_chain_kwargs: Optional[Dict] = None,
) -> OpenAIMetadataTagger:
    """Create a DocumentTransformer that uses an OpenAI function chain to automatically
    tag documents with metadata based on their content and an input schema.

    Args:
        metadata_schema: Either a dictionary or pydantic.BaseModel class. If a dictionary
            is passed in, it's assumed to already be a valid JsonSchema.
            For best results, pydantic.BaseModels should have docstrings describing what
            the schema represents and descriptions for the parameters.
        llm: Language model to use, assumed to support the OpenAI function-calling API.
            Defaults to use "gpt-3.5-turbo-0613"
        prompt: BasePromptTemplate to pass to the model.
        tagging_chain_kwargs: Additional keyword arguments forwarded to
            ``create_tagging_chain``.

    Returns:
        An ``OpenAIMetadataTagger`` document transformer that runs the tagging
        chain over each document's content and merges the extracted tags into
        its metadata.

    Example:
        .. code-block:: python

            from langchain_community.chat_models import ChatOpenAI
            from langchain_community.document_transformers import create_metadata_tagger
            from langchain_core.documents import Document

            schema = {
                "properties": {
                    "movie_title": { "type": "string" },
                    "critic": { "type": "string" },
                    "tone": {
                        "type": "string",
                        "enum": ["positive", "negative"]
                    },
                    "rating": {
                        "type": "integer",
                        "description": "The number of stars the critic rated the movie"
                    }
                },
                "required": ["movie_title", "critic", "tone"]
            }

            # Must be an OpenAI model that supports functions
            llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

            document_transformer = create_metadata_tagger(schema, llm)
            original_documents = [
                Document(page_content="Review of The Bee Movie\nBy Roger Ebert\n\nThis is the greatest movie ever made. 4 out of 5 stars."),
                Document(page_content="Review of The Godfather\nBy Anonymous\n\nThis movie was super boring. 1 out of 5 stars.", metadata={"reliable": False}),
            ]

            enhanced_documents = document_transformer.transform_documents(original_documents)
    """  # noqa: E501
    # Imported locally: langchain is an optional dependency of this module.
    from langchain.chains.openai_functions import create_tagging_chain

    # Normalize a pydantic model class to its JSON-schema dict representation.
    metadata_schema = (
        metadata_schema
        if isinstance(metadata_schema, dict)
        else metadata_schema.schema()
    )
    _tagging_chain_kwargs = tagging_chain_kwargs or {}
    tagging_chain = create_tagging_chain(
        metadata_schema, llm, prompt=prompt, **_tagging_chain_kwargs
    )
    return OpenAIMetadataTagger(tagging_chain=tagging_chain)