mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-25 04:30:13 +00:00 
			
		
		
		
	Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
		
			
				
	
	
		
			196 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			196 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """Schema for Blobs and Blob Loaders.
 | |
| 
 | |
| The goal is to facilitate decoupling of content loading from content parsing code.
 | |
| 
 | |
| In addition, content loading code should provide a lazy loading interface by default.
 | |
| """
 | |
| from __future__ import annotations
 | |
| 
 | |
| import contextlib
 | |
| import mimetypes
 | |
| from abc import ABC, abstractmethod
 | |
| from io import BufferedReader, BytesIO
 | |
| from pathlib import PurePath
 | |
| from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast
 | |
| 
 | |
| from langchain_core.pydantic_v1 import BaseModel, Field, root_validator
 | |
| 
 | |
| PathLike = Union[str, PurePath]
 | |
| 
 | |
| 
 | |
class Blob(BaseModel):
    """Blob represents raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
    the raw data.

    A blob holds its content either by value (``data``) or by reference
    (``path``); the accessor methods read from ``path`` only when ``data`` is
    ``None``.

    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
    """

    data: Union[bytes, str, None]
    """Raw data associated with the blob."""
    mimetype: Optional[str] = None
    """MimeType not to be confused with a file extension."""
    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Use utf-8 as default encoding, if decoding to string.
    """
    path: Optional[PathLike] = None
    """Location where the original content was found."""

    metadata: Dict[str, Any] = Field(default_factory=dict)
    """Metadata about the blob (e.g., source)"""

    class Config:
        arbitrary_types_allowed = True
        frozen = True

    @property
    def source(self) -> Optional[str]:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the blob, it will default to the path location.

        Unless explicitly set via a metadata field called "source", in which
        case that value will be used instead.
        """
        if self.metadata and "source" in self.metadata:
            return cast(Optional[str], self.metadata["source"])
        return str(self.path) if self.path else None

    @root_validator(pre=True)
    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
        """Verify that either data or path is provided.

        Only the presence of the keys is checked, so an explicit ``None`` for
        either key satisfies this validator.

        Raises:
            ValueError: If neither "data" nor "path" is among the given values.
        """
        if "data" not in values and "path" not in values:
            raise ValueError("Either data or path must be provided")
        return values

    def as_string(self) -> str:
        """Read data as a string.

        Returns:
            The in-memory data (bytes are decoded with ``self.encoding``), or
            the content of the file at ``path`` read as text with
            ``self.encoding`` when no in-memory data is set.

        Raises:
            ValueError: If the blob has neither data nor a path.
        """
        if self.data is None and self.path:
            with open(str(self.path), "r", encoding=self.encoding) as f:
                return f.read()
        elif isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes.

        Returns:
            The in-memory data (strings are encoded with ``self.encoding``), or
            the raw content of the file at ``path`` when no in-memory data is
            set.

        Raises:
            ValueError: If the blob has neither data nor a path.
        """
        if isinstance(self.data, bytes):
            return self.data
        elif isinstance(self.data, str):
            return self.data.encode(self.encoding)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                return f.read()
        else:
            raise ValueError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
        """Read data as a byte stream.

        Yields:
            A ``BytesIO`` wrapping the in-memory data (strings are encoded with
            ``self.encoding``, mirroring ``as_bytes``), or the file at ``path``
            opened in binary mode; the file is closed when the context exits.

        Raises:
            NotImplementedError: If the blob has neither data nor a path.
        """
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif isinstance(self.data, str):
            # Keep the stream accessor consistent with as_bytes(), which
            # already supports string data by encoding it.
            yield BytesIO(self.data.encode(self.encoding))
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        guess_type: bool = True,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            guess_type: If True, the mimetype will be guessed from the file extension,
                        if a mime-type was not provided
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        if mime_type is None and guess_type:
            _mimetype = mimetypes.guess_type(path)[0]
        else:
            # Either an explicit mime-type was given, or guessing is disabled
            # (in which case mime_type may still be None).
            _mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=_mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        path: Optional[PathLike] = None,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Define the blob representation.

        The representation is "Blob <id>" followed by the source when one is
        known; the raw data is never included.
        """
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
 | |
| 
 | |
| 
 | |
class BlobLoader(ABC):
    """Abstract base class that blob loader implementations derive from.

    A concrete loader fetches raw content from some storage system, selected
    by whatever criteria the loader defines, and hands it back lazily as a
    stream of blobs.
    """

    @abstractmethod
    def yield_blobs(self) -> Iterable[Blob]:
        """Lazily produce the raw data as LangChain Blob objects.

        Returns:
            An iterable (typically a generator) over blobs
        """
 |