mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-01 19:12:42 +00:00
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
This commit is contained in:
@@ -0,0 +1,151 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class JSONLoader(BaseLoader):
    """Load a `JSON` file using a `jq` schema.

    The jq schema selects the pieces of the JSON payload that become documents;
    every result produced by the schema yields one `Document`.

    Example:
        [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
        {"key": [{"text": ...}, {"text": ...}, {"text": ...}]} -> schema = .key[].text
        ["", "", ""] -> schema = .[]
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        jq_schema: str,
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
        json_lines: bool = False,
    ):
        """Initialize the JSONLoader.

        Args:
            file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
                It is resolved to an absolute path.
            jq_schema (str): The jq schema to use to extract the data or text from
                the JSON.
            content_key (str): The key to use to extract the content from the JSON if
                the jq_schema results to a list of objects (dict).
            metadata_func (Callable[[Dict, Dict], Dict]): A function that takes in
                the JSON object extracted by the jq_schema and the default metadata
                and returns a dict of the updated metadata.
            text_content (bool): Boolean flag to indicate whether the content is in
                string format, default to True.
            json_lines (bool): Boolean flag to indicate whether the input is in
                JSON Lines format.

        Raises:
            ImportError: If the `jq` package is not installed.
        """
        try:
            import jq  # noqa:F401
        except ImportError as e:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "jq package not found, please install it with `pip install jq`"
            ) from e

        self.file_path = Path(file_path).resolve()
        # Compile once here so every _parse call reuses the same program.
        self._jq_schema = jq.compile(jq_schema)
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content
        self._json_lines = json_lines

    def load(self) -> List[Document]:
        """Load and return documents from the JSON file.

        In JSON Lines mode each non-blank line is parsed as its own JSON payload;
        otherwise the whole file is parsed as a single payload.
        """
        docs: List[Document] = []
        if self._json_lines:
            with self.file_path.open(encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    # Blank lines carry no payload and are skipped.
                    if line:
                        self._parse(line, docs)
        else:
            self._parse(self.file_path.read_text(encoding="utf-8"), docs)
        return docs

    def _parse(self, content: str, docs: List[Document]) -> None:
        """Convert given content to documents and append them to `docs`.

        Args:
            content: A JSON-encoded string.
            docs: The list that receives the new documents (mutated in place).
        """
        data = self._jq_schema.input(json.loads(content))

        # Perform some validation
        # This is not a perfect validation, but it should catch most cases
        # and prevent the user from getting a cryptic error later on.
        if self._content_key is not None:
            self._validate_content_key(data)
        if self._metadata_func is not None:
            self._validate_metadata_func(data)

        # seq_num is 1-based and continues from the documents collected so far,
        # so numbering stays monotonic across lines in JSON Lines mode.
        for i, sample in enumerate(data, len(docs) + 1):
            text = self._get_text(sample=sample)
            metadata = self._get_metadata(
                sample=sample, source=str(self.file_path), seq_num=i
            )
            docs.append(Document(page_content=text, metadata=metadata))

    def _get_text(self, sample: Any) -> str:
        """Convert sample to string format.

        Args:
            sample: One result produced by the jq schema.

        Returns:
            The page content for the sample. Strings are returned unchanged.
            When `text_content` is False, dicts are JSON-encoded (an empty dict
            becomes ""), `None` becomes "", and anything else goes through `str`.

        Raises:
            ValueError: If `content_key` is set and the sample is not a dict, or
                if `text_content` is True and the content is not a string.
        """
        if self._content_key is not None:
            # Only the first sample is checked up front by _validate_content_key,
            # so guard here to avoid an opaque AttributeError on later samples.
            if not isinstance(sample, dict):
                raise ValueError(
                    "Expected the jq schema to result in a list of objects (dict), "
                    f"so sample must be a dict but got `{type(sample)}`"
                )
            content = sample.get(self._content_key)
        else:
            content = sample

        if isinstance(content, str):
            return content

        if self._text_content:
            raise ValueError(
                f"Expected page_content is string, got {type(content)} instead. "
                "Set `text_content=False` if the desired input for "
                "`page_content` is not a string"
            )

        if isinstance(content, dict):
            return json.dumps(content) if content else ""

        # In case the text is None, set it to an empty string
        return str(content) if content is not None else ""

    def _get_metadata(
        self, sample: Dict[str, Any], **additional_fields: Any
    ) -> Dict[str, Any]:
        """
        Return a metadata dictionary base on the existence of metadata_func
        :param sample: single data payload
        :param additional_fields: key-word arguments to be added as metadata values
        :return: the output of metadata_func(sample, additional_fields) when a
            metadata_func was provided, otherwise additional_fields itself
        """
        if self._metadata_func is not None:
            return self._metadata_func(sample, additional_fields)
        else:
            return additional_fields

    def _validate_content_key(self, data: Any) -> None:
        """Check if a content key is valid.

        Only the first result of the jq schema is inspected.

        Raises:
            ValueError: If the first result is not a dict, or if its value for
                `content_key` is missing or None.
        """
        sample = data.first()
        if not isinstance(sample, dict):
            raise ValueError(
                "Expected the jq schema to result in a list of objects (dict), "
                f"so sample must be a dict but got `{type(sample)}`"
            )

        if sample.get(self._content_key) is None:
            raise ValueError(
                "Expected the jq schema to result in a list of objects (dict) "
                f"with the key `{self._content_key}`"
            )

    def _validate_metadata_func(self, data: Any) -> None:
        """Check if the metadata_func output is valid.

        The metadata_func is called once with the first result of the jq schema
        and an empty metadata dict.

        Raises:
            ValueError: If the metadata_func does not return a dict.
        """

        sample = data.first()
        if self._metadata_func is not None:
            sample_metadata = self._metadata_func(sample, {})
            if not isinstance(sample_metadata, dict):
                raise ValueError(
                    "Expected the metadata_func to return a dict but got "
                    f"`{type(sample_metadata)}`"
                )
|
Reference in New Issue
Block a user