Mirror of https://github.com/hwchase17/langchain.git
community[minor]: add hugging_face_model document loader (#17323)
- **Description:** add hugging_face_model document loader
- **Issue:** NA
- **Dependencies:** NA

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent b9a495e56e
commit af35e2525a
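The change adds a HuggingFaceModelLoader that turns Hugging Face Hub model listings into LangChain Documents (README text as page_content, Hub metadata as metadata). As a quick orientation, the usage below is reproduced from the new module's docstring:

    from langchain_community.document_loaders import HuggingFaceModelLoader

    # Search the Hub for models matching "bert" and load up to 10 of them.
    loader = HuggingFaceModelLoader(search="bert", limit=10)
    documents = loader.load()

    for doc in documents:
        print(doc.page_content)  # README content of the model
        print(doc.metadata)      # metadata returned by the Hub API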
@@ -118,6 +118,9 @@ from langchain_community.document_loaders.html_bs import BSHTMLLoader
 from langchain_community.document_loaders.hugging_face_dataset import (
     HuggingFaceDatasetLoader,
 )
+from langchain_community.document_loaders.hugging_face_model import (
+    HuggingFaceModelLoader,
+)
 from langchain_community.document_loaders.ifixit import IFixitLoader
 from langchain_community.document_loaders.image import UnstructuredImageLoader
 from langchain_community.document_loaders.image_captions import ImageCaptionLoader
@@ -315,6 +318,7 @@ __all__ = [
     "GutenbergLoader",
     "HNLoader",
     "HuggingFaceDatasetLoader",
+    "HuggingFaceModelLoader",
     "IFixitLoader",
     "IMSDbLoader",
     "ImageCaptionLoader",
New file: langchain_community/document_loaders/hugging_face_model.py
@@ -0,0 +1,112 @@
from typing import Iterator, List, Optional

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class HuggingFaceModelLoader(BaseLoader):
    """
    Load model information from `Hugging Face Hub`, including README content.

    This loader interfaces with the Hugging Face Models API to fetch and load
    model metadata and README files.
    The API allows you to search and filter models based on specific criteria
    such as model tags, authors, and more.

    API URL: https://huggingface.co/api/models
    DOC URL: https://huggingface.co/docs/hub/en/api

    Examples:

        .. code-block:: python

            from langchain_community.document_loaders import HuggingFaceModelLoader

            # Initialize the loader with search criteria
            loader = HuggingFaceModelLoader(search="bert", limit=10)

            # Load models
            documents = loader.load()

            # Iterate through the fetched documents
            for doc in documents:
                print(doc.page_content)  # README content of the model
                print(doc.metadata)  # Metadata of the model
    """

    BASE_URL = "https://huggingface.co/api/models"
    README_BASE_URL = "https://huggingface.co/{model_id}/raw/main/README.md"

    def __init__(
        self,
        *,
        search: Optional[str] = None,
        author: Optional[str] = None,
        filter: Optional[str] = None,
        sort: Optional[str] = None,
        direction: Optional[str] = None,
        limit: Optional[int] = 3,
        full: Optional[bool] = None,
        config: Optional[bool] = None,
    ):
        """Initialize the HuggingFaceModelLoader.

        Args:
            search: Filter based on substrings for repos and their usernames.
            author: Filter models by an author or organization.
            filter: Filter based on tags.
            sort: Property to use when sorting.
            direction: Direction in which to sort.
            limit: Limit the number of models fetched.
            full: Whether to fetch most model data.
            config: Whether to also fetch the repo config.
        """

        self.params = {
            "search": search,
            "author": author,
            "filter": filter,
            "sort": sort,
            "direction": direction,
            "limit": limit,
            "full": full,
            "config": config,
        }

    def fetch_models(self) -> List[dict]:
        """Fetch model information from Hugging Face Hub."""
        response = requests.get(
            self.BASE_URL,
            params={k: v for k, v in self.params.items() if v is not None},
        )
        response.raise_for_status()
        return response.json()

    def fetch_readme_content(self, model_id: str) -> str:
        """Fetch the README content for a given model."""
        readme_url = self.README_BASE_URL.format(model_id=model_id)
        try:
            response = requests.get(readme_url)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            return "README not available for this model."

    def lazy_load(self) -> Iterator[Document]:
        """Load model information lazily, including README content."""
        models = self.fetch_models()

        for model in models:
            model_id = model.get("modelId", "")
            readme_content = self.fetch_readme_content(model_id)

            yield Document(
                page_content=readme_content,
                metadata=model,
            )

    def load(self) -> List[Document]:
        """Load model information, including README content."""
        return list(self.lazy_load())
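For readers unfamiliar with the Hub API: fetch_models simply issues a GET against BASE_URL with the non-None constructor arguments as query parameters. The snippet below is only an illustrative sketch of the roughly equivalent raw request, not part of the change:

    import requests

    # Roughly what HuggingFaceModelLoader(search="bert", limit=10).fetch_models() sends;
    # parameters left as None are dropped before the request is built.
    response = requests.get(
        "https://huggingface.co/api/models",
        params={"search": "bert", "limit": 10},
    )
    response.raise_for_status()
    models = response.json()  # list of model metadata dicts, each including "modelId"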
New unit test file for HuggingFaceModelLoader:
@@ -0,0 +1,84 @@
import json
from typing import Tuple

import responses
from requests import Request

from langchain_community.document_loaders import HuggingFaceModelLoader

# Mocked model data to simulate an API response
MOCKED_MODELS_RESPONSE = [
    {
        "_id": "657a1fff16886e681230c05a",
        "id": "microsoft/phi-2",
        "likes": 2692,
        "private": False,
        "downloads": 546775,
        "tags": [
            "transformers",
            "safetensors",
            "phi",
            "text-generation",
            "nlp",
            "code",
            "custom_code",
            "en",
            "license:mit",
            "autotrain_compatible",
            "endpoints_compatible",
            "has_space",
            "region:us",
        ],
        "pipeline_tag": "text-generation",
        "library_name": "transformers",
        "createdAt": "2023-12-13T21:19:59.000Z",
        "modelId": "microsoft/phi-2",
    },
    # Add additional models as needed
]

# Mocked README content for models
MOCKED_README_CONTENT = {
    "microsoft/phi-2": "README content for microsoft/phi-2",
    "openai/gpt-3": "README content for openai/gpt-3",
}


def response_callback(request: Request) -> Tuple[int, dict, str]:
    if "/api/models" in request.url:
        return (200, {}, json.dumps(MOCKED_MODELS_RESPONSE))
    elif "README.md" in request.url:
        model_id = (
            request.url.split("/")[3] + "/" + request.url.split("/")[4]
        )  # Extract model_id
        content = MOCKED_README_CONTENT.get(model_id, "")
        return (200, {}, content)
    return (404, {}, "Not Found")


@responses.activate
def test_load_models_with_readme() -> None:
    """Tests loading models along with their README content."""
    responses.add_callback(
        responses.GET,
        "https://huggingface.co/api/models",
        callback=response_callback,  # type: ignore
        content_type="application/json",
    )
    responses.add_callback(
        responses.GET,
        # Use a regex or update this placeholder
        "https://huggingface.co/microsoft/phi-2/raw/main/README.md",
        callback=response_callback,  # type: ignore
        content_type="text/plain",
    )

    loader = HuggingFaceModelLoader(search="phi-2", limit=2)
    docs = loader.load()

    assert len(docs) == len(MOCKED_MODELS_RESPONSE)
    for doc, expected_model in zip(docs, MOCKED_MODELS_RESPONSE):
        id_ = expected_model["id"]
        assert isinstance(id_, str)
        assert doc.page_content == MOCKED_README_CONTENT[id_]
        assert doc.metadata["modelId"] == expected_model["id"]
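The "# Use a regex or update this placeholder" comment in the test hints at a more general mock. A minimal sketch of that idea, assuming the responses library's documented support for compiled-regex URLs, would register one callback covering every model's README:

    import re

    import responses

    # Hypothetical variant of the second add_callback above: match any
    # https://huggingface.co/<org>/<model>/raw/main/README.md URL instead of one model.
    responses.add_callback(
        responses.GET,
        re.compile(r"https://huggingface\.co/.+/raw/main/README\.md"),
        callback=response_callback,  # reuses the callback defined in this test module
        content_type="text/plain",
    )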
@@ -78,6 +78,7 @@ EXPECTED_ALL = [
     "GutenbergLoader",
     "HNLoader",
     "HuggingFaceDatasetLoader",
+    "HuggingFaceModelLoader",
     "IFixitLoader",
     "IMSDbLoader",
     "ImageCaptionLoader",