Mirror of https://github.com/hwchase17/langchain.git
community[minor]: add hugging_face_model document loader (#17323)
- **Description:** add hugging_face_model document loader
- **Issue:** NA
- **Dependencies:** NA

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent b9a495e56e
commit af35e2525a
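The change adds a HuggingFaceModelLoader that turns Hugging Face Hub model listings into LangChain Documents (README text as page_content, Hub metadata as metadata). As a quick orientation, the usage below is reproduced from the new module's docstring:

    from langchain_community.document_loaders import HuggingFaceModelLoader

    # Search the Hub for models matching "bert" and load up to 10 of them.
    loader = HuggingFaceModelLoader(search="bert", limit=10)
    documents = loader.load()

    for doc in documents:
        print(doc.page_content)  # README content of the model
        print(doc.metadata)      # metadata returned by the Hub API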
@@ -118,6 +118,9 @@ from langchain_community.document_loaders.html_bs import BSHTMLLoader
 from langchain_community.document_loaders.hugging_face_dataset import (
     HuggingFaceDatasetLoader,
 )
+from langchain_community.document_loaders.hugging_face_model import (
+    HuggingFaceModelLoader,
+)
 from langchain_community.document_loaders.ifixit import IFixitLoader
 from langchain_community.document_loaders.image import UnstructuredImageLoader
 from langchain_community.document_loaders.image_captions import ImageCaptionLoader
@@ -315,6 +318,7 @@ __all__ = [
     "GutenbergLoader",
     "HNLoader",
     "HuggingFaceDatasetLoader",
+    "HuggingFaceModelLoader",
     "IFixitLoader",
     "IMSDbLoader",
     "ImageCaptionLoader",
New file: langchain_community/document_loaders/hugging_face_model.py
@@ -0,0 +1,112 @@
from typing import Iterator, List, Optional

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class HuggingFaceModelLoader(BaseLoader):
    """
    Load model information from `Hugging Face Hub`, including README content.

    This loader interfaces with the Hugging Face Models API to fetch and load
    model metadata and README files.
    The API allows you to search and filter models based on specific criteria
    such as model tags, authors, and more.

    API URL: https://huggingface.co/api/models
    DOC URL: https://huggingface.co/docs/hub/en/api

    Examples:

        .. code-block:: python

            from langchain_community.document_loaders import HuggingFaceModelLoader

            # Initialize the loader with search criteria
            loader = HuggingFaceModelLoader(search="bert", limit=10)

            # Load models
            documents = loader.load()

            # Iterate through the fetched documents
            for doc in documents:
                print(doc.page_content)  # README content of the model
                print(doc.metadata)  # Metadata of the model
    """

    BASE_URL = "https://huggingface.co/api/models"
    README_BASE_URL = "https://huggingface.co/{model_id}/raw/main/README.md"

    def __init__(
        self,
        *,
        search: Optional[str] = None,
        author: Optional[str] = None,
        filter: Optional[str] = None,
        sort: Optional[str] = None,
        direction: Optional[str] = None,
        limit: Optional[int] = 3,
        full: Optional[bool] = None,
        config: Optional[bool] = None,
    ):
        """Initialize the HuggingFaceModelLoader.

        Args:
            search: Filter based on substrings for repos and their usernames.
            author: Filter models by an author or organization.
            filter: Filter based on tags.
            sort: Property to use when sorting.
            direction: Direction in which to sort.
            limit: Limit the number of models fetched.
            full: Whether to fetch most model data.
            config: Whether to also fetch the repo config.
        """

        self.params = {
            "search": search,
            "author": author,
            "filter": filter,
            "sort": sort,
            "direction": direction,
            "limit": limit,
            "full": full,
            "config": config,
        }

    def fetch_models(self) -> List[dict]:
        """Fetch model information from Hugging Face Hub."""
        response = requests.get(
            self.BASE_URL,
            params={k: v for k, v in self.params.items() if v is not None},
        )
        response.raise_for_status()
        return response.json()

    def fetch_readme_content(self, model_id: str) -> str:
        """Fetch the README content for a given model."""
        readme_url = self.README_BASE_URL.format(model_id=model_id)
        try:
            response = requests.get(readme_url)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            return "README not available for this model."

    def lazy_load(self) -> Iterator[Document]:
        """Load model information lazily, including README content."""
        models = self.fetch_models()

        for model in models:
            model_id = model.get("modelId", "")
            readme_content = self.fetch_readme_content(model_id)

            yield Document(
                page_content=readme_content,
                metadata=model,
            )

    def load(self) -> List[Document]:
        """Load model information, including README content."""
        return list(self.lazy_load())
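For readers unfamiliar with the Hub API: fetch_models simply issues a GET against BASE_URL with the non-None constructor arguments as query parameters. The snippet below is only an illustrative sketch of the roughly equivalent raw request, not part of the change:

    import requests

    # Roughly what HuggingFaceModelLoader(search="bert", limit=10).fetch_models() sends;
    # parameters left as None are dropped before the request is built.
    response = requests.get(
        "https://huggingface.co/api/models",
        params={"search": "bert", "limit": 10},
    )
    response.raise_for_status()
    models = response.json()  # list of model metadata dicts, each including "modelId"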
New unit test file for HuggingFaceModelLoader:
@@ -0,0 +1,84 @@
import json
from typing import Tuple

import responses
from requests import Request

from langchain_community.document_loaders import HuggingFaceModelLoader

# Mocked model data to simulate an API response
MOCKED_MODELS_RESPONSE = [
    {
        "_id": "657a1fff16886e681230c05a",
        "id": "microsoft/phi-2",
        "likes": 2692,
        "private": False,
        "downloads": 546775,
        "tags": [
            "transformers",
            "safetensors",
            "phi",
            "text-generation",
            "nlp",
            "code",
            "custom_code",
            "en",
            "license:mit",
            "autotrain_compatible",
            "endpoints_compatible",
            "has_space",
            "region:us",
        ],
        "pipeline_tag": "text-generation",
        "library_name": "transformers",
        "createdAt": "2023-12-13T21:19:59.000Z",
        "modelId": "microsoft/phi-2",
    },
    # Add additional models as needed
]

# Mocked README content for models
MOCKED_README_CONTENT = {
    "microsoft/phi-2": "README content for microsoft/phi-2",
    "openai/gpt-3": "README content for openai/gpt-3",
}


def response_callback(request: Request) -> Tuple[int, dict, str]:
    if "/api/models" in request.url:
        return (200, {}, json.dumps(MOCKED_MODELS_RESPONSE))
    elif "README.md" in request.url:
        model_id = (
            request.url.split("/")[3] + "/" + request.url.split("/")[4]
        )  # Extract model_id
        content = MOCKED_README_CONTENT.get(model_id, "")
        return (200, {}, content)
    return (404, {}, "Not Found")


@responses.activate
def test_load_models_with_readme() -> None:
    """Tests loading models along with their README content."""
    responses.add_callback(
        responses.GET,
        "https://huggingface.co/api/models",
        callback=response_callback,  # type: ignore
        content_type="application/json",
    )
    responses.add_callback(
        responses.GET,
        # Use a regex or update this placeholder
        "https://huggingface.co/microsoft/phi-2/raw/main/README.md",
        callback=response_callback,  # type: ignore
        content_type="text/plain",
    )

    loader = HuggingFaceModelLoader(search="phi-2", limit=2)
    docs = loader.load()

    assert len(docs) == len(MOCKED_MODELS_RESPONSE)
    for doc, expected_model in zip(docs, MOCKED_MODELS_RESPONSE):
        id_ = expected_model["id"]
        assert isinstance(id_, str)
        assert doc.page_content == MOCKED_README_CONTENT[id_]
        assert doc.metadata["modelId"] == expected_model["id"]
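The "# Use a regex or update this placeholder" comment in the test hints at a more general mock. A minimal sketch of that idea, assuming the responses library's documented support for compiled-regex URLs, would register one callback covering every model's README:

    import re

    import responses

    # Hypothetical variant of the second add_callback above: match any
    # https://huggingface.co/<org>/<model>/raw/main/README.md URL instead of one model.
    responses.add_callback(
        responses.GET,
        re.compile(r"https://huggingface\.co/.+/raw/main/README\.md"),
        callback=response_callback,  # reuses the callback defined in this test module
        content_type="text/plain",
    )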
@@ -78,6 +78,7 @@ EXPECTED_ALL = [
     "GutenbergLoader",
     "HNLoader",
     "HuggingFaceDatasetLoader",
+    "HuggingFaceModelLoader",
     "IFixitLoader",
     "IMSDbLoader",
     "ImageCaptionLoader",