mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 06:14:37 +00:00
community[minor]: add hugging_face_model document loader (#17323)
- **Description:** add hugging_face_model document loader, - **Issue:** NA, - **Dependencies:** NA, --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
b9a495e56e
commit
af35e2525a
@ -118,6 +118,9 @@ from langchain_community.document_loaders.html_bs import BSHTMLLoader
|
||||
from langchain_community.document_loaders.hugging_face_dataset import (
|
||||
HuggingFaceDatasetLoader,
|
||||
)
|
||||
from langchain_community.document_loaders.hugging_face_model import (
|
||||
HuggingFaceModelLoader,
|
||||
)
|
||||
from langchain_community.document_loaders.ifixit import IFixitLoader
|
||||
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
||||
from langchain_community.document_loaders.image_captions import ImageCaptionLoader
|
||||
@ -315,6 +318,7 @@ __all__ = [
|
||||
"GutenbergLoader",
|
||||
"HNLoader",
|
||||
"HuggingFaceDatasetLoader",
|
||||
"HuggingFaceModelLoader",
|
||||
"IFixitLoader",
|
||||
"IMSDbLoader",
|
||||
"ImageCaptionLoader",
|
||||
|
@ -0,0 +1,112 @@
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
import requests
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class HuggingFaceModelLoader(BaseLoader):
    """
    Load model information from `Hugging Face Hub`, including README content.

    This loader interfaces with the Hugging Face Models API to fetch and load
    model metadata and README files.
    The API allows you to search and filter models based on specific criteria
    such as model tags, authors, and more.

    API URL: https://huggingface.co/api/models
    DOC URL: https://huggingface.co/docs/hub/en/api

    Examples:

        .. code-block:: python

            from langchain_community.document_loaders import HuggingFaceModelLoader

            # Initialize the loader with search criteria
            loader = HuggingFaceModelLoader(search="bert", limit=10)

            # Load models
            documents = loader.load()

            # Iterate through the fetched documents
            for doc in documents:
                print(doc.page_content)  # README content of the model
                print(doc.metadata)  # Metadata of the model
    """

    BASE_URL = "https://huggingface.co/api/models"
    README_BASE_URL = "https://huggingface.co/{model_id}/raw/main/README.md"
    # Seconds before an unresponsive Hub request is aborted. Without an
    # explicit timeout, requests.get() can block indefinitely on a stalled
    # connection, hanging load()/lazy_load().
    REQUEST_TIMEOUT = 30

    def __init__(
        self,
        *,
        search: Optional[str] = None,
        author: Optional[str] = None,
        filter: Optional[str] = None,
        sort: Optional[str] = None,
        direction: Optional[str] = None,
        limit: Optional[int] = 3,
        full: Optional[bool] = None,
        config: Optional[bool] = None,
    ):
        """Initialize the HuggingFaceModelLoader.

        Args:
            search: Filter based on substrings for repos and their usernames.
            author: Filter models by an author or organization.
            filter: Filter based on tags.
            sort: Property to use when sorting.
            direction: Direction in which to sort.
            limit: Limit the number of models fetched.
            full: Whether to fetch most model data.
            config: Whether to also fetch the repo config.
        """
        # None entries are kept here and dropped at request time, so the
        # query string only carries parameters the caller actually set.
        self.params = {
            "search": search,
            "author": author,
            "filter": filter,
            "sort": sort,
            "direction": direction,
            "limit": limit,
            "full": full,
            "config": config,
        }

    def fetch_models(self) -> List[dict]:
        """Fetch model information from Hugging Face Hub.

        Returns:
            The parsed JSON response: a list of model-metadata dicts.

        Raises:
            requests.HTTPError: If the API responds with an error status.
            requests.Timeout: If the API does not answer within
                ``REQUEST_TIMEOUT`` seconds.
        """
        response = requests.get(
            self.BASE_URL,
            params={k: v for k, v in self.params.items() if v is not None},
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def fetch_readme_content(self, model_id: str) -> str:
        """Fetch the README content for a given model.

        Args:
            model_id: Hub repository id, e.g. ``"microsoft/phi-2"``.

        Returns:
            The raw README text, or a placeholder message when the request
            fails for any reason (missing README, network error, timeout).
        """
        readme_url = self.README_BASE_URL.format(model_id=model_id)
        try:
            response = requests.get(readme_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            # Best-effort: a missing or unreachable README must not abort
            # the whole load, so fall back to a sentinel string.
            return "README not available for this model."

    def lazy_load(self) -> Iterator[Document]:
        """Load model information lazily, including README content.

        Yields one ``Document`` per model: the README text as
        ``page_content`` and the full metadata dict as ``metadata``.
        """
        models = self.fetch_models()

        for model in models:
            model_id = model.get("modelId", "")
            readme_content = self.fetch_readme_content(model_id)

            yield Document(
                page_content=readme_content,
                metadata=model,
            )

    def load(self) -> List[Document]:
        """Load model information, including README content."""
        return list(self.lazy_load())
|
@ -0,0 +1,84 @@
|
||||
import json
|
||||
from typing import Tuple
|
||||
|
||||
import responses
|
||||
from requests import Request
|
||||
|
||||
from langchain_community.document_loaders import HuggingFaceModelLoader
|
||||
|
||||
# Mocked model data to simulate an API response from
# https://huggingface.co/api/models (shape mirrors the real payload;
# "modelId" is the field the loader reads to build the README URL).
MOCKED_MODELS_RESPONSE = [
    {
        "_id": "657a1fff16886e681230c05a",
        "id": "microsoft/phi-2",
        "likes": 2692,
        "private": False,
        "downloads": 546775,
        "tags": [
            "transformers",
            "safetensors",
            "phi",
            "text-generation",
            "nlp",
            "code",
            "custom_code",
            "en",
            "license:mit",
            "autotrain_compatible",
            "endpoints_compatible",
            "has_space",
            "region:us",
        ],
        "pipeline_tag": "text-generation",
        "library_name": "transformers",
        "createdAt": "2023-12-13T21:19:59.000Z",
        "modelId": "microsoft/phi-2",
    },
    # Add additional models as needed
]

# Mocked README content for models, keyed by repo id ("<owner>/<name>").
MOCKED_README_CONTENT = {
    "microsoft/phi-2": "README content for microsoft/phi-2",
    "openai/gpt-3": "README content for openai/gpt-3",
}
|
||||
|
||||
|
||||
def response_callback(request: Request) -> Tuple[int, dict, str]:
    """Route a mocked HTTP GET to the appropriate canned payload.

    Returns a ``(status, headers, body)`` triple as expected by
    ``responses.add_callback``.
    """
    url = request.url
    if "/api/models" in url:
        # Model-search endpoint -> JSON list of mocked models.
        return (200, {}, json.dumps(MOCKED_MODELS_RESPONSE))
    if "README.md" in url:
        # README endpoint: the repo id is the "<owner>/<name>" pair held in
        # path segments 3 and 4 of the URL.
        segments = url.split("/")
        model_id = segments[3] + "/" + segments[4]
        body = MOCKED_README_CONTENT.get(model_id, "")
        return (200, {}, body)
    return (404, {}, "Not Found")
|
||||
|
||||
|
||||
@responses.activate
def test_load_models_with_readme() -> None:
    """Tests loading models along with their README content."""
    # Register mocks for both endpoints the loader will hit; a single
    # callback dispatches on the requested URL.
    for url, mime in (
        ("https://huggingface.co/api/models", "application/json"),
        # Use a regex or update this placeholder
        ("https://huggingface.co/microsoft/phi-2/raw/main/README.md", "text/plain"),
    ):
        responses.add_callback(
            responses.GET,
            url,
            callback=response_callback,  # type: ignore
            content_type=mime,
        )

    docs = HuggingFaceModelLoader(search="phi-2", limit=2).load()

    # One Document per mocked model, README as content, metadata intact.
    assert len(docs) == len(MOCKED_MODELS_RESPONSE)
    for doc, expected in zip(docs, MOCKED_MODELS_RESPONSE):
        model_id = expected["id"]
        assert isinstance(model_id, str)
        assert doc.page_content == MOCKED_README_CONTENT[model_id]
        assert doc.metadata["modelId"] == expected["id"]
|
@ -78,6 +78,7 @@ EXPECTED_ALL = [
|
||||
"GutenbergLoader",
|
||||
"HNLoader",
|
||||
"HuggingFaceDatasetLoader",
|
||||
"HuggingFaceModelLoader",
|
||||
"IFixitLoader",
|
||||
"IMSDbLoader",
|
||||
"ImageCaptionLoader",
|
||||
|
Loading…
Reference in New Issue
Block a user