diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py
index 7e362abbe94..10190996e38 100644
--- a/libs/community/langchain_community/document_loaders/__init__.py
+++ b/libs/community/langchain_community/document_loaders/__init__.py
@@ -118,6 +118,9 @@ from langchain_community.document_loaders.html_bs import BSHTMLLoader
 from langchain_community.document_loaders.hugging_face_dataset import (
     HuggingFaceDatasetLoader,
 )
+from langchain_community.document_loaders.hugging_face_model import (
+    HuggingFaceModelLoader,
+)
 from langchain_community.document_loaders.ifixit import IFixitLoader
 from langchain_community.document_loaders.image import UnstructuredImageLoader
 from langchain_community.document_loaders.image_captions import ImageCaptionLoader
@@ -315,6 +318,7 @@ __all__ = [
     "GutenbergLoader",
     "HNLoader",
     "HuggingFaceDatasetLoader",
+    "HuggingFaceModelLoader",
     "IFixitLoader",
     "IMSDbLoader",
     "ImageCaptionLoader",
diff --git a/libs/community/langchain_community/document_loaders/hugging_face_model.py b/libs/community/langchain_community/document_loaders/hugging_face_model.py
new file mode 100644
index 00000000000..07dec204ff7
--- /dev/null
+++ b/libs/community/langchain_community/document_loaders/hugging_face_model.py
@@ -0,0 +1,112 @@
+from typing import Iterator, List, Optional
+
+import requests
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseLoader
+
+
+class HuggingFaceModelLoader(BaseLoader):
+    """
+    Load model information from `Hugging Face Hub`, including README content.
+
+    This loader interfaces with the Hugging Face Models API to fetch and load
+    model metadata and README files.
+    The API allows you to search and filter models based on specific criteria
+    such as model tags, authors, and more.
+
+    API URL: https://huggingface.co/api/models
+    DOC URL: https://huggingface.co/docs/hub/en/api
+
+    Examples:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders import HuggingFaceModelLoader
+
+            # Initialize the loader with search criteria
+            loader = HuggingFaceModelLoader(search="bert", limit=10)
+
+            # Load models
+            documents = loader.load()
+
+            # Iterate through the fetched documents
+            for doc in documents:
+                print(doc.page_content)  # README content of the model
+                print(doc.metadata)  # Metadata of the model
+    """
+
+    BASE_URL = "https://huggingface.co/api/models"
+    README_BASE_URL = "https://huggingface.co/{model_id}/raw/main/README.md"
+
+    def __init__(
+        self,
+        *,
+        search: Optional[str] = None,
+        author: Optional[str] = None,
+        filter: Optional[str] = None,
+        sort: Optional[str] = None,
+        direction: Optional[str] = None,
+        limit: Optional[int] = 3,
+        full: Optional[bool] = None,
+        config: Optional[bool] = None,
+    ):
+        """Initialize the HuggingFaceModelLoader.
+
+        Args:
+            search: Filter based on substrings for repos and their usernames.
+            author: Filter models by an author or organization.
+            filter: Filter based on tags.
+            sort: Property to use when sorting.
+            direction: Direction in which to sort.
+            limit: Limit the number of models fetched.
+            full: Whether to fetch most model data.
+            config: Whether to also fetch the repo config.
+        """
+
+        self.params = {
+            "search": search,
+            "author": author,
+            "filter": filter,
+            "sort": sort,
+            "direction": direction,
+            "limit": limit,
+            "full": full,
+            "config": config,
+        }
+
+    def fetch_models(self) -> List[dict]:
+        """Fetch model information from Hugging Face Hub."""
+        response = requests.get(
+            self.BASE_URL,
+            params={k: v for k, v in self.params.items() if v is not None},
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def fetch_readme_content(self, model_id: str) -> str:
+        """Fetch the README content for a given model."""
+        readme_url = self.README_BASE_URL.format(model_id=model_id)
+        try:
+            response = requests.get(readme_url)
+            response.raise_for_status()
+            return response.text
+        except requests.RequestException:
+            return "README not available for this model."
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Load model information lazily, including README content."""
+        models = self.fetch_models()
+
+        for model in models:
+            model_id = model.get("modelId", "")
+            readme_content = self.fetch_readme_content(model_id)
+
+            yield Document(
+                page_content=readme_content,
+                metadata=model,
+            )
+
+    def load(self) -> List[Document]:
+        """Load model information, including README content."""
+        return list(self.lazy_load())
diff --git a/libs/community/tests/unit_tests/document_loaders/test_hugging_face_model.py b/libs/community/tests/unit_tests/document_loaders/test_hugging_face_model.py
new file mode 100644
index 00000000000..17da8f31d5f
--- /dev/null
+++ b/libs/community/tests/unit_tests/document_loaders/test_hugging_face_model.py
@@ -0,0 +1,84 @@
+import json
+from typing import Tuple
+
+import responses
+from requests import Request
+
+from langchain_community.document_loaders import HuggingFaceModelLoader
+
+# Mocked model data to simulate an API response
+MOCKED_MODELS_RESPONSE = [
+    {
+        "_id": "657a1fff16886e681230c05a",
+        "id": "microsoft/phi-2",
+        "likes": 2692,
+        "private": False,
+        "downloads": 546775,
+        "tags": [
+            "transformers",
+            "safetensors",
+            "phi",
+            "text-generation",
+            "nlp",
+            "code",
+            "custom_code",
+            "en",
+            "license:mit",
+            "autotrain_compatible",
+            "endpoints_compatible",
+            "has_space",
+            "region:us",
+        ],
+        "pipeline_tag": "text-generation",
+        "library_name": "transformers",
+        "createdAt": "2023-12-13T21:19:59.000Z",
+        "modelId": "microsoft/phi-2",
+    },
+    # Add additional models as needed
+]
+
+# Mocked README content for models
+MOCKED_README_CONTENT = {
+    "microsoft/phi-2": "README content for microsoft/phi-2",
+    "openai/gpt-3": "README content for openai/gpt-3",
+}
+
+
+def response_callback(request: Request) -> Tuple[int, dict, str]:
+    if "/api/models" in request.url:
+        return (200, {}, json.dumps(MOCKED_MODELS_RESPONSE))
+    elif "README.md" in request.url:
+        model_id = (
+            request.url.split("/")[3] + "/" + request.url.split("/")[4]
+        )  # Extract model_id
+        content = MOCKED_README_CONTENT.get(model_id, "")
+        return (200, {}, content)
+    return (404, {}, "Not Found")
+
+
+@responses.activate
+def test_load_models_with_readme() -> None:
+    """Tests loading models along with their README content."""
+    responses.add_callback(
+        responses.GET,
+        "https://huggingface.co/api/models",
+        callback=response_callback,  # type: ignore
+        content_type="application/json",
+    )
+    responses.add_callback(
+        responses.GET,
+        # Use a regex or update this placeholder
+        "https://huggingface.co/microsoft/phi-2/raw/main/README.md",
+        callback=response_callback,  # type: ignore
+        content_type="text/plain",
+    )
+
+    loader = HuggingFaceModelLoader(search="phi-2", limit=2)
+    docs = loader.load()
+
+    assert len(docs) == len(MOCKED_MODELS_RESPONSE)
+    for doc, expected_model in zip(docs, MOCKED_MODELS_RESPONSE):
+        id_ = expected_model["id"]
+        assert isinstance(id_, str)
+        assert doc.page_content == MOCKED_README_CONTENT[id_]
+        assert doc.metadata["modelId"] == expected_model["id"]
diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py
index aefd22d4d99..d3c0b3b23d1 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py
@@ -78,6 +78,7 @@ EXPECTED_ALL = [
     "GutenbergLoader",
     "HNLoader",
     "HuggingFaceDatasetLoader",
+    "HuggingFaceModelLoader",
     "IFixitLoader",
     "IMSDbLoader",
     "ImageCaptionLoader",
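
For reference, a minimal usage sketch of the loader added in this diff. It is not part of the patch itself; it assumes live network access to huggingface.co, and the `search`/`author` values are purely illustrative.

    from langchain_community.document_loaders import HuggingFaceModelLoader

    # Fetch up to two models whose repo name or username contains "phi-2",
    # restricted to a single organization (illustrative filter values only).
    loader = HuggingFaceModelLoader(search="phi-2", author="microsoft", limit=2)

    # lazy_load() yields one Document per model: the README text as
    # page_content, and the raw metadata dict from /api/models as metadata.
    for doc in loader.lazy_load():
        print(doc.metadata.get("modelId"))
        print(doc.page_content[:200])  # first 200 characters of the README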