mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 06:14:37 +00:00
community[minor]: add hugging_face_model document loader (#17323)
- **Description:** add hugging_face_model document loader, - **Issue:** NA, - **Dependencies:** NA, --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
b9a495e56e
commit
af35e2525a
@ -118,6 +118,9 @@ from langchain_community.document_loaders.html_bs import BSHTMLLoader
|
||||
from langchain_community.document_loaders.hugging_face_dataset import (
|
||||
HuggingFaceDatasetLoader,
|
||||
)
|
||||
from langchain_community.document_loaders.hugging_face_model import (
|
||||
HuggingFaceModelLoader,
|
||||
)
|
||||
from langchain_community.document_loaders.ifixit import IFixitLoader
|
||||
from langchain_community.document_loaders.image import UnstructuredImageLoader
|
||||
from langchain_community.document_loaders.image_captions import ImageCaptionLoader
|
||||
@ -315,6 +318,7 @@ __all__ = [
|
||||
"GutenbergLoader",
|
||||
"HNLoader",
|
||||
"HuggingFaceDatasetLoader",
|
||||
"HuggingFaceModelLoader",
|
||||
"IFixitLoader",
|
||||
"IMSDbLoader",
|
||||
"ImageCaptionLoader",
|
||||
|
@ -0,0 +1,112 @@
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
import requests
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class HuggingFaceModelLoader(BaseLoader):
    """
    Load model information from `Hugging Face Hub`, including README content.

    This loader interfaces with the Hugging Face Models API to fetch and load
    model metadata and README files.
    The API allows you to search and filter models based on specific criteria
    such as model tags, authors, and more.

    API URL: https://huggingface.co/api/models
    DOC URL: https://huggingface.co/docs/hub/en/api

    Examples:

        .. code-block:: python

            from langchain_community.document_loaders import HuggingFaceModelLoader

            # Initialize the loader with search criteria
            loader = HuggingFaceModelLoader(search="bert", limit=10)

            # Load models
            documents = loader.load()

            # Iterate through the fetched documents
            for doc in documents:
                print(doc.page_content)  # README content of the model
                print(doc.metadata)  # Metadata of the model
    """

    BASE_URL = "https://huggingface.co/api/models"
    README_BASE_URL = "https://huggingface.co/{model_id}/raw/main/README.md"
    # Seconds before an unresponsive Hub request is aborted. Without an
    # explicit timeout, requests.get() can block indefinitely on a stalled
    # connection, hanging load()/lazy_load().
    REQUEST_TIMEOUT = 30

    def __init__(
        self,
        *,
        search: Optional[str] = None,
        author: Optional[str] = None,
        filter: Optional[str] = None,
        sort: Optional[str] = None,
        direction: Optional[str] = None,
        limit: Optional[int] = 3,
        full: Optional[bool] = None,
        config: Optional[bool] = None,
    ):
        """Initialize the HuggingFaceModelLoader.

        Args:
            search: Filter based on substrings for repos and their usernames.
            author: Filter models by an author or organization.
            filter: Filter based on tags.
            sort: Property to use when sorting.
            direction: Direction in which to sort.
            limit: Limit the number of models fetched.
            full: Whether to fetch most model data.
            config: Whether to also fetch the repo config.
        """
        # None entries are kept here and dropped at request time, so the
        # query string only carries parameters the caller actually set.
        self.params = {
            "search": search,
            "author": author,
            "filter": filter,
            "sort": sort,
            "direction": direction,
            "limit": limit,
            "full": full,
            "config": config,
        }

    def fetch_models(self) -> List[dict]:
        """Fetch model information from Hugging Face Hub.

        Returns:
            The parsed JSON response: a list of model-metadata dicts.

        Raises:
            requests.HTTPError: If the API responds with an error status.
            requests.Timeout: If the API does not answer within
                ``REQUEST_TIMEOUT`` seconds.
        """
        response = requests.get(
            self.BASE_URL,
            params={k: v for k, v in self.params.items() if v is not None},
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def fetch_readme_content(self, model_id: str) -> str:
        """Fetch the README content for a given model.

        Args:
            model_id: Hub repository id, e.g. ``"microsoft/phi-2"``.

        Returns:
            The raw README text, or a placeholder message when the request
            fails for any reason (missing README, network error, timeout).
        """
        readme_url = self.README_BASE_URL.format(model_id=model_id)
        try:
            response = requests.get(readme_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            # Best-effort: a missing or unreachable README must not abort
            # the whole load, so fall back to a sentinel string.
            return "README not available for this model."

    def lazy_load(self) -> Iterator[Document]:
        """Load model information lazily, including README content.

        Yields one ``Document`` per model: the README text as
        ``page_content`` and the full metadata dict as ``metadata``.
        """
        models = self.fetch_models()

        for model in models:
            model_id = model.get("modelId", "")
            readme_content = self.fetch_readme_content(model_id)

            yield Document(
                page_content=readme_content,
                metadata=model,
            )

    def load(self) -> List[Document]:
        """Load model information, including README content."""
        return list(self.lazy_load())
|
@ -0,0 +1,84 @@
|
||||
import json
|
||||
from typing import Tuple
|
||||
|
||||
import responses
|
||||
from requests import Request
|
||||
|
||||
from langchain_community.document_loaders import HuggingFaceModelLoader
|
||||
|
||||
# Mocked model data to simulate an API response from
# https://huggingface.co/api/models (shape mirrors the real payload;
# "modelId" is the field the loader reads to build the README URL).
MOCKED_MODELS_RESPONSE = [
    {
        "_id": "657a1fff16886e681230c05a",
        "id": "microsoft/phi-2",
        "likes": 2692,
        "private": False,
        "downloads": 546775,
        "tags": [
            "transformers",
            "safetensors",
            "phi",
            "text-generation",
            "nlp",
            "code",
            "custom_code",
            "en",
            "license:mit",
            "autotrain_compatible",
            "endpoints_compatible",
            "has_space",
            "region:us",
        ],
        "pipeline_tag": "text-generation",
        "library_name": "transformers",
        "createdAt": "2023-12-13T21:19:59.000Z",
        "modelId": "microsoft/phi-2",
    },
    # Add additional models as needed
]

# Mocked README content for models, keyed by repo id ("<owner>/<name>").
MOCKED_README_CONTENT = {
    "microsoft/phi-2": "README content for microsoft/phi-2",
    "openai/gpt-3": "README content for openai/gpt-3",
}
|
||||
|
||||
|
||||
def response_callback(request: Request) -> Tuple[int, dict, str]:
    """Route a mocked HTTP GET to the appropriate canned payload.

    Returns a ``(status, headers, body)`` triple as expected by
    ``responses.add_callback``.
    """
    url = request.url
    if "/api/models" in url:
        # Model-search endpoint -> JSON list of mocked models.
        return (200, {}, json.dumps(MOCKED_MODELS_RESPONSE))
    if "README.md" in url:
        # README endpoint: the repo id is the "<owner>/<name>" pair held in
        # path segments 3 and 4 of the URL.
        segments = url.split("/")
        model_id = segments[3] + "/" + segments[4]
        body = MOCKED_README_CONTENT.get(model_id, "")
        return (200, {}, body)
    return (404, {}, "Not Found")
|
||||
|
||||
|
||||
@responses.activate
def test_load_models_with_readme() -> None:
    """Tests loading models along with their README content."""
    # Register mocks for both endpoints the loader will hit; a single
    # callback dispatches on the requested URL.
    for url, mime in (
        ("https://huggingface.co/api/models", "application/json"),
        # Use a regex or update this placeholder
        ("https://huggingface.co/microsoft/phi-2/raw/main/README.md", "text/plain"),
    ):
        responses.add_callback(
            responses.GET,
            url,
            callback=response_callback,  # type: ignore
            content_type=mime,
        )

    docs = HuggingFaceModelLoader(search="phi-2", limit=2).load()

    # One Document per mocked model, README as content, metadata intact.
    assert len(docs) == len(MOCKED_MODELS_RESPONSE)
    for doc, expected in zip(docs, MOCKED_MODELS_RESPONSE):
        model_id = expected["id"]
        assert isinstance(model_id, str)
        assert doc.page_content == MOCKED_README_CONTENT[model_id]
        assert doc.metadata["modelId"] == expected["id"]
|
@ -78,6 +78,7 @@ EXPECTED_ALL = [
|
||||
"GutenbergLoader",
|
||||
"HNLoader",
|
||||
"HuggingFaceDatasetLoader",
|
||||
"HuggingFaceModelLoader",
|
||||
"IFixitLoader",
|
||||
"IMSDbLoader",
|
||||
"ImageCaptionLoader",
|
||||
|
Loading…
Reference in New Issue
Block a user