community[minor]: Breebs docs retriever (#16578)

- **Description:** Implementation of breeb retriever with integration tests -> libs/community/tests/integration_tests/retrievers/test_breebs.py and documentation (notebook) -> docs/docs/integrations/retrievers/breebs.ipynb. - **Dependencies:** None
2025-09-24 03:52:10 +00:00 · 2024-02-06 00:51:08 +01:00
parent eb7b05885f
commit 334b6ebdf3
6 changed files with 192 additions and 0 deletions
--- a/docs/docs/integrations/providers/breebs.md
+++ b/docs/docs/integrations/providers/breebs.md
@@ -0,0 +1,24 @@
+# BREEBS (Open Knowledge)
+
+[BREEBS](https://www.breebs.com/) is an open collaborative knowledge platform. 
+Anybody can create a Breeb, a knowledge capsule based on PDFs stored in a Google Drive folder.
+A Breeb can be used by any LLM/chatbot to improve its expertise, reduce hallucinations and give access to sources.
+Behind the scenes, Breebs implements several Retrieval Augmented Generation (RAG) models to seamlessly provide useful context at each iteration.  
+
+## List of available Breebs
+
+To get the full list of Breebs, including their keys (`breeb_key`) and descriptions, see
+https://breebs.promptbreeders.com/web/listbreebs.  
+Dozens of Breebs have already been created by the community and are freely available for use. They cover a wide range of expertise, from organic chemistry to mythology, as well as tips on seduction and decentralized finance.
+
+## Creating a new Breeb
+
+To generate a new Breeb, simply compile PDF files in a publicly shared Google Drive folder and initiate the creation process on the [BREEBS website](https://www.breebs.com/) by clicking the "Create Breeb" button. You can currently include up to 120 files, with a total character limit of 15 million.  
+
+## Retriever
+```python
+from langchain_community.retrievers import BreebsRetriever
+```
+
+## Example
+[See usage example (Retrieval & ConversationalRetrievalChain)](https://python.langchain.com/docs/integrations/retrievers/breebs)
--- a/docs/docs/integrations/retrievers/breebs.ipynb
+++ b/docs/docs/integrations/retrievers/breebs.ipynb
@@ -0,0 +1,95 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# BREEBS (Open Knowledge)\n",
+    "\n",
+    "[BREEBS](https://www.breebs.com/) is an open collaborative knowledge platform. \n",
+    "Anybody can create a Breeb, a knowledge capsule, based on PDFs stored in a Google Drive folder.\n",
+    "A Breeb can be used by any LLM/chatbot to improve its expertise, reduce hallucinations and give access to sources.\n",
+    "Behind the scenes, Breebs implements several Retrieval Augmented Generation (RAG) models to seamlessly provide useful context at each iteration.  \n",
+    "\n",
+    "## List of available Breebs\n",
+    "\n",
+    "To get the full list of Breebs, including their keys (breeb_key) and descriptions, see\n",
+    "https://breebs.promptbreeders.com/web/listbreebs.  \n",
+    "Dozens of Breebs have already been created by the community and are freely available for use. They cover a wide range of expertise, from organic chemistry to mythology, as well as tips on seduction and decentralized finance.\n",
+    "\n",
+    "\n",
+    "## Creating a new Breeb\n",
+    "\n",
+    "To generate a new Breeb, simply compile PDF files in a publicly shared Google Drive folder and initiate the creation process on the [BREEBS website](https://www.breebs.com/) by clicking the \"Create Breeb\" button. You can currently include up to 120 files, with a total character limit of 15 million.  \n",
+    "\n",
+    "## Retriever example\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-12-15T16:05:12.645840Z",
+     "iopub.status.busy": "2023-12-15T16:05:12.645698Z",
+     "iopub.status.idle": "2023-12-15T16:05:14.347548Z",
+     "shell.execute_reply": "2023-12-15T16:05:14.346835Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_community.retrievers import BreebsRetriever"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-12-15T16:05:14.350102Z",
+     "iopub.status.busy": "2023-12-15T16:05:14.349837Z",
+     "iopub.status.idle": "2023-12-15T16:05:29.569958Z",
+     "shell.execute_reply": "2023-12-15T16:05:29.567172Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Document(page_content=\"de poupées• Ladurée - Madeleine• Ladurée - rue Bonaparte• Flamant• Bonnichon Saint Germain• Dinh Van• Léonor Greyl• Berthillon• Christian Louboutin• Patrick Cox• Baby Dior• FNAC Musique - Bastille• FNAC - Saint Lazare• La guinguette pirate• Park Hyatt• Restaurant de Sers• Hilton Arc de Triomphe• Café Barge• Le Celadon• Le Drouant• La Perouse• Cigale Recamier• Ledoyen• Tanjia• Les Muses• Bistrot du Dôme• Avenue Foch• Fontaine Saint-Michel• Funiculaire de Montmartre• Promotrain - Place Blanche• Grand Palais• Hotel de Rohan• Hotel de Sully• Hotel des Ventes Drouot• Institut de France• Place des Invalides• Jardin d'acclimatation• Jardin des plantes Zoo• Jouffroy (passage)• Quartier de La Défense• La Villette (quartier)• Lac Inferieur du Bois de Boulogne• Les Catacombes de Paris• Place du Louvre• Rue Mazarine• Rue Monsieur le Prince11/12/2023 07:51Guide en pdf Paris à imprimer gratuitement.\", metadata={'source': 'https://breebs.promptbreeders.com/breeb?breeb_key=Parivoyage&doc=44d78553-a&page=11', 'score': 1}), Document(page_content=\"cafés et des restaurants situésdans les rues adjacentes. Il y a également une cafétéria dans le musée, qui propose des collations, desboissons et des repas légers.À voir et visiter autour :Le Muséum d'histoire naturelle de Paris est situé àproximité de plusieurs autres attractions populaires, notamment le Jardin des Plantes, la Grande Mosquéede Paris, la Sorbonne et la Bibliothèque nationale de France.Comment y aller en bus, métro, train :LeMuséum d'histoire naturelle de Paris est facilement accessible en transports en commun. Les stations demétro les plus proches sont la station Censier-Daubenton sur la ligne 7 et la station Jussieu sur les lignes 7et 10. Le musée est également accessible en bus, avec plusieurs lignes desservant la zone, telles que leslignes 24, 57, 61, 63, 67, 89 et 91. 
En train, la gare la plus proche est la Gare d'Austerlitz, qui est desserviepar plusieurs lignes, notamment les lignes RER C et les trains intercités. Il est également possible de serendre au musée en utilisant les services de taxis ou de VTC.Plus d'informations :+33140795601,6 euros,Ouverture : 10h - 17h, Week end: 10h - 18h ; Fermeture: Mardi(haut de\", metadata={'source': 'https://breebs.promptbreeders.com/breeb?breeb_key=Parivoyage&doc=44d78553-a&page=403', 'score': 1}), Document(page_content=\"Le célèbre Drugstore des Champs Elysées abrite de nombreuses boutiques dans un décor design. V ouspourrez y découvrir un espace beauté, des expositions éphémères, une pharmacie et des espaces réservésaux plaisirs des sens. A noter la façade d'architecture extérieure en verrePlus d'informations :+33144437900, https://www.publicisdrugstore.com/, Visite libre,(haut de page)• Place du Marché Sainte-CatherinePlace du Marché Sainte-Catherine, Paris, 75008, FR11/12/2023 07:51Guide en pdf Paris à imprimer gratuitement.\\nPage 200 sur 545https://www.cityzeum.com/imprimer-pdf/parisUne place hors de l'agitation de la capitale, où vous découvrirez des petits restaurants au charme certaindans un cadre fort agréable. Terrasses au rendez-vous l'été! Un bar à magie pour couronner le toutPlus d'informations :15-30 euros,(haut de page)• Rue de Lappe, ParisRue de Lappe, Paris, FR\", metadata={'source': 'https://breebs.promptbreeders.com/breeb?breeb_key=Parivoyage&doc=44d78553-a&page=198', 'score': 1}), Document(page_content=\"des visiteurs pour la nature etles attractions du parc. Les visiteurs peuvent prévoir de passer entre 1 à 2 heures pour visiter le parcL'accès au parc Montsouris est gratuit pour tous les visiteurs. Aucune réservation n'est nécessaire pourvisiter le parc. Cependant, pour les visites guidées, il est conseillé de réserver à l'avance pour garantir uneplace. 
Les tarifs pour les visites guidées peuvent varier en fonction de l'organisme proposant la visite.Ensomme, le parc Montsouris est un endroit magniﬁque pour se détendre et proﬁter de la nature en pleincœur de Paris. Avec ses attractions pittoresques, son paysage verdoyant et ses visites guidées, c'est unendroit idéal pour une sortie en famille ou entre amis.Plus d'informations :https://www.parisinfo.com/musee-monument-paris/71218/Parc-Montsouris,Gratuit,Ouverture : 8h/9h - 17h30/21h30(haut de page)• Parc des Buttes Chaumont\", metadata={'source': 'https://breebs.promptbreeders.com/breeb?breeb_key=Parivoyage&doc=44d78553-a&page=291', 'score': 1})]\n"
+     ]
+    }
+   ],
+   "source": [
+    "breeb_key = \"Parivoyage\"\n",
+    "retriever = BreebsRetriever(breeb_key)\n",
+    "documents = retriever.get_relevant_documents(\n",
+    "    \"What are some unique, lesser-known spots to explore in Paris?\"\n",
+    ")\n",
+    "print(documents)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/libs/community/langchain_community/retrievers/init.py
+++ b/libs/community/langchain_community/retrievers/init.py
@@ -25,6 +25,7 @@ from langchain_community.retrievers.azure_cognitive_search import (
 )
 from langchain_community.retrievers.bedrock import AmazonKnowledgeBasesRetriever
 from langchain_community.retrievers.bm25 import BM25Retriever
+from langchain_community.retrievers.breebs import BreebsRetriever
 from langchain_community.retrievers.chaindesk import ChaindeskRetriever
 from langchain_community.retrievers.chatgpt_plugin_retriever import (
    ChatGPTPluginRetriever,
@@ -78,6 +79,7 @@ __all__ = [
    "ArceeRetriever",
    "ArxivRetriever",
    "AzureCognitiveSearchRetriever",
+    "BreebsRetriever",
    "ChatGPTPluginRetriever",
    "ChaindeskRetriever",
    "CohereRagRetriever",
--- a/libs/community/langchain_community/retrievers/breebs.py
+++ b/libs/community/langchain_community/retrievers/breebs.py
@@ -0,0 +1,49 @@
+from typing import List
+
+import requests
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+from langchain_core.documents.base import Document
+from langchain_core.retrievers import BaseRetriever
+
+
+class BreebsRetriever(BaseRetriever):
+    """A retriever class for `Breebs`.
+
+    See https://www.breebs.com/ for more info.
+    Args:
+        breeb_key: The key to trigger the breeb
+        (specialized knowledge pill on a specific topic).
+
+    To retrieve the list of all available Breebs : you can call https://breebs.promptbreeders.com/web/listbreebs
+    """
+
+    breeb_key: str
+    url = "https://breebs.promptbreeders.com/knowledge"
+
+    def __init__(self, breeb_key: str):
+        super().__init__(breeb_key=breeb_key)
+        self.breeb_key = breeb_key
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        """Retrieve context for given query.
+        Note that for time being there is no score."""
+        r = requests.post(
+            self.url,
+            json={
+                "breeb_key": self.breeb_key,
+                "query": query,
+            },
+        )
+        if r.status_code != 200:
+            return []
+        else:
+            chunks = r.json()
+            return [
+                Document(
+                    page_content=chunk["content"],
+                    metadata={"source": chunk["source_url"], "score": 1},
+                )
+                for chunk in chunks
+            ]
--- a/libs/community/tests/integration_tests/retrievers/test_breebs.py
+++ b/libs/community/tests/integration_tests/retrievers/test_breebs.py
@@ -0,0 +1,21 @@
+from typing import List
+
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+from langchain_core.documents import Document
+
+from langchain_community.retrievers.breebs import BreebsRetriever
+
+
+class TestBreebsRetriever:
+    def test_breeb_query(self) -> None:
+        breeb_key = "Parivoyage"
+        query = "What are the best churches to visit in Paris?"
+        breeb_retriever = BreebsRetriever(breeb_key)
+        documents: List[Document] = breeb_retriever._get_relevant_documents(
+            query, run_manager=CallbackManagerForRetrieverRun
+        )
+        assert isinstance(documents, list), "Documents should be a list"
+        for doc in documents:
+            assert doc.page_content, "Document page_content should not be None"
+            assert doc.metadata["source"], "Document metadata should contain 'source'"
+            assert doc.metadata["score"] == 1, "Document score should be equal to 1"
--- a/libs/community/tests/unit_tests/retrievers/test_imports.py
+++ b/libs/community/tests/unit_tests/retrievers/test_imports.py
@@ -6,6 +6,7 @@ EXPECTED_ALL = [
    "ArceeRetriever",
    "ArxivRetriever",
    "AzureCognitiveSearchRetriever",
+    "BreebsRetriever",
    "ChatGPTPluginRetriever",
    "ChaindeskRetriever",
    "CohereRagRetriever",