From ec205fcee093e6bea3e8a5c36a2e81f9a3fe803b Mon Sep 17 00:00:00 2001
From: Kiril Buga <kirilbuga@gmail.com>
Date: Tue, 26 Nov 2024 20:59:15 +0100
Subject: [PATCH] Updated docs for the BM25 preprocessing function (#28101)

- [x] **PR title**: "docs: add explanation for preprocessing function"


- [x] **PR message**:
    - **Description:** Extend the BM25 retriever docs with a section that
      explains and demonstrates the `preprocess_func` argument
    - **Dependencies:** nltk (used only by the notebook example)
    - **Twitter handle:** @kirilbuga

@efriis
@baskaryan
@vbarda
@ccurme

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
---
 docs/docs/integrations/retrievers/bm25.ipynb | 104 ++++++++++++++++---
 1 file changed, 90 insertions(+), 14 deletions(-)

diff --git a/docs/docs/integrations/retrievers/bm25.ipynb b/docs/docs/integrations/retrievers/bm25.ipynb
index 5e0b3fa1984..401031db5fb 100644
--- a/docs/docs/integrations/retrievers/bm25.ipynb
+++ b/docs/docs/integrations/retrievers/bm25.ipynb
@@ -15,7 +15,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a801b57c",
+   "id": "eccbbc4a",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,9 +24,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "393ac030",
    "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:51.348359Z",
+     "start_time": "2024-11-13T23:35:49.409254Z"
+    },
     "tags": []
    },
    "outputs": [],
@@ -44,9 +48,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "id": "98b1c017",
    "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:53.096938Z",
+     "start_time": "2024-11-13T23:35:52.493243Z"
+    },
     "tags": []
    },
    "outputs": [],
@@ -66,9 +74,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "id": "53af4f00",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:54.202737Z",
+     "start_time": "2024-11-13T23:35:54.198431Z"
+    }
+   },
    "outputs": [],
    "source": [
     "from langchain_core.documents import Document\n",
@@ -96,9 +109,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "id": "c0455218",
    "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:55.643026Z",
+     "start_time": "2024-11-13T23:35:55.595272Z"
+    },
     "tags": []
    },
    "outputs": [],
@@ -108,22 +125,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "id": "7dfa5c29",
    "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:56.122327Z",
+     "start_time": "2024-11-13T23:35:56.112647Z"
+    },
     "tags": []
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[Document(page_content='foo', metadata={}),\n",
-       " Document(page_content='foo bar', metadata={}),\n",
-       " Document(page_content='hello', metadata={}),\n",
-       " Document(page_content='world', metadata={})]"
+       "[Document(metadata={}, page_content='foo'),\n",
+       " Document(metadata={}, page_content='foo bar'),\n",
+       " Document(metadata={}, page_content='hello'),\n",
+       " Document(metadata={}, page_content='world')]"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -132,13 +153,68 @@
     "result"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "51043723814c0d68",
+   "metadata": {},
+   "source": [
+    "## Preprocessing Function\n",
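+    "Pass a custom preprocessing function to the retriever to improve search results. The retriever applies `preprocess_func` to both the documents and the query before scoring; by default it simply splits text on whitespace. Tokenizing text at the word level instead can enhance retrieval, especially when the documents are chunks of longer texts, such as those typically kept in vector stores like Chroma, Pinecone, or Faiss."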
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "997aaa8d",
+   "id": "f5fea58b",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import nltk\n",
+    "\n",
+    "nltk.download(\"punkt_tab\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "566fcc801cda5da4",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-14T00:40:58.728953Z",
+     "start_time": "2024-11-14T00:40:58.722140Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(metadata={}, page_content='bar'),\n",
+       " Document(metadata={}, page_content='foo bar')]"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "retriever = BM25Retriever.from_documents(\n",
+    "    [\n",
+    "        Document(page_content=\"foo\"),\n",
+    "        Document(page_content=\"bar\"),\n",
+    "        Document(page_content=\"world\"),\n",
+    "        Document(page_content=\"hello\"),\n",
+    "        Document(page_content=\"foo bar\"),\n",
+    "    ],\n",
+    "    k=2,\n",
+    "    preprocess_func=word_tokenize,\n",
+    ")\n",
+    "\n",
+    "result = retriever.invoke(\"bar\")\n",
+    "result"
+   ]
   }
  ],
  "metadata": {