Updated docs for the BM25 preprocessing function (#28101)

- [x] **PR title**: "docs: add explanation for preprocessing function" - [x] **PR message**: - **Description:** Extending the BM25 description and demonstrating the preprocessing function - **Dependencies:** nltk - **Twitter handle:** @kirilbuga @efriis @baskaryan @vbarda @ccurme --------- Co-authored-by: Chester Curme <chester.curme@gmail.com>
2025-08-30 17:29:56 +00:00 · 2024-11-26 20:59:15 +01:00 · 2024-11-26 20:59:15 +01:00 · ec205fcee0
commit ec205fcee0
parent 06fafc6651
1 changed files with 90 additions and 14 deletions
--- a/docs/docs/integrations/retrievers/bm25.ipynb
+++ b/docs/docs/integrations/retrievers/bm25.ipynb
@ -15,7 +15,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "a801b57c",
+   "id": "eccbbc4a",
   "metadata": {},
   "outputs": [],
   "source": [
@ -24,9 +24,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "id": "393ac030",
   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:51.348359Z",
+     "start_time": "2024-11-13T23:35:49.409254Z"
+    },
    "tags": []
   },
   "outputs": [],
@ -44,9 +48,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
   "id": "98b1c017",
   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:53.096938Z",
+     "start_time": "2024-11-13T23:35:52.493243Z"
+    },
    "tags": []
   },
   "outputs": [],
@ -66,9 +74,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
   "id": "53af4f00",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:54.202737Z",
+     "start_time": "2024-11-13T23:35:54.198431Z"
+    }
+   },
   "outputs": [],
   "source": [
    "from langchain_core.documents import Document\n",
@ -96,9 +109,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
   "id": "c0455218",
   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:55.643026Z",
+     "start_time": "2024-11-13T23:35:55.595272Z"
+    },
    "tags": []
   },
   "outputs": [],
@ -108,22 +125,26 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
   "id": "7dfa5c29",
   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-13T23:35:56.122327Z",
+     "start_time": "2024-11-13T23:35:56.112647Z"
+    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "[Document(page_content='foo', metadata={}),\n",
-       " Document(page_content='foo bar', metadata={}),\n",
-       " Document(page_content='hello', metadata={}),\n",
-       " Document(page_content='world', metadata={})]"
+       "[Document(metadata={}, page_content='foo'),\n",
+       " Document(metadata={}, page_content='foo bar'),\n",
+       " Document(metadata={}, page_content='hello'),\n",
+       " Document(metadata={}, page_content='world')]"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -132,13 +153,68 @@
    "result"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "51043723814c0d68",
+   "metadata": {},
+   "source": [
+    "## Preprocessing Function\n",
+    "Pass a custom preprocessing function to the retriever via the `preprocess_func` argument to improve search results. Tokenizing text at the word level (for example, with NLTK's `word_tokenize`) can enhance retrieval, especially when the documents are chunks such as those typically stored in vector stores like Chroma, Pinecone, or Faiss."
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "997aaa8d",
+   "id": "f5fea58b",
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "import nltk\n",
+    "\n",
+    "nltk.download(\"punkt_tab\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "566fcc801cda5da4",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-14T00:40:58.728953Z",
+     "start_time": "2024-11-14T00:40:58.722140Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(metadata={}, page_content='bar'),\n",
+       " Document(metadata={}, page_content='foo bar')]"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "retriever = BM25Retriever.from_documents(\n",
+    "    [\n",
+    "        Document(page_content=\"foo\"),\n",
+    "        Document(page_content=\"bar\"),\n",
+    "        Document(page_content=\"world\"),\n",
+    "        Document(page_content=\"hello\"),\n",
+    "        Document(page_content=\"foo bar\"),\n",
+    "    ],\n",
+    "    k=2,\n",
+    "    preprocess_func=word_tokenize,\n",
+    ")\n",
+    "\n",
+    "result = retriever.invoke(\"bar\")\n",
+    "result"
+   ]
  }
 ],
 "metadata": {