diff --git a/docs/docs/integrations/retrievers/bm25.ipynb b/docs/docs/integrations/retrievers/bm25.ipynb index 5e0b3fa1984..401031db5fb 100644 --- a/docs/docs/integrations/retrievers/bm25.ipynb +++ b/docs/docs/integrations/retrievers/bm25.ipynb @@ -15,7 +15,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a801b57c", + "id": "eccbbc4a", "metadata": {}, "outputs": [], "source": [ @@ -24,9 +24,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "393ac030", "metadata": { + "ExecuteTime": { + "end_time": "2024-11-13T23:35:51.348359Z", + "start_time": "2024-11-13T23:35:49.409254Z" + }, "tags": [] }, "outputs": [], @@ -44,9 +48,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "98b1c017", "metadata": { + "ExecuteTime": { + "end_time": "2024-11-13T23:35:53.096938Z", + "start_time": "2024-11-13T23:35:52.493243Z" + }, "tags": [] }, "outputs": [], @@ -66,9 +74,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "53af4f00", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-13T23:35:54.202737Z", + "start_time": "2024-11-13T23:35:54.198431Z" + } + }, "outputs": [], "source": [ "from langchain_core.documents import Document\n", @@ -96,9 +109,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "c0455218", "metadata": { + "ExecuteTime": { + "end_time": "2024-11-13T23:35:55.643026Z", + "start_time": "2024-11-13T23:35:55.595272Z" + }, "tags": [] }, "outputs": [], @@ -108,22 +125,26 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "7dfa5c29", "metadata": { + "ExecuteTime": { + "end_time": "2024-11-13T23:35:56.122327Z", + "start_time": "2024-11-13T23:35:56.112647Z" + }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='foo', metadata={}),\n", - " Document(page_content='foo bar', metadata={}),\n", - " Document(page_content='hello', 
metadata={}),\n", - " Document(page_content='world', metadata={})]" + "[Document(metadata={}, page_content='foo'),\n", + " Document(metadata={}, page_content='foo bar'),\n", + " Document(metadata={}, page_content='hello'),\n", + " Document(metadata={}, page_content='world')]" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -132,13 +153,68 @@ "result" ] }, + { + "cell_type": "markdown", + "id": "51043723814c0d68", + "metadata": {}, + "source": [ + "## Preprocessing Function\n", + "Pass a custom preprocessing function to the retriever via `preprocess_func` to improve search results. Tokenizing text at the word level can enhance retrieval, especially for chunked documents such as those typically stored in vector stores like Chroma, Pinecone, or Faiss." + ] + }, { "cell_type": "code", "execution_count": null, - "id": "997aaa8d", + "id": "f5fea58b", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import nltk\n", + "\n", + "nltk.download(\"punkt_tab\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "566fcc801cda5da4", + "metadata": { + "ExecuteTime": { + "end_time": "2024-11-14T00:40:58.728953Z", + "start_time": "2024-11-14T00:40:58.722140Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={}, page_content='bar'),\n", + " Document(metadata={}, page_content='foo bar')]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from nltk.tokenize import word_tokenize\n", + "\n", + "retriever = BM25Retriever.from_documents(\n", + " [\n", + " Document(page_content=\"foo\"),\n", + " Document(page_content=\"bar\"),\n", + " Document(page_content=\"world\"),\n", + " Document(page_content=\"hello\"),\n", + " Document(page_content=\"foo bar\"),\n", + " ],\n", + " k=2,\n", + " preprocess_func=word_tokenize,\n", + ")\n", + "\n", + "result = retriever.invoke(\"bar\")\n", + "result" + ] } ], "metadata": {