mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-29 04:16:02 +00:00
Updated docs for the BM25 preprocessing function (#28101)
- [x] **PR title**: "docs: add explanation for preprocessing function"
- [x] **PR message**:
  - **Description:** Extending the BM25 description and demonstrating the preprocessing function
  - **Dependencies:** nltk
  - **Twitter handle:** @kirilbuga @efriis @baskaryan @vbarda @ccurme

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
parent
06fafc6651
commit
ec205fcee0
@ -15,7 +15,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a801b57c",
|
||||
"id": "eccbbc4a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -24,9 +24,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"id": "393ac030",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-11-13T23:35:51.348359Z",
|
||||
"start_time": "2024-11-13T23:35:49.409254Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
@ -44,9 +48,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 4,
|
||||
"id": "98b1c017",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-11-13T23:35:53.096938Z",
|
||||
"start_time": "2024-11-13T23:35:52.493243Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
@ -66,9 +74,14 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 5,
|
||||
"id": "53af4f00",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-11-13T23:35:54.202737Z",
|
||||
"start_time": "2024-11-13T23:35:54.198431Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_core.documents import Document\n",
|
||||
@ -96,9 +109,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 6,
|
||||
"id": "c0455218",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-11-13T23:35:55.643026Z",
|
||||
"start_time": "2024-11-13T23:35:55.595272Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
@ -108,22 +125,26 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"id": "7dfa5c29",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-11-13T23:35:56.122327Z",
|
||||
"start_time": "2024-11-13T23:35:56.112647Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='foo', metadata={}),\n",
|
||||
" Document(page_content='foo bar', metadata={}),\n",
|
||||
" Document(page_content='hello', metadata={}),\n",
|
||||
" Document(page_content='world', metadata={})]"
|
||||
"[Document(metadata={}, page_content='foo'),\n",
|
||||
" Document(metadata={}, page_content='foo bar'),\n",
|
||||
" Document(metadata={}, page_content='hello'),\n",
|
||||
" Document(metadata={}, page_content='world')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -132,13 +153,68 @@
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "51043723814c0d68",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Preprocessing Function\n",
|
||||
"Pass a custom preprocessing function to the retriever to improve search results. Tokenizing text at the word level can enhance retrieval, especially when using vector stores like Chroma, Pinecone, or Faiss for chunked documents."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "997aaa8d",
|
||||
"id": "f5fea58b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"import nltk\n",
|
||||
"\n",
|
||||
"nltk.download(\"punkt_tab\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "566fcc801cda5da4",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-11-14T00:40:58.728953Z",
|
||||
"start_time": "2024-11-14T00:40:58.722140Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(metadata={}, page_content='bar'),\n",
|
||||
" Document(metadata={}, page_content='foo bar')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from nltk.tokenize import word_tokenize\n",
|
||||
"\n",
|
||||
"retriever = BM25Retriever.from_documents(\n",
|
||||
" [\n",
|
||||
" Document(page_content=\"foo\"),\n",
|
||||
" Document(page_content=\"bar\"),\n",
|
||||
" Document(page_content=\"world\"),\n",
|
||||
" Document(page_content=\"hello\"),\n",
|
||||
" Document(page_content=\"foo bar\"),\n",
|
||||
" ],\n",
|
||||
" k=2,\n",
|
||||
" preprocess_func=word_tokenize,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"result = retriever.invoke(\"bar\")\n",
|
||||
"result"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
Loading…
Reference in New Issue
Block a user