Mirror of https://github.com/hwchase17/langchain.git, synced 2026-02-07 09:40:07 +00:00

Compare commits: 63 commits (isaac/crea...langchain-)
Commits (SHA1):

8f3c052db1
29a3b3a711
20fe4deea0
3a55f4bfe9
fea9ff3831
b55f6105c6
4585eaef1b
f337f3ed36
22175738ac
12c3454fd9
e271965d1e
b9bea36dd4
da06d4d7af
5f73c836a6
597be7d501
379803751e
ad18afc3ec
464a525a5a
0f45ac4088
ac41c97d21
aaf788b7cb
47ae06698f
03881c6743
2d6b0bf3e3
ee3955c68c
325068bb53
bff6ca78a2
6878bc39b5
55e66aa40c
9b7db08184
8691a5a37f
4919d5d6df
918e1c8a93
58def6e34d
e787532479
e80b0932ee
9e06991aae
a14e02ab33
378db2e1a5
a197a8e184
0bb54ab9f0
f47b4edcc2
837a3d400b
20b72a044c
70c71efcab
a5a3d28776
2a70a07aad
5ac936a284
3c4652c906
2c6b9e8771
1639ccfd15
ab036c1a4c
3dce2e1d35
c48e99e7f2
8a140ee77c
df357f82ca
236e957abb
199e64d372
1f01c0fd98
884f76e05a
a45337ea07
1318d534af
10e3982b59
.github/scripts/get_min_versions.py (vendored, 12 changes)
@@ -17,6 +17,8 @@ MIN_VERSION_LIBS = [
     "SQLAlchemy",
 ]
 
+SKIP_IF_PULL_REQUEST = ["langchain-core"]
+
 
 def get_min_version(version: str) -> str:
     # base regex for x.x.x with cases for rc/post/etc
@@ -43,7 +45,7 @@ def get_min_version(version: str) -> str:
     raise ValueError(f"Unrecognized version format: {version}")
 
 
-def get_min_version_from_toml(toml_path: str):
+def get_min_version_from_toml(toml_path: str, versions_for: str):
     # Parse the TOML file
     with open(toml_path, "rb") as file:
         toml_data = tomllib.load(file)
@@ -56,6 +58,10 @@ def get_min_version_from_toml(toml_path: str):
 
     # Iterate over the libs in MIN_VERSION_LIBS
     for lib in MIN_VERSION_LIBS:
+        if versions_for == "pull_request" and lib in SKIP_IF_PULL_REQUEST:
+            # some libs only get checked on release because of simultaneous
+            # changes
+            continue
        # Check if the lib is present in the dependencies
        if lib in dependencies:
            # Get the version string
@@ -76,8 +82,10 @@ def get_min_version_from_toml(toml_path: str):
 if __name__ == "__main__":
     # Get the TOML file path from the command line argument
     toml_file = sys.argv[1]
+    versions_for = sys.argv[2]
+    assert versions_for in ["release", "pull_request"]
 
     # Call the function to get the minimum versions
-    min_versions = get_min_version_from_toml(toml_file)
+    min_versions = get_min_version_from_toml(toml_file, versions_for)
 
     print(" ".join([f"{lib}=={version}" for lib, version in min_versions.items()]))
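Taken together, the script now takes a second positional argument selecting the mode. A minimal sketch of the two invocations (a bare `python` call is assumed here; the workflows below run it through `poetry run`):

```bash
# release mode: check every library in MIN_VERSION_LIBS, including langchain-core
python .github/scripts/get_min_versions.py pyproject.toml release

# pull_request mode: skip libraries listed in SKIP_IF_PULL_REQUEST
python .github/scripts/get_min_versions.py pyproject.toml pull_request
```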
.github/workflows/_release.yml (vendored, 2 changes)
@@ -231,7 +231,7 @@ jobs:
         id: min-version
         run: |
           poetry run pip install packaging
-          min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml)"
+          min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml release)"
           echo "min-versions=$min_versions" >> "$GITHUB_OUTPUT"
           echo "min-versions=$min_versions"
.github/workflows/_test.yml (vendored, 19 changes)
@@ -71,15 +71,16 @@ jobs:
         id: min-version
         run: |
           poetry run pip install packaging tomli
-          min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml)"
+          min_versions="$(poetry run python $GITHUB_WORKSPACE/.github/scripts/get_min_versions.py pyproject.toml pull_request)"
           echo "min-versions=$min_versions" >> "$GITHUB_OUTPUT"
           echo "min-versions=$min_versions"
 
-      - name: Run unit tests with minimum dependency versions
-        if: ${{ steps.min-version.outputs.min-versions != '' }}
-        env:
-          MIN_VERSIONS: ${{ steps.min-version.outputs.min-versions }}
-        run: |
-          poetry run pip install --force-reinstall $MIN_VERSIONS --editable .
-          make tests
-        working-directory: ${{ inputs.working-directory }}
+      # Temporarily disabled until we can get the minimum versions working
+      # - name: Run unit tests with minimum dependency versions
+      #   if: ${{ steps.min-version.outputs.min-versions != '' }}
+      #   env:
+      #     MIN_VERSIONS: ${{ steps.min-version.outputs.min-versions }}
+      #   run: |
+      #     poetry run pip install --force-reinstall $MIN_VERSIONS --editable .
+      #     make tests
+      #   working-directory: ${{ inputs.working-directory }}
@@ -38,6 +38,8 @@ generate-files:
 	$(PYTHON) scripts/model_feat_table.py $(INTERMEDIATE_DIR)
+	$(PYTHON) scripts/tool_feat_table.py $(INTERMEDIATE_DIR)
+	$(PYTHON) scripts/document_loader_feat_table.py $(INTERMEDIATE_DIR)
 	$(PYTHON) scripts/copy_templates.py $(INTERMEDIATE_DIR)
@@ -267,9 +267,9 @@
     "We first instantiate a chat model that supports [tool calling](/docs/how_to/tool_calling/):\n",
     "\n",
     "```{=mdx}\n",
-    "<ChatModelTabs\n",
-    "  customVarName=\"llm\"\n",
-    "/>\n",
+    "import ChatModelTabs from \"@theme/ChatModelTabs\";\n",
+    "\n",
+    "<ChatModelTabs customVarName=\"llm\" />\n",
     "```"
    ]
   },
@@ -541,7 +541,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.10.4"
   }
  },
 "nbformat": 4,
@@ -81,7 +81,6 @@ These are the core building blocks you can use when building applications.
 - [How to: stream a response back](/docs/how_to/chat_streaming)
 - [How to: track token usage](/docs/how_to/chat_token_usage_tracking)
 - [How to: track response metadata across providers](/docs/how_to/response_metadata)
 - [How to: let your end users choose their model](/docs/how_to/chat_models_universal_init/)
 - [How to: use chat model to call tools](/docs/how_to/tool_calling)
 - [How to: stream tool calls](/docs/how_to/tool_streaming)
 - [How to: few shot prompt tool behavior](/docs/how_to/tools_few_shot)
@@ -284,17 +284,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 1,
    "id": "173e1a9c-2a18-4669-b0de-136f39197786",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "\"Arr, matey! I be sailin' the high seas with me crew, searchin' for buried treasure and adventure! How be ye doin' on this fine day?\""
+       "\"Arrr, I be doin' well, me heartie! Just sailin' the high seas in search of treasure and adventure. How be ye?\""
       ]
      },
-     "execution_count": 8,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -316,14 +316,20 @@
     "\n",
     "history = InMemoryChatMessageHistory()\n",
     "\n",
+    "\n",
+    "def get_history():\n",
+    "    return history\n",
+    "\n",
+    "\n",
     "chain = prompt | ChatOpenAI() | StrOutputParser()\n",
     "\n",
-    "wrapped_chain = RunnableWithMessageHistory(chain, lambda x: history)\n",
+    "wrapped_chain = RunnableWithMessageHistory(\n",
+    "    chain,\n",
+    "    get_history,\n",
+    "    history_messages_key=\"chat_history\",\n",
+    ")\n",
     "\n",
-    "wrapped_chain.invoke(\n",
-    "    {\"input\": \"how are you?\"},\n",
-    "    config={\"configurable\": {\"session_id\": \"42\"}},\n",
-    ")"
+    "wrapped_chain.invoke({\"input\": \"how are you?\"})"
    ]
   },
   {
@@ -340,17 +346,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "id": "4e05994f-1fbc-4699-bf2e-62cb0e4deeb8",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "AIMessage(content=\"Ahoy there! What be ye wantin' from this old pirate?\", response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 29, 'total_tokens': 44}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-1846d5f5-0dda-43b6-bb49-864e541f9c29-0', usage_metadata={'input_tokens': 29, 'output_tokens': 15, 'total_tokens': 44})"
+       "'Ahoy matey! What can this old pirate do for ye today?'"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -370,9 +376,16 @@
     "\n",
     "chain = prompt | ChatOpenAI() | StrOutputParser()\n",
     "\n",
-    "wrapped_chain = RunnableWithMessageHistory(chain, get_session_history)\n",
+    "wrapped_chain = RunnableWithMessageHistory(\n",
+    "    chain,\n",
+    "    get_session_history,\n",
+    "    history_messages_key=\"chat_history\",\n",
+    ")\n",
     "\n",
-    "wrapped_chain.invoke(\"Hello!\", config={\"configurable\": {\"session_id\": \"abc123\"}})"
+    "wrapped_chain.invoke(\n",
+    "    {\"input\": \"Hello!\"},\n",
+    "    config={\"configurable\": {\"session_id\": \"abc123\"}},\n",
+    ")"
    ]
   },
   {
@@ -790,7 +803,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.5"
+   "version": "3.10.4"
   }
  },
 "nbformat": 4,
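Pulled out of the notebook JSON, the updated pattern reads more clearly as plain Python. A minimal sketch, assuming an OpenAI API key is set and a prompt with a `chat_history` placeholder (the prompt itself is not part of these hunks):

```python
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI

# Assumed prompt: the hunks above only show the chain and history wiring.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant."),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

history = InMemoryChatMessageHistory()


def get_history():
    # Zero-argument factory returning the single shared history object.
    return history


chain = prompt | ChatOpenAI() | StrOutputParser()

# history_messages_key names the prompt variable that receives the
# accumulated messages.
wrapped_chain = RunnableWithMessageHistory(
    chain,
    get_history,
    history_messages_key="chat_history",
)

print(wrapped_chain.invoke({"input": "how are you?"}))
```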
@@ -110,7 +110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "cb09c344-1836-4e0c-acf8-11d13ac1dbae",
    "metadata": {},
    "outputs": [],
@@ -134,18 +134,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "id": "62e0dbc3",
    "metadata": {
     "tags": []
    },
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "AIMessage(content='Je adore le programmation.\\n\\n(Note: \"programmation\" is the feminine form of the noun in French, but if you want to use the masculine form, it would be \"le programme\" instead.)' response_metadata={'model': 'llama3', 'created_at': '2024-07-04T04:20:28.138164Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 1943337750, 'load_duration': 1128875, 'prompt_eval_count': 33, 'prompt_eval_duration': 322813000, 'eval_count': 43, 'eval_duration': 1618213000} id='run-ed8c17ab-7fc2-4c90-a88a-f6273b49bc78-0')\n"
-     ]
+     "data": {
+      "text/plain": [
+       "AIMessage(content='Je adore le programmation.\\n\\n(Note: \"programmation\" is not commonly used in French, but I translated it as \"le programmation\" to maintain the same grammatical structure and meaning as the original English sentence.)', response_metadata={'model': 'llama3', 'created_at': '2024-07-22T17:43:54.731273Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 11094839375, 'load_duration': 10121854667, 'prompt_eval_count': 36, 'prompt_eval_duration': 146569000, 'eval_count': 46, 'eval_duration': 816593000}, id='run-befccbdc-e1f9-42a9-85cf-e69b926d6b8b-0', usage_metadata={'input_tokens': 36, 'output_tokens': 46, 'total_tokens': 82})"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -164,7 +167,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "id": "d86145b3-bfef-46e8-b227-4dda5c9c2705",
    "metadata": {},
    "outputs": [
@@ -174,7 +177,7 @@
     "text": [
      "Je adore le programmation.\n",
      "\n",
-     "(Note: \"programmation\" is the feminine form of the noun in French, but if you want to use the masculine form, it would be \"le programme\" instead.)\n"
+     "(Note: \"programmation\" is not commonly used in French, but I translated it as \"le programmation\" to maintain the same grammatical structure and meaning as the original English sentence.)\n"
     ]
    }
   ],
@@ -232,6 +235,86 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "0f51345d-0a9d-43f1-8fca-d0662cb8e21b",
+   "metadata": {},
+   "source": [
+    "## Tool calling\n",
+    "\n",
+    "We can use [tool calling](https://blog.langchain.dev/improving-core-tool-interfaces-and-docs-in-langchain/) with an LLM [that has been fine-tuned for tool use](https://ollama.com/library/llama3-groq-tool-use): \n",
+    "\n",
+    "```\n",
+    "ollama pull llama3-groq-tool-use\n",
+    "```\n",
+    "\n",
+    "We can just pass normal Python functions directly as tools."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "5250bceb-1029-41ff-b447-983518704d88",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'name': 'validate_user',\n",
+       "  'args': {'addresses': ['123 Fake St, Boston MA',\n",
+       "    '234 Pretend Boulevard, Houston TX'],\n",
+       "   'user_id': 123},\n",
+       "  'id': 'fe2148d3-95fb-48e9-845a-4bfecc1f1f96',\n",
+       "  'type': 'tool_call'}]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from typing import List\n",
+    "\n",
+    "from langchain_ollama import ChatOllama\n",
+    "from typing_extensions import TypedDict\n",
+    "\n",
+    "\n",
+    "def validate_user(user_id: int, addresses: List) -> bool:\n",
+    "    \"\"\"Validate user using historical addresses.\n",
+    "\n",
+    "    Args:\n",
+    "        user_id: (int) the user ID.\n",
+    "        addresses: Previous addresses.\n",
+    "    \"\"\"\n",
+    "    return True\n",
+    "\n",
+    "\n",
+    "llm = ChatOllama(\n",
+    "    model=\"llama3-groq-tool-use\",\n",
+    "    temperature=0,\n",
+    ").bind_tools([validate_user])\n",
+    "\n",
+    "result = llm.invoke(\n",
+    "    \"Could you validate user 123? They previously lived at \"\n",
+    "    \"123 Fake St in Boston MA and 234 Pretend Boulevard in \"\n",
+    "    \"Houston TX.\"\n",
+    ")\n",
+    "result.tool_calls"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2bb034ff-218f-4865-afea-3f5e57d3bdee",
+   "metadata": {},
+   "source": [
+    "We look at the LangSmith trace to see that the tool call was performed: \n",
+    "\n",
+    "https://smith.langchain.com/public/4169348a-d6be-45df-a7cf-032f6baa4697/r\n",
+    "\n",
+    "In particular, the trace shows how the tool schema was populated."
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "4c5e0197",
@@ -384,7 +467,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.11.8"
   }
  },
 "nbformat": 4,
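The new cells stop at inspecting `result.tool_calls`. One way to actually execute the selected tool is a plain dispatch over those calls; a hypothetical continuation under the notebook's assumptions (Ollama running locally with llama3-groq-tool-use pulled):

```python
# Continues from the notebook's `validate_user` and `result` definitions above.
for tool_call in result.tool_calls:
    if tool_call["name"] == "validate_user":
        # Invoke the plain Python function with the model-extracted arguments.
        output = validate_user(**tool_call["args"])
        print(tool_call["name"], "->", output)
```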
@@ -6,6 +6,7 @@
    "source": [
     "---\n",
     "sidebar_label: Ollama Functions\n",
+    "sidebar_class_name: hidden\n",
     "---"
    ]
   },
@@ -15,16 +16,16 @@
    "source": [
     "# OllamaFunctions\n",
     "\n",
+    ":::warning\n",
+    "\n",
+    "This was an experimental wrapper that attempts to bolt-on tool calling support to models that do not natively support it. The [primary Ollama integration](/docs/integrations/chat/ollama/) now supports tool calling, and should be used instead.\n",
+    "\n",
+    ":::\n",
     "This notebook shows how to use an experimental wrapper around Ollama that gives it [tool calling capabilities](https://python.langchain.com/v0.2/docs/concepts/#functiontool-calling).\n",
     "\n",
     "Note that more powerful and capable models will perform better with complex schema and/or multiple functions. The examples below use llama3 and phi3 models.\n",
     "For a complete list of supported models and model variants, see the [Ollama model library](https://ollama.ai/library).\n",
     "\n",
-    ":::warning\n",
-    "\n",
-    "This is an experimental wrapper that attempts to bolt-on tool calling support to models that do not natively support it. Use with caution.\n",
-    "\n",
-    ":::\n",
     "## Overview\n",
     "\n",
     "### Integration details\n",
@@ -82,9 +82,9 @@
    "outputs": [],
    "source": [
     "# By default it will use the model which was deployed through the platform\n",
-    "# in my case it will is \"claude-3-haiku\"\n",
+    "# in my case it will is \"gpt-4o\"\n",
     "\n",
-    "chat = ChatPremAI(project_id=8)"
+    "chat = ChatPremAI(project_id=1234, model_name=\"gpt-4o\")"
    ]
   },
   {
@@ -107,7 +107,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "I am an artificial intelligence created by Anthropic. I'm here to help with a wide variety of tasks, from research and analysis to creative projects and open-ended conversation. I have general knowledge and capabilities, but I'm not a real person - I'm an AI assistant. Please let me know if you have any other questions!\n"
+     "I am an AI language model created by OpenAI, designed to assist with answering questions and providing information based on the context provided. How can I help you today?\n"
     ]
    }
   ],
@@ -133,7 +133,7 @@
    {
     "data": {
      "text/plain": [
-      "AIMessage(content=\"I am an artificial intelligence created by Anthropic. My purpose is to assist and converse with humans in a friendly and helpful way. I have a broad knowledge base that I can use to provide information, answer questions, and engage in discussions on a wide range of topics. Please let me know if you have any other questions - I'm here to help!\")"
+      "AIMessage(content=\"I'm your friendly assistant! How can I help you today?\", response_metadata={'document_chunks': [{'repository_id': 1985, 'document_id': 1306, 'chunk_id': 173899, 'document_name': '[D] Difference between sparse and dense informati…', 'similarity_score': 0.3209080100059509, 'content': \"with the difference or anywhere\\nwhere I can read about it?\\n\\n\\n 17 9\\n\\n\\n u/ScotiabankCanada • Promoted\\n\\n\\n Accelerate your study permit process\\n with Scotiabank's Student GIC\\n Program. We're here to help you tur…\\n\\n\\n startright.scotiabank.com Learn More\\n\\n\\n Add a Comment\\n\\n\\nSort by: Best\\n\\n\\n DinosParkour • 1y ago\\n\\n\\n Dense Retrieval (DR) m\"}]}, id='run-510bbd0e-3f8f-4095-9b1f-c2d29fd89719-0')"
     ]
    },
    "execution_count": 5,
@@ -160,10 +160,18 @@
    "execution_count": 6,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/anindya/prem/langchain/libs/community/langchain_community/chat_models/premai.py:355: UserWarning: WARNING: Parameter top_p is not supported in kwargs.\n",
+      "  warnings.warn(f\"WARNING: Parameter {key} is not supported in kwargs.\")\n"
+     ]
+    },
    {
     "data": {
      "text/plain": [
-      "AIMessage(content='I am an artificial intelligence created by Anthropic')"
+      "AIMessage(content=\"Hello! I'm your friendly assistant. How can I\", response_metadata={'document_chunks': [{'repository_id': 1985, 'document_id': 1306, 'chunk_id': 173899, 'document_name': '[D] Difference between sparse and dense informati…', 'similarity_score': 0.3209080100059509, 'content': \"with the difference or anywhere\\nwhere I can read about it?\\n\\n\\n 17 9\\n\\n\\n u/ScotiabankCanada • Promoted\\n\\n\\n Accelerate your study permit process\\n with Scotiabank's Student GIC\\n Program. We're here to help you tur…\\n\\n\\n startright.scotiabank.com Learn More\\n\\n\\n Add a Comment\\n\\n\\nSort by: Best\\n\\n\\n DinosParkour • 1y ago\\n\\n\\n Dense Retrieval (DR) m\"}]}, id='run-c4b06b98-4161-4cca-8495-fd2fc98fa8f8-0')"
     ]
    },
    "execution_count": 6,
@@ -195,13 +203,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
-    "query = \"what is the diameter of individual Galaxy\"\n",
+    "query = \"Which models are used for dense retrieval\"\n",
     "repository_ids = [\n",
     "    1991,\n",
     "    1985,\n",
    "]\n",
    "repositories = dict(ids=repository_ids, similarity_threshold=0.3, limit=3)"
   ]
@@ -219,9 +227,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dense retrieval models typically include:\n",
+      "\n",
+      "1. **BERT-based Models**: Such as DPR (Dense Passage Retrieval) which uses BERT for encoding queries and passages.\n",
+      "2. **ColBERT**: A model that combines BERT with late interaction mechanisms.\n",
+      "3. **ANCE (Approximate Nearest Neighbor Negative Contrastive Estimation)**: Uses BERT and focuses on efficient retrieval.\n",
+      "4. **TCT-ColBERT**: A variant of ColBERT that uses a two-tower\n",
+      "{\n",
+      "  \"document_chunks\": [\n",
+      "    {\n",
+      "      \"repository_id\": 1985,\n",
+      "      \"document_id\": 1306,\n",
+      "      \"chunk_id\": 173899,\n",
+      "      \"document_name\": \"[D] Difference between sparse and dense informati\\u2026\",\n",
+      "      \"similarity_score\": 0.3209080100059509,\n",
+      "      \"content\": \"with the difference or anywhere\\nwhere I can read about it?\\n\\n\\n 17 9\\n\\n\\n u/ScotiabankCanada \\u2022 Promoted\\n\\n\\n Accelerate your study permit process\\n with Scotiabank's Student GIC\\n Program. We're here to help you tur\\u2026\\n\\n\\n startright.scotiabank.com Learn More\\n\\n\\n Add a Comment\\n\\n\\nSort by: Best\\n\\n\\n DinosParkour \\u2022 1y ago\\n\\n\\n Dense Retrieval (DR) m\"\n",
+      "    }\n",
+      "  ]\n",
+      "}\n"
+     ]
+    }
+   ],
   "source": [
    "import json\n",
    "\n",
@@ -262,7 +295,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -288,7 +321,7 @@
    "outputs": [],
    "source": [
     "template_id = \"78069ce8-xxxxx-xxxxx-xxxx-xxx\"\n",
-    "response = chat.invoke([human_message], template_id=template_id)\n",
+    "response = chat.invoke([human_messages], template_id=template_id)\n",
     "print(response.content)"
    ]
   },
@@ -310,14 +343,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "Hello! As an AI language model, I don't have feelings or a physical state, but I'm functioning properly and ready to assist you with any questions or tasks you might have. How can I help you today?"
+     "It looks like your message got cut off. If you need information about Dense Retrieval (DR) or any other topic, please provide more details or clarify your question."
     ]
    }
   ],
@@ -338,14 +371,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "Hello! As an AI language model, I don't have feelings or a physical form, but I'm functioning properly and ready to assist you. How can I help you today?"
+     "Woof! 🐾 How can I help you today? Want to play fetch or maybe go for a walk 🐶🦴"
     ]
    }
   ],
@@ -365,6 +398,275 @@
    "    sys.stdout.write(chunk.content)\n",
    "    sys.stdout.flush()"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "### Tool/Function Calling\n",
+   "\n",
+   "LangChain PremAI supports tool/function calling. Tool/function calling allows a model to respond to a given prompt by generating output that matches a user-defined schema. \n",
+   "\n",
+   "- You can learn all about tool calling in details [in our documentation here](https://docs.premai.io/get-started/function-calling).\n",
+   "- You can learn more about langchain tool calling in [this part of the docs](https://python.langchain.com/v0.1/docs/modules/model_io/chat/function_calling).\n",
+   "\n",
+   "**NOTE:**\n",
+   "The current version of LangChain ChatPremAI do not support function/tool calling with streaming support. Streaming support along with function calling will come soon. \n",
+   "\n",
+   "#### Passing tools to model\n",
+   "\n",
+   "In order to pass tools and let the LLM choose the tool it needs to call, we need to pass a tool schema. A tool schema is the function definition along with proper docstring on what does the function do, what each argument of the function is etc. Below are some simple arithmetic functions with their schema. \n",
+   "\n",
+   "**NOTE:** When defining function/tool schema, do not forget to add information around the function arguments, otherwise it would throw error."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 19,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "from langchain_core.pydantic_v1 import BaseModel, Field\n",
+   "from langchain_core.tools import tool\n",
+   "\n",
+   "\n",
+   "# Define the schema for function arguments\n",
+   "class OperationInput(BaseModel):\n",
+   "    a: int = Field(description=\"First number\")\n",
+   "    b: int = Field(description=\"Second number\")\n",
+   "\n",
+   "\n",
+   "# Now define the function where schema for argument will be OperationInput\n",
+   "@tool(\"add\", args_schema=OperationInput, return_direct=True)\n",
+   "def add(a: int, b: int) -> int:\n",
+   "    \"\"\"Adds a and b.\n",
+   "\n",
+   "    Args:\n",
+   "        a: first int\n",
+   "        b: second int\n",
+   "    \"\"\"\n",
+   "    return a + b\n",
+   "\n",
+   "\n",
+   "@tool(\"multiply\", args_schema=OperationInput, return_direct=True)\n",
+   "def multiply(a: int, b: int) -> int:\n",
+   "    \"\"\"Multiplies a and b.\n",
+   "\n",
+   "    Args:\n",
+   "        a: first int\n",
+   "        b: second int\n",
+   "    \"\"\"\n",
+   "    return a * b"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "#### Binding tool schemas with our LLM\n",
+   "\n",
+   "We will now use the `bind_tools` method to convert our above functions to a \"tool\" and binding it with the model. This means we are going to pass these tool informations everytime we invoke the model. "
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 20,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "tools = [add, multiply]\n",
+   "llm_with_tools = chat.bind_tools(tools)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "After this, we get the response from the model which is now binded with the tools. "
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 21,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "query = \"What is 3 * 12? Also, what is 11 + 49?\"\n",
+   "\n",
+   "messages = [HumanMessage(query)]\n",
+   "ai_msg = llm_with_tools.invoke(messages)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "As we can see, when our chat model is binded with tools, then based on the given prompt, it calls the correct set of the tools and sequentially. "
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 27,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "[{'name': 'multiply',\n",
+      "  'args': {'a': 3, 'b': 12},\n",
+      "  'id': 'call_A9FL20u12lz6TpOLaiS6rFa8'},\n",
+      " {'name': 'add',\n",
+      "  'args': {'a': 11, 'b': 49},\n",
+      "  'id': 'call_MPKYGLHbf39csJIyb5BZ9xIk'}]"
+     ]
+    },
+    "execution_count": 27,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "ai_msg.tool_calls"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "We append this message shown above to the LLM which acts as a context and makes the LLM aware that what all functions it has called. "
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 23,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "messages.append(ai_msg)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Since tool calling happens into two phases, where:\n",
+   "\n",
+   "1. in our first call, we gathered all the tools that the LLM decided to tool, so that it can get the result as an added context to give more accurate and hallucination free result. \n",
+   "\n",
+   "2. in our second call, we will parse those set of tools decided by LLM and run them (in our case it will be the functions we defined, with the LLM's extracted arguments) and pass this result to the LLM"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 24,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "from langchain_core.messages import ToolMessage\n",
+   "\n",
+   "for tool_call in ai_msg.tool_calls:\n",
+   "    selected_tool = {\"add\": add, \"multiply\": multiply}[tool_call[\"name\"].lower()]\n",
+   "    tool_output = selected_tool.invoke(tool_call[\"args\"])\n",
+   "    messages.append(ToolMessage(tool_output, tool_call_id=tool_call[\"id\"]))"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Finally, we call the LLM (binded with the tools) with the function response added in it's context. "
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 28,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "The final answers are:\n",
+     "\n",
+     "- 3 * 12 = 36\n",
+     "- 11 + 49 = 60\n"
+    ]
+   }
+  ],
+  "source": [
+   "response = llm_with_tools.invoke(messages)\n",
+   "print(response.content)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "### Defining tool schemas: Pydantic class\n",
+   "\n",
+   "Above we have shown how to define schema using `tool` decorator, however we can equivalently define the schema using Pydantic. Pydantic is useful when your tool inputs are more complex:"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 29,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "from langchain_core.output_parsers.openai_tools import PydanticToolsParser\n",
+   "\n",
+   "\n",
+   "class add(BaseModel):\n",
+   "    \"\"\"Add two integers together.\"\"\"\n",
+   "\n",
+   "    a: int = Field(..., description=\"First integer\")\n",
+   "    b: int = Field(..., description=\"Second integer\")\n",
+   "\n",
+   "\n",
+   "class multiply(BaseModel):\n",
+   "    \"\"\"Multiply two integers together.\"\"\"\n",
+   "\n",
+   "    a: int = Field(..., description=\"First integer\")\n",
+   "    b: int = Field(..., description=\"Second integer\")\n",
+   "\n",
+   "\n",
+   "tools = [add, multiply]"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Now, we can bind them to chat models and directly get the result:"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 30,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/plain": [
+      "[multiply(a=3, b=12), add(a=11, b=49)]"
+     ]
+    },
+    "execution_count": 30,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "chain = llm_with_tools | PydanticToolsParser(tools=[multiply, add])\n",
+   "chain.invoke(query)"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Now, as done above, we parse this and run this functions and call the LLM once again to get the result."
+  ]
+ }
 ],
 "metadata": {
@@ -383,7 +685,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.11.7"
+  "version": "3.9.19"
  }
 },
 "nbformat": 4,
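Extracted from the notebook JSON above, the two-phase tool-calling round trip reads more clearly as plain Python. A sketch under the notebook's assumptions (PREMAI_API_KEY set, and the `chat`, `add`, and `multiply` definitions from the cells above):

```python
from langchain_core.messages import HumanMessage, ToolMessage

llm_with_tools = chat.bind_tools([add, multiply])

# Phase 1: the model decides which tools to call and with what arguments.
messages = [HumanMessage("What is 3 * 12? Also, what is 11 + 49?")]
ai_msg = llm_with_tools.invoke(messages)
messages.append(ai_msg)

# Phase 2: run each selected tool and feed the results back as context.
for tool_call in ai_msg.tool_calls:
    selected_tool = {"add": add, "multiply": multiply}[tool_call["name"].lower()]
    tool_output = selected_tool.invoke(tool_call["args"])
    messages.append(ToolMessage(tool_output, tool_call_id=tool_call["id"]))

# Final call: the model answers with the tool results in its context.
print(llm_with_tools.invoke(messages).content)
```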
docs/docs/integrations/document_loaders/dedoc.ipynb (new file, 484 lines)
@@ -0,0 +1,484 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6b74f73d-1763-42d0-9c24-8f65f445bb72",
   "metadata": {},
   "source": [
    "# Dedoc\n",
    "\n",
    "This sample demonstrates the use of `Dedoc` in combination with `LangChain` as a `DocumentLoader`.\n",
    "\n",
    "## Overview\n",
    "\n",
    "[Dedoc](https://dedoc.readthedocs.io) is an [open-source](https://github.com/ispras/dedoc)\n",
    "library/service that extracts texts, tables, attached files and document structure\n",
    "(e.g., titles, list items, etc.) from files of various formats.\n",
    "\n",
    "`Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more.\n",
    "Full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1).\n",
    "\n",
    "\n",
    "### Integration details\n",
    "\n",
    "| Class | Package | Local | Serializable | JS support |\n",
    "|:-----------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------|:-----:|:------------:|:----------:|\n",
    "| [DedocFileLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.dedoc.DedocFileLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | beta | ❌ |\n",
    "| [DedocPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.DedocPDFLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | beta | ❌ | \n",
    "| [DedocAPIFileLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.dedoc.DedocAPIFileLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | beta | ❌ | \n",
    "\n",
    "\n",
    "### Loader features\n",
    "\n",
    "Methods for lazy loading and async loading are available, but in fact, document loading is executed synchronously.\n",
    "\n",
    "| Source | Document Lazy Loading | Async Support |\n",
    "|:------------------:|:---------------------:|:-------------:| \n",
    "| DedocFileLoader | ❌ | ❌ |\n",
    "| DedocPDFLoader | ❌ | ❌ | \n",
    "| DedocAPIFileLoader | ❌ | ❌ | \n",
    "\n",
    "## Setup\n",
    "\n",
    "* To access `DedocFileLoader` and `DedocPDFLoader` document loaders, you'll need to install the `dedoc` integration package.\n",
    "* To access `DedocAPIFileLoader`, you'll need to run the `Dedoc` service, e.g. `Docker` container (please see [the documentation](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker) \n",
    "for more details):\n",
    "\n",
    "```bash\n",
    "docker pull dedocproject/dedoc\n",
    "docker run -p 1231:1231\n",
    "```\n",
    "\n",
    "`Dedoc` installation instruction is given [here](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "511c109d-a5c3-42ba-914e-5d1b385bc40f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# Install package\n",
    "%pip install --quiet \"dedoc[torch]\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6820c0e9-d56d-4899-b8c8-374760360e2b",
   "metadata": {},
   "source": [
    "## Instantiation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c1f98cae-71ec-4d60-87fb-96c1a76851d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import DedocFileLoader\n",
    "\n",
    "loader = DedocFileLoader(\"./example_data/state_of_the_union.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d7bc2b3-73a0-4cd6-8014-cc7184aa9d4a",
   "metadata": {},
   "source": [
    "## Load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b9097c14-6168-4726-819e-24abb9a63b13",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nMadam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and t'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs = loader.load()\n",
    "docs[0].page_content[:100]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ed8bd46-0047-4ccc-b2d6-beb7761f7312",
   "metadata": {},
   "source": [
    "## Lazy Load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6ae12d7e-8105-4bbe-9031-0e968475f6bf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and t\n"
     ]
    }
   ],
   "source": [
    "docs = loader.lazy_load()\n",
    "\n",
    "for doc in docs:\n",
    "    print(doc.page_content[:100])\n",
    "    break"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8772ae40-6239-4751-bb2d-b4a9415c1ad1",
   "metadata": {},
   "source": [
    "## API reference\n",
    "\n",
    "For detailed information on configuring and calling `Dedoc` loaders, please see the API references: \n",
    "\n",
    "* https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.dedoc.DedocFileLoader.html\n",
    "* https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.DedocPDFLoader.html\n",
    "* https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.dedoc.DedocAPIFileLoader.html"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4d5e702-0e21-4cad-a4c3-b9b3bff77203",
   "metadata": {},
   "source": [
    "## Loading any file\n",
    "\n",
    "For automatic handling of any file in a [supported format](https://dedoc.readthedocs.io/en/latest/#id1),\n",
    "`DedocFileLoader` can be useful.\n",
    "The file loader automatically detects the file type with a correct extension.\n",
    "\n",
    "File parsing process can be configured through `dedoc_kwargs` during the `DedocFileLoader` class initialization.\n",
    "Here the basic examples of some options usage are given, \n",
    "please see the documentation of `DedocFileLoader` and \n",
    "[dedoc documentation](https://dedoc.readthedocs.io/en/latest/parameters/parameters.html) \n",
    "to get more details about configuration parameters."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de97d0ed-d6b1-44e0-b392-1f3d89c762f9",
   "metadata": {},
   "source": [
    "### Basic example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "50ffeeee-db12-4801-b208-7e32ea3d72ad",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nMadam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\n\\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\n\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\n\\n\\nWith a duty to one another to the American people to '"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_community.document_loaders import DedocFileLoader\n",
    "\n",
    "loader = DedocFileLoader(\"./example_data/state_of_the_union.txt\")\n",
    "\n",
    "docs = loader.load()\n",
    "\n",
    "docs[0].page_content[:400]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "457e5d4c-a4ee-4f31-ae74-3f75a1bbd0af",
   "metadata": {},
   "source": [
    "### Modes of split\n",
    "\n",
    "`DedocFileLoader` supports different types of document splitting into parts (each part is returned separately).\n",
    "For this purpose, `split` parameter is used with the following options:\n",
    "* `document` (default value): document text is returned as a single langchain `Document` object (don't split);\n",
    "* `page`: split document text into pages (works for `PDF`, `DJVU`, `PPTX`, `PPT`, `ODP`);\n",
    "* `node`: split document text into `Dedoc` tree nodes (title nodes, list item nodes, raw text nodes);\n",
    "* `line`: split document text into textual lines."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "eec54d31-ae7a-4a3c-aa10-4ae276b1e4c4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loader = DedocFileLoader(\n",
    "    \"./example_data/layout-parser-paper.pdf\",\n",
    "    split=\"page\",\n",
    "    pages=\":2\",\n",
    ")\n",
    "\n",
    "docs = loader.load()\n",
    "\n",
    "len(docs)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "61e11769-4780-4f77-b10e-27db6936f226",
   "metadata": {},
   "source": [
    "### Handling tables\n",
    "\n",
    "`DedocFileLoader` supports tables handling when `with_tables` parameter is \n",
    "set to `True` during loader initialization (`with_tables=True` by default). \n",
    "\n",
    "Tables are not split - each table corresponds to one langchain `Document` object.\n",
    "For tables, `Document` object has additional `metadata` fields `type=\"table\"` \n",
    "and `text_as_html` with table `HTML` representation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "bbeb2f8a-ac5e-4b59-8026-7ea3fc14c928",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('table',\n",
       " '<table border=\"1\" style=\"border-collapse: collapse; width: 100%;\">\\n<tbody>\\n<tr>\\n<td colspan=\"1\" rowspan=\"1\">Team</td>\\n<td colspan=\"1\" rowspan=\"1\"> &quot;Payroll (millions)&quot;</td>\\n<td colspan=\"1\" r')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loader = DedocFileLoader(\"./example_data/mlb_teams_2012.csv\")\n",
    "\n",
    "docs = loader.load()\n",
    "\n",
    "docs[1].metadata[\"type\"], docs[1].metadata[\"text_as_html\"][:200]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4a2b872-2aba-4e4c-8b2f-83a5a81ee1da",
   "metadata": {},
   "source": [
    "### Handling attached files\n",
    "\n",
    "`DedocFileLoader` supports attached files handling when `with_attachments` is set \n",
    "to `True` during loader initialization (`with_attachments=False` by default). \n",
    "\n",
    "Attachments are split according to the `split` parameter.\n",
    "For attachments, langchain `Document` object has an additional metadata \n",
    "field `type=\"attachment\"`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "bb9d6c1c-e24c-4979-88a0-38d54abd6332",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('attachment',\n",
       " '\\nContent-Type\\nmultipart/mixed; boundary=\"0000000000005d654405f082adb7\"\\nDate\\nFri, 23 Dec 2022 12:08:48 -0600\\nFrom\\nMallori Harrell <mallori@unstructured.io>\\nMIME-Version\\n1.0\\nMessage-ID\\n<CAPgNNXSzLVJ-d1OCX_TjFgJU7ugtQrjFybPtAMmmYZzphxNFYg@mail.gmail.com>\\nSubject\\nFake email with attachment\\nTo\\nMallori Harrell <mallori@unstructured.io>')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loader = DedocFileLoader(\n",
    "    \"./example_data/fake-email-attachment.eml\",\n",
    "    with_attachments=True,\n",
    ")\n",
    "\n",
    "docs = loader.load()\n",
    "\n",
    "docs[1].metadata[\"type\"], docs[1].page_content"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d435c3f6-703a-4064-8307-ace140de967a",
   "metadata": {},
   "source": [
    "## Loading PDF file\n",
    "\n",
    "If you want to handle only `PDF` documents, you can use `DedocPDFLoader` with only `PDF` support.\n",
    "The loader supports the same parameters for document split, tables and attachments extraction.\n",
    "\n",
    "`Dedoc` can extract `PDF` with or without a textual layer, \n",
    "as well as automatically detect its presence and correctness.\n",
    "Several `PDF` handlers are available, you can use `pdf_with_text_layer` \n",
    "parameter to choose one of them.\n",
    "Please see [parameters description](https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html) \n",
    "to get more details.\n",
    "\n",
    "For `PDF` without a textual layer, `Tesseract OCR` and its language packages should be installed.\n",
    "In this case, [the instruction](https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html) can be useful."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0103a7f3-6b5e-4444-8f4d-83dd3724a9af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n2\\n\\nZ. Shen et al.\\n\\n37], layout detection [38, 22], table detection [26], and scene text detection [4].\\n\\nA generalized learning-based framework dramatically reduces the need for the\\n\\nmanual specification of complicated rules, which is the status quo with traditional\\n\\nmethods. DL has the potential to transform DIA pipelines and benefit a broad\\n\\nspectrum of large-scale document digitization projects.\\n'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_community.document_loaders import DedocPDFLoader\n",
    "\n",
    "loader = DedocPDFLoader(\n",
    "    \"./example_data/layout-parser-paper.pdf\", pdf_with_text_layer=\"true\", pages=\"2:2\"\n",
    ")\n",
    "\n",
    "docs = loader.load()\n",
    "\n",
    "docs[0].page_content[:400]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13061995-1805-40c2-a77a-a6cd80999e20",
   "metadata": {},
   "source": [
    "## Dedoc API\n",
    "\n",
    "If you want to get up and running with less set up, you can use `Dedoc` as a service.\n",
    "**`DedocAPIFileLoader` can be used without installation of `dedoc` library.**\n",
    "The loader supports the same parameters as `DedocFileLoader` and\n",
    "also automatically detects input file types.\n",
    "\n",
    "To use `DedocAPIFileLoader`, you should run the `Dedoc` service, e.g. `Docker` container (please see [the documentation](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker) \n",
    "for more details):\n",
    "\n",
    "```bash\n",
    "docker pull dedocproject/dedoc\n",
    "docker run -p 1231:1231\n",
    "```\n",
    "\n",
    "Please do not use our demo URL `https://dedoc-readme.hf.space` in your code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "211fc0b5-6080-4974-a6c1-f982bafd87d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nMadam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\n\\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\n\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\n\\n\\nWith a duty to one another to the American people to '"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_community.document_loaders import DedocAPIFileLoader\n",
    "\n",
    "loader = DedocAPIFileLoader(\n",
    "    \"./example_data/state_of_the_union.txt\",\n",
    "    url=\"https://dedoc-readme.hf.space\",\n",
    ")\n",
    "\n",
    "docs = loader.load()\n",
    "\n",
    "docs[0].page_content[:400]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "faaff475-5209-436f-bcde-97d58daed05c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -316,7 +316,7 @@
    "id": "eb00a625-a6c9-4766-b3f0-eaed024851c9",
    "metadata": {},
    "source": [
-    "## Return SQARQL query\n",
+    "## Return SPARQL query\n",
     "You can return the SPARQL query step from the Sparql QA Chain using the `return_sparql_query` parameter"
    ]
   },
@@ -358,7 +358,7 @@
     "\u001b[32;1m\u001b[1;3m[]\u001b[0m\n",
     "\n",
     "\u001b[1m> Finished chain.\u001b[0m\n",
-    "SQARQL query: PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n",
+    "SPARQL query: PREFIX foaf: <http://xmlns.com/foaf/0.1/>\n",
     "SELECT ?workHomepage\n",
     "WHERE {\n",
     "    ?person foaf:name \"Tim Berners-Lee\" .\n",
@@ -370,7 +370,7 @@
    ],
    "source": [
     "result = chain(\"What is Tim Berners-Lee's work homepage?\")\n",
-    "print(f\"SQARQL query: {result['sparql_query']}\")\n",
+    "print(f\"SPARQL query: {result['sparql_query']}\")\n",
     "print(f\"Final answer: {result['result']}\")"
    ]
   },
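For context, a minimal sketch of how `return_sparql_query` is wired up, assuming a graph and LLM are already configured (the chain construction is not part of these hunks, and the import path may vary by version):

```python
from langchain_community.chains.graph_qa.sparql import GraphSparqlQAChain

# `llm` and `graph` (an RdfGraph) are assumed to exist already.
chain = GraphSparqlQAChain.from_llm(
    llm, graph=graph, verbose=True, return_sparql_query=True
)
result = chain("What is Tim Berners-Lee's work homepage?")
print(f"SPARQL query: {result['sparql_query']}")
print(f"Final answer: {result['result']}")
```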
@@ -194,12 +194,37 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "e4a1e0f1",
+   "metadata": {},
+   "source": [
+    "For certain requirements, there is an option to pass the IBM's [`APIClient`](https://ibm.github.io/watsonx-ai-python-sdk/base.html#apiclient) object into the `WatsonxLLM` class."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b28afc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from ibm_watsonx_ai import APIClient\n",
+    "\n",
+    "api_client = APIClient(...)\n",
+    "\n",
+    "watsonx_llm = WatsonxLLM(\n",
+    "    model_id=\"ibm/granite-13b-instruct-v2\",\n",
+    "    watsonx_client=api_client,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "7c4a632b",
    "metadata": {},
    "source": [
-    "You can also pass the IBM's [`ModelInference`](https://ibm.github.io/watsonx-ai-python-sdk/fm_model_inference.html) object into `WatsonxLLM` class."
+    "You can also pass the IBM's [`ModelInference`](https://ibm.github.io/watsonx-ai-python-sdk/fm_model_inference.html) object into the `WatsonxLLM` class."
    ]
   },
   {
@@ -88,6 +88,7 @@
     "        \"max_tokens_to_generate\": 1000,\n",
     "        \"temperature\": 0.01,\n",
     "        \"select_expert\": \"llama-2-7b-chat-hf\",\n",
+    "        \"process_prompt\": False,\n",
     "        # \"stop_sequences\": '\\\"sequence1\\\",\\\"sequence2\\\"',\n",
     "        # \"repetition_penalty\": 1.0,\n",
     "        # \"top_k\": 50,\n",
@@ -116,6 +117,7 @@
     "        \"max_tokens_to_generate\": 1000,\n",
     "        \"temperature\": 0.01,\n",
     "        \"select_expert\": \"llama-2-7b-chat-hf\",\n",
+    "        \"process_prompt\": False,\n",
     "        # \"stop_sequences\": '\\\"sequence1\\\",\\\"sequence2\\\"',\n",
     "        # \"repetition_penalty\": 1.0,\n",
     "        # \"top_k\": 50,\n",
@@ -175,9 +177,7 @@
     "import os\n",
     "\n",
     "sambastudio_base_url = \"<Your SambaStudio environment URL>\"\n",
-    "sambastudio_base_uri = (\n",
-    "    \"<Your SambaStudio endpoint base URI>\"  # optional, \"api/predict/nlp\" set as default\n",
-    ")\n",
+    "sambastudio_base_uri = \"<Your SambaStudio endpoint base URI>\"  # optional, \"api/predict/generic\" set as default\n",
     "sambastudio_project_id = \"<Your SambaStudio project id>\"\n",
     "sambastudio_endpoint_id = \"<Your SambaStudio endpoint id>\"\n",
     "sambastudio_api_key = \"<Your SambaStudio endpoint API key>\"\n",
@@ -271,6 +271,7 @@
     "        \"do_sample\": True,\n",
     "        \"max_tokens_to_generate\": 1000,\n",
     "        \"temperature\": 0.01,\n",
+    "        \"process_prompt\": False,\n",
     "        \"select_expert\": \"Meta-Llama-3-8B-Instruct\",\n",
     "        # \"repetition_penalty\": 1.0,\n",
     "        # \"top_k\": 50,\n",
docs/docs/integrations/providers/dedoc.mdx (new file, 56 lines)
@@ -0,0 +1,56 @@
|
||||
# Dedoc
|
||||
|
||||
>[Dedoc](https://dedoc.readthedocs.io) is an [open-source](https://github.com/ispras/dedoc)
|
||||
library/service that extracts texts, tables, attached files and document structure
|
||||
(e.g., titles, list items, etc.) from files of various formats.
|
||||
|
||||
`Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more.
A full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1).
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
### Dedoc library
|
||||
|
||||
You can install `Dedoc` using `pip`.
In this case, you will need to install dependencies;
please see [the installation guide](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html)
for more information.
|
||||
|
||||
```bash
|
||||
pip install dedoc
|
||||
```
|
||||
|
||||
### Dedoc API
|
||||
|
||||
If you are going to use the `Dedoc` API, you don't need to install the `dedoc` library.
In this case, you should run the `Dedoc` service, e.g. a `Docker` container (please see
[the documentation](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker)
for more details):
|
||||
|
||||
```bash
|
||||
docker pull dedocproject/dedoc
|
||||
docker run -p 1231:1231 dedocproject/dedoc
|
||||
```
|
||||
|
||||
## Document Loader
|
||||
|
||||
* For handling files of any formats (supported by `Dedoc`), you can use `DedocFileLoader`:
|
||||
|
||||
```python
|
||||
from langchain_community.document_loaders import DedocFileLoader
|
||||
```
|
||||
|
||||
* For handling PDF files (with or without a textual layer), you can use `DedocPDFLoader`:
|
||||
|
||||
```python
|
||||
from langchain_community.document_loaders import DedocPDFLoader
|
||||
```
|
||||
|
||||
* For handling files of any formats without library installation,
|
||||
you can use `Dedoc API` with `DedocAPIFileLoader`:
|
||||
|
||||
```python
|
||||
from langchain_community.document_loaders import DedocAPIFileLoader
|
||||
```
|
||||
|
||||
Please see a [usage example](/docs/integrations/document_loaders/dedoc) for more details.
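As a minimal sketch of how these pieces fit together (the file name `example.docx` is an illustrative assumption; the default `split="document"` behavior comes from the `DedocBaseLoader` docstring shown further down this page):

```python
from langchain_community.document_loaders import DedocFileLoader

# With the default split="document", the whole file comes back
# as a single langchain Document object
loader = DedocFileLoader("example.docx")
docs = loader.load()
print(docs[0].page_content[:100])
```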
@@ -38,7 +38,7 @@ import getpass
|
||||
if "PREMAI_API_KEY" not in os.environ:
|
||||
os.environ["PREMAI_API_KEY"] = getpass.getpass("PremAI API Key:")
|
||||
|
||||
chat = ChatPremAI(project_id=8)
|
||||
chat = ChatPremAI(project_id=1234, model_name="gpt-4o")
|
||||
```
|
||||
|
||||
### Chat Completions
|
||||
@@ -50,7 +50,8 @@ The first one will give us a static result. Whereas the second one will stream t
|
||||
```python
|
||||
human_message = HumanMessage(content="Who are you?")
|
||||
|
||||
chat.invoke([human_message])
|
||||
response = chat.invoke([human_message])
|
||||
print(response.content)
|
||||
```
|
||||
|
||||
You can provide a system prompt here like this:
|
||||
@@ -84,8 +85,8 @@ Repositories are also supported in langchain premai. Here is how you can do it.
|
||||
|
||||
```python
|
||||
|
||||
query = "what is the diameter of individual Galaxy"
|
||||
repository_ids = [1991, ]
|
||||
query = "Which models are used for dense retrieval"
|
||||
repository_ids = [1985,]
|
||||
repositories = dict(
|
||||
ids=repository_ids,
|
||||
similarity_threshold=0.3,
|
||||
@@ -100,6 +101,8 @@ First we start by defining our repository with some repository ids. Make sure th
|
||||
Now, we connect the repository with our chat object to invoke RAG based generations.
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
response = chat.invoke(query, max_tokens=100, repositories=repositories)
|
||||
|
||||
print(response.content)
|
||||
@@ -109,25 +112,22 @@ print(json.dumps(response.response_metadata, indent=4))
|
||||
This is what an output looks like.
|
||||
|
||||
```bash
|
||||
The diameters of individual galaxies range from 80,000-150,000 light-years.
|
||||
Dense retrieval models typically include:
|
||||
|
||||
1. **BERT-based Models**: Such as DPR (Dense Passage Retrieval) which uses BERT for encoding queries and passages.
|
||||
2. **ColBERT**: A model that combines BERT with late interaction mechanisms.
|
||||
3. **ANCE (Approximate Nearest Neighbor Negative Contrastive Estimation)**: Uses BERT and focuses on efficient retrieval.
|
||||
4. **TCT-ColBERT**: A variant of ColBERT that uses a two-tower
|
||||
{
|
||||
"document_chunks": [
|
||||
{
|
||||
"repository_id": 19xx,
|
||||
"document_id": 13xx,
|
||||
"chunk_id": 173xxx,
|
||||
"document_name": "Kegy 202 Chapter 2",
|
||||
"similarity_score": 0.586126983165741,
|
||||
"content": "n thousands\n of light-years. The diameters of individual\n galaxies range from 80,000-150,000 light\n "
|
||||
},
|
||||
{
|
||||
"repository_id": 19xx,
|
||||
"document_id": 13xx,
|
||||
"chunk_id": 173xxx,
|
||||
"document_name": "Kegy 202 Chapter 2",
|
||||
"similarity_score": 0.4815782308578491,
|
||||
"content": " for development of galaxies. A galaxy contains\n a large number of stars. Galaxies spread over\n vast distances that are measured in thousands\n "
|
||||
},
|
||||
"repository_id": 1985,
|
||||
"document_id": 1306,
|
||||
"chunk_id": 173899,
|
||||
"document_name": "[D] Difference between sparse and dense informati\u2026",
|
||||
"similarity_score": 0.3209080100059509,
|
||||
"content": "with the difference or anywhere\nwhere I can read about it?\n\n\n 17 9\n\n\n u/ScotiabankCanada \u2022 Promoted\n\n\n Accelerate your study permit process\n with Scotiabank's Student GIC\n Program. We're here to help you tur\u2026\n\n\n startright.scotiabank.com Learn More\n\n\n Add a Comment\n\n\nSort by: Best\n\n\n DinosParkour \u2022 1y ago\n\n\n Dense Retrieval (DR) m"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
@@ -264,4 +264,164 @@ doc_result[:5]
|
||||
0.0008162345038726926,
|
||||
-0.004556538071483374,
|
||||
0.02918623760342598,
|
||||
-0.02547479420900345]
|
||||
-0.02547479420900345]
|
||||
|
||||
## Tool/Function Calling
|
||||
|
||||
LangChain PremAI supports tool/function calling. Tool/function calling allows a model to respond to a given prompt by generating output that matches a user-defined schema.
|
||||
|
||||
- You can learn all about tool calling in detail [in our documentation here](https://docs.premai.io/get-started/function-calling).
|
||||
- You can learn more about langchain tool calling in [this part of the docs](https://python.langchain.com/v0.1/docs/modules/model_io/chat/function_calling).
|
||||
|
||||
**NOTE:**
|
||||
|
||||
> The current version of LangChain ChatPremAI does not support function/tool calling with streaming. Streaming support along with function calling is coming soon.
|
||||
|
||||
### Passing tools to model
|
||||
|
||||
In order to pass tools and let the LLM choose the tool it needs to call, we need to pass a tool schema. A tool schema is the function definition along with a proper docstring describing what the function does, what each of its arguments is, etc. Below are some simple arithmetic functions with their schemas.
|
||||
|
||||
**NOTE:**
|
||||
> When defining a function/tool schema, do not forget to add information about the function arguments; otherwise it will throw an error.
|
||||
|
||||
```python
|
||||
from langchain_core.tools import tool
|
||||
from langchain_core.pydantic_v1 import BaseModel, Field
|
||||
|
||||
# Define the schema for function arguments
|
||||
class OperationInput(BaseModel):
|
||||
a: int = Field(description="First number")
|
||||
b: int = Field(description="Second number")
|
||||
|
||||
|
||||
# Now define the function where schema for argument will be OperationInput
|
||||
@tool("add", args_schema=OperationInput, return_direct=True)
|
||||
def add(a: int, b: int) -> int:
|
||||
"""Adds a and b.
|
||||
|
||||
Args:
|
||||
a: first int
|
||||
b: second int
|
||||
"""
|
||||
return a + b
|
||||
|
||||
|
||||
@tool("multiply", args_schema=OperationInput, return_direct=True)
|
||||
def multiply(a: int, b: int) -> int:
|
||||
"""Multiplies a and b.
|
||||
|
||||
Args:
|
||||
a: first int
|
||||
b: second int
|
||||
"""
|
||||
return a * b
|
||||
```
|
||||
|
||||
### Binding tool schemas with our LLM
|
||||
|
||||
We will now use the `bind_tools` method to convert the functions above into "tools" and bind them with the model. This means we are going to pass this tool information every time we invoke the model.
|
||||
|
||||
```python
|
||||
tools = [add, multiply]
|
||||
llm_with_tools = chat.bind_tools(tools)
|
||||
```
|
||||
|
||||
After this, we get the response from the model, which is now bound with the tools.
|
||||
|
||||
```python
|
||||
query = "What is 3 * 12? Also, what is 11 + 49?"
|
||||
|
||||
messages = [HumanMessage(query)]
|
||||
ai_msg = llm_with_tools.invoke(messages)
|
||||
```
|
||||
|
||||
As we can see, when our chat model is bound with tools, it calls the correct set of tools sequentially, based on the given prompt.
|
||||
|
||||
```python
|
||||
ai_msg.tool_calls
|
||||
```
|
||||
**Output**
|
||||
|
||||
```python
|
||||
[{'name': 'multiply',
|
||||
'args': {'a': 3, 'b': 12},
|
||||
'id': 'call_A9FL20u12lz6TpOLaiS6rFa8'},
|
||||
{'name': 'add',
|
||||
'args': {'a': 11, 'b': 49},
|
||||
'id': 'call_MPKYGLHbf39csJIyb5BZ9xIk'}]
|
||||
```
|
||||
|
||||
We append the message shown above to the conversation, where it acts as context and makes the LLM aware of which functions it has called.
|
||||
|
||||
```python
|
||||
messages.append(ai_msg)
|
||||
```
|
||||
|
||||
Tool calling happens in two phases:

1. In our first call, we gather all the tools that the LLM decided to call, so that their results can be added as context for a more accurate and hallucination-free answer.

2. In our second call, we parse the set of tools chosen by the LLM and run them (in our case, the functions we defined, with the LLM's extracted arguments), then pass the results back to the LLM.
|
||||
|
||||
```python
|
||||
from langchain_core.messages import ToolMessage
|
||||
|
||||
for tool_call in ai_msg.tool_calls:
|
||||
selected_tool = {"add": add, "multiply": multiply}[tool_call["name"].lower()]
|
||||
tool_output = selected_tool.invoke(tool_call["args"])
|
||||
messages.append(ToolMessage(tool_output, tool_call_id=tool_call["id"]))
|
||||
```
|
||||
|
||||
Finally, we call the LLM (bound with the tools) with the function responses added to its context.
|
||||
|
||||
```python
|
||||
response = llm_with_tools.invoke(messages)
|
||||
print(response.content)
|
||||
```
|
||||
**Output**
|
||||
|
||||
```txt
|
||||
The final answers are:
|
||||
|
||||
- 3 * 12 = 36
|
||||
- 11 + 49 = 60
|
||||
```
|
||||
|
||||
### Defining tool schemas: Pydantic class (optional)
|
||||
|
||||
Above we showed how to define a schema using the `tool` decorator; however, we can equivalently define the schema using Pydantic. Pydantic is useful when your tool inputs are more complex:
|
||||
|
||||
```python
|
||||
from langchain_core.output_parsers.openai_tools import PydanticToolsParser
|
||||
|
||||
class add(BaseModel):
|
||||
"""Add two integers together."""
|
||||
|
||||
a: int = Field(..., description="First integer")
|
||||
b: int = Field(..., description="Second integer")
|
||||
|
||||
|
||||
class multiply(BaseModel):
|
||||
"""Multiply two integers together."""
|
||||
|
||||
a: int = Field(..., description="First integer")
|
||||
b: int = Field(..., description="Second integer")
|
||||
|
||||
|
||||
tools = [add, multiply]
|
||||
```
|
||||
|
||||
Now, we can bind them to chat models and directly get the result:
|
||||
|
||||
```python
|
||||
chain = llm_with_tools | PydanticToolsParser(tools=[multiply, add])
|
||||
chain.invoke(query)
|
||||
```
|
||||
|
||||
**Output**
|
||||
|
||||
```txt
|
||||
[multiply(a=3, b=12), add(a=11, b=49)]
|
||||
```
|
||||
|
||||
Now, as done above, we parse this, run these functions, and call the LLM once again to get the result, as sketched below.
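A minimal sketch of that last step, reusing the `query` and `llm_with_tools` from above; the `impls` dispatch table is an illustrative stand-in for the `add`/`multiply` implementations defined earlier with the `@tool` decorator:

```python
from langchain_core.messages import HumanMessage, ToolMessage

messages = [HumanMessage(query)]
ai_msg = llm_with_tools.invoke(messages)
messages.append(ai_msg)

# Run each tool the model asked for and feed the results back as context
impls = {"add": lambda a, b: a + b, "multiply": lambda a, b: a * b}
for tool_call in ai_msg.tool_calls:
    result = impls[tool_call["name"].lower()](**tool_call["args"])
    messages.append(ToolMessage(str(result), tool_call_id=tool_call["id"]))

print(llm_with_tools.invoke(messages).content)
```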
135
docs/docs/integrations/retrievers/nanopq.ipynb
Normal file
@@ -0,0 +1,135 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "661d5123-8ed2-4504-a846-7df0984e79f9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# NanoPQ (Product Quantization)\n",
|
||||
"\n",
|
||||
">[Product Quantization algorithm (k-NN)](https://towardsdatascience.com/similarity-search-product-quantization-b2a1a6397701) in brief is a quantization algorithm that helps in compression of database vectors which helps in semantic search when large datasets are involved. In a nutshell, the embedding is split into M subspaces which further goes through clustering. Upon clustering the vectors the centroid vector gets mapped to the vectors present in the each of the clusters of the subspace. \n",
|
||||
"\n",
|
||||
"This notebook goes over how to use a retriever that under the hood uses a Product Quantization which has been implemented by the [nanopq](https://github.com/matsui528/nanopq) package."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "68794637-c13b-4145-944f-3b0c2f1258f9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-community langchain-openai nanopq"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "39ecbf50-4623-4ee6-9c8e-fea5da21767e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings\n",
|
||||
"from langchain_community.retrievers import NanoPQRetriever"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c1ce742a-5085-408a-a2c2-4bae0f605880",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create New Retriever with Texts"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "6c80020e-bc9e-49e8-8f93-5f75fd823738",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"retriever = NanoPQRetriever.from_texts(\n",
|
||||
" [\"Great world\", \"great words\", \"world\", \"planets of the world\"],\n",
|
||||
" SpacyEmbeddings(model_name=\"en_core_web_sm\"),\n",
|
||||
" clusters=2,\n",
|
||||
" subspace=2,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "743c26c1-0072-4e46-b41b-c28b3f1737c8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Use Retriever\n",
|
||||
"\n",
|
||||
"We can now use the retriever!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "f496de2d-9b8f-4f8b-a30f-279ef199259a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"M: 2, Ks: 2, metric : <class 'numpy.uint8'>, code_dtype: l2\n",
|
||||
"iter: 20, seed: 123\n",
|
||||
"Training the subspace: 0 / 2\n",
|
||||
"Training the subspace: 1 / 2\n",
|
||||
"Encoding the subspace: 0 / 2\n",
|
||||
"Encoding the subspace: 1 / 2\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='world'),\n",
|
||||
" Document(page_content='Great world'),\n",
|
||||
" Document(page_content='great words'),\n",
|
||||
" Document(page_content='planets of the world')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"retriever.invoke(\"earth\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "617202a7-e3a6-49a8-b807-4b4d771159d5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.11"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -156,6 +156,29 @@
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For certain requirements, there is an option to pass the IBM's [`APIClient`](https://ibm.github.io/watsonx-ai-python-sdk/base.html#apiclient) object into the `WatsonxEmbeddings` class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from ibm_watsonx_ai import APIClient\n",
|
||||
"\n",
|
||||
"api_client = APIClient(...)\n",
|
||||
"\n",
|
||||
"watsonx_llm = WatsonxEmbeddings(\n",
|
||||
" model_id=\"ibm/slate-125m-english-rtrvr\",\n",
|
||||
" watsonx_client=api_client,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
155
docs/docs/integrations/text_embedding/pinecone.ipynb
Normal file
@@ -0,0 +1,155 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Pinecone Embeddings\n",
|
||||
"\n",
|
||||
"Pinecone's inference API can be accessed via `PineconeEmbeddings`. Providing text embeddings via the Pinecone service. We start by installing prerequisite libraries:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "f4b5d823fee826c2"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install -qU \"langchain-pinecone>=0.2.0\" "
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "3bc5d3a5ed7f5ce3",
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Next, we [sign up / log in to Pinecone](https://app.pinecone.io) to get our API key:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "62a77d25c3fd8bd5"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from getpass import getpass\n",
|
||||
"\n",
|
||||
"os.environ[\"PINECONE_API_KEY\"] = os.getenv(\"PINECONE_API_KEY\") or getpass(\n",
|
||||
" \"Enter your Pinecone API key: \"\n",
|
||||
")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "8162dbcbcf7d3d55",
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Check the document for available [models](https://docs.pinecone.io/models/overview). Now we initialize our embedding model like so:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "98d860a0a2d8b907"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_pinecone import PineconeEmbeddings\n",
|
||||
"\n",
|
||||
"embeddings = PineconeEmbeddings(model=\"multilingual-e5-large\")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "2b3adb72786a5275",
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"From here we can create embeddings either sync or async, let's start with sync! We embed a single text as a query embedding (ie what we search with in RAG) using `embed_query`:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "11e24da855517230"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = [\n",
|
||||
" \"Apple is a popular fruit known for its sweetness and crisp texture.\",\n",
|
||||
" \"The tech company Apple is known for its innovative products like the iPhone.\",\n",
|
||||
" \"Many people enjoy eating apples as a healthy snack.\",\n",
|
||||
" \"Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces.\",\n",
|
||||
" \"An apple a day keeps the doctor away, as the saying goes.\",\n",
|
||||
"]"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "2da515e2a61ef7e9",
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"doc_embeds = embeddings.embed_documents(docs)\n",
|
||||
"doc_embeds"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "2897e0d570c90b2f",
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"Tell me about the tech company known as Apple\"\n",
|
||||
"query_embed = embeddings.embed_query(query)\n",
|
||||
"query_embed"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "510784963c0e17a",
|
||||
"execution_count": null
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
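The notebook above mentions that embeddings can also be created asynchronously but only shows the sync path. A minimal async sketch, assuming the standard `aembed_documents` / `aembed_query` methods of the LangChain `Embeddings` interface:

```python
import asyncio

from langchain_pinecone import PineconeEmbeddings

# Assumes PINECONE_API_KEY is set in the environment, as above
embeddings = PineconeEmbeddings(model="multilingual-e5-large")


async def main() -> None:
    # Async counterparts of embed_documents / embed_query
    doc_embeds = await embeddings.aembed_documents(["some text to embed"])
    query_embed = await embeddings.aembed_query("a query to embed")
    print(len(doc_embeds[0]), len(query_embed))


asyncio.run(main())
```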
@@ -101,7 +101,7 @@
|
||||
" sambastudio_embeddings_project_id=sambastudio_project_id,\n",
|
||||
" sambastudio_embeddings_endpoint_id=sambastudio_endpoint_id,\n",
|
||||
" sambastudio_embeddings_api_key=sambastudio_api_key,\n",
|
||||
" batch_size=32,\n",
|
||||
" batch_size=32, # set depending on the deployed endpoint configuration\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -8,87 +8,68 @@
|
||||
"# Sentence Transformers on Hugging Face\n",
|
||||
"\n",
|
||||
">[Hugging Face sentence-transformers](https://huggingface.co/sentence-transformers) is a Python framework for state-of-the-art sentence, text and image embeddings.\n",
|
||||
">One of the embedding models is used in the `HuggingFaceEmbeddings` class.\n",
|
||||
">We have also added an alias for `SentenceTransformerEmbeddings` for users who are more familiar with directly using that package.\n",
|
||||
"\n",
|
||||
"`sentence_transformers` package models are originating from [Sentence-BERT](https://arxiv.org/abs/1908.10084)"
|
||||
">You can use these embedding models from the `HuggingFaceEmbeddings` class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"id": "06c9f47d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet langchain-huggingface"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "ff9be586",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
|
||||
"[-0.0383385568857193, 0.12346469610929489, -0.028642987832427025, 0.05365273728966713, 0.00884537026...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%pip install --upgrade --quiet sentence_transformers > /dev/null"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "861521a9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_huggingface import HuggingFaceEmbeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ff9be586",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
||||
"\n",
|
||||
"embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
|
||||
"# Equivalent to SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")"
|
||||
"\n",
|
||||
"text = \"This is a test document.\"\n",
|
||||
"query_result = embeddings.embed_query(text)\n",
|
||||
"\n",
|
||||
"# show only the first 100 characters of the stringified vector\n",
|
||||
"print(str(query_result)[:100] + \"...\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "d0a98ae9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"This is a test document.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "5d6c682b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_result = embeddings.embed_query(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 9,
|
||||
"id": "bb5e74c0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[-0.038338493555784225, 0.12346471846103668, -0.028642840683460236, 0.05365276336669922, 0.00884535...\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"doc_result = embeddings.embed_documents([text, \"This is not a test document.\"])"
|
||||
"doc_result = embeddings.embed_documents([text, \"This is not a test document.\"])\n",
|
||||
"print(str(doc_result)[:100] + \"...\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aaad49f8",
|
||||
"id": "d18544f5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
@@ -110,7 +91,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
"version": "3.11.4"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -11,7 +11,7 @@
|
||||
"source": [
|
||||
"# SQL Database\n",
|
||||
"\n",
|
||||
"::: {.callout-note}\n",
|
||||
":::note\n",
|
||||
"The `SQLDatabase` adapter utility is a wrapper around a database connection.\n",
|
||||
"\n",
|
||||
"For talking to SQL databases, it uses the [SQLAlchemy] Core API .\n",
|
||||
@@ -405,7 +405,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -102,8 +102,8 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent_chain.run(\n",
|
||||
" \"What happens today with Microsoft stocks?\",\n",
|
||||
"agent_chain.invoke(\n",
|
||||
" \"What happened today with Microsoft stocks?\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -147,7 +147,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent_chain.run(\n",
|
||||
"agent_chain.invoke(\n",
|
||||
" \"How does Microsoft feels today comparing with Nvidia?\",\n",
|
||||
")"
|
||||
]
|
||||
@@ -188,7 +188,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tool.run(\"NVDA\")"
|
||||
"tool.invoke(\"NVDA\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -210,7 +210,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"res = tool.run(\"AAPL\")\n",
|
||||
"res = tool.invoke(\"AAPL\")\n",
|
||||
"print(res)"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -72,7 +72,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tool.run(\"lex friedman\")"
|
||||
"tool.run(\"lex fridman\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -88,9 +88,10 @@ CHAT_MODEL_FEAT_TABLE = {
|
||||
"link": "/docs/integrations/chat/huggingface/",
|
||||
},
|
||||
"ChatOllama": {
|
||||
"tool_calling": True,
|
||||
"local": True,
|
||||
"json_mode": True,
|
||||
"package": "langchain-community",
|
||||
"package": "langchain-ollama",
|
||||
"link": "/docs/integrations/chat/ollama/",
|
||||
},
|
||||
"vLLM Chat (via ChatOpenAI)": {
|
||||
|
||||
181
docs/scripts/tool_feat_table.py
Normal file
@@ -0,0 +1,181 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
SEARCH_TOOL_FEAT_TABLE = {
|
||||
"Exa Search": {
|
||||
"pricing": "1000 free searches/month",
|
||||
"available_data": "URL, Author, Title, Published Date",
|
||||
"link": "/docs/integrations/tools/exa_search",
|
||||
},
|
||||
"Bing Search": {
|
||||
"pricing": "Paid",
|
||||
"available_data": "URL, Snippet, Title",
|
||||
"link": "/docs/integrations/tools/bing_search",
|
||||
},
|
||||
"DuckDuckgoSearch": {
|
||||
"pricing": "Free",
|
||||
"available_data": "URL, Snippet, Title",
|
||||
"link": "/docs/integrations/tools/ddg",
|
||||
},
|
||||
"Brave Search": {
|
||||
"pricing": "Free",
|
||||
"available_data": "URL, Snippet, Title",
|
||||
"link": "/docs/integrations/tools/brave_search",
|
||||
},
|
||||
"Google Search": {
|
||||
"pricing": "Paid",
|
||||
"available_data": "URL, Snippet, Title",
|
||||
"link": "/docs/integrations/tools/google_search",
|
||||
},
|
||||
"Google Serper": {
|
||||
"pricing": "Free",
|
||||
"available_data": "URL, Snippet, Title, Search Rank, Site Links",
|
||||
"link": "/docs/integrations/tools/google_serper",
|
||||
},
|
||||
"Mojeek Search": {
|
||||
"pricing": "Paid",
|
||||
"available_data": "URL, Snippet, Title",
|
||||
"link": "/docs/integrations/tools/mojeek_search",
|
||||
},
|
||||
"SearxNG Search": {
|
||||
"pricing": "Free",
|
||||
"available_data": "URL, Snippet, Title, Category",
|
||||
"link": "/docs/integrations/tools/searx_search",
|
||||
},
|
||||
"You.com Search": {
|
||||
"pricing": "Free for 60 days",
|
||||
"available_data": "URL, Title, Page Content",
|
||||
"link": "/docs/integrations/tools/you",
|
||||
},
|
||||
"SearchApi": {
|
||||
"pricing": "100 Free Searches on Sign Up",
|
||||
"available_data": "URL, Snippet, Title, Search Rank, Site Links, Authors",
|
||||
"link": "/docs/integrations/tools/searchapi",
|
||||
},
|
||||
"SerpAPI": {
|
||||
"pricing": "100 Free Searches/Month",
|
||||
"available_data": "Answer",
|
||||
"link": "/docs/integrations/tools/serpapi",
|
||||
},
|
||||
}
|
||||
|
||||
CODE_INTERPRETER_TOOL_FEAT_TABLE = {
|
||||
"Bearly Code Interpreter": {
|
||||
"langauges": "Python",
|
||||
"sandbox_lifetime": "Resets on Execution",
|
||||
"upload": True,
|
||||
"return_results": "Text",
|
||||
"link": "/docs/integrations/tools/bearly",
|
||||
},
|
||||
"Riza Code Interpreter": {
|
||||
"langauges": "Python, JavaScript, PHP, Ruby",
|
||||
"sandbox_lifetime": "Resets on Execution",
|
||||
"upload": False,
|
||||
"return_results": "Text",
|
||||
"link": "/docs/integrations/tools/riza",
|
||||
},
|
||||
"E2B Data Analysis": {
|
||||
"langauges": "Python. In beta: JavaScript, R, Java",
|
||||
"sandbox_lifetime": "24 Hours",
|
||||
"upload": True,
|
||||
"return_results": "Text, Images, Videos",
|
||||
"link": "/docs/integrations/tools/e2b_data_analysis",
|
||||
},
|
||||
"Azure Container Apps dynamic sessions": {
|
||||
"langauges": "Python",
|
||||
"sandbox_lifetime": "1 Hour",
|
||||
"upload": True,
|
||||
"return_results": "Text, Images",
|
||||
"link": "/docs/integrations/tools/azure_dynamic_sessions",
|
||||
},
|
||||
}
|
||||
|
||||
TOOLS_TEMPLATE = """\
|
||||
---
|
||||
sidebar_position: 0
|
||||
sidebar_class_name: hidden
|
||||
keywords: [compatibility]
|
||||
custom_edit_url:
|
||||
hide_table_of_contents: true
|
||||
---
|
||||
|
||||
# Tools
|
||||
|
||||
## Search Tools
|
||||
|
||||
The following table shows tools that execute online searches in some shape or form:
|
||||
|
||||
{search_table}
|
||||
|
||||
## Code Interpreter Tools
|
||||
|
||||
The following table shows tools that can be used as code interpreters:
|
||||
|
||||
{code_interpreter_table}
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def get_search_tools_table() -> str:
|
||||
"""Get the table of search tools."""
|
||||
header = ["tool", "pricing", "available_data"]
|
||||
title = ["Tool", "Free/Paid", "Return Data"]
|
||||
rows = [title, [":-"] + [":-:"] * (len(title) - 1)]
|
||||
for search_tool, feats in sorted(SEARCH_TOOL_FEAT_TABLE.items()):
|
||||
# Fields are in the order of the header
|
||||
row = [
|
||||
f"[{search_tool}]({feats['link']})",
|
||||
]
|
||||
for h in header[1:]:
|
||||
row.append(feats.get(h))
|
||||
rows.append(row)
|
||||
return "\n".join(["|".join(row) for row in rows])
|
||||
|
||||
|
||||
def get_code_interpreter_table() -> str:
|
||||
"""Get the table of search tools."""
|
||||
header = [
|
||||
"tool",
|
||||
"langauges",
|
||||
"sandbox_lifetime",
|
||||
"upload",
|
||||
"return_results",
|
||||
]
|
||||
title = [
|
||||
"Tool",
|
||||
"Supported Languages",
|
||||
"Sandbox Lifetime",
|
||||
"Supports File Uploads",
|
||||
"Return Types",
|
||||
]
|
||||
rows = [title, [":-"] + [":-:"] * (len(title) - 1)]
|
||||
for search_tool, feats in sorted(CODE_INTERPRETER_TOOL_FEAT_TABLE.items()):
|
||||
# Fields are in the order of the header
|
||||
row = [
|
||||
f"[{search_tool}]({feats['link']})",
|
||||
]
|
||||
for h in header[1:]:
|
||||
value = feats.get(h)
|
||||
if h == "upload":
|
||||
if value is True:
|
||||
row.append("✅")
|
||||
else:
|
||||
row.append("❌")
|
||||
else:
|
||||
row.append(value)
|
||||
rows.append(row)
|
||||
return "\n".join(["|".join(row) for row in rows])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
output_dir = Path(sys.argv[1])
|
||||
output_integrations_dir = output_dir / "integrations"
|
||||
output_integrations_dir_tools = output_integrations_dir / "tools"
|
||||
output_integrations_dir_tools.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
tools_page = TOOLS_TEMPLATE.format(
|
||||
search_table=get_search_tools_table(),
|
||||
code_interpreter_table=get_code_interpreter_table(),
|
||||
)
|
||||
with open(output_integrations_dir / "tools" / "index.mdx", "w") as f:
|
||||
f.write(tools_page)
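Judging from the `__main__` block above, the script takes a single output-directory argument and writes the generated page to `integrations/tools/index.mdx` beneath it; a sketch of the invocation (the `docs/build` path is an illustrative assumption):

```bash
python docs/scripts/tool_feat_table.py docs/build
```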
@@ -243,8 +243,8 @@ module.exports = {
|
||||
},
|
||||
],
|
||||
link: {
|
||||
type: "generated-index",
|
||||
slug: "integrations/tools",
|
||||
type: "doc",
|
||||
id: "integrations/tools/index",
|
||||
},
|
||||
},
|
||||
{
|
||||
|
||||
@@ -52,17 +52,17 @@ export default function ChatModelTabs(props) {
|
||||
customVarName,
|
||||
} = props;
|
||||
|
||||
const openAIParamsOrDefault = openaiParams ?? `model="gpt-3.5-turbo-0125"`;
|
||||
const openAIParamsOrDefault = openaiParams ?? `model="gpt-4o-mini"`;
|
||||
const anthropicParamsOrDefault =
|
||||
anthropicParams ?? `model="claude-3-sonnet-20240229"`;
|
||||
const cohereParamsOrDefault = cohereParams ?? `model="command-r"`;
|
||||
anthropicParams ?? `model="claude-3-5-sonnet-20240620"`;
|
||||
const cohereParamsOrDefault = cohereParams ?? `model="command-r-plus"`;
|
||||
const fireworksParamsOrDefault =
|
||||
fireworksParams ??
|
||||
`model="accounts/fireworks/models/mixtral-8x7b-instruct"`;
|
||||
`model="accounts/fireworks/models/llama-v3p1-70b-instruct"`;
|
||||
const groqParamsOrDefault = groqParams ?? `model="llama3-8b-8192"`;
|
||||
const mistralParamsOrDefault =
|
||||
mistralParams ?? `model="mistral-large-latest"`;
|
||||
const googleParamsOrDefault = googleParams ?? `model="gemini-pro"`;
|
||||
const googleParamsOrDefault = googleParams ?? `model="gemini-1.5-flash"`;
|
||||
const togetherParamsOrDefault =
|
||||
togetherParams ??
|
||||
`\n base_url="https://api.together.xyz/v1",\n api_key=os.environ["TOGETHER_API_KEY"],\n model="mistralai/Mixtral-8x7B-Instruct-v0.1",\n`;
|
||||
|
||||
@@ -61,6 +61,10 @@
|
||||
{
|
||||
"source": "/cookbook(/?)",
|
||||
"destination": "/v0.1/docs/cookbook/"
|
||||
},
|
||||
{
|
||||
"source": "/docs/integrations/toolkits/document_comparison_toolkit(/?)",
|
||||
"destination": "/docs/tutorials/rag/"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ license = "MIT"
|
||||
|
||||
[tool.poetry.urls]
|
||||
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/__package_name_short__"
|
||||
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22__package_name_short__%3D%3D0%22&expanded=true"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.8.1,<4.0"
|
||||
@@ -77,8 +78,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
# section of the configuration file raise errors.
|
||||
#
|
||||
# https://github.com/tophat/syrupy
|
||||
# --snapshot-warn-unused Prints a warning on unused snapshots rather than fail the test suite.
|
||||
addopts = "--snapshot-warn-unused --strict-markers --strict-config --durations=5"
|
||||
addopts = "--strict-markers --strict-config --durations=5"
|
||||
# Registering custom markers.
|
||||
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
|
||||
markers = [
|
||||
|
||||
1520
libs/cli/poetry.lock
generated
File diff suppressed because it is too large
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "langchain-cli"
|
||||
version = "0.0.25"
|
||||
version = "0.0.26"
|
||||
description = "CLI for interacting with LangChain"
|
||||
authors = ["Erick Friis <erick@langchain.dev>"]
|
||||
readme = "README.md"
|
||||
@@ -9,6 +9,7 @@ license = "MIT"
|
||||
|
||||
[tool.poetry.urls]
|
||||
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/cli"
|
||||
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-cli%3D%3D0%22&expanded=true"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.8.1,<4.0"
|
||||
|
||||
@@ -16,6 +16,7 @@ cloudpickle>=2.0.0
|
||||
cohere>=4,<6
|
||||
databricks-vectorsearch>=0.21,<0.22
|
||||
datasets>=2.15.0,<3
|
||||
dedoc>=2.2.6,<3
|
||||
dgml-utils>=0.3.0,<0.4
|
||||
elasticsearch>=8.12.0,<9
|
||||
esprima>=4.0.1,<5
|
||||
@@ -88,3 +89,4 @@ upstash-ratelimit>=1.1.0,<2
|
||||
vdms==0.0.20
|
||||
xata>=1.0.0a7,<2
|
||||
xmltodict>=0.13.0,<0.14
|
||||
nanopq==0.2.1
|
||||
|
||||
@@ -100,12 +100,14 @@ MODEL_COST_PER_1K_TOKENS = {
|
||||
"gpt-3.5-turbo-0613-finetuned": 0.003,
|
||||
"gpt-3.5-turbo-1106-finetuned": 0.003,
|
||||
"gpt-3.5-turbo-0125-finetuned": 0.003,
|
||||
"gpt-4o-mini-2024-07-18-finetuned": 0.0003,
|
||||
# Fine Tuned output
|
||||
"babbage-002-finetuned-completion": 0.0016,
|
||||
"davinci-002-finetuned-completion": 0.012,
|
||||
"gpt-3.5-turbo-0613-finetuned-completion": 0.006,
|
||||
"gpt-3.5-turbo-1106-finetuned-completion": 0.006,
|
||||
"gpt-3.5-turbo-0125-finetuned-completion": 0.006,
|
||||
"gpt-4o-mini-2024-07-18-finetuned-completion": 0.0012,
|
||||
# Azure Fine Tuned input
|
||||
"babbage-002-azure-finetuned": 0.0004,
|
||||
"davinci-002-azure-finetuned": 0.002,
|
||||
|
||||
@@ -24,6 +24,7 @@ from langchain_core.output_parsers import (
|
||||
from langchain_core.prompts import BasePromptTemplate
|
||||
from langchain_core.pydantic_v1 import BaseModel
|
||||
from langchain_core.runnables import Runnable
|
||||
from langchain_core.utils.pydantic import is_basemodel_subclass
|
||||
|
||||
from langchain_community.output_parsers.ernie_functions import (
|
||||
JsonOutputFunctionsParser,
|
||||
@@ -94,7 +95,7 @@ def _get_python_function_arguments(function: Callable, arg_descriptions: dict) -
|
||||
for arg, arg_type in annotations.items():
|
||||
if arg == "return":
|
||||
continue
|
||||
if isinstance(arg_type, type) and issubclass(arg_type, BaseModel):
|
||||
if isinstance(arg_type, type) and is_basemodel_subclass(arg_type):
|
||||
# Mypy error:
|
||||
# "type" has no attribute "schema"
|
||||
properties[arg] = arg_type.schema() # type: ignore[attr-defined]
|
||||
@@ -156,7 +157,7 @@ def convert_to_ernie_function(
|
||||
"""
|
||||
if isinstance(function, dict):
|
||||
return function
|
||||
elif isinstance(function, type) and issubclass(function, BaseModel):
|
||||
elif isinstance(function, type) and is_basemodel_subclass(function):
|
||||
return cast(Dict, convert_pydantic_to_ernie_function(function))
|
||||
elif callable(function):
|
||||
return convert_python_function_to_ernie_function(function)
|
||||
@@ -185,7 +186,7 @@ def get_ernie_output_parser(
|
||||
only the function arguments and not the function name.
|
||||
"""
|
||||
function_names = [convert_to_ernie_function(f)["name"] for f in functions]
|
||||
if isinstance(functions[0], type) and issubclass(functions[0], BaseModel):
|
||||
if isinstance(functions[0], type) and is_basemodel_subclass(functions[0]):
|
||||
if len(functions) > 1:
|
||||
pydantic_schema: Union[Dict, Type[BaseModel]] = {
|
||||
name: fn for name, fn in zip(function_names, functions)
|
||||
|
||||
@@ -28,7 +28,7 @@ class ElasticsearchChatMessageHistory(BaseChatMessageHistory):
|
||||
es_password: Password to use when connecting to Elasticsearch.
|
||||
es_api_key: API key to use when connecting to Elasticsearch.
|
||||
es_connection: Optional pre-existing Elasticsearch connection.
|
||||
esnsure_ascii: Used to escape ASCII symbols in json.dumps. Defaults to True.
|
||||
ensure_ascii: Used to escape ASCII symbols in json.dumps. Defaults to True.
|
||||
index: Name of the index to use.
|
||||
session_id: Arbitrary key that is used to store the messages
|
||||
of a single chat session.
|
||||
@@ -45,11 +45,11 @@ class ElasticsearchChatMessageHistory(BaseChatMessageHistory):
|
||||
es_user: Optional[str] = None,
|
||||
es_api_key: Optional[str] = None,
|
||||
es_password: Optional[str] = None,
|
||||
esnsure_ascii: Optional[bool] = True,
|
||||
ensure_ascii: Optional[bool] = True,
|
||||
):
|
||||
self.index: str = index
|
||||
self.session_id: str = session_id
|
||||
self.ensure_ascii = esnsure_ascii
|
||||
self.ensure_ascii = ensure_ascii
|
||||
|
||||
# Initialize Elasticsearch client from passed client arg or connection info
|
||||
if es_connection is not None:
|
||||
|
||||
@@ -15,7 +15,43 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RedisChatMessageHistory(BaseChatMessageHistory):
|
||||
"""Chat message history stored in a Redis database."""
|
||||
"""Chat message history stored in a Redis database.
|
||||
|
||||
Setup:
|
||||
Install ``redis`` python package.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install redis
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.chat_message_histories import RedisChatMessageHistory
|
||||
|
||||
history = RedisChatMessageHistory(
|
||||
session_id = "your-session-id",
|
||||
url="redis://your-host:your-port:your-database", # redis://localhost:6379/0
|
||||
)
|
||||
|
||||
Add and retrieve messages:
|
||||
.. code-block:: python
|
||||
|
||||
# Add single message
|
||||
history.add_message(message)
|
||||
|
||||
# Add batch messages
|
||||
history.add_messages([message1, message2, message3, ...])
|
||||
|
||||
# Add human message
|
||||
history.add_user_message(human_message)
|
||||
|
||||
# Add ai message
|
||||
history.add_ai_message(ai_message)
|
||||
|
||||
# Retrieve messages
|
||||
messages = history.messages
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -24,6 +60,18 @@ class RedisChatMessageHistory(BaseChatMessageHistory):
|
||||
key_prefix: str = "message_store:",
|
||||
ttl: Optional[int] = None,
|
||||
):
|
||||
"""Initialize with a RedisChatMessageHistory instance.
|
||||
|
||||
Args:
|
||||
session_id: str
|
||||
The ID for single chat session. Used to form keys with `key_prefix`.
|
||||
url: Optional[str]
|
||||
String parameter configuration for connecting to Redis.
|
||||
key_prefix: Optional[str]
|
||||
The prefix of the key, combined with `session id` to form the key.
|
||||
ttl: Optional[int]
|
||||
Set the expiration time of `key`, the unit is seconds.
|
||||
"""
|
||||
try:
|
||||
import redis
|
||||
except ImportError:
|
||||
|
||||
@@ -187,7 +187,7 @@ class SQLChatMessageHistory(BaseChatMessageHistory):
|
||||
since="0.2.2",
|
||||
removal="0.3.0",
|
||||
name="connection_string",
|
||||
alternative="Use connection instead",
|
||||
alternative="connection",
|
||||
)
|
||||
_warned_once_already = True
|
||||
connection = connection_string
|
||||
|
||||
@@ -40,11 +40,17 @@ from langchain_core.output_parsers.openai_tools import (
|
||||
PydanticToolsParser,
|
||||
)
|
||||
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
|
||||
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr, root_validator
|
||||
from langchain_core.pydantic_v1 import (
|
||||
BaseModel,
|
||||
Field,
|
||||
SecretStr,
|
||||
root_validator,
|
||||
)
|
||||
from langchain_core.runnables import Runnable, RunnableMap, RunnablePassthrough
|
||||
from langchain_core.tools import BaseTool
|
||||
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
|
||||
from langchain_core.utils.function_calling import convert_to_openai_tool
|
||||
from langchain_core.utils.pydantic import is_basemodel_subclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -769,7 +775,7 @@ class QianfanChatEndpoint(BaseChatModel):
|
||||
""" # noqa: E501
|
||||
if kwargs:
|
||||
raise ValueError(f"Received unsupported arguments {kwargs}")
|
||||
is_pydantic_schema = isinstance(schema, type) and issubclass(schema, BaseModel)
|
||||
is_pydantic_schema = isinstance(schema, type) and is_basemodel_subclass(schema)
|
||||
llm = self.bind_tools([schema])
|
||||
if is_pydantic_schema:
|
||||
output_parser: OutputParserLike = PydanticToolsParser(
|
||||
|
||||
@@ -57,6 +57,7 @@ from langchain_core.runnables import Runnable, RunnableMap, RunnablePassthrough
|
||||
from langchain_core.tools import BaseTool
|
||||
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
|
||||
from langchain_core.utils.function_calling import convert_to_openai_tool
|
||||
from langchain_core.utils.pydantic import is_basemodel_subclass
|
||||
|
||||
from langchain_community.utilities.requests import Requests
|
||||
|
||||
@@ -443,7 +444,7 @@ class ChatEdenAI(BaseChatModel):
|
||||
if kwargs:
|
||||
raise ValueError(f"Received unsupported arguments {kwargs}")
|
||||
llm = self.bind_tools([schema], tool_choice="required")
|
||||
if isinstance(schema, type) and issubclass(schema, BaseModel):
|
||||
if isinstance(schema, type) and is_basemodel_subclass(schema):
|
||||
output_parser: OutputParserLike = PydanticToolsParser(
|
||||
tools=[schema], first_tool_only=True
|
||||
)
|
||||
|
||||
@@ -46,10 +46,15 @@ from langchain_core.output_parsers.openai_tools import (
|
||||
parse_tool_call,
|
||||
)
|
||||
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
|
||||
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator
|
||||
from langchain_core.pydantic_v1 import (
|
||||
BaseModel,
|
||||
Field,
|
||||
root_validator,
|
||||
)
|
||||
from langchain_core.runnables import Runnable, RunnableMap, RunnablePassthrough
|
||||
from langchain_core.tools import BaseTool
|
||||
from langchain_core.utils.function_calling import convert_to_openai_tool
|
||||
from langchain_core.utils.pydantic import is_basemodel_subclass
|
||||
|
||||
|
||||
class ChatLlamaCpp(BaseChatModel):
|
||||
@@ -525,7 +530,7 @@ class ChatLlamaCpp(BaseChatModel):
|
||||
|
||||
if kwargs:
|
||||
raise ValueError(f"Received unsupported arguments {kwargs}")
|
||||
is_pydantic_schema = isinstance(schema, type) and issubclass(schema, BaseModel)
|
||||
is_pydantic_schema = isinstance(schema, type) and is_basemodel_subclass(schema)
|
||||
if schema is None:
|
||||
raise ValueError(
|
||||
"schema must be specified when method is 'function_calling'. "
|
||||
|
||||
@@ -12,6 +12,7 @@ from typing import (
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Type,
|
||||
Union,
|
||||
@@ -20,6 +21,7 @@ from typing import (
|
||||
from langchain_core.callbacks import (
|
||||
CallbackManagerForLLMRun,
|
||||
)
|
||||
from langchain_core.language_models import LanguageModelInput
|
||||
from langchain_core.language_models.chat_models import BaseChatModel
|
||||
from langchain_core.language_models.llms import create_base_retry_decorator
|
||||
from langchain_core.messages import (
|
||||
@@ -33,6 +35,7 @@ from langchain_core.messages import (
|
||||
HumanMessageChunk,
|
||||
SystemMessage,
|
||||
SystemMessageChunk,
|
||||
ToolMessage,
|
||||
)
|
||||
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
|
||||
from langchain_core.pydantic_v1 import (
|
||||
@@ -41,7 +44,10 @@ from langchain_core.pydantic_v1 import (
|
||||
Field,
|
||||
SecretStr,
|
||||
)
|
||||
from langchain_core.runnables import Runnable
|
||||
from langchain_core.tools import BaseTool
|
||||
from langchain_core.utils import get_from_dict_or_env, pre_init
|
||||
from langchain_core.utils.function_calling import convert_to_openai_tool
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from premai.api.chat_completions.v1_chat_completions_create import (
|
||||
@@ -51,6 +57,19 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
TOOL_PROMPT_HEADER = """
|
||||
Given the set of tools you used and the response, provide the final answer\n
|
||||
"""
|
||||
|
||||
INTERMEDIATE_TOOL_RESULT_TEMPLATE = """
|
||||
{json}
|
||||
"""
|
||||
|
||||
SINGLE_TOOL_PROMPT_TEMPLATE = """
|
||||
tool id: {tool_id}
|
||||
tool_response: {tool_response}
|
||||
"""
|
||||
|
||||
|
||||
class ChatPremAPIError(Exception):
|
||||
"""Error with the `PremAI` API."""
|
||||
@@ -91,8 +110,22 @@ def _response_to_result(
|
||||
raise ChatPremAPIError(f"ChatResponse must have a content: {content}")
|
||||
|
||||
if role == "assistant":
|
||||
tool_calls = choice.message["tool_calls"]
|
||||
if tool_calls is None:
|
||||
tools = []
|
||||
else:
|
||||
tools = [
|
||||
{
|
||||
"id": tool_call["id"],
|
||||
"name": tool_call["function"]["name"],
|
||||
"args": tool_call["function"]["arguments"],
|
||||
}
|
||||
for tool_call in tool_calls
|
||||
]
|
||||
generations.append(
|
||||
ChatGeneration(text=content, message=AIMessage(content=content))
|
||||
ChatGeneration(
|
||||
text=content, message=AIMessage(content=content, tool_calls=tools)
|
||||
)
|
||||
)
|
||||
elif role == "user":
|
||||
generations.append(
|
||||
@@ -156,41 +189,65 @@ def _messages_to_prompt_dict(
|
||||
system_prompt: Optional[str] = None
|
||||
examples_and_messages: List[Dict[str, Any]] = []
|
||||
|
||||
if template_id is not None:
|
||||
params: Dict[str, str] = {}
|
||||
for input_msg in input_messages:
|
||||
if isinstance(input_msg, SystemMessage):
|
||||
system_prompt = str(input_msg.content)
|
||||
for input_msg in input_messages:
|
||||
if isinstance(input_msg, SystemMessage):
|
||||
system_prompt = str(input_msg.content)
|
||||
|
||||
elif isinstance(input_msg, HumanMessage):
|
||||
if template_id is None:
|
||||
examples_and_messages.append(
|
||||
{"role": "user", "content": str(input_msg.content)}
|
||||
)
|
||||
else:
|
||||
params: Dict[str, str] = {}
|
||||
assert (input_msg.id is not None) and (input_msg.id != ""), ValueError(
|
||||
"When using prompt template there should be id associated ",
|
||||
"with each HumanMessage",
|
||||
)
|
||||
params[str(input_msg.id)] = str(input_msg.content)
|
||||
|
||||
examples_and_messages.append(
|
||||
{"role": "user", "template_id": template_id, "params": params}
|
||||
)
|
||||
|
||||
for input_msg in input_messages:
|
||||
if isinstance(input_msg, AIMessage):
|
||||
examples_and_messages.append(
|
||||
{"role": "assistant", "content": str(input_msg.content)}
|
||||
{"role": "user", "template_id": template_id, "params": params}
|
||||
)
|
||||
else:
|
||||
for input_msg in input_messages:
|
||||
if isinstance(input_msg, SystemMessage):
|
||||
system_prompt = str(input_msg.content)
|
||||
elif isinstance(input_msg, HumanMessage):
|
||||
examples_and_messages.append(
|
||||
{"role": "user", "content": str(input_msg.content)}
|
||||
)
|
||||
elif isinstance(input_msg, AIMessage):
|
||||
elif isinstance(input_msg, AIMessage):
|
||||
if input_msg.tool_calls is None or len(input_msg.tool_calls) == 0:
|
||||
examples_and_messages.append(
|
||||
{"role": "assistant", "content": str(input_msg.content)}
|
||||
)
|
||||
else:
|
||||
raise ChatPremAPIError("No such role explicitly exists")
|
||||
ai_msg_to_json = {
|
||||
"id": input_msg.id,
|
||||
"content": input_msg.content,
|
||||
"response_metadata": input_msg.response_metadata,
|
||||
"tool_calls": input_msg.tool_calls,
|
||||
}
|
||||
examples_and_messages.append(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": INTERMEDIATE_TOOL_RESULT_TEMPLATE.format(
|
||||
json=ai_msg_to_json,
|
||||
),
|
||||
}
|
||||
)
|
||||
elif isinstance(input_msg, ToolMessage):
|
||||
pass
|
||||
|
||||
else:
|
||||
raise ChatPremAPIError("No such role explicitly exists")
|
||||
|
||||
# do a separate search for tool calls
|
||||
tool_prompt = ""
|
||||
for input_msg in input_messages:
|
||||
if isinstance(input_msg, ToolMessage):
|
||||
tool_id = input_msg.tool_call_id
|
||||
tool_result = input_msg.content
|
||||
tool_prompt += SINGLE_TOOL_PROMPT_TEMPLATE.format(
|
||||
tool_id=tool_id, tool_response=tool_result
|
||||
)
|
||||
if tool_prompt != "":
|
||||
prompt = TOOL_PROMPT_HEADER
|
||||
prompt += tool_prompt
|
||||
examples_and_messages.append({"role": "user", "content": prompt})
|
||||
|
||||
return system_prompt, examples_and_messages
|
||||
|
||||
|
||||
@@ -289,7 +346,6 @@ class ChatPremAI(BaseChatModel, BaseModel):
|
||||
def _get_all_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
|
||||
kwargs_to_ignore = [
|
||||
"top_p",
|
||||
"tools",
|
||||
"frequency_penalty",
|
||||
"presence_penalty",
|
||||
"logit_bias",
|
||||
@@ -392,6 +448,14 @@ class ChatPremAI(BaseChatModel, BaseModel):
|
||||
except Exception as _:
|
||||
continue
|
||||
|
||||
def bind_tools(
|
||||
self,
|
||||
tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable, BaseTool]],
|
||||
**kwargs: Any,
|
||||
) -> Runnable[LanguageModelInput, BaseMessage]:
|
||||
formatted_tools = [convert_to_openai_tool(tool) for tool in tools]
|
||||
return super().bind(tools=formatted_tools, **kwargs)
|
||||
|
||||
|
||||
def create_prem_retry_decorator(
|
||||
llm: ChatPremAI,
|
||||
|
||||
@@ -53,11 +53,16 @@ from langchain_core.outputs import (
|
||||
ChatGenerationChunk,
|
||||
ChatResult,
|
||||
)
|
||||
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr
|
||||
from langchain_core.pydantic_v1 import (
|
||||
BaseModel,
|
||||
Field,
|
||||
SecretStr,
|
||||
)
|
||||
from langchain_core.runnables import Runnable, RunnableMap, RunnablePassthrough
|
||||
from langchain_core.tools import BaseTool
|
||||
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env, pre_init
|
||||
from langchain_core.utils.function_calling import convert_to_openai_tool
|
||||
from langchain_core.utils.pydantic import is_basemodel_subclass
|
||||
from requests.exceptions import HTTPError
|
||||
from tenacity import (
|
||||
before_sleep_log,
|
||||
@@ -865,7 +870,7 @@ class ChatTongyi(BaseChatModel):
|
||||
"""
|
||||
if kwargs:
|
||||
raise ValueError(f"Received unsupported arguments {kwargs}")
|
||||
is_pydantic_schema = isinstance(schema, type) and issubclass(schema, BaseModel)
|
||||
is_pydantic_schema = isinstance(schema, type) and is_basemodel_subclass(schema)
|
||||
llm = self.bind_tools([schema])
|
||||
if is_pydantic_schema:
|
||||
output_parser: OutputParserLike = PydanticToolsParser(
|
||||
|
||||
@@ -142,6 +142,10 @@ if TYPE_CHECKING:
from langchain_community.document_loaders.dataframe import (
DataFrameLoader,
)
from langchain_community.document_loaders.dedoc import (
DedocAPIFileLoader,
DedocFileLoader,
)
from langchain_community.document_loaders.diffbot import (
DiffbotLoader,
)
@@ -340,6 +344,7 @@ if TYPE_CHECKING:
)
from langchain_community.document_loaders.pdf import (
AmazonTextractPDFLoader,
DedocPDFLoader,
MathpixPDFLoader,
OnlinePDFLoader,
PagedPDFSplitter,
@@ -570,6 +575,9 @@ _module_lookup = {
"CubeSemanticLoader": "langchain_community.document_loaders.cube_semantic",
"DataFrameLoader": "langchain_community.document_loaders.dataframe",
"DatadogLogsLoader": "langchain_community.document_loaders.datadog_logs",
"DedocAPIFileLoader": "langchain_community.document_loaders.dedoc",
"DedocFileLoader": "langchain_community.document_loaders.dedoc",
"DedocPDFLoader": "langchain_community.document_loaders.pdf",
"DiffbotLoader": "langchain_community.document_loaders.diffbot",
"DirectoryLoader": "langchain_community.document_loaders.directory",
"DiscordChatLoader": "langchain_community.document_loaders.discord",
@@ -771,6 +779,9 @@ __all__ = [
"CubeSemanticLoader",
"DataFrameLoader",
"DatadogLogsLoader",
"DedocAPIFileLoader",
"DedocFileLoader",
"DedocPDFLoader",
"DiffbotLoader",
"DirectoryLoader",
"DiscordChatLoader",

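The new Dedoc loaders are wired in three places: a `TYPE_CHECKING` import for static analysis, a `_module_lookup` entry for runtime resolution, and an `__all__` entry. A minimal sketch of the lazy-import pattern such a lookup supports; the module-level `__getattr__` shown here is an assumption about how langchain_community resolves the mapping, simplified for illustration:

.. code-block:: python

    import importlib
    from typing import Any

    _module_lookup = {
        "DedocFileLoader": "langchain_community.document_loaders.dedoc",
    }

    def __getattr__(name: str) -> Any:
        # Import the owning module only on first attribute access, so heavy
        # optional dependencies are not imported at package-import time.
        if name in _module_lookup:
            module = importlib.import_module(_module_lookup[name])
            return getattr(module, name)
        raise AttributeError(f"module {__name__} has no attribute {name}")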
546
libs/community/langchain_community/document_loaders/dedoc.py
Normal file
@@ -0,0 +1,546 @@
import html
import json
import os
from abc import ABC, abstractmethod
from typing import (
Dict,
Iterator,
Optional,
Tuple,
Union,
)

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class DedocBaseLoader(BaseLoader, ABC):
"""
Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

Loader enables extracting text, tables and attached files from the given file:
* `Text` can be split by pages, `dedoc` tree nodes, textual lines
(according to the `split` parameter).
* `Attached files` (when with_attachments=True)
are split according to the `split` parameter.
For attachments, the langchain Document object has an additional metadata field
`type`="attachment".
* `Tables` (when with_tables=True) are not split - each table corresponds to one
langchain Document object.
For tables, the Document object has additional metadata fields `type`="table"
and `text_as_html` with the table's HTML representation.
"""

def __init__(
self,
file_path: str,
*,
split: str = "document",
with_tables: bool = True,
with_attachments: Union[str, bool] = False,
recursion_deep_attachments: int = 10,
pdf_with_text_layer: str = "auto_tabby",
language: str = "rus+eng",
pages: str = ":",
is_one_column_document: str = "auto",
document_orientation: str = "auto",
need_header_footer_analysis: Union[str, bool] = False,
need_binarization: Union[str, bool] = False,
need_pdf_table_analysis: Union[str, bool] = True,
delimiter: Optional[str] = None,
encoding: Optional[str] = None,
) -> None:
"""
Initialize with file path and parsing parameters.

Args:
file_path: path to the file for processing
split: type of document splitting into parts (each part is returned
separately), default value "document"
"document": document text is returned as a single langchain Document
object (don't split)
"page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
ODP)
"node": split document text into tree nodes (title nodes, list item
nodes, raw text nodes)
"line": split document text into lines
with_tables: add tables to the result - each table is returned as a single
langchain Document object

Parameters used for document parsing via `dedoc`
(https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

with_attachments: enable attached files extraction
recursion_deep_attachments: recursion level for attached files
extraction, works only when with_attachments==True
pdf_with_text_layer: type of handler for parsing PDF documents,
available options
["true", "false", "tabby", "auto", "auto_tabby" (default)]
language: language of the document for PDF without a textual layer and
images, available options ["eng", "rus", "rus+eng" (default)],
the list of languages can be extended, please see
https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
pages: page slice to define the reading range for parsing PDF documents
is_one_column_document: detect number of columns for PDF without
a textual layer and images, available options
["true", "false", "auto" (default)]
document_orientation: fix document orientation (90, 180, 270 degrees)
for PDF without a textual layer and images, available options
["auto" (default), "no_change"]
need_header_footer_analysis: remove headers and footers from the output
result for parsing PDF and images
need_binarization: clean pages background (binarize) for PDF without a
textual layer and images
need_pdf_table_analysis: parse tables for PDF without a textual layer
and images
delimiter: column separator for CSV, TSV files
encoding: encoding of TXT, CSV, TSV
"""
self.parsing_parameters = {
key: value
for key, value in locals().items()
if key not in {"self", "file_path", "split", "with_tables"}
}
self.valid_split_values = {"document", "page", "node", "line"}
if split not in self.valid_split_values:
raise ValueError(
f"Got {split} for `split`, but should be one of "
f"`{self.valid_split_values}`"
)
self.split = split
self.with_tables = with_tables
self.file_path = file_path

structure_type = "tree" if self.split == "node" else "linear"
self.parsing_parameters["structure_type"] = structure_type
self.parsing_parameters["need_content_analysis"] = with_attachments

def lazy_load(self) -> Iterator[Document]:
"""Lazily load documents."""
import tempfile

try:
from dedoc import DedocManager
except ImportError:
raise ImportError(
"`dedoc` package not found, please install it with `pip install dedoc`"
)
dedoc_manager = DedocManager(manager_config=self._make_config())
dedoc_manager.config["logger"].disabled = True

with tempfile.TemporaryDirectory() as tmpdir:
document_tree = dedoc_manager.parse(
file_path=self.file_path,
parameters={**self.parsing_parameters, "attachments_dir": tmpdir},
)
yield from self._split_document(
document_tree=document_tree.to_api_schema().dict(), split=self.split
)

@abstractmethod
def _make_config(self) -> dict:
"""
Make configuration for DedocManager according to the file extension and
parsing parameters.
"""
pass

def _json2txt(self, paragraph: dict) -> str:
"""Get text (recursively) of the document tree node."""
subparagraphs_text = "\n".join(
[
self._json2txt(subparagraph)
for subparagraph in paragraph["subparagraphs"]
]
)
text = (
f"{paragraph['text']}\n{subparagraphs_text}"
if subparagraphs_text
else paragraph["text"]
)
return text

def _parse_subparagraphs(
self, document_tree: dict, document_metadata: dict
) -> Iterator[Document]:
"""Parse recursively document tree obtained by `dedoc`."""
if len(document_tree["subparagraphs"]) > 0:
for subparagraph in document_tree["subparagraphs"]:
yield from self._parse_subparagraphs(
document_tree=subparagraph, document_metadata=document_metadata
)
else:
yield Document(
page_content=document_tree["text"],
metadata={**document_metadata, **document_tree["metadata"]},
)

def _split_document(
self,
document_tree: dict,
split: str,
additional_metadata: Optional[dict] = None,
) -> Iterator[Document]:
"""Split document into parts according to the `split` parameter."""
document_metadata = document_tree["metadata"]
if additional_metadata:
document_metadata = {**document_metadata, **additional_metadata}

if split == "document":
text = self._json2txt(paragraph=document_tree["content"]["structure"])
yield Document(page_content=text, metadata=document_metadata)

elif split == "page":
nodes = document_tree["content"]["structure"]["subparagraphs"]
page_id = nodes[0]["metadata"]["page_id"]
page_text = ""

for node in nodes:
if node["metadata"]["page_id"] == page_id:
page_text += self._json2txt(node)
else:
yield Document(
page_content=page_text,
metadata={**document_metadata, "page_id": page_id},
)
page_id = node["metadata"]["page_id"]
page_text = self._json2txt(node)

yield Document(
page_content=page_text,
metadata={**document_metadata, "page_id": page_id},
)

elif split == "line":
for node in document_tree["content"]["structure"]["subparagraphs"]:
line_metadata = node["metadata"]
yield Document(
page_content=self._json2txt(node),
metadata={**document_metadata, **line_metadata},
)

elif split == "node":
yield from self._parse_subparagraphs(
document_tree=document_tree["content"]["structure"],
document_metadata=document_metadata,
)

else:
raise ValueError(
f"Got {split} for `split`, but should be one of "
f"`{self.valid_split_values}`"
)

if self.with_tables:
for table in document_tree["content"]["tables"]:
table_text, table_html = self._get_table(table)
yield Document(
page_content=table_text,
metadata={
**table["metadata"],
"type": "table",
"text_as_html": table_html,
},
)

for attachment in document_tree["attachments"]:
yield from self._split_document(
document_tree=attachment,
split=self.split,
additional_metadata={"type": "attachment"},
)

def _get_table(self, table: dict) -> Tuple[str, str]:
"""Get text and HTML representation of the table."""
table_text = ""
for row in table["cells"]:
for cell in row:
table_text += " ".join(line["text"] for line in cell["lines"])
table_text += "\t"
table_text += "\n"

table_html = (
'<table border="1" style="border-collapse: collapse; width: 100%;'
'">\n<tbody>\n'
)
for row in table["cells"]:
table_html += "<tr>\n"
for cell in row:
cell_text = "\n".join(line["text"] for line in cell["lines"])
cell_text = html.escape(cell_text)
table_html += "<td"
if cell["invisible"]:
table_html += ' style="display: none" '
table_html += (
f' colspan="{cell["colspan"]}" rowspan='
f'"{cell["rowspan"]}">{cell_text}</td>\n'
)
table_html += "</tr>\n"
table_html += "</tbody>\n</table>"

return table_text, table_html

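The `split == "page"` branch above groups consecutive tree nodes by their `page_id` and flushes a Document whenever the id changes. A toy illustration of that grouping logic on a hand-built sequence; the data is invented for illustration, the real trees come from `DedocManager.parse`:

.. code-block:: python

    from itertools import groupby

    # Invented stand-in for dedoc tree nodes: (page_id, text) pairs in
    # document order, mirroring node["metadata"]["page_id"] / node text.
    nodes = [(1, "intro"), (1, "more intro"), (2, "body"), (3, "end")]

    # Equivalent grouping to the explicit loop in _split_document: groupby
    # only merges consecutive runs, which matches the flush-on-change loop.
    pages = {
        page_id: " ".join(text for _, text in group)
        for page_id, group in groupby(nodes, key=lambda n: n[0])
    }
    assert pages == {1: "intro more intro", 2: "body", 3: "end"}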
class DedocFileLoader(DedocBaseLoader):
"""
DedocFileLoader document loader integration to load files using `dedoc`.

The file loader automatically detects the file type (with the correct extension).
The list of supported file types is given at
https://dedoc.readthedocs.io/en/latest/index.html#id1.
Please see the documentation of DedocBaseLoader to get more details.

Setup:
Install ``dedoc`` package.

.. code-block:: bash

pip install -U dedoc

Instantiate:
.. code-block:: python

from langchain_community.document_loaders import DedocFileLoader

loader = DedocFileLoader(
file_path="example.pdf",
# split=...,
# with_tables=...,
# pdf_with_text_layer=...,
# pages=...,
# ...
)

Load:
.. code-block:: python

docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)

.. code-block:: python

Some text
{
'file_name': 'example.pdf',
'file_type': 'application/pdf',
# ...
}

Lazy load:
.. code-block:: python

docs = []
docs_lazy = loader.lazy_load()

for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)

.. code-block:: python

Some text
{
'file_name': 'example.pdf',
'file_type': 'application/pdf',
# ...
}
"""

def _make_config(self) -> dict:
from dedoc.utils.langchain import make_manager_config

return make_manager_config(
file_path=self.file_path,
parsing_params=self.parsing_parameters,
split=self.split,
)


class DedocAPIFileLoader(DedocBaseLoader):
"""
Load files using `dedoc` API.
The file loader automatically detects the file type (even with the wrong extension).
By default, the loader makes a call to the locally hosted `dedoc` API.
More information about `dedoc` API can be found in `dedoc` documentation:
https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

Please see the documentation of DedocBaseLoader to get more details.

Setup:
You don't need to install the `dedoc` library to use this loader.
Instead, the `dedoc` API needs to be running.
You may use a Docker container for this purpose.
Please see `dedoc` documentation for more details:
https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

.. code-block:: bash

docker pull dedocproject/dedoc
docker run -p 1231:1231 dedocproject/dedoc

Instantiate:
.. code-block:: python

from langchain_community.document_loaders import DedocAPIFileLoader

loader = DedocAPIFileLoader(
file_path="example.pdf",
# url=...,
# split=...,
# with_tables=...,
# pdf_with_text_layer=...,
# pages=...,
# ...
)

Load:
.. code-block:: python

docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)

.. code-block:: python

Some text
{
'file_name': 'example.pdf',
'file_type': 'application/pdf',
# ...
}

Lazy load:
.. code-block:: python

docs = []
docs_lazy = loader.lazy_load()

for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)

.. code-block:: python

Some text
{
'file_name': 'example.pdf',
'file_type': 'application/pdf',
# ...
}
"""

def __init__(
self,
file_path: str,
*,
url: str = "http://0.0.0.0:1231",
split: str = "document",
with_tables: bool = True,
with_attachments: Union[str, bool] = False,
recursion_deep_attachments: int = 10,
pdf_with_text_layer: str = "auto_tabby",
language: str = "rus+eng",
pages: str = ":",
is_one_column_document: str = "auto",
document_orientation: str = "auto",
need_header_footer_analysis: Union[str, bool] = False,
need_binarization: Union[str, bool] = False,
need_pdf_table_analysis: Union[str, bool] = True,
delimiter: Optional[str] = None,
encoding: Optional[str] = None,
) -> None:
"""Initialize with file path, API url and parsing parameters.

Args:
file_path: path to the file for processing
url: URL to call `dedoc` API
split: type of document splitting into parts (each part is returned
separately), default value "document"
"document": document is returned as a single langchain Document object
(don't split)
"page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
"node": split document into tree nodes (title nodes, list item nodes,
raw text nodes)
"line": split document into lines
with_tables: add tables to the result - each table is returned as a single
langchain Document object

Parameters used for document parsing via `dedoc`
(https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

with_attachments: enable attached files extraction
recursion_deep_attachments: recursion level for attached files
extraction, works only when with_attachments==True
pdf_with_text_layer: type of handler for parsing PDF documents,
available options
["true", "false", "tabby", "auto", "auto_tabby" (default)]
language: language of the document for PDF without a textual layer and
images, available options ["eng", "rus", "rus+eng" (default)],
the list of languages can be extended, please see
https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
pages: page slice to define the reading range for parsing PDF documents
is_one_column_document: detect number of columns for PDF without
a textual layer and images, available options
["true", "false", "auto" (default)]
document_orientation: fix document orientation (90, 180, 270 degrees)
for PDF without a textual layer and images, available options
["auto" (default), "no_change"]
need_header_footer_analysis: remove headers and footers from the output
result for parsing PDF and images
need_binarization: clean pages background (binarize) for PDF without a
textual layer and images
need_pdf_table_analysis: parse tables for PDF without a textual layer
and images
delimiter: column separator for CSV, TSV files
encoding: encoding of TXT, CSV, TSV
"""
super().__init__(
file_path=file_path,
split=split,
with_tables=with_tables,
with_attachments=with_attachments,
recursion_deep_attachments=recursion_deep_attachments,
pdf_with_text_layer=pdf_with_text_layer,
language=language,
pages=pages,
is_one_column_document=is_one_column_document,
document_orientation=document_orientation,
need_header_footer_analysis=need_header_footer_analysis,
need_binarization=need_binarization,
need_pdf_table_analysis=need_pdf_table_analysis,
delimiter=delimiter,
encoding=encoding,
)
self.url = url
self.parsing_parameters["return_format"] = "json"

def lazy_load(self) -> Iterator[Document]:
"""Lazily load documents."""
doc_tree = self._send_file(
url=self.url, file_path=self.file_path, parameters=self.parsing_parameters
)
yield from self._split_document(document_tree=doc_tree, split=self.split)

def _make_config(self) -> dict:
return {}

def _send_file(
self, url: str, file_path: str, parameters: dict
) -> Dict[str, Union[list, dict, str]]:
"""Send a POST request to the `dedoc` API and return the results."""
import requests

file_name = os.path.basename(file_path)
with open(file_path, "rb") as file:
files = {"file": (file_name, file)}
r = requests.post(f"{url}/upload", files=files, data=parameters)

if r.status_code != 200:
raise ValueError(f"Error during file handling: {r.content.decode()}")

result = json.loads(r.content.decode())
return result
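With a container from the setup above listening on port 1231, a minimal end-to-end sketch; the file name is illustrative and the host assumes the Docker port mapping shown in the docstring:

.. code-block:: python

    from langchain_community.document_loaders import DedocAPIFileLoader

    loader = DedocAPIFileLoader(
        "report.docx",                # any format the dedoc API supports
        url="http://localhost:1231",  # container published in the setup step
        split="page",
    )
    # lazy_load streams Documents; page splitting adds a page_id to metadata.
    for doc in loader.lazy_load():
        print(doc.metadata.get("page_id"), doc.page_content[:60])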
@@ -7,7 +7,6 @@
# 4. For service accounts visit
# https://cloud.google.com/iam/docs/service-accounts-create

import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union

@@ -108,7 +107,13 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
return v

def _load_credentials(self) -> Any:
"""Load credentials."""
"""Load credentials.
The order of loading credentials:
1. Service account key if file exists
2. Token path (for OAuth Client) if file exists
3. Credentials path (for OAuth Client) if file exists
4. Default credentials; if no credentials are found, raise DefaultCredentialsError
"""
# Adapted from https://developers.google.com/drive/api/v3/quickstart/python
try:
from google.auth import default
@@ -126,30 +131,31 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
)

creds = None
# From service account
if self.service_account_key.exists():
return service_account.Credentials.from_service_account_file(
str(self.service_account_key), scopes=SCOPES
)

# From Oauth Client
if self.token_path.exists():
creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
elif "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
creds, project = default()
creds = creds.with_scopes(SCOPES)
# no need to write to file
if creds:
return creds
else:
elif self.credentials_path.exists():
flow = InstalledAppFlow.from_client_secrets_file(
str(self.credentials_path), SCOPES
)
creds = flow.run_local_server(port=0)
with open(self.token_path, "w") as token:
token.write(creds.to_json())
if creds:
with open(self.token_path, "w") as token:
token.write(creds.to_json())

# From Application Default Credentials
if not creds:
creds, _ = default(scopes=SCOPES)

return creds

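The resolution order documented in the new docstring can be read as a chain of fallbacks; a simplified sketch derived from that 1-4 ordering (paths are illustrative, refresh and error handling elided):

.. code-block:: python

    from pathlib import Path

    def resolve_credentials_order(
        service_account_key: Path, token_path: Path, credentials_path: Path
    ) -> str:
        # Mirrors the docstring's ordering, returning which source would win.
        if service_account_key.exists():
            return "service_account"
        if token_path.exists():
            return "oauth_token"
        if credentials_path.exists():
            return "oauth_client_flow"
        return "application_default"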
@@ -26,6 +26,7 @@ from langchain_core.utils import get_from_dict_or_env

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.dedoc import DedocBaseLoader
from langchain_community.document_loaders.parsers.pdf import (
AmazonTextractPDFParser,
DocumentIntelligenceParser,
@@ -738,6 +739,104 @@ class AmazonTextractPDFLoader(BasePDFLoader):
raise ValueError(f"unsupported mime type: {blob.mimetype}")  # type: ignore[attr-defined]


class DedocPDFLoader(DedocBaseLoader):
"""
DedocPDFLoader document loader integration to load PDF files using `dedoc`.
The file loader can automatically detect the correctness of a textual layer in the
PDF document.
Note that the `__init__` method supports parameters that differ from those of
DedocBaseLoader.

Setup:
Install ``dedoc`` package.

.. code-block:: bash

pip install -U dedoc

Instantiate:
.. code-block:: python

from langchain_community.document_loaders import DedocPDFLoader

loader = DedocPDFLoader(
file_path="example.pdf",
# split=...,
# with_tables=...,
# pdf_with_text_layer=...,
# pages=...,
# ...
)

Load:
.. code-block:: python

docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)

.. code-block:: python

Some text
{
'file_name': 'example.pdf',
'file_type': 'application/pdf',
# ...
}

Lazy load:
.. code-block:: python

docs = []
docs_lazy = loader.lazy_load()

for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)

.. code-block:: python

Some text
{
'file_name': 'example.pdf',
'file_type': 'application/pdf',
# ...
}

Parameters used for document parsing via `dedoc`
(https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html):

with_attachments: enable attached files extraction
recursion_deep_attachments: recursion level for attached files extraction,
works only when with_attachments==True
pdf_with_text_layer: type of handler for parsing, available options
["true", "false", "tabby", "auto", "auto_tabby" (default)]
language: language of the document for PDF without a textual layer,
available options ["eng", "rus", "rus+eng" (default)], the list of
languages can be extended, please see
https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
pages: page slice to define the reading range for parsing
is_one_column_document: detect number of columns for PDF without a textual
layer, available options ["true", "false", "auto" (default)]
document_orientation: fix document orientation (90, 180, 270 degrees) for PDF
without a textual layer, available options ["auto" (default), "no_change"]
need_header_footer_analysis: remove headers and footers from the output result
need_binarization: clean pages background (binarize) for PDF without a textual
layer
need_pdf_table_analysis: parse tables for PDF without a textual layer
"""

def _make_config(self) -> dict:
from dedoc.utils.langchain import make_manager_pdf_config

return make_manager_pdf_config(
file_path=self.file_path,
parsing_params=self.parsing_parameters,
split=self.split,
)


class DocumentIntelligenceLoader(BasePDFLoader):
"""Load a PDF with Azure Document Intelligence"""


@@ -21,7 +21,7 @@ class ToMarkdownLoader(BaseLoader):
) -> Iterator[Document]:
"""Lazily load the file."""
response = requests.post(
"https://2markdown.com/api/2md",
"https://api.2markdown.com/v1/url2md",
headers={"X-Api-Key": self.api_key},
json={"url": self.url},
)

@@ -65,10 +65,10 @@ class AzureOpenAIEmbeddings(OpenAIEmbeddings):
or os.getenv("AZURE_OPENAI_API_KEY")
or os.getenv("OPENAI_API_KEY")
)
values["openai_api_base"] = values["openai_api_base"] or os.getenv(
values["openai_api_base"] = values.get("openai_api_base") or os.getenv(
"OPENAI_API_BASE"
)
values["openai_api_version"] = values["openai_api_version"] or os.getenv(
values["openai_api_version"] = values.get("openai_api_version") or os.getenv(
"OPENAI_API_VERSION", default="2023-05-15"
)
values["openai_api_type"] = get_from_dict_or_env(

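The switch from `values["openai_api_base"]` to `values.get("openai_api_base")` makes the validator tolerant of the key being absent rather than merely None; a minimal illustration of the difference in plain Python:

.. code-block:: python

    values = {}  # key not supplied at all

    # values["openai_api_base"] would raise KeyError here;
    # .get() returns None, so the os.getenv-style fallback still runs.
    api_base = values.get("openai_api_base") or "https://fallback.example"
    assert api_base == "https://fallback.example"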
@@ -29,19 +29,13 @@ class CassandraGraphVectorStore(GraphVectorStore):
embedding: Embeddings,
*,
node_table: str = "graph_nodes",
targets_table: str = "graph_targets",
session: Optional[Session] = None,
keyspace: Optional[str] = None,
setup_mode: SetupMode = SetupMode.SYNC,
):
"""
Create the hybrid graph store.
Parameters configure the ways that edges should be added between
documents. Many take `Union[bool, Set[str]]`, with `False` disabling
inference, `True` enabling it globally between all documents, and a set
of metadata fields defining a scope in which to enable it. Specifically,
passing a set of metadata fields such as `source` only links documents
with the same `source` metadata value.

Args:
embedding: The embeddings to use for the document content.
setup_mode: Mode used to create the Cassandra table (SYNC,
@@ -77,7 +71,6 @@ class CassandraGraphVectorStore(GraphVectorStore):
self.store = graph_store.GraphStore(
embedding=_EmbeddingModelAdapter(embedding),
node_table=node_table,
targets_table=targets_table,
session=session,
keyspace=keyspace,
setup_mode=_setup_mode,

@@ -20,6 +20,9 @@ from langchain_community.graph_vectorstores.extractors.link_extractor import (
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
LinkExtractorAdapter,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_transformer import (  # noqa: E501
LinkExtractorTransformer,
)

__all__ = [
"GLiNERInput",
@@ -34,4 +37,5 @@ __all__ = [
"LinkExtractor",
"LinkExtractorAdapter",
"LinkExtractorAdapter",
"LinkExtractorTransformer",
]

@@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]):
"""Interface for extracting links (incoming, outgoing, bidirectional)."""

@abstractmethod
def extract_one(self, input: InputT) -> set[Link]:  # noqa: A002
def extract_one(self, input: InputT) -> Set[Link]:
"""Add edges from each `input` to the corresponding documents.

Args:

@@ -0,0 +1,43 @@
from typing import Any, Sequence

from langchain_core.documents import Document
from langchain_core.documents.transformers import BaseDocumentTransformer
from langchain_core.graph_vectorstores.links import copy_with_links

from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)


class LinkExtractorTransformer(BaseDocumentTransformer):
"""DocumentTransformer for applying one or more LinkExtractors.

Example:
.. code-block:: python

extract_links = LinkExtractorTransformer([
HtmlLinkExtractor().as_document_extractor(),
])
extract_links.transform_documents(docs)
"""

def __init__(self, link_extractors: Sequence[LinkExtractor[Document]]):
"""Create a DocumentTransformer which adds extracted links to each document."""
self.link_extractors = link_extractors

def transform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
# Implement `transform_documents` directly, so that LinkExtractors which operate
# better in batch (`extract_many`) get a chance to do so.

# Run each extractor over all documents.
links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

# Transpose the list of lists to pair each document with the tuple of links.
links_per_document = zip(*links_per_extractor)

return [
copy_with_links(document, *links)
for document, links in zip(documents, links_per_document)
]
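The transpose step above is worth seeing in isolation: each extractor returns one list of link-sets (one per document), and `zip(*...)` regroups them per document. A toy illustration with plain sets standing in for links:

.. code-block:: python

    # Two "extractors", each producing one result per document (3 docs).
    extractor_a = [{"a1"}, {"a2"}, {"a3"}]
    extractor_b = [{"b1"}, {"b2"}, {"b3"}]
    links_per_extractor = [extractor_a, extractor_b]

    # zip(*) transposes the list of lists: one tuple of link-sets per document.
    links_per_document = list(zip(*links_per_extractor))
    assert links_per_document[0] == ({"a1"}, {"b1"})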
@@ -190,6 +190,7 @@ class Sambaverse(LLM):
"top_p": 1.0,
"repetition_penalty": 1.0,
"top_k": 50,
"process_prompt": False
},
)
"""
@@ -672,7 +673,7 @@ class SambaStudio(LLM):
Example:
.. code-block:: python

from langchain_community.llms.sambanova import Sambaverse
from langchain_community.llms.sambanova import SambaStudio
SambaStudio(
sambastudio_base_url="your-SambaStudio-environment-URL",
sambastudio_base_uri="your-SambaStudio-base-URI",
@@ -687,6 +688,8 @@ class SambaStudio(LLM):
"top_p": 1.0,
"repetition_penalty": 1,
"top_k": 50,
#"process_prompt": False,
#"select_expert": "Meta-Llama-3-8B-Instruct"
},
)
"""
@@ -741,7 +744,7 @@ class SambaStudio(LLM):
values,
"sambastudio_base_uri",
"SAMBASTUDIO_BASE_URI",
default="api/predict/nlp",
default="api/predict/generic",
)
values["sambastudio_project_id"] = get_from_dict_or_env(
values, "sambastudio_project_id", "SAMBASTUDIO_PROJECT_ID"

@@ -92,6 +92,7 @@ if TYPE_CHECKING:
from langchain_community.retrievers.milvus import (
MilvusRetriever,
)
from langchain_community.retrievers.nanopq import NanoPQRetriever
from langchain_community.retrievers.outline import (
OutlineRetriever,
)
@@ -171,6 +172,7 @@ _module_lookup = {
"LlamaIndexRetriever": "langchain_community.retrievers.llama_index",
"MetalRetriever": "langchain_community.retrievers.metal",
"MilvusRetriever": "langchain_community.retrievers.milvus",
"NanoPQRetriever": "langchain_community.retrievers.nanopq",
"OutlineRetriever": "langchain_community.retrievers.outline",
"PineconeHybridSearchRetriever": "langchain_community.retrievers.pinecone_hybrid_search",  # noqa: E501
"PubMedRetriever": "langchain_community.retrievers.pubmed",
@@ -226,6 +228,7 @@ __all__ = [
"LlamaIndexRetriever",
"MetalRetriever",
"MilvusRetriever",
"NanoPQRetriever",
"NeuralDBRetriever",
"OutlineRetriever",
"PineconeHybridSearchRetriever",

125
libs/community/langchain_community/retrievers/nanopq.py
Normal file
@@ -0,0 +1,125 @@
from __future__ import annotations

import concurrent.futures
from typing import Any, Iterable, List, Optional

import numpy as np
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.retrievers import BaseRetriever


def create_index(contexts: List[str], embeddings: Embeddings) -> np.ndarray:
"""
Create an index of embeddings for a list of contexts.

Args:
contexts: List of contexts to embed.
embeddings: Embeddings model to use.

Returns:
Index of embeddings.
"""
with concurrent.futures.ThreadPoolExecutor() as executor:
return np.array(list(executor.map(embeddings.embed_query, contexts)))


class NanoPQRetriever(BaseRetriever):
"""`NanoPQ` retriever."""

embeddings: Embeddings
"""Embeddings model to use."""
index: Any
"""Index of embeddings."""
texts: List[str]
"""List of texts to index."""
metadatas: Optional[List[dict]] = None
"""List of metadatas corresponding with each text."""
k: int = 4
"""Number of results to return."""
relevancy_threshold: Optional[float] = None
"""Threshold for relevancy."""
subspace: int = 4
"""Number of subspaces to create; the embedding dimension should be divisible by this value."""
clusters: int = 128
"""Number of clusters to create."""

class Config:
"""Configuration for this pydantic object."""

arbitrary_types_allowed = True

@classmethod
def from_texts(
cls,
texts: List[str],
embeddings: Embeddings,
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> NanoPQRetriever:
index = create_index(texts, embeddings)
return cls(
embeddings=embeddings,
index=index,
texts=texts,
metadatas=metadatas,
**kwargs,
)

@classmethod
def from_documents(
cls,
documents: Iterable[Document],
embeddings: Embeddings,
**kwargs: Any,
) -> NanoPQRetriever:
texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
return cls.from_texts(
texts=texts, embeddings=embeddings, metadatas=metadatas, **kwargs
)

def _get_relevant_documents(
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
) -> List[Document]:
try:
from nanopq import PQ
except ImportError:
raise ImportError(
"Could not import nanopq, please install with `pip install nanopq`."
)

query_embeds = np.array(self.embeddings.embed_query(query))
try:
pq = PQ(M=self.subspace, Ks=self.clusters, verbose=True).fit(
self.index.astype("float32")
)
except AssertionError:
error_message = (
"Received params: training_sample={training_sample}, "
"n_cluster={n_clusters}, subspace={subspace}, "
"embedding_shape={embedding_shape}. Issue with the combination. "
"Please trace back to find the exact error"
).format(
training_sample=self.index.shape[0],
n_clusters=self.clusters,
subspace=self.subspace,
embedding_shape=self.index.shape[1],
)
raise RuntimeError(error_message)

index_code = pq.encode(vecs=self.index.astype("float32"))
dt = pq.dtable(query=query_embeds.astype("float32"))
dists = dt.adist(codes=index_code)

sorted_ix = np.argsort(dists)

top_k_results = [
Document(
page_content=self.texts[row],
metadata=self.metadatas[row] if self.metadatas else {},
)
for row in sorted_ix[0 : self.k]
]

return top_k_results
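A minimal usage sketch of the retriever above. FakeEmbeddings is a langchain_community test helper that produces random vectors, so the results are arbitrary; the parameters are chosen so the embedding size is divisible by the default `subspace=4` and `clusters` does not exceed the number of training vectors:

.. code-block:: python

    from langchain_community.embeddings import FakeEmbeddings
    from langchain_community.retrievers import NanoPQRetriever

    texts = ["hello world", "goodbye world", "hello python"] * 20  # 60 samples
    retriever = NanoPQRetriever.from_texts(
        texts,
        FakeEmbeddings(size=64),  # 64 is divisible by subspace=4
        clusters=16,              # must not exceed the training-sample count
    )
    docs = retriever.invoke("hello")  # shown only to exercise the API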
File diff suppressed because one or more lines are too long
@@ -306,6 +306,27 @@ class AzureCosmosDBVectorSearch(VectorStore):
}
return command

def create_filter_index(
self,
property_to_filter: str,
index_name: str,
) -> dict[str, Any]:
command = {
"createIndexes": self._collection.name,
"indexes": [
{
"key": {property_to_filter: 1},
"name": index_name,
}
],
}
# retrieve the database object
current_database = self._collection.database

# invoke the command from the database object
create_index_responses: dict[str, Any] = current_database.command(command)
return create_index_responses

def add_texts(
self,
texts: Iterable[str],

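`create_filter_index` issues a raw MongoDB `createIndexes` command against the vCore collection so that later `pre_filter` queries can use the indexed path. A hedged usage sketch; store construction is elided and the metadata path is illustrative:

.. code-block:: python

    # Assuming `vector_store` is an initialized AzureCosmosDBVectorSearch.
    response = vector_store.create_filter_index(
        property_to_filter="metadata.source",  # illustrative metadata path
        index_name="metadata_source_index",
    )
    # The driver returns the raw command response, e.g. {"ok": 1.0, ...}.
    print(response)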
@@ -345,7 +366,7 @@ class AzureCosmosDBVectorSearch(VectorStore):
# Embed and create the documents
embeddings = self._embedding.embed_documents(texts)
to_insert = [
{self._text_key: t, self._embedding_key: embedding, **m}
{self._text_key: t, self._embedding_key: embedding, "metadata": m}
for t, m, embedding in zip(texts, metadatas, embeddings)
]
# insert the documents in Cosmos DB
@@ -397,8 +418,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
embeddings: List[float],
k: int = 4,
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
pre_filter: Optional[Dict] = None,
ef_search: int = 40,
score_threshold: float = 0.0,
with_embedding: bool = False,
) -> List[Tuple[Document, float]]:
"""Returns a list of documents with their scores

@@ -422,9 +445,11 @@ class AzureCosmosDBVectorSearch(VectorStore):
"""
pipeline: List[dict[str, Any]] = []
if kind == CosmosDBVectorSearchType.VECTOR_IVF:
pipeline = self._get_pipeline_vector_ivf(embeddings, k)
pipeline = self._get_pipeline_vector_ivf(embeddings, k, pre_filter)
elif kind == CosmosDBVectorSearchType.VECTOR_HNSW:
pipeline = self._get_pipeline_vector_hnsw(embeddings, k, ef_search)
pipeline = self._get_pipeline_vector_hnsw(
embeddings, k, ef_search, pre_filter
)

cursor = self._collection.aggregate(pipeline)

@@ -433,28 +458,32 @@ class AzureCosmosDBVectorSearch(VectorStore):
score = res.pop("similarityScore")
if score < score_threshold:
continue
document_object_field = (
res.pop("document")
if kind == CosmosDBVectorSearchType.VECTOR_IVF
else res
)
document_object_field = res.pop("document")
text = document_object_field.pop(self._text_key)
docs.append(
(Document(page_content=text, metadata=document_object_field), score)
)
metadata = document_object_field.pop("metadata")
if with_embedding:
metadata[self._embedding_key] = document_object_field.pop(
self._embedding_key
)

docs.append((Document(page_content=text, metadata=metadata), score))
return docs

def _get_pipeline_vector_ivf(
self, embeddings: List[float], k: int = 4
self, embeddings: List[float], k: int = 4, pre_filter: Optional[Dict] = None
) -> List[dict[str, Any]]:
params = {
"vector": embeddings,
"path": self._embedding_key,
"k": k,
}
if pre_filter:
params["filter"] = pre_filter

pipeline: List[dict[str, Any]] = [
{
"$search": {
"cosmosSearch": {
"vector": embeddings,
"path": self._embedding_key,
"k": k,
},
"cosmosSearch": params,
"returnStoredSource": True,
}
},
@@ -468,17 +497,25 @@ class AzureCosmosDBVectorSearch(VectorStore):
return pipeline

def _get_pipeline_vector_hnsw(
self, embeddings: List[float], k: int = 4, ef_search: int = 40
self,
embeddings: List[float],
k: int = 4,
ef_search: int = 40,
pre_filter: Optional[Dict] = None,
) -> List[dict[str, Any]]:
params = {
"vector": embeddings,
"path": self._embedding_key,
"k": k,
"efSearch": ef_search,
}
if pre_filter:
params["filter"] = pre_filter

pipeline: List[dict[str, Any]] = [
{
"$search": {
"cosmosSearch": {
"vector": embeddings,
"path": self._embedding_key,
"k": k,
"efSearch": ef_search,
},
"cosmosSearch": params,
}
},
{
@@ -495,16 +532,20 @@ class AzureCosmosDBVectorSearch(VectorStore):
query: str,
k: int = 4,
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
pre_filter: Optional[Dict] = None,
ef_search: int = 40,
score_threshold: float = 0.0,
with_embedding: bool = False,
) -> List[Tuple[Document, float]]:
embeddings = self._embedding.embed_query(query)
docs = self._similarity_search_with_score(
embeddings=embeddings,
k=k,
kind=kind,
pre_filter=pre_filter,
ef_search=ef_search,
score_threshold=score_threshold,
with_embedding=with_embedding,
)
return docs

@@ -513,16 +554,20 @@ class AzureCosmosDBVectorSearch(VectorStore):
query: str,
k: int = 4,
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
pre_filter: Optional[Dict] = None,
ef_search: int = 40,
score_threshold: float = 0.0,
with_embedding: bool = False,
**kwargs: Any,
) -> List[Document]:
docs_and_scores = self.similarity_search_with_score(
query,
k=k,
kind=kind,
pre_filter=pre_filter,
ef_search=ef_search,
score_threshold=score_threshold,
with_embedding=with_embedding,
)
return [doc for doc, _ in docs_and_scores]

@@ -533,8 +578,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
pre_filter: Optional[Dict] = None,
ef_search: int = 40,
score_threshold: float = 0.0,
with_embedding: bool = False,
**kwargs: Any,
) -> List[Document]:
# Retrieves the docs with similarity scores
@@ -543,8 +590,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
embedding,
k=fetch_k,
kind=kind,
pre_filter=pre_filter,
ef_search=ef_search,
score_threshold=score_threshold,
with_embedding=with_embedding,
)

# Re-ranks the docs using MMR
@@ -564,8 +613,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
fetch_k: int = 20,
lambda_mult: float = 0.5,
kind: CosmosDBVectorSearchType = CosmosDBVectorSearchType.VECTOR_IVF,
pre_filter: Optional[Dict] = None,
ef_search: int = 40,
score_threshold: float = 0.0,
with_embedding: bool = False,
**kwargs: Any,
) -> List[Document]:
# compute the embeddings vector from the query string
@@ -577,8 +628,10 @@ class AzureCosmosDBVectorSearch(VectorStore):
fetch_k=fetch_k,
lambda_mult=lambda_mult,
kind=kind,
pre_filter=pre_filter,
ef_search=ef_search,
score_threshold=score_threshold,
with_embedding=with_embedding,
)
return docs


@@ -162,7 +162,12 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
text_key = "text"

to_insert = [
{"id": str(uuid.uuid4()), text_key: t, self._embedding_key: embedding, **m}
{
"id": str(uuid.uuid4()),
text_key: t,
self._embedding_key: embedding,
"metadata": m,
}
for t, m, embedding in zip(texts, metadatas, embeddings)
]
# insert the documents in CosmosDB No Sql
@@ -184,6 +189,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
cosmos_database_properties: Dict[str, Any],
database_name: str = "vectorSearchDB",
container_name: str = "vectorSearchContainer",
create_container: bool = True,
**kwargs: Any,
) -> AzureCosmosDBNoSqlVectorSearch:
if kwargs:
@@ -204,6 +210,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
cosmos_database_properties=cosmos_database_properties,
database_name=database_name,
container_name=container_name,
create_container=create_container,
)

@classmethod
@@ -257,41 +264,83 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
self,
embeddings: List[float],
k: int = 4,
pre_filter: Optional[Dict] = None,
with_embedding: bool = False,
) -> List[Tuple[Document, float]]:
query = (
"SELECT TOP {} c.id, c.{}, c.text, VectorDistance(c.{}, {}) AS "
"SimilarityScore FROM c ORDER BY VectorDistance(c.{}, {})".format(
k,
self._embedding_key,
self._embedding_key,
embeddings,
self._embedding_key,
embeddings,
)
query = "SELECT "

# If limit_offset_clause is not specified, add TOP clause
if pre_filter is None or pre_filter.get("limit_offset_clause") is None:
query += "TOP @limit "

query += (
"c.id, c.{}, c.text, c.metadata, "
"VectorDistance(c.@embeddingKey, @embeddings) AS SimilarityScore FROM c"
)

# Add where_clause if specified
if pre_filter is not None and pre_filter.get("where_clause") is not None:
query += " {}".format(pre_filter["where_clause"])

query += " ORDER BY VectorDistance(c.@embeddingKey, @embeddings)"

# Add limit_offset_clause if specified
if pre_filter is not None and pre_filter.get("limit_offset_clause") is not None:
query += " {}".format(pre_filter["limit_offset_clause"])
parameters = [
{"name": "@limit", "value": k},
{"name": "@embeddingKey", "value": self._embedding_key},
{"name": "@embeddings", "value": embeddings},
]

docs_and_scores = []

items = list(
self._container.query_items(query=query, enable_cross_partition_query=True)
self._container.query_items(
query=query, parameters=parameters, enable_cross_partition_query=True
)
)
for item in items:
text = item["text"]
metadata = item["metadata"]
score = item["SimilarityScore"]
docs_and_scores.append((Document(page_content=text, metadata=item), score))
if with_embedding:
metadata[self._embedding_key] = item[self._embedding_key]
docs_and_scores.append(
(Document(page_content=text, metadata=metadata), score)
)
return docs_and_scores

def similarity_search_with_score(
self,
query: str,
k: int = 4,
pre_filter: Optional[Dict] = None,
with_embedding: bool = False,
) -> List[Tuple[Document, float]]:
embeddings = self._embedding.embed_query(query)
docs_and_scores = self._similarity_search_with_score(embeddings=embeddings, k=k)
docs_and_scores = self._similarity_search_with_score(
embeddings=embeddings,
k=k,
pre_filter=pre_filter,
with_embedding=with_embedding,
)
return docs_and_scores

def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
self,
query: str,
k: int = 4,
pre_filter: Optional[Dict] = None,
with_embedding: bool = False,
**kwargs: Any,
) -> List[Document]:
docs_and_scores = self.similarity_search_with_score(query, k=k)
docs_and_scores = self.similarity_search_with_score(
query,
k=k,
pre_filter=pre_filter,
with_embedding=with_embedding,
)

return [doc for doc, _ in docs_and_scores]

@@ -304,7 +353,18 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
**kwargs: Any,
) -> List[Document]:
# Retrieves the docs with similarity scores
docs = self._similarity_search_with_score(embeddings=embedding, k=fetch_k)
pre_filter = {}
with_embedding = False
if kwargs["pre_filter"]:
pre_filter = kwargs["pre_filter"]
if kwargs["with_embedding"]:
with_embedding = kwargs["with_embedding"]
docs = self._similarity_search_with_score(
embeddings=embedding,
k=fetch_k,
pre_filter=pre_filter,
with_embedding=with_embedding,
)

# Re-ranks the docs using MMR
mmr_doc_indexes = maximal_marginal_relevance(
@@ -326,6 +386,12 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
**kwargs: Any,
) -> List[Document]:
# compute the embeddings vector from the query string
pre_filter = {}
with_embedding = False
if kwargs["pre_filter"]:
pre_filter = kwargs["pre_filter"]
if kwargs["with_embedding"]:
with_embedding = kwargs["with_embedding"]
embeddings = self._embedding.embed_query(query)

docs = self.max_marginal_relevance_search_by_vector(
@@ -333,5 +399,7 @@ class AzureCosmosDBNoSqlVectorSearch(VectorStore):
k=k,
fetch_k=fetch_k,
lambda_mult=lambda_mult,
pre_filter=pre_filter,
with_embedding=with_embedding,
)
return docs

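The NoSQL query builder above expects `pre_filter` to carry raw Cosmos SQL fragments under two optional keys; a hedged sketch of the resulting call, assuming an initialized store and illustrative clause contents:

.. code-block:: python

    pre_filter = {
        # Appended verbatim after "... FROM c".
        "where_clause": "WHERE c.metadata.source = 'report.pdf'",
        # When present, the TOP @limit clause is skipped in its favor.
        "limit_offset_clause": "OFFSET 0 LIMIT 5",
    }
    docs = vector_store.similarity_search(  # assuming an initialized store
        "quarterly revenue", k=5, pre_filter=pre_filter
    )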
8
libs/community/poetry.lock
generated
@@ -2117,7 +2117,7 @@ files = [

[[package]]
name = "langchain"
version = "0.2.9"
version = "0.2.10"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.8.1,<4.0"
@@ -2127,7 +2127,7 @@ develop = true
[package.dependencies]
aiohttp = "^3.8.3"
async-timeout = {version = "^4.0.0", markers = "python_version < \"3.11\""}
langchain-core = "^0.2.20"
langchain-core = "^0.2.22"
langchain-text-splitters = "^0.2.0"
langsmith = "^0.1.17"
numpy = [
@@ -2146,7 +2146,7 @@ url = "../langchain"

[[package]]
name = "langchain-core"
version = "0.2.22"
version = "0.2.23"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.8.1,<4.0"
@@ -5759,4 +5759,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "7755e46a5239dc88c52f49ac337d573090728fc8a3b1f3465fcd4a9f18c5aaec"
content-hash = "14d60e1f61fa9c0ba69cb4e227e4af3de395a8dd4a53b121fe488e7b9f75ea66"

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "langchain-community"
version = "0.2.9"
version = "0.2.10"
description = "Community contributed LangChain integrations."
authors = []
license = "MIT"
@@ -24,9 +24,13 @@ skip = ".git,*.pdf,*.svg,*.pdf,*.yaml,*.ipynb,poetry.lock,*.min.js,*.css,package
ignore-regex = ".*(Stati Uniti|Tense=Pres).*"
ignore-words-list = "momento,collison,ned,foor,reworkd,parth,whats,aapply,mysogyny,unsecure,damon,crate,aadd,symbl,precesses,accademia,nin,cann"

[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/community"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-community%3D%3D0%22&expanded=true"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langchain-core = "^0.2.22"
langchain-core = "^0.2.23"
langchain = "^0.2.9"
SQLAlchemy = ">=1.4,<3"
requests = "^2"

@@ -13,6 +13,30 @@ fi

repository_path="$1"

# Check that we are not using features that cannot be captured via init.
# pre-init is a custom decorator that we introduced to capture the same semantics
# as @root_validator(pre=False, skip_on_failure=False) available in pydantic 1.
count=$(git grep -E '(@root_validator)|(@validator)|(@pre_init)' -- "*.py" | wc -l)
# PRs that increase the current count will not be accepted.
# PRs that update the code in the repository
# and decrease the count are welcome!
current_count=336

if [ "$count" -gt "$current_count" ]; then
echo "The PR seems to be introducing new usage of @root_validator and/or @field_validator."
echo "git grep -E '(@root_validator)|(@validator)' | wc -l returned $count"
echo "whereas the expected count should be equal or less than $current_count"
echo "Please update the code to instead use __init__"
echo "For examples, please see: "
echo "https://gist.github.com/eyurtsev/d1dcba10c2f35626e302f1b98a0f5a3c "
echo "This linter is here to make sure that it's easier to upgrade pydantic in the future."
exit 1
elif [ "$count" -lt "$current_count" ]; then
echo "Please update the $current_count variable in ./scripts/check_pydantic.sh to $count"
exit 1
fi


# Search for lines matching the pattern within the specified repository
result=$(git -C "$repository_path" grep -En '^import pydantic|^from pydantic')

@@ -0,0 +1,146 @@
import os
from pathlib import Path

from langchain_community.document_loaders import (
    DedocAPIFileLoader,
    DedocFileLoader,
    DedocPDFLoader,
)

EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")

FILE_NAMES = [
    "example.html",
    "example.json",
    "fake-email-attachment.eml",
    "layout-parser-paper.pdf",
    "slack_export.zip",
    "stanley-cups.csv",
    "stanley-cups.xlsx",
    "whatsapp_chat.txt",
]


def test_dedoc_file_loader() -> None:
    for file_name in FILE_NAMES:
        file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
        loader = DedocFileLoader(
            file_path,
            split="document",
            with_tables=False,
            pdf_with_text_layer="tabby",
            pages=":1",
        )
        docs = loader.load()

        assert len(docs) == 1


def test_dedoc_pdf_loader() -> None:
    file_name = "layout-parser-paper.pdf"
    for mode in ("true", "tabby"):
        file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
        loader = DedocPDFLoader(
            file_path,
            split="document",
            with_tables=False,
            pdf_with_text_layer=mode,
            pages=":1",
        )
        docs = loader.load()

        assert len(docs) == 1


def test_dedoc_content_html() -> None:
    file_name = "example.html"
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
    loader = DedocFileLoader(
        file_path,
        split="line",
        with_tables=False,
    )
    docs = loader.load()

    assert docs[0].metadata["file_name"] == "example.html"
    assert docs[0].metadata["file_type"] == "text/html"
    assert "Instead of drinking water from the cat bowl" in docs[0].page_content
    assert "Chase the red dot" not in docs[0].page_content


def test_dedoc_content_pdf() -> None:
    file_name = "layout-parser-paper.pdf"
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
    loader = DedocFileLoader(
        file_path, split="page", pdf_with_text_layer="tabby", pages=":5"
    )
    docs = loader.load()
    table_list = [item for item in docs if item.metadata.get("type", "") == "table"]

    assert len(docs) == 6
    assert docs[0].metadata["file_name"] == "layout-parser-paper.pdf"
    assert docs[0].metadata["file_type"] == "application/pdf"
    assert "This paper introduces LayoutParser, an open-source" in docs[0].page_content
    assert "layout detection [38, 22], table detection [26]" in docs[1].page_content
    assert "LayoutParser: A Unified Toolkit for DL-Based DIA" in docs[2].page_content
    assert len(table_list) > 0
    assert (
        '\n<tbody>\n<tr>\n<td colspan="1" rowspan="1">'
        in table_list[0].metadata["text_as_html"]
    )


def test_dedoc_content_json() -> None:
    file_name = "example.json"
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
    loader = DedocFileLoader(file_path, split="node")
    docs = loader.load()

    assert len(docs) == 11
    assert docs[0].metadata["file_name"] == "example.json"
    assert docs[0].metadata["file_type"] == "application/json"
    assert "Bye!" in docs[0].page_content


def test_dedoc_content_txt() -> None:
    file_name = "whatsapp_chat.txt"
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
    loader = DedocFileLoader(file_path, split="line")
    docs = loader.load()

    assert len(docs) == 10
    assert docs[0].metadata["file_name"] == "whatsapp_chat.txt"
    assert docs[0].metadata["file_type"] == "text/plain"
    assert "[05.05.23, 15:48:11] James: Hi here" in docs[0].page_content
    assert "[11/8/21, 9:41:32 AM] User name: Message 123" in docs[1].page_content
    assert "1/23/23, 3:19 AM - User 2: Bye!" in docs[2].page_content


def test_dedoc_table_handling() -> None:
    file_name = "stanley-cups.csv"
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
    loader = DedocFileLoader(file_path, split="document")
    docs = loader.load()

    assert len(docs) == 2
    assert docs[0].metadata["file_name"] == "stanley-cups.csv"
    assert docs[0].metadata["file_type"] == "text/csv"
    assert docs[1].metadata["type"] == "table"
    assert '<td colspan="1" rowspan="1">1</td>' in docs[1].metadata["text_as_html"]
    assert "Maple Leafs\tTOR\t13" in docs[1].page_content


def test_dedoc_api_file_loader() -> None:
    file_name = "whatsapp_chat.txt"
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
    loader = DedocAPIFileLoader(
        file_path, split="line", url="https://dedoc-readme.hf.space"
    )
    docs = loader.load()

    assert len(docs) == 10
    assert docs[0].metadata["file_name"] == "whatsapp_chat.txt"
    assert docs[0].metadata["file_type"] == "text/plain"
    assert "[05.05.23, 15:48:11] James: Hi here" in docs[0].page_content
    assert "[11/8/21, 9:41:32 AM] User name: Message 123" in docs[1].page_content
    assert "1/23/23, 3:19 AM - User 2: Bye!" in docs[2].page_content
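
For reference, a minimal standalone use of the loader exercised above — a sketch only, assuming the dedoc package is installed and an example.html file is on disk:

from langchain_community.document_loaders import DedocFileLoader

loader = DedocFileLoader("example.html", split="document", with_tables=False)
docs = loader.load()
print(docs[0].metadata["file_type"])  # "text/html", per the assertions above
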
@@ -25,7 +25,7 @@ model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
INDEX_NAME = "langchain-test-index"
INDEX_NAME_VECTOR_HNSW = "langchain-test-index-hnsw"
NAMESPACE = "langchain_test_db.langchain_test_collection"
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
CONNECTION_STRING: str = "mongodb+srv://akataria:Basket24ball@akataria-vector-search-testing.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")

num_lists = 3
@@ -104,6 +104,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
            ),
            indexing_policy=get_vector_indexing_policy("flat"),
            cosmos_container_properties={"partition_key": partition_key},
            cosmos_database_properties={},
        )
        sleep(1)  # waits for Cosmos DB to save contents to the collection

@@ -139,6 +140,7 @@ class TestAzureCosmosDBNoSqlVectorSearch:
            ),
            indexing_policy=get_vector_indexing_policy("flat"),
            cosmos_container_properties={"partition_key": partition_key},
            cosmos_database_properties={},
        )
        sleep(1)  # waits for Cosmos DB to save contents to the collection
@@ -154,3 +156,60 @@
        assert output2
        assert output2[0].page_content != "Dogs are tough."
        safe_delete_database(cosmos_client)

    def test_from_documents_cosine_distance_with_filtering(
        self,
        cosmos_client: Any,
        partition_key: Any,
        azure_openai_embeddings: OpenAIEmbeddings,
    ) -> None:
        """Test end to end construction and search."""
        documents = [
            Document(page_content="Dogs are tough.", metadata={"a": 1}),
            Document(page_content="Cats have fluff.", metadata={"a": 1}),
            Document(page_content="What is a sandwich?", metadata={"c": 1}),
            Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
        ]

        store = AzureCosmosDBNoSqlVectorSearch.from_documents(
            documents,
            azure_openai_embeddings,
            cosmos_client=cosmos_client,
            database_name=database_name,
            container_name=container_name,
            vector_embedding_policy=get_vector_embedding_policy(
                "cosine", "float32", 400
            ),
            indexing_policy=get_vector_indexing_policy("flat"),
            cosmos_container_properties={"partition_key": partition_key},
            cosmos_database_properties={},
        )
        sleep(1)  # waits for Cosmos DB to save contents to the collection

        output = store.similarity_search("Dogs", k=4)
        assert len(output) == 4
        assert output[0].page_content == "Dogs are tough."
        assert output[0].metadata["a"] == 1

        pre_filter = {
            "where_clause": "WHERE c.metadata.a=1",
        }
        output = store.similarity_search(
            "Dogs", k=4, pre_filter=pre_filter, with_embedding=True
        )

        assert len(output) == 2
        assert output[0].page_content == "Dogs are tough."
        assert output[0].metadata["a"] == 1

        pre_filter = {
            "where_clause": "WHERE c.metadata.a=1",
            "limit_offset_clause": "OFFSET 0 LIMIT 1",
        }

        output = store.similarity_search("Dogs", k=4, pre_filter=pre_filter)

        assert len(output) == 1
        assert output[0].page_content == "Dogs are tough."
        assert output[0].metadata["a"] == 1
        safe_delete_database(cosmos_client)
@@ -3,12 +3,16 @@
from typing import cast

import pytest
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
from langchain_core.pydantic_v1 import SecretStr
from pytest import CaptureFixture

from langchain_community.chat_models import ChatPremAI
from langchain_community.chat_models.premai import _messages_to_prompt_dict
from langchain_community.chat_models.premai import (
    SINGLE_TOOL_PROMPT_TEMPLATE,
    TOOL_PROMPT_HEADER,
    _messages_to_prompt_dict,
)


@pytest.mark.requires("premai")
@@ -36,13 +40,20 @@ def test_messages_to_prompt_dict_with_valid_messages() -> None:
            AIMessage(content="AI message #1"),
            HumanMessage(content="User message #2"),
            AIMessage(content="AI message #2"),
            ToolMessage(content="Tool Message #1", tool_call_id="test_tool"),
            AIMessage(content="AI message #3"),
        ]
    )
    expected_tool_message = SINGLE_TOOL_PROMPT_TEMPLATE.format(
        tool_id="test_tool", tool_response="Tool Message #1"
    )
    expected = [
        {"role": "user", "content": "User message #1"},
        {"role": "assistant", "content": "AI message #1"},
        {"role": "user", "content": "User message #2"},
        {"role": "assistant", "content": "AI message #2"},
        {"role": "assistant", "content": "AI message #3"},
        {"role": "user", "content": TOOL_PROMPT_HEADER + expected_tool_message},
    ]

    assert system_message == "System Prompt"
@@ -51,6 +51,9 @@ EXPECTED_ALL = [
    "CubeSemanticLoader",
    "DataFrameLoader",
    "DatadogLogsLoader",
    "DedocAPIFileLoader",
    "DedocFileLoader",
    "DedocPDFLoader",
    "PebbloSafeLoader",
    "DiffbotLoader",
    "DirectoryLoader",
@@ -0,0 +1,92 @@
from typing import Set

from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link, get_links

from langchain_community.graph_vectorstores.extractors import (
    LinkExtractor,
    LinkExtractorTransformer,
)

TEXT1 = "Text1"
TEXT2 = "Text2"


class FakeKeywordExtractor(LinkExtractor[Document]):
    def extract_one(self, input: Document) -> Set[Link]:
        kws: Set[str] = set()
        if input.page_content == TEXT1:
            kws = {"a", "b"}
        elif input.page_content == TEXT2:
            kws = {"b", "c"}

        return {Link.bidir(kind="fakekw", tag=kw) for kw in kws}


class FakeHyperlinkExtractor(LinkExtractor[Document]):
    def extract_one(self, input: Document) -> Set[Link]:
        if input.page_content == TEXT1:
            return {
                Link.incoming(kind="fakehref", tag="http://text1"),
                Link.outgoing(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        elif input.page_content == TEXT2:
            return {
                Link.incoming(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        else:
            raise ValueError(
                f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'"
            )


def test_one_extractor() -> None:
    transformer = LinkExtractorTransformer(
        [
            FakeKeywordExtractor(),
        ]
    )
    doc1 = Document(TEXT1)
    doc2 = Document(TEXT2)
    results = transformer.transform_documents([doc1, doc2])

    assert set(get_links(results[0])) == {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
    }

    assert set(get_links(results[1])) == {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
    }


def test_multiple_extractors() -> None:
    transformer = LinkExtractorTransformer(
        [
            FakeKeywordExtractor(),
            FakeHyperlinkExtractor(),
        ]
    )

    doc1 = Document(TEXT1)
    doc2 = Document(TEXT2)

    results = transformer.transform_documents([doc1, doc2])

    assert set(get_links(results[0])) == {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
        Link.incoming(kind="fakehref", tag="http://text1"),
        Link.outgoing(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }

    assert set(get_links(results[1])) == {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
        Link.incoming(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }
@@ -25,6 +25,7 @@ EXPECTED_ALL = [
    "LlamaIndexRetriever",
    "MetalRetriever",
    "MilvusRetriever",
    "NanoPQRetriever",
    "OutlineRetriever",
    "PineconeHybridSearchRetriever",
    "PubMedRetriever",
41 libs/community/tests/unit_tests/retrievers/test_nanopq.py Normal file
@@ -0,0 +1,41 @@
import pytest
from langchain_core.documents import Document

from langchain_community.embeddings import FakeEmbeddings
from langchain_community.retrievers import NanoPQRetriever


class TestNanoPQRetriever:
    @pytest.mark.requires("nanopq")
    def test_from_texts(self) -> None:
        input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]
        pq_retriever = NanoPQRetriever.from_texts(
            texts=input_texts, embeddings=FakeEmbeddings(size=100)
        )
        assert len(pq_retriever.texts) == 3

    @pytest.mark.requires("nanopq")
    def test_from_documents(self) -> None:
        input_docs = [
            Document(page_content="I have a pen.", metadata={"page": 1}),
            Document(page_content="Do you have a pen?", metadata={"page": 2}),
            Document(page_content="I have a bag.", metadata={"page": 3}),
        ]
        pq_retriever = NanoPQRetriever.from_documents(
            documents=input_docs, embeddings=FakeEmbeddings(size=100)
        )
        assert pq_retriever.texts == [
            "I have a pen.",
            "Do you have a pen?",
            "I have a bag.",
        ]
        assert pq_retriever.metadatas == [{"page": 1}, {"page": 2}, {"page": 3}]

    @pytest.mark.requires("nanopq")
    def test_invalid_subspace_error(self) -> None:
        input_texts = ["I have a pen.", "Do you have a pen?", "I have a bag."]
        pq_retriever = NanoPQRetriever.from_texts(
            texts=input_texts, embeddings=FakeEmbeddings(size=43)
        )
        with pytest.raises(RuntimeError):
            pq_retriever.invoke("I have")
@@ -77,7 +77,7 @@ def get_verbose() -> bool:
        # In the meantime, the `verbose` setting is considered True if either the old
        # or the new value are True. This accommodates users who haven't migrated
        # to using `set_verbose()` yet. Those users are getting deprecation warnings
        # directing them to use `set_verbose()` when they import `langhchain.verbose`.
        # directing them to use `set_verbose()` when they import `langchain.verbose`.
        old_verbose = langchain.verbose
    except ImportError:
        old_verbose = False
@@ -142,7 +142,7 @@ def get_debug() -> bool:
        # In the meantime, the `debug` setting is considered True if either the old
        # or the new value are True. This accommodates users who haven't migrated
        # to using `set_debug()` yet. Those users are getting deprecation warnings
        # directing them to use `set_debug()` when they import `langhchain.debug`.
        # directing them to use `set_debug()` when they import `langchain.debug`.
        old_debug = langchain.debug
    except ImportError:
        old_debug = False
@@ -213,7 +213,7 @@ def get_llm_cache() -> "BaseCache":
        # or the old value if both are falsy. This accommodates users
        # who haven't migrated to using `set_llm_cache()` yet.
        # Those users are getting deprecation warnings directing them
        # to use `set_llm_cache()` when they import `langhchain.llm_cache`.
        # to use `set_llm_cache()` when they import `langchain.llm_cache`.
        old_llm_cache = langchain.llm_cache
    except ImportError:
        old_llm_cache = None
|
||||
"""
|
||||
|
||||
kind: str
|
||||
"""The kind of link. Allows different extractors to use the same tag name without
|
||||
"""The kind of link. Allows different extractors to use the same tag name without
|
||||
creating collisions between extractors. For example “keyword” vs “url”."""
|
||||
direction: Literal["in", "out", "bidir"]
|
||||
"""The direction of the link."""
|
||||
@@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
|
||||
links_in_metadata.extend(link)
|
||||
else:
|
||||
links_in_metadata.append(link)
|
||||
|
||||
|
||||
def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
|
||||
"""Return a document with the given links added.
|
||||
|
||||
Args:
|
||||
doc: The document to add the links to.
|
||||
*links: The links to add to the document.
|
||||
|
||||
Returns:
|
||||
A document with a shallow-copy of the metadata with the links added.
|
||||
"""
|
||||
new_links = set(get_links(doc))
|
||||
for link in links:
|
||||
if isinstance(link, Iterable):
|
||||
new_links.update(link)
|
||||
else:
|
||||
new_links.add(link)
|
||||
|
||||
return Document(
|
||||
page_content=doc.page_content,
|
||||
metadata={
|
||||
**doc.metadata,
|
||||
METADATA_LINKS_KEY: list(new_links),
|
||||
},
|
||||
)
|
||||
|
||||
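
A usage sketch for the helper added above, assuming Link, get_links, and copy_with_links are all importable from langchain_core.graph_vectorstores.links as this hunk suggests:

from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link, copy_with_links, get_links

doc = Document(page_content="Text1")
linked = copy_with_links(doc, Link.incoming(kind="hyperlink", tag="http://text1"))

# Unlike add_links, this leaves the original document alone; only the
# shallow copy carries the new link in its metadata.
assert len(list(get_links(linked))) == 1
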
@@ -55,11 +55,16 @@ from langchain_core.outputs import (
    RunInfo,
)
from langchain_core.prompt_values import ChatPromptValue, PromptValue, StringPromptValue
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator
from langchain_core.pydantic_v1 import (
    BaseModel,
    Field,
    root_validator,
)
from langchain_core.runnables import RunnableMap, RunnablePassthrough
from langchain_core.runnables.config import ensure_config, run_in_executor
from langchain_core.tracers._streaming import _StreamingCallbackHandler
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.utils.pydantic import is_basemodel_subclass

if TYPE_CHECKING:
    from langchain_core.output_parsers.base import OutputParserLike
@@ -1162,7 +1167,7 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC):
                "with_structured_output is not implemented for this model."
            )
        llm = self.bind_tools([schema], tool_choice="any")
        if isinstance(schema, type) and issubclass(schema, BaseModel):
        if isinstance(schema, type) and is_basemodel_subclass(schema):
            output_parser: OutputParserLike = PydanticToolsParser(
                tools=[schema], first_tool_only=True
            )
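
The issubclass(..., BaseModel) to is_basemodel_subclass(...) swap here — repeated throughout the rest of this diff — exists because, once pydantic 2 is installed, the v1 shim's BaseModel and pydantic.BaseModel are distinct classes, so a plain issubclass check against one misses subclasses of the other. A rough sketch of what such a check has to do, illustrative only and not the actual implementation in langchain_core.utils.pydantic:

from typing import Any, List, Type


def is_basemodel_subclass_sketch(cls: Any) -> bool:
    """True if cls subclasses BaseModel from either pydantic generation."""
    if not isinstance(cls, type):
        return False
    bases: List[Type] = []
    try:
        from pydantic import BaseModel  # pydantic 2 (or 1, if that is installed)

        bases.append(BaseModel)
    except ImportError:
        pass
    try:
        from pydantic.v1 import BaseModel as BaseModelV1  # v1 shim inside pydantic 2

        bases.append(BaseModelV1)
    except ImportError:
        pass
    return any(issubclass(cls, base) for base in bases)
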
@@ -139,7 +139,7 @@ def merge_content(
        else:
            # If the last element of the first content is a string
            # Add the second content to the last element
            if isinstance(merged[-1], str):
            if merged and isinstance(merged[-1], str):
                merged[-1] += content
            # If second content is an empty string, treat as a no-op
            elif content == "":
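
The added "merged and" guard covers the case where the accumulated content is an empty list, where merged[-1] previously raised IndexError. The behavior, matching the ([], [""], []) row of the test table added further down:

from langchain_core.messages import merge_content

# Merging an empty string onto an empty list is now a no-op
# instead of an IndexError on merged[-1].
assert merge_content([], "") == []
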
@@ -47,7 +47,7 @@ class BasePromptTemplate(
    prompt."""
    optional_variables: List[str] = Field(default=[])
    """A list of the names of the variables that are optional in the prompt."""
    input_types: Dict[str, Any] = Field(default_factory=dict)
    input_types: Dict[str, Any] = Field(default_factory=dict, exclude=True)
    """A dictionary of the types of the variables the prompt template expects.
    If not provided, all variables are assumed to be strings."""
    output_parser: Optional[BaseOutputParser] = None
@@ -82,6 +82,7 @@ from langchain_core.runnables.utils import (
)
from langchain_core.utils.aiter import aclosing, atee, py_anext
from langchain_core.utils.iter import safetee
from langchain_core.utils.pydantic import is_basemodel_subclass

if TYPE_CHECKING:
    from langchain_core.callbacks.manager import (
@@ -300,7 +301,7 @@ class Runnable(Generic[Input, Output], ABC):
        """
        root_type = self.InputType

        if inspect.isclass(root_type) and issubclass(root_type, BaseModel):
        if inspect.isclass(root_type) and is_basemodel_subclass(root_type):
            return root_type

        return create_model(
@@ -332,7 +333,7 @@ class Runnable(Generic[Input, Output], ABC):
        """
        root_type = self.OutputType

        if inspect.isclass(root_type) and issubclass(root_type, BaseModel):
        if inspect.isclass(root_type) and is_basemodel_subclass(root_type):
            return root_type

        return create_model(
@@ -3998,11 +3999,14 @@ class RunnableLambda(Runnable[Input, Output]):
    RunnableLambda can be composed as any other Runnable and provides
    seamless integration with LangChain tracing.

    `RunnableLambda` is best suited for code that does not need to support
    ``RunnableLambda`` is best suited for code that does not need to support
    streaming. If you need to support streaming (i.e., be able to operate
    on chunks of inputs and yield chunks of outputs), use `RunnableGenerator`
    on chunks of inputs and yield chunks of outputs), use ``RunnableGenerator``
    instead.

    Note that if a ``RunnableLambda`` returns an instance of ``Runnable``, that
    instance is invoked (or streamed) during execution.

    Examples:

        .. code-block:: python
@@ -22,6 +22,7 @@ from typing import (
from uuid import UUID, uuid4

from langchain_core.pydantic_v1 import BaseModel
from langchain_core.utils.pydantic import is_basemodel_subclass

if TYPE_CHECKING:
    from langchain_core.runnables.base import Runnable as RunnableType
@@ -229,7 +230,7 @@ def node_data_json(
                "name": node_data_str(node.id, node.data),
            },
        }
    elif inspect.isclass(node.data) and issubclass(node.data, BaseModel):
    elif inspect.isclass(node.data) and is_basemodel_subclass(node.data):
        json = (
            {
                "type": "schema",
@@ -28,6 +28,7 @@ from langchain_core.messages import (
)
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.utils.json_schema import dereference_refs
from langchain_core.utils.pydantic import is_basemodel_subclass

if TYPE_CHECKING:
    from langchain_core.tools import BaseTool
@@ -100,7 +101,11 @@ def convert_pydantic_to_openai_function(
    Returns:
        The function description.
    """
    schema = dereference_refs(model.schema())
    if hasattr(model, "model_json_schema"):
        schema = model.model_json_schema()  # Pydantic 2
    else:
        schema = model.schema()  # Pydantic 1
    schema = dereference_refs(schema)
    schema.pop("definitions", None)
    title = schema.pop("title", "")
    default_description = schema.pop("description", "")
@@ -272,7 +277,7 @@ def convert_to_openai_function(
        "description": function.pop("description"),
        "parameters": function,
    }
    elif isinstance(function, type) and issubclass(function, BaseModel):
    elif isinstance(function, type) and is_basemodel_subclass(function):
        return cast(Dict, convert_pydantic_to_openai_function(function))
    elif isinstance(function, BaseTool):
        return cast(Dict, format_tool_to_openai_function(function))
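
A usage sketch of the version dispatch above; the Multiply model is illustrative, and the resulting dict is shown schematically:

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_function


class Multiply(BaseModel):
    """Multiply two integers."""

    a: int = Field(description="first factor")
    b: int = Field(description="second factor")


fn = convert_to_openai_function(Multiply)
# Roughly: {"name": "Multiply", "description": "Multiply two integers.",
#           "parameters": {...JSON schema for a and b...}}
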
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "langchain-core"
version = "0.2.22"
version = "0.2.23"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
@@ -18,6 +18,10 @@ exclude = [ "notebooks", "examples", "example_data", "langchain_core/pydantic",]
module = [ "numpy", "pytest",]
ignore_missing_imports = true

[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/core"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-core%3D%3D0%22&expanded=true"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langsmith = "^0.1.75"
@@ -1216,3 +1216,220 @@
    'type': 'object',
  })
# ---
# name: test_chat_prompt_w_msgs_placeholder_ser_des[chat_prompt]
  dict({
    'graph': dict({
      'edges': list([
        dict({
          'source': 0,
          'target': 1,
        }),
        dict({
          'source': 1,
          'target': 2,
        }),
      ]),
      'nodes': list([
        dict({
          'data': 'PromptInput',
          'id': 0,
          'type': 'schema',
        }),
        dict({
          'data': dict({
            'id': list([
              'langchain',
              'prompts',
              'chat',
              'ChatPromptTemplate',
            ]),
            'name': 'ChatPromptTemplate',
          }),
          'id': 1,
          'type': 'runnable',
        }),
        dict({
          'data': 'ChatPromptTemplateOutput',
          'id': 2,
          'type': 'schema',
        }),
      ]),
    }),
    'id': list([
      'langchain',
      'prompts',
      'chat',
      'ChatPromptTemplate',
    ]),
    'kwargs': dict({
      'input_variables': list([
        'bar',
      ]),
      'messages': list([
        dict({
          'id': list([
            'langchain',
            'prompts',
            'chat',
            'SystemMessagePromptTemplate',
          ]),
          'kwargs': dict({
            'prompt': dict({
              'graph': dict({
                'edges': list([
                  dict({
                    'source': 0,
                    'target': 1,
                  }),
                  dict({
                    'source': 1,
                    'target': 2,
                  }),
                ]),
                'nodes': list([
                  dict({
                    'data': 'PromptInput',
                    'id': 0,
                    'type': 'schema',
                  }),
                  dict({
                    'data': dict({
                      'id': list([
                        'langchain',
                        'prompts',
                        'prompt',
                        'PromptTemplate',
                      ]),
                      'name': 'PromptTemplate',
                    }),
                    'id': 1,
                    'type': 'runnable',
                  }),
                  dict({
                    'data': 'PromptTemplateOutput',
                    'id': 2,
                    'type': 'schema',
                  }),
                ]),
              }),
              'id': list([
                'langchain',
                'prompts',
                'prompt',
                'PromptTemplate',
              ]),
              'kwargs': dict({
                'input_variables': list([
                ]),
                'template': 'foo',
                'template_format': 'f-string',
              }),
              'lc': 1,
              'name': 'PromptTemplate',
              'type': 'constructor',
            }),
          }),
          'lc': 1,
          'type': 'constructor',
        }),
        dict({
          'id': list([
            'langchain',
            'prompts',
            'chat',
            'MessagesPlaceholder',
          ]),
          'kwargs': dict({
            'variable_name': 'bar',
          }),
          'lc': 1,
          'type': 'constructor',
        }),
        dict({
          'id': list([
            'langchain',
            'prompts',
            'chat',
            'HumanMessagePromptTemplate',
          ]),
          'kwargs': dict({
            'prompt': dict({
              'graph': dict({
                'edges': list([
                  dict({
                    'source': 0,
                    'target': 1,
                  }),
                  dict({
                    'source': 1,
                    'target': 2,
                  }),
                ]),
                'nodes': list([
                  dict({
                    'data': 'PromptInput',
                    'id': 0,
                    'type': 'schema',
                  }),
                  dict({
                    'data': dict({
                      'id': list([
                        'langchain',
                        'prompts',
                        'prompt',
                        'PromptTemplate',
                      ]),
                      'name': 'PromptTemplate',
                    }),
                    'id': 1,
                    'type': 'runnable',
                  }),
                  dict({
                    'data': 'PromptTemplateOutput',
                    'id': 2,
                    'type': 'schema',
                  }),
                ]),
              }),
              'id': list([
                'langchain',
                'prompts',
                'prompt',
                'PromptTemplate',
              ]),
              'kwargs': dict({
                'input_variables': list([
                ]),
                'template': 'baz',
                'template_format': 'f-string',
              }),
              'lc': 1,
              'name': 'PromptTemplate',
              'type': 'constructor',
            }),
          }),
          'lc': 1,
          'type': 'constructor',
        }),
      ]),
    }),
    'lc': 1,
    'name': 'ChatPromptTemplate',
    'type': 'constructor',
  })
# ---
# name: test_chat_prompt_w_msgs_placeholder_ser_des[placholder]
  dict({
    'id': list([
      'langchain',
      'prompts',
      'chat',
      'MessagesPlaceholder',
    ]),
    'kwargs': dict({
      'variable_name': 'bar',
    }),
    'lc': 1,
    'type': 'constructor',
  })
# ---
@@ -6,9 +6,8 @@ from typing import Any, List, Union
import pytest
from syrupy import SnapshotAssertion

from langchain_core._api.deprecation import (
    LangChainPendingDeprecationWarning,
)
from langchain_core._api.deprecation import LangChainPendingDeprecationWarning
from langchain_core.load import dumpd, load
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
@@ -806,3 +805,13 @@ def test_chat_input_schema(snapshot: SnapshotAssertion) -> None:
    assert set(prompt_optional.input_variables) == {"input"}
    prompt_optional.input_schema(input="")  # won't raise error
    assert prompt_optional.input_schema.schema() == snapshot(name="partial")


def test_chat_prompt_w_msgs_placeholder_ser_des(snapshot: SnapshotAssertion) -> None:
    prompt = ChatPromptTemplate.from_messages(
        [("system", "foo"), MessagesPlaceholder("bar"), ("human", "baz")]
    )
    assert dumpd(MessagesPlaceholder("bar")) == snapshot(name="placholder")
    assert load(dumpd(MessagesPlaceholder("bar"))) == MessagesPlaceholder("bar")
    assert dumpd(prompt) == snapshot(name="chat_prompt")
    assert load(dumpd(prompt)) == prompt
@@ -8,12 +8,13 @@ from langchain_core.load.load import loads
from langchain_core.prompts.structured import StructuredPrompt
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables.base import Runnable, RunnableLambda
from langchain_core.utils.pydantic import is_basemodel_subclass


def _fake_runnable(
    schema: Union[Dict, Type[BaseModel]], _: Any
) -> Union[BaseModel, Dict]:
    if isclass(schema) and issubclass(schema, BaseModel):
    if isclass(schema) and is_basemodel_subclass(schema):
        return schema(name="yo", value=42)
    else:
        params = cast(Dict, schema)["parameters"]
@@ -1,5 +1,5 @@
import unittest
from typing import List, Type
from typing import List, Type, Union

import pytest

@@ -18,6 +18,7 @@ from langchain_core.messages import (
    ToolMessage,
    convert_to_messages,
    get_buffer_string,
    merge_content,
    message_chunk_to_message,
    message_to_dict,
    messages_from_dict,
@@ -950,3 +951,27 @@ def test_tool_message_str() -> None:
    expected = "content='foo' tool_call_id='1' artifact={'bar': {'baz': 123}}"
    actual = str(message)
    assert expected == actual


@pytest.mark.parametrize(
    ["first", "others", "expected"],
    [
        ("", [""], ""),
        ("", [[]], [""]),
        ([], [""], []),
        ([], [[]], []),
        ("foo", [""], "foo"),
        ("foo", [[]], ["foo"]),
        (["foo"], [""], ["foo"]),
        (["foo"], [[]], ["foo"]),
        ("foo", ["bar"], "foobar"),
        ("foo", [["bar"]], ["foo", "bar"]),
        (["foo"], ["bar"], ["foobar"]),
        (["foo"], [["bar"]], ["foo", "bar"]),
    ],
)
def test_merge_content(
    first: Union[list, str], others: list, expected: Union[list, str]
) -> None:
    actual = merge_content(first, *others)
    assert actual == expected
@@ -34,11 +34,14 @@ from langchain_core.output_parsers.json import JsonOutputParser
from langchain_core.output_parsers.pydantic import PydanticOutputParser
from langchain_core.outputs import ChatGeneration, ChatResult
from langchain_core.prompts import SystemMessagePromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.pydantic_v1 import (
    BaseModel,
)
from langchain_core.runnables import Runnable, RunnableLambda
from langchain_core.runnables.base import RunnableMap
from langchain_core.runnables.passthrough import RunnablePassthrough
from langchain_core.tools import BaseTool
from langchain_core.utils.pydantic import is_basemodel_instance, is_basemodel_subclass

DEFAULT_SYSTEM_TEMPLATE = """You have access to the following tools:

@@ -75,14 +78,10 @@ _DictOrPydantic = Union[Dict, _BM]

def _is_pydantic_class(obj: Any) -> bool:
    return isinstance(obj, type) and (
        issubclass(obj, BaseModel) or BaseModel in obj.__bases__
        is_basemodel_subclass(obj) or BaseModel in obj.__bases__
    )


def _is_pydantic_object(obj: Any) -> bool:
    return isinstance(obj, BaseModel)


def convert_to_ollama_tool(tool: Any) -> Dict:
    """Convert a tool to an Ollama tool."""
    description = None
@@ -93,7 +92,7 @@ def convert_to_ollama_tool(tool: Any) -> Dict:
        schema = tool.tool_call_schema.schema()
        name = tool.get_name()
        description = tool.description
    elif _is_pydantic_object(tool):
    elif is_basemodel_instance(tool):
        schema = tool.get_input_schema().schema()
        name = tool.get_name()
        description = tool.description
@@ -1,11 +1,12 @@
import asyncio
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, cast

from langchain.chains.base import Chain
from langchain.chains.llm import LLMChain
from langchain.pydantic_v1 import BaseModel, root_validator
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.utils.pydantic import is_basemodel_instance


class SyntheticDataGenerator(BaseModel):
@@ -63,8 +64,10 @@ class SyntheticDataGenerator(BaseModel):
        """Prevents duplicates by adding previously generated examples to the few shot
        list."""
        if self.template and self.template.examples:
            if isinstance(example, BaseModel):
                formatted_example = self._format_dict_to_string(example.dict())
            if is_basemodel_instance(example):
                formatted_example = self._format_dict_to_string(
                    cast(BaseModel, example).dict()
                )
            elif isinstance(example, dict):
                formatted_example = self._format_dict_to_string(example)
            else:
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "langchain-experimental"
version = "0.0.62"
version = "0.0.63"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
@@ -16,6 +16,10 @@ ignore_missing_imports = "True"
disallow_untyped_defs = "True"
exclude = [ "notebooks", "examples", "example_data",]

[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/experimental"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-experimental%3D%3D0%22&expanded=true"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langchain-core = "^0.2.10"
@@ -42,8 +42,11 @@ def _get_chat_history(chat_history: List[CHAT_TURN_TYPE]) -> str:
    buffer = ""
    for dialogue_turn in chat_history:
        if isinstance(dialogue_turn, BaseMessage):
            role_prefix = _ROLE_MAP.get(dialogue_turn.type, f"{dialogue_turn.type}: ")
            buffer += f"\n{role_prefix}{dialogue_turn.content}"
            if len(dialogue_turn.content) > 0:
                role_prefix = _ROLE_MAP.get(
                    dialogue_turn.type, f"{dialogue_turn.type}: "
                )
                buffer += f"\n{role_prefix}{dialogue_turn.content}"
        elif isinstance(dialogue_turn, tuple):
            human = "Human: " + dialogue_turn[0]
            ai = "Assistant: " + dialogue_turn[1]
@@ -1,4 +1,4 @@
from typing import Any, List, Optional, Type, Union
from typing import Any, List, Optional, Type, Union, cast

from langchain_core.language_models import BaseLanguageModel
from langchain_core.messages import HumanMessage, SystemMessage
@@ -10,6 +10,7 @@ from langchain_core.output_parsers.openai_functions import (
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.utils.pydantic import is_basemodel_subclass

from langchain.chains.llm import LLMChain
from langchain.chains.openai_functions.utils import get_llm_kwargs
@@ -45,7 +46,7 @@ def create_qa_with_structure_chain(

    """
    if output_parser == "pydantic":
        if not (isinstance(schema, type) and issubclass(schema, BaseModel)):
        if not (isinstance(schema, type) and is_basemodel_subclass(schema)):
            raise ValueError(
                "Must provide a pydantic class for schema when output_parser is "
                "'pydantic'."
@@ -60,10 +61,10 @@ def create_qa_with_structure_chain(
            f"Got unexpected output_parser: {output_parser}. "
            f"Should be one of `pydantic` or `base`."
        )
    if isinstance(schema, type) and issubclass(schema, BaseModel):
        schema_dict = schema.schema()
    if isinstance(schema, type) and is_basemodel_subclass(schema):
        schema_dict = cast(dict, schema.schema())
    else:
        schema_dict = schema
        schema_dict = cast(dict, schema)
    function = {
        "name": schema_dict["title"],
        "description": schema_dict["description"],
@@ -24,6 +24,7 @@ from langchain_core.utils.function_calling import (
    convert_to_openai_function,
    convert_to_openai_tool,
)
from langchain_core.utils.pydantic import is_basemodel_subclass


@deprecated(
@@ -465,7 +466,7 @@ def _get_openai_tool_output_parser(
    *,
    first_tool_only: bool = False,
) -> Union[BaseOutputParser, BaseGenerationOutputParser]:
    if isinstance(tool, type) and issubclass(tool, BaseModel):
    if isinstance(tool, type) and is_basemodel_subclass(tool):
        output_parser: Union[BaseOutputParser, BaseGenerationOutputParser] = (
            PydanticToolsParser(tools=[tool], first_tool_only=first_tool_only)
        )
@@ -493,7 +494,7 @@ def get_openai_output_parser(
        not a Pydantic class, then the output parser will automatically extract
        only the function arguments and not the function name.
    """
    if isinstance(functions[0], type) and issubclass(functions[0], BaseModel):
    if isinstance(functions[0], type) and is_basemodel_subclass(functions[0]):
        if len(functions) > 1:
            pydantic_schema: Union[Dict, Type[BaseModel]] = {
                convert_to_openai_function(fn)["name"]: fn for fn in functions
@@ -516,7 +517,7 @@ def _create_openai_json_runnable(
    output_parser: Optional[Union[BaseOutputParser, BaseGenerationOutputParser]] = None,
) -> Runnable:
    """"""
    if isinstance(output_schema, type) and issubclass(output_schema, BaseModel):
    if isinstance(output_schema, type) and is_basemodel_subclass(output_schema):
        output_parser = output_parser or PydanticOutputParser(
            pydantic_object=output_schema,  # type: ignore
        )
@@ -366,6 +366,11 @@ def _init_chat_model_helper(

        # TODO: update to use model= once ChatBedrock supports
        return ChatBedrock(model_id=model, **kwargs)
    elif model_provider == "bedrock_converse":
        _check_pkg("langchain_aws")
        from langchain_aws import ChatBedrockConverse

        return ChatBedrockConverse(model=model, **kwargs)
    else:
        supported = ", ".join(_SUPPORTED_PROVIDERS)
        raise ValueError(
@@ -388,6 +393,7 @@ _SUPPORTED_PROVIDERS = {
    "huggingface",
    "groq",
    "bedrock",
    "bedrock_converse",
}
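
A usage sketch for the provider added above — assumes langchain-aws is installed and AWS credentials are configured; the model id is illustrative:

from langchain.chat_models import init_chat_model

llm = init_chat_model(
    "anthropic.claude-3-sonnet-20240229-v1:0",
    model_provider="bedrock_converse",
)
print(llm.invoke("Hello").content)
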
@@ -283,8 +283,7 @@ The following is the expected answer. Use this to measure correctness:

    def prep_inputs(self, inputs: Union[Dict[str, Any], Any]) -> Dict[str, str]:
        """Validate and prep inputs."""
        if "reference" not in inputs:
            inputs["reference"] = self._format_reference(inputs.get("reference"))
        inputs["reference"] = self._format_reference(inputs.get("reference"))
        return super().prep_inputs(inputs)

    def _call(
4 libs/langchain/poetry.lock generated
@@ -1760,7 +1760,7 @@ files = [

[[package]]
name = "langchain-core"
version = "0.2.22"
version = "0.2.23"
description = "Building applications with LLMs through composability"
optional = false
python-versions = ">=3.8.1,<4.0"
@@ -4561,4 +4561,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",

[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "3bfb687a4835f55d71f5fc1d23c16a42afcf8e9ad496b4da61bd3cb9b026b6ca"
content-hash = "73968d6c48a9e2523485914ecd420b6bd1e3cfdcef432f400ebe2b18ebadd51d"
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "langchain"
version = "0.2.10"
version = "0.2.11"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
@@ -24,12 +24,16 @@ skip = ".git,*.pdf,*.svg,*.pdf,*.yaml,*.ipynb,poetry.lock,*.min.js,*.css,package
ignore-regex = ".*(Stati Uniti|Tense=Pres).*"
ignore-words-list = "momento,collison,ned,foor,reworkd,parth,whats,aapply,mysogyny,unsecure,damon,crate,aadd,symbl,precesses,accademia,nin"

[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/langchain"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain%3D%3D0%22&expanded=true"

[tool.poetry.scripts]
langchain-server = "langchain.server:main"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langchain-core = "^0.2.22"
langchain-core = "^0.2.23"
langchain-text-splitters = "^0.2.0"
langsmith = "^0.1.17"
pydantic = ">=1,<3"
@@ -16,6 +16,7 @@ disallow_untyped_defs = "True"

[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/ai21"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-ai21%3D%3D0%22&expanded=true"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"

@@ -9,6 +9,7 @@ license = "MIT"

[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/airbyte"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-airbyte%3D%3D0%22&expanded=true"

[tool.poetry.dependencies]
python = ">=3.9,<3.12.4"
@@ -50,7 +50,12 @@ from langchain_core.output_parsers import (
)
from langchain_core.output_parsers.base import OutputParserLike
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr, root_validator
from langchain_core.pydantic_v1 import (
    BaseModel,
    Field,
    SecretStr,
    root_validator,
)
from langchain_core.runnables import (
    Runnable,
    RunnableMap,
@@ -63,6 +68,7 @@ from langchain_core.utils import (
    get_pydantic_field_names,
)
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.utils.pydantic import is_basemodel_subclass

from langchain_anthropic.output_parsers import extract_tool_calls

@@ -994,7 +1000,7 @@ class ChatAnthropic(BaseChatModel):

        tool_name = convert_to_anthropic_tool(schema)["name"]
        llm = self.bind_tools([schema], tool_choice=tool_name)
        if isinstance(schema, type) and issubclass(schema, BaseModel):
        if isinstance(schema, type) and is_basemodel_subclass(schema):
            output_parser: OutputParserLike = PydanticToolsParser(
                tools=[schema], first_tool_only=True
            )

@@ -11,11 +11,12 @@ readme = "README.md"
repository = "https://github.com/langchain-ai/langchain"
license = "MIT"

[tool.mypy]
disallow_untyped_defs = "True"

[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/anthropic"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-anthropic%3D%3D0%22&expanded=true"

[tool.mypy]
disallow_untyped_defs = "True"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"

@@ -16,6 +16,7 @@ disallow_untyped_defs = "True"

[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/azure-dynamic-sessions"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-azure-dynamic-sessions%3D%3D0%22&expanded=true"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
Some files were not shown because too many files have changed in this diff.