Mirror of https://github.com/hwchase17/langchain.git, synced 2025-05-16 20:42:01 +00:00
Merge branch 'master' into deepsense/text-to-speech
commit 868db99b17
27 .github/actions/poetry_setup/action.yml vendored
@@ -39,10 +39,35 @@ runs:
      with:
        path: |
          /opt/pipx/venvs/poetry
          /opt/pipx_bin/poetry
        # This step caches the poetry installation, so make sure it's keyed on the poetry version as well.
        key: bin-poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-${{ inputs.poetry-version }}

    - name: Refresh shell hashtable and fixup softlinks
      if: steps.cache-bin-poetry.outputs.cache-hit == 'true'
      shell: bash
      env:
        POETRY_VERSION: ${{ inputs.poetry-version }}
        PYTHON_VERSION: ${{ inputs.python-version }}
      run: |
        set -eux

        # Refresh the shell hashtable, to ensure correct `which` output.
        hash -r

        # `actions/cache@v3` doesn't always seem able to correctly unpack softlinks.
        # Delete and recreate the softlinks pipx expects to have.
        rm /opt/pipx/venvs/poetry/bin/python
        cd /opt/pipx/venvs/poetry/bin
        ln -s "$(which "python$PYTHON_VERSION")" python
        chmod +x python
        cd /opt/pipx_bin/
        ln -s /opt/pipx/venvs/poetry/bin/poetry poetry
        chmod +x poetry

        # Ensure everything got set up correctly.
        /opt/pipx/venvs/poetry/bin/python --version
        /opt/pipx_bin/poetry --version

    - name: Install poetry
      if: steps.cache-bin-poetry.outputs.cache-hit != 'true'
      shell: bash
12 .github/workflows/_lint.yml vendored
@@ -87,7 +87,7 @@ jobs:
          python-version: ${{ matrix.python-version }}
          poetry-version: ${{ env.POETRY_VERSION }}
          working-directory: ${{ inputs.working-directory }}
          cache-key: lint
          cache-key: lint-with-extras

      - name: Check Poetry File
        shell: bash
@@ -102,9 +102,17 @@ jobs:
          poetry lock --check

      - name: Install dependencies
        # Also installs dev/lint/test/typing dependencies, to ensure we have
        # type hints for as many of our libraries as possible.
        # This helps catch errors that require dependencies to be spotted, for example:
        # https://github.com/langchain-ai/langchain/pull/10249/files#diff-935185cd488d015f026dcd9e19616ff62863e8cde8c0bee70318d3ccbca98341
        #
        # If you change this configuration, make sure to change the `cache-key`
        # in the `poetry_setup` action above to stop using the old cache.
        # It doesn't matter how you change it, any change will cause a cache-bust.
        working-directory: ${{ inputs.working-directory }}
        run: |
          poetry install
          poetry install --with dev,lint,test,typing

      - name: Install langchain editable
        working-directory: ${{ inputs.working-directory }}
12 .github/workflows/_pydantic_compatibility.yml vendored
@@ -79,3 +79,15 @@ jobs:
      - name: Run pydantic compatibility tests
        shell: bash
        run: make test

      - name: Ensure the tests did not create any additional files
        shell: bash
        run: |
          set -eu

          STATUS="$(git status)"
          echo "$STATUS"

          # grep will exit non-zero if the target message isn't found,
          # and `set -e` above will cause the step to fail.
          echo "$STATUS" | grep 'nothing to commit, working tree clean'
12 .github/workflows/_test.yml vendored
@@ -43,3 +43,15 @@ jobs:
      - name: Run core tests
        shell: bash
        run: make test

      - name: Ensure the tests did not create any additional files
        shell: bash
        run: |
          set -eu

          STATUS="$(git status)"
          echo "$STATUS"

          # grep will exit non-zero if the target message isn't found,
          # and `set -e` above will cause the step to fail.
          echo "$STATUS" | grep 'nothing to commit, working tree clean'
14 .github/workflows/langchain_ci.yml vendored
@@ -6,6 +6,8 @@ on:
    branches: [ master ]
  pull_request:
    paths:
      - '.github/actions/poetry_setup/action.yml'
      - '.github/tools/**'
      - '.github/workflows/_lint.yml'
      - '.github/workflows/_test.yml'
      - '.github/workflows/_pydantic_compatibility.yml'
@@ -81,3 +83,15 @@ jobs:

      - name: Run extended tests
        run: make extended_tests

      - name: Ensure the tests did not create any additional files
        shell: bash
        run: |
          set -eu

          STATUS="$(git status)"
          echo "$STATUS"

          # grep will exit non-zero if the target message isn't found,
          # and `set -e` above will cause the step to fail.
          echo "$STATUS" | grep 'nothing to commit, working tree clean'
14 .github/workflows/langchain_experimental_ci.yml vendored
@@ -6,6 +6,8 @@ on:
    branches: [ master ]
  pull_request:
    paths:
      - '.github/actions/poetry_setup/action.yml'
      - '.github/tools/**'
      - '.github/workflows/_lint.yml'
      - '.github/workflows/_test.yml'
      - '.github/workflows/langchain_experimental_ci.yml'
@@ -113,3 +115,15 @@ jobs:

      - name: Run extended tests
        run: make extended_tests

      - name: Ensure the tests did not create any additional files
        shell: bash
        run: |
          set -eu

          STATUS="$(git status)"
          echo "$STATUS"

          # grep will exit non-zero if the target message isn't found,
          # and `set -e` above will cause the step to fail.
          echo "$STATUS" | grep 'nothing to commit, working tree clean'
12 .github/workflows/scheduled_test.yml vendored
@@ -47,3 +47,15 @@ jobs:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
        make scheduled_tests

      - name: Ensure the tests did not create any additional files
        shell: bash
        run: |
          set -eu

          STATUS="$(git status)"
          echo "$STATUS"

          # grep will exit non-zero if the target message isn't found,
          # and `set -e` above will cause the step to fail.
          echo "$STATUS" | grep 'nothing to commit, working tree clean'
@@ -317,7 +317,7 @@
    "Chatbots": "https://python.langchain.com/docs/use_cases/chatbots",
    "Summarization": "https://python.langchain.com/docs/use_cases/summarization",
    "Extraction": "https://python.langchain.com/docs/use_cases/extraction",
    "SQL": "https://python.langchain.com/docs/use_cases/sql",
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
    "Tagging": "https://python.langchain.com/docs/use_cases/tagging",
    "Code Understanding": "https://python.langchain.com/docs/use_cases/code_understanding",
    "AutoGPT": "https://python.langchain.com/docs/use_cases/autonomous_agents/autogpt",
@@ -400,7 +400,7 @@
    "Summarization": "https://python.langchain.com/docs/use_cases/summarization",
    "Extraction": "https://python.langchain.com/docs/use_cases/extraction",
    "Interacting with APIs": "https://python.langchain.com/docs/use_cases/apis",
    "SQL": "https://python.langchain.com/docs/use_cases/sql",
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
    "QA over Documents": "https://python.langchain.com/docs/use_cases/question_answering/index",
    "Retrieve from vector stores directly": "https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_text_generation",
    "Improve document indexing with HyDE": "https://python.langchain.com/docs/use_cases/question_answering/how_to/hyde",
@@ -641,7 +641,7 @@
    "Chatbots": "https://python.langchain.com/docs/use_cases/chatbots",
    "Extraction": "https://python.langchain.com/docs/use_cases/extraction",
    "Interacting with APIs": "https://python.langchain.com/docs/use_cases/apis",
    "SQL": "https://python.langchain.com/docs/use_cases/sql",
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
    "HuggingGPT": "https://python.langchain.com/docs/use_cases/autonomous_agents/hugginggpt",
    "Perform context-aware text splitting": "https://python.langchain.com/docs/use_cases/question_answering/how_to/document-context-aware-QA",
    "Retrieve from vector stores directly": "https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_text_generation",
@@ -1009,7 +1009,7 @@
    "LangSmith Walkthrough": "https://python.langchain.com/docs/guides/langsmith/walkthrough",
    "Comparing Chain Outputs": "https://python.langchain.com/docs/guides/evaluation/examples/comparisons",
    "Agent Trajectory": "https://python.langchain.com/docs/guides/evaluation/trajectory/trajectory_eval",
    "SQL": "https://python.langchain.com/docs/use_cases/sql",
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
    "Multi-modal outputs: Image & Text": "https://python.langchain.com/docs/use_cases/multi_modal/image_agent",
    "Agent Debates with Tools": "https://python.langchain.com/docs/use_cases/agent_simulations/two_agent_debate_tools",
    "Multiple callback handlers": "https://python.langchain.com/docs/modules/callbacks/multiple_callbacks",
@@ -1268,7 +1268,7 @@
    "SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database",
    "JSON Agent": "https://python.langchain.com/docs/integrations/toolkits/json",
    "NIBittensorLLM": "https://python.langchain.com/docs/integrations/llms/bittensor",
    "SQL": "https://python.langchain.com/docs/use_cases/sql",
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
    "BabyAGI with Tools": "https://python.langchain.com/docs/use_cases/agents/baby_agi_with_agent",
    "Conversational Retrieval Agent": "https://python.langchain.com/docs/use_cases/question_answering/how_to/conversational_retrieval_agents",
    "Plug-and-Plai": "https://python.langchain.com/docs/use_cases/agents/custom_agent_with_plugin_retrieval_using_plugnplai",
@@ -1832,12 +1832,12 @@
  "create_sql_agent": {
    "CnosDB": "https://python.langchain.com/docs/integrations/providers/cnosdb",
    "SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database",
    "SQL": "https://python.langchain.com/docs/use_cases/sql"
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql"
  },
  "SQLDatabaseToolkit": {
    "CnosDB": "https://python.langchain.com/docs/integrations/providers/cnosdb",
    "SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database",
    "SQL": "https://python.langchain.com/docs/use_cases/sql",
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
    "Use ToolKits with OpenAI Functions": "https://python.langchain.com/docs/modules/agents/how_to/use_toolkits_with_openai_functions"
  },
  "SageMakerCallbackHandler": {
@@ -1899,7 +1899,7 @@
    "Rebuff": "https://python.langchain.com/docs/integrations/providers/rebuff",
    "SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database",
    "Cookbook": "https://python.langchain.com/docs/guides/expression_language/cookbook",
    "SQL": "https://python.langchain.com/docs/use_cases/sql",
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
    "Multiple Retrieval Sources": "https://python.langchain.com/docs/use_cases/question_answering/how_to/multiple_retrieval"
  },
  "Weaviate": {
@@ -3035,11 +3035,11 @@
    "Interacting with APIs": "https://python.langchain.com/docs/use_cases/apis"
  },
  "create_sql_query_chain": {
    "SQL": "https://python.langchain.com/docs/use_cases/sql",
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
    "Multiple Retrieval Sources": "https://python.langchain.com/docs/use_cases/question_answering/how_to/multiple_retrieval"
  },
  "ElasticsearchDatabaseChain": {
    "SQL": "https://python.langchain.com/docs/use_cases/sql"
    "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql"
  },
  "FileChatMessageHistory": {
    "AutoGPT": "https://python.langchain.com/docs/use_cases/autonomous_agents/autogpt"
@@ -2,11 +2,21 @@

import DocCardList from "@theme/DocCardList";

LangSmith helps you trace and evaluate your language model applications and intelligent agents to help you
[LangSmith](https://smith.langchain.com) helps you trace and evaluate your language model applications and intelligent agents to help you
move from prototype to production.

Check out the [interactive walkthrough](/docs/guides/langsmith/walkthrough) below to get started.

For more information, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/)
For more information, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/).

For tutorials and other end-to-end examples demonstrating ways to integrate LangSmith in your workflow,
check out the [LangSmith Cookbook](https://github.com/langchain-ai/langsmith-cookbook). Some of the guides therein include:

- Leveraging user feedback in your JS application ([link](https://github.com/langchain-ai/langsmith-cookbook/blob/main/feedback-examples/nextjs/README.md)).
- Building an automated feedback pipeline ([link](https://github.com/langchain-ai/langsmith-cookbook/blob/main/feedback-examples/algorithmic-feedback/algorithmic_feedback.ipynb)).
- How to evaluate and audit your RAG workflows ([link](https://github.com/langchain-ai/langsmith-cookbook/tree/main/testing-examples/qa-correctness)).
- How to fine-tune a LLM on real usage data ([link](https://github.com/langchain-ai/langsmith-cookbook/blob/main/fine-tuning-examples/export-to-openai/fine-tuning-on-chat-runs.ipynb)).
- How to use the [LangChain Hub](https://smith.langchain.com/hub) to version your prompts ([link](https://github.com/langchain-ai/langsmith-cookbook/blob/main/hub-examples/retrieval-qa-chain/retrieval-qa.ipynb))


<DocCardList />
@@ -1,6 +1,6 @@
# Conversation Buffer

This notebook shows how to use `ConversationBufferMemory`. This memory allows for storing of messages and then extracts the messages in a variable.
This notebook shows how to use `ConversationBufferMemory`. This memory allows for storing messages and then extracts the messages in a variable.

We can first extract it as a string.
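As a quick sketch of the API this page documents (using the `langchain.memory` import path current at the time of this commit):

```python
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory()
memory.save_context({"input": "hi"}, {"output": "whats up"})

# The buffered conversation comes back as a single string under "history".
memory.load_memory_variables({})
# -> {'history': 'Human: hi\nAI: whats up'}
```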
@@ -0,0 +1,2 @@
position: 0
collapsed: false
@@ -1,9 +0,0 @@
---
sidebar_position: 3
---

# Web Scraping

Web scraping has historically been a challenging endeavor due to the ever-changing nature of website structures, making it tedious for developers to maintain their scraping scripts. Traditional methods often rely on specific HTML tags and patterns which, when altered, can disrupt data extraction processes.

Enter the LLM-based method for parsing HTML: By leveraging the capabilities of LLMs, and especially OpenAI Functions in LangChain's extraction chain, developers can instruct the model to extract only the desired data in a specified format. This method not only streamlines the extraction process but also significantly reduces the time spent on manual debugging and script modifications. Its adaptability means that even if websites undergo significant design changes, the extraction remains consistent and robust. This level of resilience translates to reduced maintenance efforts, cost savings, and ensures a higher quality of extracted data. Compared to its predecessors, the LLM-based approach wins out in the web scraping domain by transforming a historically cumbersome task into a more automated and efficient process.
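The extraction chain mentioned above can be sketched roughly as follows (a non-authoritative sketch using the `create_extraction_chain` helper from this era of LangChain; the schema and input text are hypothetical placeholders):

```python
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI

# Hypothetical schema: the fields we want pulled out of raw page text.
schema = {
    "properties": {
        "article_title": {"type": "string"},
        "article_summary": {"type": "string"},
    },
    "required": ["article_title", "article_summary"],
}

llm = ChatOpenAI(temperature=0)
chain = create_extraction_chain(schema=schema, llm=llm)

# `page_text` would come from a document loader / HTML-to-text step.
page_text = "...cleaned text scraped from a website..."
print(chain.run(page_text))
```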
@@ -3178,7 +3178,11 @@
  },
  {
    "source": "/en/latest/use_cases/tabular.html",
    "destination": "/docs/use_cases/tabular"
    "destination": "/docs/use_cases/qa_structured"
  },
  {
    "source": "/docs/use_cases/sql(/?)",
    "destination": "/docs/use_cases/qa_structured/sql"
  },
  {
    "source": "/en/latest/youtube.html",
@@ -3370,7 +3374,7 @@
  },
  {
    "source": "/docs/modules/chains/popular/sqlite",
    "destination": "/docs/use_cases/tabular/sqlite"
    "destination": "/docs/use_cases/qa_structured/sql"
  },
  {
    "source": "/docs/modules/chains/popular/openai_functions",
@@ -3582,7 +3586,7 @@
  },
  {
    "source": "/docs/modules/chains/additional/elasticsearch_database",
    "destination": "/docs/use_cases/tabular/elasticsearch_database"
    "destination": "/docs/use_cases/qa_structured/integrations/elasticsearch"
  },
  {
    "source": "/docs/modules/chains/additional/tagging",
@@ -1,6 +1,6 @@
# YouTube videos

⛓ icon marks a new addition [last update 2023-06-20]
⛓ icon marks a new addition [last update 2023-09-05]

### [Official LangChain YouTube channel](https://www.youtube.com/@LangChain)

@@ -86,20 +86,20 @@
- [`Llama Index`: Chat with Documentation using URL Loader](https://youtu.be/XJRoDEctAwA) by [Merk](https://www.youtube.com/@merksworld)
- [Using OpenAI, LangChain, and `Gradio` to Build Custom GenAI Applications](https://youtu.be/1MsmqMg3yUc) by [David Hundley](https://www.youtube.com/@dkhundley)
- [LangChain, Chroma DB, OpenAI Beginner Guide | ChatGPT with your PDF](https://youtu.be/FuqdVNB_8c0)
- ⛓ [Build AI chatbot with custom knowledge base using OpenAI API and GPT Index](https://youtu.be/vDZAZuaXf48) by [Irina Nik](https://www.youtube.com/@irina_nik)
- ⛓ [Build Your Own Auto-GPT Apps with LangChain (Python Tutorial)](https://youtu.be/NYSWn1ipbgg) by [Dave Ebbelaar](https://www.youtube.com/@daveebbelaar)
- ⛓ [Chat with Multiple `PDFs` | LangChain App Tutorial in Python (Free LLMs and Embeddings)](https://youtu.be/dXxQ0LR-3Hg) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao)
- ⛓ [Chat with a `CSV` | `LangChain Agents` Tutorial (Beginners)](https://youtu.be/tjeti5vXWOU) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao)
- ⛓ [Create Your Own ChatGPT with `PDF` Data in 5 Minutes (LangChain Tutorial)](https://youtu.be/au2WVVGUvc8) by [Liam Ottley](https://www.youtube.com/@LiamOttley)
- ⛓ [Using ChatGPT with YOUR OWN Data. This is magical. (LangChain OpenAI API)](https://youtu.be/9AXP7tCI9PI) by [TechLead](https://www.youtube.com/@TechLead)
- ⛓ [Build a Custom Chatbot with OpenAI: `GPT-Index` & LangChain | Step-by-Step Tutorial](https://youtu.be/FIDv6nc4CgU) by [Fabrikod](https://www.youtube.com/@fabrikod)
- ⛓ [`Flowise` is an open source no-code UI visual tool to build 🦜🔗LangChain applications](https://youtu.be/CovAPtQPU0k) by [Cobus Greyling](https://www.youtube.com/@CobusGreylingZA)
- ⛓ [LangChain & GPT 4 For Data Analysis: The `Pandas` Dataframe Agent](https://youtu.be/rFQ5Kmkd4jc) by [Rabbitmetrics](https://www.youtube.com/@rabbitmetrics)
- ⛓ [`GirlfriendGPT` - AI girlfriend with LangChain](https://youtu.be/LiN3D1QZGQw) by [Toolfinder AI](https://www.youtube.com/@toolfinderai)
- ⛓ [`PrivateGPT`: Chat to your FILES OFFLINE and FREE [Installation and Tutorial]](https://youtu.be/G7iLllmx4qc) by [Prompt Engineering](https://www.youtube.com/@engineerprompt)
- ⛓ [How to build with Langchain 10x easier | ⛓️ LangFlow & `Flowise`](https://youtu.be/Ya1oGL7ZTvU) by [AI Jason](https://www.youtube.com/@AIJasonZ)
- ⛓ [Getting Started With LangChain In 20 Minutes- Build Celebrity Search Application](https://youtu.be/_FpT1cwcSLg) by [Krish Naik](https://www.youtube.com/@krishnaik06)

- [Build AI chatbot with custom knowledge base using OpenAI API and GPT Index](https://youtu.be/vDZAZuaXf48) by [Irina Nik](https://www.youtube.com/@irina_nik)
- [Build Your Own Auto-GPT Apps with LangChain (Python Tutorial)](https://youtu.be/NYSWn1ipbgg) by [Dave Ebbelaar](https://www.youtube.com/@daveebbelaar)
- [Chat with Multiple `PDFs` | LangChain App Tutorial in Python (Free LLMs and Embeddings)](https://youtu.be/dXxQ0LR-3Hg) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao)
- [Chat with a `CSV` | `LangChain Agents` Tutorial (Beginners)](https://youtu.be/tjeti5vXWOU) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao)
- [Create Your Own ChatGPT with `PDF` Data in 5 Minutes (LangChain Tutorial)](https://youtu.be/au2WVVGUvc8) by [Liam Ottley](https://www.youtube.com/@LiamOttley)
- [Using ChatGPT with YOUR OWN Data. This is magical. (LangChain OpenAI API)](https://youtu.be/9AXP7tCI9PI) by [TechLead](https://www.youtube.com/@TechLead)
- [Build a Custom Chatbot with OpenAI: `GPT-Index` & LangChain | Step-by-Step Tutorial](https://youtu.be/FIDv6nc4CgU) by [Fabrikod](https://www.youtube.com/@fabrikod)
- [`Flowise` is an open source no-code UI visual tool to build 🦜🔗LangChain applications](https://youtu.be/CovAPtQPU0k) by [Cobus Greyling](https://www.youtube.com/@CobusGreylingZA)
- [LangChain & GPT 4 For Data Analysis: The `Pandas` Dataframe Agent](https://youtu.be/rFQ5Kmkd4jc) by [Rabbitmetrics](https://www.youtube.com/@rabbitmetrics)
- [`GirlfriendGPT` - AI girlfriend with LangChain](https://youtu.be/LiN3D1QZGQw) by [Toolfinder AI](https://www.youtube.com/@toolfinderai)
- [`PrivateGPT`: Chat to your FILES OFFLINE and FREE [Installation and Tutorial]](https://youtu.be/G7iLllmx4qc) by [Prompt Engineering](https://www.youtube.com/@engineerprompt)
- [How to build with Langchain 10x easier | ⛓️ LangFlow & `Flowise`](https://youtu.be/Ya1oGL7ZTvU) by [AI Jason](https://www.youtube.com/@AIJasonZ)
- [Getting Started With LangChain In 20 Minutes- Build Celebrity Search Application](https://youtu.be/_FpT1cwcSLg) by [Krish Naik](https://www.youtube.com/@krishnaik06)
- ⛓ [LangChain HowTo and Guides YouTube playlist](https://www.youtube.com/playlist?list=PL8motc6AQftk1Bs42EW45kwYbyJ4jOdiZ) by [Sam Witteveen](https://www.youtube.com/@samwitteveenai/)


### [Prompt Engineering and LangChain](https://www.youtube.com/watch?v=muXbPpG_ys4&list=PLEJK-H61Xlwzm5FYLDdKt_6yibO33zoMW) by [Venelin Valkov](https://www.youtube.com/@venelin_valkov)
File diff suppressed because it is too large
@@ -2,7 +2,7 @@

If you're building with LLMs, at some point something will break, and you'll need to debug. A model call will fail, or the model output will be misformatted, or there will be some nested model calls and it won't be clear where along the way an incorrect output was created.

Here's a few different tools and functionalities to aid in debugging.
Here are a few different tools and functionalities to aid in debugging.


@@ -18,9 +18,9 @@ For anyone building production-grade LLM applications, we highly recommend using

If you're prototyping in Jupyter Notebooks or running Python scripts, it can be helpful to print out the intermediate steps of a Chain run.

There's a number of ways to enable printing at varying degrees of verbosity.
There are a number of ways to enable printing at varying degrees of verbosity.

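The bluntest of those switches is the global debug flag; a minimal sketch (assuming the `langchain.debug` / `langchain.verbose` module-level globals this page goes on to demonstrate):

```python
import langchain

# Print everything: full inputs/outputs of all chain, LLM and tool calls.
langchain.debug = True

# Or a lighter setting that only prints the important events, more readably.
langchain.verbose = True
```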
Let's suppose we have a simple agent and want to visualize the actions it takes and tool outputs it receives. Without any debugging, here's what we see:
Let's suppose we have a simple agent, and want to visualize the actions it takes and tool outputs it receives. Without any debugging, here's what we see:


```python
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -47,16 +47,16 @@
   },
   {
    "cell_type": "code",
    "execution_count": 14,
    "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "'My name is Mrs. Rachel Chen DDS, call me at 849-829-7628x073 or email me at christopherfrey@example.org'"
        "'My name is Laura Ruiz, call me at +1-412-982-8374x13414 or email me at javierwatkins@example.net'"
       ]
      },
      "execution_count": 14,
      "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -82,7 +82,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -94,35 +94,53 @@
   },
   {
    "cell_type": "code",
    "execution_count": 16,
    "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "text = f\"\"\"Slim Shady recently lost his wallet. \n",
     "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n",
     "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\""
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "AIMessage(content='You can find our super secret data at https://www.ross.com/', additional_kwargs={}, example=False)"
       ]
      },
      "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Dear Sir/Madam,\n",
       "\n",
       "We regret to inform you that Richard Fields has recently misplaced his wallet, which contains a sum of cash and his credit card bearing the number 30479847307774. \n",
       "\n",
       "Should you happen to come across it, we kindly request that you contact us immediately at 6439182672 or via email at frank45@example.com.\n",
       "\n",
       "Thank you for your attention to this matter.\n",
       "\n",
       "Yours faithfully,\n",
       "\n",
       "[Your Name]\n"
      ]
     }
    ],
    "source": [
     "from langchain.prompts.prompt import PromptTemplate\n",
     "from langchain.chat_models import ChatOpenAI\n",
     "from langchain.schema.runnable import RunnablePassthrough\n",
     "\n",
     "template = \"\"\"According to this text, where can you find our super secret data?\n",
     "anonymizer = PresidioAnonymizer()\n",
     "\n",
     "{anonymized_text}\n",
     "template = \"\"\"Rewrite this text into an official, short email:\n",
     "\n",
     "Answer:\"\"\"\n",
     "{anonymized_text}\"\"\"\n",
     "prompt = PromptTemplate.from_template(template)\n",
     "llm = ChatOpenAI()\n",
     "llm = ChatOpenAI(temperature=0)\n",
     "\n",
     "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n",
     "chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")"
     "response = chain.invoke(text)\n",
     "print(response.content)"
    ]
   },
   {
@@ -135,16 +153,16 @@
   },
   {
    "cell_type": "code",
    "execution_count": 18,
    "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "'My name is Gabrielle Edwards, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
        "'My name is Adrian Fleming, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
       ]
      },
      "execution_count": 18,
      "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -166,16 +184,16 @@
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "'My name is Victoria Mckinney, call me at 713-549-8623 or email me at real.slim.shady@gmail.com'"
        "'My name is Justin Miller, call me at 761-824-1889 or email me at real.slim.shady@gmail.com'"
       ]
      },
      "execution_count": 3,
      "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -201,16 +219,16 @@
   },
   {
    "cell_type": "code",
    "execution_count": 4,
    "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "'My name is Billy Russo, call me at 970-996-9453x038 or email me at jamie80@example.org'"
        "'My name is Dr. Jennifer Baker, call me at (508)839-9329x232 or email me at ehamilton@example.com'"
       ]
      },
      "execution_count": 4,
      "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -232,16 +250,16 @@
   },
   {
    "cell_type": "code",
    "execution_count": 5,
    "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "'My polish phone number is EVIA70648911396944'"
        "'My polish phone number is NRGN41434238921378'"
       ]
      },
      "execution_count": 5,
      "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -261,7 +279,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 6,
    "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -291,7 +309,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 7,
    "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -308,7 +326,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 8,
    "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -337,16 +355,16 @@
   },
   {
    "cell_type": "code",
    "execution_count": 9,
    "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "'+48 533 220 543'"
        "'511 622 683'"
       ]
      },
      "execution_count": 9,
      "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -374,7 +392,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 10,
    "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -389,7 +407,7 @@
   },
   {
    "cell_type": "code",
    "execution_count": 11,
    "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -398,16 +416,16 @@
   },
   {
    "cell_type": "code",
    "execution_count": 12,
    "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "'My polish phone number is +48 692 715 636'"
        "'My polish phone number is +48 734 630 977'"
       ]
      },
      "execution_count": 12,
      "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -443,7 +461,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.9.1"
    "version": "3.11.4"
   }
  },
  "nbformat": 4,
@@ -0,0 +1,461 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reversible data anonymization with Microsoft Presidio\n",
    "\n",
    "[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb)\n",
    "\n",
    "\n",
    "## Use case\n",
    "\n",
    "We have already written about the importance of anonymizing sensitive data in the previous section. **Reversible Anonymization** is an equally essential technology while sharing information with language models, as it balances data protection with data usability. This technique involves masking sensitive personally identifiable information (PII), yet it can be reversed and original data can be restored when authorized users need it. Its main advantage lies in the fact that while it conceals individual identities to prevent misuse, it also allows the concealed data to be accurately unmasked should it be necessary for legal or compliance purposes. \n",
    "\n",
    "## Overview\n",
    "\n",
    "We implemented the `PresidioReversibleAnonymizer`, which consists of two parts:\n",
    "\n",
    "1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example:\n",
    "```\n",
    "    {\n",
    "        \"PERSON\": {\n",
    "            \"<anonymized>\": \"<original>\",\n",
    "            \"John Doe\": \"Slim Shady\"\n",
    "        },\n",
    "        \"PHONE_NUMBER\": {\n",
    "            \"111-111-1111\": \"555-555-5555\"\n",
    "        }\n",
    "        ...\n",
    "    }\n",
    "```\n",
    "\n",
    "2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it.\n",
    "\n",
    "Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM.\n",
    "\n",
    "## Quickstart\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install necessary packages\n",
    "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n",
    "# ! python -m spacy download en_core_web_lg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`PresidioReversibleAnonymizer` is not significantly different from its predecessor (`PresidioAnonymizer`) in terms of anonymization:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. By the way, my card number is: 4838637940262'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
    "\n",
    "anonymizer = PresidioReversibleAnonymizer(\n",
    "    analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n",
    "    # Faker seed is used here to make sure the same fake data is generated for the test purposes\n",
    "    # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n",
    "    faker_seed=42,\n",
    ")\n",
    "\n",
    "anonymizer.anonymize(\n",
    "    \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n",
    "    \"By the way, my card number is: 4916 0387 9536 0861\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is what the full string we want to deanonymize looks like:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Maria Lynch recently lost his wallet. \n",
      "Inside is some cash and his credit card with the number 4838637940262. \n",
      "If you would find it, please call at 7344131647 or write an email here: jamesmichael@example.com.\n",
      "Maria Lynch would be very grateful!\n"
     ]
    }
   ],
   "source": [
    "# We know this data, as we set the faker_seed parameter\n",
    "fake_name = \"Maria Lynch\"\n",
    "fake_phone = \"7344131647\"\n",
    "fake_email = \"jamesmichael@example.com\"\n",
    "fake_credit_card = \"4838637940262\"\n",
    "\n",
    "anonymized_text = f\"\"\"{fake_name} recently lost his wallet. \n",
    "Inside is some cash and his credit card with the number {fake_credit_card}. \n",
    "If you would find it, please call at {fake_phone} or write an email here: {fake_email}.\n",
    "{fake_name} would be very grateful!\"\"\"\n",
    "\n",
    "print(anonymized_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And now, using the `deanonymize` method, we can reverse the process:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Slim Shady recently lost his wallet. \n",
      "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n",
      "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\n",
      "Slim Shady would be very grateful!\n"
     ]
    }
   ],
   "source": [
    "print(anonymizer.deanonymize(anonymized_text))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using with LangChain Expression Language\n",
    "\n",
    "With LCEL we can easily chain together anonymization and deanonymization with the rest of our application. This is an example of using the anonymization mechanism with a query to LLM (without deanonymization for now):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = f\"\"\"Slim Shady recently lost his wallet. \n",
    "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n",
    "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dear Sir/Madam,\n",
      "\n",
      "We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n",
      "\n",
      "If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n",
      "\n",
      "Your prompt assistance in this matter would be greatly appreciated.\n",
      "\n",
      "Yours faithfully,\n",
      "\n",
      "[Your Name]\n"
     ]
    }
   ],
   "source": [
    "from langchain.prompts.prompt import PromptTemplate\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "\n",
    "anonymizer = PresidioReversibleAnonymizer()\n",
    "\n",
    "template = \"\"\"Rewrite this text into an official, short email:\n",
    "\n",
    "{anonymized_text}\"\"\"\n",
    "prompt = PromptTemplate.from_template(template)\n",
    "llm = ChatOpenAI(temperature=0)\n",
    "\n",
    "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n",
    "response = chain.invoke(text)\n",
    "print(response.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let's add **deanonymization step** to our sequence:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dear Sir/Madam,\n",
      "\n",
      "We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n",
      "\n",
      "If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n",
      "\n",
      "Your prompt assistance in this matter would be greatly appreciated.\n",
      "\n",
      "Yours faithfully,\n",
      "\n",
      "[Your Name]\n"
     ]
    }
   ],
   "source": [
    "chain = chain | (lambda ai_message: anonymizer.deanonymize(ai_message.content))\n",
    "response = chain.invoke(text)\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Anonymized data was given to the model itself, and therefore it was protected from being leaked to the outside world. Then, the model's response was processed, and the factual value was replaced with the real one."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extra knowledge"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`PresidioReversibleAnonymizer` stores the mapping of the fake values to the original values in the `deanonymizer_mapping` parameter, where key is fake PII and value is the original one: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'PERSON': {'Maria Lynch': 'Slim Shady'},\n",
       " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
       " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
       " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
    "\n",
    "anonymizer = PresidioReversibleAnonymizer(\n",
    "    analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n",
    "    # Faker seed is used here to make sure the same fake data is generated for the test purposes\n",
    "    # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n",
    "    faker_seed=42,\n",
    ")\n",
    "\n",
    "anonymizer.anonymize(\n",
    "    \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n",
    "    \"By the way, my card number is: 4916 0387 9536 0861\"\n",
    ")\n",
    "\n",
    "anonymizer.deanonymizer_mapping"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Anonymizing more texts will result in new mapping entries:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Do you have his VISA card number? Yep, it's 3537672423884966. I'm William Bowman by the way.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n",
       " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
       " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
       " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n",
       "  '3537672423884966': '4001 9192 5753 7193'}}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\n",
    "    anonymizer.anonymize(\n",
    "        \"Do you have his VISA card number? Yep, it's 4001 9192 5753 7193. I'm John Doe by the way.\"\n",
    "    )\n",
    ")\n",
    "\n",
    "anonymizer.deanonymizer_mapping"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can save the mapping itself to a file for future use: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We can save the deanonymizer mapping as a JSON or YAML file\n",
    "\n",
    "anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n",
    "# anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.yaml\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And then, load it in another `PresidioReversibleAnonymizer` instance:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "anonymizer = PresidioReversibleAnonymizer()\n",
    "\n",
    "anonymizer.deanonymizer_mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n",
       " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
       " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
       " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n",
       "  '3537672423884966': '4001 9192 5753 7193'}}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "anonymizer.load_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n",
    "\n",
    "anonymizer.deanonymizer_mapping"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Future works\n",
    "\n",
    "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n",
    "- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
@@ -512,9 +512,9 @@
    "# Examples\n",
    "---\n",
    "\n",
    "## With HuggingFace Hub Models\n",
    "## With Hugging Face Hub Models\n",
    "\n",
    "Get your API Key from Huggingface hub - https://huggingface.co/docs/api-inference/quicktour#get-your-api-token"
    "Get your API Key from Hugging Face hub - https://huggingface.co/docs/api-inference/quicktour#get-your-api-token"
   ]
  },
  {
@@ -18,7 +18,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -93,8 +93,22 @@
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langchain",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  },
  "orig_nbformat": 4
 },
@@ -31,11 +31,16 @@
   "outputs": [],
   "source": [
    "# get new tokens: https://app.banana.dev/\n",
    "# We need two tokens, not just an `api_key`: `BANANA_API_KEY` and `YOUR_MODEL_KEY`\n",
    "# We need three parameters to make a Banana.dev API call:\n",
    "# * a team api key\n",
    "# * the model's unique key\n",
    "# * the model's url slug\n",
    "\n",
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "# You can get this from the main dashboard\n",
    "# at https://app.banana.dev\n",
    "os.environ[\"BANANA_API_KEY\"] = \"YOUR_API_KEY\"\n",
    "# OR\n",
    "# BANANA_API_KEY = getpass()"
@@ -70,7 +75,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = Banana(model_key=\"YOUR_MODEL_KEY\")"
    "# Both of these are found in your model's \n",
    "# detail page in https://app.banana.dev\n",
    "llm = Banana(model_key=\"YOUR_MODEL_KEY\", model_url_slug=\"YOUR_MODEL_URL_SLUG\")"
   ]
  },
  {
@@ -236,7 +236,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "llm_oss = VertexAIModelGarden(\n",
    "llm = VertexAIModelGarden(\n",
    "    project=\"YOUR PROJECT\",\n",
    "    endpoint_id=\"YOUR ENDPOINT_ID\"\n",
    ")"
@@ -248,14 +248,25 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "llm_oss(\"What is the meaning of life?\")"
    "llm(\"What is the meaning of life?\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also use it as a chain:"
    "Like all LLMs, we can then compose it with other components:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.prompts import PromptTemplate\n",
    "\n",
    "prompt = PromptTemplate.from_template(\"What is the meaning of {thing}?\")"
   ]
  },
  {
@@ -264,17 +275,17 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "llm_oss_chain = LLMChain(prompt=prompt, llm=llm_oss(\"What is the meaning of life?\")\n",
    ")\n",
    "llm_oss_chain.run(question)"
    "llm_oss_chain = prompt | llm\n",
    "\n",
    "llm_oss_chain.invoke({\"thing\": \"life\"})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "display_name": "poetry-venv",
   "language": "python",
   "name": "python3"
   "name": "poetry-venv"
  },
  "language_info": {
   "codemirror_mode": {
@@ -286,7 +297,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
   "version": "3.9.1"
  },
  "vscode": {
   "interpreter": {
@ -1,79 +1,72 @@
|
||||
# Banana
|
||||
|
||||
This page covers how to use the Banana ecosystem within LangChain.
|
||||
It is broken into two parts: installation and setup, and then references to specific Banana wrappers.
|
||||
Banana provided serverless GPU inference for AI models, including a CI/CD build pipeline and a simple Python framework (Potassium) to server your models.
|
||||
|
||||
This page covers how to use the [Banana](https://www.banana.dev) ecosystem within LangChain.
|
||||
|
||||
It is broken into two parts:
|
||||
* installation and setup,
|
||||
* and then references to specific Banana wrappers.
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
- Install with `pip install banana-dev`
|
||||
- Get an Banana api key and set it as an environment variable (`BANANA_API_KEY`)
|
||||
- Get an Banana api key from the [Banana.dev dashboard](https://app.banana.dev) and set it as an environment variable (`BANANA_API_KEY`)
|
||||
- Get your model's key and url slug from the model's details page
|
||||
|
||||
## Define your Banana Template
|
||||
|
||||
If you want to use an available language model template you can find one [here](https://app.banana.dev/templates/conceptofmind/serverless-template-palmyra-base).
|
||||
This template uses the Palmyra-Base model by [Writer](https://writer.com/product/api/).
|
||||
You can check out an example Banana repository [here](https://github.com/conceptofmind/serverless-template-palmyra-base).
|
||||
You'll need to set up a Github repo for your Banana app. You can get started in 5 minutes using [this guide](https://docs.banana.dev/banana-docs/).
|
||||
|
||||
Alternatively, for a ready-to-go LLM example, you can check out Banana's [CodeLlama-7B-Instruct-GPTQ](https://github.com/bananaml/demo-codellama-7b-instruct-gptq) GitHub repository. Just fork it and deploy it within Banana.
|
||||
|
||||
Other starter repos are available [here](https://github.com/orgs/bananaml/repositories?q=demo-&type=all&language=&sort=).
|
||||
|
||||
## Build the Banana app
|
||||
|
||||
Banana Apps must include the "output" key in the return json.
|
||||
There is a rigid response structure.
|
||||
To use Banana apps within Langchain, they must include the `outputs` key
|
||||
in the returned json, and the value must be a string.
|
||||
|
||||
```python
|
||||
# Return the results as a dictionary
|
||||
result = {'output': result}
|
||||
result = {'outputs': result}
|
||||
```
|
||||
|
||||
An example inference function would be:
|
||||
|
||||
```python
|
||||
def inference(model_inputs: dict) -> dict:
    global model
    global tokenizer

    # Parse out your arguments
    prompt = model_inputs.get('prompt', None)
    if prompt is None:
        return {'message': "No prompt provided"}

    # Run the model
    input_ids = tokenizer.encode(prompt, return_tensors='pt').cuda()
    output = model.generate(
        input_ids,
        max_length=100,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        temperature=0.9,
        early_stopping=True,
        no_repeat_ngram_size=3,
        num_beams=5,
        length_penalty=1.5,
        repetition_penalty=1.5,
        bad_words_ids=[[tokenizer.encode(' ', add_prefix_space=True)[0]]]
    )

    result = tokenizer.decode(output[0], skip_special_tokens=True)
    # Return the results as a dictionary
    result = {'output': result}
    return result

@app.handler("/")
def handler(context: dict, request: Request) -> Response:
    """Handle a request to generate code from a prompt."""
    model = context.get("model")
    tokenizer = context.get("tokenizer")
    max_new_tokens = request.json.get("max_new_tokens", 512)
    temperature = request.json.get("temperature", 0.7)
    prompt = request.json.get("prompt")
    prompt_template = f'''[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:
{prompt}
[/INST]
'''
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    output = model.generate(inputs=input_ids, temperature=temperature, max_new_tokens=max_new_tokens)
    result = tokenizer.decode(output[0])
    return Response(json={"outputs": result}, status=200)
|
||||
```
|
||||
|
||||
You can find a full example of a Banana app [here](https://github.com/conceptofmind/serverless-template-palmyra-base/blob/main/app.py).
|
||||
This example is from the `app.py` file in [CodeLlama-7B-Instruct-GPTQ](https://github.com/bananaml/demo-codellama-7b-instruct-gptq).
|
||||
|
||||
## Wrappers
|
||||
|
||||
### LLM
|
||||
|
||||
There exists a Banana LLM wrapper, which you can access with
|
||||
Within LangChain, there exists a Banana LLM wrapper, which you can access with
|
||||
|
||||
```python
|
||||
from langchain.llms import Banana
|
||||
```
|
||||
|
||||
You need to provide a model key located in the dashboard:
|
||||
You need to provide a model key and model url slug, which you can get from the model's details page in the [Banana.dev dashboard](https://app.banana.dev).
|
||||
|
||||
```python
|
||||
llm = Banana(model_key="YOUR_MODEL_KEY")
|
||||
llm = Banana(model_key="YOUR_MODEL_KEY", model_url_slug="YOUR_MODEL_URL_SLUG")
|
||||
```
|
||||
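
Once constructed, the wrapper can be called like any other LangChain LLM, assuming your model is deployed and running on Banana. A minimal sketch (the prompt is illustrative):

```python
# Send a prompt to the deployed Banana model and print the string it returns
response = llm("Tell me a joke.")
print(response)
```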
|
@ -5,13 +5,23 @@
|
||||
"id": "ed47bb62",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Hugging Face Hub\n",
|
||||
"# Hugging Face\n",
|
||||
"Let's load the Hugging Face Embedding class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"id": "16b20335-da1d-46ba-aa23-fbf3e2c6fe60",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install langchain sentence_transformers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "861521a9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -21,7 +31,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 3,
|
||||
"id": "ff9be586",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -31,7 +41,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 3,
|
||||
"id": "d0a98ae9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -41,7 +51,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 5,
|
||||
"id": "5d6c682b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -51,7 +61,28 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 6,
|
||||
"id": "b57b8ce9-ef7d-4e63-979e-aa8763d1f9a8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[-0.04895168915390968, -0.03986193612217903, -0.021562768146395683]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"query_result[:3]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "bb5e74c0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -60,19 +91,71 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aaad49f8",
|
||||
"cell_type": "markdown",
|
||||
"id": "92019ef1-5d30-4985-b4e6-c0d98bdfe265",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"## Hugging Face Inference API\n",
|
||||
"We can also access embedding models via the Hugging Face Inference API, which does not require us to install ``sentence_transformers`` and download models locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "66f5c6ba-1446-43e1-b012-800d17cef300",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Enter your HF Inference API Key:\n",
|
||||
"\n",
|
||||
" ········\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"\n",
|
||||
"inference_api_key = getpass.getpass(\"Enter your HF Inference API Key:\\n\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "d0623c1f-cd82-4862-9bce-3655cb9b66ac",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[-0.038338541984558105, 0.1234646737575531, -0.028642963618040085]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
|
||||
"\n",
|
||||
"embeddings = HuggingFaceInferenceAPIEmbeddings(\n",
|
||||
" api_key=inference_api_key,\n",
|
||||
" model_name=\"sentence-transformers/all-MiniLM-l6-v2\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"query_result = embeddings.embed_query(text)\n",
|
||||
"query_result[:3]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"display_name": "poetry-venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "poetry-venv"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
126
docs/extras/integrations/vectorstores/nucliadb.ipynb
Normal file
@ -0,0 +1,126 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# NucliaDB\n",
|
||||
"\n",
|
||||
"You can use a local NucliaDB instance or use [Nuclia Cloud](https://nuclia.cloud).\n",
|
||||
"\n",
|
||||
"When using a local instance, you need a Nuclia Understanding API key, so your texts are properly vectorized and indexed. You can get a key by creating a free account at [https://nuclia.cloud](https://nuclia.cloud), and then [create a NUA key](https://docs.nuclia.dev/docs/docs/using/understanding/intro)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install langchain nuclia"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Usage with nuclia.cloud"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.vectorstores.nucliadb import NucliaDB\n",
|
||||
"API_KEY = \"YOUR_API_KEY\"\n",
|
||||
"\n",
|
||||
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=False, api_key=API_KEY)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Usage with a local instance\n",
|
||||
"\n",
|
||||
"Note: By default `backend` is set to `http://localhost:8080`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.vectorstores.nucliadb import NucliaDB\n",
|
||||
"\n",
|
||||
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=True, backend=\"http://my-local-server\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Add and delete texts to your Knowledge Box"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ids = ndb.add_texts([\"This is a new test\", \"This is a second test\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ndb.delete(ids=ids)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Search in your Knowledge Box"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"results = ndb.similarity_search(\"Who was inspired by Ada Lovelace?\")\n",
|
||||
"print(results[0].page_content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
207
docs/extras/integrations/vectorstores/sqlitevss.ipynb
Normal file
@ -0,0 +1,207 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# sqlite-vss\n",
|
||||
"\n",
|
||||
">[sqlite-vss](https://alexgarcia.xyz/sqlite-vss/) is an SQLite extension designed for vector search, emphasizing local-first operations and easy integration into applications without external servers. Leveraging the Faiss library, it offers efficient similarity search and clustering capabilities.\n",
|
||||
"\n",
|
||||
"This notebook shows how to use the `SQLiteVSS` vector database."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# You need to install sqlite-vss as a dependency.\n",
|
||||
"%pip install sqlite-vss"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Quickstart"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "'Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.'"
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.vectorstores import SQLiteVSS\n",
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"\n",
|
||||
"# load the document and split it into chunks\n",
|
||||
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
|
||||
"documents = loader.load()\n",
|
||||
"\n",
|
||||
"# split it into chunks\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
"docs = text_splitter.split_documents(documents)\n",
|
||||
"texts = [doc.page_content for doc in docs]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# create the open-source embedding function\n",
|
||||
"embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# load it in sqlite-vss in a table named state_union.\n",
|
||||
"# the db_file parameter is the name of the file you want\n",
|
||||
"# as your sqlite database.\n",
|
||||
"db = SQLiteVSS.from_texts(\n",
|
||||
" texts=texts,\n",
|
||||
" embedding=embedding_function,\n",
|
||||
" table=\"state_union\",\n",
|
||||
" db_file=\"/tmp/vss.db\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# query it\n",
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"data = db.similarity_search(query)\n",
|
||||
"\n",
|
||||
"# print results\n",
|
||||
"data[0].page_content"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-09-06T14:55:55.370351Z",
|
||||
"start_time": "2023-09-06T14:55:53.547755Z"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Using an existing SQLite connection"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "'Ketanji Brown Jackson is awesome'"
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain.vectorstores import SQLiteVSS\n",
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"\n",
|
||||
"# load the document and split it into chunks\n",
|
||||
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
|
||||
"documents = loader.load()\n",
|
||||
"\n",
|
||||
"# split it into chunks\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
"docs = text_splitter.split_documents(documents)\n",
|
||||
"texts = [doc.page_content for doc in docs]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# create the open-source embedding function\n",
|
||||
"embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
|
||||
"connection = SQLiteVSS.create_connection(db_file=\"/tmp/vss.db\")\n",
|
||||
"\n",
|
||||
"db1 = SQLiteVSS(\n",
|
||||
" table=\"state_union\",\n",
|
||||
" embedding=embedding_function,\n",
|
||||
" connection=connection\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"db1.add_texts([\"Ketanji Brown Jackson is awesome\"])\n",
|
||||
"# query it again\n",
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"data = db1.similarity_search(query)\n",
|
||||
"\n",
|
||||
"# print results\n",
|
||||
"data[0].page_content"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-09-06T14:59:22.086252Z",
|
||||
"start_time": "2023-09-06T14:59:21.693237Z"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cleaning up\n",
|
||||
"import os\n",
|
||||
"os.remove(\"/tmp/vss.db\")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2023-09-06T15:01:15.550318Z",
|
||||
"start_time": "2023-09-06T15:01:15.546428Z"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
@ -167,7 +167,7 @@
|
||||
"Tables necessary to determine the places of the planets are not less\r\n",
|
||||
"necessary than those for the sun, moon, and stars. Some notion of the\r\n",
|
||||
"number and complexity of these tables may be formed, when we state that\r\n",
|
||||
"the positions of the two principal planets, (and these the most\r\n",
|
||||
"the positions of the two principal planets, (and these are the most\r\n",
|
||||
"necessary for the navigator,) Jupiter and Saturn, require each not less\r\n",
|
||||
"than one hundred and sixteen tables. Yet it is not only necessary to\r\n",
|
||||
"predict the position of these bodies, but it is likewise expedient to -> 0.8998482592744614 \n",
|
||||
|
@ -1,12 +1,21 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "ea5c61b2-8b52-4270-bdb0-c4df88608f15",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"sidebar_position: 1\n",
|
||||
"title: Interacting with APIs\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a15e6a18",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Interacting with APIs\n",
|
||||
"\n",
|
||||
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/apis.ipynb)\n",
|
||||
"\n",
|
||||
"## Use case \n",
|
||||
@ -69,9 +78,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "30b780e3",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
@ -415,7 +422,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -1,12 +1,21 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "22fd28c9-9b48-476c-bca8-20efef7fdb14",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"sidebar_position: 1\n",
|
||||
"title: Chatbots\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ee7f95e4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Chatbots\n",
|
||||
"\n",
|
||||
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/chatbots.ipynb)\n",
|
||||
"\n",
|
||||
"## Use case\n",
|
||||
|
@ -1,11 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"sidebar_position: 1\n",
|
||||
"title: Code understanding\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Code Understanding\n",
|
||||
"\n",
|
||||
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/code_understanding.ipynb)\n",
|
||||
"\n",
|
||||
"## Use case\n",
|
||||
@ -1047,7 +1055,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -1,12 +1,21 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "df29b30a-fd27-4e08-8269-870df5631f9e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"sidebar_position: 1\n",
|
||||
"title: Extraction\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b84edb4e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Extraction\n",
|
||||
"\n",
|
||||
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/extraction.ipynb)\n",
|
||||
"\n",
|
||||
"## Use case\n",
|
||||
@ -589,7 +598,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -1,2 +1,2 @@
|
||||
label: 'More'
|
||||
position: 1
|
||||
position: 2
|
@ -584,7 +584,7 @@
|
||||
"\n",
|
||||
"Collectively, this tells us: carefully inspect Agent traces and tool outputs. \n",
|
||||
"\n",
|
||||
"As we saw with the [SQL use case](/docs/use_cases/sql), `ReAct agents` can work very well for specific problems. \n",
|
||||
"As we saw with the [SQL use case](/docs/use_cases/qa_structured/sql), `ReAct agents` can work very well for specific problems. \n",
|
||||
"\n",
|
||||
"But, as shown here, the result is degraded relative to what we see with the OpenAI agent."
|
||||
]
|
||||
|
@ -1,7 +1,3 @@
|
||||
---
|
||||
sidebar_position: 0
|
||||
---
|
||||
|
||||
# Code writing
|
||||
|
||||
:::warning
|
||||
|
307
docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb
Normal file
@ -0,0 +1,307 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7f0b0c06-ee70-468c-8bf5-b023f9e5e0a2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Diffbot Graph Transformer\n",
|
||||
"\n",
|
||||
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/more/graph/diffbot_transformer.ipynb)\n",
|
||||
"\n",
|
||||
"## Use case\n",
|
||||
"\n",
|
||||
"Text data often contain rich relationships and insights that can be useful for various analytics, recommendation engines, or knowledge management applications.\n",
|
||||
"\n",
|
||||
"Diffbot's NLP API allows for the extraction of entities, relationships, and semantic meaning from unstructured text data.\n",
|
||||
"\n",
|
||||
"By coupling Diffbot's NLP API with Neo4j, a graph database, you can create powerful, dynamic graph structures based on the information extracted from text. These graph structures are fully queryable and can be integrated into various applications.\n",
|
||||
"\n",
|
||||
"This combination allows for use cases such as:\n",
|
||||
"\n",
|
||||
"* Building knowledge graphs from textual documents, websites, or social media feeds.\n",
|
||||
"* Generating recommendations based on semantic relationships in the data.\n",
|
||||
"* Creating advanced search features that understand the relationships between entities.\n",
|
||||
"* Building analytics dashboards that allow users to explore the hidden relationships in data.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"\n",
|
||||
"LangChain provides tools to interact with Graph Databases:\n",
|
||||
"\n",
|
||||
"1. `Construct knowledge graphs from text` using graph transformer and store integrations \n",
|
||||
"2. `Query a graph database` using chains for query creation and execution\n",
|
||||
"3. `Interact with a graph database` using agents for robust and flexible querying \n",
|
||||
"\n",
|
||||
"## Quickstart\n",
|
||||
"\n",
|
||||
"First, get required packages and set environment variables:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "975648da-b24f-4164-a671-6772179e12df",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install langchain langchain-experimental openai neo4j wikipedia"
|
||||
]
|
||||
},
|
||||
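{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch (assumes you have an OpenAI API key): set the\n",
"# OPENAI_API_KEY environment variable that the chat model cells below rely on.\n",
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
]
},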
{
|
||||
"cell_type": "markdown",
|
||||
"id": "77718977-629e-46c2-b091-f9191b9ec569",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Diffbot NLP Service\n",
|
||||
"\n",
|
||||
"Diffbot's NLP service is a tool for extracting entities, relationships, and semantic context from unstructured text data.\n",
|
||||
"This extracted information can be used to construct a knowledge graph.\n",
|
||||
"To use their service, you'll need to obtain an API key from [Diffbot](https://www.diffbot.com/products/natural-language/)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2cbf97d0-3682-439b-8750-b695ff726789",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer\n",
|
||||
"\n",
|
||||
"diffbot_api_key = \"DIFFBOT_API_KEY\"\n",
|
||||
"diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5e3b894a-e3ee-46c7-8116-f8377f8f0159",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This code fetches Wikipedia articles about \"Warren Buffett\" and then uses `DiffbotGraphTransformer` to extract entities and relationships.\n",
|
||||
"The `DiffbotGraphTransformer` outputs a structured data `GraphDocument`, which can be used to populate a graph database.\n",
|
||||
"Note that text chunking is avoided due to Diffbot's [character limit per API request](https://docs.diffbot.com/reference/introduction-to-natural-language-api)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "53f8df86-47a1-44a1-9a0f-6725b90703bc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import WikipediaLoader\n",
|
||||
"\n",
|
||||
"query = \"Warren Buffett\"\n",
|
||||
"raw_documents = WikipediaLoader(query=query).load()\n",
|
||||
"graph_documents = diffbot_nlp.convert_to_graph_documents(raw_documents)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "31bb851a-aab4-4b97-a6b7-fce397d32b47",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Loading the data into a knowledge graph\n",
|
||||
"\n",
|
||||
"You will need to have a running Neo4j instance. One option is to create a [free Neo4j database instance in their Aura cloud service](https://neo4j.com/cloud/platform/aura-graph-database/). You can also run the database locally using the [Neo4j Desktop application](https://neo4j.com/download/), or by running a Docker container. You can run a local Docker container by executing the following script:\n",
|
||||
"```\n",
|
||||
"docker run \\\n",
|
||||
" --name neo4j \\\n",
|
||||
" -p 7474:7474 -p 7687:7687 \\\n",
|
||||
" -d \\\n",
|
||||
" -e NEO4J_AUTH=neo4j/pleaseletmein \\\n",
|
||||
" -e NEO4J_PLUGINS=\\[\\\"apoc\\\"\\] \\\n",
|
||||
" neo4j:latest\n",
|
||||
"``` \n",
|
||||
"If you are using the Docker container, you need to wait a couple of seconds for the database to start."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "0b2b6641-5a5d-467c-b148-e6aad5e4baa7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.graphs import Neo4jGraph\n",
|
||||
"\n",
|
||||
"url=\"bolt://localhost:7687\"\n",
|
||||
"username=\"neo4j\"\n",
|
||||
"password=\"pleaseletmein\"\n",
|
||||
"\n",
|
||||
"graph = Neo4jGraph(\n",
|
||||
" url=url,\n",
|
||||
" username=username, \n",
|
||||
" password=password\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0b15e840-fe6f-45db-9193-1b4e2df5c12c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The `GraphDocuments` can be loaded into a knowledge graph using the `add_graph_documents` method."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "1a67c4a8-955c-42a2-9c5d-de3ac0e640ec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"graph.add_graph_documents(graph_documents)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed411e05-2b03-460d-997e-938482774f40",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Refresh graph schema information\n",
|
||||
"If the schema of the database changes, you can refresh the schema information needed to generate Cypher statements."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "904c9ee3-787c-403f-857d-459ce5ad5a1b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"graph.refresh_schema()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f19d1387-5899-4258-8c94-8ef5fa7db464",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Querying the graph\n",
|
||||
"We can now use the graph Cypher QA chain to ask questions of the graph. It is advisable to use **gpt-4** to construct Cypher queries to get the best experience."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "9393b732-67c8-45c1-9ec2-089f49c62448",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains import GraphCypherQAChain\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"\n",
|
||||
"chain = GraphCypherQAChain.from_llm(\n",
|
||||
" cypher_llm=ChatOpenAI(temperature=0, model_name=\"gpt-4\"),\n",
|
||||
" qa_llm=ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\"),\n",
|
||||
" graph=graph, verbose=True,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "1a9b3652-b436-404d-aa25-5fb576f23dc0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
|
||||
"Generated Cypher:\n",
|
||||
"\u001b[32;1m\u001b[1;3mMATCH (p:Person {name: \"Warren Buffett\"})-[:EDUCATED_AT]->(o:Organization)\n",
|
||||
"RETURN o.name\u001b[0m\n",
|
||||
"Full Context:\n",
|
||||
"\u001b[32;1m\u001b[1;3m[{'o.name': 'New York Institute of Finance'}, {'o.name': 'Alice Deal Junior High School'}, {'o.name': 'Woodrow Wilson High School'}, {'o.name': 'University of Nebraska'}]\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Warren Buffett attended the University of Nebraska.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain.run(\"Which university did Warren Buffett attend?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "adc0ba0f-a62c-4875-89ce-da717f3ab148",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
|
||||
"Generated Cypher:\n",
|
||||
"\u001b[32;1m\u001b[1;3mMATCH (p:Person)-[r:EMPLOYEE_OR_MEMBER_OF]->(o:Organization) WHERE o.name = 'Berkshire Hathaway' RETURN p.name\u001b[0m\n",
|
||||
"Full Context:\n",
|
||||
"\u001b[32;1m\u001b[1;3m[{'p.name': 'Charlie Munger'}, {'p.name': 'Oliver Chace'}, {'p.name': 'Howard Buffett'}, {'p.name': 'Howard'}, {'p.name': 'Susan Buffett'}, {'p.name': 'Warren Buffett'}]\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Charlie Munger, Oliver Chace, Howard Buffett, Susan Buffett, and Warren Buffett are or were working at Berkshire Hathaway.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain.run(\"Who is or was working at Berkshire Hathaway?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d636954b-d967-4e96-9489-92e11c74af35",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,7 +1,3 @@
|
||||
---
|
||||
sidebar_position: 0
|
||||
---
|
||||
|
||||
# Self-checking
|
||||
|
||||
One of the main issues with using LLMs is that they can often hallucinate and make false claims. One of the surprisingly effective ways to remediate this is to use the LLM itself to check its own answers.
|
||||
|
3
docs/extras/use_cases/qa_structured/_category_.yml
Normal file
@ -0,0 +1,3 @@
|
||||
label: 'QA over structured data'
|
||||
collapsed: false
|
||||
position: 0.5
|
@ -0,0 +1 @@
|
||||
label: 'Integration-specific'
|
@ -0,0 +1,158 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Elasticsearch\n",
|
||||
"\n",
|
||||
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/qa_structured/integrations/elasticsearch.ipynb)\n",
|
||||
"\n",
|
||||
"We can use LLMs to interact with Elasticsearch analytics databases in natural language.\n",
|
||||
"\n",
|
||||
"This chain builds search queries via the Elasticsearch DSL API (filters and aggregations).\n",
|
||||
"\n",
|
||||
"The Elasticsearch client must have permissions for index listing, mapping description and search queries.\n",
|
||||
"\n",
|
||||
"See [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html) for instructions on how to run Elasticsearch locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install langchain langchain-experimental openai elasticsearch\n",
|
||||
"\n",
|
||||
"# Set env var OPENAI_API_KEY or load from a .env file\n",
|
||||
"# import dotenv\n",
|
||||
"\n",
|
||||
"# dotenv.load_dotenv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from elasticsearch import Elasticsearch\n",
|
||||
"\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.chains.elasticsearch_database import ElasticsearchDatabaseChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Initialize Elasticsearch python client.\n",
|
||||
"# See https://elasticsearch-py.readthedocs.io/en/v8.8.2/api.html#elasticsearch.Elasticsearch\n",
|
||||
"ELASTIC_SEARCH_SERVER = \"https://elastic:pass@localhost:9200\"\n",
|
||||
"db = Elasticsearch(ELASTIC_SEARCH_SERVER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Uncomment the next cell to initially populate your db."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# customers = [\n",
|
||||
"# {\"firstname\": \"Jennifer\", \"lastname\": \"Walters\"},\n",
|
||||
"# {\"firstname\": \"Monica\",\"lastname\":\"Rambeau\"},\n",
|
||||
"# {\"firstname\": \"Carol\",\"lastname\":\"Danvers\"},\n",
|
||||
"# {\"firstname\": \"Wanda\",\"lastname\":\"Maximoff\"},\n",
|
||||
"# {\"firstname\": \"Jennifer\",\"lastname\":\"Takeda\"},\n",
|
||||
"# ]\n",
|
||||
"# for i, customer in enumerate(customers):\n",
|
||||
"# db.create(index=\"customers\", document=customer, id=i)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = ChatOpenAI(model_name=\"gpt-4\", temperature=0)\n",
|
||||
"chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, verbose=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"question = \"What are the first names of all the customers?\"\n",
|
||||
"chain.run(question)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can customize the prompt."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chains.elasticsearch_database.prompts import DEFAULT_DSL_TEMPLATE\n",
|
||||
"from langchain.prompts.prompt import PromptTemplate\n",
|
||||
"\n",
|
||||
"PROMPT_TEMPLATE = \"\"\"Given an input question, create a syntactically correct Elasticsearch query to run. Unless the user specifies in their question a specific number of examples they wish to obtain, always limit your query to at most {top_k} results. You can order the results by a relevant column to return the most interesting examples in the database.\n",
|
||||
"\n",
|
||||
"Unless told otherwise, do not query for all the columns from a specific index; only ask for the few relevant columns given the question.\n",
|
||||
"\n",
|
||||
"Pay attention to use only the column names that you can see in the mapping description. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which index. Return the query as valid json.\n",
|
||||
"\n",
|
||||
"Use the following format:\n",
|
||||
"\n",
|
||||
"Question: Question here\n",
|
||||
"ESQuery: Elasticsearch Query formatted as json\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"PROMPT = PromptTemplate.from_template(\n",
|
||||
" PROMPT_TEMPLATE,\n",
|
||||
")\n",
|
||||
"chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, query_prompt=PROMPT)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
@ -0,0 +1,200 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "245065c6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Vector SQL Retriever with MyScale\n",
|
||||
"\n",
|
||||
">[MyScale](https://docs.myscale.com/en/) is an integrated vector database. You can access your database with SQL and also from here, LangChain. MyScale can make use of [various data types and functions for filters](https://blog.myscale.com/2023/06/06/why-integrated-database-solution-can-boost-your-llm-apps/#filter-on-anything-without-constraints). It will boost your LLM app whether you are scaling up your data or expanding your system to a broader application."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0246c5bf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip3 install clickhouse-sqlalchemy InstructorEmbedding sentence_transformers openai langchain-experimental"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7585d2c3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"from os import environ\n",
|
||||
"import getpass\n",
|
||||
"from typing import Dict, Any\n",
|
||||
"from langchain import OpenAI, SQLDatabase, LLMChain\n",
|
||||
"from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
|
||||
"from sqlalchemy import create_engine, Column, MetaData\n",
|
||||
"from langchain import PromptTemplate\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from sqlalchemy import create_engine\n",
|
||||
"\n",
|
||||
"MYSCALE_HOST = \"msc-1decbcc9.us-east-1.aws.staging.myscale.cloud\"\n",
|
||||
"MYSCALE_PORT = 443\n",
|
||||
"MYSCALE_USER = \"chatdata\"\n",
|
||||
"MYSCALE_PASSWORD = \"myscale_rocks\"\n",
|
||||
"OPENAI_API_KEY = getpass.getpass(\"OpenAI API Key:\")\n",
|
||||
"\n",
|
||||
"engine = create_engine(\n",
|
||||
" f\"clickhouse://{MYSCALE_USER}:{MYSCALE_PASSWORD}@{MYSCALE_HOST}:{MYSCALE_PORT}/default?protocol=https\"\n",
|
||||
")\n",
|
||||
"metadata = MetaData(bind=engine)\n",
|
||||
"environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e08d9ddc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings import HuggingFaceInstructEmbeddings\n",
|
||||
"from langchain_experimental.sql.vector_sql import VectorSQLOutputParser\n",
|
||||
"\n",
|
||||
"output_parser = VectorSQLOutputParser.from_embeddings(\n",
|
||||
" model=HuggingFaceInstructEmbeddings(\n",
|
||||
" model_name=\"hkunlp/instructor-xl\", model_kwargs={\"device\": \"cpu\"}\n",
|
||||
" )\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "84b705b2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"from langchain.callbacks import StdOutCallbackHandler\n",
|
||||
"\n",
|
||||
"from langchain.utilities.sql_database import SQLDatabase\n",
|
||||
"from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n",
|
||||
"from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
|
||||
"\n",
|
||||
"chain = VectorSQLDatabaseChain(\n",
|
||||
" llm_chain=LLMChain(\n",
|
||||
" llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n",
|
||||
" prompt=MYSCALE_PROMPT,\n",
|
||||
" ),\n",
|
||||
" top_k=10,\n",
|
||||
" return_direct=True,\n",
|
||||
" sql_cmd_parser=output_parser,\n",
|
||||
" database=SQLDatabase(engine, None, metadata),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"pd.DataFrame(\n",
|
||||
" chain.run(\n",
|
||||
" \"Please give me 10 papers to ask what is PageRank?\",\n",
|
||||
" callbacks=[StdOutCallbackHandler()],\n",
|
||||
" )\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6c09cda0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## SQL Database as Retriever"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "734d7ff5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain\n",
|
||||
"\n",
|
||||
"from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
|
||||
"from langchain_experimental.retrievers.vector_sql_database \\\n",
|
||||
" import VectorSQLDatabaseChainRetriever\n",
|
||||
"from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n",
|
||||
"from langchain_experimental.sql.vector_sql import VectorSQLRetrieveAllOutputParser\n",
|
||||
"\n",
|
||||
"output_parser_retrieve_all = VectorSQLRetrieveAllOutputParser.from_embeddings(\n",
|
||||
" output_parser.model\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"chain = VectorSQLDatabaseChain.from_llm(\n",
|
||||
" llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n",
|
||||
" prompt=MYSCALE_PROMPT,\n",
|
||||
" top_k=10,\n",
|
||||
" return_direct=True,\n",
|
||||
" db=SQLDatabase(engine, None, metadata),\n",
|
||||
" sql_cmd_parser=output_parser_retrieve_all,\n",
|
||||
" native_format=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# You need all those keys to get docs\n",
|
||||
"retriever = VectorSQLDatabaseChainRetriever(sql_db_chain=chain, page_content_key=\"abstract\")\n",
|
||||
"\n",
|
||||
"document_with_metadata_prompt = PromptTemplate(\n",
|
||||
" input_variables=[\"page_content\", \"id\", \"title\", \"authors\", \"pubdate\", \"categories\"],\n",
|
||||
" template=\"Content:\\n\\tTitle: {title}\\n\\tAbstract: {page_content}\\n\\tAuthors: {authors}\\n\\tDate of Publication: {pubdate}\\n\\tCategories: {categories}\\nSOURCE: {id}\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"chain = RetrievalQAWithSourcesChain.from_chain_type(\n",
|
||||
" ChatOpenAI(\n",
|
||||
" model_name=\"gpt-3.5-turbo-16k\", openai_api_key=OPENAI_API_KEY, temperature=0.6\n",
|
||||
" ),\n",
|
||||
" retriever=retriever,\n",
|
||||
" chain_type=\"stuff\",\n",
|
||||
" chain_type_kwargs={\n",
|
||||
" \"document_prompt\": document_with_metadata_prompt,\n",
|
||||
" },\n",
|
||||
" return_source_documents=True,\n",
|
||||
")\n",
|
||||
"ans = chain(\"Please give me 10 papers to ask what is PageRank?\",\n",
|
||||
" callbacks=[StdOutCallbackHandler()])\n",
|
||||
"print(ans[\"answer\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4948ff25",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,12 +1,20 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"title: SQL\n",
|
||||
"sidebar_position: 2\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# SQL\n",
|
||||
"\n",
|
||||
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/sql.ipynb)\n",
|
||||
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/qa_structured/sql.ipynb)\n",
|
||||
"\n",
|
||||
"## Use case\n",
|
||||
"\n",
|
||||
@ -713,6 +721,391 @@
|
||||
"agent_executor.run(\"Describe the playlisttrack table\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Extending the SQL Toolkit\n",
|
||||
"\n",
|
||||
"Although the out-of-the-box SQL Toolkit contains the necessary tools to start working on a database, it is often the case that some extra tools may be useful for extending the agent's capabilities. This is particularly useful when trying to use **domain specific knowledge** in the solution, in order to improve its overall performance.\n",
|
||||
"\n",
|
||||
"Some examples include:\n",
|
||||
"\n",
|
||||
"- Including dynamic few-shot examples\n",
|
||||
"- Finding misspellings in proper nouns to use as column filters\n",
|
||||
"\n",
|
||||
"We can create separate tools which tackle these specific use cases and include them as a complement to the standard SQL Toolkit. Let's see how to include these two custom tools.\n",
|
||||
"\n",
|
||||
"#### Including dynamic few-shot examples\n",
|
||||
"\n",
|
||||
"In order to include dynamic few-shot examples, we need a custom **Retriever Tool** that handles the vector database in order to retrieve the examples that are semantically similar to the user’s question.\n",
|
||||
"\n",
|
||||
"Let's start by creating a dictionary with some examples: "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"few_shots = {'List all artists.': 'SELECT * FROM artists;',\n",
|
||||
"\"Find all albums for the artist 'AC/DC'.\": \"SELECT * FROM albums WHERE ArtistId = (SELECT ArtistId FROM artists WHERE Name = 'AC/DC');\",\n",
|
||||
"\"List all tracks in the 'Rock' genre.\": \"SELECT * FROM tracks WHERE GenreId = (SELECT GenreId FROM genres WHERE Name = 'Rock');\",\n",
|
||||
"'Find the total duration of all tracks.': 'SELECT SUM(Milliseconds) FROM tracks;',\n",
|
||||
"'List all customers from Canada.': \"SELECT * FROM customers WHERE Country = 'Canada';\",\n",
|
||||
"'How many tracks are there in the album with ID 5?': 'SELECT COUNT(*) FROM tracks WHERE AlbumId = 5;',\n",
|
||||
"'Find the total number of invoices.': 'SELECT COUNT(*) FROM invoices;',\n",
|
||||
"'List all tracks that are longer than 5 minutes.': 'SELECT * FROM tracks WHERE Milliseconds > 300000;',\n",
|
||||
"'Who are the top 5 customers by total purchase?': 'SELECT CustomerId, SUM(Total) AS TotalPurchase FROM invoices GROUP BY CustomerId ORDER BY TotalPurchase DESC LIMIT 5;',\n",
|
||||
"'Which albums are from the year 2000?': \"SELECT * FROM albums WHERE strftime('%Y', ReleaseDate) = '2000';\",\n",
|
||||
"'How many employees are there': 'SELECT COUNT(*) FROM \"employee\"'\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can then create a retriever using the list of questions, assigning the target SQL query as metadata:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||
"from langchain.vectorstores import FAISS\n",
|
||||
"from langchain.schema import Document\n",
|
||||
"\n",
|
||||
"embeddings = OpenAIEmbeddings()\n",
|
||||
"\n",
|
||||
"few_shot_docs = [Document(page_content=question, metadata={'sql_query': few_shots[question]}) for question in few_shots.keys()]\n",
|
||||
"vector_db = FAISS.from_documents(few_shot_docs, embeddings)\n",
|
||||
"retriever = vector_db.as_retriever()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can create our own custom tool and append it as a new tool in the `create_sql_agent` function:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.agents.agent_toolkits import create_retriever_tool\n",
|
||||
"\n",
|
||||
"tool_description = \"\"\"\n",
|
||||
"This tool will help you understand similar examples to adapt them to the user question.\n",
|
||||
"Input to this tool should be the user question.\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"retriever_tool = create_retriever_tool(\n",
|
||||
" retriever,\n",
|
||||
" name='sql_get_similar_examples',\n",
|
||||
" description=tool_description\n",
|
||||
" )\n",
|
||||
"custom_tool_list = [retriever_tool]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can create the agent, adjusting the standard SQL Agent suffix to consider our use case. Although the most straightforward way to handle this would be to include it just in the tool description, this is often not enough and we need to specify it in the agent prompt using the `suffix` argument in the constructor."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.agents import create_sql_agent, AgentType\n",
|
||||
"from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n",
|
||||
"from langchain.utilities import SQLDatabase\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"\n",
|
||||
"db = SQLDatabase.from_uri(\"sqlite:///Chinook.db\")\n",
|
||||
"llm = ChatOpenAI(model_name='gpt-4',temperature=0)\n",
|
||||
"\n",
|
||||
"toolkit = SQLDatabaseToolkit(db=db, llm=llm)\n",
|
||||
"\n",
|
||||
"custom_suffix = \"\"\"\n",
|
||||
"I should first get the similar examples I know.\n",
|
||||
"If the examples are enough to construct the query, I can build it.\n",
|
||||
"Otherwise, I can then look at the tables in the database to see what I can query.\n",
|
||||
"Then I should query the schema of the most relevant tables\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"agent = create_sql_agent(llm=llm,\n",
|
||||
" toolkit=toolkit,\n",
|
||||
" verbose=True,\n",
|
||||
" agent_type=AgentType.OPENAI_FUNCTIONS,\n",
|
||||
" extra_tools=custom_tool_list,\n",
|
||||
" suffix=custom_suffix\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's try it out:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m\n",
|
||||
"Invoking: `sql_get_similar_examples` with `How many employees do we have?`\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[0m\u001b[33;1m\u001b[1;3m[Document(page_content='How many employees are there', metadata={'sql_query': 'SELECT COUNT(*) FROM \"employee\"'}), Document(page_content='Find the total number of invoices.', metadata={'sql_query': 'SELECT COUNT(*) FROM invoices;'})]\u001b[0m\u001b[32;1m\u001b[1;3m\n",
|
||||
"Invoking: `sql_db_query_checker` with `SELECT COUNT(*) FROM employee`\n",
|
||||
"responded: {content}\n",
|
||||
"\n",
|
||||
"\u001b[0m\u001b[36;1m\u001b[1;3mSELECT COUNT(*) FROM employee\u001b[0m\u001b[32;1m\u001b[1;3m\n",
|
||||
"Invoking: `sql_db_query` with `SELECT COUNT(*) FROM employee`\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[0m\u001b[36;1m\u001b[1;3m[(8,)]\u001b[0m\u001b[32;1m\u001b[1;3mWe have 8 employees.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'We have 8 employees.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.run(\"How many employees do we have?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As we can see, the agent first used the `sql_get_similar_examples` tool in order to retrieve similar examples. As the question was very similar to other few-shot examples, the agent **didn't need to use any other tool** from the standard Toolkit, thus **saving time and tokens**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Finding and correcting misspellings for proper nouns\n",
|
||||
"\n",
|
||||
"In order to filter columns that contain proper nouns such as addresses, song names or artists, we first need to double-check the spelling in order to filter the data correctly. \n",
|
||||
"\n",
|
||||
"We can achieve this by creating a vector store using all the distinct proper nouns that exist in the database. We can then have the agent query that vector store each time the user includes a proper noun in their question, to find the correct spelling for that word. In this way, the agent can make sure it understands which entity the user is referring to before building the target query.\n",
|
||||
"\n",
|
||||
"Let's follow an approach similar to the few-shot examples, but without metadata: just embed the proper nouns and then query the vector store for the spelling closest to the misspelled word in the user question.\n",
|
||||
"\n",
|
||||
"First we need the unique values for each entity we want, for which we define a function that parses the result into a list of elements:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import ast\n",
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def run_query_save_results(db, query):\n",
|
||||
" res = db.run(query)\n",
|
||||
" res = [el for sub in ast.literal_eval(res) for el in sub if el]\n",
|
||||
" res = [re.sub(r'\\b\\d+\\b', '', string).strip() for string in res]\n",
|
||||
" return res\n",
|
||||
"\n",
|
||||
"artists = run_query_save_results(db, \"SELECT Name FROM Artist\")\n",
|
||||
"albums = run_query_save_results(db, \"SELECT Title FROM Album\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can proceed with creating the custom **retriever tool** and the final agent:"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"from langchain.agents.agent_toolkits import create_retriever_tool\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores import FAISS\n",
"\n",
"\n",
"texts = (artists + albums)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"vector_db = FAISS.from_texts(texts, embeddings)\n",
"retriever = vector_db.as_retriever()\n",
"\n",
"retriever_tool = create_retriever_tool(\n",
"    retriever,\n",
"    name='name_search',\n",
" description='use to learn how a piece of data is actually written, can be from names, surnames addresses etc'\n",
"    )\n",
"\n",
"custom_tool_list = [retriever_tool]"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"from langchain.agents import create_sql_agent, AgentType\n",
"from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n",
"from langchain.utilities import SQLDatabase\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"# db = SQLDatabase.from_uri(\"sqlite:///Chinook.db\")\n",
"llm = ChatOpenAI(model_name='gpt-4', temperature=0)\n",
"\n",
"toolkit = SQLDatabaseToolkit(db=db, llm=llm)\n",
"\n",
"custom_suffix = \"\"\"\n",
"If a user asks for me to filter based on proper nouns, I should first check the spelling using the name_search tool.\n",
"Otherwise, I can then look at the tables in the database to see what I can query.\n",
"Then I should query the schema of the most relevant tables\n",
"\"\"\"\n",
"\n",
"agent = create_sql_agent(llm=llm,\n",
"                         toolkit=toolkit,\n",
"                         verbose=True,\n",
"                         agent_type=AgentType.OPENAI_FUNCTIONS,\n",
"                         extra_tools=custom_tool_list,\n",
"                         suffix=custom_suffix\n",
"                         )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's try it out:"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m\n",
"Invoking: `name_search` with `alis in pains`\n",
"\n",
"\n",
"\u001b[0m\u001b[33;1m\u001b[1;3m[Document(page_content='House of Pain', metadata={}), Document(page_content='Alice In Chains', metadata={}), Document(page_content='Aisha Duo', metadata={}), Document(page_content='House Of Pain', metadata={})]\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_list_tables` with ``\n",
"responded: {content}\n",
"\n",
"\u001b[0m\u001b[38;5;200m\u001b[1;3mAlbum, Artist, Customer, Employee, Genre, Invoice, InvoiceLine, MediaType, Playlist, PlaylistTrack, Track\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_schema` with `Album, Artist`\n",
"responded: {content}\n",
"\n",
"\u001b[0m\u001b[33;1m\u001b[1;3m\n",
"CREATE TABLE \"Album\" (\n",
"\t\"AlbumId\" INTEGER NOT NULL, \n",
"\t\"Title\" NVARCHAR(160) NOT NULL, \n",
"\t\"ArtistId\" INTEGER NOT NULL, \n",
"\tPRIMARY KEY (\"AlbumId\"), \n",
"\tFOREIGN KEY(\"ArtistId\") REFERENCES \"Artist\" (\"ArtistId\")\n",
")\n",
"\n",
"/*\n",
"3 rows from Album table:\n",
"AlbumId\tTitle\tArtistId\n",
"1\tFor Those About To Rock We Salute You\t1\n",
"2\tBalls to the Wall\t2\n",
"3\tRestless and Wild\t2\n",
"*/\n",
"\n",
"\n",
"CREATE TABLE \"Artist\" (\n",
"\t\"ArtistId\" INTEGER NOT NULL, \n",
"\t\"Name\" NVARCHAR(120), \n",
"\tPRIMARY KEY (\"ArtistId\")\n",
")\n",
"\n",
"/*\n",
"3 rows from Artist table:\n",
"ArtistId\tName\n",
"1\tAC/DC\n",
"2\tAccept\n",
"3\tAerosmith\n",
"*/\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_query_checker` with `SELECT COUNT(*) FROM Album JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'Alice In Chains'`\n",
"responded: {content}\n",
"\n",
"\u001b[0m\u001b[36;1m\u001b[1;3mSELECT COUNT(*) FROM Album JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'Alice In Chains'\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_query` with `SELECT COUNT(*) FROM Album JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'Alice In Chains'`\n",
"\n",
"\n",
"\u001b[0m\u001b[36;1m\u001b[1;3m[(1,)]\u001b[0m\u001b[32;1m\u001b[1;3mAlice In Chains has 1 album in the database.\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"'Alice In Chains has 1 album in the database.'"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.run(\"How many albums does alis in pains have?\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As we can see, the agent used the `name_search` tool in order to check how to correctly query the database for this specific artist."
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -867,7 +1260,7 @@
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
@ -42,7 +42,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 1,
|
||||
"id": "f8cf5765",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -68,7 +68,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 2,
|
||||
"id": "fdce8923",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -83,7 +83,7 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"objc[31511]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x14f4e8208) and /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x14f5fc208). One of the two will be used. Which one is undefined.\n"
"objc[49534]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x131614208) and /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x131988208). One of the two will be used. Which one is undefined.\n"
]
}
],
@ -104,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"id": "b0c55e98",
"metadata": {},
"outputs": [
@ -114,7 +114,7 @@
"4"
]
},
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@ -204,7 +204,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 4,
"id": "cd7164e3",
"metadata": {},
"outputs": [],
@ -225,7 +225,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "56158f83-6490-49b8-9f04-2e2e6ec3524b",
"id": "af1176bb-d52a-4cf0-b983-8b7433d45b4f",
"metadata": {},
"outputs": [],
"source": [
@ -459,12 +459,11 @@
{
"cell_type": "code",
"execution_count": null,
"id": "4ae37573-63a7-4564-90e1-196a8ea9b526",
"id": "cc638992-0924-41c0-8dae-8cf683e72b16",
"metadata": {},
"outputs": [],
"source": [
"from langchain import hub\n",
"rag_prompt = hub.pull(\"rlm/rag-prompt-default\")"
"pip install langchainhub"
]
},
{
@ -512,6 +511,9 @@
}
],
"source": [
"# Prompt \n",
"from langchain import hub\n",
"rag_prompt = hub.pull(\"rlm/rag-prompt\")\n",
"from langchain.chains.question_answering import load_qa_chain\n",
"# Chain\n",
"chain = load_qa_chain(llm, chain_type=\"stuff\", prompt=rag_prompt)\n",
@ -529,7 +531,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 3,
"id": "78f6862d-b7a6-4e03-84e4-45667185bf9b",
"metadata": {},
"outputs": [
@ -539,12 +541,13 @@
"ChatPromptTemplate(input_variables=['question', 'context'], output_parser=None, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question', 'context'], output_parser=None, partial_variables={}, template=\"[INST]<<SYS>> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<</SYS>> \\nQuestion: {question} \\nContext: {context} \\nAnswer: [/INST]\", template_format='f-string', validate_template=True), additional_kwargs={})])"
]
},
"execution_count": 31,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Prompt\n",
"rag_prompt_llama = hub.pull(\"rlm/rag-prompt-llama\")\n",
"rag_prompt_llama"
]
@ -52,7 +52,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "046cefc0",
"metadata": {},
"outputs": [],
@ -269,28 +269,10 @@
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c690f01a",
"execution_count": null,
"id": "9cfe3270-4e89-4c60-a2e5-9026b021bf76",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:langchain.retrievers.multi_query:Generated queries: ['1. How can Task Decomposition be approached?', '2. What are the different methods for Task Decomposition?', '3. What are the various approaches to decomposing tasks?']\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import logging\n",
"\n",
@ -318,7 +300,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "99fa1aec",
"metadata": {},
"outputs": [
@ -326,10 +308,10 @@
"data": {
"text/plain": [
"{'query': 'What are the approaches to Task Decomposition?',\n",
" 'result': 'There are three approaches to task decomposition:\\n\\n1. Using Language Model with simple prompting: This approach involves using a Language Model (LLM) with simple prompts like \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to guide the task decomposition process.\\n\\n2. Using task-specific instructions: In this approach, task-specific instructions are provided to guide the task decomposition. For example, for the task of writing a novel, an instruction like \"Write a story outline\" can be given to help decompose the task into smaller subtasks.\\n\\n3. Human inputs: Task decomposition can also be done with the help of human inputs. This involves getting input and guidance from humans to break down a complex task into smaller, more manageable subtasks.'}"
" 'result': 'The approaches to task decomposition include:\\n\\n1. Simple prompting: This approach involves using simple prompts or questions to guide the agent in breaking down a task into smaller subgoals. For example, the agent can be prompted with \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to facilitate task decomposition.\\n\\n2. Task-specific instructions: In this approach, task-specific instructions are provided to the agent to guide the decomposition process. For example, if the task is to write a novel, the agent can be instructed to \"Write a story outline\" as a step in the task decomposition.\\n\\n3. Human inputs: This approach involves incorporating human inputs in the task decomposition process. Humans can provide guidance, feedback, and assistance to the agent in breaking down complex tasks into manageable subgoals.\\n\\nThese approaches aim to enable efficient handling of complex tasks by breaking them down into smaller, more manageable subgoals.'}"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -355,97 +337,7 @@
"#### Choosing LLMs\n",
|
||||
"- Browse the > 55 LLM and chat model integrations [here](https://integrations.langchain.com/).\n",
|
||||
"- See further documentation on LLMs and chat models [here](/docs/modules/model_io/models/).\n",
|
||||
"- Use local LLMS: The popularity of [PrivateGPT](https://github.com/imartinez/privateGPT) and [GPT4All](https://github.com/nomic-ai/gpt4all) underscore the importance of running LLMs locally.\n",
|
||||
"Using `GPT4All` is as simple as [downloading the binary]((/docs/integrations/llms/gpt4all)) and then:"
|
||||
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "02d6c9dc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found model file at /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"objc[61331]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x2e3384208) and /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x2e37b0208). One of the two will be used. Which one is undefined.\n",
"llama.cpp: using Metal\n",
"llama.cpp: loading model from /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n",
"llama_model_load_internal: format = ggjt v3 (latest)\n",
"llama_model_load_internal: n_vocab = 32001\n",
"llama_model_load_internal: n_ctx = 2048\n",
"llama_model_load_internal: n_embd = 5120\n",
"llama_model_load_internal: n_mult = 256\n",
"llama_model_load_internal: n_head = 40\n",
"llama_model_load_internal: n_layer = 40\n",
"llama_model_load_internal: n_rot = 128\n",
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
"llama_model_load_internal: n_ff = 13824\n",
"llama_model_load_internal: n_parts = 1\n",
"llama_model_load_internal: model size = 13B\n",
"llama_model_load_internal: ggml ctx size = 0.09 MB\n",
"llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n",
"llama_new_context_with_model: kv self size = 1600.00 MB\n",
"ggml_metal_init: allocating\n",
"ggml_metal_init: using MPS\n",
"ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/ggml-metal.metal'\n",
"ggml_metal_init: loaded kernel_add 0x2bbbbc2f0\n",
"ggml_metal_init: loaded kernel_mul 0x2bbbba840\n",
"ggml_metal_init: loaded kernel_mul_row 0x2bb917dd0\n",
"ggml_metal_init: loaded kernel_scale 0x2bb918150\n",
"ggml_metal_init: loaded kernel_silu 0x2bb9184d0\n",
"ggml_metal_init: loaded kernel_relu 0x2bb918850\n",
"ggml_metal_init: loaded kernel_gelu 0x2bbbc3f10\n",
"ggml_metal_init: loaded kernel_soft_max 0x2bbbc5840\n",
"ggml_metal_init: loaded kernel_diag_mask_inf 0x2bbbc4c70\n",
"ggml_metal_init: loaded kernel_get_rows_f16 0x2bbbc5fc0\n",
"ggml_metal_init: loaded kernel_get_rows_q4_0 0x2bbbc6720\n",
"ggml_metal_init: loaded kernel_get_rows_q4_1 0x2bb918c10\n",
"ggml_metal_init: loaded kernel_get_rows_q2_k 0x2bbbc51b0\n",
"ggml_metal_init: loaded kernel_get_rows_q3_k 0x2bbbc7630\n",
"ggml_metal_init: loaded kernel_get_rows_q4_k 0x2d4394e30\n",
"ggml_metal_init: loaded kernel_get_rows_q5_k 0x2bbbc7890\n",
"ggml_metal_init: loaded kernel_get_rows_q6_k 0x2d4395210\n",
"ggml_metal_init: loaded kernel_rms_norm 0x2bbbc8740\n",
"ggml_metal_init: loaded kernel_norm 0x2bbbc8b30\n",
"ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x2d4395470\n",
"ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x2d4395a70\n",
"ggml_metal_init: loaded kernel_mul_mat_q4_1_f32 0x1242b1a00\n",
"ggml_metal_init: loaded kernel_mul_mat_q2_k_f32 0x29f17d1c0\n",
"ggml_metal_init: loaded kernel_mul_mat_q3_k_f32 0x2d4396050\n",
"ggml_metal_init: loaded kernel_mul_mat_q4_k_f32 0x2bbbc98a0\n",
"ggml_metal_init: loaded kernel_mul_mat_q5_k_f32 0x2bbbca4a0\n",
"ggml_metal_init: loaded kernel_mul_mat_q6_k_f32 0x2bbbcae90\n",
"ggml_metal_init: loaded kernel_rope 0x2bbbca700\n",
"ggml_metal_init: loaded kernel_alibi_f32 0x2bbbcc6e0\n",
"ggml_metal_init: loaded kernel_cpy_f32_f16 0x2bbbccf90\n",
"ggml_metal_init: loaded kernel_cpy_f32_f32 0x2bbbcd900\n",
"ggml_metal_init: loaded kernel_cpy_f16_f16 0x2bbbce1f0\n",
"ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
"ggml_metal_init: hasUnifiedMemory = true\n",
"ggml_metal_init: maxTransferRate = built-in GPU\n",
"ggml_metal_add_buffer: allocated 'data ' buffer, size = 6984.06 MB, ( 6984.45 / 21845.34)\n",
"ggml_metal_add_buffer: allocated 'eval ' buffer, size = 1024.00 MB, ( 8008.45 / 21845.34)\n",
"ggml_metal_add_buffer: allocated 'kv ' buffer, size = 1602.00 MB, ( 9610.45 / 21845.34)\n",
"ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 512.00 MB, (10122.45 / 21845.34)\n",
"ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB, (10634.45 / 21845.34)\n"
]
}
],
"source": [
"from langchain.llms import GPT4All\n",
"from langchain.chains import RetrievalQA\n",
"\n",
"llm = GPT4All(model=\"/Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\",max_tokens=2048)\n",
"qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())"
"- See a guide on local LLMS [here](/docs/modules/use_cases/question_answering/how_to/local_retrieval_qa)."
]
},
{
@ -460,24 +352,17 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 10,
"id": "e4fee704",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"ggml_metal_free: deallocating\n"
]
},
{
"data": {
"text/plain": [
"'The approaches to task decomposition include using LLM with simple prompting, task-specific instructions, or human inputs. Thanks for asking!'"
"'The approaches to Task Decomposition are (1) using simple prompting by LLM, (2) using task-specific instructions, and (3) incorporating human inputs. Thanks for asking!'"
]
},
"execution_count": 13,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -507,8 +392,65 @@
},
{
"cell_type": "markdown",
"id": "ff40e8db",
"id": "c825e9bf-6a56-46e4-8bbb-05441f76cb96",
"metadata": {},
"source": [
"We can also store and fetch prompts from the LangChain prompt hub.\n",
"\n",
"This will work with your [LangSmith API key](https://docs.smith.langchain.com/).\n",
"\n",
"For example, see [here](https://smith.langchain.com/hub/rlm/rag-prompt) is a common prompt for RAG.\n",
"\n",
"We can load this."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a896060f-ebc4-4236-a4ad-32960601c6e8",
"metadata": {},
"outputs": [],
"source": [
"pip install langchainhub"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "aef8e734-ba54-48ae-b959-1898618f2d90",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The approaches to task decomposition include using LLM with simple prompting, task-specific instructions, and human inputs.'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# RAG prompt\n",
"from langchain import hub\n",
"QA_CHAIN_PROMPT_HUB = hub.pull(\"rlm/rag-prompt\")\n",
"\n",
"qa_chain = RetrievalQA.from_chain_type(\n",
"    llm,\n",
"    retriever=vectorstore.as_retriever(),\n",
"    chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT_HUB}\n",
")\n",
"result = qa_chain({\"query\": question})\n",
"result[\"result\"]"
]
},
{
"cell_type": "markdown",
"id": "ff40e8db",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"#### Return source documents\n",
"\n",
@ -1,12 +1,21 @@
{
"cells": [
{
"cell_type": "raw",
"id": "2aca8168-62ec-4bba-93f0-73da08cd1920",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Summarization\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "cf13f702",
"metadata": {},
"source": [
"# Summarization\n",
"\n",
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/summarization.ipynb)\n",
"\n",
"## Use case\n",
@ -548,7 +557,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,
@ -1,12 +1,21 @@
{
"cells": [
{
"cell_type": "raw",
"id": "cb6f552e-775f-4d84-bc7c-dca94c06a33c",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Tagging\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "a0507a4b",
"metadata": {},
"source": [
"# Tagging\n",
"\n",
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/tagging.ipynb)\n",
"\n",
"## Use case\n",
@ -408,7 +417,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,
@ -1,12 +1,21 @@
{
"cells": [
{
"cell_type": "raw",
"id": "e254cf03-49fc-4051-a4df-3a8e4e7d2688",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Web scraping\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "6605e7f7",
"metadata": {},
"source": [
"# Web Scraping\n",
"\n",
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/web_scraping.ipynb)\n",
"\n",
"## Use case\n",
@ -306,9 +315,7 @@
"cell_type": "code",
"execution_count": 7,
"id": "977560ba",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [
{
"name": "stdout",
@ -591,7 +598,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,
@ -5,10 +5,12 @@ pip install openai google-search-results
```
```python
from langchain import LLMMathChain, OpenAI, SerpAPIWrapper, SQLDatabase, SQLDatabaseChain
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.agents import initialize_agent, AgentType, Tool
from langchain.chains import LLMMathChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.utilities import SerpAPIWrapper, SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain
```
@ -1,4 +1,7 @@
"""Data anonymizer package"""
from langchain_experimental.data_anonymizer.presidio import PresidioAnonymizer
from langchain_experimental.data_anonymizer.presidio import (
    PresidioAnonymizer,
    PresidioReversibleAnonymizer,
)

__all__ = ["PresidioAnonymizer"]
__all__ = ["PresidioAnonymizer", "PresidioReversibleAnonymizer"]
@ -15,3 +15,17 @@ class AnonymizerBase(ABC):
    @abstractmethod
    def _anonymize(self, text: str) -> str:
        """Abstract method to anonymize text"""


class ReversibleAnonymizerBase(AnonymizerBase):
    """
    Base abstract class for reversible anonymizers.
    """

    def deanonymize(self, text: str) -> str:
        """Deanonymize text"""
        return self._deanonymize(text)

    @abstractmethod
    def _deanonymize(self, text: str) -> str:
        """Abstract method to deanonymize text"""
@ -0,0 +1,21 @@
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict

MappingDataType = Dict[str, Dict[str, str]]


@dataclass
class DeanonymizerMapping:
    mapping: MappingDataType = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(str))
    )

    @property
    def data(self) -> MappingDataType:
        """Return the deanonymizer mapping"""
        return {k: dict(v) for k, v in self.mapping.items()}

    def update(self, new_mapping: MappingDataType) -> None:
        for entity_type, values in new_mapping.items():
            self.mapping[entity_type].update(values)
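A minimal sketch (editor's illustration, not part of the diff) of how `DeanonymizerMapping` accumulates values: `update` merges per entity type, and `data` returns plain nested dicts. All names and values here are hypothetical:

```python
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
    DeanonymizerMapping,
)

mapping = DeanonymizerMapping()
mapping.update({"PERSON": {"Maria Lynch": "Slim Shady"}})
mapping.update({"PHONE_NUMBER": {"7344131647": "313-666-7440"}})
# A later update for the same entity type merges into the existing dict.
mapping.update({"PERSON": {"Victor Reyes": "John Doe"}})
print(mapping.data)
# {'PERSON': {'Maria Lynch': 'Slim Shady', 'Victor Reyes': 'John Doe'},
#  'PHONE_NUMBER': {'7344131647': '313-666-7440'}}
```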
@ -0,0 +1,17 @@
from langchain_experimental.data_anonymizer.presidio import MappingDataType


def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
    """
    Default matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones"""

    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            text = text.replace(anonymized, original)
    return text
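As a quick illustration of the strategy above (a sketch with made-up values): it performs plain exact-string replacement, so it only matches text that reproduces the anonymized entities verbatim:

```python
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
    default_matching_strategy,
)

# Hypothetical mapping: anonymized value -> original value.
mapping = {"PERSON": {"Maria Lynch": "Slim Shady"}}
print(default_matching_strategy("Maria Lynch called yesterday.", mapping))
# Slim Shady called yesterday.
```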
@ -1,8 +1,8 @@
import string
from typing import Callable, Dict
from typing import Callable, Dict, Optional


def get_pseudoanonymizer_mapping() -> Dict[str, Callable]:
def get_pseudoanonymizer_mapping(seed: Optional[int] = None) -> Dict[str, Callable]:
    try:
        from faker import Faker
    except ImportError as e:
@ -11,6 +11,7 @@ def get_pseudoanonymizer_mapping() -> Dict[str, Callable]:
        ) from e

    fake = Faker()
    fake.seed_instance(seed)

    # Listed entities supported by Microsoft Presidio (for now, global and US only)
    # Source: https://microsoft.github.io/presidio/supported_entities/
@ -1,24 +1,56 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Dict, List, Optional
import json
from collections import defaultdict
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

from langchain_experimental.data_anonymizer.base import AnonymizerBase
import yaml

from langchain_experimental.data_anonymizer.base import (
    AnonymizerBase,
    ReversibleAnonymizerBase,
)
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
    DeanonymizerMapping,
    MappingDataType,
)
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
    default_matching_strategy,
)
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
    get_pseudoanonymizer_mapping,
)

if TYPE_CHECKING:
    from presidio_analyzer import EntityRecognizer
try:
    from presidio_analyzer import AnalyzerEngine
except ImportError as e:
    raise ImportError(
        "Could not import presidio_analyzer, please install with "
        "`pip install presidio-analyzer`. You will also need to download a "
        "spaCy model to use the analyzer, e.g. "
        "`python -m spacy download en_core_web_lg`."
    ) from e
try:
    from presidio_anonymizer import AnonymizerEngine
    from presidio_anonymizer.entities import OperatorConfig
except ImportError as e:
    raise ImportError(
        "Could not import presidio_anonymizer, please install with "
        "`pip install presidio-anonymizer`."
    ) from e

if TYPE_CHECKING:
    from presidio_analyzer import EntityRecognizer, RecognizerResult
    from presidio_anonymizer.entities import EngineResult


class PresidioAnonymizer(AnonymizerBase):
    """Anonymizer using Microsoft Presidio."""

class PresidioAnonymizerBase(AnonymizerBase):
    def __init__(
        self,
        analyzed_fields: Optional[List[str]] = None,
        operators: Optional[Dict[str, OperatorConfig]] = None,
        faker_seed: Optional[int] = None,
    ):
        """
        Args:
@ -28,25 +60,10 @@ class PresidioAnonymizer(AnonymizerBase):
                Operators allow for custom anonymization of detected PII.
                Learn more:
                https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
            faker_seed: Seed used to initialize faker.
                Defaults to None, in which case faker will be seeded randomly
                and provide random values.
        """
        try:
            from presidio_analyzer import AnalyzerEngine
        except ImportError as e:
            raise ImportError(
                "Could not import presidio_analyzer, please install with "
                "`pip install presidio-analyzer`. You will also need to download a "
                "spaCy model to use the analyzer, e.g. "
                "`python -m spacy download en_core_web_lg`."
            ) from e
        try:
            from presidio_anonymizer import AnonymizerEngine
            from presidio_anonymizer.entities import OperatorConfig
        except ImportError as e:
            raise ImportError(
                "Could not import presidio_anonymizer, please install with "
                "`pip install presidio-anonymizer`."
            ) from e

        self.analyzed_fields = (
            analyzed_fields
            if analyzed_fields is not None
@ -59,13 +76,41 @@ class PresidioAnonymizer(AnonymizerBase):
                field: OperatorConfig(
                    operator_name="custom", params={"lambda": faker_function}
                )
                for field, faker_function in get_pseudoanonymizer_mapping().items()
                for field, faker_function in get_pseudoanonymizer_mapping(
                    faker_seed
                ).items()
            }
        )
        self._analyzer = AnalyzerEngine()
        self._anonymizer = AnonymizerEngine()

    def add_recognizer(self, recognizer: EntityRecognizer) -> None:
        """Add a recognizer to the analyzer

        Args:
            recognizer: Recognizer to add to the analyzer.
        """
        self._analyzer.registry.add_recognizer(recognizer)
        self.analyzed_fields.extend(recognizer.supported_entities)

    def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:
        """Add operators to the anonymizer

        Args:
            operators: Operators to add to the anonymizer.
        """
        self.operators.update(operators)


class PresidioAnonymizer(PresidioAnonymizerBase):
    def _anonymize(self, text: str) -> str:
        """Anonymize text.
        Each PII entity is replaced with a fake value.
        Fake values will differ on each call, as they are generated randomly.

        Args:
            text: text to anonymize
        """
        results = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
@ -78,11 +123,185 @@ class PresidioAnonymizer(AnonymizerBase):
            operators=self.operators,
        ).text

    def add_recognizer(self, recognizer: EntityRecognizer) -> None:
        """Add a recognizer to the analyzer"""
        self._analyzer.registry.add_recognizer(recognizer)
        self.analyzed_fields.extend(recognizer.supported_entities)

    def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:
        """Add operators to the anonymizer"""
        self.operators.update(operators)
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
    def __init__(
        self,
        analyzed_fields: Optional[List[str]] = None,
        operators: Optional[Dict[str, OperatorConfig]] = None,
        faker_seed: Optional[int] = None,
    ):
        super().__init__(analyzed_fields, operators, faker_seed)
        self._deanonymizer_mapping = DeanonymizerMapping()

    @property
    def deanonymizer_mapping(self) -> MappingDataType:
        """Return the deanonymizer mapping"""
        return self._deanonymizer_mapping.data

    def _update_deanonymizer_mapping(
        self,
        original_text: str,
        analyzer_results: List[RecognizerResult],
        anonymizer_results: EngineResult,
    ) -> None:
        """Creates or updates the mapping used to de-anonymize text.

        This method exploits the results returned by the
        analysis and anonymization processes.

        It constructs a mapping from each anonymized entity
        back to its original text value.
        The mapping is stored in the "deanonymizer_mapping" property.

        Example of "deanonymizer_mapping":
        {
            "PERSON": {
                "<anonymized>": "<original>",
                "John Doe": "Slim Shady"
            },
            "PHONE_NUMBER": {
                "111-111-1111": "555-555-5555"
            }
            ...
        }
        """

        # We are able to zip and loop through both lists because we expect
        # them to return corresponding entities for each identified piece
        # of analyzable data from our input.

        # We sort them by their 'start' attribute because it allows us to
        # match corresponding entities by their position in the input text.
        analyzer_results = sorted(analyzer_results, key=lambda d: d.start)
        anonymizer_results.items = sorted(
            anonymizer_results.items, key=lambda d: d.start
        )

        new_deanonymizer_mapping: MappingDataType = defaultdict(dict)

        for analyzed_entity, anonymized_entity in zip(
            analyzer_results, anonymizer_results.items
        ):
            original_value = original_text[analyzed_entity.start : analyzed_entity.end]
            new_deanonymizer_mapping[anonymized_entity.entity_type][
                anonymized_entity.text
            ] = original_value

        self._deanonymizer_mapping.update(new_deanonymizer_mapping)

    def _anonymize(self, text: str) -> str:
        """Anonymize text.
        Each PII entity is replaced with a fake value.
        Fake values will differ on each call, as they are generated randomly.
        At the same time, we will create a mapping from each anonymized entity
        back to its original text value.

        Args:
            text: text to anonymize
        """
        analyzer_results = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
            language="en",
        )

        filtered_analyzer_results = (
            self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
                analyzer_results
            )
        )

        anonymizer_results = self._anonymizer.anonymize(
            text,
            analyzer_results=analyzer_results,
            operators=self.operators,
        )

        self._update_deanonymizer_mapping(
            text, filtered_analyzer_results, anonymizer_results
        )

        return anonymizer_results.text

    def _deanonymize(
        self,
        text_to_deanonymize: str,
        deanonymizer_matching_strategy: Callable[
            [str, MappingDataType], str
        ] = default_matching_strategy,
    ) -> str:
        """Deanonymize text.
        Each anonymized entity is replaced with its original value.
        This method exploits the mapping created during the anonymization process.

        Args:
            text_to_deanonymize: text to deanonymize
            deanonymizer_matching_strategy: function to use to match
                anonymized entities with their original values and replace them.
        """
        if not self._deanonymizer_mapping:
            raise ValueError(
                "Deanonymizer mapping is empty.",
                "Please call anonymize() and anonymize some text first.",
            )

        text_to_deanonymize = deanonymizer_matching_strategy(
            text_to_deanonymize, self.deanonymizer_mapping
        )

        return text_to_deanonymize

    def save_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
        """Save the deanonymizer mapping to a JSON or YAML file.

        Args:
            file_path: Path to file to save the mapping to.

        Example:
        .. code-block:: python

            anonymizer.save_deanonymizer_mapping(file_path="path/mapping.json")
        """

        save_path = Path(file_path)

        if save_path.suffix not in [".json", ".yaml"]:
            raise ValueError(f"{save_path} must have an extension of .json or .yaml")

        # Make sure parent directories exist
        save_path.parent.mkdir(parents=True, exist_ok=True)

        if save_path.suffix == ".json":
            with open(save_path, "w") as f:
                json.dump(self.deanonymizer_mapping, f, indent=2)
        elif save_path.suffix == ".yaml":
            with open(save_path, "w") as f:
                yaml.dump(self.deanonymizer_mapping, f, default_flow_style=False)

    def load_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
        """Load the deanonymizer mapping from a JSON or YAML file.

        Args:
            file_path: Path to file to load the mapping from.

        Example:
        .. code-block:: python

            anonymizer.load_deanonymizer_mapping(file_path="path/mapping.json")
        """

        load_path = Path(file_path)

        if load_path.suffix not in [".json", ".yaml"]:
            raise ValueError(f"{load_path} must have an extension of .json or .yaml")

        if load_path.suffix == ".json":
            with open(load_path, "r") as f:
                loaded_mapping = json.load(f)
        elif load_path.suffix == ".yaml":
            with open(load_path, "r") as f:
                loaded_mapping = yaml.load(f, Loader=yaml.FullLoader)

        self._deanonymizer_mapping.update(loaded_mapping)
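A hedged usage sketch for the new reversible anonymizer (it assumes the public `anonymize()` wrapper on `AnonymizerBase`, which is not shown in this hunk, plus an installed spaCy model for presidio-analyzer; the input text is hypothetical):

```python
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(faker_seed=42)

# PII is replaced with fake values; the fake -> original pairs are recorded.
fake_text = anonymizer.anonymize("My name is Slim Shady, call me at 313-666-7440.")
print(anonymizer.deanonymizer_mapping)  # {"PERSON": {"<fake>": "Slim Shady"}, ...}

# The mapping round-trips the text and can be persisted for later sessions.
print(anonymizer.deanonymize(fake_text))
anonymizer.save_deanonymizer_mapping("deanonymizer_mapping.json")
```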
@ -0,0 +1,5 @@
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer

__all__ = [
    "DiffbotGraphTransformer",
]
@ -0,0 +1,316 @@
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import requests
from langchain.graphs.graph_document import GraphDocument, Node, Relationship
from langchain.schema import Document
from langchain.utils import get_from_env


def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)


class NodesList:
    """
    Manages a list of nodes with associated properties.

    Attributes:
        nodes (Dict[Tuple, Any]): Stores nodes as keys and their properties as values.
            Each key is a tuple where the first element is the
            node ID and the second is the node type.
    """

    def __init__(self) -> None:
        self.nodes: Dict[Tuple[Union[str, int], str], Any] = dict()

    def add_node_property(
        self, node: Tuple[Union[str, int], str], properties: Dict[str, Any]
    ) -> None:
        """
        Adds or updates node properties.

        If the node does not exist in the list, it's added along with its properties.
        If the node already exists, its properties are updated with the new values.

        Args:
            node (Tuple): A tuple containing the node ID and node type.
            properties (Dict): A dictionary of properties to add or update for the node.
        """
        if node not in self.nodes:
            self.nodes[node] = properties
        else:
            self.nodes[node].update(properties)

    def return_node_list(self) -> List[Node]:
        """
        Returns the nodes as a list of Node objects.

        Each Node object will have its ID, type, and properties populated.

        Returns:
            List[Node]: A list of Node objects.
        """
        nodes = [
            Node(id=key[0], type=key[1], properties=self.nodes[key])
            for key in self.nodes
        ]
        return nodes


# Properties that should be treated as node properties instead of relationships
FACT_TO_PROPERTY_TYPE = [
    "Date",
    "Number",
    "Job title",
    "Cause of death",
    "Organization type",
    "Academic title",
]


schema_mapping = [
    ("HEADQUARTERS", "ORGANIZATION_LOCATIONS"),
    ("RESIDENCE", "PERSON_LOCATION"),
    ("ALL_PERSON_LOCATIONS", "PERSON_LOCATION"),
    ("CHILD", "HAS_CHILD"),
    ("PARENT", "HAS_PARENT"),
    ("CUSTOMERS", "HAS_CUSTOMER"),
    ("SKILLED_AT", "INTERESTED_IN"),
]


class SimplifiedSchema:
    """
    Provides functionality for working with a simplified schema mapping.

    Attributes:
        schema (Dict): A dictionary containing the mapping to simplified schema types.
    """

    def __init__(self) -> None:
        """Initializes the schema dictionary based on the predefined list."""
        self.schema = dict()
        for row in schema_mapping:
            self.schema[row[0]] = row[1]

    def get_type(self, type: str) -> str:
        """
        Retrieves the simplified schema type for a given original type.

        Args:
            type (str): The original schema type to find the simplified type for.

        Returns:
            str: The simplified schema type if it exists;
                otherwise, returns the original type.
        """
        try:
            return self.schema[type]
        except KeyError:
            return type


class DiffbotGraphTransformer:
    """Transforms documents into graph documents using Diffbot's NLP API.

    A graph document transformation system takes a sequence of Documents and returns a
    sequence of Graph Documents.

    Example:
        .. code-block:: python

            class DiffbotGraphTransformer(BaseGraphDocumentTransformer):

                def transform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[GraphDocument]:
                    results = []

                    for document in documents:
                        raw_results = self.nlp_request(document.page_content)
                        graph_document = self.process_response(raw_results, document)
                        results.append(graph_document)
                    return results

                async def atransform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    raise NotImplementedError
    """

    def __init__(
        self,
        diffbot_api_key: Optional[str] = None,
        fact_confidence_threshold: float = 0.7,
        include_qualifiers: bool = True,
        include_evidence: bool = True,
        simplified_schema: bool = True,
    ) -> None:
        """
        Initialize the graph transformer with various options.

        Args:
            diffbot_api_key (str):
                The API key for Diffbot's NLP services.

            fact_confidence_threshold (float):
                Minimum confidence level for facts to be included.
            include_qualifiers (bool):
                Whether to include qualifiers in the relationships.
            include_evidence (bool):
                Whether to include evidence for the relationships.
            simplified_schema (bool):
                Whether to use a simplified schema for relationships.
        """
        self.diffbot_api_key = diffbot_api_key or get_from_env(
            "diffbot_api_key", "DIFFBOT_API_KEY"
        )
        self.fact_threshold_confidence = fact_confidence_threshold
        self.include_qualifiers = include_qualifiers
        self.include_evidence = include_evidence
        self.simplified_schema = None
        if simplified_schema:
            self.simplified_schema = SimplifiedSchema()

    def nlp_request(self, text: str) -> Dict[str, Any]:
        """
        Make an API request to the Diffbot NLP endpoint.

        Args:
            text (str): The text to be processed.

        Returns:
            Dict[str, Any]: The JSON response from the API.
        """

        # Relationship extraction only works for English
        payload = {
            "content": text,
            "lang": "en",
        }

        FIELDS = "facts"
        HOST = "nl.diffbot.com"
        url = (
            f"https://{HOST}/v1/?fields={FIELDS}&"
            f"token={self.diffbot_api_key}&language=en"
        )
        result = requests.post(url, data=payload)
        return result.json()

    def process_response(
        self, payload: Dict[str, Any], document: Document
    ) -> GraphDocument:
        """
        Transform the Diffbot NLP response into a GraphDocument.

        Args:
            payload (Dict[str, Any]): The JSON response from Diffbot's NLP API.
            document (Document): The original document.

        Returns:
            GraphDocument: The transformed document as a graph.
        """

        # Return empty result if there are no facts
        if "facts" not in payload or not payload["facts"]:
            return GraphDocument(nodes=[], relationships=[], source=document)

        # Nodes are a custom class because we need to deduplicate
        nodes_list = NodesList()
        # Relationships are a plain list because we don't deduplicate them
        relationships = list()
        for record in payload["facts"]:
            # Skip if the fact is below the threshold confidence
            if record["confidence"] < self.fact_threshold_confidence:
                continue

            # TODO: It should probably be treated as a node property
            if not record["value"]["allTypes"]:
                continue

            # Define source node
            source_id = (
                record["entity"]["allUris"][0]
                if record["entity"]["allUris"]
                else record["entity"]["name"]
            )
            source_label = record["entity"]["allTypes"][0]["name"].capitalize()
            source_name = record["entity"]["name"]
            source_node = Node(id=source_id, type=source_label)
            nodes_list.add_node_property(
                (source_id, source_label), {"name": source_name}
            )

            # Define target node
            target_id = (
                record["value"]["allUris"][0]
                if record["value"]["allUris"]
                else record["value"]["name"]
            )
            target_label = record["value"]["allTypes"][0]["name"].capitalize()
            target_name = record["value"]["name"]
            # Some facts are better suited as node properties
            if target_label in FACT_TO_PROPERTY_TYPE:
                nodes_list.add_node_property(
                    (source_id, source_label),
                    {format_property_key(record["property"]["name"]): target_name},
                )
            else: # Define relationship
                # Define target node object
                target_node = Node(id=target_id, type=target_label)
                nodes_list.add_node_property(
                    (target_id, target_label), {"name": target_name}
                )
                # Define relationship type
                rel_type = record["property"]["name"].replace(" ", "_").upper()
                if self.simplified_schema:
                    rel_type = self.simplified_schema.get_type(rel_type)

                # Relationship qualifiers/properties
                rel_properties = dict()
                relationship_evidence = [el["passage"] for el in record["evidence"]][0]
                if self.include_evidence:
                    rel_properties.update({"evidence": relationship_evidence})
                if self.include_qualifiers and record.get("qualifiers"):
                    for property in record["qualifiers"]:
                        prop_key = format_property_key(property["property"]["name"])
                        rel_properties[prop_key] = property["value"]["name"]

                relationship = Relationship(
                    source=source_node,
                    target=target_node,
                    type=rel_type,
                    properties=rel_properties,
                )
                relationships.append(relationship)

        return GraphDocument(
            nodes=nodes_list.return_node_list(),
            relationships=relationships,
            source=document,
        )

    def convert_to_graph_documents(
        self, documents: Sequence[Document]
    ) -> List[GraphDocument]:
        """Convert a sequence of documents into graph documents.

        Args:
            documents (Sequence[Document]): The original documents.

        Returns:
            Sequence[GraphDocument]: The transformed documents as graphs.
        """
        results = []
        for document in documents:
            raw_results = self.nlp_request(document.page_content)
            graph_document = self.process_response(raw_results, document)
            results.append(graph_document)
        return results
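A short usage sketch for the transformer (editor's illustration; it assumes `DIFFBOT_API_KEY` is set in the environment, per `get_from_env` in `__init__`, and the input sentence is hypothetical):

```python
from langchain.schema import Document
from langchain_experimental.graph_transformers import DiffbotGraphTransformer

transformer = DiffbotGraphTransformer()
docs = [Document(page_content="Marie Curie was a physicist born in Warsaw.")]
# Each document becomes a GraphDocument with deduplicated nodes
# and confidence-filtered relationships.
graph_docs = transformer.convert_to_graph_documents(docs)
print(graph_docs[0].nodes)
print(graph_docs[0].relationships)
```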
@ -0,0 +1,38 @@
"""Vector SQL Database Chain Retriever"""
from typing import Any, Dict, List

from langchain.callbacks.manager import (
    AsyncCallbackManagerForRetrieverRun,
    CallbackManagerForRetrieverRun,
)
from langchain.schema import BaseRetriever, Document

from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain


class VectorSQLDatabaseChainRetriever(BaseRetriever):
    """Retriever that uses SQLDatabase as Retriever"""

    sql_db_chain: VectorSQLDatabaseChain
    """SQL Database Chain"""
    page_content_key: str = "content"
    """column name for page content of documents"""

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        **kwargs: Any,
    ) -> List[Document]:
        ret: List[Dict[str, Any]] = self.sql_db_chain(
            query, callbacks=run_manager.get_child(), **kwargs
        )["result"]
        return [
            Document(page_content=r[self.page_content_key], metadata=r) for r in ret
        ]

    async def _aget_relevant_documents(
        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
    ) -> List[Document]:
        raise NotImplementedError
85
libs/experimental/langchain_experimental/sql/prompt.py
Normal file
@ -0,0 +1,85 @@
# flake8: noqa
from langchain.prompts.prompt import PromptTemplate


PROMPT_SUFFIX = """Only use the following tables:
{table_info}

Question: {input}"""
_VECTOR_SQL_DEFAULT_TEMPLATE = """You are a {dialect} expert. Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer to the input question.
|
||||
{dialect} queries has a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance.
|
||||
When the query is asking for {top_k} closest row, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows.
|
||||
|
||||
*NOTICE*: `DISTANCE(column, array)` only accept an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user defined function called `NeuralArray(entity)` to retrieve the entity's array.
|
||||
|
||||
Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per {dialect}. You should only order according to the distance function.
|
||||
Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
|
||||
Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
|
||||
Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema.
|
||||
|
||||
Use the following format:

Question: "Question here"
SQLQuery: "SQL Query to run"
SQLResult: "Result of the SQLQuery"
Answer: "Final answer here"
"""

VECTOR_SQL_PROMPT = PromptTemplate(
    input_variables=["input", "table_info", "dialect", "top_k"],
    template=_VECTOR_SQL_DEFAULT_TEMPLATE + PROMPT_SUFFIX,
)


_myscale_prompt = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question.
|
||||
MyScale queries has a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance.
|
||||
When the query is asking for {top_k} closest row, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows.
|
||||
|
||||
*NOTICE*: `DISTANCE(column, array)` only accept an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user defined function called `NeuralArray(entity)` to retrieve the entity's array.
|
||||
|
||||
Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per MyScale. You should only order according to the distance function.
|
||||
Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
|
||||
Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
|
||||
Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema.
|
||||
|
||||
Use the following format:
|
||||
|
||||
======== table info ========
|
||||
<some table infos>
|
||||
|
||||
Question: "Question here"
|
||||
SQLQuery: "SQL Query to run"
|
||||
|
||||
|
||||
Here are some examples:
|
||||
|
||||
======== table info ========
|
||||
CREATE TABLE "ChatPaper" (
|
||||
abstract String,
|
||||
id String,
|
||||
vector Array(Float32),
|
||||
) ENGINE = ReplicatedReplacingMergeTree()
|
||||
ORDER BY id
|
||||
PRIMARY KEY id
|
||||
|
||||
Question: What is Feartue Pyramid Network?
|
||||
SQLQuery: SELECT ChatPaper.title, ChatPaper.id, ChatPaper.authors FROM ChatPaper ORDER BY DISTANCE(vector, NeuralArray(PaperRank contribution)) LIMIT {top_k}
|
||||
|
||||
|
||||
Let's begin:
|
||||
======== table info ========
|
||||
{table_info}
|
||||
|
||||
Question: {input}
|
||||
SQLQuery: """
|
||||
|
||||
MYSCALE_PROMPT = PromptTemplate(
|
||||
input_variables=["input", "table_info", "top_k"],
|
||||
template=_myscale_prompt + PROMPT_SUFFIX,
|
||||
)
|
||||
|
||||
|
||||
VECTOR_SQL_PROMPTS = {
|
||||
"myscale": MYSCALE_PROMPT,
|
||||
}
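For orientation, a one-line sketch of how this registry is meant to be consulted (the fallback to the generic vector prompt is an assumption, mirroring how `SQL_PROMPTS` is used in `from_llm` below):

# Hypothetical dialect lookup; "myscale" mirrors SQLDatabase.dialect for MyScale.
prompt = VECTOR_SQL_PROMPTS.get("myscale", VECTOR_SQL_PROMPT)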
237
libs/experimental/langchain_experimental/sql/vector_sql.py
Normal file
@ -0,0 +1,237 @@
"""Vector SQL Database Chain Retriever"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from langchain.callbacks.manager import CallbackManagerForChainRun
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.chains.sql_database.prompt import PROMPT, SQL_PROMPTS
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.prompts.prompt import PromptTemplate
|
||||
from langchain.schema import BaseOutputParser, BasePromptTemplate
|
||||
from langchain.schema.language_model import BaseLanguageModel
|
||||
from langchain.tools.sql_database.prompt import QUERY_CHECKER
|
||||
from langchain.utilities.sql_database import SQLDatabase
|
||||
|
||||
from langchain_experimental.sql.base import INTERMEDIATE_STEPS_KEY, SQLDatabaseChain
|
||||
|
||||
|
||||
class VectorSQLOutputParser(BaseOutputParser[str]):
    """Output parser for Vector SQL.

    1. Finds `NeuralArray()` and replaces it with the embedding of the entity.
    2. Finds `DISTANCE()` and replaces it with the distance function name of the backend SQL dialect.
    """

    model: Embeddings
    """Embedding model used to compute the embedding for an entity"""
    distance_func_name: str = "distance"
    """Distance function name for Vector SQL"""

    class Config:
        arbitrary_types_allowed = True

    @property
    def _type(self) -> str:
        return "vector_sql_parser"

    @classmethod
    def from_embeddings(
        cls, model: Embeddings, distance_func_name: str = "distance", **kwargs: Any
    ) -> BaseOutputParser:
        return cls(model=model, distance_func_name=distance_func_name, **kwargs)

    def parse(self, text: str) -> str:
        text = text.strip()
        start = text.find("NeuralArray(")
        _sql_str_compl = text
        if start > 0:
            _matched = text[text.find("NeuralArray(") + len("NeuralArray(") :]
            end = _matched.find(")") + start + len("NeuralArray(") + 1
            entity = _matched[: _matched.find(")")]
            vecs = self.model.embed_query(entity)
            vecs_str = "[" + ",".join(map(str, vecs)) + "]"
            _sql_str_compl = text.replace("DISTANCE", self.distance_func_name).replace(
                text[start:end], vecs_str
            )
        if _sql_str_compl[-1] == ";":
            _sql_str_compl = _sql_str_compl[:-1]
        return _sql_str_compl

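To make the substitution concrete, a minimal sketch (FakeEmbeddings and the shown output are illustrative; real vectors come from the configured model):

# Illustrative only: any Embeddings implementation works here.
from langchain.embeddings.fake import FakeEmbeddings

parser = VectorSQLOutputParser.from_embeddings(FakeEmbeddings(size=3))
sql = "SELECT id FROM t ORDER BY DISTANCE(vector, NeuralArray(cats)) LIMIT 5;"
parser.parse(sql)
# -> SELECT id FROM t ORDER BY distance(vector, [..3 floats..]) LIMIT 5
#    The NeuralArray(...) call is replaced by the embedding, DISTANCE by the
#    backend function name, and the trailing ";" is stripped.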
class VectorSQLRetrieveAllOutputParser(VectorSQLOutputParser):
    """Based on VectorSQLOutputParser.

    It also modifies the generated SQL so that all columns are retrieved.
    """

    @property
    def _type(self) -> str:
        return "vector_sql_retrieve_all_parser"

    def parse(self, text: str) -> str:
        text = text.strip()
        start = text.upper().find("SELECT")
        if start >= 0:
            end = text.upper().find("FROM")
            # Replace the column list between SELECT and FROM with "*".
            text = text.replace(text[start + len("SELECT") + 1 : end - 1], "*")
        return super().parse(text)

def _try_eval(x: Any) -> Any:
    # Best-effort evaluation of a result value; falls back to the raw value.
    try:
        return eval(x)
    except Exception:
        return x


def get_result_from_sqldb(
    db: SQLDatabase, cmd: str
) -> Union[str, List[Dict[str, Any]], Dict[str, Any]]:
    result = db._execute(cmd, fetch="all")  # type: ignore
    if isinstance(result, list):
        return [{k: _try_eval(v) for k, v in dict(d._asdict()).items()} for d in result]
    else:
        return {
            k: _try_eval(v) for k, v in dict(result._asdict()).items()  # type: ignore
        }

class VectorSQLDatabaseChain(SQLDatabaseChain):
    """Chain for interacting with a Vector SQL database.

    Example:
        .. code-block:: python

            from langchain import OpenAI, SQLDatabase
            from langchain.embeddings import OpenAIEmbeddings
            from langchain_experimental.sql.vector_sql import (
                VectorSQLDatabaseChain,
                VectorSQLOutputParser,
            )

            db = SQLDatabase(...)
            output_parser = VectorSQLOutputParser.from_embeddings(OpenAIEmbeddings())
            db_chain = VectorSQLDatabaseChain.from_llm(
                OpenAI(), db, sql_cmd_parser=output_parser
            )

    *Security note*: Make sure that the database connection uses credentials
    that are narrowly-scoped to only include the permissions this chain needs.
    Failure to do so may result in data corruption or loss, since this chain may
    attempt commands like `DROP TABLE` or `INSERT` if appropriately prompted.
    The best way to guard against such negative outcomes is to (as appropriate)
    limit the permissions granted to the credentials used with this chain.
    This issue shows an example negative outcome if these steps are not taken:
    https://github.com/langchain-ai/langchain/issues/5923
    """

    sql_cmd_parser: VectorSQLOutputParser
    """Parser for Vector SQL"""
    native_format: bool = False
    """If return_direct, controls whether to return results in Python-native format"""

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        input_text = f"{inputs[self.input_key]}\nSQLQuery:"
        _run_manager.on_text(input_text, verbose=self.verbose)
        # If not present, then defaults to None which is all tables.
        table_names_to_use = inputs.get("table_names_to_use")
        table_info = self.database.get_table_info(table_names=table_names_to_use)
        llm_inputs = {
            "input": input_text,
            "top_k": str(self.top_k),
            "dialect": self.database.dialect,
            "table_info": table_info,
            "stop": ["\nSQLResult:"],
        }
        intermediate_steps: List = []
        try:
            intermediate_steps.append(llm_inputs)  # input: sql generation
            llm_out = self.llm_chain.predict(
                callbacks=_run_manager.get_child(),
                **llm_inputs,
            )
            sql_cmd = self.sql_cmd_parser.parse(llm_out)
            if self.return_sql:
                return {self.output_key: sql_cmd}
            if not self.use_query_checker:
                _run_manager.on_text(llm_out, color="green", verbose=self.verbose)
                intermediate_steps.append(
                    llm_out
                )  # output: sql generation (no checker)
                intermediate_steps.append({"sql_cmd": llm_out})  # input: sql exec
                result = get_result_from_sqldb(self.database, sql_cmd)
                intermediate_steps.append(str(result))  # output: sql exec
            else:
                query_checker_prompt = self.query_checker_prompt or PromptTemplate(
                    template=QUERY_CHECKER, input_variables=["query", "dialect"]
                )
                query_checker_chain = LLMChain(
                    llm=self.llm_chain.llm,
                    prompt=query_checker_prompt,
                    output_parser=self.llm_chain.output_parser,
                )
                query_checker_inputs = {
                    "query": llm_out,
                    "dialect": self.database.dialect,
                }
                checked_llm_out = query_checker_chain.predict(
                    callbacks=_run_manager.get_child(), **query_checker_inputs
                )
                checked_sql_command = self.sql_cmd_parser.parse(checked_llm_out)
                intermediate_steps.append(
                    checked_llm_out
                )  # output: sql generation (checker)
                _run_manager.on_text(
                    checked_llm_out, color="green", verbose=self.verbose
                )
                intermediate_steps.append(
                    {"sql_cmd": checked_llm_out}
                )  # input: sql exec
                result = get_result_from_sqldb(self.database, checked_sql_command)
                intermediate_steps.append(str(result))  # output: sql exec
                llm_out = checked_llm_out
                sql_cmd = checked_sql_command

            _run_manager.on_text("\nSQLResult: ", verbose=self.verbose)
            _run_manager.on_text(str(result), color="yellow", verbose=self.verbose)
            # If return direct, we just set the final result equal to
            # the result of the sql query result, otherwise try to get a human readable
            # final answer
            if self.return_direct:
                final_result = result
            else:
                _run_manager.on_text("\nAnswer:", verbose=self.verbose)
                input_text += f"{llm_out}\nSQLResult: {result}\nAnswer:"
                llm_inputs["input"] = input_text
                intermediate_steps.append(llm_inputs)  # input: final answer
                final_result = self.llm_chain.predict(
                    callbacks=_run_manager.get_child(),
                    **llm_inputs,
                ).strip()
                intermediate_steps.append(final_result)  # output: final answer
                _run_manager.on_text(final_result, color="green", verbose=self.verbose)
            chain_result: Dict[str, Any] = {self.output_key: final_result}
            if self.return_intermediate_steps:
                chain_result[INTERMEDIATE_STEPS_KEY] = intermediate_steps
            return chain_result
        except Exception as exc:
            # Append intermediate steps to exception, to aid in logging and later
            # improvement of few shot prompt seeds
            exc.intermediate_steps = intermediate_steps  # type: ignore
            raise exc

    @property
    def _chain_type(self) -> str:
        return "vector_sql_database_chain"

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        db: SQLDatabase,
        prompt: Optional[BasePromptTemplate] = None,
        sql_cmd_parser: Optional[VectorSQLOutputParser] = None,
        **kwargs: Any,
    ) -> VectorSQLDatabaseChain:
        assert sql_cmd_parser, "`sql_cmd_parser` must be set in VectorSQLDatabaseChain."
        prompt = prompt or SQL_PROMPTS.get(db.dialect, PROMPT)
        llm_chain = LLMChain(llm=llm, prompt=prompt)
        return cls(
            llm_chain=llm_chain, database=db, sql_cmd_parser=sql_cmd_parser, **kwargs
        )
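A hedged end-to-end sketch tying the pieces together (the connection URI, credentials, and table contents are placeholder assumptions; a reachable MyScale/ClickHouse instance and OpenAI credentials are required):

from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.utilities.sql_database import SQLDatabase

db = SQLDatabase.from_uri("clickhouse://user:pass@host:8443/default")  # placeholder
chain = VectorSQLDatabaseChain.from_llm(
    OpenAI(temperature=0),
    db,
    prompt=MYSCALE_PROMPT,
    sql_cmd_parser=VectorSQLOutputParser.from_embeddings(OpenAIEmbeddings()),
)
chain.run("What is Feature Pyramid Network?")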
28
libs/experimental/poetry.lock
generated
@ -1245,6 +1245,7 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
|
||||
files = [
|
||||
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
|
||||
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -3752,6 +3753,31 @@ files = [
|
||||
{file = "types_PyYAML-6.0.12.11-py3-none-any.whl", hash = "sha256:a461508f3096d1d5810ec5ab95d7eeecb651f3a15b71959999988942063bf01d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "types-requests"
|
||||
version = "2.31.0.2"
|
||||
description = "Typing stubs for requests"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "types-requests-2.31.0.2.tar.gz", hash = "sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40"},
|
||||
{file = "types_requests-2.31.0.2-py3-none-any.whl", hash = "sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
types-urllib3 = "*"
|
||||
|
||||
[[package]]
|
||||
name = "types-urllib3"
|
||||
version = "1.26.25.14"
|
||||
description = "Typing stubs for urllib3"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"},
|
||||
{file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.7.1"
|
||||
@ -3995,4 +4021,4 @@ extended-testing = ["faker", "presidio-analyzer", "presidio-anonymizer"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "66ac482bd05eb74414210ac28fc1e8dae1a9928a4a1314e1326fada3551aa8ad"
|
||||
content-hash = "443e88f690572715cf58671e4480a006574c7141a1258dff0a0818b954184901"
|
||||
|
@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-experimental"
version = "0.0.13"
version = "0.0.15"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
@ -23,6 +23,7 @@ black = "^23.1.0"
[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
types-pyyaml = "^6.0.12.2"
types-requests = "^2.28.11.5"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
@ -0,0 +1,154 @@
import os
from typing import Iterator, List

import pytest


@pytest.fixture(scope="module", autouse=True)
def check_spacy_model() -> Iterator[None]:
    import spacy

    if not spacy.util.is_package("en_core_web_lg"):
        pytest.skip(reason="Spacy model 'en_core_web_lg' not installed")
    yield


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
@pytest.mark.parametrize(
    "analyzed_fields,should_contain",
    [(["PERSON"], False), (["PHONE_NUMBER"], True), (None, False)],
)
def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None:
    """Test anonymizing a name in a simple sentence."""
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    text = "Hello, my name is John Doe."
    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=analyzed_fields)
    anonymized_text = anonymizer.anonymize(text)
    assert ("John Doe" in anonymized_text) == should_contain


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_multiple() -> None:
    """Test anonymizing multiple items in a sentence."""
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    text = "John Smith's phone number is 313-666-7440 and email is johnsmith@gmail.com"
    anonymizer = PresidioReversibleAnonymizer()
    anonymized_text = anonymizer.anonymize(text)
    for phrase in ["John Smith", "313-666-7440", "johnsmith@gmail.com"]:
        assert phrase not in anonymized_text


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_with_custom_operator() -> None:
    """Test anonymizing a name with a custom operator."""
    from presidio_anonymizer.entities import OperatorConfig

    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "<name>"})}
    anonymizer = PresidioReversibleAnonymizer(operators=custom_operator)

    text = "Jane Doe was here."

    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text == "<name> was here."


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_add_recognizer_operator() -> None:
    """
    Test adding a recognizer and anonymizing a new entity type with a custom operator.
    """
    from presidio_analyzer import PatternRecognizer
    from presidio_anonymizer.entities import OperatorConfig

    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[])
    titles_list = ["Sir", "Madam", "Professor"]
    custom_recognizer = PatternRecognizer(
        supported_entity="TITLE", deny_list=titles_list
    )
    anonymizer.add_recognizer(custom_recognizer)

    # anonymizing with custom recognizer
    text = "Madam Jane Doe was here."
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text == "<TITLE> Jane Doe was here."

    # anonymizing with custom recognizer and operator
    custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})}
    anonymizer.add_operators(custom_operator)
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text == "Dear Jane Doe was here."


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_deanonymizer_mapping() -> None:
    """Test if the deanonymizer mapping is correctly populated."""
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    anonymizer = PresidioReversibleAnonymizer(
        analyzed_fields=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"]
    )

    anonymizer.anonymize("Hello, my name is John Doe and my number is 444 555 6666.")

    # ["PERSON", "PHONE_NUMBER"]
    assert len(anonymizer.deanonymizer_mapping.keys()) == 2
    assert "John Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values()
    assert (
        "444 555 6666"
        in anonymizer.deanonymizer_mapping.get("PHONE_NUMBER", {}).values()
    )

    text_to_anonymize = (
        "And my name is Jane Doe, my email is jane@gmail.com and "
        "my credit card is 4929 5319 6292 5362."
    )
    anonymizer.anonymize(text_to_anonymize)

    # ["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"]
    assert len(anonymizer.deanonymizer_mapping.keys()) == 4
    assert "Jane Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values()
    assert (
        "jane@gmail.com"
        in anonymizer.deanonymizer_mapping.get("EMAIL_ADDRESS", {}).values()
    )
    assert (
        "4929 5319 6292 5362"
        in anonymizer.deanonymizer_mapping.get("CREDIT_CARD", {}).values()
    )


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_deanonymize() -> None:
    """Test deanonymizing a name in a simple sentence."""
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    text = "Hello, my name is John Doe."
    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"])
    anonymized_text = anonymizer.anonymize(text)
    deanonymized_text = anonymizer.deanonymize(anonymized_text)
    assert deanonymized_text == text


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_save_load_deanonymizer_mapping() -> None:
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"])
    anonymizer.anonymize("Hello, my name is John Doe.")
    try:
        anonymizer.save_deanonymizer_mapping("test_file.json")
        assert os.path.isfile("test_file.json")

        anonymizer = PresidioReversibleAnonymizer()
        anonymizer.load_deanonymizer_mapping("test_file.json")

        assert "John Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values()

    finally:
        os.remove("test_file.json")
@ -1,5 +1,5 @@
"""SQL agent."""
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Sequence

from langchain.agents.agent import AgentExecutor, BaseSingleActionAgent
from langchain.agents.agent_toolkits.sql.prompt import (
@ -21,6 +21,7 @@ from langchain.prompts.chat import (
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.messages import AIMessage, SystemMessage
from langchain.tools import BaseTool


def create_sql_agent(
@ -38,10 +39,11 @@
    early_stopping_method: str = "force",
    verbose: bool = False,
    agent_executor_kwargs: Optional[Dict[str, Any]] = None,
    extra_tools: Sequence[BaseTool] = (),
    **kwargs: Dict[str, Any],
) -> AgentExecutor:
    """Construct an SQL agent from an LLM and tools."""
    tools = toolkit.get_tools()
    tools = toolkit.get_tools() + list(extra_tools)
    prefix = prefix.format(dialect=toolkit.dialect, top_k=top_k)
    agent: BaseSingleActionAgent
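A short sketch of the new `extra_tools` hook (the toolkit wiring and the echo tool are illustrative assumptions; `db` is an existing SQLDatabase):

from langchain.agents.agent_toolkits import SQLDatabaseToolkit, create_sql_agent
from langchain.chat_models import ChatOpenAI
from langchain.tools import Tool

llm = ChatOpenAI(temperature=0)
toolkit = SQLDatabaseToolkit(db=db, llm=llm)
agent = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    # Appended to the toolkit's own tools inside create_sql_agent.
    extra_tools=[Tool(name="echo", func=lambda q: q, description="Echoes its input.")],
)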
@ -84,17 +84,17 @@ class GraphSparqlQAChain(Chain):
        _intent = self.sparql_intent_chain.run({"prompt": prompt}, callbacks=callbacks)
        intent = _intent.strip()

        if "SELECT" not in intent and "UPDATE" not in intent:
        if "SELECT" in intent and "UPDATE" not in intent:
            sparql_generation_chain = self.sparql_generation_select_chain
            intent = "SELECT"
        elif "UPDATE" in intent and "SELECT" not in intent:
            sparql_generation_chain = self.sparql_generation_update_chain
            intent = "UPDATE"
        else:
            raise ValueError(
                "I am sorry, but this prompt seems to fit none of the currently "
                "supported SPARQL query types, i.e., SELECT and UPDATE."
            )
        elif intent.find("SELECT") < intent.find("UPDATE"):
            sparql_generation_chain = self.sparql_generation_select_chain
            intent = "SELECT"
        else:
            sparql_generation_chain = self.sparql_generation_update_chain
            intent = "UPDATE"

        _run_manager.on_text("Identified intent:", end="\n", verbose=self.verbose)
        _run_manager.on_text(intent, color="green", end="\n", verbose=self.verbose)
@ -1,6 +1,7 @@
import json
import logging
import os
import tempfile
import zipfile
from pathlib import Path
from typing import Iterator, List, Union
@ -136,7 +137,8 @@ class TelegramChatLoader(chat_loaders.BaseChatLoader):
        with zipfile.ZipFile(path) as zip_file:
            for file in zip_file.namelist():
                if file.endswith((".html", ".json")):
                    yield zip_file.extract(file)
                    with tempfile.TemporaryDirectory() as temp_dir:
                        yield zip_file.extract(file, path=temp_dir)

    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
        """Lazy load the messages from the chat file and yield them
@ -1,7 +1,8 @@
import asyncio
import logging
import warnings
from typing import Any, Dict, Iterator, List, Optional, Union
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, Iterator, List, Optional, Union, cast

import aiohttp
import requests
@ -129,9 +130,18 @@ class AsyncHtmlLoader(BaseLoader):
    def load(self) -> List[Document]:
        """Load text from the url(s) in web_path."""

        results = asyncio.run(self.fetch_all(self.web_paths))
        try:
            # Raises RuntimeError if there is no current event loop.
            asyncio.get_running_loop()
            # If there is a current event loop, we need to run the async code
            # in a separate loop, in a separate thread.
            with ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(asyncio.run, self.fetch_all(self.web_paths))
                results = future.result()
        except RuntimeError:
            results = asyncio.run(self.fetch_all(self.web_paths))
        docs = []
        for i, text in enumerate(results):
        for i, text in enumerate(cast(List[str], results)):
            metadata = {"source": self.web_paths[i]}
            docs.append(Document(page_content=text, metadata=metadata))
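The same sync-over-async pattern in isolation, as a minimal sketch (function names are illustrative):

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def work() -> str:
    return "done"

def run_sync() -> str:
    # Safe to call both inside and outside a running event loop.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running: asyncio.run is fine here.
        return asyncio.run(work())
    # A loop is already running: run the coroutine on a fresh loop in a thread.
    with ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(asyncio.run, work()).result()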
@ -1,11 +1,16 @@
"""Module contains common parsers for PDFs."""
from typing import Any, Iterator, Mapping, Optional, Sequence, Union
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Iterator, Mapping, Optional, Sequence, Union
from urllib.parse import urlparse

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document

if TYPE_CHECKING:
    import pdfplumber.page


class PyPDFParser(BaseBlobParser):
    """Load `PDF` using `pypdf` and chunk at character level."""
@ -116,13 +121,17 @@ class PyPDFium2Parser(BaseBlobParser):
class PDFPlumberParser(BaseBlobParser):
    """Parse `PDF` with `PDFPlumber`."""

    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
    def __init__(
        self, text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoid the error of duplicate characters if `dedupe=True`.
        """
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
@ -133,7 +142,7 @@ class PDFPlumberParser(BaseBlobParser):

        yield from [
            Document(
                page_content=page.extract_text(**self.text_kwargs),
                page_content=self._process_page_content(page),
                metadata=dict(
                    {
                        "source": blob.source,
@ -151,6 +160,12 @@ class PDFPlumberParser(BaseBlobParser):
            for page in doc.pages
        ]

    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
        """Process the page content based on dedupe."""
        if self.dedupe:
            return page.dedupe_chars().extract_text(**self.text_kwargs)
        return page.extract_text(**self.text_kwargs)


class AmazonTextractPDFParser(BaseBlobParser):
    """Send `PDF` files to `Amazon Textract` and parse them.
@ -437,7 +437,10 @@ class PDFPlumberLoader(BasePDFLoader):
    """Load `PDF` files using `pdfplumber`."""

    def __init__(
        self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
        self,
        file_path: str,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
    ) -> None:
        """Initialize with a file path."""
        try:
@ -450,11 +453,12 @@ class PDFPlumberLoader(BasePDFLoader):

        super().__init__(file_path)
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe

    def load(self) -> List[Document]:
        """Load file."""

        parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
        parser = PDFPlumberParser(text_kwargs=self.text_kwargs, dedupe=self.dedupe)
        blob = Blob.from_path(self.file_path)
        return parser.parse(blob)
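The new flag in one line (the file path is a placeholder):

from langchain.document_loaders import PDFPlumberLoader

docs = PDFPlumberLoader("example.pdf", dedupe=True).load()  # drops duplicated chars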
@ -114,7 +114,7 @@ class S3DirectoryLoader(BaseLoader):
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            aws_session_token=self.aws_session_token,
            boto_config=self.boto_config,
            config=self.boto_config,
        )
        bucket = s3.Bucket(self.bucket)
        docs = []
@ -8,7 +8,9 @@ from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

if TYPE_CHECKING:
    from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
    from playwright.async_api import Browser as AsyncBrowser
    from playwright.async_api import Page as AsyncPage
    from playwright.async_api import Response as AsyncResponse
    from playwright.sync_api import Browser, Page, Response


@ -155,6 +157,9 @@ class PlaywrightURLLoader(BaseLoader):
            try:
                page = browser.new_page()
                response = page.goto(url)
                if response is None:
                    raise ValueError(f"page.goto() returned None for url {url}")

                text = self.evaluator.evaluate(page, browser, response)
                metadata = {"source": url}
                docs.append(Document(page_content=text, metadata=metadata))
@ -185,6 +190,9 @@ class PlaywrightURLLoader(BaseLoader):
            try:
                page = await browser.new_page()
                response = await page.goto(url)
                if response is None:
                    raise ValueError(f"page.goto() returned None for url {url}")

                text = await self.evaluator.evaluate_async(page, browser, response)
                metadata = {"source": url}
                docs.append(Document(page_content=text, metadata=metadata))
@ -35,6 +35,7 @@ from langchain.embeddings.gpt4all import GPT4AllEmbeddings
from langchain.embeddings.huggingface import (
    HuggingFaceBgeEmbeddings,
    HuggingFaceEmbeddings,
    HuggingFaceInferenceAPIEmbeddings,
    HuggingFaceInstructEmbeddings,
)
from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
@ -69,6 +70,7 @@ __all__ = [
    "CohereEmbeddings",
    "ElasticsearchEmbeddings",
    "HuggingFaceEmbeddings",
    "HuggingFaceInferenceAPIEmbeddings",
    "JinaEmbeddings",
    "LlamaCppEmbeddings",
    "HuggingFaceHubEmbeddings",
@ -1,5 +1,7 @@
from typing import Any, Dict, List, Optional

import requests

from langchain.embeddings.base import Embeddings
from langchain.pydantic_v1 import BaseModel, Extra, Field

@ -58,7 +60,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
        except ImportError as exc:
            raise ImportError(
                "Could not import sentence_transformers python package. "
                "Please install it with `pip install sentence_transformers`."
                "Please install it with `pip install sentence-transformers`."
            ) from exc

        self.client = sentence_transformers.SentenceTransformer(
@ -266,3 +268,71 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):
            self.query_instruction + text, **self.encode_kwargs
        )
        return embedding.tolist()


class HuggingFaceInferenceAPIEmbeddings(BaseModel, Embeddings):
    """Embed texts using the HuggingFace API.

    Requires a HuggingFace Inference API key and a model name.
    """

    api_key: str
    """Your API key for the HuggingFace Inference API."""
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    """The name of the model to use for text embeddings."""

    @property
    def _api_url(self) -> str:
        return (
            "https://api-inference.huggingface.co"
            "/pipeline"
            "/feature-extraction"
            f"/{self.model_name}"
        )

    @property
    def _headers(self) -> dict:
        return {"Authorization": f"Bearer {self.api_key}"}

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Get the embeddings for a list of texts.

        Args:
            texts (Documents): A list of texts to get embeddings for.

        Returns:
            Embedded texts as List[List[float]], where each inner List[float]
            corresponds to a single input text.

        Example:
            .. code-block:: python

                from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

                hf_embeddings = HuggingFaceInferenceAPIEmbeddings(
                    api_key="your_api_key",
                    model_name="sentence-transformers/all-MiniLM-L6-v2"
                )
                texts = ["Hello, world!", "How are you?"]
                hf_embeddings.embed_documents(texts)
        """
        response = requests.post(
            self._api_url,
            headers=self._headers,
            json={
                "inputs": texts,
                "options": {"wait_for_model": True, "use_cache": True},
            },
        )
        return response.json()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a HuggingFace transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]
@ -87,8 +87,8 @@ def _async_retry_decorator(embeddings: OpenAIEmbeddings) -> Any:


# https://stackoverflow.com/questions/76469415/getting-embeddings-of-length-1-from-langchain-openaiembeddings
def _check_response(response: dict) -> dict:
    if any(len(d["embedding"]) == 1 for d in response["data"]):
def _check_response(response: dict, skip_empty: bool = False) -> dict:
    if any(len(d["embedding"]) == 1 for d in response["data"]) and not skip_empty:
        import openai

        raise openai.error.APIError("OpenAI API returned an empty embedding")
@ -102,7 +102,7 @@ def embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) -> Any:
    @retry_decorator
    def _embed_with_retry(**kwargs: Any) -> Any:
        response = embeddings.client.create(**kwargs)
        return _check_response(response)
        return _check_response(response, skip_empty=embeddings.skip_empty)

    return _embed_with_retry(**kwargs)

@ -113,7 +113,7 @@ async def async_embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) ->
    @_async_retry_decorator(embeddings)
    async def _async_embed_with_retry(**kwargs: Any) -> Any:
        response = await embeddings.client.acreate(**kwargs)
        return _check_response(response)
        return _check_response(response, skip_empty=embeddings.skip_empty)

    return await _async_embed_with_retry(**kwargs)

@ -196,6 +196,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
    """Whether to show a progress bar when embedding."""
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Holds any model parameters valid for `create` call not explicitly specified."""
    skip_empty: bool = False
    """Whether to skip empty strings when embedding or raise an error.
    Defaults to not skipping."""

    class Config:
        """Configuration for this pydantic object."""
@ -371,6 +374,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
        results: List[List[List[float]]] = [[] for _ in range(len(texts))]
        num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
        for i in range(len(indices)):
            if self.skip_empty and len(batched_embeddings[i]) == 1:
                continue
            results[indices[i]].append(batched_embeddings[i])
            num_tokens_in_batch[indices[i]].append(len(tokens[i]))
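A minimal sketch of the new flag (API key handling omitted; the point is that empty embeddings are skipped instead of raising APIError):

from langchain.embeddings import OpenAIEmbeddings

emb = OpenAIEmbeddings(skip_empty=True)
vectors = emb.embed_documents(["some text", ""])  # no APIError for the empty string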
51
libs/langchain/langchain/graphs/graph_document.py
Normal file
@ -0,0 +1,51 @@
from __future__ import annotations

from typing import List, Union

from langchain.load.serializable import Serializable
from langchain.pydantic_v1 import Field
from langchain.schema import Document


class Node(Serializable):
    """Represents a node in a graph with associated properties.

    Attributes:
        id (Union[str, int]): A unique identifier for the node.
        type (str): The type or label of the node, default is "Node".
        properties (dict): Additional properties and metadata associated with the node.
    """

    id: Union[str, int]
    type: str = "Node"
    properties: dict = Field(default_factory=dict)


class Relationship(Serializable):
    """Represents a directed relationship between two nodes in a graph.

    Attributes:
        source (Node): The source node of the relationship.
        target (Node): The target node of the relationship.
        type (str): The type of the relationship.
        properties (dict): Additional properties associated with the relationship.
    """

    source: Node
    target: Node
    type: str
    properties: dict = Field(default_factory=dict)


class GraphDocument(Serializable):
    """Represents a graph document consisting of nodes and relationships.

    Attributes:
        nodes (List[Node]): A list of nodes in the graph.
        relationships (List[Relationship]): A list of relationships in the graph.
        source (Document): The document from which the graph information is derived.
    """

    nodes: List[Node]
    relationships: List[Relationship]
    source: Document
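A small construction sketch for these classes (entity names are invented for illustration):

from langchain.schema import Document

alice = Node(id="alice", type="Person")
acme = Node(id="acme", type="Company")
doc = GraphDocument(
    nodes=[alice, acme],
    relationships=[Relationship(source=alice, target=acme, type="WORKS_AT")],
    source=Document(page_content="Alice works at Acme."),
)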
@ -1,5 +1,7 @@
from typing import Any, Dict, List

from langchain.graphs.graph_document import GraphDocument

node_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
@ -99,3 +101,56 @@ class Neo4jGraph:
        The relationships are the following:
        {[el['output'] for el in relationships]}
        """

    def add_graph_documents(
        self, graph_documents: List[GraphDocument], include_source: bool = False
    ) -> None:
        """
        Takes a list of GraphDocument objects and uses them to construct a graph.
        """
        for document in graph_documents:
            include_docs_query = (
                "CREATE (d:Document) "
                "SET d.text = $document.page_content "
                "SET d += $document.metadata "
                "WITH d "
            )
            # Import nodes
            self.query(
                (
                    f"{include_docs_query if include_source else ''}"
                    "UNWIND $data AS row "
                    "CALL apoc.merge.node([row.type], {id: row.id}, "
                    "row.properties, {}) YIELD node "
                    f"{'MERGE (d)-[:MENTIONS]->(node) ' if include_source else ''}"
                    "RETURN distinct 'done' AS result"
                ),
                {
                    "data": [el.__dict__ for el in document.nodes],
                    "document": document.source.__dict__,
                },
            )
            # Import relationships
            self.query(
                "UNWIND $data AS row "
                "CALL apoc.merge.node([row.source_label], {id: row.source},"
                "{}, {}) YIELD node as source "
                "CALL apoc.merge.node([row.target_label], {id: row.target},"
                "{}, {}) YIELD node as target "
                "CALL apoc.merge.relationship(source, row.type, "
                "{}, row.properties, target) YIELD rel "
                "RETURN distinct 'done'",
                {
                    "data": [
                        {
                            "source": el.source.id,
                            "source_label": el.source.type,
                            "target": el.target.id,
                            "target_label": el.target.type,
                            "type": el.type.replace(" ", "_").upper(),
                            "properties": el.properties,
                        }
                        for el in document.relationships
                    ]
                },
            )
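A brief usage sketch (the Neo4j credentials are placeholders, and the APOC plugin must be installed for the `apoc.merge.*` procedures):

from langchain.graphs import Neo4jGraph

graph = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password="...")
graph.add_graph_documents([doc], include_source=True)  # `doc` as in the sketch above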
@ -15,6 +15,7 @@ class Banana(LLM):

    To use, you should have the ``banana-dev`` python package installed,
    and the environment variable ``BANANA_API_KEY`` set with your API key.
    This is the team API key available in the Banana dashboard.

    Any parameters that are valid to be passed to the call can be passed
    in, even if not explicitly saved on this class.
@ -23,10 +24,13 @@ class Banana(LLM):
        .. code-block:: python

            from langchain.llms import Banana
            banana = Banana(model_key="")
            banana = Banana(model_key="", model_url_slug="")
    """

    model_key: str = ""
    """model key to use"""

    model_url_slug: str = ""
    """model endpoint to use"""

    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
@ -72,6 +76,7 @@ class Banana(LLM):
        """Get the identifying parameters."""
        return {
            **{"model_key": self.model_key},
            **{"model_url_slug": self.model_url_slug},
            **{"model_kwargs": self.model_kwargs},
        }

@ -89,7 +94,7 @@ class Banana(LLM):
    ) -> str:
        """Call to Banana endpoint."""
        try:
            import banana_dev as banana
            from banana_dev import Client
        except ImportError:
            raise ImportError(
                "Could not import banana-dev python package. "
@ -99,19 +104,25 @@ class Banana(LLM):
        params = {**params, **kwargs}
        api_key = self.banana_api_key
        model_key = self.model_key
        model_url_slug = self.model_url_slug
        model_inputs = {
            # a json specific to your model.
            "prompt": prompt,
            **params,
        }
        response = banana.run(api_key, model_key, model_inputs)
        model = Client(
            # Found in main dashboard
            api_key=api_key,
            # Both found in model details page
            model_key=model_key,
            url=f"https://{model_url_slug}.run.banana.dev",
        )
        response, meta = model.call("/", model_inputs)
        try:
            text = response["modelOutputs"][0]["output"]
            text = response["outputs"]
        except (KeyError, TypeError):
            returned = response["modelOutputs"][0]
            raise ValueError(
                "Response should be of schema: {'output': 'text'}."
                f"\nResponse was: {returned}"
                "Response should be of schema: {'outputs': 'text'}."
                "\nTo fix this:"
                "\n- fork the source repo of the Banana model"
                "\n- modify app.py to return the above schema"
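The migrated client in use (key and slug values are placeholders from the Banana dashboard):

from langchain.llms import Banana

llm = Banana(model_key="your-model-key", model_url_slug="demo-xyz123")
llm("Tell me a joke")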
@ -65,7 +65,7 @@ class HuggingFaceTextGenInference(LLM):
    typical_p: Optional[float] = 0.95
    """Typical Decoding mass. See [Typical Decoding for Natural Language
    Generation](https://arxiv.org/abs/2202.00666) for more information."""
    temperature: float = 0.8
    temperature: Optional[float] = 0.8
    """The value used to modulate the logits distribution."""
    repetition_penalty: Optional[float] = None
    """The parameter for repetition penalty. 1.0 means no penalty.
@ -91,7 +91,7 @@ class PipelineAI(LLM, BaseModel):
        try:
            from pipeline import PipelineCloud
        except ImportError:
            raise ValueError(
            raise ImportError(
                "Could not import pipeline-ai python package. "
                "Please install it with `pip install pipeline-ai`."
            )
@ -121,7 +121,7 @@ class RWKV(LLM, BaseModel):
            values["pipeline"] = PIPELINE(values["client"], values["tokens_path"])

        except ImportError:
            raise ValueError(
            raise ImportError(
                "Could not import rwkv python package. "
                "Please install it with `pip install rwkv`."
            )
@ -62,6 +62,10 @@ class VLLM(BaseLLM):
    dtype: str = "auto"
    """The data type for the model weights and activations."""

    download_dir: Optional[str] = None
    """Directory to download and load the weights. (Defaults to the default
    Hugging Face cache directory.)"""

    vllm_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Holds any model parameters valid for `vllm.LLM` call not explicitly specified."""

@ -84,6 +88,7 @@ class VLLM(BaseLLM):
            tensor_parallel_size=values["tensor_parallel_size"],
            trust_remote_code=values["trust_remote_code"],
            dtype=values["dtype"],
            download_dir=values["download_dir"],
            **values["vllm_kwargs"],
        )
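The new knob in use (model name and path are illustrative; the weights are downloaded to the given directory on first run, which requires GPU resources and the `vllm` package):

from langchain.llms import VLLM

llm = VLLM(model="mosaicml/mpt-7b", download_dir="/data/vllm-weights")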
@ -20,6 +20,7 @@ from langchain.output_parsers.fix import OutputFixingParser
from langchain.output_parsers.list import (
    CommaSeparatedListOutputParser,
    ListOutputParser,
    NumberedListOutputParser,
)
from langchain.output_parsers.pydantic import PydanticOutputParser
from langchain.output_parsers.rail_parser import GuardrailsOutputParser
@ -36,6 +37,7 @@ __all__ = [
    "EnumOutputParser",
    "GuardrailsOutputParser",
    "ListOutputParser",
    "NumberedListOutputParser",
    "OutputFixingParser",
    "PydanticOutputParser",
    "RegexDictParser",
@ -39,6 +39,8 @@ from langchain.load.serializable import Serializable
from langchain.pydantic_v1 import Field
from langchain.schema.runnable.config import (
    RunnableConfig,
    acall_func_with_variable_args,
    call_func_with_variable_args,
    ensure_config,
    get_async_callback_manager_for_config,
    get_callback_manager_for_config,
@ -47,16 +49,15 @@ from langchain.schema.runnable.config import (
    patch_config,
)
from langchain.schema.runnable.utils import (
    Input,
    Output,
    accepts_config,
    accepts_run_manager,
    accepts_run_manager_and_config,
    gather_with_concurrency,
)
from langchain.utils.aiter import atee, py_anext
from langchain.utils.iter import safetee

Input = TypeVar("Input")
# Output type should implement __concat__, as eg str, list, dict do
Output = TypeVar("Output")
Other = TypeVar("Other")


@ -311,16 +312,7 @@ class Runnable(Generic[Input, Output], ABC):
            name=config.get("run_name"),
        )
        try:
            if accepts_run_manager_and_config(func):
                output = func(
                    input,
                    run_manager=run_manager,
                    config=config,
                )  # type: ignore[call-arg]
            elif accepts_run_manager(func):
                output = func(input, run_manager=run_manager)  # type: ignore[call-arg]
            else:
                output = func(input)  # type: ignore[call-arg]
            output = call_func_with_variable_args(func, input, run_manager, config)
        except Exception as e:
            run_manager.on_chain_error(e)
            raise
@ -353,19 +345,9 @@
            name=config.get("run_name"),
        )
        try:
            if accepts_run_manager_and_config(func):
                output = await func(
                    input,
                    run_manager=run_manager,
                    config=config,
                )  # type: ignore[call-arg]
            elif accepts_run_manager(func):
                output = await func(
                    input,
                    run_manager=run_manager,
                )  # type: ignore[call-arg]
            else:
                output = await func(input)  # type: ignore[call-arg]
            output = await acall_func_with_variable_args(
                func, input, run_manager, config
            )
        except Exception as e:
            await run_manager.on_chain_error(e)
            raise
@ -408,16 +390,15 @@
                )
            ]
        try:
            if accepts_run_manager_and_config(func):
                output = func(
                    input,
                    run_manager=run_managers,
                    config=configs,
                )  # type: ignore[call-arg]
            elif accepts_run_manager(func):
                output = func(input, run_manager=run_managers)  # type: ignore[call-arg]
            else:
                output = func(input)  # type: ignore[call-arg]
            kwargs: Dict[str, Any] = {}
            if accepts_config(func):
                kwargs["config"] = [
                    patch_config(c, callbacks=rm.get_child())
                    for c, rm in zip(configs, run_managers)
                ]
            if accepts_run_manager(func):
                kwargs["run_manager"] = run_managers
            output = func(input, **kwargs)  # type: ignore[call-arg]
        except Exception as e:
            for run_manager in run_managers:
                run_manager.on_chain_error(e)
@ -479,16 +460,15 @@
            )
        )
        try:
            if accepts_run_manager_and_config(func):
                output = await func(
                    input,
                    run_manager=run_managers,
                    config=configs,
                )  # type: ignore[call-arg]
            elif accepts_run_manager(func):
                output = await func(input, run_manager=run_managers)  # type: ignore
            else:
                output = await func(input)  # type: ignore[call-arg]
            kwargs: Dict[str, Any] = {}
            if accepts_config(func):
                kwargs["config"] = [
                    patch_config(c, callbacks=rm.get_child())
                    for c, rm in zip(configs, run_managers)
                ]
            if accepts_run_manager(func):
                kwargs["run_manager"] = run_managers
            output = await func(input, **kwargs)  # type: ignore[call-arg]
        except Exception as e:
            await asyncio.gather(
                *(run_manager.on_chain_error(e) for run_manager in run_managers)
@ -550,19 +530,16 @@
            name=config.get("run_name"),
        )
        try:
            if accepts_run_manager_and_config(transformer):
                iterator = transformer(
                    input_for_transform,
                    run_manager=run_manager,
                    config=config,
                )  # type: ignore[call-arg]
            elif accepts_run_manager(transformer):
                iterator = transformer(
                    input_for_transform,
                    run_manager=run_manager,
                )  # type: ignore[call-arg]
            else:
                iterator = transformer(input_for_transform)  # type: ignore[call-arg]
            kwargs: Dict[str, Any] = {}
            if accepts_config(transformer):
                kwargs["config"] = patch_config(
                    config, callbacks=run_manager.get_child()
                )
            if accepts_run_manager(transformer):
                kwargs["run_manager"] = run_manager
            iterator = transformer(
                input_for_transform, **kwargs
            )  # type: ignore[call-arg]
            for chunk in iterator:
                yield chunk
                if final_output_supported:
@ -631,21 +608,16 @@
            name=config.get("run_name"),
        )
        try:
            # mypy can't quite work out the type guard here, but this is safe,
            # check implementations of the accepts_* functions
            if accepts_run_manager_and_config(transformer):
                iterator = transformer(
                    input_for_transform,
                    run_manager=run_manager,
                    config=config,
                )  # type: ignore[call-arg]
            elif accepts_run_manager(transformer):
                iterator = transformer(
                    input_for_transform,
                    run_manager=run_manager,
                )  # type: ignore[call-arg]
            else:
                iterator = transformer(input_for_transform)  # type: ignore[call-arg]
            kwargs: Dict[str, Any] = {}
            if accepts_config(transformer):
                kwargs["config"] = patch_config(
                    config, callbacks=run_manager.get_child()
                )
            if accepts_run_manager(transformer):
                kwargs["run_manager"] = run_manager
            iterator = transformer(
                input_for_transform, **kwargs
            )  # type: ignore[call-arg]
            async for chunk in iterator:
                yield chunk
                if final_output_supported:
@ -1756,7 +1728,7 @@ class RunnableLambda(Runnable[Input, Output]):
        run_manager: CallbackManagerForChainRun,
        config: RunnableConfig,
    ) -> Output:
        output = self.func(input)
        output = call_func_with_variable_args(self.func, input, run_manager, config)
        # If the output is a runnable, invoke it
        if isinstance(output, Runnable):
            recursion_limit = config["recursion_limit"]
@ -1780,7 +1752,9 @@ class RunnableLambda(Runnable[Input, Output]):
        run_manager: AsyncCallbackManagerForChainRun,
        config: RunnableConfig,
    ) -> Output:
        output = await self.afunc(input)
        output = await acall_func_with_variable_args(
            self.afunc, input, run_manager, config
        )
        # If the output is a runnable, invoke it
        if isinstance(output, Runnable):
            recursion_limit = config["recursion_limit"]
@ -1798,6 +1772,21 @@ class RunnableLambda(Runnable[Input, Output]):
            )
        return output

    def _config(
        self, config: Optional[RunnableConfig], callable: Callable[..., Any]
    ) -> RunnableConfig:
        config = config or {}

        if config.get("run_name") is None:
            try:
                run_name = callable.__name__
            except AttributeError:
                run_name = None
            if run_name is not None:
                return patch_config(config, run_name=run_name)

        return config

    def invoke(
        self,
        input: Input,
@ -1805,7 +1794,11 @@ class RunnableLambda(Runnable[Input, Output]):
        **kwargs: Optional[Any],
    ) -> Output:
        if hasattr(self, "func"):
            return self._call_with_config(self._invoke, input, config)
            return self._call_with_config(
                self._invoke,
                input,
                self._config(config, self.func),
            )
        else:
            raise TypeError(
                "Cannot invoke a coroutine function synchronously."
@ -1819,7 +1812,11 @@ class RunnableLambda(Runnable[Input, Output]):
        **kwargs: Optional[Any],
    ) -> Output:
        if hasattr(self, "afunc"):
            return await self._acall_with_config(self._ainvoke, input, config)
            return await self._acall_with_config(
                self._ainvoke,
                input,
                self._config(config, self.afunc),
            )
        else:
            # Delegating to super implementation of ainvoke.
            # Uses asyncio executor to run the sync version (invoke)
@ -3,13 +3,35 @@ from __future__ import annotations
from concurrent.futures import Executor, ThreadPoolExecutor
from contextlib import contextmanager
from copy import deepcopy
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union
from typing import (
    TYPE_CHECKING,
    Any,
    Awaitable,
    Callable,
    Dict,
    Generator,
    List,
    Optional,
    Union,
)

from typing_extensions import TypedDict

from langchain.schema.runnable.utils import (
    Input,
    Output,
    accepts_config,
    accepts_run_manager,
)

if TYPE_CHECKING:
    from langchain.callbacks.base import BaseCallbackManager, Callbacks
    from langchain.callbacks.manager import AsyncCallbackManager, CallbackManager
    from langchain.callbacks.manager import (
        AsyncCallbackManager,
        AsyncCallbackManagerForChainRun,
        CallbackManager,
        CallbackManagerForChainRun,
    )


class RunnableConfig(TypedDict, total=False):
@ -117,6 +139,47 @@ def patch_config(
    return config


def call_func_with_variable_args(
    func: Union[
        Callable[[Input], Output],
        Callable[[Input, CallbackManagerForChainRun], Output],
        Callable[[Input, CallbackManagerForChainRun, RunnableConfig], Output],
    ],
    input: Input,
    run_manager: CallbackManagerForChainRun,
    config: RunnableConfig,
) -> Output:
    """Call a function that may optionally accept a run_manager and/or config."""
    kwargs: Dict[str, Any] = {}
    if accepts_config(func):
        kwargs["config"] = patch_config(config, callbacks=run_manager.get_child())
    if accepts_run_manager(func):
        kwargs["run_manager"] = run_manager
    return func(input, **kwargs)  # type: ignore[call-arg]


async def acall_func_with_variable_args(
    func: Union[
        Callable[[Input], Awaitable[Output]],
        Callable[[Input, AsyncCallbackManagerForChainRun], Awaitable[Output]],
        Callable[
            [Input, AsyncCallbackManagerForChainRun, RunnableConfig],
            Awaitable[Output],
        ],
    ],
    input: Input,
    run_manager: AsyncCallbackManagerForChainRun,
    config: RunnableConfig,
) -> Output:
    """Call an async function that may optionally accept a run_manager and/or config."""
    kwargs: Dict[str, Any] = {}
    if accepts_config(func):
        kwargs["config"] = patch_config(config, callbacks=run_manager.get_child())
    if accepts_run_manager(func):
        kwargs["run_manager"] = run_manager
    return await func(input, **kwargs)  # type: ignore[call-arg]


def get_callback_manager_for_config(config: RunnableConfig) -> CallbackManager:
    from langchain.callbacks.manager import CallbackManager
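What these helpers enable at the user level, as a sketch (the function signatures are the point; the extra parameters are injected only when declared):

from langchain.schema.runnable import RunnableLambda

def plain(x: int) -> int:
    return x * 2

def wants_config(x: int, config: dict) -> int:
    # Receives the RunnableConfig (callbacks, tags, ...) because it asks for it.
    return x * 2

RunnableLambda(plain).invoke(3)         # -> 6
RunnableLambda(wants_config).invoke(3)  # -> 6, with config passed through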
@@ -2,7 +2,11 @@ from __future__ import annotations

import asyncio
from inspect import signature
from typing import Any, Callable, Coroutine, Union
from typing import Any, Callable, Coroutine, TypeVar, Union

Input = TypeVar("Input")
# Output type should implement __concat__, as eg str, list, dict do
Output = TypeVar("Output")


async def gated_coro(semaphore: asyncio.Semaphore, coro: Coroutine) -> Any:
@@ -26,8 +30,8 @@ def accepts_run_manager(callable: Callable[..., Any]) -> bool:
        return False


def accepts_run_manager_and_config(callable: Callable[..., Any]) -> bool:
    return (
        accepts_run_manager(callable)
        and signature(callable).parameters.get("config") is not None
    )
def accepts_config(callable: Callable[..., Any]) -> bool:
    try:
        return signature(callable).parameters.get("config") is not None
    except ValueError:
        return False
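The try/except in accepts_config matters because inspect.signature raises ValueError for callables that expose no signature metadata (notably some C builtins); a small illustration:

from inspect import signature

def takes_config(input, config=None):
    return input

# Ordinary Python callables can be inspected directly.
assert signature(takes_config).parameters.get("config") is not None

# Some builtins cannot; accepts_config then falls back to False.
try:
    signature(dict)
except ValueError:
    print("no signature metadata; accepts_config would return False")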
@@ -100,6 +100,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
        length_function: Callable[[str], int] = len,
        keep_separator: bool = False,
        add_start_index: bool = False,
        strip_whitespace: bool = True,
    ) -> None:
        """Create a new TextSplitter.

@@ -109,6 +110,8 @@ class TextSplitter(BaseDocumentTransformer, ABC):
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator in the chunks
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                every document
        """
        if chunk_overlap > chunk_size:
            raise ValueError(
@@ -120,6 +123,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
        self._length_function = length_function
        self._keep_separator = keep_separator
        self._add_start_index = add_start_index
        self._strip_whitespace = strip_whitespace

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
@@ -152,7 +156,8 @@ class TextSplitter(BaseDocumentTransformer, ABC):

    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        text = separator.join(docs)
        text = text.strip()
        if self._strip_whitespace:
            text = text.strip()
        if text == "":
            return None
        else:
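A usage sketch for the new flag (parameter values chosen purely for illustration):

from langchain.text_splitter import CharacterTextSplitter

# With strip_whitespace=False, _join_docs keeps leading/trailing
# whitespace in each chunk instead of stripping it.
splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=12,
    chunk_overlap=0,
    strip_whitespace=False,
)
chunks = splitter.split_text("  some padded text  ")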
@@ -108,7 +108,7 @@ def get_client(redis_url: str, **kwargs: Any) -> RedisType:
    try:
        import redis
    except ImportError:
        raise ValueError(
        raise ImportError(
            "Could not import redis python package. "
            "Please install it with `pip install redis>=4.1.0`."
        )
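With the switch to ImportError, callers can handle the missing dependency idiomatically; a sketch (assuming the langchain.utilities.redis import path):

try:
    from langchain.utilities.redis import get_client

    client = get_client("redis://localhost:6379")
except ImportError:
    # Previously this surfaced as a generic ValueError.
    print("Install the client first: pip install 'redis>=4.1.0'")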
@@ -9,6 +9,7 @@ from sqlalchemy import MetaData, Table, create_engine, inspect, select, text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import ProgrammingError, SQLAlchemyError
from sqlalchemy.schema import CreateTable
from sqlalchemy.types import NullType

from langchain.utils import get_from_env

@@ -314,6 +315,11 @@ class SQLDatabase:
                tables.append(self._custom_table_info[table.name])
                continue

            # Ignore JSON datatyped columns
            for k, v in table.columns.items():
                if type(v.type) is NullType:
                    table._columns.remove(v)

            # add create table command
            create_table = str(CreateTable(table).compile(self._engine))
            table_info = f"{create_table.rstrip()}"
@@ -384,6 +390,8 @@ class SQLDatabase:
                    connection.exec_driver_sql(f"SET @@dataset_id='{self._schema}'")
                elif self.dialect == "mssql":
                    pass
                elif self.dialect == "trino":
                    connection.exec_driver_sql(f"USE {self._schema}")
                else:  # postgresql and compatible dialects
                    connection.exec_driver_sql(f"SET search_path TO {self._schema}")
            cursor = connection.execute(text(command))
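Condensed for reference, the per-dialect schema selection visible in this hunk (an illustration, not the real method; attributing the @@dataset_id line to BigQuery is an assumption, since its elif falls outside the hunk):

from typing import Optional

def schema_command(dialect: str, schema: str) -> Optional[str]:
    if dialect == "bigquery":  # assumption: this branch is cut off above
        return f"SET @@dataset_id='{schema}'"
    if dialect == "mssql":
        return None  # schema is referenced per-table on MSSQL
    if dialect == "trino":
        return f"USE {schema}"  # new in this change
    return f"SET search_path TO {schema}"  # postgresql and compatible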
@@ -147,7 +147,12 @@ class MyScale(VectorStore):
        )
        for k in ["id", "vector", "text", "metadata"]:
            assert k in self.config.column_map
        assert self.config.metric in ["ip", "cosine", "l2"]
        assert self.config.metric.upper() in ["IP", "COSINE", "L2"]
        if self.config.metric in ["ip", "cosine", "l2"]:
            logger.warning(
                "Lower case metric types will be deprecated in "
                "the future. Please use one of ('IP', 'Cosine', 'L2')"
            )

        # initialize the schema
        dim = len(embedding.embed_query("try this out"))
@@ -174,7 +179,9 @@ class MyScale(VectorStore):
        self.BS = "\\"
        self.must_escape = ("\\", "'")
        self._embeddings = embedding
        self.dist_order = "ASC" if self.config.metric in ["cosine", "l2"] else "DESC"
        self.dist_order = (
            "ASC" if self.config.metric.upper() in ["COSINE", "L2"] else "DESC"
        )

        # Create a connection to myscale
        self.client = get_client(
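The metric check becomes case-insensitive while still warning on legacy lower-case values; the same logic in isolation:

import logging

logger = logging.getLogger(__name__)

def check_metric(metric: str) -> None:
    # Accept any casing of IP/Cosine/L2, but warn on the deprecated
    # lower-case spellings, mirroring the assertions above.
    assert metric.upper() in ["IP", "COSINE", "L2"]
    if metric in ["ip", "cosine", "l2"]:
        logger.warning(
            "Lower case metric types will be deprecated in "
            "the future. Please use one of ('IP', 'Cosine', 'L2')"
        )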
159
libs/langchain/langchain/vectorstores/nucliadb.py
Normal file
@@ -0,0 +1,159 @@
import os
from typing import Any, Dict, Iterable, List, Optional, Type

from langchain.embeddings.base import Embeddings
from langchain.schema.document import Document
from langchain.vectorstores.base import VST, VectorStore

FIELD_TYPES = {
    "f": "files",
    "t": "texts",
    "l": "links",
}


class NucliaDB(VectorStore):
    """NucliaDB vector store."""

    _config: Dict[str, Any] = {}

    def __init__(
        self,
        knowledge_box: str,
        local: bool,
        api_key: Optional[str] = None,
        backend: Optional[str] = None,
    ) -> None:
        """Initialize the NucliaDB client.

        Args:
            knowledge_box: the Knowledge Box id.
            local: Whether to use a local NucliaDB instance or Nuclia Cloud.
            api_key: A contributor API key for the kb (needed when local is False).
            backend: The backend url to use when local is True, defaults to
                http://localhost:8080.
        """
        try:
            from nuclia.sdk import NucliaAuth
        except ImportError:
            raise ValueError(
                "nuclia python package not found. "
                "Please install it with `pip install nuclia`."
            )
        self._config["LOCAL"] = local
        zone = os.environ.get("NUCLIA_ZONE", "europe-1")
        self._kb = knowledge_box
        if local:
            if not backend:
                backend = "http://localhost:8080"
            self._config["BACKEND"] = f"{backend}/api/v1"
            self._config["TOKEN"] = None
            NucliaAuth().nucliadb(url=backend)
            NucliaAuth().kb(url=self.kb_url, interactive=False)
        else:
            self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1"
            self._config["TOKEN"] = api_key
            NucliaAuth().kb(
                url=self.kb_url, token=self._config["TOKEN"], interactive=False
            )

    @property
    def is_local(self) -> bool:
        return self._config["LOCAL"]

    @property
    def kb_url(self) -> str:
        return f"{self._config['BACKEND']}/kb/{self._kb}"

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts to NucliaDB"""
        ids = []
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        for i, text in enumerate(texts):
            extra: Dict[str, Any] = {"metadata": ""}
            if metadatas:
                extra = {"metadata": metadatas[i]}
            id = factory.create(
                texts={"text": {"body": text}},
                extra=extra,
                url=self.kb_url,
                api_key=self._config["TOKEN"],
            )
            ids.append(id)
        return ids

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        if not ids:
            return None
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        results: List[bool] = []
        for id in ids:
            try:
                factory.delete(rid=id, url=self.kb_url, api_key=self._config["TOKEN"])
                results.append(True)
            except ValueError:
                results.append(False)
        return all(results)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        from nuclia.sdk import NucliaSearch
        from nucliadb_models.search import FindRequest, ResourceProperties

        request = FindRequest(
            query=query,
            page_size=k,
            show=[ResourceProperties.VALUES, ResourceProperties.EXTRA],
        )
        search = NucliaSearch()
        results = search.find(
            query=request, url=self.kb_url, api_key=self._config["TOKEN"]
        )
        paragraphs = []
        for resource in results.resources.values():
            for field in resource.fields.values():
                for paragraph_id, paragraph in field.paragraphs.items():
                    info = paragraph_id.split("/")
                    field_type = FIELD_TYPES.get(info[1], None)
                    field_id = info[2]
                    if not field_type:
                        continue
                    value = getattr(resource.data, field_type, {}).get(field_id, None)
                    paragraphs.append(
                        {
                            "text": paragraph.text,
                            "metadata": {
                                "extra": getattr(
                                    getattr(resource, "extra", {}), "metadata", None
                                ),
                                "value": value,
                            },
                            "order": paragraph.order,
                        }
                    )
        sorted_paragraphs = sorted(paragraphs, key=lambda x: x["order"])
        return [
            Document(page_content=paragraph["text"], metadata=paragraph["metadata"])
            for paragraph in sorted_paragraphs
        ]

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> VST:
        """Return VectorStore initialized from texts and embeddings."""
        raise NotImplementedError
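A usage sketch for the new store (the knowledge box id is a placeholder, and a local NucliaDB is assumed to be running on the default port):

from langchain.vectorstores.nucliadb import NucliaDB

ndb = NucliaDB(knowledge_box="my_kb_id", local=True)  # backend defaults to http://localhost:8080
ids = ndb.add_texts(
    ["NucliaDB is now a LangChain vector store."],
    metadatas=[{"source": "demo"}],
)
docs = ndb.similarity_search("What is NucliaDB?", k=2)
ndb.delete(ids=ids)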
@@ -349,16 +349,16 @@ class PGVector(VectorStore):

    @property
    def distance_strategy(self) -> Any:
        if self._distance_strategy == "l2":
        if self._distance_strategy == DistanceStrategy.EUCLIDEAN:
            return self.EmbeddingStore.embedding.l2_distance
        elif self._distance_strategy == "cosine":
        elif self._distance_strategy == DistanceStrategy.COSINE:
            return self.EmbeddingStore.embedding.cosine_distance
        elif self._distance_strategy == "inner":
        elif self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
            return self.EmbeddingStore.embedding.max_inner_product
        else:
            raise ValueError(
                f"Got unexpected value for distance: {self._distance_strategy}. "
                f"Should be one of `l2`, `cosine`, `inner`."
                f"Should be one of {', '.join([ds.value for ds in DistanceStrategy])}."
            )

    def similarity_search_with_score_by_vector(
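With the comparisons fixed to use the enum, passing a DistanceStrategy member selects the intended operator; a sketch (the connection string is a placeholder, and importing DistanceStrategy from the pgvector module is an assumption):

from langchain.embeddings import FakeEmbeddings
from langchain.vectorstores.pgvector import DistanceStrategy, PGVector

store = PGVector(
    connection_string="postgresql+psycopg2://user:pass@localhost/db",
    embedding_function=FakeEmbeddings(size=16),
    distance_strategy=DistanceStrategy.COSINE,  # resolves to cosine_distance
)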
674
libs/langchain/poetry.lock
generated
File diff suppressed because it is too large
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain"
version = "0.0.280"
version = "0.0.284"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
@@ -8,3 +8,4 @@ _EXAMPLES_DIR = _THIS_DIR / "integration_tests" / "examples"

# Paths to test PDF files
HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"
LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"
DUPLICATE_CHARS = _EXAMPLES_DIR / "duplicate-chars.pdf"
@@ -19,6 +19,10 @@ LAYOUT_PARSER_PAPER_PDF = (
    Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
)

DUPLICATE_CHARS = (
    Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
)


def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
    """Standard tests to verify that the given parser works.
@@ -59,6 +63,26 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
        assert metadata["page"] == 0


def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None:
    """PDFPlumber tests to verify whether duplicate characters appear or not.

    Args:
        parser (BaseBlobParser): The parser to test.
        dedupe (bool): If `True`, expect duplicate characters to be removed.
    """
    blob = Blob.from_path(DUPLICATE_CHARS)
    doc_generator = parser.lazy_parse(blob)
    assert isinstance(doc_generator, Iterator)
    docs = list(doc_generator)

    if dedupe:
        # use dedupe to avoid duplicate characters.
        assert "1000 Series" == docs[0].page_content.split("\n")[0]
    else:
        # duplicate characters will appear in the doc if not deduped
        assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]


def test_pymupdf_loader() -> None:
    """Test PyMuPDF loader."""
    _assert_with_parser(PyMuPDFParser())
@@ -84,3 +108,5 @@ def test_pypdfium2_parser() -> None:
def test_pdfplumber_parser() -> None:
    """Test PDFPlumber parser."""
    _assert_with_parser(PDFPlumberParser())
    _assert_with_duplicate_parser(PDFPlumberParser())
    _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
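Outside the test helper, the new flag is used the same way (the PDF path is a placeholder):

from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import PDFPlumberParser

blob = Blob.from_path("duplicate-chars.pdf")
# dedupe=True collapses doubled glyphs such as "11000000 SSeerriieess".
docs = list(PDFPlumberParser(dedupe=True).lazy_parse(blob))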
@@ -7,7 +7,9 @@ from langchain.document_loaders import PlaywrightURLLoader
from langchain.document_loaders.url_playwright import PlaywrightEvaluator

if TYPE_CHECKING:
    from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
    from playwright.async_api import Browser as AsyncBrowser
    from playwright.async_api import Page as AsyncPage
    from playwright.async_api import Response as AsyncResponse
    from playwright.sync_api import Browser, Page, Response
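The removed import referenced names that playwright.async_api never exported; the async API reuses the sync API's class names, which is why aliases are needed. A quick check (requires playwright installed):

import playwright.async_api as async_api

# The async module exports Browser/Page/Response, not Async*-prefixed names.
assert hasattr(async_api, "Browser")
assert not hasattr(async_api, "AsyncBrowser")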
Binary file not shown.
@@ -0,0 +1,98 @@
from typing import Any
from unittest import mock

from langchain.vectorstores.nucliadb import NucliaDB


class attrdict(dict):
    def __getitem__(self, key: str) -> Any:
        value = dict.__getitem__(self, key)
        return attrdict(value) if isinstance(value, dict) else value

    __getattr__ = __getitem__


def FakeCreate(**args: Any) -> Any:
    def fn(self: Any, **kwargs: Any) -> str:
        return "fake_uuid"

    return fn


def FakeDelete(**args: Any) -> Any:
    def fn(self: Any, **kwargs: Any) -> None:
        return None

    return fn


def FakeFind(**args: Any) -> Any:
    def fn(self: Any, **kwargs: Any) -> Any:
        return attrdict(
            {
                "resources": {
                    "123": attrdict(
                        {
                            "fields": {
                                "456": attrdict(
                                    {
                                        "paragraphs": {
                                            "123/t/text/0-14": attrdict(
                                                {
                                                    "text": "This is a test",
                                                    "order": 0,
                                                }
                                            ),
                                        }
                                    }
                                )
                            },
                            "data": {
                                "texts": {
                                    "text": {
                                        "body": "This is a test",
                                    }
                                }
                            },
                            "extra": attrdict({"metadata": {"some": "metadata"}}),
                        }
                    )
                }
            }
        )

    return fn


def test_add_texts() -> None:
    with mock.patch(
        "nuclia.sdk.resource.NucliaResource.create",
        new_callable=FakeCreate,
    ):
        ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
        assert ndb.is_local is False
        ids = ndb.add_texts(["This is a new test", "This is a second test"])
        assert len(ids) == 2


def test_delete() -> None:
    with mock.patch(
        "nuclia.sdk.resource.NucliaResource.delete",
        new_callable=FakeDelete,
    ):
        ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
        success = ndb.delete(["123", "456"])
        assert success


def test_search() -> None:
    with mock.patch(
        "nuclia.sdk.search.NucliaSearch.find",
        new_callable=FakeFind,
    ):
        ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
        results = ndb.similarity_search("Who was inspired by Ada Lovelace?")
        assert len(results) == 1
        assert results[0].page_content == "This is a test"
        assert results[0].metadata["extra"]["some"] == "metadata"
        assert results[0].metadata["value"]["body"] == "This is a test"
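The new_callable=FakeCreate pattern works because mock.patch calls the factory once and installs whatever it returns, so each Fake* helper hands back a plain function that becomes the patched method. The same pattern in miniature (all names here are illustrative):

from typing import Any
from unittest import mock

class Client:
    def fetch(self) -> str:
        return "real"

def FakeFetch(**kwargs: Any) -> Any:
    def fn(self: Any) -> str:
        return "fake"

    return fn

with mock.patch(f"{__name__}.Client.fetch", new_callable=FakeFetch):
    assert Client().fetch() == "fake"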
File diff suppressed because one or more lines are too long
@@ -948,7 +948,7 @@ async def test_higher_order_lambda_runnable(
    parent_run = next(r for r in tracer.runs if r.parent_run_id is None)
    assert len(parent_run.child_runs) == 2
    router_run = parent_run.child_runs[1]
    assert router_run.name == "RunnableLambda"
    assert router_run.name == "router"
    assert len(router_run.child_runs) == 1
    math_run = router_run.child_runs[0]
    assert math_run.name == "RunnableSequence"
@@ -980,7 +980,7 @@ async def test_higher_order_lambda_runnable(
    parent_run = next(r for r in tracer.runs if r.parent_run_id is None)
    assert len(parent_run.child_runs) == 2
    router_run = parent_run.child_runs[1]
    assert router_run.name == "RunnableLambda"
    assert router_run.name == "arouter"
    assert len(router_run.child_runs) == 1
    math_run = router_run.child_runs[0]
    assert math_run.name == "RunnableSequence"
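These assertions change because a lambda's traced run now takes the wrapped function's name instead of the generic class name (inferred from the updated expectations; a sketch):

from langchain.schema.runnable import RunnableLambda

def router(input: str) -> str:
    return input

chain = RunnableLambda(router)
# When traced, this step is now recorded as "router", not "RunnableLambda".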