Compare commits

...

23 Commits

Author SHA1 Message Date
vowelparrot
f24b6fd5a0 Fix linting 2023-04-21 15:39:24 -07:00
Varun Srinivas
45003d9f79 Change in method name for creating an issue on JIRA (#3307)
The awesome JIRA tool created by @zywilliamli calls the `create_issue()`
method to create issues, however, the actual method is `issue_create()`.

Details in the Documentation here:
https://atlassian-python-api.readthedocs.io/jira.html#manage-issues
2023-04-21 15:36:44 -07:00
Davis Chase
0e29a22444 Update docs api references (#3315) 2023-04-21 15:36:44 -07:00
Paul Garner
b5b2b240e7 Add PythonLoader which auto-detects encoding of Python files (#3311)
This PR contributes a `PythonLoader`, which inherits from
`TextLoader` but detects and sets the encoding automatically.
2023-04-21 15:36:44 -07:00
Daniel Chalef
18c404e411 Fix example match_documents fn table name, grammar (#3294)
ref
https://github.com/hwchase17/langchain/pull/3100#issuecomment-1517086472

Co-authored-by: Daniel Chalef <daniel.chalef@private.org>
2023-04-21 15:36:44 -07:00
Davis Chase
92232b22bb Cleanup integration test dir (#3308) 2023-04-21 15:36:44 -07:00
leo-gan
093f2a5ec1 added links to the important YouTube videos (#3244)
Added links to the important YouTube videos
2023-04-21 15:36:44 -07:00
Sertaç Özercan
789c0270a2 fix: handle youtube TranscriptsDisabled (#3276)
handles error when youtube video has transcripts disabled

```
youtube_transcript_api._errors.TranscriptsDisabled: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=<URL> This is most likely caused by:

Subtitles are disabled for this video

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!
```

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>
2023-04-21 15:36:44 -07:00
Alexandre Pesant
ad7dc1513a Do not print openai settings (#3280)
There's no reason to print these settings like that, it just pollutes
the logs :)
2023-04-21 15:36:44 -07:00
Zander Chase
3bda9876be Handle null action in AutoGPT Agent (#3274)
Handle the case where the command is `null`
2023-04-21 15:36:44 -07:00
Harrison Chase
12121e4f6f bump version 146 (#3272) 2023-04-21 15:36:44 -07:00
Harrison Chase
958e1f49ce gradio tools (#3255) 2023-04-21 15:36:44 -07:00
Naveen Tatikonda
451b947ab7 OpenSearch: Add Support for Lucene Filter (#3201)
### Description
Add Support for Lucene Filter. When you specify a Lucene filter for a
k-NN search, the Lucene algorithm decides whether to perform an exact
k-NN search with pre-filtering or an approximate search with modified
post-filtering. This filter is supported only for approximate search
with the indexes that are created using `lucene` engine.

OpenSearch Documentation -
https://opensearch.org/docs/latest/search-plugins/knn/filter-search-knn/#lucene-k-nn-filter-implementation

Signed-off-by: Naveen Tatikonda <navtat@amazon.com>
2023-04-21 15:36:44 -07:00
Davis Chase
6a98faca88 Hf emb device (#3266)
Make it possible to control the HuggingFaceEmbeddings and HuggingFaceInstructEmbeddings client model kwargs. Additionally, the cache folder was added for HuggingFaceInstructEmbedding as the client inherits from SentenceTransformer (client of HuggingFaceEmbeddings).

It can be useful, especially to control the client device, as it will be defaulted to GPU by sentence_transformers if there is any.

---------

Co-authored-by: Yoann Poupart <66315201+Xmaster6y@users.noreply.github.com>
2023-04-21 15:36:44 -07:00
Zach Jones
a58df205ce Fix type annotation for QueryCheckerTool.llm (#3237)
Currently `langchain.tools.sql_database.tool.QueryCheckerTool` has a
field `llm` with type `BaseLLM`. This breaks initialization for some
LLMs. For example, trying to use it with GPT4:

```python
from langchain.sql_database import SQLDatabase
from langchain.chat_models import ChatOpenAI
from langchain.tools.sql_database.tool import QueryCheckerTool


db = SQLDatabase.from_uri("some_db_uri")
llm = ChatOpenAI(model_name="gpt-4")
tool = QueryCheckerTool(db=db, llm=llm)

# pydantic.error_wrappers.ValidationError: 1 validation error for QueryCheckerTool
# llm
#   Can't instantiate abstract class BaseLLM with abstract methods _agenerate, _generate, _llm_type (type=type_error)
```

Seems like much of the rest of the codebase has switched from `BaseLLM`
to `BaseLanguageModel`. This PR makes the change for QueryCheckerTool as
well

Co-authored-by: Zachary Jones <zjones@zetaglobal.com>
2023-04-21 15:36:44 -07:00
Davis Chase
d2173e089a Contextual compression retriever (#2915)
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
2023-04-21 15:36:44 -07:00
Matt Robinson
be672527a3 feat: add loader for rich text files (#3227)
### Summary

Adds a loader for rich text files. Requires `unstructured>=0.5.12`.

### Testing

The following test uses the example RTF file from the [`unstructured`
repo](https://github.com/Unstructured-IO/unstructured/tree/main/example-docs).

```python
from langchain.document_loaders import UnstructuredRTFLoader

loader = UnstructuredRTFLoader("fake-doc.rtf", mode="elements")
docs = loader.load()
docs[0].page_content
```
2023-04-21 15:36:44 -07:00
Harrison Chase
99aacb50bf add to docs 2023-04-21 15:36:44 -07:00
Albert Castellana
653f3da74c Ecosystem/Yeager.ai (#3239)
Added yeagerai.md to ecosystem
2023-04-21 15:36:44 -07:00
Boris Feld
48434180e5 Fixing issue link for Comet callback (#3212)
Sorry I fixed that link once but there was still a typo inside, this
time it should be good.
2023-04-21 15:36:44 -07:00
Daniel Chalef
fc4de351b1 fix error msg ref to beautifulsoup4 (#3242)
Co-authored-by: Daniel Chalef <daniel.chalef@private.org>
2023-04-21 15:36:44 -07:00
Tom Dyson
1b437f3dc0 Add DuckDB prompt (#3233)
Adds a prompt template for the DuckDB SQL dialect.
2023-04-21 15:36:44 -07:00
Zander Chase
eda2408aed Patch Chat History Formatting (#3236)
While we work on solidifying the memory interfaces, handle common chat
history formats.

This may break linting on anyone who has been passing in
`get_chat_history` .

Somewhat handles #3077

Alternative to #3078 that updates the typing
2023-04-21 15:36:44 -07:00
82 changed files with 1863 additions and 104 deletions

View File

@@ -0,0 +1,43 @@
# Yeager.ai
This page covers how to use [Yeager.ai](https://yeager.ai) to generate LangChain tools and agents.
## What is Yeager.ai?
Yeager.ai is an ecosystem designed to simplify the process of creating AI agents and tools.
It features yAgents, a No-code LangChain Agent Builder, which enables users to build, test, and deploy AI solutions with ease. Leveraging the LangChain framework, yAgents allows seamless integration with various language models and resources, making it suitable for developers, researchers, and AI enthusiasts across diverse applications.
## yAgents
Low code generative agent designed to help you build, prototype, and deploy Langchain tools with ease.
### How to use?
```
pip install yeagerai-agent
yeagerai-agent
```
Go to http://127.0.0.1:7860
This will install the necessary dependencies and set up yAgents on your system. After the first run, yAgents will create a .env file where you can input your OpenAI API key. You can do the same directly from the Gradio interface under the tab "Settings".
`OPENAI_API_KEY=<your_openai_api_key_here>`
We recommend using GPT-4,. However, the tool can also work with GPT-3 if the problem is broken down sufficiently.
### Creating and Executing Tools with yAgents
yAgents makes it easy to create and execute AI-powered tools. Here's a brief overview of the process:
1. Create a tool: To create a tool, provide a natural language prompt to yAgents. The prompt should clearly describe the tool's purpose and functionality. For example:
`create a tool that returns the n-th prime number`
2. Load the tool into the toolkit: To load a tool into yAgents, simply provide a command to yAgents that says so. For example:
`load the tool that you just created it into your toolkit`
3. Execute the tool: To run a tool or agent, simply provide a command to yAgents that includes the name of the tool and any required parameters. For example:
`generate the 50th prime number`
You can see a video of how it works [here](https://www.youtube.com/watch?v=KA5hCM3RaWE).
As you become more familiar with yAgents, you can create more advanced tools and agents to automate your work and enhance your productivity.
For more information, see [yAgents' Github](https://github.com/yeagerai/yeagerai-agent) or our [docs](https://yeagerai.gitbook.io/docs/general/welcome-to-yeager.ai)

File diff suppressed because one or more lines are too long

View File

@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "019d8520",
"metadata": {},
"outputs": [],
@@ -128,10 +128,69 @@
"len(docs)"
]
},
{
"cell_type": "markdown",
"id": "598a2805",
"metadata": {},
"source": [
"If you need to load Python source code files, use the `PythonLoader`."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c558bd73",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PythonLoader"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "a3cfaba7",
"metadata": {},
"outputs": [],
"source": [
"loader = DirectoryLoader('../../../../../', glob=\"**/*.py\", loader_cls=PythonLoader)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e2e1e26a",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ffb8ff36",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"691"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "984c8429",
"id": "7f6e0eae",
"metadata": {},
"outputs": [],
"source": []
@@ -153,7 +212,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.10.3"
}
},
"nbformat": 4,

View File

@@ -0,0 +1,371 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fc0db1bc",
"metadata": {},
"source": [
"# Contextual Compression Retriever\n",
"\n",
"This notebook introduces the concept of DocumentCompressors and the ContextualCompressionRetriever. The core idea is simple: given a specific query, we should be able to return only the documents relevant to that query, and only the parts of those documents that are relevant. The ContextualCompressionsRetriever is a wrapper for another retriever that iterates over the initial output of the base retriever and filters and compresses those initial documents, so that only the most relevant information is returned."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "28e8dc12",
"metadata": {},
"outputs": [],
"source": [
"# Helper function for printing docs\n",
"\n",
"def pretty_print_docs(docs):\n",
" print(f\"\\n{'-' * 100}\\n\".join([f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]))"
]
},
{
"cell_type": "markdown",
"id": "6fa3d916",
"metadata": {},
"source": [
"## Using a vanilla vector store retriever\n",
"Let's start by initializing a simple vector store retriever and storing the 2023 State of the Union speech (in chunks). We can see that given an example question our retriever returns one or two relevant docs and a few irrelevant docs. And even the relevant docs have a lot of irrelevant information in them."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9fbcc58f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document 1:\n",
"\n",
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while youre at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n",
"----------------------------------------------------------------------------------------------------\n",
"Document 2:\n",
"\n",
"A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since shes been nominated, shes received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n",
"\n",
"And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n",
"\n",
"We can do both. At our border, weve installed new technology like cutting-edge scanners to better detect drug smuggling. \n",
"\n",
"Weve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n",
"\n",
"Were putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \n",
"\n",
"Were securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n",
"----------------------------------------------------------------------------------------------------\n",
"Document 3:\n",
"\n",
"And for our LGBTQ+ Americans, lets finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \n",
"\n",
"As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n",
"\n",
"While it often appears that we never agree, that isnt true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \n",
"\n",
"And soon, well strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \n",
"\n",
"So tonight Im offering a Unity Agenda for the Nation. Four big things we can do together. \n",
"\n",
"First, beat the opioid epidemic.\n",
"----------------------------------------------------------------------------------------------------\n",
"Document 4:\n",
"\n",
"Tonight, Im announcing a crackdown on these companies overcharging American businesses and consumers. \n",
"\n",
"And as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \n",
"\n",
"That ends on my watch. \n",
"\n",
"Medicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \n",
"\n",
"Well also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \n",
"\n",
"Lets pass the Paycheck Fairness Act and paid leave. \n",
"\n",
"Raise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \n",
"\n",
"Lets increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls Americas best-kept secret: community colleges.\n"
]
}
],
"source": [
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.document_loaders import TextLoader\n",
"from langchain.vectorstores import FAISS\n",
"\n",
"documents = TextLoader('../../../state_of_the_union.txt').load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_documents(documents)\n",
"retriever = FAISS.from_documents(texts, OpenAIEmbeddings()).as_retriever()\n",
"\n",
"docs = retriever.get_relevant_documents(\"What did the president say about Ketanji Brown Jackson\")\n",
"pretty_print_docs(docs)"
]
},
{
"cell_type": "markdown",
"id": "b7648612",
"metadata": {},
"source": [
"## Adding contextual compression with an `LLMChainExtractor`\n",
"Now let's wrap our base retriever with a `ContextualCompressionRetriever`. We'll add an `LLMChainExtractor`, which will iterate over the initially returned documents and extract from each only the content that is relevant to the query."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9a658023",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document 1:\n",
"\n",
"\"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\"\n",
"----------------------------------------------------------------------------------------------------\n",
"Document 2:\n",
"\n",
"\"A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since shes been nominated, shes received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\"\n"
]
}
],
"source": [
"from langchain.llms import OpenAI\n",
"from langchain.retrievers import ContextualCompressionRetriever\n",
"from langchain.retrievers.document_compressors import LLMChainExtractor\n",
"\n",
"llm = OpenAI(temperature=0)\n",
"compressor = LLMChainExtractor.from_llm(llm)\n",
"compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)\n",
"\n",
"compressed_docs = compression_retriever.get_relevant_documents(\"What did the president say about Ketanji Jackson Brown\")\n",
"pretty_print_docs(compressed_docs)"
]
},
{
"cell_type": "markdown",
"id": "2cd38f3a",
"metadata": {},
"source": [
"## More built-in compressors: filters\n",
"### `LLMChainFilter`\n",
"The `LLMChainFilter` is slightly simpler but more robust compressor that uses an LLM chain to decide which of the initially retrieved documents to filter out and which ones to return, without manipulating the document contents."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b216a767",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document 1:\n",
"\n",
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while youre at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n"
]
}
],
"source": [
"from langchain.retrievers.document_compressors import LLMChainFilter\n",
"\n",
"_filter = LLMChainFilter.from_llm(llm)\n",
"compression_retriever = ContextualCompressionRetriever(base_compressor=_filter, base_retriever=retriever)\n",
"\n",
"compressed_docs = compression_retriever.get_relevant_documents(\"What did the president say about Ketanji Jackson Brown\")\n",
"pretty_print_docs(compressed_docs)"
]
},
{
"cell_type": "markdown",
"id": "8c709598",
"metadata": {},
"source": [
"### `EmbeddingsFilter`\n",
"\n",
"Making an extra LLM call over each retrieved document is expensive and slow. The `EmbeddingsFilter` provides a cheaper and faster option by embedding the documents and query and only returning those documents which have sufficiently similar embeddings to the query."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6fbc801f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document 1:\n",
"\n",
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while youre at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n",
"----------------------------------------------------------------------------------------------------\n",
"Document 2:\n",
"\n",
"A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since shes been nominated, shes received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n",
"\n",
"And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n",
"\n",
"We can do both. At our border, weve installed new technology like cutting-edge scanners to better detect drug smuggling. \n",
"\n",
"Weve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n",
"\n",
"Were putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \n",
"\n",
"Were securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n",
"----------------------------------------------------------------------------------------------------\n",
"Document 3:\n",
"\n",
"And for our LGBTQ+ Americans, lets finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \n",
"\n",
"As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n",
"\n",
"While it often appears that we never agree, that isnt true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \n",
"\n",
"And soon, well strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \n",
"\n",
"So tonight Im offering a Unity Agenda for the Nation. Four big things we can do together. \n",
"\n",
"First, beat the opioid epidemic.\n"
]
}
],
"source": [
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.retrievers.document_compressors import EmbeddingsFilter\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)\n",
"compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)\n",
"\n",
"compressed_docs = compression_retriever.get_relevant_documents(\"What did the president say about Ketanji Jackson Brown\")\n",
"pretty_print_docs(compressed_docs)"
]
},
{
"cell_type": "markdown",
"id": "07365d36",
"metadata": {},
"source": [
"# Stringing compressors and document transformers together\n",
"Using the `DocumentCompressorPipeline` we can also easily combine multiple compressors in sequence. Along with compressors we can add `BaseDocumentTransformer`s to our pipeline, which don't perform any contextual compression but simply perform some transformation on a set of documents. For example `TextSplitter`s can be used as document transformers to split documents into smaller pieces, and the `EmbeddingsRedundantFilter` can be used to filter out redundant documents based on embedding similarity between documents.\n",
"\n",
"Below we create a compressor pipeline by first splitting our docs into smaller chunks, then removing redundant documents, and then filtering based on relevance to the query."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2a150a63",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_transformers import EmbeddingsRedundantFilter\n",
"from langchain.retrievers.document_compressors import DocumentCompressorPipeline\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"\n",
"splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=\". \")\n",
"redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)\n",
"relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)\n",
"pipeline_compressor = DocumentCompressorPipeline(\n",
" transformers=[splitter, redundant_filter, relevant_filter]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3ceab64a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document 1:\n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson\n",
"----------------------------------------------------------------------------------------------------\n",
"Document 2:\n",
"\n",
"As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n",
"\n",
"While it often appears that we never agree, that isnt true. I signed 80 bipartisan bills into law last year\n",
"----------------------------------------------------------------------------------------------------\n",
"Document 3:\n",
"\n",
"A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder\n"
]
}
],
"source": [
"compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=retriever)\n",
"\n",
"compressed_docs = compression_retriever.get_relevant_documents(\"What did the president say about Ketanji Jackson Brown\")\n",
"pretty_print_docs(compressed_docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8cfd9fc5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "683953b3",
"metadata": {},
@@ -35,7 +36,7 @@
" id bigint,\n",
" content text,\n",
" metadata jsonb,\n",
" -- we return matched vectors to allow to execute maximal marginal relevance searches\n",
" -- we return matched vectors to enable maximal marginal relevance searches\n",
" embedding vector(1536),\n",
" similarity float)\n",
" LANGUAGE plpgsql\n",
@@ -48,11 +49,11 @@
" content,\n",
" metadata,\n",
" embedding,\n",
" 1 -(docstore.embedding <=> query_embedding) AS similarity\n",
" 1 -(documents.embedding <=> query_embedding) AS similarity\n",
" FROM\n",
" docstore\n",
" documents\n",
" ORDER BY\n",
" docstore.embedding <=> query_embedding\n",
" documents.embedding <=> query_embedding\n",
" LIMIT match_count;\n",
" END;\n",
" $$;\n",
@@ -390,7 +391,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.11.3"
}
},
"nbformat": 4,

View File

@@ -7,8 +7,10 @@ Full documentation on all methods, classes, and APIs in LangChain.
.. toctree::
:maxdepth: 1
./reference/models.rst
./reference/prompts.rst
LLMs<./reference/modules/llms>
./reference/utils.rst
Chains<./reference/modules/chains>
Agents<./reference/modules/agents>
./reference/indexes.rst
./reference/modules/memory.rst
./reference/modules/chains.rst
./reference/agents.rst
./reference/modules/utilities.rst

12
docs/reference/agents.rst Normal file
View File

@@ -0,0 +1,12 @@
Agents
==============
Reference guide for Agents and associated abstractions.
.. toctree::
:maxdepth: 1
:glob:
modules/agents
modules/tools
modules/agent_toolkits

View File

@@ -0,0 +1,16 @@
Indexes
==============
Indexes refer to ways to structure documents so that LLMs can best interact with them.
LangChain has a number of modules that help you load, structure, store, and retrieve documents.
.. toctree::
:maxdepth: 1
:glob:
modules/docstore
modules/text_splitter
modules/document_loaders
modules/vectorstores
modules/retrievers
modules/document_compressors
modules/document_transformers

12
docs/reference/models.rst Normal file
View File

@@ -0,0 +1,12 @@
Models
==============
LangChain provides interfaces and integrations for a number of different types of models.
.. toctree::
:maxdepth: 1
:glob:
modules/llms
modules/chat_models
modules/embeddings

View File

@@ -0,0 +1,7 @@
Agent Toolkits
===============================
.. automodule:: langchain.agents.agent_toolkits
:members:
:undoc-members:

View File

@@ -0,0 +1,7 @@
Chat Models
===============================
.. automodule:: langchain.chat_models
:members:
:undoc-members:

View File

@@ -0,0 +1,7 @@
Document Compressors
===============================
.. automodule:: langchain.retrievers.document_compressors
:members:
:undoc-members:

View File

@@ -0,0 +1,7 @@
Document Loaders
===============================
.. automodule:: langchain.document_loaders
:members:
:undoc-members:

View File

@@ -0,0 +1,7 @@
Document Transformers
===============================
.. automodule:: langchain.document_transformers
:members:
:undoc-members:

View File

@@ -0,0 +1,7 @@
Memory
===============================
.. automodule:: langchain.memory
:members:
:undoc-members:

View File

@@ -0,0 +1,7 @@
Output Parsers
===============================
.. automodule:: langchain.output_parsers
:members:
:undoc-members:

View File

@@ -0,0 +1,7 @@
Retrievers
===============================
.. automodule:: langchain.retrievers
:members:
:undoc-members:

View File

@@ -0,0 +1,7 @@
Tools
===============================
.. automodule:: langchain.tools
:members:
:undoc-members:

View File

@@ -0,0 +1,7 @@
Utilities
===============================
.. automodule:: langchain.utilities
:members:
:undoc-members:

View File

@@ -1,4 +1,4 @@
VectorStores
Vector Stores
=============================
.. automodule:: langchain.vectorstores

View File

@@ -7,5 +7,6 @@ The reference guides here all relate to objects for working with Prompts.
:maxdepth: 1
:glob:
modules/prompt
modules/prompts
modules/example_selector
modules/output_parsers

View File

@@ -1,27 +0,0 @@
Utilities
==============
There are a lot of different utilities that LangChain provides integrations for
These guides go over how to use them.
These can largely be grouped into two categories: generic utilities, and then utilities for working with larger text documents.
.. toctree::
:maxdepth: 1
:glob:
:caption: Generic Utilities
modules/python
modules/serpapi
modules/searx_search
.. toctree::
:maxdepth: 1
:glob:
:caption: Utilities for working with Documents
modules/docstore
modules/text_splitter
modules/embeddings
modules/vectorstore

View File

@@ -20,3 +20,4 @@ Highlighting specific parts:
Specific examples of this include:
- [AI Plugins](agents/custom_agent_with_plugin_retrieval.ipynb): an implementation of an agent that is designed to be able to use all AI Plugins.
- [Wikibase Agent](agents/wikibase_agent.ipynb): an implementation of an agent that is designed to interact with Wikibase.

View File

@@ -1,9 +1,11 @@
# YouTube
This is a collection of LangChain tutorials and videos.
This is a collection of `LangChain` tutorials and videos on `YouTube`.
### Introduction to LangChain with Harrison Chase creator of LangChain
- [LangChain Demo + Q&A with Harrison Chase](https://youtu.be/zaYTXQFR0_s) by [Full Stack Deep Learning](https://www.youtube.com/@FullStackDeepLearning)
### Introduction to LangChain with Harrison Chase, creator of LangChain
- [Building the Future with LLMs, `LangChain`, & `Pinecone`](https://youtu.be/nMniwlGyX-c) by [Pinecone](https://www.youtube.com/@pinecone-io)
- [LangChain and Weaviate with Harrison Chase and Bob van Luijt - Weaviate Podcast #36](https://youtu.be/lhby7Ql7hbk) by [Weaviate • Vector Database](https://www.youtube.com/@Weaviate)
- [LangChain Demo + Q&A with Harrison Chase](https://youtu.be/zaYTXQFR0_s?t=788) by [Full Stack Deep Learning](https://www.youtube.com/@FullStackDeepLearning)
## Tutorials
@@ -81,7 +83,6 @@ This is a collection of LangChain tutorials and videos.
- [The easiest way to work with large language models | Learn LangChain in 10min](https://youtu.be/kmbS6FDQh7c) by [Sophia Yang](https://www.youtube.com/@SophiaYangDS)
- [4 Autonomous AI Agents: “Westworld” simulation `BabyAGI`, `AutoGPT`, `Camel`, `LangChain`](https://youtu.be/yWbnH6inT_U) by [Sophia Yang](https://www.youtube.com/@SophiaYangDS)
- [AI CAN SEARCH THE INTERNET? Langchain Agents + OpenAI ChatGPT](https://youtu.be/J-GL0htqda8) by [tylerwhatsgood](https://www.youtube.com/@tylerwhatsgood)
- [Building the Future with LLMs, LangChain, & `Pinecone`](https://youtu.be/nMniwlGyX-c) by [Pinecone](https://www.youtube.com/@pinecone-io)
- [`Weaviate` + LangChain for LLM apps presented by Erika Cardenas](https://youtu.be/7AGj4Td5Lgw) by [`Weaviate` • Vector Database](https://www.youtube.com/@Weaviate)
- [Analyze Custom `CSV` Data with `GPT-4` using Langchain](https://youtu.be/Ew3sGdX8at4) by [Venelin Valkov](https://www.youtube.com/@venelin_valkov)
- [Langchain Overview - How to Use Langchain & `ChatGPT`](https://youtu.be/oYVYIq0lOtI) by [Python In Office](https://www.youtube.com/@pythoninoffice6568)

View File

@@ -130,7 +130,7 @@ class CometCallbackHandler(BaseMetadataCallbackHandler, BaseCallbackHandler):
warning = (
"The comet_ml callback is currently in beta and is subject to change "
"based on updates to `langchain`. Please report any issues to "
"https://github.com/comet-ml/issue_tracking/issues with the tag "
"https://github.com/comet-ml/issue-tracking/issues with the tag "
"`langchain`."
)
self.comet_ml.LOGGER.warning(warning)

View File

@@ -15,16 +15,32 @@ from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_
from langchain.chains.llm import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts.base import BasePromptTemplate
from langchain.schema import BaseLanguageModel, BaseRetriever, Document
from langchain.schema import BaseLanguageModel, BaseMessage, BaseRetriever, Document
from langchain.vectorstores.base import VectorStore
# Depending on the memory type and configuration, the chat history format may differ.
# This needs to be consolidated.
CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
def _get_chat_history(chat_history: List[Tuple[str, str]]) -> str:
_ROLE_MAP = {"human": "Human: ", "ai": "Assistant: "}
def _get_chat_history(chat_history: List[CHAT_TURN_TYPE]) -> str:
buffer = ""
for human_s, ai_s in chat_history:
human = "Human: " + human_s
ai = "Assistant: " + ai_s
buffer += "\n" + "\n".join([human, ai])
for dialogue_turn in chat_history:
if isinstance(dialogue_turn, BaseMessage):
role_prefix = _ROLE_MAP.get(dialogue_turn.type, f"{dialogue_turn.type}: ")
buffer += f"\n{role_prefix}{dialogue_turn.content}"
elif isinstance(dialogue_turn, tuple):
human = "Human: " + dialogue_turn[0]
ai = "Assistant: " + dialogue_turn[1]
buffer += "\n" + "\n".join([human, ai])
else:
raise ValueError(
f"Unsupported chat history format: {type(dialogue_turn)}."
f" Full chat history: {chat_history} "
)
return buffer
@@ -35,7 +51,7 @@ class BaseConversationalRetrievalChain(Chain):
question_generator: LLMChain
output_key: str = "answer"
return_source_documents: bool = False
get_chat_history: Optional[Callable[[Tuple[str, str]], str]] = None
get_chat_history: Optional[Callable[[CHAT_TURN_TYPE], str]] = None
"""Return the source documents."""
class Config:

View File

@@ -40,6 +40,27 @@ DECIDER_PROMPT = PromptTemplate(
output_parser=CommaSeparatedListOutputParser(),
)
_duckdb_prompt = """You are a DuckDB expert. Given an input question, first create a syntactically correct DuckDB query to run, then look at the results of the query and return the answer to the input question.
Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per DuckDB. You can order the results to return the most informative data in the database.
Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
Use the following format:
Question: "Question here"
SQLQuery: "SQL Query to run"
SQLResult: "Result of the SQLQuery"
Answer: "Final answer here"
Only use the following tables:
{table_info}
Question: {input}"""
DUCKDB_PROMPT = PromptTemplate(
input_variables=["input", "table_info", "top_k"],
template=_duckdb_prompt,
)
_googlesql_prompt = """You are a GoogleSQL expert. Given an input question, first create a syntactically correct GoogleSQL query to run, then look at the results of the query and return the answer to the input question.
Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per GoogleSQL. You can order the results to return the most informative data in the database.
@@ -201,6 +222,7 @@ SQLITE_PROMPT = PromptTemplate(
SQL_PROMPTS = {
"duckdb": DUCKDB_PROMPT,
"googlesql": GOOGLESQL_PROMPT,
"mssql": MSSQL_PROMPT,
"mysql": MYSQL_PROMPT,

View File

@@ -55,8 +55,10 @@ from langchain.document_loaders.pdf import (
UnstructuredPDFLoader,
)
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain.document_loaders.python import PythonLoader
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.sitemap import SitemapLoader
@@ -106,6 +108,7 @@ __all__ = [
"OutlookMessageLoader",
"UnstructuredEPubLoader",
"UnstructuredMarkdownLoader",
"UnstructuredRTFLoader",
"RoamLoader",
"YoutubeLoader",
"S3FileLoader",
@@ -154,4 +157,5 @@ __all__ = [
"ImageCaptionLoader",
"DiscordChatLoader",
"ConfluenceLoader",
"PythonLoader",
]

View File

@@ -24,7 +24,8 @@ class BSHTMLLoader(BaseLoader):
import bs4 # noqa:F401
except ImportError:
raise ValueError(
"bs4 package not found, please install it with " "`pip install bs4`"
"beautifulsoup4 package not found, please install it with "
"`pip install beautifulsoup4`"
)
self.file_path = file_path

View File

@@ -0,0 +1,14 @@
import tokenize
from langchain.document_loaders.text import TextLoader
class PythonLoader(TextLoader):
"""
Load Python files, respecting any non-default encoding if specified.
"""
def __init__(self, file_path: str):
with open(file_path, "rb") as f:
encoding, _ = tokenize.detect_encoding(f.readline)
super().__init__(file_path=file_path, encoding=encoding)

View File

@@ -0,0 +1,28 @@
"""Loader that loads rich text files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
)
class UnstructuredRTFLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load rtf files."""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
):
min_unstructured_version = "0.5.12"
if not satisfies_min_unstructured_version(min_unstructured_version):
raise ValueError(
"Partitioning rtf files is only supported in "
f"unstructured>={min_unstructured_version}."
)
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.partition.rtf import partition_rtf
return partition_rtf(filename=self.file_path, **self.unstructured_kwargs)

View File

@@ -114,7 +114,11 @@ class YoutubeLoader(BaseLoader):
def load(self) -> List[Document]:
"""Load documents."""
try:
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
from youtube_transcript_api import (
NoTranscriptFound,
TranscriptsDisabled,
YouTubeTranscriptApi,
)
except ImportError:
raise ImportError(
"Could not import youtube_transcript_api python package. "
@@ -129,7 +133,11 @@ class YoutubeLoader(BaseLoader):
video_info = self._get_video_info()
metadata.update(video_info)
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
try:
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)
except TranscriptsDisabled:
return []
try:
transcript = transcript_list.find_transcript([self.language])
except NoTranscriptFound:

View File

@@ -0,0 +1,100 @@
"""Transform documents"""
from typing import Any, Callable, List, Sequence
import numpy as np
from pydantic import BaseModel, Field
from langchain.embeddings.base import Embeddings
from langchain.math_utils import cosine_similarity
from langchain.schema import BaseDocumentTransformer, Document
class _DocumentWithState(Document):
"""Wrapper for a document that includes arbitrary state."""
state: dict = Field(default_factory=dict)
"""State associated with the document."""
def to_document(self) -> Document:
"""Convert the DocumentWithState to a Document."""
return Document(page_content=self.page_content, metadata=self.metadata)
@classmethod
def from_document(cls, doc: Document) -> "_DocumentWithState":
"""Create a DocumentWithState from a Document."""
if isinstance(doc, cls):
return doc
return cls(page_content=doc.page_content, metadata=doc.metadata)
def get_stateful_documents(
documents: Sequence[Document],
) -> Sequence[_DocumentWithState]:
return [_DocumentWithState.from_document(doc) for doc in documents]
def _filter_similar_embeddings(
embedded_documents: List[List[float]], similarity_fn: Callable, threshold: float
) -> List[int]:
"""Filter redundant documents based on the similarity of their embeddings."""
similarity = np.tril(similarity_fn(embedded_documents, embedded_documents), k=-1)
redundant = np.where(similarity > threshold)
redundant_stacked = np.column_stack(redundant)
redundant_sorted = np.argsort(similarity[redundant])[::-1]
included_idxs = set(range(len(embedded_documents)))
for first_idx, second_idx in redundant_stacked[redundant_sorted]:
if first_idx in included_idxs and second_idx in included_idxs:
# Default to dropping the second document of any highly similar pair.
included_idxs.remove(second_idx)
return list(sorted(included_idxs))
def _get_embeddings_from_stateful_docs(
embeddings: Embeddings, documents: Sequence[_DocumentWithState]
) -> List[List[float]]:
if len(documents) and "embedded_doc" in documents[0].state:
embedded_documents = [doc.state["embedded_doc"] for doc in documents]
else:
embedded_documents = embeddings.embed_documents(
[d.page_content for d in documents]
)
for doc, embedding in zip(documents, embedded_documents):
doc.state["embedded_doc"] = embedding
return embedded_documents
class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
"""Filter that drops redundant documents by comparing their embeddings."""
embeddings: Embeddings
"""Embeddings to use for embedding document contents."""
similarity_fn: Callable = cosine_similarity
"""Similarity function for comparing documents. Function expected to take as input
two matrices (List[List[float]]) and return a matrix of scores where higher values
indicate greater similarity."""
similarity_threshold: float = 0.95
"""Threshold for determining when two documents are similar enough
to be considered redundant."""
class Config:
"""Configuration for this pydantic object."""
arbitrary_types_allowed = True
def transform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
"""Filter down documents."""
stateful_documents = get_stateful_documents(documents)
embedded_documents = _get_embeddings_from_stateful_docs(
self.embeddings, stateful_documents
)
included_idxs = _filter_similar_embeddings(
embedded_documents, self.similarity_fn, self.similarity_threshold
)
return [stateful_documents[i] for i in sorted(included_idxs)]
async def atransform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
raise NotImplementedError

View File

@@ -1,7 +1,7 @@
"""Wrapper around HuggingFace embedding models."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Extra
from pydantic import BaseModel, Extra, Field
from langchain.embeddings.base import Embeddings
@@ -22,8 +22,10 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
.. code-block:: python
from langchain.embeddings import HuggingFaceEmbeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
hf = HuggingFaceEmbeddings(model_name=model_name)
model_kwargs = {'device': 'cpu'}
hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
"""
client: Any #: :meta private:
@@ -32,6 +34,8 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
cache_folder: Optional[str] = None
"""Path to store models.
Can be also set by SENTENCE_TRANSFORMERS_HOME enviroment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Key word arguments to pass to the model."""
def __init__(self, **kwargs: Any):
"""Initialize the sentence_transformer."""
@@ -40,7 +44,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
import sentence_transformers
self.client = sentence_transformers.SentenceTransformer(
self.model_name, self.cache_folder
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
)
except ImportError:
raise ValueError(
@@ -90,13 +94,22 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
.. code-block:: python
from langchain.embeddings import HuggingFaceInstructEmbeddings
model_name = "hkunlp/instructor-large"
hf = HuggingFaceInstructEmbeddings(model_name=model_name)
model_kwargs = {'device': 'cpu'}
hf = HuggingFaceInstructEmbeddings(
model_name=model_name, model_kwargs=model_kwargs
)
"""
client: Any #: :meta private:
model_name: str = DEFAULT_INSTRUCT_MODEL
"""Model name to use."""
cache_folder: Optional[str] = None
"""Path to store models.
Can be also set by SENTENCE_TRANSFORMERS_HOME enviroment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Key word arguments to pass to the model."""
embed_instruction: str = DEFAULT_EMBED_INSTRUCTION
"""Instruction to use for embedding documents."""
query_instruction: str = DEFAULT_QUERY_INSTRUCTION
@@ -108,7 +121,9 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
try:
from InstructorEmbedding import INSTRUCTOR
self.client = INSTRUCTOR(self.model_name)
self.client = INSTRUCTOR(
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
)
except ImportError as e:
raise ValueError("Dependencies for InstructorEmbedding not found.") from e

View File

@@ -44,7 +44,8 @@ class AutoGPTOutputParser(BaseAutoGPTOutputParser):
name=parsed["command"]["name"],
args=parsed["command"]["args"],
)
except KeyError:
except (KeyError, TypeError):
# If the command is null or incomplete, return an erroneous tool
return AutoGPTAction(
name="ERROR", args={"error": f"Incomplete command args: {parsed}"}
)

View File

@@ -217,12 +217,8 @@ class BaseOpenAI(BaseLLM):
openai.api_key = openai_api_key
if openai_api_base:
print("USING API_BASE: ")
print(openai_api_base)
openai.api_base = openai_api_base
if openai_organization:
print("USING ORGANIZATION: ")
print(openai_organization)
openai.organization = openai_organization
values["client"] = openai.Completion
except ImportError:

22
langchain/math_utils.py Normal file
View File

@@ -0,0 +1,22 @@
"""Math utils."""
from typing import List, Union
import numpy as np
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
"""Row-wise cosine similarity between two equal-width matrices."""
if len(X) == 0 or len(Y) == 0:
return np.array([])
X = np.array(X)
Y = np.array(Y)
if X.shape[1] != Y.shape[1]:
raise ValueError("Number of columns in X and Y must be the same.")
X_norm = np.linalg.norm(X, axis=1)
Y_norm = np.linalg.norm(Y, axis=1)
similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
return similarity

View File

@@ -0,0 +1,29 @@
from langchain.schema import BaseOutputParser
class BooleanOutputParser(BaseOutputParser[bool]):
true_val: str = "YES"
false_val: str = "NO"
def parse(self, text: str) -> bool:
"""Parse the output of an LLM call to a boolean.
Args:
text: output of language model
Returns:
boolean
"""
cleaned_text = text.strip()
if cleaned_text not in (self.true_val, self.false_val):
raise ValueError(
f"BooleanOutputParser expected output value to either be "
f"{self.true_val} or {self.false_val}. Received {cleaned_text}."
)
return cleaned_text == self.true_val
@property
def _type(self) -> str:
"""Snake-case string identifier for output parser type."""
return "boolean_output_parser"

View File

@@ -1,4 +1,5 @@
from langchain.retrievers.chatgpt_plugin_retriever import ChatGPTPluginRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.databerry import DataberryRetriever
from langchain.retrievers.elastic_search_bm25 import ElasticSearchBM25Retriever
from langchain.retrievers.metal import MetalRetriever
@@ -13,6 +14,7 @@ from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetr
__all__ = [
"ChatGPTPluginRetriever",
"ContextualCompressionRetriever",
"RemoteLangChainRetriever",
"PineconeHybridSearchRetriever",
"MetalRetriever",

View File

@@ -0,0 +1,51 @@
"""Retriever that wraps a base retriever and filters the results."""
from typing import List
from pydantic import BaseModel, Extra
from langchain.retrievers.document_compressors.base import (
BaseDocumentCompressor,
)
from langchain.schema import BaseRetriever, Document
class ContextualCompressionRetriever(BaseRetriever, BaseModel):
"""Retriever that wraps a base retriever and compresses the results."""
base_compressor: BaseDocumentCompressor
"""Compressor for compressing retrieved documents."""
base_retriever: BaseRetriever
"""Base Retriever to use for getting relevant documents."""
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
arbitrary_types_allowed = True
def get_relevant_documents(self, query: str) -> List[Document]:
"""Get documents relevant for a query.
Args:
query: string to find relevant documents for
Returns:
Sequence of relevant documents
"""
docs = self.base_retriever.get_relevant_documents(query)
compressed_docs = self.base_compressor.compress_documents(docs, query)
return list(compressed_docs)
async def aget_relevant_documents(self, query: str) -> List[Document]:
"""Get documents relevant for a query.
Args:
query: string to find relevant documents for
Returns:
List of relevant documents
"""
docs = await self.base_retriever.aget_relevant_documents(query)
compressed_docs = await self.base_compressor.acompress_documents(docs, query)
return list(compressed_docs)

View File

@@ -0,0 +1,17 @@
from langchain.retrievers.document_compressors.base import DocumentCompressorPipeline
from langchain.retrievers.document_compressors.chain_extract import (
LLMChainExtractor,
)
from langchain.retrievers.document_compressors.chain_filter import (
LLMChainFilter,
)
from langchain.retrievers.document_compressors.embeddings_filter import (
EmbeddingsFilter,
)
__all__ = [
"DocumentCompressorPipeline",
"EmbeddingsFilter",
"LLMChainExtractor",
"LLMChainFilter",
]

View File

@@ -0,0 +1,61 @@
"""Interface for retrieved document compressors."""
from abc import ABC, abstractmethod
from typing import List, Sequence, Union
from pydantic import BaseModel
from langchain.schema import BaseDocumentTransformer, Document
class BaseDocumentCompressor(BaseModel, ABC):
""""""
@abstractmethod
def compress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
"""Compress retrieved documents given the query context."""
@abstractmethod
async def acompress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
"""Compress retrieved documents given the query context."""
class DocumentCompressorPipeline(BaseDocumentCompressor):
"""Document compressor that uses a pipeline of transformers."""
transformers: List[Union[BaseDocumentTransformer, BaseDocumentCompressor]]
"""List of document filters that are chained together and run in sequence."""
class Config:
"""Configuration for this pydantic object."""
arbitrary_types_allowed = True
def compress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
"""Transform a list of documents."""
for _transformer in self.transformers:
if isinstance(_transformer, BaseDocumentCompressor):
documents = _transformer.compress_documents(documents, query)
elif isinstance(_transformer, BaseDocumentTransformer):
documents = _transformer.transform_documents(documents)
else:
raise ValueError(f"Got unexpected transformer type: {_transformer}")
return documents
async def acompress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
"""Compress retrieved documents given the query context."""
for _transformer in self.transformers:
if isinstance(_transformer, BaseDocumentCompressor):
documents = await _transformer.acompress_documents(documents, query)
elif isinstance(_transformer, BaseDocumentTransformer):
documents = await _transformer.atransform_documents(documents)
else:
raise ValueError(f"Got unexpected transformer type: {_transformer}")
return documents

View File

@@ -0,0 +1,77 @@
"""DocumentFilter that uses an LLM chain to extract the relevant parts of documents."""
from typing import Any, Callable, Dict, Optional, Sequence
from langchain import LLMChain, PromptTemplate
from langchain.retrievers.document_compressors.base import (
BaseDocumentCompressor,
)
from langchain.retrievers.document_compressors.chain_extract_prompt import (
prompt_template,
)
from langchain.schema import BaseLanguageModel, BaseOutputParser, Document
def default_get_input(query: str, doc: Document) -> Dict[str, Any]:
"""Return the compression chain input."""
return {"question": query, "context": doc.page_content}
class NoOutputParser(BaseOutputParser[str]):
"""Parse outputs that could return a null string of some sort."""
no_output_str: str = "NO_OUTPUT"
def parse(self, text: str) -> str:
cleaned_text = text.strip()
if cleaned_text == self.no_output_str:
return ""
return cleaned_text
def _get_default_chain_prompt() -> PromptTemplate:
output_parser = NoOutputParser()
template = prompt_template.format(no_output_str=output_parser.no_output_str)
return PromptTemplate(
template=template,
input_variables=["question", "context"],
output_parser=output_parser,
)
class LLMChainExtractor(BaseDocumentCompressor):
llm_chain: LLMChain
"""LLM wrapper to use for compressing documents."""
get_input: Callable[[str, Document], dict] = default_get_input
"""Callable for constructing the chain input from the query and a Document."""
def compress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
"""Compress page content of raw documents."""
compressed_docs = []
for doc in documents:
_input = self.get_input(query, doc)
output = self.llm_chain.predict_and_parse(**_input)
if len(output) == 0:
continue
compressed_docs.append(Document(page_content=output, metadata=doc.metadata))
return compressed_docs
async def acompress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
raise NotImplementedError
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
prompt: Optional[PromptTemplate] = None,
get_input: Optional[Callable[[str, Document], str]] = None,
) -> "LLMChainExtractor":
"""Initialize from LLM."""
_prompt = prompt if prompt is not None else _get_default_chain_prompt()
_get_input = get_input if get_input is not None else default_get_input
llm_chain = LLMChain(llm=llm, prompt=_prompt)
return cls(llm_chain=llm_chain, get_input=_get_input)

View File

@@ -0,0 +1,11 @@
# flake8: noqa
prompt_template = """Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return {no_output_str}.
Remember, *DO NOT* edit the extracted parts of the context.
> Question: {{question}}
> Context:
>>>
{{context}}
>>>
Extracted relevant parts:"""

View File

@@ -0,0 +1,65 @@
"""Filter that uses an LLM to drop documents that aren't relevant to the query."""
from typing import Any, Callable, Dict, Optional, Sequence
from langchain import BasePromptTemplate, LLMChain, PromptTemplate
from langchain.output_parsers.boolean import BooleanOutputParser
from langchain.retrievers.document_compressors.base import (
BaseDocumentCompressor,
)
from langchain.retrievers.document_compressors.chain_filter_prompt import (
prompt_template,
)
from langchain.schema import BaseLanguageModel, Document
def _get_default_chain_prompt() -> PromptTemplate:
return PromptTemplate(
template=prompt_template,
input_variables=["question", "context"],
output_parser=BooleanOutputParser(),
)
def default_get_input(query: str, doc: Document) -> Dict[str, Any]:
"""Return the compression chain input."""
return {"question": query, "context": doc.page_content}
class LLMChainFilter(BaseDocumentCompressor):
"""Filter that drops documents that aren't relevant to the query."""
llm_chain: LLMChain
"""LLM wrapper to use for filtering documents.
The chain prompt is expected to have a BooleanOutputParser."""
get_input: Callable[[str, Document], dict] = default_get_input
"""Callable for constructing the chain input from the query and a Document."""
def compress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
"""Filter down documents based on their relevance to the query."""
filtered_docs = []
for doc in documents:
_input = self.get_input(query, doc)
include_doc = self.llm_chain.predict_and_parse(**_input)
if include_doc:
filtered_docs.append(doc)
return filtered_docs
async def acompress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
"""Filter down documents."""
raise NotImplementedError
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
prompt: Optional[BasePromptTemplate] = None,
**kwargs: Any
) -> "LLMChainFilter":
_prompt = prompt if prompt is not None else _get_default_chain_prompt()
llm_chain = LLMChain(llm=llm, prompt=_prompt)
return cls(llm_chain=llm_chain, **kwargs)

View File

@@ -0,0 +1,9 @@
# flake8: noqa
prompt_template = """Given the following question and context, return YES if the context is relevant to the question and NO if it isn't.
> Question: {question}
> Context:
>>>
{context}
>>>
> Relevant (YES / NO):"""

View File

@@ -0,0 +1,70 @@
"""Document compressor that uses embeddings to drop documents unrelated to the query."""
from typing import Callable, Dict, Optional, Sequence
import numpy as np
from pydantic import root_validator
from langchain.document_transformers import (
_get_embeddings_from_stateful_docs,
get_stateful_documents,
)
from langchain.embeddings.base import Embeddings
from langchain.math_utils import cosine_similarity
from langchain.retrievers.document_compressors.base import (
BaseDocumentCompressor,
)
from langchain.schema import Document
class EmbeddingsFilter(BaseDocumentCompressor):
embeddings: Embeddings
"""Embeddings to use for embedding document contents and queries."""
similarity_fn: Callable = cosine_similarity
"""Similarity function for comparing documents. Function expected to take as input
two matrices (List[List[float]]) and return a matrix of scores where higher values
indicate greater similarity."""
k: Optional[int] = 20
"""The number of relevant documents to return. Can be set to None, in which case
`similarity_threshold` must be specified. Defaults to 20."""
similarity_threshold: Optional[float]
"""Threshold for determining when two documents are similar enough
to be considered redundant. Defaults to None, must be specified if `k` is set
to None."""
class Config:
"""Configuration for this pydantic object."""
arbitrary_types_allowed = True
@root_validator()
def validate_params(cls, values: Dict) -> Dict:
"""Validate similarity parameters."""
if values["k"] is None and values["similarity_threshold"] is None:
raise ValueError("Must specify one of `k` or `similarity_threshold`.")
return values
def compress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
"""Filter documents based on similarity of their embeddings to the query."""
stateful_documents = get_stateful_documents(documents)
embedded_documents = _get_embeddings_from_stateful_docs(
self.embeddings, stateful_documents
)
embedded_query = self.embeddings.embed_query(query)
similarity = self.similarity_fn([embedded_query], embedded_documents)[0]
included_idxs = np.arange(len(embedded_documents))
if self.k is not None:
included_idxs = np.argsort(similarity)[::-1][: self.k]
if self.similarity_threshold is not None:
similar_enough = np.where(
similarity[included_idxs] > self.similarity_threshold
)
included_idxs = included_idxs[similar_enough]
return [stateful_documents[i] for i in included_idxs]
async def acompress_documents(
self, documents: Sequence[Document], query: str
) -> Sequence[Document]:
"""Filter down documents."""
raise NotImplementedError

View File

@@ -2,7 +2,17 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, Generic, List, NamedTuple, Optional, TypeVar, Union
from typing import (
Any,
Dict,
Generic,
List,
NamedTuple,
Optional,
Sequence,
TypeVar,
Union,
)
from pydantic import BaseModel, Extra, Field, root_validator
@@ -394,16 +404,17 @@ class OutputParserException(Exception):
pass
D = TypeVar("D", bound=Document)
class BaseDocumentTransformer(ABC, Generic[D]):
class BaseDocumentTransformer(ABC):
"""Base interface for transforming documents."""
@abstractmethod
def transform_documents(self, documents: List[D], **kwargs: Any) -> List[D]:
def transform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
"""Transform a list of documents."""
@abstractmethod
async def atransform_documents(self, documents: List[D], **kwargs: Any) -> List[D]:
async def atransform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
"""Asynchronously transform a list of documents."""

View File

@@ -13,6 +13,7 @@ from typing import (
List,
Literal,
Optional,
Sequence,
Union,
)
@@ -22,7 +23,7 @@ from langchain.schema import BaseDocumentTransformer
logger = logging.getLogger(__name__)
class TextSplitter(BaseDocumentTransformer[Document], ABC):
class TextSplitter(BaseDocumentTransformer, ABC):
"""Interface for splitting text into chunks."""
def __init__(
@@ -63,7 +64,7 @@ class TextSplitter(BaseDocumentTransformer[Document], ABC):
"""Split documents."""
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]
return self.create_documents(texts, metadatas)
return self.create_documents(texts, metadatas=metadatas)
def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
text = separator.join(docs)
@@ -173,15 +174,15 @@ class TextSplitter(BaseDocumentTransformer[Document], ABC):
return cls(length_function=_tiktoken_encoder, **kwargs)
def transform_documents(
self, documents: List[Document], **kwargs: Any
) -> List[Document]:
"""Transform list of documents by splitting them."""
return self.split_documents(documents)
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
"""Transform sequence of documents by splitting them."""
return self.split_documents(list(documents))
async def atransform_documents(
self, documents: List[Document], **kwargs: Any
) -> List[Document]:
"""Asynchronously transform a list of documents by splitting them."""
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
"""Asynchronously transform a sequence of documents by splitting them."""
raise NotImplementedError

View File

@@ -6,7 +6,7 @@ from typing import Any, Dict
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.sql_database import SQLDatabase
from langchain.llms.base import BaseLLM
from langchain.schema import BaseLanguageModel
from langchain.tools.base import BaseTool
from langchain.tools.sql_database.prompt import QUERY_CHECKER
@@ -81,7 +81,7 @@ class QueryCheckerTool(BaseSQLDatabaseTool, BaseTool):
Adapted from https://www.patterns.app/blog/2023/01/18/crunchbot-sql-analyst-gpt/"""
template: str = QUERY_CHECKER
llm: BaseLLM
llm: BaseLanguageModel
llm_chain: LLMChain = Field(init=False)
name = "query_checker_sql_db"
description = """

View File

@@ -159,7 +159,7 @@ class JiraAPIWrapper(BaseModel):
"json is not installed. " "Please install it with `pip install json`"
)
params = json.loads(query)
return self.jira.create_issue(fields=dict(params))
return self.jira.issue_create(fields=dict(params))
def other(self, query: str) -> str:
context = {"self": self}

View File

@@ -168,6 +168,21 @@ def _approximate_search_query_with_boolean_filter(
}
def _approximate_search_query_with_lucene_filter(
query_vector: List[float],
lucene_filter: Dict,
size: int = 4,
k: int = 4,
vector_field: str = "vector_field",
) -> Dict:
"""For Approximate k-NN Search, with Lucene Filter."""
search_query = _default_approximate_search_query(
query_vector, size, k, vector_field
)
search_query["query"]["knn"][vector_field]["filter"] = lucene_filter
return search_query
def _default_script_query(
query_vector: List[float],
space_type: str = "l2",
@@ -340,10 +355,14 @@ class OpenSearchVectorSearch(VectorStore):
size: number of results the query actually returns; default: 4
boolean_filter: A Boolean filter consists of a Boolean query that
contains a k-NN query and a filter
contains a k-NN query and a filter.
subquery_clause: Query clause on the knn vector field; default: "must"
lucene_filter: the Lucene algorithm decides whether to perform an exact
k-NN search with pre-filtering or an approximate search with modified
post-filtering.
Optional Args for Script Scoring Search:
search_type: "script_scoring"; default: "approximate_search"
@@ -371,10 +390,20 @@ class OpenSearchVectorSearch(VectorStore):
size = _get_kwargs_value(kwargs, "size", 4)
boolean_filter = _get_kwargs_value(kwargs, "boolean_filter", {})
subquery_clause = _get_kwargs_value(kwargs, "subquery_clause", "must")
lucene_filter = _get_kwargs_value(kwargs, "lucene_filter", {})
if boolean_filter != {} and lucene_filter != {}:
raise ValueError(
"Both `boolean_filter` and `lucene_filter` are provided which "
"is invalid"
)
if boolean_filter != {}:
search_query = _approximate_search_query_with_boolean_filter(
embedding, boolean_filter, size, k, vector_field, subquery_clause
)
elif lucene_filter != {}:
search_query = _approximate_search_query_with_lucene_filter(
embedding, lucene_filter, size, k, vector_field
)
else:
search_query = _default_approximate_search_query(
embedding, size, k, vector_field
@@ -442,7 +471,7 @@ class OpenSearchVectorSearch(VectorStore):
to "text".
Optional Keyword Args for Approximate Search:
engine: "nmslib", "faiss", "hnsw"; default: "nmslib"
engine: "nmslib", "faiss", "lucene"; default: "nmslib"
space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2"

View File

@@ -4,10 +4,7 @@ from typing import List
import numpy as np
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
"""Calculate cosine similarity with numpy."""
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
from langchain.math_utils import cosine_similarity
def maximal_marginal_relevance(
@@ -17,22 +14,26 @@ def maximal_marginal_relevance(
k: int = 4,
) -> List[int]:
"""Calculate maximal marginal relevance."""
idxs: List[int] = []
while len(idxs) < k:
if min(k, len(embedding_list)) <= 0:
return []
similarity_to_query = cosine_similarity([query_embedding], embedding_list)[0]
most_similar = int(np.argmax(similarity_to_query))
idxs = [most_similar]
selected = np.array([embedding_list[most_similar]])
while len(idxs) < min(k, len(embedding_list)):
best_score = -np.inf
idx_to_add = -1
for i, emb in enumerate(embedding_list):
similarity_to_selected = cosine_similarity(embedding_list, selected)
for i, query_score in enumerate(similarity_to_query):
if i in idxs:
continue
first_part = cosine_similarity(query_embedding, emb)
second_part = 0.0
for j in idxs:
cos_sim = cosine_similarity(emb, embedding_list[j])
if cos_sim > second_part:
second_part = cos_sim
equation_score = lambda_mult * first_part - (1 - lambda_mult) * second_part
redundant_score = max(similarity_to_selected[i])
equation_score = (
lambda_mult * query_score - (1 - lambda_mult) * redundant_score
)
if equation_score > best_score:
best_score = equation_score
idx_to_add = i
idxs.append(idx_to_add)
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
return idxs

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain"
version = "0.0.145"
version = "0.0.146"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
@@ -148,6 +148,9 @@ select = [
"F", # pyflakes
"I", # isort
]
exclude = [
"tests/integration_tests/examples/non-utf8-encoding.py",
]
[tool.mypy]
ignore_missing_imports = "True"

View File

@@ -6,4 +6,12 @@ OPENAI_API_KEY=
# your api key from left menu "API Keys" in https://app.pinecone.io
PINECONE_API_KEY=your_pinecone_api_key_here
# your pinecone environment from left menu "API Keys" in https://app.pinecone.io
PINECONE_ENVIRONMENT=us-west4-gcp
PINECONE_ENVIRONMENT=us-west4-gcp
# jira
# your api token from https://id.atlassian.com/manage-profile/security/api-tokens
# more details here: https://confluence.atlassian.com/enterprise/using-personal-access-tokens-1026032365.html
# JIRA_API_TOKEN=your_jira_api_token_here
# JIRA_USERNAME=your_jira_username_here
# JIRA_INSTANCE_URL=your_jira_instance_url_here

View File

@@ -0,0 +1,19 @@
from pathlib import Path
import pytest
from langchain.document_loaders.python import PythonLoader
@pytest.mark.parametrize("filename", ["default-encoding.py", "non-utf8-encoding.py"])
def test_python_loader(filename: str) -> None:
"""Test Python loader."""
file_path = Path(__file__).parent.parent / "examples" / filename
loader = PythonLoader(str(file_path))
docs = loader.load()
assert len(docs) == 1
metadata = docs[0].metadata
assert metadata["source"] == str(file_path)

View File

@@ -0,0 +1 @@
u = "🦜🔗"

View File

@@ -0,0 +1,3 @@
# coding: iso-8859-5
# <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <- Cyrillic characters
u = "<EFBFBD><EFBFBD><EFBFBD><EFBFBD>"

View File

@@ -0,0 +1,28 @@
"""Integration test for compression pipelines."""
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.document_compressors import (
DocumentCompressorPipeline,
EmbeddingsFilter,
)
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
def test_document_compressor_pipeline() -> None:
embeddings = OpenAIEmbeddings()
splitter = CharacterTextSplitter(chunk_size=20, chunk_overlap=0, separator=". ")
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.8)
pipeline_filter = DocumentCompressorPipeline(
transformers=[splitter, redundant_filter, relevant_filter]
)
texts = [
"This sentence is about cows",
"This sentence was about cows",
"foo bar baz",
]
docs = [Document(page_content=". ".join(texts))]
actual = pipeline_filter.compress_documents(docs, "Tell me about farm animals")
assert len(actual) == 1
assert actual[0].page_content in texts[:2]

View File

@@ -0,0 +1,36 @@
"""Integration test for LLMChainExtractor."""
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.schema import Document
def test_llm_chain_extractor() -> None:
texts = [
"The Roman Empire followed the Roman Republic.",
"I love chocolate chip cookies—my mother makes great cookies.",
"The first Roman emperor was Caesar Augustus.",
"Don't you just love Caesar salad?",
"The Roman Empire collapsed in 476 AD after the fall of Rome.",
"Let's go to Olive Garden!",
]
doc = Document(page_content=" ".join(texts))
compressor = LLMChainExtractor.from_llm(ChatOpenAI())
actual = compressor.compress_documents([doc], "Tell me about the Roman Empire")[
0
].page_content
expected_returned = [0, 2, 4]
expected_not_returned = [1, 3, 5]
assert all([texts[i] in actual for i in expected_returned])
assert all([texts[i] not in actual for i in expected_not_returned])
def test_llm_chain_extractor_empty() -> None:
texts = [
"I love chocolate chip cookies—my mother makes great cookies.",
"Don't you just love Caesar salad?",
"Let's go to Olive Garden!",
]
doc = Document(page_content=" ".join(texts))
compressor = LLMChainExtractor.from_llm(ChatOpenAI())
actual = compressor.compress_documents([doc], "Tell me about the Roman Empire")
assert len(actual) == 0

View File

@@ -0,0 +1,17 @@
"""Integration test for llm-based relevant doc filtering."""
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.schema import Document
def test_llm_chain_filter() -> None:
texts = [
"What happened to all of my cookies?",
"I wish there were better Italian restaurants in my neighborhood.",
"My favorite color is green",
]
docs = [Document(page_content=t) for t in texts]
relevant_filter = LLMChainFilter.from_llm(llm=ChatOpenAI())
actual = relevant_filter.compress_documents(docs, "Things I said related to food")
assert len(actual) == 2
assert len(set(texts[:2]).intersection([d.page_content for d in actual])) == 2

View File

@@ -0,0 +1,39 @@
"""Integration test for embedding-based relevant doc filtering."""
import numpy as np
from langchain.document_transformers import _DocumentWithState
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.schema import Document
def test_embeddings_filter() -> None:
texts = [
"What happened to all of my cookies?",
"I wish there were better Italian restaurants in my neighborhood.",
"My favorite color is green",
]
docs = [Document(page_content=t) for t in texts]
embeddings = OpenAIEmbeddings()
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.75)
actual = relevant_filter.compress_documents(docs, "What did I say about food?")
assert len(actual) == 2
assert len(set(texts[:2]).intersection([d.page_content for d in actual])) == 2
def test_embeddings_filter_with_state() -> None:
texts = [
"What happened to all of my cookies?",
"I wish there were better Italian restaurants in my neighborhood.",
"My favorite color is green",
]
query = "What did I say about food?"
embeddings = OpenAIEmbeddings()
embedded_query = embeddings.embed_query(query)
state = {"embedded_doc": np.zeros(len(embedded_query))}
docs = [_DocumentWithState(page_content=t, state=state) for t in texts]
docs[-1].state = {"embedded_doc": embedded_query}
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.75)
actual = relevant_filter.compress_documents(docs, query)
assert len(actual) == 1
assert texts[-1] == actual[0].page_content

View File

@@ -0,0 +1,25 @@
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.vectorstores import Chroma
def test_contextual_compression_retriever_get_relevant_docs() -> None:
"""Test get_relevant_docs."""
texts = [
"This is a document about the Boston Celtics",
"The Boston Celtics won the game by 20 points",
"I simply love going to the movies",
]
embeddings = OpenAIEmbeddings()
base_compressor = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.75)
base_retriever = Chroma.from_texts(texts, embedding=embeddings).as_retriever(
search_kwargs={"k": len(texts)}
)
retriever = ContextualCompressionRetriever(
base_compressor=base_compressor, base_retriever=base_retriever
)
actual = retriever.get_relevant_documents("Tell me about the Celtics")
assert len(actual) == 2
assert texts[-1] not in [d.page_content for d in actual]

View File

@@ -0,0 +1,31 @@
"""Integration test for embedding-based redundant doc filtering."""
from langchain.document_transformers import (
EmbeddingsRedundantFilter,
_DocumentWithState,
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
def test_embeddings_redundant_filter() -> None:
texts = [
"What happened to all of my cookies?",
"Where did all of my cookies go?",
"I wish there were better Italian restaurants in my neighborhood.",
]
docs = [Document(page_content=t) for t in texts]
embeddings = OpenAIEmbeddings()
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
actual = redundant_filter.transform_documents(docs)
assert len(actual) == 2
assert set(texts[:2]).intersection([d.page_content for d in actual])
def test_embeddings_redundant_filter_with_state() -> None:
texts = ["What happened to all of my cookies?", "foo bar baz"]
state = {"embedded_doc": [0.5] * 10}
docs = [_DocumentWithState(page_content=t, state=state) for t in texts]
embeddings = OpenAIEmbeddings()
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
actual = redundant_filter.transform_documents(docs)
assert len(actual) == 1

View File

@@ -0,0 +1,27 @@
"""Integration test for JIRA API Wrapper."""
from langchain.utilities.jira import JiraAPIWrapper
import json
def test_search() -> None:
"""Test for Searching issues on JIRA"""
jql = "project = TP"
jira = JiraAPIWrapper()
output = jira.run("jql", jql)
assert "issues" in output
def test_getprojects() -> None:
"""Test for getting projects on JIRA"""
jira = JiraAPIWrapper()
output = jira.run("get_projects", "")
assert "projects" in output
def test_create_ticket() -> None:
"""Test the Create Ticket Call that Creates a Issue/Ticket on JIRA."""
issue_string = '{"summary": "Test Summary", "description": "Test Description", "issuetype": {"name": "Bug"}, "project": {"key": "TP"}}'
jira = JiraAPIWrapper()
output = jira.run("create_issue", issue_string)
assert "id" in output
assert "key" in output

View File

@@ -164,3 +164,13 @@ def test_appx_search_with_boolean_filter() -> None:
"foo", k=3, boolean_filter=boolean_filter_val, subquery_clause="should"
)
assert output == [Document(page_content="bar")]
def test_appx_search_with_lucene_filter() -> None:
"""Test Approximate Search with Lucene Filter."""
lucene_filter_val = {"bool": {"must": [{"term": {"text": "bar"}}]}}
docsearch = OpenSearchVectorSearch.from_texts(
texts, FakeEmbeddings(), opensearch_url=DEFAULT_OPENSEARCH_URL, engine="lucene"
)
output = docsearch.similarity_search("foo", k=3, lucene_filter=lucene_filter_val)
assert output == [Document(page_content="bar")]

View File

@@ -0,0 +1,15 @@
"""Unit tests for document transformers."""
from langchain.document_transformers import _filter_similar_embeddings
from langchain.math_utils import cosine_similarity
def test__filter_similar_embeddings() -> None:
threshold = 0.79
embedded_docs = [[1.0, 2.0], [1.0, 2.0], [2.0, 1.0], [2.0, 0.5], [0.0, 0.0]]
expected = [1, 3, 4]
actual = _filter_similar_embeddings(embedded_docs, cosine_similarity, threshold)
assert expected == actual
def test__filter_similar_embeddings_empty() -> None:
assert len(_filter_similar_embeddings([], cosine_similarity, 0.0)) == 0

View File

@@ -0,0 +1,39 @@
"""Test math utility functions."""
from typing import List
import numpy as np
from langchain.math_utils import cosine_similarity
def test_cosine_similarity_zero() -> None:
X = np.zeros((3, 3))
Y = np.random.random((3, 3))
expected = np.zeros((3, 3))
actual = cosine_similarity(X, Y)
assert np.allclose(expected, actual)
def test_cosine_similarity_identity() -> None:
X = np.random.random((4, 4))
expected = np.ones(4)
actual = np.diag(cosine_similarity(X, X))
assert np.allclose(expected, actual)
def test_cosine_similarity_empty() -> None:
empty_list: List[List[float]] = []
assert len(cosine_similarity(empty_list, empty_list)) == 0
assert len(cosine_similarity(empty_list, np.random.random((3, 3)))) == 0
def test_cosine_similarity() -> None:
X = [[1.0, 2.0, 3.0], [0.0, 1.0, 0.0], [1.0, 2.0, 0.0]]
Y = [[0.5, 1.0, 1.5], [1.0, 0.0, 0.0], [2.0, 5.0, 2.0]]
expected = [
[1.0, 0.26726124, 0.83743579],
[0.53452248, 0.0, 0.87038828],
[0.5976143, 0.4472136, 0.93419873],
]
actual = cosine_similarity(X, Y)
assert np.allclose(expected, actual)