Compare commits

..

7 Commits

Author SHA1 Message Date
Harrison Chase
bf78200f55 bump version 146 (#3272) 2023-04-20 22:20:43 -07:00
Harrison Chase
87544d2378 gradio tools (#3255) 2023-04-20 22:09:15 -07:00
Naveen Tatikonda
bb6c459f7a OpenSearch: Add Support for Lucene Filter (#3201)
### Description
Add Support for Lucene Filter. When you specify a Lucene filter for a
k-NN search, the Lucene algorithm decides whether to perform an exact
k-NN search with pre-filtering or an approximate search with modified
post-filtering. This filter is supported only for approximate search
with indexes that are created using the `lucene` engine.

OpenSearch Documentation -
https://opensearch.org/docs/latest/search-plugins/knn/filter-search-knn/#lucene-k-nn-filter-implementation

Signed-off-by: Naveen Tatikonda <navtat@amazon.com>
2023-04-20 20:42:53 -07:00
Davis Chase
36720cb57f Hf emb device (#3266)
Make it possible to control the client model kwargs of HuggingFaceEmbeddings and HuggingFaceInstructEmbeddings. Additionally, a cache folder option was added to HuggingFaceInstructEmbeddings, since its client inherits from SentenceTransformer (the client of HuggingFaceEmbeddings).

This is especially useful for controlling the client device, because sentence_transformers defaults to the GPU whenever one is available.

---------

Co-authored-by: Yoann Poupart <66315201+Xmaster6y@users.noreply.github.com>
2023-04-20 20:41:22 -07:00
Zach Jones
d7942a9f19 Fix type annotation for QueryCheckerTool.llm (#3237)
Currently `langchain.tools.sql_database.tool.QueryCheckerTool` has a
field `llm` with type `BaseLLM`. This breaks initialization for some
LLMs. For example, trying to use it with GPT4:

```python
from langchain.sql_database import SQLDatabase
from langchain.chat_models import ChatOpenAI
from langchain.tools.sql_database.tool import QueryCheckerTool


db = SQLDatabase.from_uri("some_db_uri")
llm = ChatOpenAI(model_name="gpt-4")
tool = QueryCheckerTool(db=db, llm=llm)

# pydantic.error_wrappers.ValidationError: 1 validation error for QueryCheckerTool
# llm
#   Can't instantiate abstract class BaseLLM with abstract methods _agenerate, _generate, _llm_type (type=type_error)
```

It seems that much of the rest of the codebase has already switched from
`BaseLLM` to `BaseLanguageModel`. This PR makes the same change for
`QueryCheckerTool`.

Co-authored-by: Zachary Jones <zjones@zetaglobal.com>
2023-04-20 18:50:59 -07:00
Davis Chase
46542dc774 Contextual compression retriever (#2915)
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
2023-04-20 17:01:14 -07:00
Matt Robinson
3943759a90 feat: add loader for rich text files (#3227)
### Summary

Adds a loader for rich text files. Requires `unstructured>=0.5.12`.

### Testing

The following test uses the example RTF file from the [`unstructured`
repo](https://github.com/Unstructured-IO/unstructured/tree/main/example-docs).

```python
from langchain.document_loaders import UnstructuredRTFLoader

loader = UnstructuredRTFLoader("fake-doc.rtf", mode="elements")
docs = loader.load()
docs[0].page_content
```
2023-04-20 15:51:49 -07:00
8 changed files with 330 additions and 11 deletions

File diff suppressed because one or more lines are too long

View File

@@ -57,6 +57,7 @@ from langchain.document_loaders.pdf import (
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.sitemap import SitemapLoader
@@ -106,6 +107,7 @@ __all__ = [
"OutlookMessageLoader",
"UnstructuredEPubLoader",
"UnstructuredMarkdownLoader",
"UnstructuredRTFLoader",
"RoamLoader",
"YoutubeLoader",
"S3FileLoader",

View File

@@ -0,0 +1,28 @@
"""Loader that loads rich text files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
)
class UnstructuredRTFLoader(UnstructuredFileLoader):
    """Loader that partitions rich text (rtf) files with unstructured.

    Construction raises ``ValueError`` when the minimum-version check for
    the ``unstructured`` package fails.
    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """Validate the unstructured version, then defer to the parent loader.

        Args:
            file_path: Path of the rtf file to load.
            mode: Forwarded unchanged to the parent loader; default "single".
            **unstructured_kwargs: Extra keyword arguments forwarded unchanged
                to the parent loader.

        Raises:
            ValueError: If ``satisfies_min_unstructured_version`` reports that
                the required version is not met.
        """
        required_version = "0.5.12"
        version_ok = satisfies_min_unstructured_version(required_version)
        if not version_ok:
            message = (
                "Partitioning rtf files is only supported in "
                f"unstructured>={required_version}."
            )
            raise ValueError(message)

        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Return the result of ``partition_rtf`` for ``self.file_path``."""
        # The import happens at call time rather than at module import time.
        from unstructured.partition.rtf import partition_rtf

        # NOTE(review): `unstructured_kwargs` is presumably stored on the
        # instance by the parent initializer -- confirm against the base class.
        elements = partition_rtf(filename=self.file_path, **self.unstructured_kwargs)
        return elements

View File

@@ -1,7 +1,7 @@
"""Wrapper around HuggingFace embedding models."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Extra
from pydantic import BaseModel, Extra, Field
from langchain.embeddings.base import Embeddings
@@ -22,8 +22,10 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
.. code-block:: python
from langchain.embeddings import HuggingFaceEmbeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
hf = HuggingFaceEmbeddings(model_name=model_name)
model_kwargs = {'device': 'cpu'}
hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
"""
client: Any #: :meta private:
@@ -32,6 +34,8 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
cache_folder: Optional[str] = None
"""Path to store models.
Can also be set by the SENTENCE_TRANSFORMERS_HOME environment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Key word arguments to pass to the model."""
def __init__(self, **kwargs: Any):
"""Initialize the sentence_transformer."""
@@ -40,7 +44,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
import sentence_transformers
self.client = sentence_transformers.SentenceTransformer(
self.model_name, self.cache_folder
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
)
except ImportError:
raise ValueError(
@@ -90,13 +94,22 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
.. code-block:: python
from langchain.embeddings import HuggingFaceInstructEmbeddings
model_name = "hkunlp/instructor-large"
hf = HuggingFaceInstructEmbeddings(model_name=model_name)
model_kwargs = {'device': 'cpu'}
hf = HuggingFaceInstructEmbeddings(
model_name=model_name, model_kwargs=model_kwargs
)
"""
client: Any #: :meta private:
model_name: str = DEFAULT_INSTRUCT_MODEL
"""Model name to use."""
cache_folder: Optional[str] = None
"""Path to store models.
Can also be set by the SENTENCE_TRANSFORMERS_HOME environment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Key word arguments to pass to the model."""
embed_instruction: str = DEFAULT_EMBED_INSTRUCTION
"""Instruction to use for embedding documents."""
query_instruction: str = DEFAULT_QUERY_INSTRUCTION
@@ -108,7 +121,9 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
try:
from InstructorEmbedding import INSTRUCTOR
self.client = INSTRUCTOR(self.model_name)
self.client = INSTRUCTOR(
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
)
except ImportError as e:
raise ValueError("Dependencies for InstructorEmbedding not found.") from e

View File

@@ -6,7 +6,7 @@ from typing import Any, Dict
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.sql_database import SQLDatabase
from langchain.llms.base import BaseLLM
from langchain.schema import BaseLanguageModel
from langchain.tools.base import BaseTool
from langchain.tools.sql_database.prompt import QUERY_CHECKER
@@ -81,7 +81,7 @@ class QueryCheckerTool(BaseSQLDatabaseTool, BaseTool):
Adapted from https://www.patterns.app/blog/2023/01/18/crunchbot-sql-analyst-gpt/"""
template: str = QUERY_CHECKER
llm: BaseLLM
llm: BaseLanguageModel
llm_chain: LLMChain = Field(init=False)
name = "query_checker_sql_db"
description = """

View File

@@ -168,6 +168,21 @@ def _approximate_search_query_with_boolean_filter(
}
def _approximate_search_query_with_lucene_filter(
    query_vector: List[float],
    lucene_filter: Dict,
    size: int = 4,
    k: int = 4,
    vector_field: str = "vector_field",
) -> Dict:
    """Build an Approximate k-NN Search query that carries a Lucene filter.

    Args:
        query_vector: Vector to search with; passed through to the default
            approximate-search query builder.
        lucene_filter: Filter stored under the ``"filter"`` key of the k-NN
            clause for ``vector_field``.
        size: Passed through to the default query builder; default 4.
        k: Passed through to the default query builder; default 4.
        vector_field: Name of the vector field inside the ``"knn"`` clause.

    Returns:
        The default approximate-search query with the filter attached.
    """
    base_query = _default_approximate_search_query(query_vector, size, k, vector_field)
    # Attach the filter in place on the k-NN clause of the generated query.
    knn_clause = base_query["query"]["knn"][vector_field]
    knn_clause["filter"] = lucene_filter
    return base_query
def _default_script_query(
query_vector: List[float],
space_type: str = "l2",
@@ -340,10 +355,14 @@ class OpenSearchVectorSearch(VectorStore):
size: number of results the query actually returns; default: 4
boolean_filter: A Boolean filter consists of a Boolean query that
contains a k-NN query and a filter
contains a k-NN query and a filter.
subquery_clause: Query clause on the knn vector field; default: "must"
lucene_filter: the Lucene algorithm decides whether to perform an exact
k-NN search with pre-filtering or an approximate search with modified
post-filtering.
Optional Args for Script Scoring Search:
search_type: "script_scoring"; default: "approximate_search"
@@ -371,10 +390,20 @@ class OpenSearchVectorSearch(VectorStore):
size = _get_kwargs_value(kwargs, "size", 4)
boolean_filter = _get_kwargs_value(kwargs, "boolean_filter", {})
subquery_clause = _get_kwargs_value(kwargs, "subquery_clause", "must")
lucene_filter = _get_kwargs_value(kwargs, "lucene_filter", {})
if boolean_filter != {} and lucene_filter != {}:
raise ValueError(
"Both `boolean_filter` and `lucene_filter` are provided which "
"is invalid"
)
if boolean_filter != {}:
search_query = _approximate_search_query_with_boolean_filter(
embedding, boolean_filter, size, k, vector_field, subquery_clause
)
elif lucene_filter != {}:
search_query = _approximate_search_query_with_lucene_filter(
embedding, lucene_filter, size, k, vector_field
)
else:
search_query = _default_approximate_search_query(
embedding, size, k, vector_field
@@ -442,7 +471,7 @@ class OpenSearchVectorSearch(VectorStore):
to "text".
Optional Keyword Args for Approximate Search:
engine: "nmslib", "faiss", "hnsw"; default: "nmslib"
engine: "nmslib", "faiss", "lucene"; default: "nmslib"
space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2"

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain"
version = "0.0.145"
version = "0.0.146"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"

View File

@@ -164,3 +164,13 @@ def test_appx_search_with_boolean_filter() -> None:
"foo", k=3, boolean_filter=boolean_filter_val, subquery_clause="should"
)
assert output == [Document(page_content="bar")]
def test_appx_search_with_lucene_filter() -> None:
    """Test Approximate Search with Lucene Filter."""
    # Filter that only matches documents whose `text` term is "bar".
    term_filter = {"bool": {"must": [{"term": {"text": "bar"}}]}}
    store = OpenSearchVectorSearch.from_texts(
        texts,
        FakeEmbeddings(),
        opensearch_url=DEFAULT_OPENSEARCH_URL,
        engine="lucene",
    )
    hits = store.similarity_search("foo", k=3, lucene_filter=term_filter)
    expected = [Document(page_content="bar")]
    assert hits == expected