From 1fa5d94591969ec160371963e75816d76287ed08 Mon Sep 17 00:00:00 2001 From: Youngwook Kim Date: Thu, 20 Jul 2023 11:07:17 +0900 Subject: [PATCH 01/30] feat(document_loaders): add sync and async page evaluation methods to PlaywrightURLLoader --- .../document_loaders/url_playwright.py | 52 ++++++++++++------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/url_playwright.py b/libs/langchain/langchain/document_loaders/url_playwright.py index ee4a47f2516..1c02ac30bb2 100644 --- a/libs/langchain/langchain/document_loaders/url_playwright.py +++ b/libs/langchain/langchain/document_loaders/url_playwright.py @@ -48,6 +48,36 @@ class PlaywrightURLLoader(BaseLoader): self.headless = headless self.remove_selectors = remove_selectors + def sync_evaluate_page(self, page): + """Process a page and return the text content. + This method can be overridden to apply custom logic. + """ + for selector in self.remove_selectors or []: + elements = page.locator(selector).all() + for element in elements: + if element.is_visible(): + element.evaluate("element => element.remove()") + + page_source = page.content() + elements = partition_html(text=page_source) + text = "\n\n".join([str(el) for el in elements]) + return text + + async def async_evaluate_page(self, page): + """Process a page asynchronously and return the text content. + This method can be overridden to apply custom logic. + """ + for selector in self.remove_selectors or []: + elements = await page.locator(selector).all() + for element in elements: + if await element.is_visible(): + await element.evaluate("element => element.remove()") + + page_source = await page.content() + elements = partition_html(text=page_source) + text = "\n\n".join([str(el) for el in elements]) + return text + def load(self) -> List[Document]: """Load the specified URLs using Playwright and create Document instances. 
@@ -65,16 +95,7 @@ class PlaywrightURLLoader(BaseLoader): try: page = browser.new_page() page.goto(url) - - for selector in self.remove_selectors or []: - elements = page.locator(selector).all() - for element in elements: - if element.is_visible(): - element.evaluate("element => element.remove()") - - page_source = page.content() - elements = partition_html(text=page_source) - text = "\n\n".join([str(el) for el in elements]) + text = self.sync_evaluate_page(page) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) except Exception as e: @@ -105,16 +126,7 @@ class PlaywrightURLLoader(BaseLoader): try: page = await browser.new_page() await page.goto(url) - - for selector in self.remove_selectors or []: - elements = await page.locator(selector).all() - for element in elements: - if await element.is_visible(): - await element.evaluate("element => element.remove()") - - page_source = await page.content() - elements = partition_html(text=page_source) - text = "\n\n".join([str(el) for el in elements]) + text = await self.async_evaluate_page(page) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) except Exception as e: From dc4b037957167e29342538c4d4ad17a2f3140b0a Mon Sep 17 00:00:00 2001 From: Youngwook Kim Date: Tue, 8 Aug 2023 23:31:27 +0900 Subject: [PATCH 02/30] docs(url_playwright): update docstrings for sync_evaluate_page and async_evaluate_page methods --- .../document_loaders/url_playwright.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/url_playwright.py b/libs/langchain/langchain/document_loaders/url_playwright.py index 1c02ac30bb2..188b7f22fe4 100644 --- a/libs/langchain/langchain/document_loaders/url_playwright.py +++ b/libs/langchain/langchain/document_loaders/url_playwright.py @@ -49,8 +49,13 @@ class PlaywrightURLLoader(BaseLoader): self.remove_selectors = remove_selectors def sync_evaluate_page(self, page): - """Process a page and return the text content. - This method can be overridden to apply custom logic. + """Process a page and return the text content synchronously. + + Args: + page: The page to process. + + Returns: + text: The text content of the page. """ for selector in self.remove_selectors or []: elements = page.locator(selector).all() @@ -64,8 +69,13 @@ class PlaywrightURLLoader(BaseLoader): return text async def async_evaluate_page(self, page): - """Process a page asynchronously and return the text content. - This method can be overridden to apply custom logic. + """Process a page and return the text content asynchronously. + + Args: + page: The page to process. + + Returns: + text: The text content of the page. 
""" for selector in self.remove_selectors or []: elements = await page.locator(selector).all() From 224263aa24401ccdea5bfa4e17b495d7cb75db43 Mon Sep 17 00:00:00 2001 From: Youngwook Kim Date: Wed, 9 Aug 2023 10:51:34 +0900 Subject: [PATCH 03/30] refactor(document_loaders): modify evaluation methods in PlaywrightURLLoader --- .../document_loaders/url_playwright.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/url_playwright.py b/libs/langchain/langchain/document_loaders/url_playwright.py index 188b7f22fe4..6a18d528d4e 100644 --- a/libs/langchain/langchain/document_loaders/url_playwright.py +++ b/libs/langchain/langchain/document_loaders/url_playwright.py @@ -48,15 +48,19 @@ class PlaywrightURLLoader(BaseLoader): self.headless = headless self.remove_selectors = remove_selectors - def sync_evaluate_page(self, page): + def sync_evaluate(self, page, browser, response): """Process a page and return the text content synchronously. Args: page: The page to process. + browser: The browser instance. + response: The response from page.goto(). Returns: text: The text content of the page. """ + from unstructured.partition.html import partition_html + for selector in self.remove_selectors or []: elements = page.locator(selector).all() for element in elements: @@ -68,15 +72,19 @@ class PlaywrightURLLoader(BaseLoader): text = "\n\n".join([str(el) for el in elements]) return text - async def async_evaluate_page(self, page): + async def async_evaluate(self, page, browser, response): """Process a page and return the text content asynchronously. Args: page: The page to process. + browser: The browser instance. + response: The response from page.goto(). Returns: text: The text content of the page. """ + from unstructured.partition.html import partition_html + for selector in self.remove_selectors or []: elements = await page.locator(selector).all() for element in elements: @@ -95,7 +103,6 @@ class PlaywrightURLLoader(BaseLoader): List[Document]: A list of Document instances with loaded content. """ from playwright.sync_api import sync_playwright - from unstructured.partition.html import partition_html docs: List[Document] = list() @@ -104,8 +111,8 @@ class PlaywrightURLLoader(BaseLoader): for url in self.urls: try: page = browser.new_page() - page.goto(url) - text = self.sync_evaluate_page(page) + response = page.goto(url) + text = self.sync_evaluate(page, browser, response) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) except Exception as e: @@ -126,7 +133,6 @@ class PlaywrightURLLoader(BaseLoader): List[Document]: A list of Document instances with loaded content. 
""" from playwright.async_api import async_playwright - from unstructured.partition.html import partition_html docs: List[Document] = list() @@ -135,8 +141,8 @@ class PlaywrightURLLoader(BaseLoader): for url in self.urls: try: page = await browser.new_page() - await page.goto(url) - text = await self.async_evaluate_page(page) + response = await page.goto(url) + text = await self.async_evaluate(page, browser, response) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) except Exception as e: From ef7f4aea32fc89796e29da549d75befd0a82d47f Mon Sep 17 00:00:00 2001 From: Youngwook Kim Date: Wed, 9 Aug 2023 10:59:22 +0900 Subject: [PATCH 04/30] refactor: modify method visibility in url_playwright --- .../langchain/document_loaders/url_playwright.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/url_playwright.py b/libs/langchain/langchain/document_loaders/url_playwright.py index 6a18d528d4e..48c8dcf0129 100644 --- a/libs/langchain/langchain/document_loaders/url_playwright.py +++ b/libs/langchain/langchain/document_loaders/url_playwright.py @@ -48,7 +48,7 @@ class PlaywrightURLLoader(BaseLoader): self.headless = headless self.remove_selectors = remove_selectors - def sync_evaluate(self, page, browser, response): + def _sync_evaluate(self, page, browser, response): """Process a page and return the text content synchronously. Args: @@ -72,7 +72,7 @@ class PlaywrightURLLoader(BaseLoader): text = "\n\n".join([str(el) for el in elements]) return text - async def async_evaluate(self, page, browser, response): + async def _async_evaluate(self, page, browser, response): """Process a page and return the text content asynchronously. Args: @@ -112,7 +112,7 @@ class PlaywrightURLLoader(BaseLoader): try: page = browser.new_page() response = page.goto(url) - text = self.sync_evaluate(page, browser, response) + text = self._sync_evaluate(page, browser, response) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) except Exception as e: @@ -142,7 +142,7 @@ class PlaywrightURLLoader(BaseLoader): try: page = await browser.new_page() response = await page.goto(url) - text = await self.async_evaluate(page, browser, response) + text = await self._async_evaluate(page, browser, response) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) except Exception as e: From 04fcd2d2e0aee574a0a26b416b4caa0610786b01 Mon Sep 17 00:00:00 2001 From: Youngwook Kim Date: Wed, 9 Aug 2023 14:14:59 +0900 Subject: [PATCH 05/30] refactor(document_loaders): introduce PlaywrightEvaluator abstract base class for custom evalutors and add tests --- .../document_loaders/url_playwright.py | 151 +++++++++++------- .../document_loaders/test_url_playwright.py | 39 +++++ 2 files changed, 131 insertions(+), 59 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/url_playwright.py b/libs/langchain/langchain/document_loaders/url_playwright.py index 48c8dcf0129..87db27fb7d8 100644 --- a/libs/langchain/langchain/document_loaders/url_playwright.py +++ b/libs/langchain/langchain/document_loaders/url_playwright.py @@ -1,6 +1,7 @@ """Loader that uses Playwright to load a page, then uses unstructured to load the html. 
""" import logging +from abc import ABC, abstractmethod from typing import List, Optional from langchain.docstore.document import Document @@ -9,8 +10,88 @@ from langchain.document_loaders.base import BaseLoader logger = logging.getLogger(__name__) +class PlaywrightEvaluator(ABC): + """Abstract base class for all evaluators. + + Each evaluator should take a page, a browser instance, and a response + object, process the page as necessary, and return the resulting text. + """ + + @abstractmethod + def evaluate(self, page, browser, response): + """Synchronously process the page and return the resulting text. + + Args: + page: The page to process. + browser: The browser instance. + response: The response from page.goto(). + + Returns: + text: The text content of the page. + """ + pass + + @abstractmethod + async def evaluate_async(self, page, browser, response): + """Asynchronously process the page and return the resulting text. + + Args: + page: The page to process. + browser: The browser instance. + response: The response from page.goto(). + + Returns: + text: The text content of the page. + """ + pass + + +class UnstructuredHtmlEvaluator(PlaywrightEvaluator): + """Evaluates the page HTML content using the `unstructured` library.""" + + def __init__(self, remove_selectors: Optional[List[str]] = None): + """Initialize UnstructuredHtmlEvaluator and check if `unstructured` package is installed.""" + try: + import unstructured # noqa:F401 + except ImportError: + raise ImportError( + "unstructured package not found, please install it with " + "`pip install unstructured`" + ) + + self.remove_selectors = remove_selectors + + def evaluate(self, page, browser, response): + """Synchronously process the HTML content of the page and return a text string.""" + from unstructured.partition.html import partition_html + + for selector in self.remove_selectors or []: + elements = page.locator(selector).all() + for element in elements: + if element.is_visible(): + element.evaluate("element => element.remove()") + + page_source = page.content() + elements = partition_html(text=page_source) + return "\n\n".join([str(el) for el in elements]) + + async def evaluate_async(self, page, browser, response): + """Asynchronously process the HTML content of the page and return a text string.""" + from unstructured.partition.html import partition_html + + for selector in self.remove_selectors or []: + elements = await page.locator(selector).all() + for element in elements: + if await element.is_visible(): + await element.evaluate("element => element.remove()") + + page_source = await page.content() + elements = partition_html(text=page_source) + return "\n\n".join([str(el) for el in elements]) + + class PlaywrightURLLoader(BaseLoader): - """Loader that uses Playwright and to load a page and unstructured to load the html. + """Loader that uses Playwright and PlaywrightEvaluator to load the html. This is useful for loading pages that require javascript to render. 
Attributes: @@ -25,8 +106,9 @@ class PlaywrightURLLoader(BaseLoader): continue_on_failure: bool = True, headless: bool = True, remove_selectors: Optional[List[str]] = None, + evaluator: Optional[PlaywrightEvaluator] = None, ): - """Load a list of URLs using Playwright and unstructured.""" + """Load a list of URLs using Playwright.""" try: import playwright # noqa:F401 except ImportError: @@ -35,66 +117,17 @@ class PlaywrightURLLoader(BaseLoader): "`pip install playwright`" ) - try: - import unstructured # noqa:F401 - except ImportError: - raise ImportError( - "unstructured package not found, please install it with " - "`pip install unstructured`" - ) - self.urls = urls self.continue_on_failure = continue_on_failure self.headless = headless - self.remove_selectors = remove_selectors - def _sync_evaluate(self, page, browser, response): - """Process a page and return the text content synchronously. + if remove_selectors and evaluator: + raise ValueError( + "`remove_selectors` and `evaluator` cannot be both not None" + ) - Args: - page: The page to process. - browser: The browser instance. - response: The response from page.goto(). - - Returns: - text: The text content of the page. - """ - from unstructured.partition.html import partition_html - - for selector in self.remove_selectors or []: - elements = page.locator(selector).all() - for element in elements: - if element.is_visible(): - element.evaluate("element => element.remove()") - - page_source = page.content() - elements = partition_html(text=page_source) - text = "\n\n".join([str(el) for el in elements]) - return text - - async def _async_evaluate(self, page, browser, response): - """Process a page and return the text content asynchronously. - - Args: - page: The page to process. - browser: The browser instance. - response: The response from page.goto(). - - Returns: - text: The text content of the page. - """ - from unstructured.partition.html import partition_html - - for selector in self.remove_selectors or []: - elements = await page.locator(selector).all() - for element in elements: - if await element.is_visible(): - await element.evaluate("element => element.remove()") - - page_source = await page.content() - elements = partition_html(text=page_source) - text = "\n\n".join([str(el) for el in elements]) - return text + # Use the provided evaluator, if any, otherwise, use the default UnstructuredHtmlEvaluator. + self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors) def load(self) -> List[Document]: """Load the specified URLs using Playwright and create Document instances. 
@@ -112,7 +145,7 @@ class PlaywrightURLLoader(BaseLoader): try: page = browser.new_page() response = page.goto(url) - text = self._sync_evaluate(page, browser, response) + text = self.evaluator.evaluate(page, browser, response) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) except Exception as e: @@ -142,7 +175,7 @@ class PlaywrightURLLoader(BaseLoader): try: page = await browser.new_page() response = await page.goto(url) - text = await self._async_evaluate(page, browser, response) + text = await self.evaluator.evaluate_async(page, browser, response) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) except Exception as e: diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py b/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py index 565646428b2..451d0d71384 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py @@ -4,6 +4,16 @@ import pytest from langchain.document_loaders import PlaywrightURLLoader +class TestEvaluator(PageEvaluator): + """A simple evaluator for testing purposes.""" + + def evaluate(self, page, browser, response): + return "test" + + async def evaluate_async(self, page, browser, response): + return "test" + + def test_playwright_url_loader() -> None: """Test Playwright URL loader.""" urls = [ @@ -39,3 +49,32 @@ async def test_playwright_async_url_loader() -> None: ) docs = await loader.aload() assert len(docs) > 0 + + +def test_playwright_url_loader_with_custom_evaluator() -> None: + """Test Playwright URL loader with a custom evaluator.""" + urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"] + loader = PlaywrightURLLoader( + urls=urls, + page_evaluator=TestEvaluator(), + continue_on_failure=False, + headless=True, + ) + docs = loader.load() + assert len(docs) == 1 + assert docs[0].page_content == "test-" + + +@pytest.mark.asyncio +async def test_playwright_async_url_loader_with_custom_evaluator() -> None: + """Test Playwright async URL loader with a custom evaluator.""" + urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"] + loader = PlaywrightURLLoader( + urls=urls, + page_evaluator=TestEvaluator(), + continue_on_failure=False, + headless=True, + ) + docs = await loader.aload() + assert len(docs) == 2 + assert docs[0].page_content == "test" From 429de77b3b644232ed64b0c9beaa7486bc2d4b79 Mon Sep 17 00:00:00 2001 From: Youngwook Kim Date: Wed, 9 Aug 2023 15:56:24 +0900 Subject: [PATCH 06/30] refactor(langchain): improve type annotations in url_playwright and its test --- .../document_loaders/url_playwright.py | 27 ++++++++++++------- .../document_loaders/test_url_playwright.py | 23 +++++++++++----- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/url_playwright.py b/libs/langchain/langchain/document_loaders/url_playwright.py index 87db27fb7d8..bce53f91559 100644 --- a/libs/langchain/langchain/document_loaders/url_playwright.py +++ b/libs/langchain/langchain/document_loaders/url_playwright.py @@ -2,11 +2,16 @@ """ import logging from abc import ABC, abstractmethod -from typing import List, Optional +from typing import TYPE_CHECKING, List, Optional from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader +if TYPE_CHECKING: + from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse + from 
playwright.sync_api import Browser, Page, Response + + logger = logging.getLogger(__name__) @@ -18,7 +23,7 @@ class PlaywrightEvaluator(ABC): """ @abstractmethod - def evaluate(self, page, browser, response): + def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str: """Synchronously process the page and return the resulting text. Args: @@ -32,7 +37,9 @@ class PlaywrightEvaluator(ABC): pass @abstractmethod - async def evaluate_async(self, page, browser, response): + async def evaluate_async( + self, page: "AsyncPage", browser: "AsyncBrowser", response: "AsyncResponse" + ) -> str: """Asynchronously process the page and return the resulting text. Args: @@ -50,7 +57,7 @@ class UnstructuredHtmlEvaluator(PlaywrightEvaluator): """Evaluates the page HTML content using the `unstructured` library.""" def __init__(self, remove_selectors: Optional[List[str]] = None): - """Initialize UnstructuredHtmlEvaluator and check if `unstructured` package is installed.""" + """Initialize UnstructuredHtmlEvaluator.""" try: import unstructured # noqa:F401 except ImportError: @@ -61,8 +68,8 @@ class UnstructuredHtmlEvaluator(PlaywrightEvaluator): self.remove_selectors = remove_selectors - def evaluate(self, page, browser, response): - """Synchronously process the HTML content of the page and return a text string.""" + def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str: + """Synchronously process the HTML content of the page.""" from unstructured.partition.html import partition_html for selector in self.remove_selectors or []: @@ -75,8 +82,10 @@ class UnstructuredHtmlEvaluator(PlaywrightEvaluator): elements = partition_html(text=page_source) return "\n\n".join([str(el) for el in elements]) - async def evaluate_async(self, page, browser, response): - """Asynchronously process the HTML content of the page and return a text string.""" + async def evaluate_async( + self, page: "AsyncPage", browser: "AsyncBrowser", response: "AsyncResponse" + ) -> str: + """Asynchronously process the HTML content of the page.""" from unstructured.partition.html import partition_html for selector in self.remove_selectors or []: @@ -126,7 +135,7 @@ class PlaywrightURLLoader(BaseLoader): "`remove_selectors` and `evaluator` cannot be both not None" ) - # Use the provided evaluator, if any, otherwise, use the default UnstructuredHtmlEvaluator. + # Use the provided evaluator, if any, otherwise, use the default. 
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors) def load(self) -> List[Document]: diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py b/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py index 451d0d71384..7bea1c6dee7 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py @@ -1,16 +1,25 @@ """Tests for the Playwright URL loader""" +from typing import TYPE_CHECKING + import pytest from langchain.document_loaders import PlaywrightURLLoader +from langchain.document_loaders.url_playwright import PlaywrightEvaluator + +if TYPE_CHECKING: + from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse + from playwright.sync_api import Browser, Page, Response -class TestEvaluator(PageEvaluator): +class TestEvaluator(PlaywrightEvaluator): """A simple evaluator for testing purposes.""" - def evaluate(self, page, browser, response): + def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str: return "test" - async def evaluate_async(self, page, browser, response): + async def evaluate_async( + self, page: "AsyncPage", browser: "AsyncBrowser", response: "AsyncResponse" + ) -> str: return "test" @@ -56,13 +65,13 @@ def test_playwright_url_loader_with_custom_evaluator() -> None: urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"] loader = PlaywrightURLLoader( urls=urls, - page_evaluator=TestEvaluator(), + evaluator=TestEvaluator(), continue_on_failure=False, headless=True, ) docs = loader.load() assert len(docs) == 1 - assert docs[0].page_content == "test-" + assert docs[0].page_content == "test" @pytest.mark.asyncio @@ -71,10 +80,10 @@ async def test_playwright_async_url_loader_with_custom_evaluator() -> None: urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"] loader = PlaywrightURLLoader( urls=urls, - page_evaluator=TestEvaluator(), + evaluator=TestEvaluator(), continue_on_failure=False, headless=True, ) docs = await loader.aload() - assert len(docs) == 2 + assert len(docs) == 1 assert docs[0].page_content == "test" From c26deb6b388aceb07f435c4338cb2c5280007f92 Mon Sep 17 00:00:00 2001 From: dafu Date: Wed, 30 Aug 2023 09:49:42 +0800 Subject: [PATCH 07/30] fixed openai_functions api_response format args err root cause: args may not have a key (params) resulting in an error --- libs/langchain/langchain/chains/openai_functions/openapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/langchain/chains/openai_functions/openapi.py b/libs/langchain/langchain/chains/openai_functions/openapi.py index 100e4be28a6..a06d147ff61 100644 --- a/libs/langchain/langchain/chains/openai_functions/openapi.py +++ b/libs/langchain/langchain/chains/openai_functions/openapi.py @@ -227,7 +227,7 @@ class SimpleRequestChain(Chain): response = ( f"{api_response.status_code}: {api_response.reason}" + f"\nFor {name} " - + f"Called with args: {args['params']}" + + f"Called with args: {args.get('params','')}" ) else: try: From 8c4e29240c0c61d904d9133d99006415a78f0a9e Mon Sep 17 00:00:00 2001 From: wlleiiwang Date: Wed, 9 Aug 2023 17:42:15 +0800 Subject: [PATCH 08/30] implement vectorstores by tencent vectordb --- .../providers/tencentvectordb.mdx | 15 + .../vectorstores/tencentvectordb.ipynb | 122 ++++++ .../langchain/vectorstores/__init__.py | 2 + .../langchain/vectorstores/tencentvectordb.py | 392 ++++++++++++++++++ 
.../vectorstores/test_tencentvectordb.py | 93 +++++ 5 files changed, 624 insertions(+) create mode 100644 docs/extras/integrations/providers/tencentvectordb.mdx create mode 100644 docs/extras/integrations/vectorstores/tencentvectordb.ipynb create mode 100644 libs/langchain/langchain/vectorstores/tencentvectordb.py create mode 100644 libs/langchain/tests/integration_tests/vectorstores/test_tencentvectordb.py diff --git a/docs/extras/integrations/providers/tencentvectordb.mdx b/docs/extras/integrations/providers/tencentvectordb.mdx new file mode 100644 index 00000000000..0ce5f1142ea --- /dev/null +++ b/docs/extras/integrations/providers/tencentvectordb.mdx @@ -0,0 +1,15 @@ +# TencentVectorDB + +This page covers how to use the TencentVectorDB ecosystem within LangChain. + +### VectorStore + +There exists a wrapper around TencentVectorDB, allowing you to use it as a vectorstore, +whether for semantic search or example selection. + +To import this vectorstore: +```python +from langchain.vectorstores import TencentVectorDB +``` + +For a more detailed walkthrough of the TencentVectorDB wrapper, see [this notebook](/docs/integrations/vectorstores/tencentvectordb.html) diff --git a/docs/extras/integrations/vectorstores/tencentvectordb.ipynb b/docs/extras/integrations/vectorstores/tencentvectordb.ipynb new file mode 100644 index 00000000000..35fc17d4ceb --- /dev/null +++ b/docs/extras/integrations/vectorstores/tencentvectordb.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "source": [ + "# Tencent Cloud VectorDB\n", + "\n", + ">[Tencent Cloud VectorDB](https://cloud.tencent.com/document/product/1709) is a fully managed, self-developed, enterprise-level distributed database service designed for storing, retrieving, and analyzing multi-dimensional vector data. The database supports multiple index types and similarity calculation methods. A single index can support a vector scale of up to 1 billion and can support millions of QPS and millisecond-level query latency. Tencent Cloud Vector Database can not only provide an external knowledge base for large models to improve the accuracy of large model responses but can also be widely used in AI fields such as recommendation systems, NLP services, computer vision, and intelligent customer service.\n", + "\n", + "This notebook shows how to use functionality related to the Tencent vector database.\n", + "\n", + "To run, you should have a [Database instance.](https://cloud.tencent.com/document/product/1709/95101)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install tcvectordb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings.fake import FakeEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import TencentVectorDB\n", + "from langchain.vectorstores.tencentvectordb import ConnectionParams\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "embeddings = FakeEmbeddings(size=128)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn_params = ConnectionParams(url=\"http://10.0.X.X\", \n", + " key=\"eC4bLRy2va******************************\", \n", + " username=\"root\", \n", + " timeout=20)\n", + "\n", + "vector_db = TencentVectorDB.from_documents(\n", + " docs,\n", + " embeddings,\n", + " connection_params=conn_params,\n", + " # drop_old=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = vector_db.similarity_search(query)\n", + "docs[0].page_content" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vector_db = TencentVectorDB(embedding_function=embeddings, connection_params=conn_params)\n", + "\n", + "vector_db.add_texts([\"Ankush went to Princeton\"])\n", + "query = \"Where did Ankush go to college?\"\n", + "docs = vector_db.max_marginal_relevance_search(query)\n", + "docs[0].page_content" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/libs/langchain/langchain/vectorstores/__init__.py b/libs/langchain/langchain/vectorstores/__init__.py index ccbdfad6e55..94bbc64a90f 100644 --- a/libs/langchain/langchain/vectorstores/__init__.py +++ b/libs/langchain/langchain/vectorstores/__init__.py @@ -66,6 +66,7 @@ from langchain.vectorstores.sklearn import SKLearnVectorStore from langchain.vectorstores.starrocks import StarRocks from langchain.vectorstores.supabase import SupabaseVectorStore from langchain.vectorstores.tair import Tair +from langchain.vectorstores.tencentvectordb import TencentVectorDB from langchain.vectorstores.tigris import Tigris from langchain.vectorstores.typesense import Typesense from langchain.vectorstores.usearch import USearch @@ -136,4 +137,5 @@ __all__ = [ "ZepVectorStore", "Zilliz", "Zilliz", + "TencentVectorDB", ] diff --git a/libs/langchain/langchain/vectorstores/tencentvectordb.py b/libs/langchain/langchain/vectorstores/tencentvectordb.py new file mode 100644 index 00000000000..75541f85e67 --- /dev/null +++ 
b/libs/langchain/langchain/vectorstores/tencentvectordb.py @@ -0,0 +1,392 @@ +"""Wrapper around the Tencent vector database.""" +from __future__ import annotations + +import json +import logging +import time +from typing import Any, Dict, Iterable, List, Optional, Tuple + +import numpy as np + +from langchain.docstore.document import Document +from langchain.embeddings.base import Embeddings +from langchain.utils import guard_import +from langchain.vectorstores.base import VectorStore +from langchain.vectorstores.utils import maximal_marginal_relevance + +logger = logging.getLogger(__name__) + + +class ConnectionParams: + """Tencent vector DB Connection params. + + See the following documentation for details: + https://cloud.tencent.com/document/product/1709/95820 + + Attribute: + url (str) : The access address of the vector database server + that the client needs to connect to. + key (str): API key for client to access the vector database server, + which is used for authentication. + username (str) : Account for client to access the vector database server. + timeout (int) : Request Timeout. + """ + + def __init__(self, url: str, key: str, username: str = "root", timeout: int = 10): + self.url = url + self.key = key + self.username = username + self.timeout = timeout + + +class IndexParams: + """Tencent vector DB Index params. + + See the following documentation for details: + https://cloud.tencent.com/document/product/1709/95826 + """ + + def __init__( + self, + dimension: int, + shard: int = 1, + replicas: int = 2, + index_type: str = "HNSW", + metric_type: str = "L2", + params: Optional[Dict] = None, + ): + self.dimension = dimension + self.shard = shard + self.replicas = replicas + self.index_type = index_type + self.metric_type = metric_type + self.params = params + + +class TencentVectorDB(VectorStore): + """Initialize wrapper around the tencent vector database. + + In order to use this you need to have a database instance. 
+ See the following documentation for details: + https://cloud.tencent.com/document/product/1709/94951 + """ + + field_id: str = "id" + field_vector: str = "vector" + field_text: str = "text" + field_metadata: str = "metadata" + + def __init__( + self, + embedding_function: Embeddings, + connection_params: ConnectionParams, + index_params: IndexParams = IndexParams(128), + database_name: str = "LangChainDatabase", + collection_name: str = "LangChainCollection", + drop_old: Optional[bool] = False, + ): + self.document = guard_import("tcvectordb.model.document") + tcvectordb = guard_import("tcvectordb") + self.embedding_func = embedding_function + self.index_params = index_params + self.vdb_client = tcvectordb.VectorDBClient( + url=connection_params.url, + username=connection_params.username, + key=connection_params.key, + timeout=connection_params.timeout, + ) + db_list = self.vdb_client.list_databases() + db_exist: bool = False + for db in db_list: + if database_name == db.database_name: + db_exist = True + break + if db_exist: + self.database = self.vdb_client.database(database_name) + else: + self.database = self.vdb_client.create_database(database_name) + try: + self.collection = self.database.describe_collection(collection_name) + if drop_old: + self.database.drop_collection(collection_name) + self._create_collection(collection_name) + except tcvectordb.exceptions.VectorDBException: + self._create_collection(collection_name) + + def _create_collection(self, collection_name: str) -> None: + enum = guard_import("tcvectordb.model.enum") + vdb_index = guard_import("tcvectordb.model.index") + index_type = None + for k, v in enum.IndexType.__members__.items(): + if k == self.index_params.index_type: + index_type = v + if index_type is None: + raise ValueError("unsupported index_type") + metric_type = None + for k, v in enum.MetricType.__members__.items(): + if k == self.index_params.metric_type: + metric_type = v + if metric_type is None: + raise ValueError("unsupported metric_type") + if self.index_params.params is None: + params = vdb_index.HNSWParams(m=16, efconstruction=200) + else: + params = vdb_index.HNSWParams( + m=self.index_params.params.get("M", 16), + efconstruction=self.index_params.params.get("efConstruction", 200), + ) + index = vdb_index.Index( + vdb_index.FilterIndex( + self.field_id, enum.FieldType.String, enum.IndexType.PRIMARY_KEY + ), + vdb_index.VectorIndex( + self.field_vector, + self.index_params.dimension, + index_type, + metric_type, + params, + ), + vdb_index.FilterIndex( + self.field_text, enum.FieldType.String, enum.IndexType.FILTER + ), + vdb_index.FilterIndex( + self.field_metadata, enum.FieldType.String, enum.IndexType.FILTER + ), + ) + self.collection = self.database.create_collection( + name=collection_name, + shard=self.index_params.shard, + replicas=self.index_params.replicas, + description="Collection for LangChain", + index=index, + ) + + @property + def embeddings(self) -> Embeddings: + return self.embedding_func + + @classmethod + def from_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + connection_params: Optional[ConnectionParams] = None, + index_params: Optional[IndexParams] = None, + database_name: str = "LangChainDatabase", + collection_name: str = "LangChainCollection", + drop_old: Optional[bool] = False, + **kwargs: Any, + ) -> TencentVectorDB: + """Create a collection, indexes it with HNSW, and insert data.""" + if len(texts) == 0: + raise ValueError("texts is empty") + if connection_params 
is None: + raise ValueError("connection_params is empty") + try: + embeddings = embedding.embed_documents(texts[0:1]) + except NotImplementedError: + embeddings = [embedding.embed_query(texts[0])] + dimension = len(embeddings[0]) + if index_params is None: + index_params = IndexParams(dimension=dimension) + else: + index_params.dimension = dimension + vector_db = cls( + embedding_function=embedding, + connection_params=connection_params, + index_params=index_params, + database_name=database_name, + collection_name=collection_name, + drop_old=drop_old, + ) + vector_db.add_texts(texts=texts, metadatas=metadatas) + return vector_db + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + timeout: Optional[int] = None, + batch_size: int = 1000, + **kwargs: Any, + ) -> List[str]: + """Insert text data into TencentVectorDB.""" + texts = list(texts) + try: + embeddings = self.embedding_func.embed_documents(texts) + except NotImplementedError: + embeddings = [self.embedding_func.embed_query(x) for x in texts] + if len(embeddings) == 0: + logger.debug("Nothing to insert, skipping.") + return [] + pks: list[str] = [] + total_count = len(embeddings) + for start in range(0, total_count, batch_size): + # Grab end index + docs = [] + end = min(start + batch_size, total_count) + for id in range(start, end, 1): + metadata = "{}" + if metadatas is not None: + metadata = json.dumps(metadatas[id]) + doc = self.document.Document( + id="{}-{}-{}".format(time.time_ns(), hash(texts[id]), id), + vector=embeddings[id], + text=texts[id], + metadata=metadata, + ) + docs.append(doc) + pks.append(str(id)) + self.collection.upsert(docs, timeout) + return pks + + def similarity_search( + self, + query: str, + k: int = 4, + param: Optional[dict] = None, + expr: Optional[str] = None, + timeout: Optional[int] = None, + **kwargs: Any, + ) -> List[Document]: + """Perform a similarity search against the query string.""" + res = self.similarity_search_with_score( + query=query, k=k, param=param, expr=expr, timeout=timeout, **kwargs + ) + return [doc for doc, _ in res] + + def similarity_search_with_score( + self, + query: str, + k: int = 4, + param: Optional[dict] = None, + expr: Optional[str] = None, + timeout: Optional[int] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Perform a search on a query string and return results with score.""" + # Embed the query text. 
+ embedding = self.embedding_func.embed_query(query) + res = self.similarity_search_with_score_by_vector( + embedding=embedding, k=k, param=param, expr=expr, timeout=timeout, **kwargs + ) + return res + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + param: Optional[dict] = None, + expr: Optional[str] = None, + timeout: Optional[int] = None, + **kwargs: Any, + ) -> List[Document]: + """Perform a similarity search against the query string.""" + res = self.similarity_search_with_score_by_vector( + embedding=embedding, k=k, param=param, expr=expr, timeout=timeout, **kwargs + ) + return [doc for doc, _ in res] + + def similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + param: Optional[dict] = None, + expr: Optional[str] = None, + timeout: Optional[int] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Perform a search on a query string and return results with score.""" + filter = None if expr is None else self.document.Filter(expr) + ef = 10 if param is None else param.get("ef", 10) + res: List[List[Dict]] = self.collection.search( + vectors=[embedding], + filter=filter, + params=self.document.HNSWSearchParams(ef=ef), + retrieve_vector=False, + limit=k, + timeout=timeout, + ) + # Organize results. + ret: List[Tuple[Document, float]] = [] + if res is None or len(res) == 0: + return ret + for result in res[0]: + meta = result.get(self.field_metadata) + if meta is not None: + meta = json.loads(meta) + doc = Document(page_content=result.get(self.field_text), metadata=meta) + pair = (doc, result.get("score", 0.0)) + ret.append(pair) + return ret + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + param: Optional[dict] = None, + expr: Optional[str] = None, + timeout: Optional[int] = None, + **kwargs: Any, + ) -> List[Document]: + """Perform a search and return results that are reordered by MMR.""" + embedding = self.embedding_func.embed_query(query) + return self.max_marginal_relevance_search_by_vector( + embedding=embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + param=param, + expr=expr, + timeout=timeout, + **kwargs, + ) + + def max_marginal_relevance_search_by_vector( + self, + embedding: list[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + param: Optional[dict] = None, + expr: Optional[str] = None, + timeout: Optional[int] = None, + **kwargs: Any, + ) -> List[Document]: + """Perform a search and return results that are reordered by MMR.""" + filter = None if expr is None else self.document.Filter(expr) + ef = 10 if param is None else param.get("ef", 10) + res: List[List[Dict]] = self.collection.search( + vectors=[embedding], + filter=filter, + params=self.document.HNSWSearchParams(ef=ef), + retrieve_vector=True, + limit=fetch_k, + timeout=timeout, + ) + # Organize results. + documents = [] + ordered_result_embeddings = [] + for result in res[0]: + meta = result.get(self.field_metadata) + if meta is not None: + meta = json.loads(meta) + doc = Document(page_content=result.get(self.field_text), metadata=meta) + documents.append(doc) + ordered_result_embeddings.append(result.get(self.field_vector)) + # Get the new order of results. + new_ordering = maximal_marginal_relevance( + np.array(embedding), ordered_result_embeddings, k=k, lambda_mult=lambda_mult + ) + # Reorder the values and return. 
+ ret = [] + for x in new_ordering: + # Function can return -1 index + if x == -1: + break + else: + ret.append(documents[x]) + return ret diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_tencentvectordb.py b/libs/langchain/tests/integration_tests/vectorstores/test_tencentvectordb.py new file mode 100644 index 00000000000..3cf2758763c --- /dev/null +++ b/libs/langchain/tests/integration_tests/vectorstores/test_tencentvectordb.py @@ -0,0 +1,93 @@ +"""Test TencentVectorDB functionality.""" +import time +from typing import List, Optional + +from langchain.docstore.document import Document +from langchain.vectorstores import TencentVectorDB +from langchain.vectorstores.tencentvectordb import ConnectionParams +from tests.integration_tests.vectorstores.fake_embeddings import ( + FakeEmbeddings, + fake_texts, +) + + +def _tencent_vector_db_from_texts( + metadatas: Optional[List[dict]] = None, drop: bool = True +) -> TencentVectorDB: + conn_params = ConnectionParams( + url="http://10.0.X.X", + key="eC4bLRy2va******************************", + username="root", + timeout=20, + ) + return TencentVectorDB.from_texts( + fake_texts, + FakeEmbeddings(), + metadatas=metadatas, + connection_params=conn_params, + drop_old=drop, + ) + + +def test_tencent_vector_db() -> None: + """Test end to end construction and search.""" + docsearch = _tencent_vector_db_from_texts() + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_tencent_vector_db_with_score() -> None: + """Test end to end construction and search with scores and IDs.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = _tencent_vector_db_from_texts(metadatas=metadatas) + output = docsearch.similarity_search_with_score("foo", k=3) + docs = [o[0] for o in output] + assert docs == [ + Document(page_content="foo", metadata={"page": 0}), + Document(page_content="bar", metadata={"page": 1}), + Document(page_content="baz", metadata={"page": 2}), + ] + + +def test_tencent_vector_db_max_marginal_relevance_search() -> None: + """Test end to end construction and MRR search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = _tencent_vector_db_from_texts(metadatas=metadatas) + output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) + assert output == [ + Document(page_content="foo", metadata={"page": 0}), + Document(page_content="bar", metadata={"page": 1}), + ] + + +def test_tencent_vector_db_add_extra() -> None: + """Test end to end construction and MRR search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = _tencent_vector_db_from_texts(metadatas=metadatas) + docsearch.add_texts(texts, metadatas) + time.sleep(3) + output = docsearch.similarity_search("foo", k=10) + assert len(output) == 6 + + +def test_tencent_vector_db_no_drop() -> None: + """Test end to end construction and MRR search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = _tencent_vector_db_from_texts(metadatas=metadatas) + del docsearch + docsearch = _tencent_vector_db_from_texts(metadatas=metadatas, drop=False) + time.sleep(3) + output = docsearch.similarity_search("foo", k=10) + assert len(output) == 6 + + +# if __name__ == "__main__": +# test_tencent_vector_db() +# test_tencent_vector_db_with_score() +# test_tencent_vector_db_max_marginal_relevance_search() +# 
test_tencent_vector_db_add_extra() +# test_tencent_vector_db_no_drop() From a28e888b36ba51ce15d0eaf707917dcb5983b575 Mon Sep 17 00:00:00 2001 From: Buckler89 Date: Wed, 30 Aug 2023 15:35:23 +0200 Subject: [PATCH 09/30] fix call _get_keys for custom_evaluator (#9763) In the function _load_run_evaluators the function _get_keys was not called if only custom_evaluators parameter is used - Description: In the function _load_run_evaluators the function _get_keys was not called if only custom_evaluators parameter is used, - Issue: no issue created for this yet, - Dependencies: None, - Tag maintainer: @vowelparrot, - Twitter handle: Buckler89 --------- Co-authored-by: ddroghini --- .../langchain/langchain/smith/evaluation/runner_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py index 438bc791400..f20f5cea4ba 100644 --- a/libs/langchain/langchain/smith/evaluation/runner_utils.py +++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py @@ -566,8 +566,13 @@ def _load_run_evaluators( eval_llm = config.eval_llm or ChatOpenAI(model="gpt-4", temperature=0.0) run_evaluators = [] input_key, prediction_key, reference_key = None, None, None - if config.evaluators or any( - [isinstance(e, EvaluatorType) for e in config.evaluators] + if ( + config.evaluators + or any([isinstance(e, EvaluatorType) for e in config.evaluators]) + or ( + config.custom_evaluators + and any([isinstance(e, StringEvaluator) for e in config.custom_evaluators]) + ) ): input_key, prediction_key, reference_key = _get_keys( config, run_inputs, run_outputs, example_outputs From 588237ef30ca665582005a346de2770db46b518a Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 30 Aug 2023 09:45:04 -0400 Subject: [PATCH 10/30] Make document serializable, create utility to create a docstore (#9674) This PR makes the following changes: 1. Documents become serializable using langhchain serialization 2. Make a utility to create a docstore kw store Will help to address issue here: https://github.com/langchain-ai/langchain/issues/9345 --- libs/langchain/langchain/schema/document.py | 5 ++ libs/langchain/langchain/storage/__init__.py | 3 + libs/langchain/langchain/storage/_lc_store.py | 88 +++++++++++++++++++ .../tests/unit_tests/storage/test_lc_store.py | 36 ++++++++ 4 files changed, 132 insertions(+) create mode 100644 libs/langchain/langchain/storage/_lc_store.py create mode 100644 libs/langchain/tests/unit_tests/storage/test_lc_store.py diff --git a/libs/langchain/langchain/schema/document.py b/libs/langchain/langchain/schema/document.py index feaa1acac0a..ccee67ca2a3 100644 --- a/libs/langchain/langchain/schema/document.py +++ b/libs/langchain/langchain/schema/document.py @@ -17,6 +17,11 @@ class Document(Serializable): documents, etc.). """ + @property + def lc_serializable(self) -> bool: + """Return whether or not the class is serializable.""" + return True + class BaseDocumentTransformer(ABC): """Abstract base class for document transformation systems. diff --git a/libs/langchain/langchain/storage/__init__.py b/libs/langchain/langchain/storage/__init__.py index ecc2e817f24..49a721b59ab 100644 --- a/libs/langchain/langchain/storage/__init__.py +++ b/libs/langchain/langchain/storage/__init__.py @@ -6,6 +6,7 @@ to a simple key-value interface. The primary goal of these storages is to support implementation of caching. 
""" +from langchain.storage._lc_store import create_kv_docstore, create_lc_store from langchain.storage.encoder_backed import EncoderBackedStore from langchain.storage.file_system import LocalFileStore from langchain.storage.in_memory import InMemoryStore @@ -16,4 +17,6 @@ __all__ = [ "InMemoryStore", "LocalFileStore", "RedisStore", + "create_lc_store", + "create_kv_docstore", ] diff --git a/libs/langchain/langchain/storage/_lc_store.py b/libs/langchain/langchain/storage/_lc_store.py new file mode 100644 index 00000000000..be528e77480 --- /dev/null +++ b/libs/langchain/langchain/storage/_lc_store.py @@ -0,0 +1,88 @@ +"""Create a key-value store for any langchain serializable object.""" +from typing import Callable, Optional + +from langchain.load.dump import dumps +from langchain.load.load import loads +from langchain.load.serializable import Serializable +from langchain.schema import BaseStore, Document +from langchain.storage.encoder_backed import EncoderBackedStore + + +def _dump_as_bytes(obj: Serializable) -> bytes: + """Return a bytes representation of a document.""" + return dumps(obj).encode("utf-8") + + +def _dump_document_as_bytes(obj: Document) -> bytes: + """Return a bytes representation of a document.""" + if not isinstance(obj, Document): + raise TypeError("Expected a Document instance") + return dumps(obj).encode("utf-8") + + +def _load_document_from_bytes(serialized: bytes) -> Document: + """Return a document from a bytes representation.""" + obj = loads(serialized.decode("utf-8")) + if not isinstance(obj, Document): + raise TypeError(f"Expected a Document instance. Got {type(obj)}") + return obj + + +def _load_from_bytes(serialized: bytes) -> Serializable: + """Return a document from a bytes representation.""" + return loads(serialized.decode("utf-8")) + + +def _identity(x: str) -> str: + """Return the same object.""" + return x + + +# PUBLIC API + + +def create_lc_store( + store: BaseStore[str, bytes], + *, + key_encoder: Optional[Callable[[str], str]] = None, +) -> BaseStore[str, Serializable]: + """Create a store for langchain serializable objects from a bytes store. + + Args: + store: A bytes store to use as the underlying store. + key_encoder: A function to encode keys; if None uses identity function. + + Returns: + A key-value store for documents. + """ + return EncoderBackedStore( + store, + key_encoder or _identity, + _dump_as_bytes, + _load_from_bytes, + ) + + +def create_kv_docstore( + store: BaseStore[str, bytes], + *, + key_encoder: Optional[Callable[[str], str]] = None, +) -> BaseStore[str, Document]: + """Create a store for langchain Document objects from a bytes store. + + This store does run time type checking to ensure that the values are + Document objects. + + Args: + store: A bytes store to use as the underlying store. + key_encoder: A function to encode keys; if None uses identity function. + + Returns: + A key-value store for documents. 
+ """ + return EncoderBackedStore( + store, + key_encoder or _identity, + _dump_document_as_bytes, + _load_document_from_bytes, + ) diff --git a/libs/langchain/tests/unit_tests/storage/test_lc_store.py b/libs/langchain/tests/unit_tests/storage/test_lc_store.py new file mode 100644 index 00000000000..5d15683ac31 --- /dev/null +++ b/libs/langchain/tests/unit_tests/storage/test_lc_store.py @@ -0,0 +1,36 @@ +import tempfile +from typing import Generator, cast + +import pytest + +from langchain.schema import Document +from langchain.storage._lc_store import create_kv_docstore, create_lc_store +from langchain.storage.file_system import LocalFileStore + + +@pytest.fixture +def file_store() -> Generator[LocalFileStore, None, None]: + # Create a temporary directory for testing + with tempfile.TemporaryDirectory() as temp_dir: + # Instantiate the LocalFileStore with the temporary directory as the root path + store = LocalFileStore(temp_dir) + yield store + + +def test_create_lc_store(file_store: LocalFileStore) -> None: + """Test that a docstore is created from a base store.""" + docstore = create_lc_store(file_store) + docstore.mset([("key1", Document(page_content="hello", metadata={"key": "value"}))]) + fetched_doc = cast(Document, docstore.mget(["key1"])[0]) + assert fetched_doc.page_content == "hello" + assert fetched_doc.metadata == {"key": "value"} + + +def test_create_kv_store(file_store: LocalFileStore) -> None: + """Test that a docstore is created from a base store.""" + docstore = create_kv_docstore(file_store) + docstore.mset([("key1", Document(page_content="hello", metadata={"key": "value"}))]) + fetched_doc = docstore.mget(["key1"])[0] + assert isinstance(fetched_doc, Document) + assert fetched_doc.page_content == "hello" + assert fetched_doc.metadata == {"key": "value"} From 24c0b01c38233dd81a61018869a5395a5e6a2a93 Mon Sep 17 00:00:00 2001 From: Guy Korland Date: Wed, 30 Aug 2023 17:13:18 +0300 Subject: [PATCH 11/30] Extend the FalkorDB QA demo (#9992) - Description: Extend the FalkorDB QA demo - Tag maintainer: @baskaryan --- .../more/graph/graph_falkordb_qa.ipynb | 168 +++++++++++++++--- .../langchain/graphs/falkordb_graph.py | 6 +- 2 files changed, 149 insertions(+), 25 deletions(-) diff --git a/docs/extras/use_cases/more/graph/graph_falkordb_qa.ipynb b/docs/extras/use_cases/more/graph/graph_falkordb_qa.ipynb index c43e2329750..9edb18785f0 100644 --- a/docs/extras/use_cases/more/graph/graph_falkordb_qa.ipynb +++ b/docs/extras/use_cases/more/graph/graph_falkordb_qa.ipynb @@ -33,6 +33,13 @@ "from langchain.chains import FalkorDBQAChain" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a graph connection and insert some demo data." 
+ ] + }, { "cell_type": "code", "execution_count": 2, @@ -44,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -53,38 +60,72 @@ "[]" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "graph.query(\n", - " \"\"\"\n", - "MERGE (m:Movie {name:\"Top Gun\"})\n", - "WITH m\n", - "UNWIND [\"Tom Cruise\", \"Val Kilmer\", \"Anthony Edwards\", \"Meg Ryan\"] AS actor\n", - "MERGE (a:Actor {name:actor})\n", - "MERGE (a)-[:ACTED_IN]->(m)\n", - "\"\"\"\n", - ")" + "graph.query(\"\"\"\n", + " CREATE \n", + " (al:Person {name: 'Al Pacino', birthDate: '1940-04-25'}),\n", + " (robert:Person {name: 'Robert De Niro', birthDate: '1943-08-17'}),\n", + " (tom:Person {name: 'Tom Cruise', birthDate: '1962-07-3'}),\n", + " (val:Person {name: 'Val Kilmer', birthDate: '1959-12-31'}),\n", + " (anthony:Person {name: 'Anthony Edwards', birthDate: '1962-7-19'}),\n", + " (meg:Person {name: 'Meg Ryan', birthDate: '1961-11-19'}),\n", + "\n", + " (god1:Movie {title: 'The Godfather'}),\n", + " (god2:Movie {title: 'The Godfather: Part II'}),\n", + " (god3:Movie {title: 'The Godfather Coda: The Death of Michael Corleone'}),\n", + " (top:Movie {title: 'Top Gun'}),\n", + "\n", + " (al)-[:ACTED_IN]->(god1),\n", + " (al)-[:ACTED_IN]->(god2),\n", + " (al)-[:ACTED_IN]->(god3),\n", + " (robert)-[:ACTED_IN]->(god2),\n", + " (tom)-[:ACTED_IN]->(top),\n", + " (val)-[:ACTED_IN]->(top),\n", + " (anthony)-[:ACTED_IN]->(top),\n", + " (meg)-[:ACTED_IN]->(top)\n", + "\"\"\")" ] }, { - "cell_type": "code", - "execution_count": 4, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "graph.refresh_schema()\n", - "import os\n", - "os.environ['OPENAI_API_KEY']='API_KEY_HERE'\n" + "### Creating FalkorDBQAChain" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Node properties: [[OrderedDict([('label', None), ('properties', ['name', 'birthDate', 'title'])])]]\n", + "Relationships properties: [[OrderedDict([('type', None), ('properties', [])])]]\n", + "Relationships: [['(:Person)-[:ACTED_IN]->(:Movie)']]\n", + "\n" + ] + } + ], + "source": [ + "graph.refresh_schema()\n", + "print(graph.schema)\n", + "\n", + "import os\n", + "os.environ['OPENAI_API_KEY']='API_KEY_HERE'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "chain = FalkorDBQAChain.from_llm(\n", @@ -92,9 +133,16 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Querying the graph" + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -105,10 +153,11 @@ "\n", "\u001b[1m> Entering new FalkorDBQAChain chain...\u001b[0m\n", "Generated Cypher:\n", - "\u001b[32;1m\u001b[1;3mMATCH (:Movie {title: 'Top Gun'})<-[:ACTED_IN]-(actor:Person)\n", - "RETURN actor.name AS output\u001b[0m\n", + "\u001b[32;1m\u001b[1;3mMATCH (p:Person)-[:ACTED_IN]->(m:Movie)\n", + "WHERE m.title = 'Top Gun'\n", + "RETURN p.name\u001b[0m\n", "Full Context:\n", - "\u001b[32;1m\u001b[1;3m[]\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m[['Tom Cruise'], ['Val Kilmer'], ['Anthony Edwards'], ['Meg Ryan'], ['Tom Cruise'], ['Val Kilmer'], ['Anthony Edwards'], ['Meg Ryan']]\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] @@ -116,7 +165,7 @@ { "data": { "text/plain": [ - "'The actor who played in Top Gun is Tom Cruise.'" + "'Tom 
Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'" ] }, "execution_count": 7, @@ -127,6 +176,81 @@ "source": [ "chain.run(\"Who played in Top Gun?\")" ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new FalkorDBQAChain chain...\u001b[0m\n", + "Generated Cypher:\n", + "\u001b[32;1m\u001b[1;3mMATCH (p:Person)-[r:ACTED_IN]->(m:Movie)\n", + "WHERE m.title = 'The Godfather: Part II'\n", + "RETURN p.name\n", + "ORDER BY p.birthDate ASC\n", + "LIMIT 1\u001b[0m\n", + "Full Context:\n", + "\u001b[32;1m\u001b[1;3m[['Al Pacino']]\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'The oldest actor who played in The Godfather: Part II is Al Pacino.'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.run(\"Who is the oldest actor who played in The Godfather: Part II?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new FalkorDBQAChain chain...\u001b[0m\n", + "Generated Cypher:\n", + "\u001b[32;1m\u001b[1;3mMATCH (p:Person {name: 'Robert De Niro'})-[:ACTED_IN]->(m:Movie)\n", + "RETURN m.title\u001b[0m\n", + "Full Context:\n", + "\u001b[32;1m\u001b[1;3m[['The Godfather: Part II'], ['The Godfather: Part II']]\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'Robert De Niro played in \"The Godfather: Part II\".'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.run(\"Robert De Niro played in which movies?\")" + ] } ], "metadata": { diff --git a/libs/langchain/langchain/graphs/falkordb_graph.py b/libs/langchain/langchain/graphs/falkordb_graph.py index 189189f6414..f74281b3c18 100644 --- a/libs/langchain/langchain/graphs/falkordb_graph.py +++ b/libs/langchain/langchain/graphs/falkordb_graph.py @@ -52,9 +52,9 @@ class FalkorDBGraph: def refresh_schema(self) -> None: """Refreshes the schema of the FalkorDB database""" self.schema = ( - f"Node properties: {node_properties_query}\n" - f"Relationships properties: {rel_properties_query}\n" - f"Relationships: {rel_query}\n" + f"Node properties: {self.query(node_properties_query)}\n" + f"Relationships properties: {self.query(rel_properties_query)}\n" + f"Relationships: {self.query(rel_query)}\n" ) def query(self, query: str, params: dict = {}) -> List[Dict[str, Any]]: From 9870bfb9cdaab4bd6a61797a52a8141aab58fde8 Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Wed, 30 Aug 2023 17:03:24 +0200 Subject: [PATCH 12/30] Add bucket and object key to metadata in S3 loader (#9317) - Description: this PR adds `s3_object_key` and `s3_bucket` to the doc metadata when loading an S3 file. This is particularly useful when using `S3DirectoryLoader` to remove the files from the dir once they have been processed (getting the object keys from the metadata `source` field seems brittle) - Dependencies: N/A - Tag maintainer: ? 
- Twitter handle: _cbornet --------- Co-authored-by: Eugene Yurtsev --- .../document_loaders/aws_s3_directory.ipynb | 2 +- .../document_loaders/aws_s3_file.ipynb | 3 ++- .../langchain/document_loaders/s3_file.py | 19 +++++++++++-------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb b/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb index eb21a2a3d19..683408995b4 100644 --- a/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb +++ b/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb @@ -90,7 +90,7 @@ { "data": { "text/plain": [ - "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]" + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]" ] }, "execution_count": 6, diff --git a/docs/extras/integrations/document_loaders/aws_s3_file.ipynb b/docs/extras/integrations/document_loaders/aws_s3_file.ipynb index ecf20098565..4646e4101e1 100644 --- a/docs/extras/integrations/document_loaders/aws_s3_file.ipynb +++ b/docs/extras/integrations/document_loaders/aws_s3_file.ipynb @@ -53,7 +53,7 @@ { "data": { "text/plain": [ - "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpxvave6wl/fake.docx'}, lookup_index=0)]" + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]" ] }, "execution_count": 9, @@ -96,3 +96,4 @@ "nbformat": 4, "nbformat_minor": 5 } + diff --git a/libs/langchain/langchain/document_loaders/s3_file.py b/libs/langchain/langchain/document_loaders/s3_file.py index 509b1ea1eec..2e0b56700e8 100644 --- a/libs/langchain/langchain/document_loaders/s3_file.py +++ b/libs/langchain/langchain/document_loaders/s3_file.py @@ -2,12 +2,10 @@ import os import tempfile from typing import List -from langchain.docstore.document import Document -from langchain.document_loaders.base import BaseLoader -from langchain.document_loaders.unstructured import UnstructuredFileLoader +from langchain.document_loaders.unstructured import UnstructuredBaseLoader -class S3FileLoader(BaseLoader): +class S3FileLoader(UnstructuredBaseLoader): """Load from `Amazon AWS S3` file.""" def __init__(self, bucket: str, key: str): @@ -17,11 +15,14 @@ class S3FileLoader(BaseLoader): bucket: The name of the S3 bucket. key: The key of the S3 object. 
""" + super().__init__() self.bucket = bucket self.key = key - def load(self) -> List[Document]: - """Load documents.""" + def _get_elements(self) -> List: + """Get elements.""" + from unstructured.partition.auto import partition + try: import boto3 except ImportError: @@ -34,5 +35,7 @@ class S3FileLoader(BaseLoader): file_path = f"{temp_dir}/{self.key}" os.makedirs(os.path.dirname(file_path), exist_ok=True) s3.download_file(self.bucket, self.key, file_path) - loader = UnstructuredFileLoader(file_path) - return loader.load() + return partition(filename=file_path) + + def _get_metadata(self) -> dict: + return {"source": f"s3://{self.bucket}/{self.key}"} From 9828701de1e9af0802d6558a97131eb4b9bf2193 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Wed, 30 Aug 2023 08:10:51 -0700 Subject: [PATCH 13/30] mv base cache to schema (#9953) if you remove all other imports from langchain.init it exposes a circular dep --- libs/langchain/langchain/__init__.py | 2 +- libs/langchain/langchain/cache.py | 25 +++------------------ libs/langchain/langchain/schema/__init__.py | 2 ++ libs/langchain/langchain/schema/cache.py | 24 ++++++++++++++++++++ 4 files changed, 30 insertions(+), 23 deletions(-) create mode 100644 libs/langchain/langchain/schema/cache.py diff --git a/libs/langchain/langchain/__init__.py b/libs/langchain/langchain/__init__.py index 32ad552ecc5..721769d0e25 100644 --- a/libs/langchain/langchain/__init__.py +++ b/libs/langchain/langchain/__init__.py @@ -4,7 +4,6 @@ from importlib import metadata from typing import Optional from langchain.agents import MRKLChain, ReActChain, SelfAskWithSearchChain -from langchain.cache import BaseCache from langchain.chains import ( ConversationChain, LLMBashChain, @@ -40,6 +39,7 @@ from langchain.prompts import ( Prompt, PromptTemplate, ) +from langchain.schema.cache import BaseCache from langchain.schema.prompt_template import BasePromptTemplate from langchain.utilities.arxiv import ArxivAPIWrapper from langchain.utilities.golden_query import GoldenQueryAPIWrapper diff --git a/libs/langchain/langchain/cache.py b/libs/langchain/langchain/cache.py index 3160fbae6a5..4364c33e1a4 100644 --- a/libs/langchain/langchain/cache.py +++ b/libs/langchain/langchain/cache.py @@ -26,7 +26,6 @@ import inspect import json import logging import warnings -from abc import ABC, abstractmethod from datetime import timedelta from typing import ( TYPE_CHECKING, @@ -35,7 +34,6 @@ from typing import ( Dict, List, Optional, - Sequence, Tuple, Type, Union, @@ -46,17 +44,18 @@ from sqlalchemy import Column, Integer, String, create_engine, select from sqlalchemy.engine.base import Engine from sqlalchemy.orm import Session -from langchain.utils import get_from_env - try: from sqlalchemy.orm import declarative_base except ImportError: from sqlalchemy.ext.declarative import declarative_base + from langchain.embeddings.base import Embeddings from langchain.load.dump import dumps from langchain.load.load import loads from langchain.schema import ChatGeneration, Generation +from langchain.schema.cache import RETURN_VAL_TYPE, BaseCache +from langchain.utils import get_from_env from langchain.vectorstores.redis import Redis as RedisVectorstore logger = logging.getLogger(__file__) @@ -64,8 +63,6 @@ logger = logging.getLogger(__file__) if TYPE_CHECKING: import momento -RETURN_VAL_TYPE = Sequence[Generation] - def _hash(_input: str) -> str: """Use a deterministic hashing approach.""" @@ -105,22 +102,6 @@ def _load_generations_from_json(generations_json: 
str) -> RETURN_VAL_TYPE: ) -class BaseCache(ABC): - """Base interface for cache.""" - - @abstractmethod - def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]: - """Look up based on prompt and llm_string.""" - - @abstractmethod - def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None: - """Update cache based on prompt and llm_string.""" - - @abstractmethod - def clear(self, **kwargs: Any) -> None: - """Clear cache that can take additional keyword arguments.""" - - class InMemoryCache(BaseCache): """Cache that stores things in memory.""" diff --git a/libs/langchain/langchain/schema/__init__.py b/libs/langchain/langchain/schema/__init__.py index aec8a1e3919..be830b10aa9 100644 --- a/libs/langchain/langchain/schema/__init__.py +++ b/libs/langchain/langchain/schema/__init__.py @@ -1,5 +1,6 @@ """**Schemas** are the LangChain Base Classes and Interfaces.""" from langchain.schema.agent import AgentAction, AgentFinish +from langchain.schema.cache import BaseCache from langchain.schema.chat_history import BaseChatMessageHistory from langchain.schema.document import BaseDocumentTransformer, Document from langchain.schema.exceptions import LangChainException @@ -39,6 +40,7 @@ RUN_KEY = "__run" Memory = BaseMemory __all__ = [ + "BaseCache", "BaseMemory", "BaseStore", "AgentFinish", diff --git a/libs/langchain/langchain/schema/cache.py b/libs/langchain/langchain/schema/cache.py new file mode 100644 index 00000000000..7adb07fd1db --- /dev/null +++ b/libs/langchain/langchain/schema/cache.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Optional, Sequence + +from langchain.schema.output import Generation + +RETURN_VAL_TYPE = Sequence[Generation] + + +class BaseCache(ABC): + """Base interface for cache.""" + + @abstractmethod + def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]: + """Look up based on prompt and llm_string.""" + + @abstractmethod + def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None: + """Update cache based on prompt and llm_string.""" + + @abstractmethod + def clear(self, **kwargs: Any) -> None: + """Clear cache that can take additional keyword arguments.""" From b3e3a31240526de5089d3489f9f593094f9506d8 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Wed, 30 Aug 2023 08:29:51 -0700 Subject: [PATCH 14/30] bump 277 (#9997) --- libs/langchain/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index e38c5721ce0..77899ffbba1 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain" -version = "0.0.276" +version = "0.0.277" description = "Building applications with LLMs through composability" authors = [] license = "MIT" From 98cce7dcd3b5f971bbaa02e19e4f53a5d324cd06 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Wed, 30 Aug 2023 10:34:25 -0700 Subject: [PATCH 15/30] update moderation docs (#10002) --- docs/docs_skeleton/docs/guides/safety/index.mdx | 3 ++- .../guides/safety/amazon_comprehend_chain.ipynb | 0 2 files changed, 2 insertions(+), 1 deletion(-) rename docs/{docs_skeleton/docs => extras}/guides/safety/amazon_comprehend_chain.ipynb (100%) diff --git a/docs/docs_skeleton/docs/guides/safety/index.mdx b/docs/docs_skeleton/docs/guides/safety/index.mdx index 
1f01245d1e8..029eea7d441 100644 --- a/docs/docs_skeleton/docs/guides/safety/index.mdx +++ b/docs/docs_skeleton/docs/guides/safety/index.mdx @@ -1,6 +1,7 @@ -# Preventing harmful outputs +# Moderation One of the key concerns with using LLMs is that they may generate harmful or unethical text. This is an area of active research in the field. Here we present some built-in chains inspired by this research, which are intended to make the outputs of LLMs safer. - [Moderation chain](/docs/guides/safety/moderation): Explicitly check if any output text is harmful and flag it. - [Constitutional chain](/docs/guides/safety/constitutional_chain): Prompt the model with a set of principles which should guide it's behavior. +- [Amazon Comprehend moderation chain](/docs/guides/safety/amazon_comprehend_chain): Use [Amazon Comprehend](https://aws.amazon.com/comprehend/) to detect and handle PII and toxicity. diff --git a/docs/docs_skeleton/docs/guides/safety/amazon_comprehend_chain.ipynb b/docs/extras/guides/safety/amazon_comprehend_chain.ipynb similarity index 100% rename from docs/docs_skeleton/docs/guides/safety/amazon_comprehend_chain.ipynb rename to docs/extras/guides/safety/amazon_comprehend_chain.ipynb From a8f804a61820e0aefa0b80dea634e3e23710f89f Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <142261444+maks-operlejn-ds@users.noreply.github.com> Date: Wed, 30 Aug 2023 19:39:44 +0200 Subject: [PATCH 16/30] Add data anonymizer (#9863) ### Description The feature for anonymizing data has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. Anonymization consists of two steps: 1. **Identification:** Identify all data fields that contain personally identifiable information (PII). 2. **Replacement**: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data. We use *Microsoft Presidio* together with *Faker* framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`. ### Future works - **deanonymization** - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able to restore the original data (a minimal sketch of this idea follows this list). - **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.
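As a rough illustration of the deanonymization idea above (not part of this PR): the sketch below assumes the `PresidioAnonymizer` API introduced in this patch and assumes that presidio-anonymizer's `custom` operator lambdas receive the original entity text as their argument. The helper names `_replace_and_remember` and `deanonymize` are hypothetical, chosen only for the example; a custom operator records every fake-for-real substitution so that the mapping can be applied in reverse to the LLM output.

```python
from faker import Faker
from presidio_anonymizer.entities import OperatorConfig

from langchain_experimental.data_anonymizer import PresidioAnonymizer

fake = Faker()
deanonymizer_mapping = {}  # fake value -> original value


def _replace_and_remember(original: str, fake_value: str) -> str:
    # Record the substitution so it can be undone after the LLM call.
    deanonymizer_mapping[fake_value] = original
    return fake_value


anonymizer = PresidioAnonymizer(
    analyzed_fields=["PERSON"],
    operators={
        "PERSON": OperatorConfig(
            "custom",
            {"lambda": lambda original: _replace_and_remember(original, fake.name())},
        )
    },
)

anonymized = anonymizer.anonymize("My name is Slim Shady")


def deanonymize(text: str) -> str:
    # Reverse every recorded substitution in the (possibly LLM-generated) text.
    for fake_value, original in deanonymizer_mapping.items():
        text = text.replace(fake_value, original)
    return text


# After sending `anonymized` through an LLMChain, map the fake values back:
assert deanonymize(anonymized) == "My name is Slim Shady"
```

A real implementation would likely need a per-session mapping and instance-aware replacement, which ties into the second future-work item.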
### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp Co-authored-by: Bagatur --- .../workflows/langchain_experimental_ci.yml | 32 + .../extras/use_cases/data_anonymization.ipynb | 485 +++++++++++ libs/experimental/Makefile | 3 + .../data_anonymizer/__init__.py | 4 + .../data_anonymizer/base.py | 17 + .../data_anonymizer/faker_presidio_mapping.py | 40 + .../data_anonymizer/presidio.py | 91 ++ libs/experimental/poetry.lock | 776 +++++++++++++++++- libs/experimental/pyproject.toml | 13 + .../tests/unit_tests/test_data_anonymizer.py | 84 ++ 10 files changed, 1543 insertions(+), 2 deletions(-) create mode 100644 docs/extras/use_cases/data_anonymization.ipynb create mode 100644 libs/experimental/langchain_experimental/data_anonymizer/__init__.py create mode 100644 libs/experimental/langchain_experimental/data_anonymizer/base.py create mode 100644 libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py create mode 100644 libs/experimental/langchain_experimental/data_anonymizer/presidio.py create mode 100644 libs/experimental/tests/unit_tests/test_data_anonymizer.py diff --git a/.github/workflows/langchain_experimental_ci.yml b/.github/workflows/langchain_experimental_ci.yml index c62ff18b354..5b00365f82f 100644 --- a/.github/workflows/langchain_experimental_ci.yml +++ b/.github/workflows/langchain_experimental_ci.yml @@ -81,3 +81,35 @@ jobs: - name: Run tests run: make test + extended-tests: + runs-on: ubuntu-latest + defaults: + run: + working-directory: ${{ env.WORKDIR }} + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + name: Python ${{ matrix.python-version }} extended tests + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }} + uses: "./.github/actions/poetry_setup" + with: + python-version: ${{ matrix.python-version }} + poetry-version: ${{ env.POETRY_VERSION }} + working-directory: libs/experimental + cache-key: extended + + - name: Install dependencies + shell: bash + run: | + echo "Running extended tests, installing dependencies with poetry..." + poetry install -E extended_testing + + - name: Run extended tests + run: make extended_tests diff --git a/docs/extras/use_cases/data_anonymization.ipynb b/docs/extras/use_cases/data_anonymization.ipynb new file mode 100644 index 00000000000..4955406cf35 --- /dev/null +++ b/docs/extras/use_cases/data_anonymization.ipynb @@ -0,0 +1,485 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data anonymization\n", + "\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/data_anonymization.ipynb)\n", + "\n", + "## Use case\n", + "\n", + "Data anonymization is crucial before passing information to a language model like GPT-4 because it helps protect privacy and maintain confidentiality. If data is not anonymized, sensitive information such as names, addresses, contact numbers, or other identifiers linked to specific individuals could potentially be learned and misused. Hence, by obscuring or removing this personally identifiable information (PII), data can be used freely without compromising individuals' privacy rights or breaching data protection laws and regulations.\n", + "\n", + "## Overview\n", + "\n", + "Anonynization consists of two steps:\n", + "\n", + "1. 
**Identification:** Identify all data fields that contain personally identifiable information (PII).\n", + "2. **Replacement**: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data.\n", + "\n", + "We use *Microsoft Presidio* together with *Faker* framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`.\n", + "\n", + "## Quickstart\n", + "\n", + "Below you will find the use case on how to leverage anonymization in LangChain." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages\n", + "# ! pip install langchain langchain-experimental openai\n", + "# ! python -m spacy download en_core_web_lg" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\\n", + "Let's see how PII anonymization works using a sample sentence:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My name is Marie Santos, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioAnonymizer\n", + "\n", + "anonymizer = PresidioAnonymizer(analyzed_fields=[\"PERSON\"])\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\\n", + "As can be observed, the name was correctly identified and replaced with another. The `analyzed_fields` attribute is responsible for what values are to be detected and substituted. We can add *PHONE_NUMBER* to the list:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My name is Victoria Mckinney, call me at 713-549-8623 or email me at real.slim.shady@gmail.com'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer = PresidioAnonymizer(analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\"])\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\\n", + "If no analyzed_fields are specified, by default the anonymizer will detect all supported formats. Below is the full list of them:\n", + "\n", + "`['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT', 'US_SSN']`\n", + "\n", + "**Disclaimer:** We suggest carefully defining the private data to be detected - Presidio doesn't work perfectly and it sometimes makes mistakes, so it's better to have more control over the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My name is Billy Russo, call me at 970-996-9453x038 or email me at jamie80@example.org'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer = PresidioAnonymizer()\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\\n", + "It may be that the above list of detected fields is not sufficient. For example, the already available *PHONE_NUMBER* field does not support polish phone numbers and confuses it with another field:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My polish phone number is EVIA70648911396944'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer = PresidioAnonymizer()\n", + "anonymizer.anonymize(\"My polish phone number is 666555444\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\\n", + "You can then write your own recognizers and add them to the pool of those present. How exactly to create recognizers is described in the [Presidio documentation](https://microsoft.github.io/presidio/samples/python/customizing_presidio_analyzer/)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the regex pattern in a Presidio `Pattern` object:\n", + "from presidio_analyzer import Pattern, PatternRecognizer\n", + "\n", + "\n", + "polish_phone_numbers_pattern = Pattern(\n", + " name=\"polish_phone_numbers_pattern\",\n", + " regex=\"(?\n", + "My polish phone number is \n", + "My polish phone number is \n" + ] + } + ], + "source": [ + "print(anonymizer.anonymize(\"My polish phone number is 666555444\"))\n", + "print(anonymizer.anonymize(\"My polish phone number is 666 555 444\"))\n", + "print(anonymizer.anonymize(\"My polish phone number is +48 666 555 444\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\\n", + "The problem is - even though we recognize polish phone numbers now, we don't have a method (operator) that would tell how to substitute a given field - because of this, in the outpit we only provide string `` We need to create a method to replace it correctly: " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'+48 533 220 543'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from faker import Faker\n", + "\n", + "fake = Faker(locale=\"pl_PL\")\n", + "\n", + "\n", + "def fake_polish_phone_number(_=None):\n", + " return fake.phone_number()\n", + "\n", + "\n", + "fake_polish_phone_number()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\\n", + "We used Faker to create pseudo data. Now we can create an operator and add it to the anonymizer. For complete information about operators and their creation, see the Presidio documentation for [simple](https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/) and [custom](https://microsoft.github.io/presidio/tutorial/11_custom_anonymization/) anonymization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from presidio_anonymizer.entities import OperatorConfig\n", + "\n", + "new_operators = {\n", + " \"POLISH_PHONE_NUMBER\": OperatorConfig(\n", + " \"custom\", {\"lambda\": fake_polish_phone_number}\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "anonymizer.add_operators(new_operators)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My polish phone number is +48 692 715 636'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer.anonymize(\"My polish phone number is 666555444\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\\n", + "Finally, it is worth showing how to implement anonymizer as a chain. Since anonymization is based on string operations, we can use `TransformChain` for this:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'text': 'You can find our super secret data at https://supersecretdata.com',\n", + " 'anonymized_text': 'You can find our super secret data at https://www.fox.org/'}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.chains.transform import TransformChain\n", + "\n", + "anonymizer = PresidioAnonymizer()\n", + "\n", + "\n", + "def anonymize_func(inputs: dict) -> dict:\n", + " text = inputs[\"text\"]\n", + " return {\"anonymized_text\": anonymizer.anonymize(text)}\n", + "\n", + "\n", + "anonymize_chain = TransformChain(\n", + " input_variables=[\"text\"],\n", + " output_variables=[\"anonymized_text\"],\n", + " transform=anonymize_func,\n", + ")\n", + "\n", + "anonymize_chain(\"You can find our super secret data at https://supersecretdata.com\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\\n", + "Later, you can, for example, use such anonymization as part of chain sequence. We will use `LangChain Expression Language` ([learn more here](https://python.langchain.com/docs/guides/expression_language/)) for composing these chains together, as shown below:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ! 
pip install openai\n", + "\n", + "# Set env var OPENAI_API_KEY or load from a .env file:\n", + "import dotenv\n", + "\n", + "dotenv.load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'anonymized_text': StringPromptValue(text='According to this text, where can you find our super secret data?\\n\\nYou can find our super secret data at https://evans-summers.info/\\n\\nAnswer:'),\n", + " 'text': ' https://evans-summers.info/'}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from operator import itemgetter\n", + "from langchain.prompts.prompt import PromptTemplate\n", + "from langchain.chains.llm import LLMChain\n", + "from langchain.llms.openai import OpenAI\n", + "\n", + "template = \"\"\"According to this text, where can you find our super secret data?\n", + "\n", + "{anonymized_text}\n", + "\n", + "Answer:\"\"\"\n", + "prompt = PromptTemplate(input_variables=[\"anonymized_text\"], template=template)\n", + "llm_chain = LLMChain(llm=OpenAI(), prompt=prompt)\n", + "\n", + "\n", + "chain = (\n", + " anonymize_chain\n", + " | {\"anonymized_text\": itemgetter(\"anonymized_text\")}\n", + " | prompt\n", + " | llm_chain\n", + ")\n", + "chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Future works\n", + "\n", + "- **deanonymization** - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able restore the original data.\n", + "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/libs/experimental/Makefile b/libs/experimental/Makefile index ede2c85b47f..a8b926d8f26 100644 --- a/libs/experimental/Makefile +++ b/libs/experimental/Makefile @@ -15,6 +15,9 @@ tests: test_watch: poetry run ptw --now . 
-- tests/unit_tests +extended_tests: + poetry run pytest --only-extended tests/unit_tests + ###################### # LINTING AND FORMATTING diff --git a/libs/experimental/langchain_experimental/data_anonymizer/__init__.py b/libs/experimental/langchain_experimental/data_anonymizer/__init__.py new file mode 100644 index 00000000000..69babad859a --- /dev/null +++ b/libs/experimental/langchain_experimental/data_anonymizer/__init__.py @@ -0,0 +1,4 @@ +"""Data anonymizer package""" +from langchain_experimental.data_anonymizer.presidio import PresidioAnonymizer + +__all__ = ["PresidioAnonymizer"] diff --git a/libs/experimental/langchain_experimental/data_anonymizer/base.py b/libs/experimental/langchain_experimental/data_anonymizer/base.py new file mode 100644 index 00000000000..3f9905375e0 --- /dev/null +++ b/libs/experimental/langchain_experimental/data_anonymizer/base.py @@ -0,0 +1,17 @@ +from abc import ABC, abstractmethod + + +class AnonymizerBase(ABC): + """ + Base abstract class for anonymizers. + It is public and non-virtual because it allows + wrapping the behavior for all methods in a base class. + """ + + def anonymize(self, text: str) -> str: + """Anonymize text""" + return self._anonymize(text) + + @abstractmethod + def _anonymize(self, text: str) -> str: + """Abstract method to anonymize text""" diff --git a/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py new file mode 100644 index 00000000000..8db4f94c2fd --- /dev/null +++ b/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py @@ -0,0 +1,40 @@ +import string +from typing import Callable, Dict + + +def get_pseudoanonymizer_mapping() -> Dict[str, Callable]: + try: + from faker import Faker + except ImportError as e: + raise ImportError( + "Could not import faker, please install it with `pip install Faker`." 
+ ) from e + + fake = Faker() + + # Listed entities supported by Microsoft Presidio (for now, global and US only) + # Source: https://microsoft.github.io/presidio/supported_entities/ + return { + # Global entities + "PERSON": lambda _: fake.name(), + "EMAIL_ADDRESS": lambda _: fake.email(), + "PHONE_NUMBER": lambda _: fake.phone_number(), + "IBAN_CODE": lambda _: fake.iban(), + "CREDIT_CARD": lambda _: fake.credit_card_number(), + "CRYPTO": lambda _: "bc1" + + "".join( + fake.random_choices(string.ascii_lowercase + string.digits, length=26) + ), + "IP_ADDRESS": lambda _: fake.ipv4_public(), + "LOCATION": lambda _: fake.address(), + "DATE_TIME": lambda _: fake.iso8601(), + "NRP": lambda _: str(fake.random_number(digits=8, fix_len=True)), + "MEDICAL_LICENSE": lambda _: fake.bothify(text="??######").upper(), + "URL": lambda _: fake.url(), + # US-specific entities + "US_BANK_NUMBER": lambda _: fake.bban(), + "US_DRIVER_LICENSE": lambda _: str(fake.random_number(digits=9, fix_len=True)), + "US_ITIN": lambda _: fake.bothify(text="9##-7#-####"), + "US_PASSPORT": lambda _: fake.bothify(text="#####??").upper(), + "US_SSN": lambda _: fake.ssn(), + } diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py new file mode 100644 index 00000000000..544958ac153 --- /dev/null +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Dict, List, Optional + +from langchain_experimental.data_anonymizer.base import AnonymizerBase +from langchain_experimental.data_anonymizer.faker_presidio_mapping import ( + get_pseudoanonymizer_mapping, +) + +if TYPE_CHECKING: + from presidio_analyzer import EntityRecognizer + from presidio_anonymizer.entities import OperatorConfig + + +class PresidioAnonymizer(AnonymizerBase): + """Anonymizer using Microsoft Presidio.""" + + def __init__( + self, + analyzed_fields: Optional[List[str]] = None, + language: str = "en", + operators: Optional[Dict[str, OperatorConfig]] = None, + ): + """ + Args: + analyzed_fields: List of fields to detect and then anonymize. + Defaults to all entities supported by Microsoft Presidio. + language: Language to use for analysis. Defaults to english. + operators: Operators to use for anonymization. + Operators allow for custom anonymization of detected PII. + Learn more: + https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/ + """ + try: + from presidio_analyzer import AnalyzerEngine + except ImportError as e: + raise ImportError( + "Could not import presidio_analyzer, please install with " + "`pip install presidio-analyzer`. You will also need to download a " + "spaCy model to use the analyzer, e.g. " + "`python -m spacy download en_core_web_lg`." + ) from e + try: + from presidio_anonymizer import AnonymizerEngine + from presidio_anonymizer.entities import OperatorConfig + except ImportError as e: + raise ImportError( + "Could not import presidio_anonymizer, please install with " + "`pip install presidio-anonymizer`." 
+ ) from e + + self.analyzed_fields = ( + analyzed_fields + if analyzed_fields is not None + else list(get_pseudoanonymizer_mapping().keys()) + ) + self.language = language + self.operators = ( + operators + if operators is not None + else { + field: OperatorConfig( + operator_name="custom", params={"lambda": faker_function} + ) + for field, faker_function in get_pseudoanonymizer_mapping().items() + } + ) + self._analyzer = AnalyzerEngine() + self._anonymizer = AnonymizerEngine() + + def _anonymize(self, text: str) -> str: + results = self._analyzer.analyze( + text, + entities=self.analyzed_fields, + language=self.language, + ) + + return self._anonymizer.anonymize( + text, + analyzer_results=results, + operators=self.operators, + ).text + + def add_recognizer(self, recognizer: EntityRecognizer) -> None: + """Add a recognizer to the analyzer""" + self._analyzer.registry.add_recognizer(recognizer) + self.analyzed_fields.extend(recognizer.supported_entities) + + def add_operators(self, operators: Dict[str, OperatorConfig]) -> None: + """Add operators to the anonymizer""" + self.operators.update(operators) diff --git a/libs/experimental/poetry.lock b/libs/experimental/poetry.lock index 71b7d1b942d..b0d5b9139af 100644 --- a/libs/experimental/poetry.lock +++ b/libs/experimental/poetry.lock @@ -392,6 +392,60 @@ webencodings = "*" [package.extras] css = ["tinycss2 (>=1.1.0,<1.2)"] +[[package]] +name = "blis" +version = "0.7.10" +description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension." +optional = true +python-versions = "*" +files = [ + {file = "blis-0.7.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1fb4a9fca42d56533e28bf62b740f5c7d122e804742e5ea24b2704950151ae3c"}, + {file = "blis-0.7.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2167e656d6237443ef7d0cd7dcfbedc12fcd156c54112f2dc5ca9b0249ec835d"}, + {file = "blis-0.7.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a887165f2d7c08814dc92f96535232ca628e3e27927fb09cdeb8492781a28d04"}, + {file = "blis-0.7.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31a6a8c347ef764ef268b6e11ae7b47ce83aba7ea99fc9223f85543aaab09826"}, + {file = "blis-0.7.10-cp310-cp310-win_amd64.whl", hash = "sha256:67a17000e953d05f09a1ee7dad001c783ca5d5dc12e40dcfff049b86e74fed67"}, + {file = "blis-0.7.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:67c8270ea20cf7e9342e4e3ed8fd51123a5236b1aa35fa94fb2200a8e11d0081"}, + {file = "blis-0.7.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a86f1d2c6370d571dc88fc710416e8cab7dc6bb3a47ee9f27079ee34adf780d6"}, + {file = "blis-0.7.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:288247c424fd2bd3d43b750f1f54bba19fe2cbb11e5c028bc4762bc03bd54b9b"}, + {file = "blis-0.7.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2846d1a5116a5a1e4c09fa5c3cab6fbe13349c8036bc1c8746a738c556a751c4"}, + {file = "blis-0.7.10-cp311-cp311-win_amd64.whl", hash = "sha256:f5c4a7c0fa67fec5a06fb6c1656bf1b51e7ab414292a04d417512b1fb1247246"}, + {file = "blis-0.7.10-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec3e11e8ed6be18cf43152513bbfeabbc3f99a5d391786642fb7a14fb914ee61"}, + {file = "blis-0.7.10-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:148835c8c96ea4c8957111de0593a28e9044c5b0e4cbcc34b77d700394fa6f13"}, + {file = "blis-0.7.10-cp36-cp36m-win_amd64.whl", hash = 
"sha256:2df3d8703d23c39d8a0fb1e43be4681ec09f9010e08e9b35674fe799046c5fd5"}, + {file = "blis-0.7.10-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fa62e13631c89626365ccd2585a2be154847c5bbb30cfc2ea8fdcf4e83cedd69"}, + {file = "blis-0.7.10-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:adc7c70c5d482ce71c61a6008bcb44dfb15a0ac41ba176c59143f016658fa82d"}, + {file = "blis-0.7.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed4e31d32916f657842572b6640b235c5f2f679a70ec74808160b584c08399ce"}, + {file = "blis-0.7.10-cp37-cp37m-win_amd64.whl", hash = "sha256:9833fc44795c8d43617732df31a8eca9de3f54b181ff9f0008cc50356cc26d86"}, + {file = "blis-0.7.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0cca151d046f8b6b9d075b4f3a5ffee52993424b3080f0e0c2be419f20a477a7"}, + {file = "blis-0.7.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d3bb6c4b9ae45e88e6e69b46eca145858cb9b3cd0a43a6c6812fb34c5c80d871"}, + {file = "blis-0.7.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47c6a0230688ff7c29e31b78f0d207556044c0c84bb90e7c28b009a6765658c4"}, + {file = "blis-0.7.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:953dd85d4a8f79d4d69c17d27a0b783a5664aee0feafa33662199b7c78b0ee51"}, + {file = "blis-0.7.10-cp38-cp38-win_amd64.whl", hash = "sha256:ed181a90fef1edff76220cb883df65685aeca610a0abe22c91322a3300e1e89d"}, + {file = "blis-0.7.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:df7f746159d9ab11f427e00c72abe8de522c1671c7a33ca664739b2bd48b71c2"}, + {file = "blis-0.7.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dd7870a21aed12b25ec8692a75e6965e9451b1b7f2752e2cac4ae9f565d2de95"}, + {file = "blis-0.7.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4766e26721e37e028336b542c226eab9faf812ea2d89a1869531ed0cada6c359"}, + {file = "blis-0.7.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc8fac91353f20e747e130bc8d4010442c6700e4c7e5edc38d69bb844802ea81"}, + {file = "blis-0.7.10-cp39-cp39-win_amd64.whl", hash = "sha256:4329fef5b1050c88dbca6f7d87ecc02d56f09005afa60edf12d826d82544f88a"}, + {file = "blis-0.7.10.tar.gz", hash = "sha256:343e8b125784d70ff6e1f17a95ea71538705bf0bd3cc236a176d153590842647"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.15.0", markers = "python_version < \"3.9\""}, + {version = ">=1.19.0", markers = "python_version >= \"3.9\""}, +] + +[[package]] +name = "catalogue" +version = "2.0.9" +description = "Super lightweight function registries for your library" +optional = true +python-versions = ">=3.6" +files = [ + {file = "catalogue-2.0.9-py3-none-any.whl", hash = "sha256:5817ce97de17ace366a15eadd4987ac022b28f262006147549cdb3467265dc4d"}, + {file = "catalogue-2.0.9.tar.gz", hash = "sha256:d204c423ec436f2545341ec8a0e026ae033b3ce5911644f95e94d6b887cf631c"}, +] + [[package]] name = "certifi" version = "2023.7.22" @@ -607,6 +661,58 @@ lint = ["black (>=22.6.0)", "mdformat (>0.7)", "mdformat-gfm (>=0.3.5)", "ruff ( test = ["pytest"] typing = ["mypy (>=0.990)"] +[[package]] +name = "confection" +version = "0.1.1" +description = "The sweetest config system for Python" +optional = true +python-versions = ">=3.6" +files = [ + {file = "confection-0.1.1-py3-none-any.whl", hash = "sha256:d2d9e53a5a61395caae1ab09281bab17b08a23fa94aabd1cc24c134880d41c30"}, + {file = "confection-0.1.1.tar.gz", hash = "sha256:4678652fb4aab94f40631c853e2dd76a5a420205f877cb6a9f2459a44fd7aa29"}, +] + +[package.dependencies] +pydantic = 
">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +srsly = ">=2.4.0,<3.0.0" + +[[package]] +name = "cymem" +version = "2.0.7" +description = "Manage calls to calloc/free through Cython" +optional = true +python-versions = "*" +files = [ + {file = "cymem-2.0.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4981fc9182cc1fe54bfedf5f73bfec3ce0c27582d9be71e130c46e35958beef0"}, + {file = "cymem-2.0.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:42aedfd2e77aa0518a24a2a60a2147308903abc8b13c84504af58539c39e52a3"}, + {file = "cymem-2.0.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c183257dc5ab237b664f64156c743e788f562417c74ea58c5a3939fe2d48d6f6"}, + {file = "cymem-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d18250f97eeb13af2e8b19d3cefe4bf743b963d93320b0a2e729771410fd8cf4"}, + {file = "cymem-2.0.7-cp310-cp310-win_amd64.whl", hash = "sha256:864701e626b65eb2256060564ed8eb034ebb0a8f14ce3fbef337e88352cdee9f"}, + {file = "cymem-2.0.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:314273be1f143da674388e0a125d409e2721fbf669c380ae27c5cbae4011e26d"}, + {file = "cymem-2.0.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:df543a36e7000808fe0a03d92fd6cd8bf23fa8737c3f7ae791a5386de797bf79"}, + {file = "cymem-2.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e5e1b7de7952d89508d07601b9e95b2244e70d7ef60fbc161b3ad68f22815f8"}, + {file = "cymem-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2aa33f1dbd7ceda37970e174c38fd1cf106817a261aa58521ba9918156868231"}, + {file = "cymem-2.0.7-cp311-cp311-win_amd64.whl", hash = "sha256:10178e402bb512b2686b8c2f41f930111e597237ca8f85cb583ea93822ef798d"}, + {file = "cymem-2.0.7-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2971b7da5aa2e65d8fbbe9f2acfc19ff8e73f1896e3d6e1223cc9bf275a0207"}, + {file = "cymem-2.0.7-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85359ab7b490e6c897c04863704481600bd45188a0e2ca7375eb5db193e13cb7"}, + {file = "cymem-2.0.7-cp36-cp36m-win_amd64.whl", hash = "sha256:0ac45088abffbae9b7db2c597f098de51b7e3c1023cb314e55c0f7f08440cf66"}, + {file = "cymem-2.0.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:26e5d5c6958855d2fe3d5629afe85a6aae5531abaa76f4bc21b9abf9caaccdfe"}, + {file = "cymem-2.0.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:011039e12d3144ac1bf3a6b38f5722b817f0d6487c8184e88c891b360b69f533"}, + {file = "cymem-2.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f9e63e5ad4ed6ffa21fd8db1c03b05be3fea2f32e32fdace67a840ea2702c3d"}, + {file = "cymem-2.0.7-cp37-cp37m-win_amd64.whl", hash = "sha256:5ea6b027fdad0c3e9a4f1b94d28d213be08c466a60c72c633eb9db76cf30e53a"}, + {file = "cymem-2.0.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4302df5793a320c4f4a263c7785d2fa7f29928d72cb83ebeb34d64a610f8d819"}, + {file = "cymem-2.0.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:24b779046484674c054af1e779c68cb224dc9694200ac13b22129d7fb7e99e6d"}, + {file = "cymem-2.0.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c50794c612801ed8b599cd4af1ed810a0d39011711c8224f93e1153c00e08d1"}, + {file = "cymem-2.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9525ad563b36dc1e30889d0087a0daa67dd7bb7d3e1530c4b61cd65cc756a5b"}, + {file = "cymem-2.0.7-cp38-cp38-win_amd64.whl", hash = 
"sha256:48b98da6b906fe976865263e27734ebc64f972a978a999d447ad6c83334e3f90"}, + {file = "cymem-2.0.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e156788d32ad8f7141330913c5d5d2aa67182fca8f15ae22645e9f379abe8a4c"}, + {file = "cymem-2.0.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3da89464021fe669932fce1578343fcaf701e47e3206f50d320f4f21e6683ca5"}, + {file = "cymem-2.0.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f359cab9f16e25b3098f816c40acbf1697a3b614a8d02c56e6ebcb9c89a06b3"}, + {file = "cymem-2.0.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f165d7bce55d6730930e29d8294569788aa127f1be8d1642d9550ed96223cb37"}, + {file = "cymem-2.0.7-cp39-cp39-win_amd64.whl", hash = "sha256:59a09cf0e71b1b88bfa0de544b801585d81d06ea123c1725e7c5da05b7ca0d20"}, + {file = "cymem-2.0.7.tar.gz", hash = "sha256:e6034badb5dd4e10344211c81f16505a55553a7164adc314c75bd80cf07e57a8"}, +] + [[package]] name = "dataclasses-json" version = "0.5.9" @@ -703,6 +809,21 @@ files = [ [package.extras] tests = ["asttokens", "littleutils", "pytest", "rich"] +[[package]] +name = "faker" +version = "19.3.1" +description = "Faker is a Python package that generates fake data for you." +optional = true +python-versions = ">=3.8" +files = [ + {file = "Faker-19.3.1-py3-none-any.whl", hash = "sha256:e2722fdf622cf24e974aaba15a3dee97a6f8b98d869bd827ff1af9c87695af46"}, + {file = "Faker-19.3.1.tar.gz", hash = "sha256:a6624d9574623bb27dfca33fff94581cd7b23b562901db8ad59acbde9a52543e"}, +] + +[package.dependencies] +python-dateutil = ">=2.4" +typing-extensions = {version = ">=3.10.0.1", markers = "python_version <= \"3.8\""} + [[package]] name = "fastjsonschema" version = "2.18.0" @@ -717,6 +838,24 @@ files = [ [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] +[[package]] +name = "filelock" +version = "3.12.3" +description = "A platform independent file lock." 
+optional = true +python-versions = ">=3.8" +files = [ + {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"}, + {file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.11\""} + +[package.extras] +docs = ["furo (>=2023.7.26)", "sphinx (>=7.1.2)", "sphinx-autodoc-typehints (>=1.24)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "pytest-timeout (>=2.1)"] + [[package]] name = "fqdn" version = "1.5.1" @@ -1106,7 +1245,6 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"}, - {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"}, ] [[package]] @@ -1457,6 +1595,20 @@ openai = ["openai (>=0,<1)", "tiktoken (>=0.3.2,<0.4.0)"] qdrant = ["qdrant-client (>=1.3.1,<2.0.0)"] text-helpers = ["chardet (>=5.1.0,<6.0.0)"] +[[package]] +name = "langcodes" +version = "3.3.0" +description = "Tools for labeling human languages with IETF language tags" +optional = true +python-versions = ">=3.6" +files = [ + {file = "langcodes-3.3.0-py3-none-any.whl", hash = "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69"}, + {file = "langcodes-3.3.0.tar.gz", hash = "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"}, +] + +[package.extras] +data = ["language-data (>=1.1,<2.0)"] + [[package]] name = "langsmith" version = "0.0.25" @@ -1673,6 +1825,43 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "murmurhash" +version = "1.0.9" +description = "Cython bindings for MurmurHash" +optional = true +python-versions = ">=3.6" +files = [ + {file = "murmurhash-1.0.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:697ed01454d92681c7ae26eb1adcdc654b54062bcc59db38ed03cad71b23d449"}, + {file = "murmurhash-1.0.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ef31b5c11be2c064dbbdd0e22ab3effa9ceb5b11ae735295c717c120087dd94"}, + {file = "murmurhash-1.0.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7a2bd203377a31bbb2d83fe3f968756d6c9bbfa36c64c6ebfc3c6494fc680bc"}, + {file = "murmurhash-1.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0eb0f8e652431ea238c11bcb671fef5c03aff0544bf7e098df81ea4b6d495405"}, + {file = "murmurhash-1.0.9-cp310-cp310-win_amd64.whl", hash = "sha256:cf0b3fe54dca598f5b18c9951e70812e070ecb4c0672ad2cc32efde8a33b3df6"}, + {file = "murmurhash-1.0.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5dc41be79ba4d09aab7e9110a8a4d4b37b184b63767b1b247411667cdb1057a3"}, + {file = "murmurhash-1.0.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c0f84ecdf37c06eda0222f2f9e81c0974e1a7659c35b755ab2fdc642ebd366db"}, + {file = "murmurhash-1.0.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:241693c1c819148eac29d7882739b1099c891f1f7431127b2652c23f81722cec"}, + {file = 
"murmurhash-1.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f5ca56c430230d3b581dfdbc54eb3ad8b0406dcc9afdd978da2e662c71d370"}, + {file = "murmurhash-1.0.9-cp311-cp311-win_amd64.whl", hash = "sha256:660ae41fc6609abc05130543011a45b33ca5d8318ae5c70e66bbd351ca936063"}, + {file = "murmurhash-1.0.9-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01137d688a6b259bde642513506b062364ea4e1609f886d9bd095c3ae6da0b94"}, + {file = "murmurhash-1.0.9-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b70bbf55d89713873a35bd4002bc231d38e530e1051d57ca5d15f96c01fd778"}, + {file = "murmurhash-1.0.9-cp36-cp36m-win_amd64.whl", hash = "sha256:3e802fa5b0e618ee99e8c114ce99fc91677f14e9de6e18b945d91323a93c84e8"}, + {file = "murmurhash-1.0.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:213d0248e586082e1cab6157d9945b846fd2b6be34357ad5ea0d03a1931d82ba"}, + {file = "murmurhash-1.0.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94b89d02aeab5e6bad5056f9d08df03ac7cfe06e61ff4b6340feb227fda80ce8"}, + {file = "murmurhash-1.0.9-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c2e2ee2d91a87952fe0f80212e86119aa1fd7681f03e6c99b279e50790dc2b3"}, + {file = "murmurhash-1.0.9-cp37-cp37m-win_amd64.whl", hash = "sha256:8c3d69fb649c77c74a55624ebf7a0df3c81629e6ea6e80048134f015da57b2ea"}, + {file = "murmurhash-1.0.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab78675510f83e7a3c6bd0abdc448a9a2b0b385b0d7ee766cbbfc5cc278a3042"}, + {file = "murmurhash-1.0.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0ac5530c250d2b0073ed058555847c8d88d2d00229e483d45658c13b32398523"}, + {file = "murmurhash-1.0.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69157e8fa6b25c4383645227069f6a1f8738d32ed2a83558961019ca3ebef56a"}, + {file = "murmurhash-1.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2aebe2ae016525a662ff772b72a2c9244a673e3215fcd49897f494258b96f3e7"}, + {file = "murmurhash-1.0.9-cp38-cp38-win_amd64.whl", hash = "sha256:a5952f9c18a717fa17579e27f57bfa619299546011a8378a8f73e14eece332f6"}, + {file = "murmurhash-1.0.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef79202feeac68e83971239169a05fa6514ecc2815ce04c8302076d267870f6e"}, + {file = "murmurhash-1.0.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:799fcbca5693ad6a40f565ae6b8e9718e5875a63deddf343825c0f31c32348fa"}, + {file = "murmurhash-1.0.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9b995bc82eaf9223e045210207b8878fdfe099a788dd8abd708d9ee58459a9d"}, + {file = "murmurhash-1.0.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b129e1c5ebd772e6ff5ef925bcce695df13169bd885337e6074b923ab6edcfc8"}, + {file = "murmurhash-1.0.9-cp39-cp39-win_amd64.whl", hash = "sha256:379bf6b414bd27dd36772dd1570565a7d69918e980457370838bd514df0d91e9"}, + {file = "murmurhash-1.0.9.tar.gz", hash = "sha256:fe7a38cb0d3d87c14ec9dddc4932ffe2dbc77d75469ab80fd5014689b0e07b58"}, +] + [[package]] name = "mypy" version = "0.991" @@ -2019,6 +2208,28 @@ files = [ {file = "pathspec-0.11.2.tar.gz", hash = "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"}, ] +[[package]] +name = "pathy" +version = "0.10.2" +description = "pathlib.Path 
subclasses for local and cloud bucket storage" +optional = true +python-versions = ">= 3.6" +files = [ + {file = "pathy-0.10.2-py3-none-any.whl", hash = "sha256:681bc98dbff28e7de3e50efa8246910f727e8ac254c4318c47ce341f7c1ce21d"}, + {file = "pathy-0.10.2.tar.gz", hash = "sha256:79c572ab7fed84dc46837346edae58565992d0477a789cd4691a41d8eab9917d"}, +] + +[package.dependencies] +smart-open = ">=5.2.1,<7.0.0" +typer = ">=0.3.0,<1.0.0" + +[package.extras] +all = ["azure-storage-blob", "boto3", "google-cloud-storage (>=1.26.0,<2.0.0)", "mock", "pytest", "pytest-coverage", "typer-cli"] +azure = ["azure-storage-blob"] +gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] +s3 = ["boto3"] +test = ["mock", "pytest", "pytest-coverage", "typer-cli"] + [[package]] name = "pexpect" version = "4.8.0" @@ -2033,6 +2244,17 @@ files = [ [package.dependencies] ptyprocess = ">=0.5" +[[package]] +name = "phonenumbers" +version = "8.13.19" +description = "Python version of Google's common library for parsing, formatting, storing and validating international phone numbers." +optional = true +python-versions = "*" +files = [ + {file = "phonenumbers-8.13.19-py2.py3-none-any.whl", hash = "sha256:ba542f20f6dc83be8f127f240f9b5b7e7c1dec42aceff1879400d4dc0c781d81"}, + {file = "phonenumbers-8.13.19.tar.gz", hash = "sha256:38180247697240ccedd74dec4bfbdbc22bb108b9c5f991f270ca3e41395e6f96"}, +] + [[package]] name = "pickleshare" version = "0.7.5" @@ -2085,6 +2307,80 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "preshed" +version = "3.0.8" +description = "Cython hash table that trusts the keys are pre-hashed" +optional = true +python-versions = ">=3.6" +files = [ + {file = "preshed-3.0.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ea4b6df8ef7af38e864235256793bc3056e9699d991afcf6256fa298858582fc"}, + {file = "preshed-3.0.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e945fc814bdc29564a2ce137c237b3a9848aa1e76a1160369b6e0d328151fdd"}, + {file = "preshed-3.0.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9a4833530fe53001c351974e0c8bb660211b8d0358e592af185fec1ae12b2d0"}, + {file = "preshed-3.0.8-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1472ee231f323b4f4368b1b5f8f08481ed43af89697d45450c6ae4af46ac08a"}, + {file = "preshed-3.0.8-cp310-cp310-win_amd64.whl", hash = "sha256:c8a2e2931eea7e500fbf8e014b69022f3fab2e35a70da882e2fc753e5e487ae3"}, + {file = "preshed-3.0.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0e1bb8701df7861af26a312225bdf7c4822ac06fcf75aeb60fe2b0a20e64c222"}, + {file = "preshed-3.0.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e9aef2b0b7687aecef48b1c6ff657d407ff24e75462877dcb888fa904c4a9c6d"}, + {file = "preshed-3.0.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:854d58a8913ebf3b193b0dc8064155b034e8987de25f26838dfeca09151fda8a"}, + {file = "preshed-3.0.8-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:135e2ac0db1a3948d6ec295598c7e182b52c394663f2fcfe36a97ae51186be21"}, + {file = "preshed-3.0.8-cp311-cp311-win_amd64.whl", hash = "sha256:019d8fa4161035811fb2804d03214143298739e162d0ad24e087bd46c50970f5"}, + {file = "preshed-3.0.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a49ce52856fbb3ef4f1cc744c53f5d7e1ca370b1939620ac2509a6d25e02a50"}, + {file = 
"preshed-3.0.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdbc2957b36115a576c515ffe963919f19d2683f3c76c9304ae88ef59f6b5ca6"}, + {file = "preshed-3.0.8-cp36-cp36m-win_amd64.whl", hash = "sha256:09cc9da2ac1b23010ce7d88a5e20f1033595e6dd80be14318e43b9409f4c7697"}, + {file = "preshed-3.0.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e19c8069f1a1450f835f23d47724530cf716d581fcafb398f534d044f806b8c2"}, + {file = "preshed-3.0.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25b5ef5e387a0e17ff41202a8c1816184ab6fb3c0d0b847bf8add0ed5941eb8d"}, + {file = "preshed-3.0.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53d3e2456a085425c66af7baba62d7eaa24aa5e460e1a9e02c401a2ed59abd7b"}, + {file = "preshed-3.0.8-cp37-cp37m-win_amd64.whl", hash = "sha256:85e98a618fb36cdcc37501d8b9b8c1246651cc2f2db3a70702832523e0ae12f4"}, + {file = "preshed-3.0.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7f8837bf616335464f3713cbf562a3dcaad22c3ca9193f957018964ef871a68b"}, + {file = "preshed-3.0.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:720593baf2c2e295f855192974799e486da5f50d4548db93c44f5726a43cefb9"}, + {file = "preshed-3.0.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0ad3d860b9ce88a74cf7414bb4b1c6fd833813e7b818e76f49272c4974b19ce"}, + {file = "preshed-3.0.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd19d48440b152657966a52e627780c0ddbe9d907b8d7ee4598505e80a3c55c7"}, + {file = "preshed-3.0.8-cp38-cp38-win_amd64.whl", hash = "sha256:246e7c6890dc7fe9b10f0e31de3346b906e3862b6ef42fcbede37968f46a73bf"}, + {file = "preshed-3.0.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:67643e66691770dc3434b01671648f481e3455209ce953727ef2330b16790aaa"}, + {file = "preshed-3.0.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0ae25a010c9f551aa2247ee621457f679e07c57fc99d3fd44f84cb40b925f12c"}, + {file = "preshed-3.0.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6a7fcf7dd2e7711051b3f0432da9ec9c748954c989f49d2cd8eabf8c2d953e"}, + {file = "preshed-3.0.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5942858170c4f53d9afc6352a86bbc72fc96cc4d8964b6415492114a5920d3ed"}, + {file = "preshed-3.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:06793022a56782ef51d74f1399925a2ba958e50c5cfbc6fa5b25c4945e158a07"}, + {file = "preshed-3.0.8.tar.gz", hash = "sha256:6c74c70078809bfddda17be96483c41d06d717934b07cab7921011d81758b357"}, +] + +[package.dependencies] +cymem = ">=2.0.2,<2.1.0" +murmurhash = ">=0.28.0,<1.1.0" + +[[package]] +name = "presidio-analyzer" +version = "2.2.33" +description = "Presidio analyzer package" +optional = true +python-versions = "*" +files = [ + {file = "presidio_analyzer-2.2.33-py3-none-any.whl", hash = "sha256:1e0d4237f9ac28953e910900b42852927dbf8935de7bf023aebddc752a5bf9ea"}, +] + +[package.dependencies] +phonenumbers = ">=8.12" +pyyaml = "*" +regex = "*" +spacy = ">=3.4.4" +tldextract = "*" + +[package.extras] +transformers = ["torch", "transformers"] + +[[package]] +name = "presidio-anonymizer" +version = "2.2.33" +description = "Persidio Anonymizer package - replaces analyzed text with desired values." 
+optional = true +python-versions = ">=3.5" +files = [ + {file = "presidio_anonymizer-2.2.33-py3-none-any.whl", hash = "sha256:d1e7feff5ff2bc0eed13425356bce19e8e5ffda1f733d5d603b282ccfbe742d0"}, +] + +[package.dependencies] +pycryptodome = ">=3.10.1" + [[package]] name = "prometheus-client" version = "0.17.1" @@ -2175,6 +2471,47 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pycryptodome" +version = "3.18.0" +description = "Cryptographic library for Python" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "pycryptodome-3.18.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:d1497a8cd4728db0e0da3c304856cb37c0c4e3d0b36fcbabcc1600f18504fc54"}, + {file = "pycryptodome-3.18.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:928078c530da78ff08e10eb6cada6e0dff386bf3d9fa9871b4bbc9fbc1efe024"}, + {file = "pycryptodome-3.18.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:157c9b5ba5e21b375f052ca78152dd309a09ed04703fd3721dce3ff8ecced148"}, + {file = "pycryptodome-3.18.0-cp27-cp27m-manylinux2014_aarch64.whl", hash = "sha256:d20082bdac9218649f6abe0b885927be25a917e29ae0502eaf2b53f1233ce0c2"}, + {file = "pycryptodome-3.18.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:e8ad74044e5f5d2456c11ed4cfd3e34b8d4898c0cb201c4038fe41458a82ea27"}, + {file = "pycryptodome-3.18.0-cp27-cp27m-win32.whl", hash = "sha256:62a1e8847fabb5213ccde38915563140a5b338f0d0a0d363f996b51e4a6165cf"}, + {file = "pycryptodome-3.18.0-cp27-cp27m-win_amd64.whl", hash = "sha256:16bfd98dbe472c263ed2821284118d899c76968db1a6665ade0c46805e6b29a4"}, + {file = "pycryptodome-3.18.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:7a3d22c8ee63de22336679e021c7f2386f7fc465477d59675caa0e5706387944"}, + {file = "pycryptodome-3.18.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:78d863476e6bad2a592645072cc489bb90320972115d8995bcfbee2f8b209918"}, + {file = "pycryptodome-3.18.0-cp27-cp27mu-manylinux2014_aarch64.whl", hash = "sha256:b6a610f8bfe67eab980d6236fdc73bfcdae23c9ed5548192bb2d530e8a92780e"}, + {file = "pycryptodome-3.18.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:422c89fd8df8a3bee09fb8d52aaa1e996120eafa565437392b781abec2a56e14"}, + {file = "pycryptodome-3.18.0-cp35-abi3-macosx_10_9_universal2.whl", hash = "sha256:9ad6f09f670c466aac94a40798e0e8d1ef2aa04589c29faa5b9b97566611d1d1"}, + {file = "pycryptodome-3.18.0-cp35-abi3-macosx_10_9_x86_64.whl", hash = "sha256:53aee6be8b9b6da25ccd9028caf17dcdce3604f2c7862f5167777b707fbfb6cb"}, + {file = "pycryptodome-3.18.0-cp35-abi3-manylinux2014_aarch64.whl", hash = "sha256:10da29526a2a927c7d64b8f34592f461d92ae55fc97981aab5bbcde8cb465bb6"}, + {file = "pycryptodome-3.18.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f21efb8438971aa16924790e1c3dba3a33164eb4000106a55baaed522c261acf"}, + {file = "pycryptodome-3.18.0-cp35-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4944defabe2ace4803f99543445c27dd1edbe86d7d4edb87b256476a91e9ffa4"}, + {file = "pycryptodome-3.18.0-cp35-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:51eae079ddb9c5f10376b4131be9589a6554f6fd84f7f655180937f611cd99a2"}, + {file = "pycryptodome-3.18.0-cp35-abi3-musllinux_1_1_i686.whl", hash = "sha256:83c75952dcf4a4cebaa850fa257d7a860644c70a7cd54262c237c9f2be26f76e"}, + {file = "pycryptodome-3.18.0-cp35-abi3-musllinux_1_1_x86_64.whl", hash = 
"sha256:957b221d062d5752716923d14e0926f47670e95fead9d240fa4d4862214b9b2f"}, + {file = "pycryptodome-3.18.0-cp35-abi3-win32.whl", hash = "sha256:795bd1e4258a2c689c0b1f13ce9684fa0dd4c0e08680dcf597cf9516ed6bc0f3"}, + {file = "pycryptodome-3.18.0-cp35-abi3-win_amd64.whl", hash = "sha256:b1d9701d10303eec8d0bd33fa54d44e67b8be74ab449052a8372f12a66f93fb9"}, + {file = "pycryptodome-3.18.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:cb1be4d5af7f355e7d41d36d8eec156ef1382a88638e8032215c215b82a4b8ec"}, + {file = "pycryptodome-3.18.0-pp27-pypy_73-win32.whl", hash = "sha256:fc0a73f4db1e31d4a6d71b672a48f3af458f548059aa05e83022d5f61aac9c08"}, + {file = "pycryptodome-3.18.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f022a4fd2a5263a5c483a2bb165f9cb27f2be06f2f477113783efe3fe2ad887b"}, + {file = "pycryptodome-3.18.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:363dd6f21f848301c2dcdeb3c8ae5f0dee2286a5e952a0f04954b82076f23825"}, + {file = "pycryptodome-3.18.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:12600268763e6fec3cefe4c2dcdf79bde08d0b6dc1813887e789e495cb9f3403"}, + {file = "pycryptodome-3.18.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4604816adebd4faf8810782f137f8426bf45fee97d8427fa8e1e49ea78a52e2c"}, + {file = "pycryptodome-3.18.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:01489bbdf709d993f3058e2996f8f40fee3f0ea4d995002e5968965fa2fe89fb"}, + {file = "pycryptodome-3.18.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3811e31e1ac3069988f7a1c9ee7331b942e605dfc0f27330a9ea5997e965efb2"}, + {file = "pycryptodome-3.18.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f4b967bb11baea9128ec88c3d02f55a3e338361f5e4934f5240afcb667fdaec"}, + {file = "pycryptodome-3.18.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:9c8eda4f260072f7dbe42f473906c659dcbadd5ae6159dfb49af4da1293ae380"}, + {file = "pycryptodome-3.18.0.tar.gz", hash = "sha256:c9adee653fc882d98956e33ca2c1fb582e23a8af7ac82fee75bd6113c55a0413"}, +] + [[package]] name = "pydantic" version = "1.10.12" @@ -2548,6 +2885,103 @@ files = [ attrs = ">=22.2.0" rpds-py = ">=0.7.0" +[[package]] +name = "regex" +version = "2023.8.8" +description = "Alternative regular expression module, to replace re." 
+optional = true +python-versions = ">=3.6" +files = [ + {file = "regex-2023.8.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:88900f521c645f784260a8d346e12a1590f79e96403971241e64c3a265c8ecdb"}, + {file = "regex-2023.8.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3611576aff55918af2697410ff0293d6071b7e00f4b09e005d614686ac4cd57c"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8a0ccc8f2698f120e9e5742f4b38dc944c38744d4bdfc427616f3a163dd9de5"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c662a4cbdd6280ee56f841f14620787215a171c4e2d1744c9528bed8f5816c96"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cf0633e4a1b667bfe0bb10b5e53fe0d5f34a6243ea2530eb342491f1adf4f739"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:551ad543fa19e94943c5b2cebc54c73353ffff08228ee5f3376bd27b3d5b9800"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54de2619f5ea58474f2ac211ceea6b615af2d7e4306220d4f3fe690c91988a61"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5ec4b3f0aebbbe2fc0134ee30a791af522a92ad9f164858805a77442d7d18570"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3ae646c35cb9f820491760ac62c25b6d6b496757fda2d51be429e0e7b67ae0ab"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca339088839582d01654e6f83a637a4b8194d0960477b9769d2ff2cfa0fa36d2"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:d9b6627408021452dcd0d2cdf8da0534e19d93d070bfa8b6b4176f99711e7f90"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:bd3366aceedf274f765a3a4bc95d6cd97b130d1dda524d8f25225d14123c01db"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7aed90a72fc3654fba9bc4b7f851571dcc368120432ad68b226bd593f3f6c0b7"}, + {file = "regex-2023.8.8-cp310-cp310-win32.whl", hash = "sha256:80b80b889cb767cc47f31d2b2f3dec2db8126fbcd0cff31b3925b4dc6609dcdb"}, + {file = "regex-2023.8.8-cp310-cp310-win_amd64.whl", hash = "sha256:b82edc98d107cbc7357da7a5a695901b47d6eb0420e587256ba3ad24b80b7d0b"}, + {file = "regex-2023.8.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1e7d84d64c84ad97bf06f3c8cb5e48941f135ace28f450d86af6b6512f1c9a71"}, + {file = "regex-2023.8.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ce0f9fbe7d295f9922c0424a3637b88c6c472b75eafeaff6f910494a1fa719ef"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06c57e14ac723b04458df5956cfb7e2d9caa6e9d353c0b4c7d5d54fcb1325c46"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7a9aaa5a1267125eef22cef3b63484c3241aaec6f48949b366d26c7250e0357"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b7408511fca48a82a119d78a77c2f5eb1b22fe88b0d2450ed0756d194fe7a9a"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14dc6f2d88192a67d708341f3085df6a4f5a0c7b03dec08d763ca2cd86e9f559"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:48c640b99213643d141550326f34f0502fedb1798adb3c9eb79650b1ecb2f177"}, + {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0085da0f6c6393428bf0d9c08d8b1874d805bb55e17cb1dfa5ddb7cfb11140bf"}, + {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:964b16dcc10c79a4a2be9f1273fcc2684a9eedb3906439720598029a797b46e6"}, + {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7ce606c14bb195b0e5108544b540e2c5faed6843367e4ab3deb5c6aa5e681208"}, + {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:40f029d73b10fac448c73d6eb33d57b34607f40116e9f6e9f0d32e9229b147d7"}, + {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3b8e6ea6be6d64104d8e9afc34c151926f8182f84e7ac290a93925c0db004bfd"}, + {file = "regex-2023.8.8-cp311-cp311-win32.whl", hash = "sha256:942f8b1f3b223638b02df7df79140646c03938d488fbfb771824f3d05fc083a8"}, + {file = "regex-2023.8.8-cp311-cp311-win_amd64.whl", hash = "sha256:51d8ea2a3a1a8fe4f67de21b8b93757005213e8ac3917567872f2865185fa7fb"}, + {file = "regex-2023.8.8-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e951d1a8e9963ea51efd7f150450803e3b95db5939f994ad3d5edac2b6f6e2b4"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:704f63b774218207b8ccc6c47fcef5340741e5d839d11d606f70af93ee78e4d4"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:22283c769a7b01c8ac355d5be0715bf6929b6267619505e289f792b01304d898"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91129ff1bb0619bc1f4ad19485718cc623a2dc433dff95baadbf89405c7f6b57"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de35342190deb7b866ad6ba5cbcccb2d22c0487ee0cbb251efef0843d705f0d4"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b993b6f524d1e274a5062488a43e3f9f8764ee9745ccd8e8193df743dbe5ee61"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3026cbcf11d79095a32d9a13bbc572a458727bd5b1ca332df4a79faecd45281c"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:293352710172239bf579c90a9864d0df57340b6fd21272345222fb6371bf82b3"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:d909b5a3fff619dc7e48b6b1bedc2f30ec43033ba7af32f936c10839e81b9217"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:3d370ff652323c5307d9c8e4c62efd1956fb08051b0e9210212bc51168b4ff56"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:b076da1ed19dc37788f6a934c60adf97bd02c7eea461b73730513921a85d4235"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e9941a4ada58f6218694f382e43fdd256e97615db9da135e77359da257a7168b"}, + {file = "regex-2023.8.8-cp36-cp36m-win32.whl", hash = "sha256:a8c65c17aed7e15a0c824cdc63a6b104dfc530f6fa8cb6ac51c437af52b481c7"}, + {file = "regex-2023.8.8-cp36-cp36m-win_amd64.whl", hash = "sha256:aadf28046e77a72f30dcc1ab185639e8de7f4104b8cb5c6dfa5d8ed860e57236"}, + {file = "regex-2023.8.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:423adfa872b4908843ac3e7a30f957f5d5282944b81ca0a3b8a7ccbbfaa06103"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:4ae594c66f4a7e1ea67232a0846649a7c94c188d6c071ac0210c3e86a5f92109"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e51c80c168074faa793685656c38eb7a06cbad7774c8cbc3ea05552d615393d8"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:09b7f4c66aa9d1522b06e31a54f15581c37286237208df1345108fcf4e050c18"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e73e5243af12d9cd6a9d6a45a43570dbe2e5b1cdfc862f5ae2b031e44dd95a8"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:941460db8fe3bd613db52f05259c9336f5a47ccae7d7def44cc277184030a116"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f0ccf3e01afeb412a1a9993049cb160d0352dba635bbca7762b2dc722aa5742a"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:2e9216e0d2cdce7dbc9be48cb3eacb962740a09b011a116fd7af8c832ab116ca"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:5cd9cd7170459b9223c5e592ac036e0704bee765706445c353d96f2890e816c8"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4873ef92e03a4309b3ccd8281454801b291b689f6ad45ef8c3658b6fa761d7ac"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:239c3c2a339d3b3ddd51c2daef10874410917cd2b998f043c13e2084cb191684"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1005c60ed7037be0d9dea1f9c53cc42f836188227366370867222bda4c3c6bd7"}, + {file = "regex-2023.8.8-cp37-cp37m-win32.whl", hash = "sha256:e6bd1e9b95bc5614a7a9c9c44fde9539cba1c823b43a9f7bc11266446dd568e3"}, + {file = "regex-2023.8.8-cp37-cp37m-win_amd64.whl", hash = "sha256:9a96edd79661e93327cfeac4edec72a4046e14550a1d22aa0dd2e3ca52aec921"}, + {file = "regex-2023.8.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675"}, + {file = "regex-2023.8.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a2ad5add903eb7cdde2b7c64aaca405f3957ab34f16594d2b78d53b8b1a6a7d6"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9233ac249b354c54146e392e8a451e465dd2d967fc773690811d3a8c240ac601"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:920974009fb37b20d32afcdf0227a2e707eb83fe418713f7a8b7de038b870d0b"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd2b6c5dfe0929b6c23dde9624483380b170b6e34ed79054ad131b20203a1a63"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96979d753b1dc3b2169003e1854dc67bfc86edf93c01e84757927f810b8c3c93"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ae54a338191e1356253e7883d9d19f8679b6143703086245fb14d1f20196be9"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2162ae2eb8b079622176a81b65d486ba50b888271302190870b8cc488587d280"}, + {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c884d1a59e69e03b93cf0dfee8794c63d7de0ee8f7ffb76e5f75be8131b6400a"}, + {file = 
"regex-2023.8.8-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf9273e96f3ee2ac89ffcb17627a78f78e7516b08f94dc435844ae72576a276e"}, + {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:83215147121e15d5f3a45d99abeed9cf1fe16869d5c233b08c56cdf75f43a504"}, + {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:3f7454aa427b8ab9101f3787eb178057c5250478e39b99540cfc2b889c7d0586"}, + {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0640913d2c1044d97e30d7c41728195fc37e54d190c5385eacb52115127b882"}, + {file = "regex-2023.8.8-cp38-cp38-win32.whl", hash = "sha256:0c59122ceccb905a941fb23b087b8eafc5290bf983ebcb14d2301febcbe199c7"}, + {file = "regex-2023.8.8-cp38-cp38-win_amd64.whl", hash = "sha256:c12f6f67495ea05c3d542d119d270007090bad5b843f642d418eb601ec0fa7be"}, + {file = "regex-2023.8.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:82cd0a69cd28f6cc3789cc6adeb1027f79526b1ab50b1f6062bbc3a0ccb2dbc3"}, + {file = "regex-2023.8.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bb34d1605f96a245fc39790a117ac1bac8de84ab7691637b26ab2c5efb8f228c"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:987b9ac04d0b38ef4f89fbc035e84a7efad9cdd5f1e29024f9289182c8d99e09"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dd6082f4e2aec9b6a0927202c85bc1b09dcab113f97265127c1dc20e2e32495"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7eb95fe8222932c10d4436e7a6f7c99991e3fdd9f36c949eff16a69246dee2dc"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7098c524ba9f20717a56a8d551d2ed491ea89cbf37e540759ed3b776a4f8d6eb"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b694430b3f00eb02c594ff5a16db30e054c1b9589a043fe9174584c6efa8033"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b2aeab3895d778155054abea5238d0eb9a72e9242bd4b43f42fd911ef9a13470"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:988631b9d78b546e284478c2ec15c8a85960e262e247b35ca5eaf7ee22f6050a"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:67ecd894e56a0c6108ec5ab1d8fa8418ec0cff45844a855966b875d1039a2e34"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:14898830f0a0eb67cae2bbbc787c1a7d6e34ecc06fbd39d3af5fe29a4468e2c9"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9691a549c19c22d26a4f3b948071e93517bdf86e41b81d8c6ac8a964bb71e5a6"}, + {file = "regex-2023.8.8-cp39-cp39-win32.whl", hash = "sha256:6ab2ed84bf0137927846b37e882745a827458689eb969028af8032b1b3dac78e"}, + {file = "regex-2023.8.8-cp39-cp39-win_amd64.whl", hash = "sha256:5543c055d8ec7801901e1193a51570643d6a6ab8751b1f7dd9af71af467538bb"}, + {file = "regex-2023.8.8.tar.gz", hash = "sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -2569,6 +3003,21 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-file" +version = 
"1.5.1" +description = "File transport adapter for Requests" +optional = true +python-versions = "*" +files = [ + {file = "requests-file-1.5.1.tar.gz", hash = "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e"}, + {file = "requests_file-1.5.1-py2.py3-none-any.whl", hash = "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"}, +] + +[package.dependencies] +requests = ">=1.0.0" +six = "*" + [[package]] name = "rfc3339-validator" version = "0.1.4" @@ -2769,6 +3218,27 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "smart-open" +version = "6.3.0" +description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" +optional = true +python-versions = ">=3.6,<4.0" +files = [ + {file = "smart_open-6.3.0-py3-none-any.whl", hash = "sha256:b4c9ae193ad6d3e7add50944b86afa0d150bd821ab8ec21edb26d9a06b66f6a8"}, + {file = "smart_open-6.3.0.tar.gz", hash = "sha256:d5238825fe9a9340645fac3d75b287c08fbb99fb2b422477de781c9f5f09e019"}, +] + +[package.extras] +all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "paramiko", "requests"] +azure = ["azure-common", "azure-core", "azure-storage-blob"] +gcs = ["google-cloud-storage (>=2.6.0)"] +http = ["requests"] +s3 = ["boto3"] +ssh = ["paramiko"] +test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "moto[server]", "paramiko", "pytest", "pytest-rerunfailures", "requests", "responses"] +webhdfs = ["requests"] + [[package]] name = "sniffio" version = "1.3.0" @@ -2791,6 +3261,115 @@ files = [ {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"}, ] +[[package]] +name = "spacy" +version = "3.6.1" +description = "Industrial-strength Natural Language Processing (NLP) in Python" +optional = true +python-versions = ">=3.6" +files = [ + {file = "spacy-3.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2fb23b9af51ee8baeea4920d6ffc8ef85bc3ea7a6338dbf330a0626cf6ac6ea9"}, + {file = "spacy-3.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cb00bc74f59b537518a398fd066c0f7a8f029c763cc88afa1a0a59914f639e83"}, + {file = "spacy-3.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f75430fef7e18e6a4c32ca7efa3fb17020eaaa5d7ca0aeac6f663748a32888d"}, + {file = "spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:479132dd3118024e97022735d6ad10d50c789f3979675a8db86e40f333fa335f"}, + {file = "spacy-3.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:385dd3e48a8bb980ec2b8a70831ab3d2d43496357bae91b486c0e99dedb991aa"}, + {file = "spacy-3.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:369c1102eadfcfe155ff1d8d540411b784fe163171e15f02e0b47e030af7c527"}, + {file = "spacy-3.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8ee28656f518e0d454dcc6840a17ec4c6141c055cda86e6b7a772ec6b55cde24"}, + {file = "spacy-3.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f426f312e945191218a3f753d7ce0068f08d27b253de0e30b9fbae81778bb90"}, + {file = "spacy-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c51ceb2e0352c99b1703ef97849c10cb27ceb58348cb76ab4734477d485035b"}, + {file = "spacy-3.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:c6b7184bac8c8f72c4e3dbfd7c82eb0541c03fbccded11412269ae906f0d16c9"}, + {file = 
"spacy-3.6.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643b69be30f092cc3215d576d9a194ee01a3da319accdc06ae5a521d83497093"}, + {file = "spacy-3.6.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17424ab01023ece5679fe5c9224241d4ba6b08069b756df77df5b0c857fa762c"}, + {file = "spacy-3.6.1-cp36-cp36m-win_amd64.whl", hash = "sha256:eb93b401f7070fb7e6be64b4d9ac5c69f6ed49c9a7c13532481b425a9ee5d980"}, + {file = "spacy-3.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:46c27249590a0227d33ad33871e99820c2e9890b59f970a37f8f95f4520ca2eb"}, + {file = "spacy-3.6.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590886ca51ad4509100eeae233d22086e3736ab3ff54bf588f356a0862cdb735"}, + {file = "spacy-3.6.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca97c6052e098f00c0bed89dfa7c0d9a7ea24667d67854baa7dba53c61c8c6f0"}, + {file = "spacy-3.6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:13554a7bda6f9b148f54f3df0870b487c590921eaff0d7ce1a8be15b70e77a92"}, + {file = "spacy-3.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a110dc5bbc5b37176168bb24064f7e49b9f29f5a4857f09114e5953c3754b311"}, + {file = "spacy-3.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3abd2b82dd483c13aeb10720f52416523415ac0af84106f0c1eaae29240fe709"}, + {file = "spacy-3.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77ac5d89d909b30e64873caa93399aa5a1e72b363ae291e297c83a07db6b646f"}, + {file = "spacy-3.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3de915f5419ad28d8d1c614c77172ce05b0b59a7c57854f098b7f2da98e28f40"}, + {file = "spacy-3.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:738d806851760c2917e20046332af1ccbef78ff43eaebb23914f4d90ed060539"}, + {file = "spacy-3.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4b5350ad1b70fb9b9e17be220dd866c6b91a950a45cfe6ce524041ef52593621"}, + {file = "spacy-3.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3b797eedaf29b8726e5fb81e4b839b1734a07c835243a2d59a28cc974d2a9067"}, + {file = "spacy-3.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7762c1944cdacc0d04f5c781c79cc7beb1caa6cbc2b74687a997775f0846cec1"}, + {file = "spacy-3.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3fdee99625ee3c11537182598c81a17d4d4521c73b59e6c1d0ad6749c6654f16"}, + {file = "spacy-3.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:c9d112681d3666a75b07dea8c65a0b3f46ebebb9b90fda568089254134f0d28b"}, + {file = "spacy-3.6.1.tar.gz", hash = "sha256:6323a98706ae2d5561694b03a8b0b5751887a002903a4894e68aeb29cc672166"}, +] + +[package.dependencies] +catalogue = ">=2.0.6,<2.1.0" +cymem = ">=2.0.2,<2.1.0" +jinja2 = "*" +langcodes = ">=3.2.0,<4.0.0" +murmurhash = ">=0.28.0,<1.1.0" +numpy = ">=1.15.0" +packaging = ">=20.0" +pathy = ">=0.10.0" +preshed = ">=3.0.2,<3.1.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +requests = ">=2.13.0,<3.0.0" +setuptools = "*" +smart-open = ">=5.2.1,<7.0.0" +spacy-legacy = ">=3.0.11,<3.1.0" +spacy-loggers = ">=1.0.0,<2.0.0" +srsly = ">=2.4.3,<3.0.0" +thinc = ">=8.1.8,<8.2.0" +tqdm = ">=4.38.0,<5.0.0" +typer = ">=0.3.0,<0.10.0" +wasabi = ">=0.9.1,<1.2.0" + +[package.extras] +apple = ["thinc-apple-ops (>=0.1.0.dev0,<1.0.0)"] +cuda = ["cupy (>=5.0.0b4,<13.0.0)"] +cuda-autodetect = ["cupy-wheel (>=11.0.0,<13.0.0)"] +cuda100 = ["cupy-cuda100 (>=5.0.0b4,<13.0.0)"] +cuda101 = ["cupy-cuda101 (>=5.0.0b4,<13.0.0)"] +cuda102 = ["cupy-cuda102 
(>=5.0.0b4,<13.0.0)"] +cuda110 = ["cupy-cuda110 (>=5.0.0b4,<13.0.0)"] +cuda111 = ["cupy-cuda111 (>=5.0.0b4,<13.0.0)"] +cuda112 = ["cupy-cuda112 (>=5.0.0b4,<13.0.0)"] +cuda113 = ["cupy-cuda113 (>=5.0.0b4,<13.0.0)"] +cuda114 = ["cupy-cuda114 (>=5.0.0b4,<13.0.0)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4,<13.0.0)"] +cuda116 = ["cupy-cuda116 (>=5.0.0b4,<13.0.0)"] +cuda117 = ["cupy-cuda117 (>=5.0.0b4,<13.0.0)"] +cuda11x = ["cupy-cuda11x (>=11.0.0,<13.0.0)"] +cuda12x = ["cupy-cuda12x (>=11.5.0,<13.0.0)"] +cuda80 = ["cupy-cuda80 (>=5.0.0b4,<13.0.0)"] +cuda90 = ["cupy-cuda90 (>=5.0.0b4,<13.0.0)"] +cuda91 = ["cupy-cuda91 (>=5.0.0b4,<13.0.0)"] +cuda92 = ["cupy-cuda92 (>=5.0.0b4,<13.0.0)"] +ja = ["sudachidict-core (>=20211220)", "sudachipy (>=0.5.2,!=0.6.1)"] +ko = ["natto-py (>=0.9.0)"] +lookups = ["spacy-lookups-data (>=1.0.3,<1.1.0)"] +ray = ["spacy-ray (>=0.1.0,<1.0.0)"] +th = ["pythainlp (>=2.0)"] +transformers = ["spacy-transformers (>=1.1.2,<1.3.0)"] + +[[package]] +name = "spacy-legacy" +version = "3.0.12" +description = "Legacy registered functions for spaCy backwards compatibility" +optional = true +python-versions = ">=3.6" +files = [ + {file = "spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774"}, + {file = "spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f"}, +] + +[[package]] +name = "spacy-loggers" +version = "1.0.4" +description = "Logging utilities for SpaCy" +optional = true +python-versions = ">=3.6" +files = [ + {file = "spacy-loggers-1.0.4.tar.gz", hash = "sha256:e6f983bf71230091d5bb7b11bf64bd54415eca839108d5f83d9155d0ba93bf28"}, + {file = "spacy_loggers-1.0.4-py3-none-any.whl", hash = "sha256:e050bf2e63208b2f096b777e494971c962ad7c1dc997641c8f95c622550044ae"}, +] + [[package]] name = "sqlalchemy" version = "2.0.20" @@ -2869,6 +3448,46 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] pymysql = ["pymysql"] sqlcipher = ["sqlcipher3-binary"] +[[package]] +name = "srsly" +version = "2.4.7" +description = "Modern high-performance serialization utilities for Python" +optional = true +python-versions = ">=3.6" +files = [ + {file = "srsly-2.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:38506074cfac43f5581b6b22c335dc4d43ef9a82cbe9fe2557452e149d4540f5"}, + {file = "srsly-2.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:efd401ac0b239f3c7c0070fcd613f10a4a01478ff5fe7fc8527ea7a23dfa3709"}, + {file = "srsly-2.4.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd1be19502fda87108c8055bce6537ec332266057f595133623a4a18e56a91a1"}, + {file = "srsly-2.4.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87e86be5fd655ed554e4bf6b63a4eb3380ffb40752d0621323a3df879d3e6407"}, + {file = "srsly-2.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:7be5def9b6ac7896ce326997498b8155b9167ddc672fb209a200090c7fe45a4b"}, + {file = "srsly-2.4.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bb3d54563e33816d33695b58f9daaea410fcd0b9272aba27050410a5279ba8d8"}, + {file = "srsly-2.4.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2848735a9fcb0ad9ec23a6986466de7942280a01dbcb7b66583288f1378afba1"}, + {file = "srsly-2.4.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:282d59a37c271603dd790ab25fa6521c3d3fdbca67bef3ee838fd664c773ea0d"}, + {file = "srsly-2.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7affecb281db0683fe78181d644f6d6a061948fa318884c5669a064b97869f54"}, + {file = "srsly-2.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:76d991167dc83f8684fb366a092a03f51f7582741885ba42444ab577e61ae198"}, + {file = "srsly-2.4.7-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7a7278470bbad3831c9d8abd7f7b9fa9a3d6cd29f797f913f7a04ade5668715"}, + {file = "srsly-2.4.7-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:654496a07fcf11ba823e9a16f263001271f04d8b1bfd8d94ba6130a1649fc6d8"}, + {file = "srsly-2.4.7-cp36-cp36m-win_amd64.whl", hash = "sha256:89e35ead948349b2a8d47600544dbf49ff737d15a899bc5a71928220daee2807"}, + {file = "srsly-2.4.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3e0f0410faf9d5dc5c58caf907a4b0b94e6dc766289e329a15ddf8adca264d1c"}, + {file = "srsly-2.4.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c3422ab7ed37438086a178e611be85b7001e0071882655fcb8dca83c4f5f57d"}, + {file = "srsly-2.4.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a81186f9c1beb0892fcef4fd6350e6ee0d2d700da5042e400ec6da65a0b52fb"}, + {file = "srsly-2.4.7-cp37-cp37m-win_amd64.whl", hash = "sha256:1fe4a9bf004174f0b73b3fc3a96d35811c218e0441f4246ac4cb3f06daf0ca12"}, + {file = "srsly-2.4.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:86501eb25c6615d934bde0aea98d705ce7edd11d070536162bd2fa8606034f0f"}, + {file = "srsly-2.4.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f46bc563a7b80f81aed8dd12f86ef43b93852d937666f44a3d04bcdaa630376c"}, + {file = "srsly-2.4.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e60cd20f08b8a0e200017c6e8f5af51321878b17bf7da284dd81c7604825c6e"}, + {file = "srsly-2.4.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c90953a58dfde2eeaea15749c7dddad2a508b48b17d084b491d56d5213ef2a37"}, + {file = "srsly-2.4.7-cp38-cp38-win_amd64.whl", hash = "sha256:7c9a1dc7077b4a101fd018c1c567ec735203887e016a813588557f5c4ce2de8b"}, + {file = "srsly-2.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c8ada26613f49f72baa573dbd7e911f3af88b647c3559cb6641c97ca8dd7cfe0"}, + {file = "srsly-2.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:267f6ac1b8388a4649a6e6299114ff2f6af03bafd60fc8f267e890a9becf7057"}, + {file = "srsly-2.4.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75f2777cc44ad34c5f2239d44c8cd56b0263bf19bc6c1593dcc765e2a21fc5e7"}, + {file = "srsly-2.4.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2059d447cfe5bf6692634cbfbbb2d5663f554023b0aa0ee3d348387d9ec9345a"}, + {file = "srsly-2.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:422e44d702da4420c47012d309fc56b5081ca06a500393d83114eb09d71bf1ce"}, + {file = "srsly-2.4.7.tar.gz", hash = "sha256:93c2cc4588778261ccb23dd0543b24ded81015dd8ab4ec137cd7d04965035d08"}, +] + +[package.dependencies] +catalogue = ">=2.0.3,<2.1.0" + [[package]] name = "stack-data" version = "0.6.2" @@ -2922,6 +3541,84 @@ tornado = ">=6.1.0" docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"] +[[package]] +name = "thinc" +version = "8.1.12" +description = "A refreshing functional take on deep learning, compatible with your favorite libraries" +optional = true +python-versions = ">=3.6" +files = [ + {file = "thinc-8.1.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:efda431bc1513e81e457dbff4ef1610592569ddc362f8df24422628b195d51f4"}, + {file = 
"thinc-8.1.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01dbe9063171c1d0df29374a3857ee500fb8acf8f33bd8a85d11214d7453ff7a"}, + {file = "thinc-8.1.12-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fcfe97b80aa02a6cdeef9f5e3127822a13497a9b6f58653da4ff3caf321e3c4"}, + {file = "thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c52d0657c61b7e1a382cb5ee1ee71692a0e9c47bef9f3e02ac3492b26056d27"}, + {file = "thinc-8.1.12-cp310-cp310-win_amd64.whl", hash = "sha256:b2078018c8bc36540b0c007cb1909f6c81c9a973b3180d15b934414f08988b28"}, + {file = "thinc-8.1.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:340171c1927592082c79509e5a964766e2d65c2e30c5e583489488935a9a2340"}, + {file = "thinc-8.1.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:88e8c9cd5119d5dbb0c4ed1bdde5acd6cf12fe1b3316647ecbd79fb12e3ef542"}, + {file = "thinc-8.1.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15c6cb31138814599426bd8855b9fc9d8d8ddb2bde1c91d204353b5e5af15deb"}, + {file = "thinc-8.1.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5dc3117db83ec0d423480b6c77de90f658dfaed5f7a2bbc3d640f1f6c7ff0fe7"}, + {file = "thinc-8.1.12-cp311-cp311-win_amd64.whl", hash = "sha256:f9ac43fd02e952c005753f85bd375c03baea5fa818a6a4942930177c31130eca"}, + {file = "thinc-8.1.12-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4241d0b8c9e813a1fbba05b6dc7d7056c0a2601b8a1119d372e85185068009e6"}, + {file = "thinc-8.1.12-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c141e42e610605a9c6def19e5dbb4877353839a610e3cdb1fa68e70f6b39492a"}, + {file = "thinc-8.1.12-cp36-cp36m-win_amd64.whl", hash = "sha256:9388c1427b4c3615967e1be19fa93427be61241392bdd5a84ab1da0f96c6bcfb"}, + {file = "thinc-8.1.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:f6fb12692fae1a056432800f94ec88fa714eb1111aff9eabd61d2dfe10beb713"}, + {file = "thinc-8.1.12-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e51c693d477e02eab164a67b588fcdbb3609bc54ec39de6084da2dd9a356b8f8"}, + {file = "thinc-8.1.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4265f902f9a597be294765479ef6535d679e497fa2fed955cbcabcfdd82f81ad"}, + {file = "thinc-8.1.12-cp37-cp37m-win_amd64.whl", hash = "sha256:4586d6709f3811db85e192fdf519620b3326d28e5f0193cef8544b057e20a951"}, + {file = "thinc-8.1.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e10a648872e9ebbe115fa5fba0d515e8226bd0e2de0abd41d55f1ae04017813c"}, + {file = "thinc-8.1.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:27231eb1d468e7eb97f255c3d1e985d5a0cb8e309e0ec01b29cce2de836b8db2"}, + {file = "thinc-8.1.12-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8ece3880ac05d6bb75ecdbd9c03298e6f9691e5cb7480c1f15e66e33fe34004"}, + {file = "thinc-8.1.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:285f1141ecd7a9b61e2fed58b609c194b40e6ae5daf1e1e8dec31616bc9ffca1"}, + {file = "thinc-8.1.12-cp38-cp38-win_amd64.whl", hash = "sha256:0400632aa235cfbbc0004014e90cdf54cd42333aa7f5e971ffe87c8125e607ed"}, + {file = "thinc-8.1.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2edb3ef3a02f966eae8c5c56feb80ad5b6e5c221c94fcd95eb413d09d0d82212"}, + {file = "thinc-8.1.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e078d3b00e51c597f3f301d3e2925d0842d0725f251ff9a53a1e1b4110d4b9c1"}, + {file = 
"thinc-8.1.12-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d0ac2f6a0b38ddb913f9b31d8c4b13b98a7f5f62db211e0d8ebefbda5138757"}, + {file = "thinc-8.1.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47cde897cf54bc731a3a7c2e51a6ef01a86687ab7ae90ab0e9fc5d2294fe0fba"}, + {file = "thinc-8.1.12-cp39-cp39-win_amd64.whl", hash = "sha256:1b846c35a24b5b33e5d240f514f3a9e8bac2b6a10491caa147753dc50740a400"}, + {file = "thinc-8.1.12.tar.gz", hash = "sha256:9dd12c5c79b176f077ce9416b49c9752782bd76ff0ea649d66527882e83ea353"}, +] + +[package.dependencies] +blis = ">=0.7.8,<0.8.0" +catalogue = ">=2.0.4,<2.1.0" +confection = ">=0.0.1,<1.0.0" +cymem = ">=2.0.2,<2.1.0" +murmurhash = ">=1.0.2,<1.1.0" +numpy = [ + {version = ">=1.15.0", markers = "python_version < \"3.9\""}, + {version = ">=1.19.0", markers = "python_version >= \"3.9\""}, +] +packaging = ">=20.0" +preshed = ">=3.0.2,<3.1.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +setuptools = "*" +srsly = ">=2.4.0,<3.0.0" +wasabi = ">=0.8.1,<1.2.0" + +[package.extras] +cuda = ["cupy (>=5.0.0b4)"] +cuda-autodetect = ["cupy-wheel (>=11.0.0)"] +cuda100 = ["cupy-cuda100 (>=5.0.0b4)"] +cuda101 = ["cupy-cuda101 (>=5.0.0b4)"] +cuda102 = ["cupy-cuda102 (>=5.0.0b4)"] +cuda110 = ["cupy-cuda110 (>=5.0.0b4)"] +cuda111 = ["cupy-cuda111 (>=5.0.0b4)"] +cuda112 = ["cupy-cuda112 (>=5.0.0b4)"] +cuda113 = ["cupy-cuda113 (>=5.0.0b4)"] +cuda114 = ["cupy-cuda114 (>=5.0.0b4)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4)"] +cuda116 = ["cupy-cuda116 (>=5.0.0b4)"] +cuda117 = ["cupy-cuda117 (>=5.0.0b4)"] +cuda11x = ["cupy-cuda11x (>=11.0.0)"] +cuda80 = ["cupy-cuda80 (>=5.0.0b4)"] +cuda90 = ["cupy-cuda90 (>=5.0.0b4)"] +cuda91 = ["cupy-cuda91 (>=5.0.0b4)"] +cuda92 = ["cupy-cuda92 (>=5.0.0b4)"] +datasets = ["ml-datasets (>=0.2.0,<0.3.0)"] +mxnet = ["mxnet (>=1.5.1,<1.6.0)"] +tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] +torch = ["torch (>=1.6.0)"] + [[package]] name = "tinycss2" version = "1.2.1" @@ -2940,6 +3637,23 @@ webencodings = ">=0.4" doc = ["sphinx", "sphinx_rtd_theme"] test = ["flake8", "isort", "pytest"] +[[package]] +name = "tldextract" +version = "3.4.4" +description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well." 
+optional = true +python-versions = ">=3.7" +files = [ + {file = "tldextract-3.4.4-py3-none-any.whl", hash = "sha256:581e7dbefc90e7bb857bb6f768d25c811a3c5f0892ed56a9a2999ddb7b1b70c2"}, + {file = "tldextract-3.4.4.tar.gz", hash = "sha256:5fe3210c577463545191d45ad522d3d5e78d55218ce97215e82004dcae1e1234"}, +] + +[package.dependencies] +filelock = ">=3.0.8" +idna = "*" +requests = ">=2.1.0" +requests-file = ">=1.4" + [[package]] name = "tomli" version = "2.0.1" @@ -2971,6 +3685,26 @@ files = [ {file = "tornado-6.3.3.tar.gz", hash = "sha256:e7d8db41c0181c80d76c982aacc442c0783a2c54d6400fe028954201a2e032fe"}, ] +[[package]] +name = "tqdm" +version = "4.66.1" +description = "Fast, Extensible Progress Meter" +optional = true +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, + {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "traitlets" version = "5.9.0" @@ -2986,6 +3720,27 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=2.0)", "pre-commit", "pytest", "pytest-mock"] +[[package]] +name = "typer" +version = "0.9.0" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." +optional = true +python-versions = ">=3.6" +files = [ + {file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"}, + {file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"}, +] + +[package.dependencies] +click = ">=7.1.1,<9.0.0" +typing-extensions = ">=3.7.4.3" + +[package.extras] +all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] +doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] +test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] + [[package]] name = "types-pyyaml" version = "6.0.12.11" @@ -3054,6 +3809,20 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. 
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "wasabi" +version = "1.1.2" +description = "A lightweight console printing and formatting toolkit" +optional = true +python-versions = ">=3.6" +files = [ + {file = "wasabi-1.1.2-py3-none-any.whl", hash = "sha256:0a3f933c4bf0ed3f93071132c1b87549733256d6c8de6473c5f7ed2e171b5cf9"}, + {file = "wasabi-1.1.2.tar.gz", hash = "sha256:1aaef3aceaa32edb9c91330d29d3936c0c39fdb965743549c173cb54b16c30b5"}, +] + +[package.dependencies] +colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\" and python_version >= \"3.7\""} + [[package]] name = "wcwidth" version = "0.2.6" @@ -3220,7 +3989,10 @@ files = [ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] +[extras] +extended-testing = ["faker", "presidio-analyzer", "presidio-anonymizer"] + [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "bd737027e0fd9ea2ee823632f89dbd947c7d5f41bb05fc1cbff04106ae3dd350" +content-hash = "66ac482bd05eb74414210ac28fc1e8dae1a9928a4a1314e1326fada3551aa8ad" diff --git a/libs/experimental/pyproject.toml b/libs/experimental/pyproject.toml index b9fb69c60ae..42893244555 100644 --- a/libs/experimental/pyproject.toml +++ b/libs/experimental/pyproject.toml @@ -11,6 +11,9 @@ repository = "https://github.com/langchain-ai/langchain" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" langchain = ">=0.0.239" +presidio-anonymizer = {version = "^2.2.33", optional = true} +presidio-analyzer = {version = "^2.2.33", optional = true} +faker = {version = "^19.3.1", optional = true} [tool.poetry.group.lint.dependencies] @@ -31,6 +34,16 @@ setuptools = "^67.6.1" # Any dependencies that do not meet that criteria will be removed. pytest = "^7.3.0" +# An extra used to be able to add extended testing. +# Please use new-line on formatting to make it easier to add new packages without +# merge-conflicts +[tool.poetry.extras] +extended_testing = [ + "presidio-anonymizer", + "presidio-analyzer", + "faker", +] + [tool.ruff] select = [ "E", # pycodestyle diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py new file mode 100644 index 00000000000..138b60eca89 --- /dev/null +++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py @@ -0,0 +1,84 @@ +from typing import Iterator, List + +import pytest + + +@pytest.fixture(scope="module", autouse=True) +def check_spacy_model() -> Iterator[None]: + import spacy + + if not spacy.util.is_package("en_core_web_lg"): + pytest.skip(reason="Spacy model 'en_core_web_lg' not installed") + yield + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +@pytest.mark.parametrize( + "analyzed_fields,should_contain", + [(["PERSON"], False), (["PHONE_NUMBER"], True), (None, False)], +) +def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None: + """Test anonymizing a name in a simple sentence""" + from langchain_experimental.data_anonymizer import PresidioAnonymizer + + text = "Hello, my name is John Doe." 
+ anonymizer = PresidioAnonymizer(analyzed_fields=analyzed_fields) + anonymized_text = anonymizer.anonymize(text) + assert ("John Doe" in anonymized_text) == should_contain + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_anonymize_multiple() -> None: + """Test anonymizing multiple items in a sentence""" + from langchain_experimental.data_anonymizer import PresidioAnonymizer + + text = "John Smith's phone number is 313-666-7440 and email is johnsmith@gmail.com" + anonymizer = PresidioAnonymizer() + anonymized_text = anonymizer.anonymize(text) + for phrase in ["John Smith", "313-666-7440", "johnsmith@gmail.com"]: + assert phrase not in anonymized_text + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_anonymize_with_custom_operator() -> None: + """Test anonymize a name with a custom operator""" + from presidio_anonymizer.entities import OperatorConfig + + from langchain_experimental.data_anonymizer import PresidioAnonymizer + + custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": ""})} + anonymizer = PresidioAnonymizer(operators=custom_operator) + + text = "Jane Doe was here." + + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == " was here." + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_add_recognizer_operator() -> None: + """ + Test add recognizer and anonymize a new type of entity and with a custom operator + """ + from presidio_analyzer import PatternRecognizer + from presidio_anonymizer.entities import OperatorConfig + + from langchain_experimental.data_anonymizer import PresidioAnonymizer + + anonymizer = PresidioAnonymizer(analyzed_fields=[]) + titles_list = ["Sir", "Madam", "Professor"] + custom_recognizer = PatternRecognizer( + supported_entity="TITLE", deny_list=titles_list + ) + anonymizer.add_recognizer(custom_recognizer) + + # anonymizing with custom recognizer + text = "Madam Jane Doe was here." + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == " Jane Doe was here." + + # anonymizing with custom recognizer and operator + custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})} + anonymizer.add_operators(custom_operator) + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == "Dear Jane Doe was here." 
From 781f274d19013875ed1b24020b099b83ebf4f8b3 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Wed, 30 Aug 2023 10:49:20 -0700 Subject: [PATCH 17/30] make privacy guide section (#10003) --- docs/extras/guides/privacy/_category_.yml | 1 + .../privacy/presidio_data_anonymization.ipynb} | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) create mode 100644 docs/extras/guides/privacy/_category_.yml rename docs/extras/{use_cases/data_anonymization.ipynb => guides/privacy/presidio_data_anonymization.ipynb} (98%) diff --git a/docs/extras/guides/privacy/_category_.yml b/docs/extras/guides/privacy/_category_.yml new file mode 100644 index 00000000000..3459827572e --- /dev/null +++ b/docs/extras/guides/privacy/_category_.yml @@ -0,0 +1 @@ +label: 'Privacy' diff --git a/docs/extras/use_cases/data_anonymization.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization.ipynb similarity index 98% rename from docs/extras/use_cases/data_anonymization.ipynb rename to docs/extras/guides/privacy/presidio_data_anonymization.ipynb index 4955406cf35..7bb0b159342 100644 --- a/docs/extras/use_cases/data_anonymization.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization.ipynb @@ -4,9 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Data anonymization\n", + "# Data anonymization with Microsoft Presidio\n", "\n", - "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/data_anonymization.ipynb)\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization.ipynb)\n", "\n", "## Use case\n", "\n", @@ -477,7 +477,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.9.1" } }, "nbformat": 4, From 2f03e71e67b023b9a37dd27c7fee5054ed116bf2 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Wed, 30 Aug 2023 10:52:46 -0700 Subject: [PATCH 18/30] rename local llm guide (#10004) --- docs/extras/guides/local_llms.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/extras/guides/local_llms.ipynb b/docs/extras/guides/local_llms.ipynb index 5f399df3567..b9f3843296e 100644 --- a/docs/extras/guides/local_llms.ipynb +++ b/docs/extras/guides/local_llms.ipynb @@ -5,7 +5,7 @@ "id": "b8982428", "metadata": {}, "source": [ - "# Private, local, open source LLMs\n", + "# Run LLMs locally\n", "\n", "## Use case\n", "\n", @@ -799,7 +799,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.1" } }, "nbformat": 4, From 7fa82900cb15d9c41099ad7dbb8aaa66941f6905 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Wed, 30 Aug 2023 11:07:42 -0700 Subject: [PATCH 19/30] guides docs nits (#10005) --- docs/docs_skeleton/docs/guides/evaluation/index.mdx | 4 ---- docs/extras/guides/pydantic_compatibility.md | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/docs_skeleton/docs/guides/evaluation/index.mdx b/docs/docs_skeleton/docs/guides/evaluation/index.mdx index e2d04b12284..d5cadc50fcb 100644 --- a/docs/docs_skeleton/docs/guides/evaluation/index.mdx +++ b/docs/docs_skeleton/docs/guides/evaluation/index.mdx @@ -1,7 +1,3 @@ ---- 
-sidebar_position: 6 ---- - import DocCardList from "@theme/DocCardList"; # Evaluation diff --git a/docs/extras/guides/pydantic_compatibility.md b/docs/extras/guides/pydantic_compatibility.md index 1effe893bee..fb93921675e 100644 --- a/docs/extras/guides/pydantic_compatibility.md +++ b/docs/extras/guides/pydantic_compatibility.md @@ -1,10 +1,10 @@ -# Pydantic Compatibility +# Pydantic compatibility - Pydantic v2 was released in June, 2023 (https://docs.pydantic.dev/2.0/blog/pydantic-v2-final/) - v2 contains has a number of breaking changes (https://docs.pydantic.dev/2.0/migration/) - Pydantic v2 and v1 are under the same package name, so both versions cannot be installed at the same time -## LangChain Pydantic Migration Plan +## LangChain Pydantic migration plan As of `langchain>=0.0.267`, LangChain will allow users to install either Pydantic V1 or V2. * Internally LangChain will continue to [use V1](https://docs.pydantic.dev/latest/migration/#continue-using-pydantic-v1-features). From 240cc289e6b953ab8149aa6ab53a809a64c7989b Mon Sep 17 00:00:00 2001 From: Bagatur <baskaryan@gmail.com> Date: Wed, 30 Aug 2023 13:37:39 -0700 Subject: [PATCH 20/30] wip --- .../agents/agent_toolkits/openapi/spec.py | 53 ++----------------- .../langchain/chains/openai_functions/base.py | 11 ++-- .../langchain/tools/convert_to_openai.py | 45 ++++------------ libs/langchain/langchain/utils/json_schema.py | 48 +++++++++++++++++ 4 files changed, 64 insertions(+), 93 deletions(-) create mode 100644 libs/langchain/langchain/utils/json_schema.py diff --git a/libs/langchain/langchain/agents/agent_toolkits/openapi/spec.py b/libs/langchain/langchain/agents/agent_toolkits/openapi/spec.py index fa26b3c5d0e..35b104c4a39 100644 --- a/libs/langchain/langchain/agents/agent_toolkits/openapi/spec.py +++ b/libs/langchain/langchain/agents/agent_toolkits/openapi/spec.py @@ -1,56 +1,9 @@ """Quick and dirty representation for OpenAPI specs.""" from dataclasses import dataclass -from typing import Any, Dict, List, Tuple, Union +from typing import List, Tuple - -def dereference_refs(spec_obj: dict, full_spec: dict) -> Union[dict, list]: - """Try to substitute $refs. - - The goal is to get the complete docs for each endpoint in context for now. - - In the few OpenAPI specs I studied, $refs referenced models - (or in OpenAPI terms, components) and could be nested. This code most - likely misses lots of cases. - """ - - def _retrieve_ref_path(path: str, full_spec: dict) -> dict: - components = path.split("/") - if components[0] != "#": - raise RuntimeError( - "All $refs I've seen so far are uri fragments (start with hash)." - ) - out = full_spec - for component in components[1:]: - out = out[component] - return out - - def _dereference_refs( - obj: Union[dict, list], stop: bool = False - ) -> Union[dict, list]: - if stop: - return obj - obj_out: Dict[str, Any] = {} - if isinstance(obj, dict): - for k, v in obj.items(): - if k == "$ref": - # stop=True => don't dereference recursively. 
- return _dereference_refs( - _retrieve_ref_path(v, full_spec), stop=True - ) - elif isinstance(v, list): - obj_out[k] = [_dereference_refs(el) for el in v] - elif isinstance(v, dict): - obj_out[k] = _dereference_refs(v) - else: - obj_out[k] = v - return obj_out - elif isinstance(obj, list): - return [_dereference_refs(el) for el in obj] - else: - return obj - - return _dereference_refs(spec_obj) +from langchain.utils.json_schema import dereference_refs @dataclass(frozen=True) @@ -90,7 +43,7 @@ def reduce_openapi_spec(spec: dict, dereference: bool = True) -> ReducedOpenAPIS # Note: probably want to do this post-retrieval, it blows up the size of the spec. if dereference: endpoints = [ - (name, description, dereference_refs(docs, spec)) + (name, description, dereference_refs(docs, full_schema=spec)) for name, description, docs in endpoints ] diff --git a/libs/langchain/langchain/chains/openai_functions/base.py b/libs/langchain/langchain/chains/openai_functions/base.py index e023c67b7f3..84089e0cccc 100644 --- a/libs/langchain/langchain/chains/openai_functions/base.py +++ b/libs/langchain/langchain/chains/openai_functions/base.py @@ -10,6 +10,7 @@ from typing import ( Tuple, Type, Union, + cast, ) from langchain.base_language import BaseLanguageModel @@ -22,6 +23,7 @@ from langchain.output_parsers.openai_functions import ( from langchain.prompts import BasePromptTemplate from langchain.pydantic_v1 import BaseModel from langchain.schema import BaseLLMOutputParser +from langchain.utils.openai_functions import convert_pydantic_to_openai_function PYTHON_TO_JSON_TYPES = { "str": "string", @@ -148,14 +150,7 @@ def convert_to_openai_function( if isinstance(function, dict): return function elif isinstance(function, type) and issubclass(function, BaseModel): - # Mypy error: - # "type" has no attribute "schema" - schema = function.schema() # type: ignore[attr-defined] - return { - "name": schema["title"], - "description": schema["description"], - "parameters": schema, - } + return cast(Dict, convert_pydantic_to_openai_function(function)) elif callable(function): return convert_python_function_to_openai_function(function) diff --git a/libs/langchain/langchain/tools/convert_to_openai.py b/libs/langchain/langchain/tools/convert_to_openai.py index e575b024d4b..3385b0d831e 100644 --- a/libs/langchain/langchain/tools/convert_to_openai.py +++ b/libs/langchain/langchain/tools/convert_to_openai.py @@ -1,41 +1,21 @@ -from typing import TypedDict - -from langchain.tools import BaseTool, StructuredTool - - -class FunctionDescription(TypedDict): - """Representation of a callable function to the OpenAI API.""" - - name: str - """The name of the function.""" - description: str - """A description of the function.""" - parameters: dict - """The parameters of the function.""" +from langchain.tools import BaseTool +from langchain.utils.openai_functions import ( + FunctionDescription, + convert_pydantic_to_openai_function, +) def format_tool_to_openai_function(tool: BaseTool) -> FunctionDescription: """Format tool into the OpenAI function API.""" - if isinstance(tool, StructuredTool): - schema_ = tool.args_schema.schema() - # Bug with required missing for structured tools. 
- required = schema_.get( - "required", sorted(schema_["properties"]) # Backup is a BUG WORKAROUND + if tool.args_schema: + return convert_pydantic_to_openai_function( + tool.args_schema, name=tool.name, description=tool.description ) + else: return { "name": tool.name, "description": tool.description, "parameters": { - "type": "object", - "properties": schema_["properties"], - "required": required, - }, - } - else: - if tool.args_schema: - parameters = tool.args_schema.schema() - else: - parameters = { # This is a hack to get around the fact that some tools # do not expose an args_schema, and expect an argument # which is a string. @@ -46,10 +26,5 @@ def format_tool_to_openai_function(tool: BaseTool) -> FunctionDescription: }, "required": ["__arg1"], "type": "object", - } - - return { - "name": tool.name, - "description": tool.description, - "parameters": parameters, + }, } diff --git a/libs/langchain/langchain/utils/json_schema.py b/libs/langchain/langchain/utils/json_schema.py new file mode 100644 index 00000000000..c5feab8478d --- /dev/null +++ b/libs/langchain/langchain/utils/json_schema.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from typing import Optional, TypeVar, Union, cast + + +def _retrieve_ref(path: str, schema: dict) -> dict: + components = path.split("/") + if components[0] != "#": + raise ValueError( + "ref paths are expected to be URI fragments, meaning they should start " + "with #." + ) + out = schema + for component in components[1:]: + out = out[component] + return out + + +JSON_LIKE = TypeVar("JSON_LIKE", bound=Union[dict, list]) + + +def _dereference_refs_helper(obj: JSON_LIKE, full_schema: dict) -> JSON_LIKE: + if isinstance(obj, dict): + obj_out = {} + for k, v in obj.items(): + if k == "$ref": + ref = _retrieve_ref(v, full_schema) + obj_out[k] = _dereference_refs_helper(ref, full_schema) + elif isinstance(v, (list, dict)): + obj_out[k] = _dereference_refs_helper(v, full_schema) # type: ignore + else: + obj_out[k] = v + return cast(JSON_LIKE, obj_out) + elif isinstance(obj, list): + return cast( + JSON_LIKE, [_dereference_refs_helper(el, full_schema) for el in obj] + ) + else: + return obj + + +def dereference_refs( + schema_obj: dict, *, full_schema: Optional[dict] = None +) -> Union[dict, list]: + """Try to substitute $refs in JSON Schema.""" + + full_schema = full_schema or schema_obj + return _dereference_refs_helper(schema_obj, full_schema) From 1f5c579ef4de385c14306f8b0ec539e39a36e432 Mon Sep 17 00:00:00 2001 From: Bagatur <baskaryan@gmail.com> Date: Wed, 30 Aug 2023 13:37:50 -0700 Subject: [PATCH 21/30] add --- .../langchain/utils/openai_functions.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 libs/langchain/langchain/utils/openai_functions.py diff --git a/libs/langchain/langchain/utils/openai_functions.py b/libs/langchain/langchain/utils/openai_functions.py new file mode 100644 index 00000000000..48c49541dcf --- /dev/null +++ b/libs/langchain/langchain/utils/openai_functions.py @@ -0,0 +1,29 @@ +from typing import Dict, Optional, Type, TypedDict, cast + +from langchain.pydantic_v1 import BaseModel +from langchain.utils.json_schema import dereference_refs + + +class FunctionDescription(TypedDict): + """Representation of a callable function to the OpenAI API.""" + + name: str + """The name of the function.""" + description: str + """A description of the function.""" + parameters: dict + """The parameters of the function.""" + + +def convert_pydantic_to_openai_function( + model: Type[BaseModel], + *, + name: 
Optional[str] = None, + description: Optional[str] = None +) -> FunctionDescription: + schema = cast(Dict, dereference_refs(model.schema())) + return { + "name": name or schema["title"], + "description": description or schema["description"], + "parameters": schema, + } From e805f8e26373b24431401f02ce1a4654cb2d2078 Mon Sep 17 00:00:00 2001 From: Bagatur <baskaryan@gmail.com> Date: Wed, 30 Aug 2023 15:23:02 -0700 Subject: [PATCH 22/30] add tests --- libs/langchain/langchain/utils/json_schema.py | 56 +++++-- .../langchain/utils/openai_functions.py | 5 +- .../unit_tests/utils/test_json_schema.py | 151 ++++++++++++++++++ .../unit_tests/utils/test_openai_functions.py | 79 +++++++++ 4 files changed, 273 insertions(+), 18 deletions(-) create mode 100644 libs/langchain/tests/unit_tests/utils/test_json_schema.py create mode 100644 libs/langchain/tests/unit_tests/utils/test_openai_functions.py diff --git a/libs/langchain/langchain/utils/json_schema.py b/libs/langchain/langchain/utils/json_schema.py index c5feab8478d..9628f9e521b 100644 --- a/libs/langchain/langchain/utils/json_schema.py +++ b/libs/langchain/langchain/utils/json_schema.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import Optional, TypeVar, Union, cast +from copy import deepcopy +from typing import Any, List, Optional, Sequence def _retrieve_ref(path: str, schema: dict) -> dict: @@ -13,36 +14,59 @@ def _retrieve_ref(path: str, schema: dict) -> dict: out = schema for component in components[1:]: out = out[component] - return out + return deepcopy(out) -JSON_LIKE = TypeVar("JSON_LIKE", bound=Union[dict, list]) - - -def _dereference_refs_helper(obj: JSON_LIKE, full_schema: dict) -> JSON_LIKE: +def _dereference_refs_helper( + obj: Any, full_schema: dict, skip_keys: Sequence[str] +) -> Any: if isinstance(obj, dict): obj_out = {} for k, v in obj.items(): - if k == "$ref": + if k in skip_keys: + obj_out[k] = v + elif k == "$ref": ref = _retrieve_ref(v, full_schema) - obj_out[k] = _dereference_refs_helper(ref, full_schema) + return _dereference_refs_helper(ref, full_schema, skip_keys) elif isinstance(v, (list, dict)): - obj_out[k] = _dereference_refs_helper(v, full_schema) # type: ignore + obj_out[k] = _dereference_refs_helper(v, full_schema, skip_keys) else: obj_out[k] = v - return cast(JSON_LIKE, obj_out) + return obj_out elif isinstance(obj, list): - return cast( - JSON_LIKE, [_dereference_refs_helper(el, full_schema) for el in obj] - ) + return [_dereference_refs_helper(el, full_schema, skip_keys) for el in obj] else: return obj +def _infer_skip_keys(obj: Any, full_schema: dict) -> List[str]: + keys = [] + if isinstance(obj, dict): + for k, v in obj.items(): + if k == "$ref": + ref = _retrieve_ref(v, full_schema) + keys.append(v.split("/")[1]) + keys += _infer_skip_keys(ref, full_schema) + elif isinstance(v, (list, dict)): + keys += _infer_skip_keys(v, full_schema) + elif isinstance(obj, list): + for el in obj: + keys += _infer_skip_keys(el, full_schema) + return keys + + def dereference_refs( - schema_obj: dict, *, full_schema: Optional[dict] = None -) -> Union[dict, list]: + schema_obj: dict, + *, + full_schema: Optional[dict] = None, + skip_keys: Optional[Sequence[str]] = None, +) -> dict: """Try to substitute $refs in JSON Schema.""" full_schema = full_schema or schema_obj - return _dereference_refs_helper(schema_obj, full_schema) + skip_keys = ( + skip_keys + if skip_keys is not None + else _infer_skip_keys(schema_obj, full_schema) + ) + return _dereference_refs_helper(schema_obj, full_schema, skip_keys) diff 
--git a/libs/langchain/langchain/utils/openai_functions.py b/libs/langchain/langchain/utils/openai_functions.py index 48c49541dcf..cfb1e76d595 100644 --- a/libs/langchain/langchain/utils/openai_functions.py +++ b/libs/langchain/langchain/utils/openai_functions.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional, Type, TypedDict, cast +from typing import Optional, Type, TypedDict from langchain.pydantic_v1 import BaseModel from langchain.utils.json_schema import dereference_refs @@ -21,7 +21,8 @@ def convert_pydantic_to_openai_function( name: Optional[str] = None, description: Optional[str] = None ) -> FunctionDescription: - schema = cast(Dict, dereference_refs(model.schema())) + schema = dereference_refs(model.schema()) + schema.pop("definitions", None) return { "name": name or schema["title"], "description": description or schema["description"], diff --git a/libs/langchain/tests/unit_tests/utils/test_json_schema.py b/libs/langchain/tests/unit_tests/utils/test_json_schema.py new file mode 100644 index 00000000000..233c4672729 --- /dev/null +++ b/libs/langchain/tests/unit_tests/utils/test_json_schema.py @@ -0,0 +1,151 @@ +import pytest + +from langchain.utils.json_schema import dereference_refs + + +def test_dereference_refs_no_refs() -> None: + schema = { + "type": "object", + "properties": { + "first_name": {"type": "string"}, + }, + } + actual = dereference_refs(schema) + assert actual == schema + + +def test_dereference_refs_one_ref() -> None: + schema = { + "type": "object", + "properties": { + "first_name": {"$ref": "#/$defs/name"}, + }, + "$defs": {"name": {"type": "string"}}, + } + expected = { + "type": "object", + "properties": { + "first_name": {"type": "string"}, + }, + "$defs": {"name": {"type": "string"}}, + } + actual = dereference_refs(schema) + assert actual == expected + + +def test_dereference_refs_multiple_refs() -> None: + schema = { + "type": "object", + "properties": { + "first_name": {"$ref": "#/$defs/name"}, + "other": {"$ref": "#/$defs/other"}, + }, + "$defs": { + "name": {"type": "string"}, + "other": {"type": "object", "properties": {"age": "int", "height": "int"}}, + }, + } + expected = { + "type": "object", + "properties": { + "first_name": {"type": "string"}, + "other": {"type": "object", "properties": {"age": "int", "height": "int"}}, + }, + "$defs": { + "name": {"type": "string"}, + "other": {"type": "object", "properties": {"age": "int", "height": "int"}}, + }, + } + actual = dereference_refs(schema) + assert actual == expected + + +def test_dereference_refs_nested_refs_skip() -> None: + schema = { + "type": "object", + "properties": { + "info": {"$ref": "#/$defs/info"}, + }, + "$defs": { + "name": {"type": "string"}, + "info": { + "type": "object", + "properties": {"age": "int", "name": {"$ref": "#/$defs/name"}}, + }, + }, + } + expected = { + "type": "object", + "properties": { + "info": { + "type": "object", + "properties": {"age": "int", "name": {"type": "string"}}, + }, + }, + "$defs": { + "name": {"type": "string"}, + "info": { + "type": "object", + "properties": {"age": "int", "name": {"$ref": "#/$defs/name"}}, + }, + }, + } + actual = dereference_refs(schema) + assert actual == expected + + +def test_dereference_refs_nested_refs_no_skip() -> None: + schema = { + "type": "object", + "properties": { + "info": {"$ref": "#/$defs/info"}, + }, + "$defs": { + "name": {"type": "string"}, + "info": { + "type": "object", + "properties": {"age": "int", "name": {"$ref": "#/$defs/name"}}, + }, + }, + } + expected = { + "type": "object", + "properties": { + 
"info": { + "type": "object", + "properties": {"age": "int", "name": {"type": "string"}}, + }, + }, + "$defs": { + "name": {"type": "string"}, + "info": { + "type": "object", + "properties": {"age": "int", "name": {"type": "string"}}, + }, + }, + } + actual = dereference_refs(schema, skip_keys=()) + assert actual == expected + + +def test_dereference_refs_missing_ref() -> None: + schema = { + "type": "object", + "properties": { + "first_name": {"$ref": "#/$defs/name"}, + }, + "$defs": {}, + } + with pytest.raises(KeyError): + dereference_refs(schema) + + +def test_dereference_refs_remote_ref() -> None: + schema = { + "type": "object", + "properties": { + "first_name": {"$ref": "https://somewhere/else/name"}, + }, + } + with pytest.raises(ValueError): + dereference_refs(schema) diff --git a/libs/langchain/tests/unit_tests/utils/test_openai_functions.py b/libs/langchain/tests/unit_tests/utils/test_openai_functions.py new file mode 100644 index 00000000000..b5a22d837b9 --- /dev/null +++ b/libs/langchain/tests/unit_tests/utils/test_openai_functions.py @@ -0,0 +1,79 @@ +from langchain.pydantic_v1 import BaseModel, Field +from langchain.utils.openai_functions import convert_pydantic_to_openai_function + + +def test_convert_pydantic_to_openai_function() -> None: + class Data(BaseModel): + """The data to return.""" + + key: str = Field(..., description="API key") + days: int = Field(default=0, description="Number of days to forecast") + + actual = convert_pydantic_to_openai_function(Data) + expected = { + "name": "Data", + "description": "The data to return.", + "parameters": { + "title": "Data", + "description": "The data to return.", + "type": "object", + "properties": { + "key": {"title": "Key", "description": "API key", "type": "string"}, + "days": { + "title": "Days", + "description": "Number of days to forecast", + "default": 0, + "type": "integer", + }, + }, + "required": ["key"], + }, + } + assert actual == expected + + +def test_convert_pydantic_to_openai_function_nested() -> None: + class Data(BaseModel): + """The data to return.""" + + key: str = Field(..., description="API key") + days: int = Field(default=0, description="Number of days to forecast") + + class Model(BaseModel): + """The model to return.""" + + data: Data + + actual = convert_pydantic_to_openai_function(Model) + expected = { + "name": "Model", + "description": "The model to return.", + "parameters": { + "title": "Model", + "description": "The model to return.", + "type": "object", + "properties": { + "data": { + "title": "Data", + "description": "The data to return.", + "type": "object", + "properties": { + "key": { + "title": "Key", + "description": "API key", + "type": "string", + }, + "days": { + "title": "Days", + "description": "Number of days to forecast", + "default": 0, + "type": "integer", + }, + }, + "required": ["key"], + } + }, + "required": ["data"], + }, + } + assert actual == expected From b82ad19ed25815f381fa59b719e21fb11c49d377 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Wed, 30 Aug 2023 15:30:22 -0700 Subject: [PATCH 23/30] Check memory address (#9971) Don't want to dup the collector but can have multiple --- libs/langchain/langchain/callbacks/manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/langchain/langchain/callbacks/manager.py b/libs/langchain/langchain/callbacks/manager.py index 2f7a7fad478..52832338fcb 100644 --- a/libs/langchain/langchain/callbacks/manager.py +++ 
b/libs/langchain/langchain/callbacks/manager.py @@ -1799,6 +1799,9 @@ def _configure( for handler in callback_manager.handlers ): callback_manager.add_handler(open_ai, True) - if run_collector_ is not None: + if run_collector_ is not None and not any( + handler is run_collector_ # direct pointer comparison + for handler in callback_manager.handlers + ): callback_manager.add_handler(run_collector_, False) return callback_manager From 5341b04d689b726d2c5da3e4e8c705c43529a2fb Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Wed, 30 Aug 2023 17:42:55 -0700 Subject: [PATCH 24/30] Update error message (#9970) in evals --- libs/langchain/langchain/evaluation/loading.py | 12 ++++++++++-- .../langchain/smith/evaluation/runner_utils.py | 6 ++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/libs/langchain/langchain/evaluation/loading.py b/libs/langchain/langchain/evaluation/loading.py index 81ffec6d797..b5601b04434 100644 --- a/libs/langchain/langchain/evaluation/loading.py +++ b/libs/langchain/langchain/evaluation/loading.py @@ -108,7 +108,6 @@ def load_evaluator( >>> from langchain.evaluation import load_evaluator, EvaluatorType >>> evaluator = load_evaluator(EvaluatorType.QA) """ - llm = llm or ChatOpenAI(model="gpt-4", temperature=0) if evaluator not in _EVALUATOR_MAP: raise ValueError( f"Unknown evaluator type: {evaluator}" @@ -116,6 +115,16 @@ def load_evaluator( ) evaluator_cls = _EVALUATOR_MAP[evaluator] if issubclass(evaluator_cls, LLMEvalChain): + try: + llm = llm or ChatOpenAI(model="gpt-4", temperature=0) + except Exception as e: + raise ValueError( + f"Evaluation with the {evaluator_cls} requires a " + "language model to function." + " Failed to create the default 'gpt-4' model." + " Please manually provide an evaluation LLM" + " or check your openai credentials." + ) from e return evaluator_cls.from_llm(llm=llm, **kwargs) else: return evaluator_cls(**kwargs) @@ -154,7 +163,6 @@ def load_evaluators( >>> evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA] >>> loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness") """ - llm = llm or ChatOpenAI(model="gpt-4", temperature=0) loaded = [] for evaluator in evaluators: _kwargs = config.get(evaluator, {}) if config else {} diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py index f20f5cea4ba..1f432053e20 100644 --- a/libs/langchain/langchain/smith/evaluation/runner_utils.py +++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py @@ -35,7 +35,6 @@ from langchain.callbacks.tracers.base import BaseTracer from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler from langchain.callbacks.tracers.langchain import LangChainTracer from langchain.chains.base import Chain -from langchain.chat_models.openai import ChatOpenAI from langchain.evaluation.loading import load_evaluator from langchain.evaluation.schema import EvaluatorType, StringEvaluator from langchain.schema import ChatResult, LLMResult @@ -493,7 +492,7 @@ def _determine_reference_key( def _construct_run_evaluator( eval_config: Union[EvaluatorType, str, EvalConfig], - eval_llm: BaseLanguageModel, + eval_llm: Optional[BaseLanguageModel], run_type: str, data_type: DataType, example_outputs: Optional[List[str]], @@ -563,7 +562,6 @@ def _load_run_evaluators( Returns: A list of run evaluators. 
""" - eval_llm = config.eval_llm or ChatOpenAI(model="gpt-4", temperature=0.0) run_evaluators = [] input_key, prediction_key, reference_key = None, None, None if ( @@ -580,7 +578,7 @@ def _load_run_evaluators( for eval_config in config.evaluators: run_evaluator = _construct_run_evaluator( eval_config, - eval_llm, + config.eval_llm, run_type, data_type, example_outputs, From f2e8399cc880b06aadce40bf6944fd42ef86ef51 Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic <bratanic.tomaz@gmail.com> Date: Thu, 31 Aug 2023 09:32:42 +0200 Subject: [PATCH 25/30] Fix link in Neo4j provider page (#10023) --- docs/extras/integrations/providers/neo4j.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extras/integrations/providers/neo4j.mdx b/docs/extras/integrations/providers/neo4j.mdx index 3753f7a83e4..369c58c73cc 100644 --- a/docs/extras/integrations/providers/neo4j.mdx +++ b/docs/extras/integrations/providers/neo4j.mdx @@ -41,4 +41,4 @@ from langchain.graphs import Neo4jGraph from langchain.chains import GraphCypherQAChain ``` -For a more detailed walkthrough of Cypher generating chain, see [this notebook](/docs/extras/use_cases/more/graph/graph_cypher_qa.html) +For a more detailed walkthrough of Cypher generating chain, see [this notebook](/docs/use_cases/more/graph/graph_cypher_qa.html) From e2e05ad89e8097a9d04f2f32598efded80f004ff Mon Sep 17 00:00:00 2001 From: Hyeokjun seo <75557859+ANTARES-KOR@users.noreply.github.com> Date: Thu, 31 Aug 2023 16:33:13 +0900 Subject: [PATCH 26/30] Fix Typo : `openai_api_key` -> `serpapi_api_key` (#10020) Fixed typo in the comments Notebook. (which says `openai_api_key` for SerpAPI) --- .../agents/agent_types/openai_multi_functions_agent.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extras/modules/agents/agent_types/openai_multi_functions_agent.ipynb b/docs/extras/modules/agents/agent_types/openai_multi_functions_agent.ipynb index 84cdad508ad..d1dc5cdb94a 100644 --- a/docs/extras/modules/agents/agent_types/openai_multi_functions_agent.ipynb +++ b/docs/extras/modules/agents/agent_types/openai_multi_functions_agent.ipynb @@ -71,7 +71,7 @@ "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n", "\n", "# Initialize the SerpAPIWrapper for search functionality\n", - "# Replace <your_api_key> in openai_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n", + "# Replace <your_api_key> in serpapi_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n", "search = SerpAPIWrapper()\n", "\n", "# Define a list of tools offered by the agent\n", From 52a3e8a261c84c83c0346283c91fcf35b168d900 Mon Sep 17 00:00:00 2001 From: skspark <118829871+skspark@users.noreply.github.com> Date: Thu, 31 Aug 2023 16:34:06 +0900 Subject: [PATCH 27/30] Add integration TCs on bing search (#8068) (#10021) ## Description Added integration TCs on bing search utility ## Issue #8068 ## Dependencies None --- .../utilities/test_bing_search.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 libs/langchain/tests/integration_tests/utilities/test_bing_search.py diff --git a/libs/langchain/tests/integration_tests/utilities/test_bing_search.py b/libs/langchain/tests/integration_tests/utilities/test_bing_search.py new file mode 100644 index 00000000000..72b2d166fd0 --- /dev/null +++ b/libs/langchain/tests/integration_tests/utilities/test_bing_search.py @@ -0,0 +1,19 @@ +"""Integration test for Bing Search API Wrapper.""" +from langchain.utilities.bing_search import BingSearchAPIWrapper + + +def test_call() -> None: + 
"""Test that call gives the correct answer.""" + search = BingSearchAPIWrapper() + output = search.run("Obama's first name") + assert "Barack Hussein Obama" in output + + +def test_results() -> None: + """Test that call gives the correct answer.""" + search = BingSearchAPIWrapper() + results = search.results("Obama's first name", num_results=5) + result_contents = "\n".join( + f"{result['title']}: {result['snippet']}" for result in results + ) + assert "Barack Hussein Obama" in result_contents From e37d51cab646f2e81d7769a83d7f991baeef25ef Mon Sep 17 00:00:00 2001 From: Cameron Vetter <CameronVetter@users.noreply.github.com> Date: Thu, 31 Aug 2023 02:35:06 -0500 Subject: [PATCH 28/30] fix scoring profile example (#10016) - Description: A change in the documentation example for Azure Cognitive Vector Search with Scoring Profile so the example works as written - Issue: #10015 - Dependencies: None - Tag maintainer: @baskaryan @ruoccofabrizio - Twitter handle: @poshporcupine --- docs/extras/integrations/vectorstores/azuresearch.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/extras/integrations/vectorstores/azuresearch.ipynb b/docs/extras/integrations/vectorstores/azuresearch.ipynb index fc9bb75b5b6..bc89d1a9859 100644 --- a/docs/extras/integrations/vectorstores/azuresearch.ipynb +++ b/docs/extras/integrations/vectorstores/azuresearch.ipynb @@ -584,7 +584,7 @@ } ], "source": [ - "res = vector_store.similarity_search(query=\"Test 1\", k=3, search_type=\"hybrid\")\n", + "res = vector_store.similarity_search(query=\"Test 1\", k=3, search_type=\"similarity\")\n", "res" ] }, From 13fef1e5d3a263644420e882fabb03a21ca4c894 Mon Sep 17 00:00:00 2001 From: Hunsmore <homdyan@163.com> Date: Thu, 31 Aug 2023 15:38:55 +0800 Subject: [PATCH 29/30] add bloomz_7b, llama-2-7b, llama-2-13b, llama-2-70b to ErnieBotChat (#10024) - Description: Add bloomz_7b, llama-2-7b, llama-2-13b, llama-2-70b to ErnieBotChat, which only supported ERNIE-Bot-turbo and ERNIE-Bot. 
- Issue: #10022, - Dependencies: no extra dependencies --------- Co-authored-by: hetianfeng <hetianfeng@meituan.com> --- libs/langchain/langchain/chat_models/ernie.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/libs/langchain/langchain/chat_models/ernie.py b/libs/langchain/langchain/chat_models/ernie.py index d3fdce5c31f..367341c11f3 100644 --- a/libs/langchain/langchain/chat_models/ernie.py +++ b/libs/langchain/langchain/chat_models/ernie.py @@ -98,12 +98,19 @@ class ErnieBotChat(BaseChatModel): def _chat(self, payload: object) -> dict: base_url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat" - if self.model_name == "ERNIE-Bot-turbo": - url = f"{base_url}/eb-instant" - elif self.model_name == "ERNIE-Bot": - url = f"{base_url}/completions" + model_paths = { + "ERNIE-Bot-turbo": "eb-instant", + "ERNIE-Bot": "completions", + "BLOOMZ-7B": "bloomz_7b1", + "Llama-2-7b-chat": "llama_2_7b", + "Llama-2-13b-chat": "llama_2_13b", + "Llama-2-70b-chat": "llama_2_70b", + } + if self.model_name in model_paths: + url = f"{base_url}/{model_paths[self.model_name]}" else: raise ValueError(f"Got unknown model_name {self.model_name}") + resp = requests.post( url, timeout=self.request_timeout, From b1644bc9add6947e2b99f94f300abf1986044772 Mon Sep 17 00:00:00 2001 From: Bagatur <baskaryan@gmail.com> Date: Thu, 31 Aug 2023 00:43:34 -0700 Subject: [PATCH 30/30] cr --- .../integrations/vectorstores/tencentvectordb.ipynb | 4 ++-- libs/langchain/langchain/vectorstores/tencentvectordb.py | 6 +++--- .../vectorstores/test_tencentvectordb.py | 8 -------- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/docs/extras/integrations/vectorstores/tencentvectordb.ipynb b/docs/extras/integrations/vectorstores/tencentvectordb.ipynb index 35fc17d4ceb..fb9bf232e35 100644 --- a/docs/extras/integrations/vectorstores/tencentvectordb.ipynb +++ b/docs/extras/integrations/vectorstores/tencentvectordb.ipynb @@ -89,7 +89,7 @@ "metadata": {}, "outputs": [], "source": [ - "vector_db = TencentVectorDB(embedding_function=embeddings, connection_params=conn_params)\n", + "vector_db = TencentVectorDB(embeddings, conn_params)\n", "\n", "vector_db.add_texts([\"Ankush went to Princeton\"])\n", "query = \"Where did Ankush go to college?\"\n", @@ -114,7 +114,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/libs/langchain/langchain/vectorstores/tencentvectordb.py b/libs/langchain/langchain/vectorstores/tencentvectordb.py index 75541f85e67..a4660c564c2 100644 --- a/libs/langchain/langchain/vectorstores/tencentvectordb.py +++ b/libs/langchain/langchain/vectorstores/tencentvectordb.py @@ -78,7 +78,7 @@ class TencentVectorDB(VectorStore): def __init__( self, - embedding_function: Embeddings, + embedding: Embeddings, connection_params: ConnectionParams, index_params: IndexParams = IndexParams(128), database_name: str = "LangChainDatabase", @@ -87,7 +87,7 @@ class TencentVectorDB(VectorStore): ): self.document = guard_import("tcvectordb.model.document") tcvectordb = guard_import("tcvectordb") - self.embedding_func = embedding_function + self.embedding_func = embedding self.index_params = index_params self.vdb_client = tcvectordb.VectorDBClient( url=connection_params.url, @@ -193,7 +193,7 @@ class TencentVectorDB(VectorStore): else: index_params.dimension = dimension vector_db = cls( - embedding_function=embedding, + embedding=embedding, connection_params=connection_params, 
index_params=index_params, database_name=database_name, diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_tencentvectordb.py b/libs/langchain/tests/integration_tests/vectorstores/test_tencentvectordb.py index 3cf2758763c..53970997c17 100644 --- a/libs/langchain/tests/integration_tests/vectorstores/test_tencentvectordb.py +++ b/libs/langchain/tests/integration_tests/vectorstores/test_tencentvectordb.py @@ -83,11 +83,3 @@ def test_tencent_vector_db_no_drop() -> None: time.sleep(3) output = docsearch.similarity_search("foo", k=10) assert len(output) == 6 - - -# if __name__ == "__main__": -# test_tencent_vector_db() -# test_tencent_vector_db_with_score() -# test_tencent_vector_db_max_marginal_relevance_search() -# test_tencent_vector_db_add_extra() -# test_tencent_vector_db_no_drop()