Compare commits


9 Commits

Author SHA1 Message Date
isaac hershenson
994bde53e3 fmt 2024-11-05 07:39:42 -08:00
isaac hershenson
df415417a1 fmt 2024-11-05 07:36:36 -08:00
isaac hershenson
85a1215217 fmt 2024-11-05 07:30:54 -08:00
isaac hershenson
1160090ce3 fix 2024-11-01 16:45:56 -07:00
isaac hershenson
cfd554bffc wip 2024-11-01 16:44:57 -07:00
Jun Yamog
830cad7bc0 core: fix CommaSeparatedListOutputParser to handle columns that may contain commas (#26365)
- **Description:**
Currently, CommaSeparatedListOutputParser can't handle strings that
contain commas inside a quoted column; it treats every comma as a
delimiter. For example, given:

"foo, foo2", "bar", "baz"

it produces 4 columns: "foo", "foo2", "bar", "baz"

when it should produce 3:

"foo, foo2", "bar", "baz"

(A short sketch of the intended behavior follows the checklist below.)

- **Dependencies:**
Adds two imports, both from the Python standard library:

import csv
from io import StringIO

- **Twitter handle:** @jkyamog

- [x] **Add tests and docs**:
  1. Added a simple unit test, test_multiple_items_with_comma.
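
As a quick illustration of the intended parsing (a minimal sketch using only the two stdlib imports listed above; the input string is illustrative):

```python
import csv
from io import StringIO

text = '"foo, foo2", "bar", "baz"'
# quotechar lets a quoted field contain the delimiter; skipinitialspace
# drops the space that follows each separating comma.
reader = csv.reader(StringIO(text), quotechar='"', delimiter=",", skipinitialspace=True)
print([item for row in reader for item in row])  # ['foo, foo2', 'bar', 'baz']
```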

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2024-11-01 22:42:24 +00:00
Erick Friis
9fedb04dd3 docs: INVALID_CHAT_HISTORY redirect (#27845) 2024-11-01 21:35:11 +00:00
Erick Friis
03a3670a5e infra: remove some special cases (#27839) 2024-11-01 21:13:43 +00:00
Bagatur
002e1c9055 airbyte: remove from master (#27837) 2024-11-01 13:59:34 -07:00
28 changed files with 55 additions and 3440 deletions

View File

@@ -81,7 +81,6 @@ jobs:
ES_URL: ${{ secrets.ES_URL }}
ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
ES_API_KEY: ${{ secrets.ES_API_KEY }}
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte
MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }}
VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}

View File

@@ -289,7 +289,6 @@ jobs:
ES_URL: ${{ secrets.ES_URL }}
ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
ES_API_KEY: ${{ secrets.ES_API_KEY }}
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte
MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }}
VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
UPSTAGE_API_KEY: ${{ secrets.UPSTAGE_API_KEY }}

View File

@@ -72,9 +72,7 @@ jobs:
- name: Install dependencies
working-directory: langchain
run: |
- # skip airbyte due to pandas dependency issue
- python -m uv pip install $(ls ./libs/partners | grep -vE "airbyte" | xargs -I {} echo "./libs/partners/{}")
+ python -m uv pip install $(ls ./libs/partners | xargs -I {} echo "./libs/partners/{}")
python -m uv pip install libs/core libs/langchain libs/text-splitters libs/community libs/experimental
python -m uv pip install -r docs/api_reference/requirements.txt

View File

@@ -530,7 +530,6 @@ def _out_file_path(package_name: str) -> Path:
def _build_index(dirs: List[str]) -> None:
custom_names = {
"airbyte": "Airbyte",
"aws": "AWS",
"ai21": "AI21",
"ibm": "IBM",

View File

@@ -76,7 +76,6 @@ ALL_PACKAGES = IN_REPO_PACKAGES.union(EXTERNAL_PACKAGES)
CUSTOM_NAME = {
"google-genai": "Google Generative AI",
"aws": "AWS",
"airbyte": "Airbyte",
"ibm": "IBM",
}
CUSTOM_PROVIDER_PAGES = {

View File

@@ -87,7 +87,7 @@
"destination": "/docs/integrations/providers/:path*"
},
{
"source": "/docs/troubleshooting/errors/:path((?:GRAPH_RECURSION_LIMIT|INVALID_CONCURRENT_GRAPH_UPDATE|INVALID_GRAPH_NODE_RETURN_VALUE|MULTIPLE_SUBGRAPHS)/?)*",
"source": "/docs/troubleshooting/errors/:path((?:GRAPH_RECURSION_LIMIT|INVALID_CONCURRENT_GRAPH_UPDATE|INVALID_GRAPH_NODE_RETURN_VALUE|MULTIPLE_SUBGRAPHS|INVALID_CHAT_HISTORY)/?)*",
"destination": "https://langchain-ai.github.io/langgraph/troubleshooting/errors/:path*"
},
{
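The inner alternation can be smoke-tested in plain Python (a rough sketch; Vercel actually evaluates the "source" pattern with path-to-regexp semantics, not Python's re):

```python
import re

# The alternation from the updated "source" pattern above.
errors = re.compile(
    r"^(?:GRAPH_RECURSION_LIMIT|INVALID_CONCURRENT_GRAPH_UPDATE"
    r"|INVALID_GRAPH_NODE_RETURN_VALUE|MULTIPLE_SUBGRAPHS|INVALID_CHAT_HISTORY)/?$"
)
print(bool(errors.match("INVALID_CHAT_HISTORY")))  # True (new in this change)
print(bool(errors.match("SOME_OTHER_ERROR")))      # False: falls through to the next rule
```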

View File

@@ -36,7 +36,12 @@ from langchain_core.messages.function import FunctionMessage, FunctionMessageChu
from langchain_core.messages.human import HumanMessage, HumanMessageChunk
from langchain_core.messages.modifier import RemoveMessage
from langchain_core.messages.system import SystemMessage, SystemMessageChunk
- from langchain_core.messages.tool import ToolCall, ToolMessage, ToolMessageChunk
+ from langchain_core.messages.tool import (
+     InvalidToolCall,
+     ToolCall,
+     ToolMessage,
+     ToolMessageChunk,
+ )
if TYPE_CHECKING:
from langchain_text_splitters import TextSplitter
@@ -317,9 +322,15 @@ def _convert_to_message(message: MessageLikeRepresentation) -> BaseMessage:
except KeyError:
msg_type = msg_kwargs.pop("type")
# None msg content is not allowed
- msg_content = msg_kwargs.pop("content") or ""
+ content_or_tool_calls = (
+     "tool_calls" in msg_kwargs or "content" in msg_kwargs
+ )
+ if not content_or_tool_calls:
+     msg = "Must have one of content or tool calls"
+     raise KeyError(msg)
+ msg_content = msg_kwargs.pop("content", "") or ""
except KeyError as e:
msg = f"Message dict must contain 'role' and 'content' keys, got {message}"
msg = f"Message dict must contain 'role' and one of 'content' or 'tool_calls' keys, got {message}" # noqa: E501
msg = create_message(
message=msg, error_code=ErrorCode.MESSAGE_COERCION_FAILURE
)
@@ -957,6 +968,10 @@ def convert_to_openai_messages(
oai_msg["name"] = message.name
if isinstance(message, AIMessage) and message.tool_calls:
oai_msg["tool_calls"] = _convert_to_openai_tool_calls(message.tool_calls)
+ if isinstance(message, AIMessage) and message.invalid_tool_calls:
+     oai_msg["tool_calls"] = oai_msg.get(
+         "tool_calls", []
+     ) + _convert_to_openai_tool_calls(message.invalid_tool_calls, invalid=True)
if message.additional_kwargs.get("refusal"):
oai_msg["refusal"] = message.additional_kwargs["refusal"]
if isinstance(message, ToolMessage):
@@ -1393,14 +1408,18 @@ def _get_message_openai_role(message: BaseMessage) -> str:
raise ValueError(msg)
- def _convert_to_openai_tool_calls(tool_calls: list[ToolCall]) -> list[dict]:
+ def _convert_to_openai_tool_calls(
+     tool_calls: Union[list[ToolCall], list[InvalidToolCall]], invalid: bool = False
+ ) -> list[dict]:
return [
{
"type": "function",
"id": tool_call["id"],
"function": {
"name": tool_call["name"],
"arguments": json.dumps(tool_call["args"]),
"arguments": tool_call["args"]
if invalid
else json.dumps(tool_call["args"]),
},
}
for tool_call in tool_calls
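
The invalid=True branch above can be illustrated with a small standalone sketch (the dicts below are stand-ins for langchain_core's ToolCall/InvalidToolCall, not the real types):

```python
import json

valid_call = {"id": "call_1", "name": "search", "args": {"query": "weather"}}
# An InvalidToolCall carries args as the raw (possibly malformed) string the
# model produced, so it must not be passed through json.dumps a second time.
invalid_call = {"id": "call_2", "name": "search", "args": '{"query": "weath'}

def to_openai(tool_call: dict, invalid: bool = False) -> dict:
    return {
        "type": "function",
        "id": tool_call["id"],
        "function": {
            "name": tool_call["name"],
            "arguments": tool_call["args"] if invalid else json.dumps(tool_call["args"]),
        },
    }

print(to_openai(valid_call)["function"]["arguments"])                  # '{"query": "weather"}'
print(to_openai(invalid_call, invalid=True)["function"]["arguments"])  # raw string kept as-is
```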

View File

@@ -1,9 +1,11 @@
from __future__ import annotations
+ import csv
import re
from abc import abstractmethod
from collections import deque
from collections.abc import AsyncIterator, Iterator
+ from io import StringIO
from typing import Optional as Optional
from typing import TypeVar, Union
@@ -162,7 +164,14 @@ class CommaSeparatedListOutputParser(ListOutputParser):
Returns:
A list of strings.
"""
- return [part.strip() for part in text.split(",")]
+ try:
+     reader = csv.reader(
+         StringIO(text), quotechar='"', delimiter=",", skipinitialspace=True
+     )
+     return [item for sublist in reader for item in sublist]
+ except csv.Error:
+     # keep old logic for backup
+     return [part.strip() for part in text.split(",")]
@property
def _type(self) -> str:
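After this change, the parser's behavior on quoted items looks like the following (a sketch, assuming a langchain_core build that includes this commit):

```python
from langchain_core.output_parsers import CommaSeparatedListOutputParser

parser = CommaSeparatedListOutputParser()
print(parser.parse('"foo, foo2",bar,baz'))  # ['foo, foo2', 'bar', 'baz']
print(parser.parse("foo, bar, baz"))        # ['foo', 'bar', 'baz'] (unchanged behavior)
```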

View File

@@ -64,6 +64,25 @@ def test_multiple_items() -> None:
assert list(parser.transform(iter([text]))) == [[a] for a in expected]
+ def test_multiple_items_with_comma() -> None:
+     """Test that a string with multiple comma-separated items with 1 item containing a
+     comma is parsed to a list."""
+     parser = CommaSeparatedListOutputParser()
+     text = '"foo, foo2",bar,baz'
+     expected = ["foo, foo2", "bar", "baz"]
+     assert parser.parse(text) == expected
+     assert add(parser.transform(t for t in text)) == expected
+     assert list(parser.transform(t for t in text)) == [[a] for a in expected]
+     assert list(parser.transform(t for t in text.splitlines(keepends=True))) == [
+         [a] for a in expected
+     ]
+     assert list(
+         parser.transform(" " + t if i > 0 else t for i, t in enumerate(text.split(" ")))
+     ) == [[a] for a in expected]
+     assert list(parser.transform(iter([text]))) == [[a] for a in expected]
def test_numbered_list() -> None:
parser = NumberedListOutputParser()
text1 = (

View File

@@ -23,10 +23,6 @@ packages:
- name: langchain-ai21
repo: langchain-ai/langchain-ai21
path: libs/ai21
- - name: langchain-airbyte
-   repo: langchain-ai/langchain
-   path: libs/partners/airbyte
-   disabled: true # dependency issues / stale
- name: langchain-anthropic
repo: langchain-ai/langchain
path: libs/partners/anthropic

View File

@@ -1 +0,0 @@
__pycache__

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2024 LangChain, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,59 +0,0 @@
.PHONY: all format lint test tests integration_tests docker_tests help extended_tests
# Default target executed when no arguments are given to make.
all: help
# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/
integration_test integration_tests: TEST_FILE = tests/integration_tests/
test tests integration_test integration_tests:
poetry run pytest $(TEST_FILE)
test_watch:
poetry run ptw --snapshot-update --now . -- -vv $(TEST_FILE)
######################
# LINTING AND FORMATTING
######################
# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/airbyte --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_airbyte
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test
lint lint_diff lint_package lint_tests:
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff $(PYTHON_FILES)
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
[ "$(PYTHON_FILES)" = "" ] || mkdir -p $(MYPY_CACHE) && poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)
format format_diff:
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES)
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I --fix $(PYTHON_FILES)
spell_check:
poetry run codespell --toml pyproject.toml
spell_fix:
poetry run codespell --toml pyproject.toml -w
check_imports: $(shell find langchain_airbyte -name '*.py')
poetry run python ./scripts/check_imports.py $^
######################
# HELP
######################
help:
@echo '----'
@echo 'check_imports - check imports'
@echo 'format - run code formatters'
@echo 'lint - run linters'
@echo 'test - run unit tests'
@echo 'tests - run unit tests'
@echo 'test TEST_FILE=<test_file> - run all tests in file'

View File

@@ -1,27 +0,0 @@
# langchain-airbyte
This package contains the LangChain integration with Airbyte
## Installation
```bash
pip install -U langchain-airbyte
```
The integration package doesn't have any global environment variables that need to be
set, but some integrations (e.g. `source-github`) may need credentials passed in.
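For example, credentials for `source-github` can be passed through `config` (a sketch mirroring the package's integration tests; the exact config shape depends on the Airbyte source):

```python
import os

from langchain_airbyte import AirbyteLoader

# Assumes GITHUB_TOKEN is set in the environment.
loader = AirbyteLoader(
    source="source-github",
    stream="issues",
    config={
        "repositories": ["airbytehq/quickstarts"],
        "credentials": {"personal_access_token": os.environ["GITHUB_TOKEN"]},
    },
)
docs = loader.load()
```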
## Document Loaders
`AirbyteLoader` class exposes a single document loader for Airbyte sources.
```python
from langchain_airbyte import AirbyteLoader
loader = AirbyteLoader(
source="source-faker",
stream="users",
config={"count": 100},
)
docs = loader.load()
```

View File

@@ -1,3 +0,0 @@
from langchain_airbyte.document_loaders import AirbyteLoader
__all__ = ["AirbyteLoader"]

View File

@@ -1,127 +0,0 @@
"""Airbyte vector stores."""
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
AsyncIterator,
Dict,
Iterator,
List,
Mapping,
Optional,
TypeVar,
)
import airbyte as ab
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import run_in_executor
from langchain_core.vectorstores import VectorStore
if TYPE_CHECKING:
from langchain_text_splitters import TextSplitter
VST = TypeVar("VST", bound=VectorStore)
class AirbyteLoader:
"""Airbyte Document Loader.
Example:
.. code-block:: python
from langchain_airbyte import AirbyteLoader
loader = AirbyteLoader(
source="github",
stream="pull_requests",
)
documents = loader.lazy_load()
"""
def __init__(
self,
source: str,
stream: str,
*,
config: Optional[Dict] = None,
include_metadata: bool = True,
template: Optional[PromptTemplate] = None,
):
self._airbyte_source = ab.get_source(source, config=config, streams=[stream])
self._stream = stream
self._template = template
self._include_metadata = include_metadata
def load(self) -> List[Document]:
"""Load source data into Document objects."""
return list(self.lazy_load())
def load_and_split(
self, text_splitter: Optional[TextSplitter] = None
) -> List[Document]:
"""Load Documents and split into chunks. Chunks are returned as Documents.
Args:
text_splitter: TextSplitter instance to use for splitting documents.
Defaults to RecursiveCharacterTextSplitter.
Returns:
List of Documents.
"""
if text_splitter is None:
try:
from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError as e:
raise ImportError(
"Unable to import from langchain_text_splitters. Please specify "
"text_splitter or install langchain_text_splitters with "
"`pip install -U langchain-text-splitters`."
) from e
_text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
else:
_text_splitter = text_splitter
docs = self.lazy_load()
return _text_splitter.split_documents(docs)
def lazy_load(self) -> Iterator[Document]:
"""A lazy loader for Documents."""
# if no prompt template defined, use default airbyte documents
if not self._template:
for document in self._airbyte_source.get_documents(self._stream):
# convert airbyte document to langchain document
metadata = (
{}
if not self._include_metadata
else {
**document.metadata,
"_last_modified": document.last_modified,
"_id": document.id,
}
)
yield Document(
page_content=document.content,
metadata=metadata,
)
else:
records: Iterator[Mapping[str, Any]] = self._airbyte_source.get_records(
self._stream
)
for record in records:
metadata = {} if not self._include_metadata else dict(record)
yield Document(
page_content=self._template.format(**record), metadata=metadata
)
async def alazy_load(self) -> AsyncIterator[Document]:
"""A lazy loader for Documents."""
iterator = await run_in_executor(None, self.lazy_load)
done = object()
while True:
doc = await run_in_executor(None, next, iterator, done) # type: ignore[call-arg, arg-type]
if doc is done:
break
yield doc # type: ignore[misc]

File diff suppressed because it is too large

View File

@@ -1,88 +0,0 @@
[tool.poetry]
name = "langchain-airbyte"
version = "0.1.1"
description = "An integration package connecting Airbyte and LangChain"
authors = []
readme = "README.md"
repository = "https://github.com/langchain-ai/langchain"
license = "MIT"
[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/airbyte"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-airbyte%3D%3D0%22&expanded=true"
[tool.poetry.dependencies]
python = ">=3.9,<3.12.4"
langchain-core = "^0.3.0.dev"
airbyte = "^0.7.3"
pydantic = ">=1.10.8,<2"
[tool.poetry.group.test]
optional = true
[tool.poetry.group.test.dependencies]
pytest = "^7.4.3"
pytest-asyncio = "^0.23.2"
langchain-core = { path = "../../core", develop = true }
[tool.poetry.group.codespell]
optional = true
[tool.poetry.group.codespell.dependencies]
codespell = "^2.2.6"
[tool.poetry.group.test_integration]
optional = true
[tool.poetry.group.test_integration.dependencies]
[tool.poetry.group.lint]
optional = true
[tool.poetry.group.lint.dependencies]
ruff = "^0.1.8"
[tool.poetry.group.typing.dependencies]
mypy = "^1.7.1"
langchain-core = { path = "../../core", develop = true }
langchain-text-splitters = { path = "../../text-splitters", develop = true }
langchain = { path = "../../langchain", develop = true }
[tool.poetry.group.dev]
optional = true
[tool.poetry.group.dev.dependencies]
langchain-core = { path = "../../core", develop = true }
[tool.ruff.lint]
select = [
"E", # pycodestyle
"F", # pyflakes
"I", # isort
"T201", # print
]
[tool.mypy]
disallow_untyped_defs = "True"
[tool.coverage.run]
omit = ["tests/*"]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.pytest.ini_options]
# --strict-markers will raise errors on unknown marks.
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
#
# https://docs.pytest.org/en/7.1.x/reference/reference.html
# --strict-config any warnings encountered while parsing the `pytest`
# section of the configuration file raise errors.
addopts = "--strict-markers --strict-config --durations=5"
# Registering custom markers.
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
markers = [
"compile: mark placeholder test used to compile integration tests without running them",
]
asyncio_mode = "auto"

View File

@@ -1,17 +0,0 @@
import sys
import traceback
from importlib.machinery import SourceFileLoader
if __name__ == "__main__":
files = sys.argv[1:]
has_failure = False
for file in files:
try:
SourceFileLoader("x", file).load_module()
except Exception:
has_failure = True
print(file) # noqa: T201
traceback.print_exc()
print() # noqa: T201
sys.exit(1 if has_failure else 0)

View File

@@ -1,18 +0,0 @@
#!/bin/bash
set -eu
# Initialize a variable to keep track of errors
errors=0
# make sure not importing from langchain, langchain_experimental, or langchain_community
git --no-pager grep '^from langchain\.' . && errors=$((errors+1))
git --no-pager grep '^from langchain_experimental\.' . && errors=$((errors+1))
git --no-pager grep '^from langchain_community\.' . && errors=$((errors+1))
# Decide on an exit status based on the errors
if [ "$errors" -gt 0 ]; then
exit 1
else
exit 0
fi

View File

@@ -1,7 +0,0 @@
import pytest
@pytest.mark.compile
def test_placeholder() -> None:
"""Used for compiling integration tests without running any real tests."""
pass

View File

@@ -1,28 +0,0 @@
"""Test Airbyte embeddings."""
import os
from langchain_airbyte import AirbyteLoader
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
def test_load_github() -> None:
"""Test loading from GitHub."""
airbyte_loader = AirbyteLoader(
source="source-github",
stream="issues",
config={
"repositories": ["airbytehq/quickstarts"],
"credentials": {"personal_access_token": GITHUB_TOKEN},
},
)
documents = airbyte_loader.load()
assert len(documents) > 0
# make sure some documents have body in metadata
found_body = False
for doc in documents:
if "body" in doc.metadata and doc.metadata["body"]:
found_body = True
break
assert found_body, "No documents with body found"

View File

@@ -1,77 +0,0 @@
from langchain_core.prompts import PromptTemplate
from langchain_airbyte import AirbyteLoader
def test_initialization() -> None:
"""Test integration loader initialization."""
AirbyteLoader(
source="source-faker",
stream="users",
config={"count": 3},
)
def test_load() -> None:
"""Test loading from source."""
airbyte_loader = AirbyteLoader(
source="source-faker",
stream="users",
config={"count": 5},
)
documents = airbyte_loader.load()
assert len(documents) == 5
def test_lazy_load() -> None:
"""Test lazy loading from source."""
airbyte_loader = AirbyteLoader(
source="source-faker",
stream="users",
config={"count": 3},
)
documents = airbyte_loader.lazy_load()
assert len(list(documents)) == 3
async def test_alazy_load() -> None:
"""Test async lazy loading from source."""
airbyte_loader = AirbyteLoader(
source="source-faker",
stream="users",
config={"count": 3},
)
documents = airbyte_loader.alazy_load()
lendocs = 0
async for _ in documents:
lendocs += 1
assert lendocs == 3
def test_load_with_template() -> None:
"""Test loading from source with template."""
airbyte_loader = AirbyteLoader(
source="source-faker",
stream="users",
config={"count": 3},
template=PromptTemplate.from_template("My name is {name}"),
)
documents = airbyte_loader.load()
assert len(documents) == 3
for doc in documents:
assert doc.page_content.startswith("My name is ")
assert doc.metadata["name"] # should have a name
def test_load_no_metadata() -> None:
"""Test loading from source with no metadata."""
airbyte_loader = AirbyteLoader(
source="source-faker",
stream="users",
config={"count": 3},
include_metadata=False,
)
documents = airbyte_loader.load()
assert len(documents) == 3
for doc in documents:
assert doc.metadata == {}

View File

@@ -1,9 +0,0 @@
from langchain_airbyte import __all__
EXPECTED_ALL = [
"AirbyteLoader",
]
def test_all_imports() -> None:
assert sorted(EXPECTED_ALL) == sorted(__all__)