mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 15:19:33 +00:00
core: fix CommaSeparatedListOutputParser to handle columns that contain commas (#26365)
- **Description:** Currently, CommaSeparatedListOutputParser cannot handle strings in which a column itself contains commas; it treats every comma as a delimiter. For example, given `"foo, foo2", "bar", "baz"` it creates 4 columns: "foo", "foo2", "bar", "baz". It should create 3 columns: "foo, foo2", "bar", "baz".
- **Dependencies:** Added 2 additional imports, both of which are built-in Python standard-library modules: `import csv` and `from io import StringIO`.
- **Twitter handle:** @jkyamog
- [ ] **Add tests and docs**: 1. Added a simple unit test, `test_multiple_items_with_comma`.
--------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
9fedb04dd3
commit
830cad7bc0
@ -1,9 +1,11 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
|
||||||
import re
|
import re
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from collections.abc import AsyncIterator, Iterator
|
from collections.abc import AsyncIterator, Iterator
|
||||||
|
from io import StringIO
|
||||||
from typing import Optional as Optional
|
from typing import Optional as Optional
|
||||||
from typing import TypeVar, Union
|
from typing import TypeVar, Union
|
||||||
|
|
||||||
@ -162,6 +164,13 @@ class CommaSeparatedListOutputParser(ListOutputParser):
|
|||||||
Returns:
|
Returns:
|
||||||
A list of strings.
|
A list of strings.
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
|
reader = csv.reader(
|
||||||
|
StringIO(text), quotechar='"', delimiter=",", skipinitialspace=True
|
||||||
|
)
|
||||||
|
return [item for sublist in reader for item in sublist]
|
||||||
|
except csv.Error:
|
||||||
|
# keep old logic for backup
|
||||||
return [part.strip() for part in text.split(",")]
|
return [part.strip() for part in text.split(",")]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -64,6 +64,25 @@ def test_multiple_items() -> None:
|
|||||||
assert list(parser.transform(iter([text]))) == [[a] for a in expected]
|
assert list(parser.transform(iter([text]))) == [[a] for a in expected]
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_items_with_comma() -> None:
|
||||||
|
"""Test that a string with multiple comma-separated items with 1 item containing a
|
||||||
|
comma is parsed to a list."""
|
||||||
|
parser = CommaSeparatedListOutputParser()
|
||||||
|
text = '"foo, foo2",bar,baz'
|
||||||
|
expected = ["foo, foo2", "bar", "baz"]
|
||||||
|
|
||||||
|
assert parser.parse(text) == expected
|
||||||
|
assert add(parser.transform(t for t in text)) == expected
|
||||||
|
assert list(parser.transform(t for t in text)) == [[a] for a in expected]
|
||||||
|
assert list(parser.transform(t for t in text.splitlines(keepends=True))) == [
|
||||||
|
[a] for a in expected
|
||||||
|
]
|
||||||
|
assert list(
|
||||||
|
parser.transform(" " + t if i > 0 else t for i, t in enumerate(text.split(" ")))
|
||||||
|
) == [[a] for a in expected]
|
||||||
|
assert list(parser.transform(iter([text]))) == [[a] for a in expected]
|
||||||
|
|
||||||
|
|
||||||
def test_numbered_list() -> None:
|
def test_numbered_list() -> None:
|
||||||
parser = NumberedListOutputParser()
|
parser = NumberedListOutputParser()
|
||||||
text1 = (
|
text1 = (
|
||||||
|
Loading…
Reference in New Issue
Block a user