mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 07:09:31 +00:00
core: fix CommaSeparatedListOutputParser to handle columns that may contain commas (#26365)
- **Description:** Currently, CommaSeparatedListOutputParser can't handle strings that contain commas within a column; it treats every comma as a delimiter. For example, the input "foo, foo2", "bar", "baz" is parsed into 4 columns: "foo", "foo2", "bar", "baz". It should be parsed into 3 columns: "foo, foo2", "bar", "baz".
- **Dependencies:** Added 2 additional imports, both from the Python standard library: `import csv` and `from io import StringIO`.
- **Twitter handle:** @jkyamog
- [ ] **Add tests and docs**: 1. Added a simple unit test, test_multiple_items_with_comma.

---------

Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
9fedb04dd3
commit
830cad7bc0
@ -1,9 +1,11 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import re
|
||||
from abc import abstractmethod
|
||||
from collections import deque
|
||||
from collections.abc import AsyncIterator, Iterator
|
||||
from io import StringIO
|
||||
from typing import Optional as Optional
|
||||
from typing import TypeVar, Union
|
||||
|
||||
@ -162,7 +164,14 @@ class CommaSeparatedListOutputParser(ListOutputParser):
|
||||
Returns:
|
||||
A list of strings.
|
||||
"""
|
||||
return [part.strip() for part in text.split(",")]
|
||||
try:
|
||||
reader = csv.reader(
|
||||
StringIO(text), quotechar='"', delimiter=",", skipinitialspace=True
|
||||
)
|
||||
return [item for sublist in reader for item in sublist]
|
||||
except csv.Error:
|
||||
# keep old logic for backup
|
||||
return [part.strip() for part in text.split(",")]
|
||||
|
||||
@property
|
||||
def _type(self) -> str:
|
||||
|
@ -64,6 +64,25 @@ def test_multiple_items() -> None:
|
||||
assert list(parser.transform(iter([text]))) == [[a] for a in expected]
|
||||
|
||||
|
||||
def test_multiple_items_with_comma() -> None:
    """Verify that a quoted item containing a comma is kept as a single entry.

    The same three-item input is fed through ``parse`` and through
    ``transform`` using several different chunkings of the text.
    """
    parser = CommaSeparatedListOutputParser()
    text = '"foo, foo2",bar,baz'
    expected = ["foo, foo2", "bar", "baz"]
    one_item_per_chunk = [[item] for item in expected]

    # Whole-string parsing.
    assert parser.parse(text) == expected

    # Streaming one character at a time.
    assert add(parser.transform(iter(text))) == expected
    assert list(parser.transform(iter(text))) == one_item_per_chunk

    # Streaming line by line, keeping the line endings.
    line_chunks = text.splitlines(keepends=True)
    assert list(parser.transform(iter(line_chunks))) == one_item_per_chunk

    # Streaming space-delimited pieces; every piece after the first gets the
    # space that ``split`` removed put back in front of it.
    head, *tail = text.split(" ")
    word_chunks = [head, *(" " + piece for piece in tail)]
    assert list(parser.transform(iter(word_chunks))) == one_item_per_chunk

    # Streaming the full text as one single chunk.
    assert list(parser.transform(iter([text]))) == one_item_per_chunk
|
||||
|
||||
|
||||
def test_numbered_list() -> None:
|
||||
parser = NumberedListOutputParser()
|
||||
text1 = (
|
||||
|
Loading…
Reference in New Issue
Block a user