core: fix CommaSeparatedListOutputParser to handle columns that contain commas (#26365)

- **Description:**
Currently, CommaSeparatedListOutputParser can't handle strings that
contain commas within a column; it treats every comma as a
delimiter.
Example:
"foo, foo2", "bar", "baz"

The parser currently produces 4 columns: "foo", "foo2", "bar", "baz"

It should produce 3 columns:

"foo, foo2", "bar", "baz"

- **Dependencies:**
Added 2 imports, both from the Python standard library:

import csv
from io import StringIO

- **Twitter handle:** @jkyamog

- [ ] **Add tests and docs**: 
1. Added a simple unit test, `test_multiple_items_with_comma`.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Jun Yamog 2024-11-02 11:42:24 +13:00 committed by GitHub
parent 9fedb04dd3
commit 830cad7bc0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 29 additions and 1 deletions

View File

@ -1,9 +1,11 @@
from __future__ import annotations from __future__ import annotations
import csv
import re import re
from abc import abstractmethod from abc import abstractmethod
from collections import deque from collections import deque
from collections.abc import AsyncIterator, Iterator from collections.abc import AsyncIterator, Iterator
from io import StringIO
from typing import Optional as Optional from typing import Optional as Optional
from typing import TypeVar, Union from typing import TypeVar, Union
@ -162,6 +164,13 @@ class CommaSeparatedListOutputParser(ListOutputParser):
Returns: Returns:
A list of strings. A list of strings.
""" """
try:
reader = csv.reader(
StringIO(text), quotechar='"', delimiter=",", skipinitialspace=True
)
return [item for sublist in reader for item in sublist]
except csv.Error:
# keep old logic for backup
return [part.strip() for part in text.split(",")] return [part.strip() for part in text.split(",")]
@property @property

View File

@ -64,6 +64,25 @@ def test_multiple_items() -> None:
assert list(parser.transform(iter([text]))) == [[a] for a in expected] assert list(parser.transform(iter([text]))) == [[a] for a in expected]
def test_multiple_items_with_comma() -> None:
    """Check that a quoted item containing a comma is kept as one element.

    The same input is handed to the parser in one piece, one character at a
    time, one line at a time, and split on spaces; every route is expected to
    yield the same three items.
    """
    parser = CommaSeparatedListOutputParser()
    text = '"foo, foo2",bar,baz'
    expected = ["foo, foo2", "bar", "baz"]
    # Expected shape when the transform() output is collected into a list:
    # each item wrapped in its own single-element list.
    expected_chunks = [[item] for item in expected]

    # Whole string parsed in one call.
    assert parser.parse(text) == expected

    # Input streamed one character at a time.
    assert add(parser.transform(ch for ch in text)) == expected
    assert list(parser.transform(ch for ch in text)) == expected_chunks

    # Input streamed one line at a time (line endings kept).
    by_line = parser.transform(
        line for line in text.splitlines(keepends=True)
    )
    assert list(by_line) == expected_chunks

    # Input split on spaces; the space removed by split() is put back in
    # front of every piece after the first.
    by_space = parser.transform(
        " " + piece if idx > 0 else piece
        for idx, piece in enumerate(text.split(" "))
    )
    assert list(by_space) == expected_chunks

    # Input delivered as a single chunk.
    assert list(parser.transform(iter([text]))) == expected_chunks
def test_numbered_list() -> None: def test_numbered_list() -> None:
parser = NumberedListOutputParser() parser = NumberedListOutputParser()
text1 = ( text1 = (