Mirror of https://github.com/hwchase17/langchain.git (synced 2025-08-02 17:54:23 +00:00)
Running make lint, make format, and make test

commit 68bede3e24
parent d50965f7b2
@@ -74,8 +74,6 @@ class TextSplitter(BaseDocumentTransformer, ABC):
         self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None
     ) -> list[Document]:
         """Create documents from a list of texts."""
-        if isinstance(self,TokenTextSplitter):
-            return self.token_create_documents(texts,metadatas)
         _metadatas = metadatas or [{}] * len(texts)
         documents = []
         for i, text in enumerate(texts):
@@ -227,6 +225,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):

 class TokenTextSplitter(TextSplitter):
     """Splitting text to tokens using model tokenizer."""
+
     def __init__(
         self,
         encoding_name: str = "gpt2",
@@ -288,8 +287,8 @@ class TokenTextSplitter(TextSplitter):
         return split_text_on_tokens(text=text, tokenizer=tokenizer)

     def create_documents(
         self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None
     ) -> list[Document]:
         """Override to create documents from a list of tokens."""
         _metadatas = metadatas or [{}] * len(texts)
         documents = []
@@ -305,12 +304,13 @@ class TokenTextSplitter(TextSplitter):
                 if self._add_start_index:
                     char_index = text.find(chunk_text, char_index)
                     metadata["start_index"] = char_index
-                documents.append(Document(page_content=chunk_text,metadata=metadata))
+                documents.append(Document(page_content=chunk_text, metadata=metadata))
                 if end_idx == len(input_ids):
                     break
                 start_idx += self._chunk_size - self._chunk_overlap
         return documents

+
 class Language(str, Enum):
     """Enum of the programming languages."""

@@ -372,4 +372,3 @@ def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]:
         cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
         chunk_ids = input_ids[start_idx:cur_idx]
     return splits
-
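The hunks above only reformat the windowing loop inside TokenTextSplitter.create_documents; the underlying logic walks the encoded token ids in overlapping windows and advances by chunk_size - chunk_overlap each step. A minimal standalone sketch of that stepping (a hypothetical helper, not the library code; only the stride and the end-of-input break are taken from the diff):

def window_token_ids(
    input_ids: list[int], chunk_size: int, chunk_overlap: int
) -> list[list[int]]:
    """Split token ids into overlapping windows of at most chunk_size ids."""
    chunks: list[list[int]] = []
    start_idx = 0
    while start_idx < len(input_ids):
        end_idx = min(start_idx + chunk_size, len(input_ids))
        chunks.append(input_ids[start_idx:end_idx])
        if end_idx == len(input_ids):  # last window reached the end of the ids
            break
        start_idx += chunk_size - chunk_overlap  # same stride as in the diff

    return chunks

# e.g. window_token_ids(list(range(12)), chunk_size=10, chunk_overlap=5)
# returns two windows: ids 0..9 and ids 5..11.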
@@ -13,7 +13,7 @@ from langchain_text_splitters import (
     RecursiveCharacterTextSplitter,
     TextSplitter,
     Tokenizer,
-    TokenTextSplitter
+    TokenTextSplitter,
 )
 from langchain_text_splitters.base import split_text_on_tokens
 from langchain_text_splitters.character import CharacterTextSplitter
@@ -3668,13 +3668,10 @@ def test_character_text_splitter_chunk_size_effect(
     )
     assert splitter.split_text(text) == expected

+
 def test_token_splitter_create_documents() -> None:
-    splitter = TokenTextSplitter(
-        add_start_index=True,
-        chunk_size=10,
-        chunk_overlap=5
-    )
-    text="""
+    splitter = TokenTextSplitter(add_start_index=True, chunk_size=10, chunk_overlap=5)
+    text = """
 "Lorem ipsum dolor sit amet, consectetur adipiscing elit,
 sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
 Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
@@ -3686,13 +3683,10 @@ def test_token_splitter_create_documents() -> None:
         s_i = doc.metadata["start_index"]
         assert text[s_i : s_i + len(doc.page_content)] == doc.page_content

+
 def test_token_splitter_create_documents_repeat_text() -> None:
-    splitter = TokenTextSplitter(
-        add_start_index=True,
-        chunk_size=10,
-        chunk_overlap=5
-    )
-    text="""
+    splitter = TokenTextSplitter(add_start_index=True,chunk_size=10,chunk_overlap=5)
+    text = """
 "the quick brown fox jumped over the lazy fox
 the quick brown fox jumped over the lazy fox
 the quick brown fox jumped over the lazy fox
@@ -3703,4 +3697,3 @@ def test_token_splitter_create_documents_repeat_text() -> None:
     for doc in docs:
         s_i = doc.metadata["start_index"]
         assert text[s_i : s_i + len(doc.page_content)] == doc.page_content
-
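The two tests above exercise the start_index bookkeeping end to end. A minimal usage sketch of the same behaviour, assuming langchain-text-splitters and its tiktoken dependency are installed; it mirrors the test setup in this diff rather than documenting any new API, and the sample string is arbitrary:

from langchain_text_splitters import TokenTextSplitter

# Chunk a short text into overlapping 10-token windows and record where each
# chunk starts in the original string via add_start_index.
splitter = TokenTextSplitter(add_start_index=True, chunk_size=10, chunk_overlap=5)
text = "the quick brown fox jumped over the lazy fox " * 4
docs = splitter.create_documents([text])
for doc in docs:
    s_i = doc.metadata["start_index"]
    # Each chunk's page_content is found verbatim at its recorded offset,
    # which is exactly what the tests assert.
    assert text[s_i : s_i + len(doc.page_content)] == doc.page_content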