text-splitters: add pydocstyle linting (#28127)
Following #23188, this turns on Google-style docstring checking by enabling `pydocstyle` linting in the `text-splitters` package. Each resulting lint error was addressed in one of three ways: resolved, suppressed, or ignored; missing docstrings were added. Fixes one of the checklist items from #25154, similar to #25939 in the `core` package. Ran `make format`, `make lint`, and `make test` from the root of the `text-splitters` package to ensure no issues were found. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
b53f07bfb9
commit
90f162efb6
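For context (editor's illustration, not part of this commit): under ruff's `pydocstyle` rules with `convention = "google"`, a passing docstring puts the one-line summary on the opening quotes and separates each section with a blank line, e.g.

.. code-block:: python

    def split_text(text: str, chunk_size: int = 100) -> list[str]:
        """Split text into chunks of at most `chunk_size` characters.

        Args:
            text (str): The input text to split.
            chunk_size (int): Maximum size of each chunk. Defaults to 100.

        Returns:
            list[str]: The resulting chunks.
        """
        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]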
@@ -1,6 +1,5 @@
 """**Text Splitters** are classes for splitting text.

-
 **Class hierarchy:**

 .. code-block::
@@ -249,6 +249,21 @@ class TokenTextSplitter(TextSplitter):
         self._disallowed_special = disallowed_special

     def split_text(self, text: str) -> List[str]:
+        """Splits the input text into smaller chunks based on tokenization.
+
+        This method uses a custom tokenizer configuration to encode the input text
+        into tokens, processes the tokens in chunks of a specified size with overlap,
+        and decodes them back into text chunks. The splitting is performed using the
+        `split_text_on_tokens` function.
+
+        Args:
+            text (str): The input text to be split into smaller chunks.
+
+        Returns:
+            List[str]: A list of text chunks, where each chunk is derived from a portion
+                of the input text based on the tokenization and chunking rules.
+        """
+
         def _encode(_text: str) -> List[int]:
             return self._tokenizer.encode(
                 _text,
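A minimal usage sketch for the method documented above (editor's illustration, not part of the diff; assumes the `tiktoken` dependency of `TokenTextSplitter` is installed):

.. code-block:: python

    from langchain_text_splitters import TokenTextSplitter

    # Chunks of at most 10 tokens, with 2 tokens of overlap between chunks.
    splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=2)
    chunks = splitter.split_text("LangChain text splitters break long documents into chunks.")
    print(chunks)  # -> list[str]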
@@ -115,17 +115,45 @@ class RecursiveCharacterTextSplitter(TextSplitter):
         return final_chunks

     def split_text(self, text: str) -> List[str]:
+        """Split the input text into smaller chunks based on predefined separators.
+
+        Args:
+            text (str): The input text to be split.
+
+        Returns:
+            List[str]: A list of text chunks obtained after splitting.
+        """
         return self._split_text(text, self._separators)

     @classmethod
     def from_language(
         cls, language: Language, **kwargs: Any
     ) -> RecursiveCharacterTextSplitter:
+        """Return an instance of this class based on a specific language.
+
+        This method initializes the text splitter with language-specific separators.
+
+        Args:
+            language (Language): The language to configure the text splitter for.
+            **kwargs (Any): Additional keyword arguments to customize the splitter.
+
+        Returns:
+            RecursiveCharacterTextSplitter: An instance of the text splitter configured
+                for the specified language.
+        """
         separators = cls.get_separators_for_language(language)
         return cls(separators=separators, is_separator_regex=True, **kwargs)

     @staticmethod
     def get_separators_for_language(language: Language) -> List[str]:
+        """Retrieve a list of separators specific to the given language.
+
+        Args:
+            language (Language): The language for which to get the separators.
+
+        Returns:
+            List[str]: A list of separators appropriate for the specified language.
+        """
         if language == Language.C or language == Language.CPP:
             return [
                 # Split along class definitions
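A usage sketch for the language-aware constructor above (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

    # Build a splitter preconfigured with Python-specific separators.
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON, chunk_size=60, chunk_overlap=0
    )
    print(RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON))
    print(splitter.split_text("def f():\n    pass\n\nclass C:\n    pass"))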
@@ -21,8 +21,8 @@ class ElementType(TypedDict):


 class HTMLHeaderTextSplitter:
-    """
-    Splitting HTML files based on specified headers.
+    """Splitting HTML files based on specified headers.
+
     Requires lxml package.
     """

@@ -46,7 +46,7 @@ class HTMLHeaderTextSplitter:
     def aggregate_elements_to_chunks(
         self, elements: List[ElementType]
     ) -> List[Document]:
-        """Combine elements with common metadata into chunks
+        """Combine elements with common metadata into chunks.

         Args:
             elements: HTML element content with associated identifying info and metadata
@@ -72,7 +72,7 @@ class HTMLHeaderTextSplitter:
         ]

     def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
-        """Split HTML from web URL
+        """Split HTML from web URL.

         Args:
             url: web URL
@@ -83,7 +83,7 @@ class HTMLHeaderTextSplitter:
         return self.split_text_from_file(BytesIO(r.content))

     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string
+        """Split HTML text string.

         Args:
             text: HTML text
@@ -91,7 +91,7 @@ class HTMLHeaderTextSplitter:
         return self.split_text_from_file(StringIO(text))

     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file
+        """Split HTML file.

         Args:
             file: HTML file
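A usage sketch covering the `HTMLHeaderTextSplitter` entry points above (editor's illustration, not part of the diff; assumes `lxml` is installed, per the class docstring):

.. code-block:: python

    from langchain_text_splitters import HTMLHeaderTextSplitter

    splitter = HTMLHeaderTextSplitter(
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
    )
    html = "<html><body><h1>Intro</h1><p>Hello.</p><h2>Details</h2><p>More.</p></body></html>"
    docs = splitter.split_text(html)  # matched headers land in each Document's metadata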
@@ -166,8 +166,8 @@ class HTMLHeaderTextSplitter:


 class HTMLSectionSplitter:
-    """
-    Splitting HTML files based on specified tag and font sizes.
+    """Splitting HTML files based on specified tag and font sizes.
+
     Requires lxml package.
     """

@@ -186,6 +186,8 @@ class HTMLSectionSplitter:
             xslt_path: path to xslt file for document transformation.
                 Uses a default if not passed.
                 Needed for html contents that using different format and layouts.
+            **kwargs (Any): Additional optional arguments for customizations.
+
         """
         self.headers_to_split_on = dict(headers_to_split_on)

@@ -210,7 +212,7 @@ class HTMLSectionSplitter:
         return text_splitter.split_documents(results)

     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string
+        """Split HTML text string.

         Args:
             text: HTML text
@@ -236,6 +238,23 @@ class HTMLSectionSplitter:
         return documents

     def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
+        """Split an HTML document into sections based on specified header tags.
+
+        This method uses BeautifulSoup to parse the HTML content and divides it into
+        sections based on headers defined in `headers_to_split_on`. Each section
+        contains the header text, content under the header, and the tag name.
+
+        Args:
+            html_doc (str): The HTML document to be split into sections.
+
+        Returns:
+            List[Dict[str, Optional[str]]]: A list of dictionaries representing
+                sections.
+                Each dictionary contains:
+                    - 'header': The header text or a default title for the first section.
+                    - 'content': The content under the header.
+                    - 'tag_name': The name of the header tag (e.g., "h1", "h2").
+        """
         try:
             from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
         except ImportError as e:
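A sketch of the section output described in the new docstring (editor's illustration, not part of the diff; assumes `beautifulsoup4` is installed):

.. code-block:: python

    from langchain_text_splitters import HTMLSectionSplitter

    splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
    sections = splitter.split_html_by_headers(
        "<html><body><h1>Intro</h1><p>Hello.</p></body></html>"
    )
    # Each section is a dict with 'header', 'content', and 'tag_name' keys.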
@@ -259,7 +278,7 @@ class HTMLSectionSplitter:
                     section_content: List = []
                 else:
                     current_header = header_element.text.strip()
-                    current_header_tag = header_element.name
+                    current_header_tag = header_element.name  # type: ignore[attr-defined]
                     section_content = []
             for element in header_element.next_elements:
                 if i + 1 < len(headers) and element == headers[i + 1]:
@@ -280,6 +299,18 @@ class HTMLSectionSplitter:
         return sections

     def convert_possible_tags_to_header(self, html_content: str) -> str:
+        """Convert specific HTML tags to headers using an XSLT transformation.
+
+        This method uses an XSLT file to transform the HTML content, converting
+        certain tags into headers for easier parsing. If no XSLT path is provided,
+        the HTML content is returned unchanged.
+
+        Args:
+            html_content (str): The HTML content to be transformed.
+
+        Returns:
+            str: The transformed HTML content as a string.
+        """
         if self.xslt_path is None:
             return html_content

@@ -299,7 +330,7 @@ class HTMLSectionSplitter:
         return str(result)

     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file
+        """Split HTML file.

         Args:
             file: HTML file
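And the higher-level `split_text` path for the same class (editor's illustration, not part of the diff; assumes `lxml` and `beautifulsoup4` are installed):

.. code-block:: python

    from langchain_text_splitters import HTMLSectionSplitter

    splitter = HTMLSectionSplitter(
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
    )
    docs = splitter.split_text("<html><body><h1>Intro</h1><p>Hello.</p></body></html>")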
@@ -8,9 +8,38 @@ from langchain_core.documents import Document


 class RecursiveJsonSplitter:
+    """Splits JSON data into smaller, structured chunks while preserving hierarchy.
+
+    This class provides methods to split JSON data into smaller dictionaries or
+    JSON-formatted strings based on configurable maximum and minimum chunk sizes.
+    It supports nested JSON structures, optionally converts lists into dictionaries
+    for better chunking, and allows the creation of document objects for further use.
+
+    Attributes:
+        max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
+        min_chunk_size (int): The minimum size for each chunk, derived from
+            `max_chunk_size` if not explicitly provided.
+    """
+
     def __init__(
         self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
     ):
+        """Initialize the chunk size configuration for text processing.
+
+        This constructor sets up the maximum and minimum chunk sizes, ensuring that
+        the `min_chunk_size` defaults to a value slightly smaller than the
+        `max_chunk_size` if not explicitly provided.
+
+        Args:
+            max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
+            min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
+                defaults to the maximum chunk size minus 200, with a lower bound of 50.
+
+        Attributes:
+            max_chunk_size (int): The configured maximum size for each chunk.
+            min_chunk_size (int): The configured minimum size for each chunk, derived
+                from `max_chunk_size` if not explicitly provided.
+        """
         super().__init__()
         self.max_chunk_size = max_chunk_size
         self.min_chunk_size = (
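A usage sketch for the class documented above (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import RecursiveJsonSplitter

    splitter = RecursiveJsonSplitter(max_chunk_size=300)  # min defaults to max - 200
    data = {"site": {"home": "intro text", "about": "company text"}}
    chunks = splitter.split_json(json_data=data)  # list of dicts, each within the size limits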
@@ -51,9 +80,7 @@ class RecursiveJsonSplitter:
         current_path: Optional[List[str]] = None,
         chunks: Optional[List[Dict]] = None,
     ) -> List[Dict]:
-        """
-        Split json into maximum size dictionaries while preserving structure.
-        """
+        """Split json into maximum size dictionaries while preserving structure."""
         current_path = current_path or []
         chunks = chunks if chunks is not None else [{}]
         if isinstance(data, dict):
@@ -83,8 +110,7 @@ class RecursiveJsonSplitter:
         json_data: Dict[str, Any],
         convert_lists: bool = False,
     ) -> List[Dict]:
-        """Splits JSON into a list of JSON chunks"""
-
+        """Splits JSON into a list of JSON chunks."""
         if convert_lists:
             chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
         else:
@@ -101,8 +127,7 @@ class RecursiveJsonSplitter:
         convert_lists: bool = False,
         ensure_ascii: bool = True,
     ) -> List[str]:
-        """Splits JSON into a list of JSON formatted strings"""
-
+        """Splits JSON into a list of JSON formatted strings."""
         chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

         # Convert to string
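The string-producing variant sits next to the dict-producing one (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import RecursiveJsonSplitter

    splitter = RecursiveJsonSplitter(max_chunk_size=300)
    data = {"a": {"b": "one", "c": "two"}}
    texts = splitter.split_text(json_data=data)  # list of JSON-formatted strings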
@@ -45,7 +45,8 @@ class MarkdownHeaderTextSplitter:
         self.strip_headers = strip_headers

     def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
-        """Combine lines with common metadata into chunks
+        """Combine lines with common metadata into chunks.
+
         Args:
             lines: Line of text / associated header metadata
         """
@@ -87,10 +88,11 @@ class MarkdownHeaderTextSplitter:
         ]

     def split_text(self, text: str) -> List[Document]:
-        """Split markdown file
+        """Split markdown file.
+
         Args:
-            text: Markdown file"""
+            text: Markdown file
+        """
         # Split the input text by newline character ("\n").
         lines = text.split("\n")
         # Final output
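A usage sketch for the markdown splitter above (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import MarkdownHeaderTextSplitter

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
    )
    docs = splitter.split_text("# Title\n\nIntro text.\n\n## Section\n\nBody text.")
    # Header values are recorded in each Document's metadata.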
@@ -225,8 +227,7 @@ class HeaderType(TypedDict):


 class ExperimentalMarkdownSyntaxTextSplitter:
-    """
-    An experimental text splitter for handling Markdown syntax.
+    """An experimental text splitter for handling Markdown syntax.

     This splitter aims to retain the exact whitespace of the original text while
     extracting structured metadata, such as headers. It is a re-implementation of the
@@ -280,6 +281,22 @@ class ExperimentalMarkdownSyntaxTextSplitter:
         return_each_line: bool = False,
         strip_headers: bool = True,
     ):
+        """Initialize the text splitter with header splitting and formatting options.
+
+        This constructor sets up the required configuration for splitting text into
+        chunks based on specified headers and formatting preferences.
+
+        Args:
+            headers_to_split_on (Union[List[Tuple[str, str]], None]):
+                A list of tuples, where each tuple contains a header tag (e.g., "h1")
+                and its corresponding metadata key. If None, default headers are used.
+            return_each_line (bool):
+                Whether to return each line as an individual chunk.
+                Defaults to False, which aggregates lines into larger chunks.
+            strip_headers (bool):
+                Whether to exclude headers from the resulting chunks.
+                Defaults to True.
+        """
         self.chunks: List[Document] = []
         self.current_chunk = Document(page_content="")
         self.current_header_stack: List[Tuple[int, str]] = []
@@ -292,6 +309,21 @@ class ExperimentalMarkdownSyntaxTextSplitter:
         self.return_each_line = return_each_line

     def split_text(self, text: str) -> List[Document]:
+        """Split the input text into structured chunks.
+
+        This method processes the input text line by line, identifying and handling
+        specific patterns such as headers, code blocks, and horizontal rules to
+        split it into structured chunks based on headers, code blocks, and
+        horizontal rules.
+
+        Args:
+            text (str): The input text to be split into chunks.
+
+        Returns:
+            List[Document]: A list of `Document` objects representing the structured
+                chunks of the input text. If `return_each_line` is enabled, each line
+                is returned as a separate `Document`.
+        """
         raw_lines = text.splitlines(keepends=True)

         while raw_lines:
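A usage sketch for the experimental splitter (editor's illustration, not part of the diff):

.. code-block:: python

    from langchain_text_splitters import ExperimentalMarkdownSyntaxTextSplitter

    splitter = ExperimentalMarkdownSyntaxTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")],
        strip_headers=False,  # keep header lines in the chunk text
    )
    docs = splitter.split_text("# Title\n\nSome text\n\n## Section\n\nMore text\n")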
@@ -51,6 +51,20 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
         )

     def split_text(self, text: str) -> List[str]:
+        """Splits the input text into smaller components by splitting text on tokens.
+
+        This method encodes the input text using a private `_encode` method, then
+        strips the start and stop token IDs from the encoded result. It returns the
+        processed segments as a list of strings.
+
+        Args:
+            text (str): The input text to be split.
+
+        Returns:
+            List[str]: A list of string components derived from the input text after
+                encoding and processing.
+        """
+
         def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
             return self._encode(text)[1:-1]

@@ -64,6 +78,17 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
         return split_text_on_tokens(text=text, tokenizer=tokenizer)

     def count_tokens(self, *, text: str) -> int:
+        """Counts the number of tokens in the given text.
+
+        This method encodes the input text using a private `_encode` method and
+        calculates the total number of tokens in the encoded result.
+
+        Args:
+            text (str): The input text for which the token count is calculated.
+
+        Returns:
+            int: The number of tokens in the encoded text.
+        """
         return len(self._encode(text))

     _max_length_equal_32_bit_integer: int = 2**32
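A usage sketch for the two methods above (editor's illustration, not part of the diff; assumes `sentence-transformers` is installed and the default model can be downloaded):

.. code-block:: python

    from langchain_text_splitters import SentenceTransformersTokenTextSplitter

    splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=64, chunk_overlap=0)
    print(splitter.count_tokens(text="Lorem ipsum dolor sit amet."))  # includes start/stop tokens
    print(splitter.split_text("Lorem ipsum dolor sit amet."))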
@@ -8,7 +8,6 @@ from langchain_text_splitters.base import TextSplitter
 class SpacyTextSplitter(TextSplitter):
     """Splitting text using Spacy package.

-
     Per default, Spacy's `en_core_web_sm` model is used and
     its default max_length is 1000000 (it is the length of maximum character
     this model takes which can be increased for large files). For a faster, but
@@ -26,7 +26,20 @@ python = ">=3.9,<4.0"
 langchain-core = "^0.3.15"

 [tool.ruff.lint]
-select = [ "E", "F", "I", "T201",]
+select = [
+    "E",  # pycodestyle
+    "F",  # Pyflakes
+    "I",  # isort
+    "T201",  # print
+    "D",  # pydocstyle
+]
+ignore = ["D100"]  # ignore missing module docstring
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"tests/**" = ["D"]  # ignore docstring checks for tests

 [tool.coverage.run]
 omit = [ "tests/*",]