mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 14:18:52 +00:00
This PR closes #27781 # Problem The current implementation of `NLTKTextSplitter` is using `sent_tokenize`. However, this `sent_tokenize` doesn't handle chars between 2 tokenized sentences... hence, this behavior throws errors when we are using `add_start_index=True`, as described in issue #27781. In particular: ```python from nltk.tokenize import sent_tokenize output1 = sent_tokenize("Innovation drives our success. Collaboration fosters creative solutions. Efficiency enhances data management.", language="english") print(output1) output2 = sent_tokenize("Innovation drives our success. Collaboration fosters creative solutions. Efficiency enhances data management.", language="english") print(output2) >>> ['Innovation drives our success.', 'Collaboration fosters creative solutions.', 'Efficiency enhances data management.'] >>> ['Innovation drives our success.', 'Collaboration fosters creative solutions.', 'Efficiency enhances data management.'] ``` # Solution With this new `use_span_tokenize` parameter, we can use NLTK to create sentences (with `span_tokenize`), but also add extra chars to be sure that we still can map the chunks to the original text. --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Erick Friis <erickfriis@gmail.com>
115 lines
2.5 KiB
TOML
115 lines
2.5 KiB
TOML
# Build configuration: the package is built with poetry-core.
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

# Package metadata consumed by Poetry.
[tool.poetry]
name = "langchain-text-splitters"
version = "0.3.3"
description = "LangChain text splitting utilities"
authors = []
license = "MIT"
readme = "README.md"
repository = "https://github.com/langchain-ai/langchain"

# Type-checking configuration.
[tool.mypy]
# Native TOML boolean instead of the string "True".
disallow_untyped_defs = true

# Modules whose unresolved imports are ignored rather than reported.
[[tool.mypy.overrides]]
module = [
    "transformers",
    "sentence_transformers",
    "nltk.tokenize",
    "konlpy.tag",
    "bs4",
    "pytest",
    "spacy",
    "spacy.lang.en",
    "numpy",
    "nltk",
    "spacy.cli",
    "torch",
]
ignore_missing_imports = true

# Additional project links. The keys contain spaces, so they must be quoted.
[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/text-splitters"
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-text-splitters%3D%3D0%22&expanded=true"

# Runtime dependencies of the published package.
[tool.poetry.dependencies]
python = ">=3.9,<4.0"
langchain-core = "^0.3.25"

# Ruff lint rule selection.
[tool.ruff.lint]
# E: pycodestyle errors, F: Pyflakes, I: isort, T201: `print` found,
# D: pydocstyle.
select = ["E", "F", "I", "T201", "D"]
# D100: missing docstring in public module.
ignore = ["D100"]

# Coverage measurement: files under tests/ are excluded.
[tool.coverage.run]
omit = ["tests/*"]

# pytest configuration.
[tool.pytest.ini_options]
# Strict marker/config checking, and report the 5 slowest test durations.
addopts = "--strict-markers --strict-config --durations=5"
# Custom markers must be declared here because --strict-markers is enabled.
markers = [
    "requires: mark tests as requiring a specific library",
    "compile: mark placeholder test used to compile integration tests without running them",
]
asyncio_mode = "auto"

# Optional group: installed only on request (e.g. `poetry install --with lint`).
[tool.poetry.group.lint]
optional = true

# Optional group: installed only on request (e.g. `poetry install --with typing`).
[tool.poetry.group.typing]
optional = true

# Optional group: installed only on request (e.g. `poetry install --with dev`).
[tool.poetry.group.dev]
optional = true

# Optional group: installed only on request (e.g. `poetry install --with test`).
[tool.poetry.group.test]
optional = true

# Docstrings are checked against the Google docstring convention.
[tool.ruff.lint.pydocstyle]
convention = "google"

# Docstring (D) rules are not enforced for files under tests/.
[tool.ruff.lint.per-file-ignores]
"tests/**" = ["D"]

# Packages installed with the `lint` group.
[tool.poetry.group.lint.dependencies]
ruff = "^0.5"

# Packages installed with the `typing` group.
[tool.poetry.group.typing.dependencies]
mypy = "^1.10"
lxml-stubs = "^0.5.1"
types-requests = "^2.31.0.20240218"
tiktoken = "^0.8.0"

# Packages installed with the `dev` group.
[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"

# Packages installed with the `test` group.
[tool.poetry.group.test.dependencies]
pytest = "^8"
freezegun = "^1.2.2"
pytest-mock = "^3.10.0"
pytest-watcher = "^0.3.4"
pytest-asyncio = "^0.21.1"
pytest-socket = "^0.7.0"

# Optional group: installed only on request
# (e.g. `poetry install --with test_integration`).
[tool.poetry.group.test_integration]
optional = true

# Packages installed with the `test_integration` group.
[tool.poetry.group.test_integration.dependencies]
# The `python` marker restricts installation to interpreters below 3.13.
spacy = { version = "*", python = "<3.13" }
nltk = "^3.9.1"
transformers = "^4.47.0"
sentence-transformers = { version = ">=2.6.0", python = "<3.13" }

# In the `lint` group, langchain-core comes from the sibling ../core
# directory, installed in develop (editable) mode.
[tool.poetry.group.lint.dependencies.langchain-core]
path = "../core"
develop = true

# In the `dev` group, langchain-core comes from the sibling ../core
# directory, installed in develop (editable) mode.
[tool.poetry.group.dev.dependencies.langchain-core]
path = "../core"
develop = true

# In the `test` group, langchain-core comes from the sibling ../core
# directory, installed in develop (editable) mode.
[tool.poetry.group.test.dependencies.langchain-core]
path = "../core"
develop = true