mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 16:13:25 +00:00
As described in issue #17060, the following function fails when the input text contains only one sentence. Checking for that case and adding an early return fixes the issue. ```python def split_text(self, text: str) -> List[str]: """Split text into multiple components.""" # Splitting the essay on '.', '?', and '!' single_sentences_list = re.split(r"(?<=[.?!])\s+", text) sentences = [ {"sentence": x, "index": i} for i, x in enumerate(single_sentences_list) ] sentences = combine_sentences(sentences) embeddings = self.embeddings.embed_documents( [x["combined_sentence"] for x in sentences] ) for i, sentence in enumerate(sentences): sentence["combined_sentence_embedding"] = embeddings[i] distances, sentences = calculate_cosine_distances(sentences) start_index = 0 # Create a list to hold the grouped sentences chunks = [] breakpoint_percentile_threshold = 95 breakpoint_distance_threshold = np.percentile( distances, breakpoint_percentile_threshold ) # If you want more chunks, lower the percentile cutoff indices_above_thresh = [ i for i, x in enumerate(distances) if x > breakpoint_distance_threshold ] # The indices of those breakpoints on your list # Iterate through the breakpoints to slice the sentences for index in indices_above_thresh: # The end index is the current breakpoint end_index = index # Slice the sentence_dicts from the current start index to the end index group = sentences[start_index : end_index + 1] combined_text = " ".join([d["sentence"] for d in group]) chunks.append(combined_text) # Update the start index for the next group start_index = index + 1 # The last group, if any sentences remain if start_index < len(sentences): combined_text = " ".join([d["sentence"] for d in sentences[start_index:]]) chunks.append(combined_text) return chunks ``` Co-authored-by: Giulio Zani <salamanderxing@Giulios-MBP.homenet.telecomitalia.it>
This commit is contained in:
parent
912210ac19
commit
9f0b63dba0
@@ -85,6 +85,12 @@ class SemanticChunker(BaseDocumentTransformer):
         """Split text into multiple components."""
         # Splitting the essay on '.', '?', and '!'
         single_sentences_list = re.split(r"(?<=[.?!])\s+", text)
+
+        # having len(single_sentences_list) == 1 would cause the following
+        # np.percentile to fail.
+        if len(single_sentences_list) == 1:
+            return single_sentences_list
+
         sentences = [
             {"sentence": x, "index": i} for i, x in enumerate(single_sentences_list)
         ]
Loading…
Reference in New Issue
Block a user