mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-05 14:43:08 +00:00
community[patch]: Update grobid.py (#16298)
There is a case where "coords" does not exist in the "sentence"; in that case sentence.get("coords") returns None, so the call to "split(";")" raises an error. We can fix that by adding "if sentence.get("coords") is not None:" before the loop. The resulting empty "sbboxes" from this scenario would then raise an error at "sbboxes[0]["page"]" because "sbboxes" is empty, so that access is additionally guarded with "len(sbboxes) > 0". The PDF from https://pubmed.ncbi.nlm.nih.gov/23970373/ can be used to reproduce both errors.
This commit is contained in:
parent
fbe592a5ce
commit
d1b4ead87c
@ -59,19 +59,20 @@ class GrobidParser(BaseBlobParser):
|
||||
for i, sentence in enumerate(paragraph.find_all("s")):
|
||||
paragraph_text.append(sentence.text)
|
||||
sbboxes = []
|
||||
for bbox in sentence.get("coords").split(";"):
|
||||
box = bbox.split(",")
|
||||
sbboxes.append(
|
||||
{
|
||||
"page": box[0],
|
||||
"x": box[1],
|
||||
"y": box[2],
|
||||
"h": box[3],
|
||||
"w": box[4],
|
||||
}
|
||||
)
|
||||
chunk_bboxes.append(sbboxes)
|
||||
if segment_sentences is True:
|
||||
if sentence.get("coords") is not None:
|
||||
for bbox in sentence.get("coords").split(";"):
|
||||
box = bbox.split(",")
|
||||
sbboxes.append(
|
||||
{
|
||||
"page": box[0],
|
||||
"x": box[1],
|
||||
"y": box[2],
|
||||
"h": box[3],
|
||||
"w": box[4],
|
||||
}
|
||||
)
|
||||
chunk_bboxes.append(sbboxes)
|
||||
if (segment_sentences is True) and (len(sbboxes) > 0):
|
||||
fpage, lpage = sbboxes[0]["page"], sbboxes[-1]["page"]
|
||||
sentence_dict = {
|
||||
"text": sentence.text,
|
||||
|
Loading…
Reference in New Issue
Block a user