mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-18 12:58:59 +00:00
community[patch]: Update grobid.py (#16298)
There is a case where "coords" does not exist in the "sentence"; `sentence.get("coords")` then returns None, so calling `split(";")` on it raises an error. We can fix that by adding `if sentence.get("coords") is not None:`. The resulting empty "sbboxes" from this scenario would then raise an error at `sbboxes[0]["page"]`, because "sbboxes" is empty, so that access is also guarded by checking `len(sbboxes) > 0`. The PDF from https://pubmed.ncbi.nlm.nih.gov/23970373/ can reproduce both errors.
This commit is contained in:
parent
fbe592a5ce
commit
d1b4ead87c
@ -59,19 +59,20 @@ class GrobidParser(BaseBlobParser):
|
|||||||
for i, sentence in enumerate(paragraph.find_all("s")):
|
for i, sentence in enumerate(paragraph.find_all("s")):
|
||||||
paragraph_text.append(sentence.text)
|
paragraph_text.append(sentence.text)
|
||||||
sbboxes = []
|
sbboxes = []
|
||||||
for bbox in sentence.get("coords").split(";"):
|
if sentence.get("coords") is not None:
|
||||||
box = bbox.split(",")
|
for bbox in sentence.get("coords").split(";"):
|
||||||
sbboxes.append(
|
box = bbox.split(",")
|
||||||
{
|
sbboxes.append(
|
||||||
"page": box[0],
|
{
|
||||||
"x": box[1],
|
"page": box[0],
|
||||||
"y": box[2],
|
"x": box[1],
|
||||||
"h": box[3],
|
"y": box[2],
|
||||||
"w": box[4],
|
"h": box[3],
|
||||||
}
|
"w": box[4],
|
||||||
)
|
}
|
||||||
chunk_bboxes.append(sbboxes)
|
)
|
||||||
if segment_sentences is True:
|
chunk_bboxes.append(sbboxes)
|
||||||
|
if (segment_sentences is True) and (len(sbboxes) > 0):
|
||||||
fpage, lpage = sbboxes[0]["page"], sbboxes[-1]["page"]
|
fpage, lpage = sbboxes[0]["page"], sbboxes[-1]["page"]
|
||||||
sentence_dict = {
|
sentence_dict = {
|
||||||
"text": sentence.text,
|
"text": sentence.text,
|
||||||
|
Loading…
Reference in New Issue
Block a user