community[patch]: Update grobid.py (#16298)

there is a case where "coords" does not exist in the "sentence"
therefore, the "split(";")" will lead to error.

we can fix that by adding "if sentence.get("coords") is not None:" 

the resulting empty "sbboxes" from this scenario will raise error at
"sbboxes[0]["page"]" because sbboxes are empty.

the PDF from https://pubmed.ncbi.nlm.nih.gov/23970373/ can replicate
those errors.
This commit is contained in:
Alireza Kashani 2024-01-22 23:03:58 +01:00 committed by GitHub
parent fbe592a5ce
commit d1b4ead87c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -59,19 +59,20 @@ class GrobidParser(BaseBlobParser):
for i, sentence in enumerate(paragraph.find_all("s")):
paragraph_text.append(sentence.text)
sbboxes = []
for bbox in sentence.get("coords").split(";"):
box = bbox.split(",")
sbboxes.append(
{
"page": box[0],
"x": box[1],
"y": box[2],
"h": box[3],
"w": box[4],
}
)
chunk_bboxes.append(sbboxes)
if segment_sentences is True:
if sentence.get("coords") is not None:
for bbox in sentence.get("coords").split(";"):
box = bbox.split(",")
sbboxes.append(
{
"page": box[0],
"x": box[1],
"y": box[2],
"h": box[3],
"w": box[4],
}
)
chunk_bboxes.append(sbboxes)
if (segment_sentences is True) and (len(sbboxes) > 0):
fpage, lpage = sbboxes[0]["page"], sbboxes[-1]["page"]
sentence_dict = {
"text": sentence.text,