Fixed a potential `IndexError: list index out of range` in `GrobidParser` when the parsed document has no title.
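A minimal sketch of the change (the pre-fix line is inferred from the error described; the guarded form mirrors `process_xml` below):

```python
# Before (assumed): indexing the first <title> unconditionally
# raises IndexError when Grobid's TEI output has no title element.
title = soup.find_all("title")[0].text

# After: fall back to a placeholder when no title is present.
titles = soup.find_all("title")
title = titles[0].text if titles else "No title found"
```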
Thank you for contributing to LangChain!

- [ ] **PR title**: "package: description"
  - Where "package" is whichever of langchain, community, core, experimental, etc. is being modified. Use "docs: ..." for purely docs changes, "templates: ..." for template changes, "infra: ..." for CI changes.
  - Example: "community: add foobar LLM"
- [ ] **PR message**: ***Delete this entire checklist*** and replace with
  - **Description:** a description of the change
  - **Issue:** the issue # it fixes, if applicable
  - **Dependencies:** any dependencies required for this change
  - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out!
- [ ] **Add tests and docs**: If you're adding a new integration, please include
  1. a test for the integration, preferably unit tests that do not rely on network access (a sketch of such a test follows this checklist),
  2. an example notebook showing its use. It lives in the `docs/docs/integrations` directory.
- [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in langchain.

If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
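For the "Add tests and docs" item, a network-free unit test for this fix could look roughly like the sketch below. The TEI fragment and test name are made up for illustration; the server health check in `__init__` is patched out so no Grobid instance is needed, and `bs4` (with `lxml`) must be installed.

```python
from unittest import mock

from langchain_community.document_loaders.parsers import GrobidParser

# Hypothetical TEI fragment: one section with coordinates, no <title> element.
XML_WITHOUT_TITLE = """
<TEI>
  <text><body>
    <div><head n="1">Introduction</head>
      <p><s coords="1,10.0,20.0,200.0,12.0">A sentence without a paper title.</s></p>
    </div>
  </body></text>
</TEI>
"""


def test_process_xml_falls_back_when_title_is_missing() -> None:
    # Patch the health check so the parser can be constructed offline.
    with mock.patch("requests.get"):
        parser = GrobidParser(segment_sentences=False)
    docs = list(parser.process_xml("paper.pdf", XML_WITHOUT_TITLE, False))
    assert docs
    assert docs[0].metadata["paper_title"] == "No title found"
```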
import logging
from typing import Dict, Iterator, List, Union

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class ServerUnavailableException(Exception):
    """Exception raised when the Grobid server is unavailable."""

    pass


class GrobidParser(BaseBlobParser):
    """Load article `PDF` files using `Grobid`."""

    def __init__(
        self,
        segment_sentences: bool,
        grobid_server: str = "http://localhost:8070/api/processFulltextDocument",
    ) -> None:
        self.segment_sentences = segment_sentences
        self.grobid_server = grobid_server
        try:
            # Probe the server once so a missing Grobid instance fails fast.
            requests.get(grobid_server)
        except requests.exceptions.RequestException:
            logger.error(
                "GROBID server does not appear up and running, \
                please ensure Grobid is installed and the server is running"
            )
            raise ServerUnavailableException

    def process_xml(
        self, file_path: str, xml_data: str, segment_sentences: bool
    ) -> Iterator[Document]:
        """Process the XML file from Grobid."""

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise ImportError(
                "`bs4` package not found, please install it with `pip install bs4`"
            )
        soup = BeautifulSoup(xml_data, "xml")
        sections = soup.find_all("div")
        # Fall back to a placeholder when the TEI output has no <title> element.
        titles = soup.find_all("title")
        if titles:
            title = titles[0].text
        else:
            title = "No title found"
        chunks = []
        for section in sections:
            sect = section.find("head")
            if sect is not None:
                for i, paragraph in enumerate(section.find_all("p")):
                    chunk_bboxes = []
                    paragraph_text = []
                    for i, sentence in enumerate(paragraph.find_all("s")):
                        paragraph_text.append(sentence.text)
                        sbboxes = []
                        if sentence.get("coords") is not None:
                            for bbox in sentence.get("coords").split(";"):
                                box = bbox.split(",")
                                sbboxes.append(
                                    {
                                        "page": box[0],
                                        "x": box[1],
                                        "y": box[2],
                                        "h": box[3],
                                        "w": box[4],
                                    }
                                )
                            chunk_bboxes.append(sbboxes)
                        if (segment_sentences is True) and (len(sbboxes) > 0):
                            fpage, lpage = sbboxes[0]["page"], sbboxes[-1]["page"]
                            sentence_dict = {
                                "text": sentence.text,
                                "para": str(i),
                                "bboxes": [sbboxes],
                                "section_title": sect.text,
                                "section_number": sect.get("n"),
                                "pages": (fpage, lpage),
                            }
                            chunks.append(sentence_dict)
                    if segment_sentences is not True:
                        fpage, lpage = (
                            chunk_bboxes[0][0]["page"],
                            chunk_bboxes[-1][-1]["page"],
                        )
                        paragraph_dict = {
                            "text": "".join(paragraph_text),
                            "para": str(i),
                            "bboxes": chunk_bboxes,
                            "section_title": sect.text,
                            "section_number": sect.get("n"),
                            "pages": (fpage, lpage),
                        }
                        chunks.append(paragraph_dict)

        yield from [
            Document(
                page_content=chunk["text"],
                metadata=dict(
                    {
                        "text": str(chunk["text"]),
                        "para": str(chunk["para"]),
                        "bboxes": str(chunk["bboxes"]),
                        "pages": str(chunk["pages"]),
                        "section_title": str(chunk["section_title"]),
                        "section_number": str(chunk["section_number"]),
                        "paper_title": str(title),
                        "file_path": str(file_path),
                    }
                ),
            )
            for chunk in chunks
        ]

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        file_path = blob.source
        if file_path is None:
            raise ValueError("blob.source cannot be None.")
        pdf = open(file_path, "rb")
        files = {"input": (file_path, pdf, "application/pdf", {"Expires": "0"})}
        try:
            # Ask Grobid to generate IDs, consolidate the header, segment
            # sentences, and return coordinates for section heads and sentences.
            data: Dict[str, Union[str, List[str]]] = {}
            for param in ["generateIDs", "consolidateHeader", "segmentSentences"]:
                data[param] = "1"
            data["teiCoordinates"] = ["head", "s"]
            files = files or {}
            r = requests.request(
                "POST",
                self.grobid_server,
                headers=None,
                params=None,
                files=files,
                data=data,
                timeout=60,
            )
            xml_data = r.text
        except requests.exceptions.ReadTimeout:
            logger.error("GROBID server timed out. Return None.")
            xml_data = None

        if xml_data is None:
            return iter([])
        else:
            return self.process_xml(file_path, xml_data, self.segment_sentences)
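For reference, a typical way to wire this parser up: a sketch assuming a Grobid server at the default `localhost:8070` and a local `./papers` folder of PDFs (both placeholders).

```python
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import GrobidParser

# Parse every PDF under ./papers through the local Grobid server; each
# returned Document is one paragraph with bounding-box and section metadata.
loader = GenericLoader.from_filesystem(
    "./papers",
    glob="*",
    suffixes=[".pdf"],
    parser=GrobidParser(segment_sentences=False),
)
docs = loader.load()
print(docs[0].metadata["section_title"], docs[0].metadata["pages"])
```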