Confluence Loader: Fix CQL loading (#27620)

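Fixes #12082: CQL searches are now paginated by following the `_links.next` cursor URL returned with each response, instead of a `start` offset. A `TesseractError` raised while OCR-ing an image is now logged as a warning instead of propagating.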

<!---
If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
-->
This commit is contained in:
Alex Tonkonozhenko 2024-12-10 17:05:23 +01:00 committed by GitHub
parent aba2711e7f
commit 0d20c314dd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -442,8 +442,15 @@ class ConfluenceLoader(BaseLoader):
yield from self._lazy_load() yield from self._lazy_load()
def _search_content_by_cql( def _search_content_by_cql(
self, cql: str, include_archived_spaces: Optional[bool] = None, **kwargs: Any self,
) -> List[dict]: cql: str,
include_archived_spaces: Optional[bool] = None,
next_url: str = "",
**kwargs: Any,
) -> tuple[List[dict], str]:
if next_url:
response = self.confluence.get(next_url)
else:
url = "rest/api/content/search" url = "rest/api/content/search"
params: Dict[str, Any] = {"cql": cql} params: Dict[str, Any] = {"cql": cql}
@@ -452,7 +459,8 @@ class ConfluenceLoader(BaseLoader):
params["includeArchivedSpaces"] = include_archived_spaces params["includeArchivedSpaces"] = include_archived_spaces
response = self.confluence.get(url, params=params) response = self.confluence.get(url, params=params)
return response.get("results", [])
return response.get("results", []), response.get("_links", {}).get("next", "")
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List: def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
"""Paginate the various methods to retrieve groups of pages. """Paginate the various methods to retrieve groups of pages.
@@ -477,6 +485,7 @@ class ConfluenceLoader(BaseLoader):
max_pages = kwargs.pop("max_pages") max_pages = kwargs.pop("max_pages")
docs: List[dict] = [] docs: List[dict] = []
next_url: str = ""
while len(docs) < max_pages: while len(docs) < max_pages:
get_pages = retry( get_pages = retry(
reraise=True, reraise=True,
@@ -490,6 +499,12 @@ class ConfluenceLoader(BaseLoader):
), ),
before_sleep=before_sleep_log(logger, logging.WARNING), before_sleep=before_sleep_log(logger, logging.WARNING),
)(retrieval_method) )(retrieval_method)
if self.cql: # cursor pagination for CQL
batch, next_url = get_pages(**kwargs, next_url=next_url)
if not next_url:
docs.extend(batch)
break
else:
batch = get_pages(**kwargs, start=len(docs)) batch = get_pages(**kwargs, start=len(docs))
if not batch: if not batch:
break break
@@ -694,8 +709,11 @@ class ConfluenceLoader(BaseLoader):
return text return text
for i, image in enumerate(images): for i, image in enumerate(images):
try:
image_text = pytesseract.image_to_string(image, lang=ocr_languages) image_text = pytesseract.image_to_string(image, lang=ocr_languages)
text += f"Page {i + 1}:\n{image_text}\n\n" text += f"Page {i + 1}:\n{image_text}\n\n"
except pytesseract.TesseractError as ex:
logger.warning(f"TesseractError: {ex}")
return text return text