Confluence Loader: Fix CQL loading (#27620)

fix #12082

<!---
If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
-->
This commit is contained in:
Alex Tonkonozhenko 2024-12-10 17:05:23 +01:00 committed by GitHub
parent aba2711e7f
commit 0d20c314dd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -442,17 +442,25 @@ class ConfluenceLoader(BaseLoader):
yield from self._lazy_load()
def _search_content_by_cql(
self, cql: str, include_archived_spaces: Optional[bool] = None, **kwargs: Any
) -> List[dict]:
url = "rest/api/content/search"
self,
cql: str,
include_archived_spaces: Optional[bool] = None,
next_url: str = "",
**kwargs: Any,
) -> tuple[List[dict], str]:
if next_url:
response = self.confluence.get(next_url)
else:
url = "rest/api/content/search"
params: Dict[str, Any] = {"cql": cql}
params.update(kwargs)
if include_archived_spaces is not None:
params["includeArchivedSpaces"] = include_archived_spaces
params: Dict[str, Any] = {"cql": cql}
params.update(kwargs)
if include_archived_spaces is not None:
params["includeArchivedSpaces"] = include_archived_spaces
response = self.confluence.get(url, params=params)
return response.get("results", [])
response = self.confluence.get(url, params=params)
return response.get("results", []), response.get("_links", {}).get("next", "")
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
"""Paginate the various methods to retrieve groups of pages.
@ -477,6 +485,7 @@ class ConfluenceLoader(BaseLoader):
max_pages = kwargs.pop("max_pages")
docs: List[dict] = []
next_url: str = ""
while len(docs) < max_pages:
get_pages = retry(
reraise=True,
@ -490,9 +499,15 @@ class ConfluenceLoader(BaseLoader):
),
before_sleep=before_sleep_log(logger, logging.WARNING),
)(retrieval_method)
batch = get_pages(**kwargs, start=len(docs))
if not batch:
break
if self.cql: # cursor pagination for CQL
batch, next_url = get_pages(**kwargs, next_url=next_url)
if not next_url:
docs.extend(batch)
break
else:
batch = get_pages(**kwargs, start=len(docs))
if not batch:
break
docs.extend(batch)
return docs[:max_pages]
@ -694,8 +709,11 @@ class ConfluenceLoader(BaseLoader):
return text
for i, image in enumerate(images):
image_text = pytesseract.image_to_string(image, lang=ocr_languages)
text += f"Page {i + 1}:\n{image_text}\n\n"
try:
image_text = pytesseract.image_to_string(image, lang=ocr_languages)
text += f"Page {i + 1}:\n{image_text}\n\n"
except pytesseract.TesseractError as ex:
logger.warning(f"TesseractError: {ex}")
return text