mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-02 19:34:04 +00:00
Confluence Loader: Fix CQL loading (#27620)
fix #12082 <!--- If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. -->
This commit is contained in:
parent
aba2711e7f
commit
0d20c314dd
@ -442,17 +442,25 @@ class ConfluenceLoader(BaseLoader):
|
||||
yield from self._lazy_load()
|
||||
|
||||
def _search_content_by_cql(
|
||||
self, cql: str, include_archived_spaces: Optional[bool] = None, **kwargs: Any
|
||||
) -> List[dict]:
|
||||
url = "rest/api/content/search"
|
||||
self,
|
||||
cql: str,
|
||||
include_archived_spaces: Optional[bool] = None,
|
||||
next_url: str = "",
|
||||
**kwargs: Any,
|
||||
) -> tuple[List[dict], str]:
|
||||
if next_url:
|
||||
response = self.confluence.get(next_url)
|
||||
else:
|
||||
url = "rest/api/content/search"
|
||||
|
||||
params: Dict[str, Any] = {"cql": cql}
|
||||
params.update(kwargs)
|
||||
if include_archived_spaces is not None:
|
||||
params["includeArchivedSpaces"] = include_archived_spaces
|
||||
params: Dict[str, Any] = {"cql": cql}
|
||||
params.update(kwargs)
|
||||
if include_archived_spaces is not None:
|
||||
params["includeArchivedSpaces"] = include_archived_spaces
|
||||
|
||||
response = self.confluence.get(url, params=params)
|
||||
return response.get("results", [])
|
||||
response = self.confluence.get(url, params=params)
|
||||
|
||||
return response.get("results", []), response.get("_links", {}).get("next", "")
|
||||
|
||||
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
|
||||
"""Paginate the various methods to retrieve groups of pages.
|
||||
@ -477,6 +485,7 @@ class ConfluenceLoader(BaseLoader):
|
||||
|
||||
max_pages = kwargs.pop("max_pages")
|
||||
docs: List[dict] = []
|
||||
next_url: str = ""
|
||||
while len(docs) < max_pages:
|
||||
get_pages = retry(
|
||||
reraise=True,
|
||||
@ -490,9 +499,15 @@ class ConfluenceLoader(BaseLoader):
|
||||
),
|
||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||
)(retrieval_method)
|
||||
batch = get_pages(**kwargs, start=len(docs))
|
||||
if not batch:
|
||||
break
|
||||
if self.cql: # cursor pagination for CQL
|
||||
batch, next_url = get_pages(**kwargs, next_url=next_url)
|
||||
if not next_url:
|
||||
docs.extend(batch)
|
||||
break
|
||||
else:
|
||||
batch = get_pages(**kwargs, start=len(docs))
|
||||
if not batch:
|
||||
break
|
||||
docs.extend(batch)
|
||||
return docs[:max_pages]
|
||||
|
||||
@ -694,8 +709,11 @@ class ConfluenceLoader(BaseLoader):
|
||||
return text
|
||||
|
||||
for i, image in enumerate(images):
|
||||
image_text = pytesseract.image_to_string(image, lang=ocr_languages)
|
||||
text += f"Page {i + 1}:\n{image_text}\n\n"
|
||||
try:
|
||||
image_text = pytesseract.image_to_string(image, lang=ocr_languages)
|
||||
text += f"Page {i + 1}:\n{image_text}\n\n"
|
||||
except pytesseract.TesseractError as ex:
|
||||
logger.warning(f"TesseractError: {ex}")
|
||||
|
||||
return text
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user