mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 12:18:24 +00:00
Confluence Loader: Fix CQL loading (#27620)
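Fixes #12082: CQL searches now page through results by following the cursor URL returned in the response's `_links.next` field, instead of the offset-based `start` pagination used previously. <!--- If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. -->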
This commit is contained in:
parent
aba2711e7f
commit
0d20c314dd
@ -442,8 +442,15 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
yield from self._lazy_load()
|
yield from self._lazy_load()
|
||||||
|
|
||||||
def _search_content_by_cql(
|
def _search_content_by_cql(
|
||||||
self, cql: str, include_archived_spaces: Optional[bool] = None, **kwargs: Any
|
self,
|
||||||
) -> List[dict]:
|
cql: str,
|
||||||
|
include_archived_spaces: Optional[bool] = None,
|
||||||
|
next_url: str = "",
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> tuple[List[dict], str]:
|
||||||
|
if next_url:
|
||||||
|
response = self.confluence.get(next_url)
|
||||||
|
else:
|
||||||
url = "rest/api/content/search"
|
url = "rest/api/content/search"
|
||||||
|
|
||||||
params: Dict[str, Any] = {"cql": cql}
|
params: Dict[str, Any] = {"cql": cql}
|
||||||
@ -452,7 +459,8 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
params["includeArchivedSpaces"] = include_archived_spaces
|
params["includeArchivedSpaces"] = include_archived_spaces
|
||||||
|
|
||||||
response = self.confluence.get(url, params=params)
|
response = self.confluence.get(url, params=params)
|
||||||
return response.get("results", [])
|
|
||||||
|
return response.get("results", []), response.get("_links", {}).get("next", "")
|
||||||
|
|
||||||
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
|
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
|
||||||
"""Paginate the various methods to retrieve groups of pages.
|
"""Paginate the various methods to retrieve groups of pages.
|
||||||
@ -477,6 +485,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
|
|
||||||
max_pages = kwargs.pop("max_pages")
|
max_pages = kwargs.pop("max_pages")
|
||||||
docs: List[dict] = []
|
docs: List[dict] = []
|
||||||
|
next_url: str = ""
|
||||||
while len(docs) < max_pages:
|
while len(docs) < max_pages:
|
||||||
get_pages = retry(
|
get_pages = retry(
|
||||||
reraise=True,
|
reraise=True,
|
||||||
@ -490,6 +499,12 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
),
|
),
|
||||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||||
)(retrieval_method)
|
)(retrieval_method)
|
||||||
|
if self.cql: # cursor pagination for CQL
|
||||||
|
batch, next_url = get_pages(**kwargs, next_url=next_url)
|
||||||
|
if not next_url:
|
||||||
|
docs.extend(batch)
|
||||||
|
break
|
||||||
|
else:
|
||||||
batch = get_pages(**kwargs, start=len(docs))
|
batch = get_pages(**kwargs, start=len(docs))
|
||||||
if not batch:
|
if not batch:
|
||||||
break
|
break
|
||||||
@ -694,8 +709,11 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
for i, image in enumerate(images):
|
for i, image in enumerate(images):
|
||||||
|
try:
|
||||||
image_text = pytesseract.image_to_string(image, lang=ocr_languages)
|
image_text = pytesseract.image_to_string(image, lang=ocr_languages)
|
||||||
text += f"Page {i + 1}:\n{image_text}\n\n"
|
text += f"Page {i + 1}:\n{image_text}\n\n"
|
||||||
|
except pytesseract.TesseractError as ex:
|
||||||
|
logger.warning(f"TesseractError: {ex}")
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user