mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-31 08:32:32 +00:00
Extend Cube Semantic Loader functionality (#8186)
**PR Description:** This pull request introduces several enhancements and new features to the `CubeSemanticLoader`. The changes include the following: 1. Added imports for the `json` and `time` modules. 2. Added new constructor parameters: `load_dimension_values`, `dimension_values_limit`, `dimension_values_max_retries`, and `dimension_values_retry_delay`. 3. Updated the class documentation with descriptions for the new constructor parameters. 4. Added a new private method `_get_dimension_values()` to retrieve dimension values from Cube's REST API. 5. Modified the `load()` method to load dimension values for string dimensions if `load_dimension_values` is set to `True`. 6. Updated the API endpoint in the `load()` method from the base URL to the metadata endpoint. 7. Refactored the code to retrieve metadata from the response JSON. 8. Added the `column_member_type` field to the metadata dictionary to indicate if a column is a measure or a dimension. 9. Added the `column_values` field to the metadata dictionary to store the dimension values retrieved from Cube's API. 10. Modified the `page_content` construction to include the column title and description instead of the table name, column name, data type, title, and description. These changes improve the functionality and flexibility of the `CubeSemanticLoader` class by allowing the loading of dimension values and providing more detailed metadata for each document. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
82b8d8596c
commit
d983046f90
@ -53,11 +53,23 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Input arguments (mandatory)**\n",
|
||||
"\n",
|
||||
"`Cube Semantic Loader` requires 2 arguments:\n",
|
||||
"| Input Parameter | Description |\n",
|
||||
"| --- | --- |\n",
|
||||
"| `cube_api_url` | The URL of your Cube's deployment REST API. Please refer to the [Cube documentation](https://cube.dev/docs/http-api/rest#configuration-base-path) for more information on configuring the base path. |\n",
|
||||
"| `cube_api_token` | The authentication token generated based on your Cube's API secret. Please refer to the [Cube documentation](https://cube.dev/docs/security#generating-json-web-tokens-jwt) for instructions on generating JSON Web Tokens (JWT). |\n"
|
||||
"\n",
|
||||
"- `cube_api_url`: The URL of your Cube's deployment REST API. Please refer to the [Cube documentation](https://cube.dev/docs/http-api/rest#configuration-base-path) for more information on configuring the base path.\n",
|
||||
"\n",
|
||||
"- `cube_api_token`: The authentication token generated based on your Cube's API secret. Please refer to the [Cube documentation](https://cube.dev/docs/security#generating-json-web-tokens-jwt) for instructions on generating JSON Web Tokens (JWT).\n",
|
||||
"\n",
|
||||
"**Input arguments (optional)**\n",
|
||||
"\n",
|
||||
"- `load_dimension_values`: Whether to load dimension values for every string dimension or not.\n",
|
||||
"\n",
|
||||
"- `dimension_values_limit`: Maximum number of dimension values to load.\n",
|
||||
"\n",
|
||||
"- `dimension_values_max_retries`: Maximum number of retries to load dimension values.\n",
|
||||
"\n",
|
||||
"- `dimension_values_retry_delay`: Delay between retries to load dimension values."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -85,9 +97,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Returns:\n",
|
||||
"\n",
|
||||
"A list of documents with the following attributes:\n",
|
||||
"Returns a list of documents with the following attributes:\n",
|
||||
"\n",
|
||||
"- `page_content`\n",
|
||||
"- `metadata`\n",
|
||||
@ -95,7 +105,8 @@
|
||||
" - `column_name`\n",
|
||||
" - `column_data_type`\n",
|
||||
" - `column_title`\n",
|
||||
" - `column_description`"
|
||||
" - `column_description`\n",
|
||||
" - `column_values`"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -103,7 +114,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"> page_content='table name: orders_view, column name: orders_view.total_amount, column data type: number, column title: Orders View Total Amount, column description: None' metadata={'table_name': 'orders_view', 'column_name': 'orders_view.total_amount', 'column_data_type': 'number', 'column_title': 'Orders View Total Amount', 'column_description': 'None'}"
|
||||
"> page_content='Users View City, None' metadata={'table_name': 'users_view', 'column_name': 'users_view.city', 'column_data_type': 'string', 'column_title': 'Users View City', 'column_description': 'None', 'column_member_type': 'dimension', 'column_values': ['Austin', 'Chicago', 'Los Angeles', 'Mountain View', 'New York', 'Palo Alto', 'San Francisco', 'Seattle']}"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -1,3 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
@ -5,45 +8,118 @@ import requests
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CubeSemanticLoader(BaseLoader):
|
||||
"""Load Cube semantic layer metadata."""
|
||||
"""Load Cube semantic layer metadata.
|
||||
|
||||
Args:
|
||||
cube_api_url: REST API endpoint.
|
||||
Use the REST API of your Cube's deployment.
|
||||
Please find out more information here:
|
||||
https://cube.dev/docs/http-api/rest#configuration-base-path
|
||||
cube_api_token: Cube API token.
|
||||
Authentication tokens are generated based on your Cube's API secret.
|
||||
Please find out more information here:
|
||||
https://cube.dev/docs/security#generating-json-web-tokens-jwt
|
||||
load_dimension_values: Whether to load dimension values for every string
|
||||
dimension or not.
|
||||
dimension_values_limit: Maximum number of dimension values to load.
|
||||
dimension_values_max_retries: Maximum number of retries to load dimension
|
||||
values.
|
||||
dimension_values_retry_delay: Delay between retries to load dimension values.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cube_api_url: str,
|
||||
cube_api_token: str,
|
||||
load_dimension_values: bool = True,
|
||||
dimension_values_limit: int = 10_000,
|
||||
dimension_values_max_retries: int = 10,
|
||||
dimension_values_retry_delay: int = 3,
|
||||
):
|
||||
self.cube_api_url = cube_api_url
|
||||
"""Use the REST API of your Cube's deployment.
|
||||
Please find out more information here:
|
||||
https://cube.dev/docs/http-api/rest#configuration-base-path
|
||||
"""
|
||||
self.cube_api_token = cube_api_token
|
||||
"""Authentication tokens are generated based on your Cube's API secret.
|
||||
Please find out more information here:
|
||||
https://cube.dev/docs/security#generating-json-web-tokens-jwt
|
||||
self.load_dimension_values = load_dimension_values
|
||||
self.dimension_values_limit = dimension_values_limit
|
||||
self.dimension_values_max_retries = dimension_values_max_retries
|
||||
self.dimension_values_retry_delay = dimension_values_retry_delay
|
||||
|
||||
def _get_dimension_values(self, dimension_name: str) -> List[str]:
    """Retrieve the values of a dimension from Cube's REST API load endpoint.

    These values can be used to achieve a more accurate filtering.

    A POST request is sent to ``{cube_api_url}/load`` asking for the single
    dimension, limited to ``dimension_values_limit`` rows. While Cube
    answers with HTTP 200 and ``{"error": "Continue wait"}``, the request
    is repeated after ``dimension_values_retry_delay`` seconds, up to
    ``dimension_values_max_retries`` attempts.

    Args:
        dimension_name: Name of the dimension to query; also the key read
            from every row of the response's ``data`` list.

    Returns:
        The dimension values, one per returned row. An empty list if the
        request fails with a non-200 status code or the maximum number of
        retries is reached.
    """
    # Use lazy %-style arguments so the dimension name is actually
    # interpolated (the message previously logged the literal placeholder).
    logger.info("Loading dimension values for: %s...", dimension_name)

    headers = {
        "Content-Type": "application/json",
        "Authorization": self.cube_api_token,
    }

    query = {
        "query": {
            "dimensions": [dimension_name],
            "limit": self.dimension_values_limit,
        }
    }

    retries = 0
    while retries < self.dimension_values_max_retries:
        response = requests.request(
            "POST",
            f"{self.cube_api_url}/load",
            headers=headers,
            data=json.dumps(query),
        )

        if response.status_code != 200:
            # The message needs a placeholder for the status code; without
            # one, logging fails to format the record instead of emitting it.
            logger.error(
                "Request failed with status code: %s", response.status_code
            )
            break

        response_data = response.json()
        if response_data.get("error") == "Continue wait":
            # Cube has not finished computing the result yet; wait and retry.
            logger.info("Retrying...")
            retries += 1
            time.sleep(self.dimension_values_retry_delay)
            continue

        return [item[dimension_name] for item in response_data["data"]]

    if retries == self.dimension_values_max_retries:
        logger.info("Maximum retries reached.")
    return []
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Makes a call to Cube's REST API metadata endpoint.
|
||||
|
||||
Returns:
|
||||
A list of documents with attributes:
|
||||
- page_content=column_name
|
||||
- page_content=column_title + column_description
|
||||
- metadata
|
||||
- table_name
|
||||
- column_name
|
||||
- column_data_type
|
||||
- column_member_type
|
||||
- column_title
|
||||
- column_description
|
||||
- column_values
|
||||
"""
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": self.cube_api_token,
|
||||
}
|
||||
|
||||
response = requests.get(self.cube_api_url, headers=headers)
|
||||
response = requests.get(f"{self.cube_api_url}/meta", headers=headers)
|
||||
response.raise_for_status()
|
||||
raw_meta_json = response.json()
|
||||
cubes = raw_meta_json.get("cubes", [])
|
||||
@ -59,19 +135,30 @@ class CubeSemanticLoader(BaseLoader):
|
||||
dimensions = cube.get("dimensions", [])
|
||||
|
||||
for item in measures + dimensions:
|
||||
column_member_type = "measure" if item in measures else "dimension"
|
||||
dimension_values = []
|
||||
item_name = str(item.get("name"))
|
||||
item_type = str(item.get("type"))
|
||||
|
||||
if (
|
||||
self.load_dimension_values
|
||||
and column_member_type == "dimension"
|
||||
and item_type == "string"
|
||||
):
|
||||
dimension_values = self._get_dimension_values(item_name)
|
||||
|
||||
metadata = dict(
|
||||
table_name=str(cube_name),
|
||||
column_name=str(item.get("name")),
|
||||
column_data_type=str(item.get("type")),
|
||||
column_name=item_name,
|
||||
column_data_type=item_type,
|
||||
column_title=str(item.get("title")),
|
||||
column_description=str(item.get("description")),
|
||||
column_member_type=column_member_type,
|
||||
column_values=dimension_values,
|
||||
)
|
||||
|
||||
page_content = f"table name: {str(cube_name)}, "
|
||||
page_content += f"column name: {str(item.get('name'))}, "
|
||||
page_content += f"column data type: {str(item.get('type'))}, "
|
||||
page_content += f"column title: {str(item.get('title'))}, "
|
||||
page_content += f"column description: {str(item.get('description'))}"
|
||||
page_content = f"{str(item.get('title'))}, "
|
||||
page_content += f"{str(item.get('description'))}"
|
||||
|
||||
docs.append(Document(page_content=page_content, metadata=metadata))
|
||||
|
||||
|
@ -1,86 +1,61 @@
|
||||
from typing import List
|
||||
from unittest import TestCase
|
||||
from unittest.mock import MagicMock, patch
|
||||
import unittest
|
||||
from unittest.mock import MagicMock, Mock, patch
|
||||
|
||||
import requests
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders import CubeSemanticLoader
|
||||
|
||||
MODULE_PATH = "langchain.document_loaders.cube_semantic.CubeSemanticLoader"
|
||||
|
||||
class TestCubeSemanticLoader(TestCase):
|
||||
@patch.object(requests, "get")
|
||||
def test_load_success(self, mock_get: MagicMock) -> None:
|
||||
# Arrange
|
||||
cube_api_url: str = "https://example.com/cube_api"
|
||||
cube_api_token: str = "abc123"
|
||||
mock_response: MagicMock = MagicMock()
|
||||
|
||||
class TestCubeSemanticLoader(unittest.TestCase):
|
||||
def setUp(self) -> None:
|
||||
self.loader = CubeSemanticLoader(
|
||||
cube_api_url="http://example.com", cube_api_token="test_token"
|
||||
)
|
||||
|
||||
@patch("requests.request")
|
||||
def test_get_dimension_values(self, mock_request: MagicMock) -> None:
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_response_json: dict = {
|
||||
mock_response.json.return_value = {"data": [{"test_dimension": "value1"}]}
|
||||
mock_request.return_value = mock_response
|
||||
|
||||
values = self.loader._get_dimension_values("test_dimension")
|
||||
self.assertEqual(values, ["value1"])
|
||||
|
||||
@patch("requests.get")
|
||||
@patch(f"{MODULE_PATH}._get_dimension_values")
|
||||
def test_load(
|
||||
self, mock_get_dimension_values: MagicMock, mock_get: MagicMock
|
||||
) -> None:
|
||||
# Mocking the response
|
||||
mock_response = Mock()
|
||||
mock_response.raise_for_status.return_value = None
|
||||
mock_response.json.return_value = {
|
||||
"cubes": [
|
||||
{
|
||||
"name": "test_cube",
|
||||
"type": "view",
|
||||
"name": "cube1",
|
||||
"measures": [{"type": "sum", "name": "sales", "title": "Sales"}],
|
||||
"measures": [],
|
||||
"dimensions": [
|
||||
{
|
||||
"name": "test_dimension",
|
||||
"type": "string",
|
||||
"name": "product_name",
|
||||
"title": "Product Name",
|
||||
"title": "Test Title",
|
||||
"description": "Test Description",
|
||||
}
|
||||
],
|
||||
}
|
||||
]
|
||||
}
|
||||
mock_response.json.return_value = mock_response_json
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
expected_docs: List[Document] = [
|
||||
Document(
|
||||
page_content=(
|
||||
"table name: cube1, "
|
||||
"column name: sales, "
|
||||
"column data type: sum, "
|
||||
"column title: Sales, "
|
||||
"column description: None"
|
||||
),
|
||||
metadata={
|
||||
"table_name": "cube1",
|
||||
"column_name": "sales",
|
||||
"column_data_type": "sum",
|
||||
"column_title": "Sales",
|
||||
"column_description": "None",
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content=(
|
||||
"table name: cube1, "
|
||||
"column name: product_name, "
|
||||
"column data type: string, "
|
||||
"column title: Product Name, "
|
||||
"column description: None"
|
||||
),
|
||||
metadata={
|
||||
"table_name": "cube1",
|
||||
"column_name": "product_name",
|
||||
"column_data_type": "string",
|
||||
"column_title": "Product Name",
|
||||
"column_description": "None",
|
||||
},
|
||||
),
|
||||
]
|
||||
mock_get_dimension_values.return_value = ["value1", "value2"]
|
||||
|
||||
loader: CubeSemanticLoader = CubeSemanticLoader(cube_api_url, cube_api_token)
|
||||
documents = self.loader.load()
|
||||
self.assertEqual(len(documents), 1)
|
||||
self.assertEqual(documents[0].page_content, "Test Title, Test Description")
|
||||
self.assertEqual(documents[0].metadata["column_values"], ["value1", "value2"])
|
||||
|
||||
# Act
|
||||
result: List[Document] = loader.load()
|
||||
|
||||
# Assert
|
||||
self.assertEqual(result, expected_docs)
|
||||
mock_get.assert_called_once_with(
|
||||
cube_api_url,
|
||||
headers={
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": cube_api_token,
|
||||
},
|
||||
)
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
Loading…
Reference in New Issue
Block a user