community[minor]: Improvements for NeptuneRdfGraph, Improve discovery of graph schema using database statistics (#19546)

Fixes linting for PR
[19244](https://github.com/langchain-ai/langchain/pull/19244)

---------

Co-authored-by: mhavey <mchavey@gmail.com>
This commit is contained in:
Piyush Jain 2024-03-26 07:36:51 -07:00 committed by GitHub
parent fc6b92bb9a
commit 72ba738bf5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 163 additions and 85 deletions

View File

@ -6,7 +6,12 @@
"source": [
"# Neptune SPARQL QA Chain\n",
"\n",
"This notebook shows use of LLM to query RDF graph in Amazon Neptune. This code uses a `NeptuneRdfGraph` class that connects with the Neptune database and loads it's schema. The `NeptuneSparqlQAChain` is used to connect the graph and LLM to ask natural language questions.\n",
"This QA chain queries Resource Description Framework (RDF) data in an Amazon Neptune graph database using the SPARQL query language and returns a human readable response.\n",
"\n",
"\n",
"This code uses a `NeptuneRdfGraph` class that connects with the Neptune database and loads its schema. The `NeptuneSparqlQAChain` is used to connect the graph and LLM to ask natural language questions.\n",
"\n",
"This notebook demonstrates an example using organizational data.\n",
"\n",
"Requirements for running this notebook:\n",
"- Neptune 1.2.x cluster accessible from this notebook\n",
@ -98,6 +103,40 @@
"## Setup Chain"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade --force-reinstall langchain"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade --force-reinstall langchain-core"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade --force-reinstall langchain-community"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Restart kernel**"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -209,24 +248,23 @@
"source": [
"import boto3\n",
"from langchain.chains.graph_qa.neptune_sparql import NeptuneSparqlQAChain\n",
"from langchain_community.chat_models import BedrockChat\n",
"from langchain.chat_models import BedrockChat\n",
"from langchain.llms import Bedrock\n",
"from langchain_community.graphs import NeptuneRdfGraph\n",
"\n",
"host = \"<neptune-host>\"\n",
"port = \"<neptune-port>\"\n",
"region = \"us-east-1\" # specify region\n",
"host = \"<your host>\"\n",
"port = 8182 # change if different\n",
"region = \"us-east-1\" # change if different\n",
"graph = NeptuneRdfGraph(host=host, port=port, use_iam_auth=True, region_name=region)\n",
"\n",
"graph = NeptuneRdfGraph(\n",
" host=host, port=port, use_iam_auth=True, region_name=region, hide_comments=True\n",
")\n",
"\n",
"schema_elements = graph.get_schema_elements\n",
"# Optionally, you can update the schema_elements, and\n",
"# load the schema from the pruned elements.\n",
"graph.load_from_schema_elements(schema_elements)\n",
"# Optionally change the schema\n",
"# elems = graph.get_schema_elements\n",
"# change elems ...\n",
"# graph.load_schema(elems)\n",
"\n",
"MODEL_ID = \"anthropic.claude-v2\"\n",
"bedrock_client = boto3.client(\"bedrock-runtime\")\n",
"llm = BedrockChat(model_id=\"anthropic.claude-v2\", client=bedrock_client)\n",
"llm = BedrockChat(model_id=MODEL_ID, client=bedrock_client)\n",
"\n",
"chain = NeptuneSparqlQAChain.from_llm(\n",
" llm=llm,\n",

View File

@ -4,55 +4,25 @@ from typing import Any, Dict, Optional, Sequence
import requests
CLASS_QUERY = """
SELECT DISTINCT ?elem ?com
WHERE {
?instance a ?elem .
OPTIONAL { ?instance rdf:type/rdfs:subClassOf* ?elem } .
#FILTER (isIRI(?elem)) .
OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")}
}
"""
REL_QUERY = """
SELECT DISTINCT ?elem ?com
WHERE {
?subj ?elem ?obj .
OPTIONAL {
?elem rdf:type/rdfs:subPropertyOf* ?proptype .
VALUES ?proptype { rdf:Property owl:DatatypeProperty owl:ObjectProperty } .
} .
OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")}
}
"""
# Query to find OWL datatype properties
DTPROP_QUERY = """
SELECT DISTINCT ?elem ?com
SELECT DISTINCT ?elem
WHERE {
?subj ?elem ?obj .
OPTIONAL {
?elem rdf:type/rdfs:subPropertyOf* ?proptype .
?proptype a owl:DatatypeProperty .
} .
OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")}
?elem a owl:DatatypeProperty .
}
"""
# Query to find OWL object properties
OPROP_QUERY = """
SELECT DISTINCT ?elem ?com
SELECT DISTINCT ?elem
WHERE {
?subj ?elem ?obj .
OPTIONAL {
?elem rdf:type/rdfs:subPropertyOf* ?proptype .
?proptype a owl:ObjectProperty .
} .
OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")}
?elem a owl:ObjectProperty .
}
"""
ELEM_TYPES = {
"classes": CLASS_QUERY,
"rels": REL_QUERY,
"classes": None,
"rels": None,
"dtprops": DTPROP_QUERY,
"oprops": OPROP_QUERY,
}
@ -62,32 +32,33 @@ class NeptuneRdfGraph:
"""Neptune wrapper for RDF graph operations.
Args:
host: SPARQL endpoint host for Neptune
port: SPARQL endpoint port for Neptune. Defaults 8182.
host: endpoint for the database instance
port: port number for the database instance, default is 8182
use_iam_auth: boolean indicating IAM auth is enabled in Neptune cluster
region_name: AWS region required if use_iam_auth is True, e.g., us-west-2
hide_comments: whether to omit ontology comments from the schema used in the prompt
use_https: whether to use secure connection, default is True
client: optional boto3 Neptune client
credentials_profile_name: optional AWS profile name
region_name: optional AWS region, e.g., us-west-2
service: optional service name, default is neptunedata
sign: optional, whether to sign the request payload, default is True
Example:
.. code-block:: python
graph = NeptuneRdfGraph(
host='<SPARQL host>',
port=<SPARQL port>,
use_iam_auth=False
port=<SPARQL port>
)
schema = graph.get_schema()
OR
graph = NeptuneRdfGraph(
host='<SPARQL host>',
port=<SPARQL port>,
use_iam_auth=False
port=<SPARQL port>
)
schema_elem = graph.get_schema_elements()
... change schema_elements ...
#... change schema_elements ...
graph.load_schema(schema_elem)
schema = graph.get_schema()
*Security note*: Make sure that the database connection uses credentials
that are narrowly-scoped to only include necessary permissions.
@ -105,27 +76,67 @@ class NeptuneRdfGraph:
self,
host: str,
port: int = 8182,
use_https: bool = True,
use_iam_auth: bool = False,
client: Any = None,
credentials_profile_name: Optional[str] = None,
region_name: Optional[str] = None,
hide_comments: bool = False,
service: str = "neptunedata",
sign: bool = True,
) -> None:
self.use_iam_auth = use_iam_auth
self.region_name = region_name
self.hide_comments = hide_comments
self.query_endpoint = f"https://{host}:{port}/sparql"
if self.use_iam_auth:
try:
try:
if client is not None:
self.client = client
else:
import boto3
self.session = boto3.Session()
except ImportError:
raise ImportError(
"Could not import boto3 python package. "
"Please install it with `pip install boto3`."
)
else:
self.session = None
if credentials_profile_name is not None:
self.session = boto3.Session(profile_name=credentials_profile_name)
else:
# use default credentials
self.session = boto3.Session()
client_params = {}
if region_name:
client_params["region_name"] = region_name
protocol = "https" if use_https else "http"
client_params["endpoint_url"] = f"{protocol}://{host}:{port}"
if sign:
self.client = self.session.client(service, **client_params)
else:
from botocore import UNSIGNED
from botocore.config import Config
self.client = self.session.client(
service,
**client_params,
config=Config(signature_version=UNSIGNED),
)
except ImportError:
raise ModuleNotFoundError(
"Could not import boto3 python package. "
"Please install it with `pip install boto3`."
)
except Exception as e:
if type(e).__name__ == "UnknownServiceError":
raise ModuleNotFoundError(
"NeptuneGraph requires a boto3 version 1.28.38 or greater."
"Please install it with `pip install -U boto3`."
) from e
else:
raise ValueError(
"Could not load credentials to authenticate with AWS client. "
"Please check that credentials in the specified "
"profile name are valid."
) from e
# Set schema
self.schema = ""
@ -143,6 +154,12 @@ class NeptuneRdfGraph:
def get_schema_elements(self) -> Dict[str, Any]:
return self.schema_elements
def get_summary(self) -> Dict[str, Any]:
"""
Obtain Neptune statistical summary of classes and predicates in the graph.

Calls ``get_rdf_graph_summary(mode="detailed")`` on ``self.client`` and
returns that response unchanged. The schema loader in this class reads
``["payload"]["graphSummary"]["classes"]`` and
``["payload"]["graphSummary"]["predicates"]`` from the returned mapping.

Returns:
    The response of the client's ``get_rdf_graph_summary`` call.
"""
return self.client.get_rdf_graph_summary(mode="detailed")
def query(
self,
query: str,
@ -197,12 +214,10 @@ class NeptuneRdfGraph:
elem_str = {}
for elem in ELEM_TYPES:
res_list = []
for elem_rec in self.schema_elements[elem]:
for elem_rec in schema_elements[elem]:
uri = elem_rec["uri"]
local = elem_rec["local"]
res_str = f"<{uri}> ({local})"
if self.hide_comments is False:
res_str = res_str + f", {elem_rec['comment']}"
res_list.append(res_str)
elem_str[elem] = ", ".join(res_list)
@ -210,12 +225,12 @@ class NeptuneRdfGraph:
"In the following, each IRI is followed by the local name and "
"optionally its description in parentheses. \n"
"The graph supports the following node types:\n"
f"{elem_str['classes']}"
f"{elem_str['classes']}\n"
"The graph supports the following relationships:\n"
f"{elem_str['rels']}"
"The graph supports the following OWL object properties, "
f"{elem_str['dtprops']}"
"The graph supports the following OWL data properties, "
f"{elem_str['rels']}\n"
"The graph supports the following OWL object properties:\n"
f"{elem_str['dtprops']}\n"
"The graph supports the following OWL data properties:\n"
f"{elem_str['oprops']}"
)
@ -238,15 +253,40 @@ class NeptuneRdfGraph:
"""
self.schema_elements["distinct_prefixes"] = {}
# get summary and build list of classes and rels
summary = self.get_summary()
reslist = []
for c in summary["payload"]["graphSummary"]["classes"]:
uri = c
tokens = self._get_local_name(uri)
elem_record = {"uri": uri, "local": tokens[1]}
reslist.append(elem_record)
if tokens[0] not in self.schema_elements["distinct_prefixes"]:
self.schema_elements["distinct_prefixes"][tokens[0]] = "y"
self.schema_elements["classes"] = reslist
reslist = []
for r in summary["payload"]["graphSummary"]["predicates"]:
for p in r:
uri = p
tokens = self._get_local_name(uri)
elem_record = {"uri": uri, "local": tokens[1]}
reslist.append(elem_record)
if tokens[0] not in self.schema_elements["distinct_prefixes"]:
self.schema_elements["distinct_prefixes"][tokens[0]] = "y"
self.schema_elements["rels"] = reslist
# get dtprops and oprops too
for elem in ELEM_TYPES:
items = self.query(ELEM_TYPES[elem])
q = ELEM_TYPES.get(elem)
if not q:
continue
items = self.query(q)
reslist = []
for r in items["results"]["bindings"]:
uri = r["elem"]["value"]
tokens = self._get_local_name(uri)
elem_record = {"uri": uri, "local": tokens[1]}
if not self.hide_comments:
elem_record["comment"] = r["com"]["value"] if "com" in r else ""
reslist.append(elem_record)
if tokens[0] not in self.schema_elements["distinct_prefixes"]:
self.schema_elements["distinct_prefixes"][tokens[0]] = "y"