community[minor]: Improvements for NeptuneRdfGraph, Improve discovery of graph schema using database statistics (#19546)

Fixes linting for PR
[19244](https://github.com/langchain-ai/langchain/pull/19244)

---------

Co-authored-by: mhavey <mchavey@gmail.com>
This commit is contained in:
Piyush Jain 2024-03-26 07:36:51 -07:00 committed by GitHub
parent fc6b92bb9a
commit 72ba738bf5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 163 additions and 85 deletions

View File

@ -6,7 +6,12 @@
"source": [ "source": [
"# Neptune SPARQL QA Chain\n", "# Neptune SPARQL QA Chain\n",
"\n", "\n",
"This notebook shows use of LLM to query RDF graph in Amazon Neptune. This code uses a `NeptuneRdfGraph` class that connects with the Neptune database and loads it's schema. The `NeptuneSparqlQAChain` is used to connect the graph and LLM to ask natural language questions.\n", "This QA chain queries Resource Description Framework (RDF) data in an Amazon Neptune graph database using the SPARQL query language and returns a human readable response.\n",
"\n",
"\n",
"This code uses a `NeptuneRdfGraph` class that connects with the Neptune database and loads its schema. The `NeptuneSparqlQAChain` is used to connect the graph and LLM to ask natural language questions.\n",
"\n",
"This notebook demonstrates an example using organizational data.\n",
"\n", "\n",
"Requirements for running this notebook:\n", "Requirements for running this notebook:\n",
"- Neptune 1.2.x cluster accessible from this notebook\n", "- Neptune 1.2.x cluster accessible from this notebook\n",
@ -98,6 +103,40 @@
"## Setup Chain" "## Setup Chain"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade --force-reinstall langchain"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade --force-reinstall langchain-core"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade --force-reinstall langchain-community"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Restart kernel**"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -209,24 +248,23 @@
"source": [ "source": [
"import boto3\n", "import boto3\n",
"from langchain.chains.graph_qa.neptune_sparql import NeptuneSparqlQAChain\n", "from langchain.chains.graph_qa.neptune_sparql import NeptuneSparqlQAChain\n",
"from langchain_community.chat_models import BedrockChat\n", "from langchain.chat_models import BedrockChat\n",
"from langchain.llms import Bedrock\n",
"from langchain_community.graphs import NeptuneRdfGraph\n", "from langchain_community.graphs import NeptuneRdfGraph\n",
"\n", "\n",
"host = \"<neptune-host>\"\n", "host = \"<your host>\"\n",
"port = \"<neptune-port>\"\n", "port = 8182 # change if different\n",
"region = \"us-east-1\" # specify region\n", "region = \"us-east-1\" # change if different\n",
"graph = NeptuneRdfGraph(host=host, port=port, use_iam_auth=True, region_name=region)\n",
"\n", "\n",
"graph = NeptuneRdfGraph(\n", "# Optionally change the schema\n",
" host=host, port=port, use_iam_auth=True, region_name=region, hide_comments=True\n", "# elems = graph.get_schema_elements\n",
")\n", "# change elems ...\n",
"\n", "# graph.load_schema(elems)\n",
"schema_elements = graph.get_schema_elements\n",
"# Optionally, you can update the schema_elements, and\n",
"# load the schema from the pruned elements.\n",
"graph.load_from_schema_elements(schema_elements)\n",
"\n", "\n",
"MODEL_ID = \"anthropic.claude-v2\"\n",
"bedrock_client = boto3.client(\"bedrock-runtime\")\n", "bedrock_client = boto3.client(\"bedrock-runtime\")\n",
"llm = BedrockChat(model_id=\"anthropic.claude-v2\", client=bedrock_client)\n", "llm = BedrockChat(model_id=MODEL_ID, client=bedrock_client)\n",
"\n", "\n",
"chain = NeptuneSparqlQAChain.from_llm(\n", "chain = NeptuneSparqlQAChain.from_llm(\n",
" llm=llm,\n", " llm=llm,\n",

View File

@ -4,55 +4,25 @@ from typing import Any, Dict, Optional, Sequence
import requests import requests
CLASS_QUERY = """ # Query to find OWL datatype properties
SELECT DISTINCT ?elem ?com
WHERE {
?instance a ?elem .
OPTIONAL { ?instance rdf:type/rdfs:subClassOf* ?elem } .
#FILTER (isIRI(?elem)) .
OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")}
}
"""
REL_QUERY = """
SELECT DISTINCT ?elem ?com
WHERE {
?subj ?elem ?obj .
OPTIONAL {
?elem rdf:type/rdfs:subPropertyOf* ?proptype .
VALUES ?proptype { rdf:Property owl:DatatypeProperty owl:ObjectProperty } .
} .
OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")}
}
"""
DTPROP_QUERY = """ DTPROP_QUERY = """
SELECT DISTINCT ?elem ?com SELECT DISTINCT ?elem
WHERE { WHERE {
?subj ?elem ?obj . ?elem a owl:DatatypeProperty .
OPTIONAL {
?elem rdf:type/rdfs:subPropertyOf* ?proptype .
?proptype a owl:DatatypeProperty .
} .
OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")}
} }
""" """
# Query to find OWL object properties
OPROP_QUERY = """ OPROP_QUERY = """
SELECT DISTINCT ?elem ?com SELECT DISTINCT ?elem
WHERE { WHERE {
?subj ?elem ?obj . ?elem a owl:ObjectProperty .
OPTIONAL {
?elem rdf:type/rdfs:subPropertyOf* ?proptype .
?proptype a owl:ObjectProperty .
} .
OPTIONAL { ?elem rdfs:comment ?com filter (lang(?com) = "en")}
} }
""" """
ELEM_TYPES = { ELEM_TYPES = {
"classes": CLASS_QUERY, "classes": None,
"rels": REL_QUERY, "rels": None,
"dtprops": DTPROP_QUERY, "dtprops": DTPROP_QUERY,
"oprops": OPROP_QUERY, "oprops": OPROP_QUERY,
} }
@ -62,32 +32,33 @@ class NeptuneRdfGraph:
"""Neptune wrapper for RDF graph operations. """Neptune wrapper for RDF graph operations.
Args: Args:
host: SPARQL endpoint host for Neptune host: endpoint for the database instance
port: SPARQL endpoint port for Neptune. Defaults 8182. port: port number for the database instance, default is 8182
use_iam_auth: boolean indicating IAM auth is enabled in Neptune cluster use_iam_auth: boolean indicating IAM auth is enabled in Neptune cluster
region_name: AWS region required if use_iam_auth is True, e.g., us-west-2 use_https: whether to use secure connection, default is True
hide_comments: whether to include ontology comments in schema for prompt client: optional boto3 Neptune client
credentials_profile_name: optional AWS profile name
region_name: optional AWS region, e.g., us-west-2
service: optional service name, default is neptunedata
sign: optional, whether to sign the request payload, default is True
Example: Example:
.. code-block:: python .. code-block:: python
graph = NeptuneRdfGraph( graph = NeptuneRdfGraph(
host='<SPARQL host>', host='<SPARQL host>',
port=<SPARQL port>, port=<SPARQL port>
use_iam_auth=False
) )
schema = graph.get_schema() schema = graph.get_schema()
OR OR
graph = NeptuneRdfGraph( graph = NeptuneRdfGraph(
host='<SPARQL host>', host='<SPARQL host>',
port=<SPARQL port>, port=<SPARQL port>
use_iam_auth=False
) )
schema_elem = graph.get_schema_elements() schema_elem = graph.get_schema_elements()
... change schema_elements ... #... change schema_elements ...
graph.load_schema(schema_elem) graph.load_schema(schema_elem)
schema = graph.get_schema()
*Security note*: Make sure that the database connection uses credentials *Security note*: Make sure that the database connection uses credentials
that are narrowly-scoped to only include necessary permissions. that are narrowly-scoped to only include necessary permissions.
@ -105,27 +76,67 @@ class NeptuneRdfGraph:
self, self,
host: str, host: str,
port: int = 8182, port: int = 8182,
use_https: bool = True,
use_iam_auth: bool = False, use_iam_auth: bool = False,
client: Any = None,
credentials_profile_name: Optional[str] = None,
region_name: Optional[str] = None, region_name: Optional[str] = None,
hide_comments: bool = False, service: str = "neptunedata",
sign: bool = True,
) -> None: ) -> None:
self.use_iam_auth = use_iam_auth self.use_iam_auth = use_iam_auth
self.region_name = region_name self.region_name = region_name
self.hide_comments = hide_comments
self.query_endpoint = f"https://{host}:{port}/sparql" self.query_endpoint = f"https://{host}:{port}/sparql"
if self.use_iam_auth: try:
try: if client is not None:
self.client = client
else:
import boto3 import boto3
self.session = boto3.Session() if credentials_profile_name is not None:
except ImportError: self.session = boto3.Session(profile_name=credentials_profile_name)
raise ImportError( else:
"Could not import boto3 python package. " # use default credentials
"Please install it with `pip install boto3`." self.session = boto3.Session()
)
else: client_params = {}
self.session = None if region_name:
client_params["region_name"] = region_name
protocol = "https" if use_https else "http"
client_params["endpoint_url"] = f"{protocol}://{host}:{port}"
if sign:
self.client = self.session.client(service, **client_params)
else:
from botocore import UNSIGNED
from botocore.config import Config
self.client = self.session.client(
service,
**client_params,
config=Config(signature_version=UNSIGNED),
)
except ImportError:
raise ModuleNotFoundError(
"Could not import boto3 python package. "
"Please install it with `pip install boto3`."
)
except Exception as e:
if type(e).__name__ == "UnknownServiceError":
raise ModuleNotFoundError(
"NeptuneGraph requires a boto3 version 1.28.38 or greater."
"Please install it with `pip install -U boto3`."
) from e
else:
raise ValueError(
"Could not load credentials to authenticate with AWS client. "
"Please check that credentials in the specified "
"profile name are valid."
) from e
# Set schema # Set schema
self.schema = "" self.schema = ""
@ -143,6 +154,12 @@ class NeptuneRdfGraph:
def get_schema_elements(self) -> Dict[str, Any]: def get_schema_elements(self) -> Dict[str, Any]:
return self.schema_elements return self.schema_elements
def get_summary(self) -> Dict[str, Any]:
"""
Obtain Neptune statistical summary of classes and predicates in the graph.
"""
return self.client.get_rdf_graph_summary(mode="detailed")
def query( def query(
self, self,
query: str, query: str,
@ -197,12 +214,10 @@ class NeptuneRdfGraph:
elem_str = {} elem_str = {}
for elem in ELEM_TYPES: for elem in ELEM_TYPES:
res_list = [] res_list = []
for elem_rec in self.schema_elements[elem]: for elem_rec in schema_elements[elem]:
uri = elem_rec["uri"] uri = elem_rec["uri"]
local = elem_rec["local"] local = elem_rec["local"]
res_str = f"<{uri}> ({local})" res_str = f"<{uri}> ({local})"
if self.hide_comments is False:
res_str = res_str + f", {elem_rec['comment']}"
res_list.append(res_str) res_list.append(res_str)
elem_str[elem] = ", ".join(res_list) elem_str[elem] = ", ".join(res_list)
@ -210,12 +225,12 @@ class NeptuneRdfGraph:
"In the following, each IRI is followed by the local name and " "In the following, each IRI is followed by the local name and "
"optionally its description in parentheses. \n" "optionally its description in parentheses. \n"
"The graph supports the following node types:\n" "The graph supports the following node types:\n"
f"{elem_str['classes']}" f"{elem_str['classes']}\n"
"The graph supports the following relationships:\n" "The graph supports the following relationships:\n"
f"{elem_str['rels']}" f"{elem_str['rels']}\n"
"The graph supports the following OWL object properties, " "The graph supports the following OWL object properties:\n"
f"{elem_str['dtprops']}" f"{elem_str['dtprops']}\n"
"The graph supports the following OWL data properties, " "The graph supports the following OWL data properties:\n"
f"{elem_str['oprops']}" f"{elem_str['oprops']}"
) )
@ -238,15 +253,40 @@ class NeptuneRdfGraph:
""" """
self.schema_elements["distinct_prefixes"] = {} self.schema_elements["distinct_prefixes"] = {}
# get summary and build list of classes and rels
summary = self.get_summary()
reslist = []
for c in summary["payload"]["graphSummary"]["classes"]:
uri = c
tokens = self._get_local_name(uri)
elem_record = {"uri": uri, "local": tokens[1]}
reslist.append(elem_record)
if tokens[0] not in self.schema_elements["distinct_prefixes"]:
self.schema_elements["distinct_prefixes"][tokens[0]] = "y"
self.schema_elements["classes"] = reslist
reslist = []
for r in summary["payload"]["graphSummary"]["predicates"]:
for p in r:
uri = p
tokens = self._get_local_name(uri)
elem_record = {"uri": uri, "local": tokens[1]}
reslist.append(elem_record)
if tokens[0] not in self.schema_elements["distinct_prefixes"]:
self.schema_elements["distinct_prefixes"][tokens[0]] = "y"
self.schema_elements["rels"] = reslist
# get dtprops and oprops too
for elem in ELEM_TYPES: for elem in ELEM_TYPES:
items = self.query(ELEM_TYPES[elem]) q = ELEM_TYPES.get(elem)
if not q:
continue
items = self.query(q)
reslist = [] reslist = []
for r in items["results"]["bindings"]: for r in items["results"]["bindings"]:
uri = r["elem"]["value"] uri = r["elem"]["value"]
tokens = self._get_local_name(uri) tokens = self._get_local_name(uri)
elem_record = {"uri": uri, "local": tokens[1]} elem_record = {"uri": uri, "local": tokens[1]}
if not self.hide_comments:
elem_record["comment"] = r["com"]["value"] if "com" in r else ""
reslist.append(elem_record) reslist.append(elem_record)
if tokens[0] not in self.schema_elements["distinct_prefixes"]: if tokens[0] not in self.schema_elements["distinct_prefixes"]:
self.schema_elements["distinct_prefixes"][tokens[0]] = "y" self.schema_elements["distinct_prefixes"][tokens[0]] = "y"