mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-09 14:35:50 +00:00
Add the extract types to diffbot graph transformer (#21315)
Previously, you could only extract triples (which Diffbot calls facts) from Diffbot, in order to avoid isolated nodes. However, isolated nodes can still be useful, for example for prefiltering, so users are now allowed to extract them if they want. The default behaviour is unchanged.
This commit is contained in:
parent
c038991590
commit
5b6d1a907d
@ -1,3 +1,4 @@
|
|||||||
|
from enum import Enum
|
||||||
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -6,6 +7,11 @@ from langchain_community.graphs.graph_document import GraphDocument, Node, Relat
|
|||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
|
||||||
|
class TypeOption(str, Enum):
    """Kinds of data that can be requested for extraction.

    Each member is also a plain ``str`` (via the ``str`` mix-in), so a list
    of members can be combined with ``",".join(...)`` to build a
    comma-separated field list.
    """

    # A fact combines a source node and a target node with a relationship type.
    FACTS = "facts"
    # Entities on their own, which may yield nodes without any relationship.
    ENTITIES = "entities"
|
||||||
|
|
||||||
|
|
||||||
def format_property_key(s: str) -> str:
|
def format_property_key(s: str) -> str:
|
||||||
"""Formats a string to be used as a property key."""
|
"""Formats a string to be used as a property key."""
|
||||||
|
|
||||||
@ -141,6 +147,7 @@ class DiffbotGraphTransformer:
|
|||||||
include_qualifiers: bool = True,
|
include_qualifiers: bool = True,
|
||||||
include_evidence: bool = True,
|
include_evidence: bool = True,
|
||||||
simplified_schema: bool = True,
|
simplified_schema: bool = True,
|
||||||
|
extract_types: List[TypeOption] = [TypeOption.FACTS],
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Initialize the graph transformer with various options.
|
Initialize the graph transformer with various options.
|
||||||
@ -157,6 +164,11 @@ class DiffbotGraphTransformer:
|
|||||||
Whether to include evidence for the relationships.
|
Whether to include evidence for the relationships.
|
||||||
simplified_schema (bool):
|
simplified_schema (bool):
|
||||||
Whether to use a simplified schema for relationships.
|
Whether to use a simplified schema for relationships.
|
||||||
|
extract_types (List[TypeOption]):
|
||||||
|
A list of data types to extract. Only facts or entities
|
||||||
|
are supported. By default, the option is set to facts.
|
||||||
|
A fact represents a combination of source and target
|
||||||
|
nodes with a relationship type.
|
||||||
"""
|
"""
|
||||||
self.diffbot_api_key = diffbot_api_key or get_from_env(
|
self.diffbot_api_key = diffbot_api_key or get_from_env(
|
||||||
"diffbot_api_key", "DIFFBOT_API_KEY"
|
"diffbot_api_key", "DIFFBOT_API_KEY"
|
||||||
@ -167,6 +179,13 @@ class DiffbotGraphTransformer:
|
|||||||
self.simplified_schema = None
|
self.simplified_schema = None
|
||||||
if simplified_schema:
|
if simplified_schema:
|
||||||
self.simplified_schema = SimplifiedSchema()
|
self.simplified_schema = SimplifiedSchema()
|
||||||
|
if not extract_types:
|
||||||
|
raise ValueError(
|
||||||
|
"`extract_types` cannot be an empty array. "
|
||||||
|
"Allowed values are 'facts', 'entities', or both."
|
||||||
|
)
|
||||||
|
|
||||||
|
self.extract_types = extract_types
|
||||||
|
|
||||||
def nlp_request(self, text: str) -> Dict[str, Any]:
|
def nlp_request(self, text: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
@ -185,7 +204,7 @@ class DiffbotGraphTransformer:
|
|||||||
"lang": "en",
|
"lang": "en",
|
||||||
}
|
}
|
||||||
|
|
||||||
FIELDS = "facts"
|
FIELDS = ",".join(self.extract_types)
|
||||||
HOST = "nl.diffbot.com"
|
HOST = "nl.diffbot.com"
|
||||||
url = (
|
url = (
|
||||||
f"https://{HOST}/v1/?fields={FIELDS}&"
|
f"https://{HOST}/v1/?fields={FIELDS}&"
|
||||||
@ -209,77 +228,97 @@ class DiffbotGraphTransformer:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# Return empty result if there are no facts
|
# Return empty result if there are no facts
|
||||||
if "facts" not in payload or not payload["facts"]:
|
if ("facts" not in payload or not payload["facts"]) and (
|
||||||
|
"entities" not in payload or not payload["entities"]
|
||||||
|
):
|
||||||
return GraphDocument(nodes=[], relationships=[], source=document)
|
return GraphDocument(nodes=[], relationships=[], source=document)
|
||||||
|
|
||||||
# Nodes are a custom class because we need to deduplicate
|
# Nodes are a custom class because we need to deduplicate
|
||||||
nodes_list = NodesList()
|
nodes_list = NodesList()
|
||||||
# Relationships are a list because we don't deduplicate nor anything else
|
if "entities" in payload and payload["entities"]:
|
||||||
|
for record in payload["entities"]:
|
||||||
|
# Ignore if it doesn't have a type
|
||||||
|
if not record["allTypes"]:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Define source node
|
||||||
|
source_id = (
|
||||||
|
record["allUris"][0] if record["allUris"] else record["name"]
|
||||||
|
)
|
||||||
|
source_label = record["allTypes"][0]["name"].capitalize()
|
||||||
|
source_name = record["name"]
|
||||||
|
nodes_list.add_node_property(
|
||||||
|
(source_id, source_label), {"name": source_name}
|
||||||
|
)
|
||||||
relationships = list()
|
relationships = list()
|
||||||
for record in payload["facts"]:
|
# Relationships are a list because we don't deduplicate nor anything else
|
||||||
# Skip if the fact is below the threshold confidence
|
if "facts" in payload and payload["facts"]:
|
||||||
if record["confidence"] < self.fact_threshold_confidence:
|
for record in payload["facts"]:
|
||||||
continue
|
# Skip if the fact is below the threshold confidence
|
||||||
|
if record["confidence"] < self.fact_threshold_confidence:
|
||||||
|
continue
|
||||||
|
|
||||||
# TODO: It should probably be treated as a node property
|
# TODO: It should probably be treated as a node property
|
||||||
if not record["value"]["allTypes"]:
|
if not record["value"]["allTypes"]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Define source node
|
# Define source node
|
||||||
source_id = (
|
source_id = (
|
||||||
record["entity"]["allUris"][0]
|
record["entity"]["allUris"][0]
|
||||||
if record["entity"]["allUris"]
|
if record["entity"]["allUris"]
|
||||||
else record["entity"]["name"]
|
else record["entity"]["name"]
|
||||||
)
|
)
|
||||||
source_label = record["entity"]["allTypes"][0]["name"].capitalize()
|
source_label = record["entity"]["allTypes"][0]["name"].capitalize()
|
||||||
source_name = record["entity"]["name"]
|
source_name = record["entity"]["name"]
|
||||||
source_node = Node(id=source_id, type=source_label)
|
source_node = Node(id=source_id, type=source_label)
|
||||||
nodes_list.add_node_property(
|
|
||||||
(source_id, source_label), {"name": source_name}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Define target node
|
|
||||||
target_id = (
|
|
||||||
record["value"]["allUris"][0]
|
|
||||||
if record["value"]["allUris"]
|
|
||||||
else record["value"]["name"]
|
|
||||||
)
|
|
||||||
target_label = record["value"]["allTypes"][0]["name"].capitalize()
|
|
||||||
target_name = record["value"]["name"]
|
|
||||||
# Some facts are better suited as node properties
|
|
||||||
if target_label in FACT_TO_PROPERTY_TYPE:
|
|
||||||
nodes_list.add_node_property(
|
nodes_list.add_node_property(
|
||||||
(source_id, source_label),
|
(source_id, source_label), {"name": source_name}
|
||||||
{format_property_key(record["property"]["name"]): target_name},
|
|
||||||
)
|
)
|
||||||
else: # Define relationship
|
|
||||||
# Define target node object
|
|
||||||
target_node = Node(id=target_id, type=target_label)
|
|
||||||
nodes_list.add_node_property(
|
|
||||||
(target_id, target_label), {"name": target_name}
|
|
||||||
)
|
|
||||||
# Define relationship type
|
|
||||||
rel_type = record["property"]["name"].replace(" ", "_").upper()
|
|
||||||
if self.simplified_schema:
|
|
||||||
rel_type = self.simplified_schema.get_type(rel_type)
|
|
||||||
|
|
||||||
# Relationship qualifiers/properties
|
# Define target node
|
||||||
rel_properties = dict()
|
target_id = (
|
||||||
relationship_evidence = [el["passage"] for el in record["evidence"]][0]
|
record["value"]["allUris"][0]
|
||||||
if self.include_evidence:
|
if record["value"]["allUris"]
|
||||||
rel_properties.update({"evidence": relationship_evidence})
|
else record["value"]["name"]
|
||||||
if self.include_qualifiers and record.get("qualifiers"):
|
|
||||||
for property in record["qualifiers"]:
|
|
||||||
prop_key = format_property_key(property["property"]["name"])
|
|
||||||
rel_properties[prop_key] = property["value"]["name"]
|
|
||||||
|
|
||||||
relationship = Relationship(
|
|
||||||
source=source_node,
|
|
||||||
target=target_node,
|
|
||||||
type=rel_type,
|
|
||||||
properties=rel_properties,
|
|
||||||
)
|
)
|
||||||
relationships.append(relationship)
|
target_label = record["value"]["allTypes"][0]["name"].capitalize()
|
||||||
|
target_name = record["value"]["name"]
|
||||||
|
# Some facts are better suited as node properties
|
||||||
|
if target_label in FACT_TO_PROPERTY_TYPE:
|
||||||
|
nodes_list.add_node_property(
|
||||||
|
(source_id, source_label),
|
||||||
|
{format_property_key(record["property"]["name"]): target_name},
|
||||||
|
)
|
||||||
|
else: # Define relationship
|
||||||
|
# Define target node object
|
||||||
|
target_node = Node(id=target_id, type=target_label)
|
||||||
|
nodes_list.add_node_property(
|
||||||
|
(target_id, target_label), {"name": target_name}
|
||||||
|
)
|
||||||
|
# Define relationship type
|
||||||
|
rel_type = record["property"]["name"].replace(" ", "_").upper()
|
||||||
|
if self.simplified_schema:
|
||||||
|
rel_type = self.simplified_schema.get_type(rel_type)
|
||||||
|
|
||||||
|
# Relationship qualifiers/properties
|
||||||
|
rel_properties = dict()
|
||||||
|
relationship_evidence = [
|
||||||
|
el["passage"] for el in record["evidence"]
|
||||||
|
][0]
|
||||||
|
if self.include_evidence:
|
||||||
|
rel_properties.update({"evidence": relationship_evidence})
|
||||||
|
if self.include_qualifiers and record.get("qualifiers"):
|
||||||
|
for property in record["qualifiers"]:
|
||||||
|
prop_key = format_property_key(property["property"]["name"])
|
||||||
|
rel_properties[prop_key] = property["value"]["name"]
|
||||||
|
|
||||||
|
relationship = Relationship(
|
||||||
|
source=source_node,
|
||||||
|
target=target_node,
|
||||||
|
type=rel_type,
|
||||||
|
properties=rel_properties,
|
||||||
|
)
|
||||||
|
relationships.append(relationship)
|
||||||
|
|
||||||
return GraphDocument(
|
return GraphDocument(
|
||||||
nodes=nodes_list.return_node_list(),
|
nodes=nodes_list.return_node_list(),
|
||||||
|
Loading…
Reference in New Issue
Block a user