From d166967003be5b32a600d145d76e2f0ee31f5903 Mon Sep 17 00:00:00 2001
From: Tomaz Bratanic
Date: Mon, 5 Aug 2024 23:01:27 +0200
Subject: [PATCH] experimental: Add gliner graph transformer (#25066)

You can use this with:

```
from langchain_experimental.graph_transformers import GlinerGraphTransformer
from langchain_core.documents import Document

gliner = GlinerGraphTransformer(
    allowed_nodes=["Person", "Organization", "Nobel"],
    allowed_relationships=["EMPLOYEE", "WON"],
)

text = """
Marie Curie was a Polish and naturalised-French physicist and chemist
who conducted pioneering research on radioactivity. She was the first
woman to win a Nobel Prize, the first person to win a Nobel Prize
twice, and the only person to win a Nobel Prize in two scientific
fields. Her husband, Pierre Curie, was a co-winner of her first Nobel
Prize, making them the first-ever married couple to win the Nobel Prize
and launching the Curie family legacy of five Nobel Prizes. She was, in
1906, the first woman to become a professor at the University of Paris.
"""
documents = [Document(page_content=text)]
gliner.convert_to_graph_documents(documents)
```

---------

Co-authored-by: Erick Friis
---
 .../graph_transformers/__init__.py            |   8 +-
 .../graph_transformers/gliner.py              | 174 ++++++++++++++++++
 2 files changed, 181 insertions(+), 1 deletion(-)
 create mode 100644 libs/experimental/langchain_experimental/graph_transformers/gliner.py

diff --git a/libs/experimental/langchain_experimental/graph_transformers/__init__.py b/libs/experimental/langchain_experimental/graph_transformers/__init__.py
index 3bb5680bcf3..fd01190dc85 100644
--- a/libs/experimental/langchain_experimental/graph_transformers/__init__.py
+++ b/libs/experimental/langchain_experimental/graph_transformers/__init__.py
@@ -1,7 +1,13 @@
 """**Graph Transformers** transform Documents into Graph Documents."""
 
 from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
+from langchain_experimental.graph_transformers.gliner import GlinerGraphTransformer
 from langchain_experimental.graph_transformers.llm import LLMGraphTransformer
 from langchain_experimental.graph_transformers.relik import RelikGraphTransformer
 
-__all__ = ["DiffbotGraphTransformer", "LLMGraphTransformer", "RelikGraphTransformer"]
+__all__ = [
+    "DiffbotGraphTransformer",
+    "LLMGraphTransformer",
+    "RelikGraphTransformer",
+    "GlinerGraphTransformer",
+]
diff --git a/libs/experimental/langchain_experimental/graph_transformers/gliner.py b/libs/experimental/langchain_experimental/graph_transformers/gliner.py
new file mode 100644
index 00000000000..85566e2f7af
--- /dev/null
+++ b/libs/experimental/langchain_experimental/graph_transformers/gliner.py
@@ -0,0 +1,174 @@
+from typing import Any, Dict, List, Sequence, Union
+
+from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
+from langchain_core.documents import Document
+
+DEFAULT_NODE_TYPE = "Node"
+
+
+class GlinerGraphTransformer:
+    """
+    A transformer class for converting documents into graph structures
+    using the GLiNER and GLiREL models.
+
+    This class leverages GLiNER for named entity recognition and GLiREL for
+    relationship extraction from text documents, converting them into a graph format.
+    The extracted entities and relationships are filtered based on specified
+    confidence thresholds and allowed types.
+
+    For more details on GLiNER and GLiREL, visit their respective repositories:
+    GLiNER: https://github.com/urchade/GLiNER
+    GLiREL: https://github.com/jackboyla/GLiREL/tree/main
+
+    Args:
+        allowed_nodes (List[str]): A list of allowed node types for entity extraction.
+        allowed_relationships (Union[List[str], Dict[str, Any]]): A list of allowed
+            relationship types or a dictionary with additional configuration for
+            relationship extraction.
+        gliner_model (str): The name of the pretrained GLiNER model to use.
+            Default is "urchade/gliner_mediumv2.1".
+        glirel_model (str): The name of the pretrained GLiREL model to use.
+            Default is "jackboyla/glirel_beta".
+        entity_confidence_threshold (float): The confidence threshold for
+            filtering extracted entities. Default is 0.1.
+        relationship_confidence_threshold (float): The confidence threshold for
+            filtering extracted relationships. Default is 0.1.
+        device (str): The device to use for model inference ('cpu' or 'cuda').
+            Default is "cpu".
+        ignore_self_loops (bool): Whether to ignore relationships where the
+            source and target nodes are the same. Default is True.
+    """
+
+    def __init__(
+        self,
+        allowed_nodes: List[str],
+        allowed_relationships: Union[List[str], Dict[str, Any]],
+        gliner_model: str = "urchade/gliner_mediumv2.1",
+        glirel_model: str = "jackboyla/glirel_beta",
+        entity_confidence_threshold: float = 0.1,
+        relationship_confidence_threshold: float = 0.1,
+        device: str = "cpu",
+        ignore_self_loops: bool = True,
+    ) -> None:
+        try:
+            import gliner_spacy  # type: ignore # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "Could not import gliner-spacy python package. "
+                "Please install it with `pip install gliner-spacy`."
+            )
+        try:
+            import spacy  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "Could not import spacy python package. "
+                "Please install it with `pip install spacy`."
+            )
+        try:
+            import glirel  # type: ignore # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "Could not import glirel python package. "
+                "Please install it with `pip install glirel`."
+ ) + + gliner_config = { + "gliner_model": gliner_model, + "chunk_size": 250, + "labels": allowed_nodes, + "style": "ent", + "threshold": entity_confidence_threshold, + "map_location": device, + } + glirel_config = {"model": glirel_model, "device": device} + self.nlp = spacy.blank("en") + # Add the GliNER component to the pipeline + self.nlp.add_pipe("gliner_spacy", config=gliner_config) + # Add the GLiREL component to the pipeline + self.nlp.add_pipe("glirel", after="gliner_spacy", config=glirel_config) + self.allowed_relationships = ( + {"glirel_labels": allowed_relationships} + if isinstance(allowed_relationships, list) + else allowed_relationships + ) + self.relationship_confidence_threshold = relationship_confidence_threshold + self.ignore_self_loops = ignore_self_loops + + def process_document(self, document: Document) -> GraphDocument: + # Extraction as SpaCy pipeline + docs = list( + self.nlp.pipe( + [(document.page_content, self.allowed_relationships)], as_tuples=True + ) + ) + # Convert nodes + nodes = [] + for node in docs[0][0].ents: + nodes.append( + Node( + id=node.text, + type=node.label_, + ) + ) + # Convert relationships + relationships = [] + relations = docs[0][0]._.relations + # Deduplicate based on label, head text, and tail text + # Use a list comprehension with max() function + deduplicated_rels = [] + seen = set() + + for item in relations: + key = (tuple(item["head_text"]), tuple(item["tail_text"]), item["label"]) + + if key not in seen: + seen.add(key) + + # Find all items matching the current key + matching_items = [ + rel + for rel in relations + if (tuple(rel["head_text"]), tuple(rel["tail_text"]), rel["label"]) + == key + ] + + # Find the item with the maximum score + max_item = max(matching_items, key=lambda x: x["score"]) + deduplicated_rels.append(max_item) + for rel in deduplicated_rels: + # Relationship confidence threshold + if rel["score"] < self.relationship_confidence_threshold: + continue + source_id = docs[0][0][rel["head_pos"][0] : rel["head_pos"][1]].text + target_id = docs[0][0][rel["tail_pos"][0] : rel["tail_pos"][1]].text + # Ignore self loops + if self.ignore_self_loops and source_id == target_id: + continue + source_node = [n for n in nodes if n.id == source_id][0] + target_node = [n for n in nodes if n.id == target_id][0] + relationships.append( + Relationship( + source=source_node, + target=target_node, + type=rel["label"].replace(" ", "_").upper(), + ) + ) + + return GraphDocument(nodes=nodes, relationships=relationships, source=document) + + def convert_to_graph_documents( + self, documents: Sequence[Document] + ) -> List[GraphDocument]: + """Convert a sequence of documents into graph documents. + + Args: + documents (Sequence[Document]): The original documents. + kwargs: Additional keyword arguments. + + Returns: + Sequence[GraphDocument]: The transformed documents as graphs. + """ + results = [] + for document in documents: + graph_document = self.process_document(document) + results.append(graph_document) + return results