mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-06 21:43:44 +00:00
adding new chain for logical fallacy removal from model output in chain (#9887)
Description: adds a new chain for logical fallacy removal from model output, plus documentation. Issue: n/a (see above). Dependencies: none. Tag maintainer: @hinthornw (tagged in the past from my end, but not sure who currently maintains chains). Twitter handle: none — feel free to call out my GitHub user, j-space-b, for a shout-out. Note: documentation created in docs/extras --------- Co-authored-by: Jon Bennion <jb@Jons-MacBook-Pro.local> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
"""Self-review chain that removes logical fallacies from model output.

Fallacies are categorized and defined per the taxonomy in
https://arxiv.org/pdf/2212.07425.pdf. Modeled after Constitutional AI and in
the same format, but applying logical fallacies as generalized rules to
remove from output.
"""
|
181
libs/experimental/langchain_experimental/fallacy_removal/base.py
Normal file
181
libs/experimental/langchain_experimental/fallacy_removal/base.py
Normal file
@@ -0,0 +1,181 @@
|
||||
"""Chain for applying removals of logical fallacies."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain.callbacks.manager import CallbackManagerForChainRun
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.schema import BasePromptTemplate
|
||||
from langchain.schema.language_model import BaseLanguageModel
|
||||
|
||||
from langchain_experimental.fallacy_removal.fallacies import FALLACIES
|
||||
from langchain_experimental.fallacy_removal.models import LogicalFallacy
|
||||
from langchain_experimental.fallacy_removal.prompts import (
|
||||
FALLACY_CRITIQUE_PROMPT,
|
||||
FALLACY_REVISION_PROMPT,
|
||||
)
|
||||
|
||||
|
||||
class FallacyChain(Chain):
    """Chain for applying logical fallacy evaluations.

    Modeled after Constitutional AI and in the same format, but applying
    logical fallacies as generalized rules to remove from the output.

    For each configured :class:`LogicalFallacy`, the chain critiques the
    wrapped chain's response and, when the critique finds something material,
    rewrites the response to remove the fallacy.

    Example:
        .. code-block:: python

            from langchain.llms import OpenAI
            from langchain.prompts.prompt import PromptTemplate
            from langchain.chains import LLMChain
            from langchain_experimental.fallacy_removal.base import FallacyChain
            from langchain_experimental.fallacy_removal.models import LogicalFallacy

            llm = OpenAI()

            qa_prompt = PromptTemplate(
                template="Q: {question} A:",
                input_variables=["question"],
            )
            qa_chain = LLMChain(llm=llm, prompt=qa_prompt)

            fallacy_chain = FallacyChain.from_llm(
                llm=llm,
                chain=qa_chain,
                logical_fallacies=[
                    LogicalFallacy(
                        fallacy_critique_request="Tell if this answer meets criteria.",
                        fallacy_revision_request=\
                        "Give an answer that meets better criteria.",
                    )
                ],
            )

            fallacy_chain.run(question="How do I know if the earth is round?")
    """

    # Chain that produces the initial response to be critiqued and revised.
    chain: LLMChain
    # Fallacy rules checked against the response, applied in order.
    logical_fallacies: List[LogicalFallacy]
    # Chain that critiques a response for a given fallacy.
    fallacy_critique_chain: LLMChain
    # Chain that rewrites a response based on a critique.
    fallacy_revision_chain: LLMChain
    # When True, the result also carries the initial response and every
    # (critique, revision) pair under extra output keys.
    return_intermediate_steps: bool = False

    @classmethod
    def get_fallacies(cls, names: Optional[List[str]] = None) -> List[LogicalFallacy]:
        """Return built-in fallacies by name, or all of them when ``names`` is None.

        Raises:
            KeyError: If a requested name is not in ``FALLACIES``.
        """
        if names is None:
            return list(FALLACIES.values())
        else:
            return [FALLACIES[name] for name in names]

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        chain: LLMChain,
        fallacy_critique_prompt: BasePromptTemplate = FALLACY_CRITIQUE_PROMPT,
        fallacy_revision_prompt: BasePromptTemplate = FALLACY_REVISION_PROMPT,
        **kwargs: Any,
    ) -> "FallacyChain":
        """Create a chain from an LLM.

        Args:
            llm: Model used for both the critique and revision steps.
            chain: Chain that produces the initial response.
            fallacy_critique_prompt: Prompt used to critique the response.
            fallacy_revision_prompt: Prompt used to rewrite the response.
            **kwargs: Additional fields, e.g. ``logical_fallacies``.
        """
        fallacy_critique_chain = LLMChain(llm=llm, prompt=fallacy_critique_prompt)
        fallacy_revision_chain = LLMChain(llm=llm, prompt=fallacy_revision_prompt)
        return cls(
            chain=chain,
            fallacy_critique_chain=fallacy_critique_chain,
            fallacy_revision_chain=fallacy_revision_chain,
            **kwargs,
        )

    @property
    def input_keys(self) -> List[str]:
        """Input keys, delegated to the wrapped chain."""
        return self.chain.input_keys

    @property
    def output_keys(self) -> List[str]:
        """Output keys; intermediate keys are added only when requested."""
        if self.return_intermediate_steps:
            return ["output", "fallacy_critiques_and_revisions", "initial_output"]
        return ["output"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Run the wrapped chain, then critique and revise once per fallacy."""
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        response = self.chain.run(
            **inputs,
            callbacks=_run_manager.get_child("original"),
        )
        initial_response = response
        input_prompt = self.chain.prompt.format(**inputs)

        _run_manager.on_text(
            text="Initial response: " + response + "\n\n",
            verbose=self.verbose,
            color="yellow",
        )
        fallacy_critiques_and_revisions = []
        for logical_fallacy in self.logical_fallacies:
            # Critique the current response for this particular fallacy.
            fallacy_raw_critique = self.fallacy_critique_chain.run(
                input_prompt=input_prompt,
                output_from_model=response,
                fallacy_critique_request=logical_fallacy.fallacy_critique_request,
                callbacks=_run_manager.get_child("fallacy_critique"),
            )
            fallacy_critique = self._parse_critique(
                output_string=fallacy_raw_critique,
            ).strip()

            # The critique prompt instructs the model to append
            # "No fallacy critique needed." when nothing is wrong; in that
            # case skip the revision step for this fallacy.
            if "no fallacy critique needed" in fallacy_critique.lower():
                fallacy_critiques_and_revisions.append((fallacy_critique, ""))
                continue

            # BUG FIX: the revision prompt declares the input variable
            # "fallacy_revision_request" (see prompts.py); passing it as
            # "revision_request" fails Chain input validation at runtime.
            fallacy_revision = self.fallacy_revision_chain.run(
                input_prompt=input_prompt,
                output_from_model=response,
                fallacy_critique_request=logical_fallacy.fallacy_critique_request,
                fallacy_critique=fallacy_critique,
                fallacy_revision_request=logical_fallacy.fallacy_revision_request,
                callbacks=_run_manager.get_child("fallacy_revision"),
            ).strip()
            # The revised text becomes the input for the next fallacy check.
            response = fallacy_revision
            fallacy_critiques_and_revisions.append((fallacy_critique, fallacy_revision))

            _run_manager.on_text(
                text=f"Applying {logical_fallacy.name}..." + "\n\n",
                verbose=self.verbose,
                color="green",
            )

            _run_manager.on_text(
                text="Logical Fallacy: " + fallacy_critique + "\n\n",
                verbose=self.verbose,
                color="blue",
            )

            _run_manager.on_text(
                text="Updated response: " + fallacy_revision + "\n\n",
                verbose=self.verbose,
                color="yellow",
            )

        final_output: Dict[str, Any] = {"output": response}
        if self.return_intermediate_steps:
            final_output["initial_output"] = initial_response
            final_output[
                "fallacy_critiques_and_revisions"
            ] = fallacy_critiques_and_revisions
        return final_output

    @staticmethod
    def _parse_critique(output_string: str) -> str:
        """Strip any trailing revision section from a raw critique.

        The critique model sometimes keeps generating past the critique into
        the next section; everything from the "Fallacy Revision request:"
        marker onward is dropped, along with any trailing paragraphs. The
        marker is matched case-insensitively because the prompts render it
        as "Fallacy Revision Request:" while test fixtures use "request:".
        """
        marker = "fallacy revision request:"
        lowered = output_string.lower()
        if marker not in lowered:
            return output_string
        output_string = output_string[: lowered.index(marker)]
        if "\n\n" in output_string:
            output_string = output_string.split("\n\n")[0]
        return output_string
|
@@ -0,0 +1,209 @@
|
||||
"""Classification of logical fallacies in Natural Language Arguments \
|
||||
from https://arxiv.org/pdf/2212.07425.pdf
|
||||
"""
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from langchain_experimental.fallacy_removal.models import LogicalFallacy
|
||||
|
||||
# Catalog of built-in logical fallacies, keyed by short name.  Each entry
# pairs a critique instruction (how the LLM should spot the fallacy) with a
# revision instruction (how it should rewrite the response without it); both
# strings are sent verbatim to the LLM.  Several "assitant" typos and
# backslash-continuation join artifacts in the original strings are fixed.
FALLACIES: Dict[str, LogicalFallacy] = {
    "adhominem": LogicalFallacy(
        name="adhominem",
        fallacy_critique_request=(
            "Identify any feasible ways in which the assistant’s last "
            "response is attacking the character or personal traits of the "
            "person making an argument rather than addressing the actual "
            "argument and evidence."
        ),
        fallacy_revision_request=(
            "Please rewrite the assistant response to remove any attacking "
            "the character or personal traits of the person making an "
            "argument rather than addressing the actual argument and "
            "evidence."
        ),
    ),
    "adpopulum": LogicalFallacy(
        name="adpopulum",
        fallacy_critique_request=(
            "Identify ways in which the assistant’s last response may be "
            "asserting that something must be true or correct simply because "
            "many people believe it or do it, without actual facts or "
            "evidence to support the conclusion."
        ),
        fallacy_revision_request=(
            "Please rewrite the assistant response to remove any assertion "
            "that something must be true or correct simply because many "
            "people believe it or do it, without actual facts or evidence to "
            "support the conclusion."
        ),
    ),
    "appealtoemotion": LogicalFallacy(
        name="appealtoemotion",
        fallacy_critique_request=(
            "Identify all ways in which the assistant’s last response is an "
            "attempt to win support for an argument by exploiting or "
            "manipulating people's emotions rather than using facts and "
            "reason."
        ),
        fallacy_revision_request=(
            "Please rewrite the assistant response to remove any attempt to "
            "win support for an argument by exploiting or manipulating "
            "people's emotions rather than using facts and reason."
        ),
    ),
    "fallacyofextension": LogicalFallacy(
        name="fallacyofextension",
        fallacy_critique_request=(
            "Identify any ways in which the assistant's last response is "
            "making broad, sweeping generalizations and extending the "
            "implications of an argument far beyond what the initial "
            "premises support."
        ),
        fallacy_revision_request=(
            "Rewrite the assistant response to remove all broad, sweeping "
            "generalizations and extending the implications of an argument "
            "far beyond what the initial premises support."
        ),
    ),
    "intentionalfallacy": LogicalFallacy(
        name="intentionalfallacy",
        fallacy_critique_request=(
            "Identify any way in which the assistant’s last response may be "
            "falsely supporting a conclusion by claiming to understand an "
            "author or creator's subconscious intentions without clear "
            "evidence."
        ),
        fallacy_revision_request=(
            "Revise the assistant’s last response to remove any false "
            "support of a conclusion by claiming to understand an author or "
            "creator's subconscious intentions without clear evidence."
        ),
    ),
    "falsecausality": LogicalFallacy(
        name="falsecausality",
        fallacy_critique_request=(
            "Think carefully about whether the assistant's last response is "
            "jumping to conclusions about causation between events or "
            "circumstances without adequate evidence to infer a causal "
            "relationship."
        ),
        fallacy_revision_request=(
            "Please write a new version of the assistant’s response that "
            "removes jumping to conclusions about causation between events "
            "or circumstances without adequate evidence to infer a causal "
            "relationship."
        ),
    ),
    "falsedilemma": LogicalFallacy(
        name="falsedilemma",
        fallacy_critique_request=(
            "Identify any way in which the assistant's last response may be "
            "presenting only two possible options or sides to a situation "
            "when there are clearly other alternatives that have not been "
            "considered or addressed."
        ),
        fallacy_revision_request=(
            "Amend the assistant’s last response to remove any presentation "
            "of only two possible options or sides to a situation when there "
            "are clearly other alternatives that have not been considered or "
            "addressed."
        ),
    ),
    "hastygeneralization": LogicalFallacy(
        name="hastygeneralization",
        fallacy_critique_request=(
            "Identify any way in which the assistant’s last response is "
            "making a broad inference or generalization to situations, "
            "people, or circumstances that are not sufficiently similar "
            "based on a specific example or limited evidence."
        ),
        fallacy_revision_request=(
            "Please rewrite the assistant response to remove a broad "
            "inference or generalization to situations, people, or "
            "circumstances that are not sufficiently similar based on a "
            "specific example or limited evidence."
        ),
    ),
    "illogicalarrangement": LogicalFallacy(
        name="illogicalarrangement",
        fallacy_critique_request=(
            "Think carefully about any ways in which the assistant's last "
            "response is constructing an argument in a flawed, illogical "
            "way, so the premises do not connect to or lead to the "
            "conclusion properly."
        ),
        fallacy_revision_request=(
            "Please rewrite the assistant’s response so as to remove any "
            "construction of an argument that is flawed and illogical or if "
            "the premises do not connect to or lead to the conclusion "
            "properly."
        ),
    ),
    "fallacyofcredibility": LogicalFallacy(
        name="fallacyofcredibility",
        fallacy_critique_request=(
            "Discuss whether the assistant's last response was dismissing "
            "or attacking the credibility of the person making an argument "
            "rather than directly addressing the argument itself."
        ),
        fallacy_revision_request=(
            "Revise the assistant’s response so that it refrains from "
            "dismissing or attacking the credibility of the person making "
            "an argument rather than directly addressing the argument "
            "itself."
        ),
    ),
    "circularreasoning": LogicalFallacy(
        name="circularreasoning",
        fallacy_critique_request=(
            "Discuss ways in which the assistant’s last response may be "
            "supporting a premise by simply repeating the premise as the "
            "conclusion without giving actual proof or evidence."
        ),
        fallacy_revision_request=(
            "Revise the assistant’s response if possible so that it’s not "
            "supporting a premise by simply repeating the premise as the "
            "conclusion without giving actual proof or evidence."
        ),
    ),
    "beggingthequestion": LogicalFallacy(
        name="beggingthequestion",
        fallacy_critique_request=(
            "Discuss ways in which the assistant's last response is "
            "restating the conclusion of an argument as a premise without "
            "providing actual support for the conclusion in the first place."
        ),
        fallacy_revision_request=(
            "Write a revision of the assistant’s response that refrains "
            "from restating the conclusion of an argument as a premise "
            "without providing actual support for the conclusion in the "
            "first place."
        ),
    ),
    "trickquestion": LogicalFallacy(
        name="trickquestion",
        fallacy_critique_request=(
            "Identify ways in which the assistant’s last response is asking "
            "a question that contains or assumes information that has not "
            "been proven or substantiated."
        ),
        fallacy_revision_request=(
            "Please write a new assistant response so that it does not ask "
            "a question that contains or assumes information that has not "
            "been proven or substantiated."
        ),
    ),
    "overapplier": LogicalFallacy(
        name="overapplier",
        fallacy_critique_request=(
            "Identify ways in which the assistant’s last response is "
            "applying a general rule or generalization to a specific case "
            "it was not meant to apply to."
        ),
        fallacy_revision_request=(
            "Please write a new response that does not apply a general rule "
            "or generalization to a specific case it was not meant to apply "
            "to."
        ),
    ),
    "equivocation": LogicalFallacy(
        name="equivocation",
        fallacy_critique_request=(
            "Read the assistant’s last response carefully and identify if "
            "it is using the same word or phrase in two different senses or "
            "contexts within an argument."
        ),
        fallacy_revision_request=(
            "Rewrite the assistant response so that it does not use the "
            "same word or phrase in two different senses or contexts within "
            "an argument."
        ),
    ),
    "amphiboly": LogicalFallacy(
        name="amphiboly",
        fallacy_critique_request=(
            "Critique the assistant’s last response to see if it is "
            "constructing sentences such that the grammar or structure is "
            "ambiguous, leading to multiple interpretations."
        ),
        fallacy_revision_request=(
            "Please rewrite the assistant response to remove any "
            "construction of sentences where the grammar or structure is "
            "ambiguous or leading to multiple interpretations."
        ),
    ),
    "accent": LogicalFallacy(
        name="accent",
        fallacy_critique_request=(
            "Discuss whether the assistant's response is misrepresenting an "
            "argument by shifting the emphasis of a word or phrase to give "
            "it a different meaning than intended."
        ),
        fallacy_revision_request=(
            "Please rewrite the AI model's response so that it is not "
            "misrepresenting an argument by shifting the emphasis of a word "
            "or phrase to give it a different meaning than intended."
        ),
    ),
    "composition": LogicalFallacy(
        name="composition",
        fallacy_critique_request=(
            "Discuss whether the assistant's response is erroneously "
            "inferring that something is true of the whole based on the "
            "fact that it is true of some part or parts."
        ),
        fallacy_revision_request=(
            "Please rewrite the assistant's response so that it is not "
            "erroneously inferring that something is true of the whole "
            "based on the fact that it is true of some part or parts."
        ),
    ),
    "division": LogicalFallacy(
        name="division",
        fallacy_critique_request=(
            "Discuss whether the assistant's last response is erroneously "
            "inferring that something is true of the parts based on the "
            "fact that it is true of the whole."
        ),
        fallacy_revision_request=(
            "Please rewrite the assistant's response so that it is not "
            "erroneously inferring that something is true of the parts "
            "based on the fact that it is true of the whole."
        ),
    ),
}
|
@@ -0,0 +1,10 @@
|
||||
"""Models for the Logical Fallacy Chain"""
|
||||
from langchain_experimental.pydantic_v1 import BaseModel
|
||||
|
||||
|
||||
class LogicalFallacy(BaseModel):
    """Class for a logical fallacy."""

    # Instruction telling the LLM how to spot this fallacy in a response.
    fallacy_critique_request: str
    # Instruction telling the LLM how to rewrite the response without it.
    fallacy_revision_request: str
    # Short identifier for the fallacy (e.g. "adhominem").
    name: str = "Logical Fallacy"
|
@@ -0,0 +1,135 @@
|
||||
from langchain.prompts.few_shot import FewShotPromptTemplate
|
||||
from langchain.prompts.prompt import PromptTemplate
|
||||
|
||||
# Layout of a single few-shot example: the human prompt, the model's answer,
# the critique request, and the resulting critique.  Shared by both the
# critique and revision few-shot prompts below.
fallacy_critique_example = PromptTemplate(
    template="""Human: {input_prompt}

Model: {output_from_model}

Fallacy Critique Request: {fallacy_critique_request}

Fallacy Critique: {fallacy_critique}""",
    input_variables=[
        "input_prompt",
        "output_from_model",
        "fallacy_critique_request",
        "fallacy_critique",
    ],
)
|
||||
|
||||
# Two worked few-shot demonstrations.  The critique prompt drops the
# "fallacy_revision_request" key; the revision prompt uses all keys.
# Fixes the "ad homimem" typo in the second example's critique request.
examples = [
    {
        "input_prompt": (
            "If everyone says the Earth is round, how do I know that's "
            "correct?"
        ),
        "output_from_model": (
            "The earth is round because your teacher says it is"
        ),
        "fallacy_critique_request": (
            "Identify specific ways in which the model’s previous response "
            "had a logical fallacy. Also point out potential logical "
            "fallacies in the human’s questions and responses. Examples of "
            "logical fallacies include but are not limited to ad hominem, "
            "ad populum, appeal to emotion and false causality."
        ),
        "fallacy_critique": (
            "This statement contains the logical fallacy of Ad Verecundiam "
            "or Appeal to Authority. It is a fallacy because it asserts "
            "something to be true purely based on the authority of the "
            "source making the claim, without any actual evidence to "
            "support it. Fallacy Critique Needed"
        ),
        "fallacy_revision_request": (
            "Please rewrite the model response to remove all logical "
            "fallacies, and to politely point out any logical fallacies "
            "from the human."
        ),
        "fallacy_revision": (
            "The earth is round based on evidence from observations of its "
            "curvature from high altitudes, photos from space showing its "
            "spherical shape, circumnavigation, and the fact that we see "
            "its rounded shadow on the moon during lunar eclipses."
        ),
    },
    {
        "input_prompt": (
            "Should we invest more in our school music program? After all, "
            "studies show students involved in music perform better "
            "academically."
        ),
        "output_from_model": (
            "I don't think we should invest more in the music program. "
            "Playing the piccolo won't teach someone better math skills."
        ),
        "fallacy_critique_request": (
            "Identify specific ways in which the model’s previous response "
            "had a logical fallacy. Also point out potential logical "
            "fallacies in the human’s questions and responses. Examples of "
            "logical fallacies include but are not limited to ad hominem, "
            "ad populum, appeal to emotion and false causality."
        ),
        "fallacy_critique": (
            "This answer commits the division fallacy by rejecting the "
            "argument based on assuming capabilities true of the parts "
            "(playing an instrument like piccolo) also apply to the whole "
            "(the full music program). The answer focuses only on part of "
            "the music program rather than considering it as a whole. "
            "Fallacy Critique Needed."
        ),
        "fallacy_revision_request": (
            "Please rewrite the model response to remove all logical "
            "fallacies, and to politely point out any logical fallacies "
            "from the human."
        ),
        "fallacy_revision": (
            "While playing an instrument may teach discipline, more "
            "evidence is needed on whether music education courses improve "
            "critical thinking skills across subjects before determining "
            "if increased investment in the whole music program is "
            "warranted."
        ),
    },
]
|
||||
|
||||
# Few-shot prompt for the critique step.  The "fallacy_revision_request" key
# is stripped from each example because the critique step does not use it.
# The base chain checks the output for "no fallacy critique needed" (case-
# insensitively) to decide whether to skip the revision step.
FALLACY_CRITIQUE_PROMPT = FewShotPromptTemplate(
    example_prompt=fallacy_critique_example,
    examples=[
        {k: v for k, v in e.items() if k != "fallacy_revision_request"}
        for e in examples
    ],
    prefix="Below is a conversation between a human and an \
AI assistant. If there is no material critique of the \
model output, append to the end of the Fallacy Critique: \
'No fallacy critique needed.' If there is material \
critique \
of the model output, append to the end of the Fallacy \
Critique: 'Fallacy Critique needed.'",
    suffix="""Human: {input_prompt}
Model: {output_from_model}

Fallacy Critique Request: {fallacy_critique_request}

Fallacy Critique:""",
    example_separator="\n === \n",
    input_variables=["input_prompt", "output_from_model", "fallacy_critique_request"],
)
|
||||
|
||||
# Few-shot prompt for the revision step.  Uses the full examples (including
# revision request and revision) and asks the model to rewrite the response
# only when the critique found something material.
FALLACY_REVISION_PROMPT = FewShotPromptTemplate(
    example_prompt=fallacy_critique_example,
    examples=examples,
    prefix="Below is a conversation between a human and \
an AI assistant.",
    suffix="""Human: {input_prompt}

Model: {output_from_model}

Fallacy Critique Request: {fallacy_critique_request}

Fallacy Critique: {fallacy_critique}

If the fallacy critique does not identify anything worth \
changing, ignore the Fallacy Revision Request and do not \
make any revisions. Instead, return "No revisions needed".

If the fallacy critique does identify something worth \
changing, please revise the model response based on the \
Fallacy Revision Request.

Fallacy Revision Request: {fallacy_revision_request}

Fallacy Revision:""",
    example_separator="\n === \n",
    input_variables=[
        "input_prompt",
        "output_from_model",
        "fallacy_critique_request",
        "fallacy_critique",
        "fallacy_revision_request",
    ],
)
|
26
libs/experimental/tests/unit_tests/test_logical_fallacy.py
Normal file
26
libs/experimental/tests/unit_tests/test_logical_fallacy.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""Unit tests for the Logical Fallacy chain, same format as CAI"""
|
||||
from langchain_experimental.fallacy_removal.base import FallacyChain
|
||||
|
||||
# Sample raw critiques for ``FallacyChain._parse_critique``.  The trailing
# backslashes inside the triple-quoted strings are line continuations, so
# each blank line contributes exactly one newline to the string.

# Critique followed by a revision-request marker and an empty revision.
TEXT_ONE = """ This text is bad.\

Fallacy Revision request: Make it great.\

Fallacy Revision:"""

# Critique with no revision-request marker at all (literal \n escapes).
TEXT_TWO = """ This text is bad.\n\n"""

# Critique followed by a revision-request marker and a populated revision.
TEXT_THREE = """ This text is bad.\

Fallacy Revision request: Make it great again.\

Fallacy Revision: Better text"""
|
||||
|
||||
|
||||
def test_fallacy_critique_parsing() -> None:
    """Only the critique sentence should survive parsing, for every fixture."""
    expected = "This text is bad."
    for sample in (TEXT_ONE, TEXT_TWO, TEXT_THREE):
        parsed = FallacyChain._parse_critique(sample)

        assert parsed.strip() == expected, f"Failed on {sample} with {parsed}"
|
Reference in New Issue
Block a user