"""Analyze git diffs to determine which directories need to be tested. Intelligently determines which LangChain packages and directories need to be tested, linted, or built based on the changes. Handles dependency relationships between packages, maps file changes to appropriate CI job configurations, and outputs JSON configurations for GitHub Actions. - Maps changed files to affected package directories (libs/core, libs/partners/*, etc.) - Builds dependency graph to include dependent packages when core components change - Generates test matrix configurations with appropriate Python versions - Handles special cases for Pydantic version testing and performance benchmarks Used as part of the check_diffs workflow. """ import glob import json import os import sys from collections import defaultdict from pathlib import Path from typing import Dict, List, Set import tomllib from get_min_versions import get_min_version_from_toml from packaging.requirements import Requirement LANGCHAIN_DIRS = [ "libs/core", "libs/text-splitters", "libs/langchain", "libs/langchain_v1", ] # When set to True, we are ignoring core dependents # in order to be able to get CI to pass for each individual # package that depends on core # e.g. if you touch core, we don't then add textsplitters/etc to CI IGNORE_CORE_DEPENDENTS = False # ignored partners are removed from dependents # but still run if directly edited IGNORED_PARTNERS = [ # remove huggingface from dependents because of CI instability # specifically in huggingface jobs # https://github.com/langchain-ai/langchain/issues/25558 "huggingface", # prompty exhibiting issues with numpy for Python 3.13 # https://github.com/langchain-ai/langchain/actions/runs/12651104685/job/35251034969?pr=29065 "prompty", ] PY_312_MAX_PACKAGES = [ "libs/partners/chroma", # https://github.com/chroma-core/chroma/issues/4382 ] def all_package_dirs() -> Set[str]: return { "/".join(path.split("/")[:-1]).lstrip("./") for path in glob.glob("./libs/**/pyproject.toml", recursive=True) if "libs/cli" not in path and "libs/standard-tests" not in path } def dependents_graph() -> dict: """Construct a mapping of package -> dependents Done such that we can run tests on all dependents of a package when a change is made. 
""" dependents = defaultdict(set) for path in glob.glob("./libs/**/pyproject.toml", recursive=True): if "template" in path: continue # load regular and test deps from pyproject.toml with open(path, "rb") as f: pyproject = tomllib.load(f) pkg_dir = "libs" + "/".join(path.split("libs")[1].split("/")[:-1]) for dep in [ *pyproject["project"]["dependencies"], *pyproject["dependency-groups"]["test"], ]: requirement = Requirement(dep) package_name = requirement.name if "langchain" in dep: dependents[package_name].add(pkg_dir) continue # load extended deps from extended_testing_deps.txt package_path = Path(path).parent extended_requirement_path = package_path / "extended_testing_deps.txt" if extended_requirement_path.exists(): with open(extended_requirement_path, "r") as f: extended_deps = f.read().splitlines() for depline in extended_deps: if depline.startswith("-e "): # editable dependency assert depline.startswith("-e ../partners/"), ( "Extended test deps should only editable install partner packages" ) partner = depline.split("partners/")[1] dep = f"langchain-{partner}" else: dep = depline.split("==")[0] if "langchain" in dep: dependents[dep].add(pkg_dir) for k in dependents: for partner in IGNORED_PARTNERS: if f"libs/partners/{partner}" in dependents[k]: dependents[k].remove(f"libs/partners/{partner}") return dependents def add_dependents(dirs_to_eval: Set[str], dependents: dict) -> List[str]: updated = set() for dir_ in dirs_to_eval: # handle core manually because it has so many dependents if "core" in dir_: updated.add(dir_) continue pkg = "langchain-" + dir_.split("/")[-1] updated.update(dependents[pkg]) updated.add(dir_) return list(updated) def _get_configs_for_single_dir(job: str, dir_: str) -> List[Dict[str, str]]: if job == "test-pydantic": return _get_pydantic_test_configs(dir_) if job == "codspeed": py_versions = ["3.12"] # 3.13 is not yet supported elif dir_ == "libs/core": py_versions = ["3.9", "3.10", "3.11", "3.12", "3.13"] # custom logic for specific directories elif dir_ in PY_312_MAX_PACKAGES: py_versions = ["3.9", "3.12"] elif dir_ == "libs/langchain" and job == "extended-tests": py_versions = ["3.9", "3.13"] elif dir_ == "libs/langchain_v1": py_versions = ["3.10", "3.13"] elif dir_ in {"libs/cli"}: py_versions = ["3.10", "3.13"] elif dir_ == ".": # unable to install with 3.13 because tokenizers doesn't support 3.13 yet py_versions = ["3.9", "3.12"] else: py_versions = ["3.9", "3.13"] return [{"working-directory": dir_, "python-version": py_v} for py_v in py_versions] def _get_pydantic_test_configs( dir_: str, *, python_version: str = "3.11" ) -> List[Dict[str, str]]: with open("./libs/core/uv.lock", "rb") as f: core_uv_lock_data = tomllib.load(f) for package in core_uv_lock_data["package"]: if package["name"] == "pydantic": core_max_pydantic_minor = package["version"].split(".")[1] break with open(f"./{dir_}/uv.lock", "rb") as f: dir_uv_lock_data = tomllib.load(f) for package in dir_uv_lock_data["package"]: if package["name"] == "pydantic": dir_max_pydantic_minor = package["version"].split(".")[1] break core_min_pydantic_version = get_min_version_from_toml( "./libs/core/pyproject.toml", "release", python_version, include=["pydantic"] )["pydantic"] core_min_pydantic_minor = ( core_min_pydantic_version.split(".")[1] if "." 
def _get_configs_for_multi_dirs(
    job: str, dirs_to_run: Dict[str, Set[str]], dependents: dict
) -> List[Dict[str, str]]:
    if job == "lint":
        dirs = add_dependents(
            dirs_to_run["lint"] | dirs_to_run["test"] | dirs_to_run["extended-test"],
            dependents,
        )
    elif job in ["test", "compile-integration-tests", "dependencies", "test-pydantic"]:
        dirs = add_dependents(
            dirs_to_run["test"] | dirs_to_run["extended-test"], dependents
        )
    elif job == "extended-tests":
        dirs = list(dirs_to_run["extended-test"])
    elif job == "codspeed":
        dirs = list(dirs_to_run["codspeed"])
    else:
        raise ValueError(f"Unknown job: {job}")

    return [
        config for dir_ in dirs for config in _get_configs_for_single_dir(job, dir_)
    ]

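# A minimal sketch of how the helpers above compose (hypothetical diff):
# for a change touching only libs/text-splitters, the "test" job fans out to
# that directory plus every dependent of langchain-text-splitters, and each
# directory then expands to one config per supported Python version, e.g.
#
#     _get_configs_for_multi_dirs("test", dirs_to_run, dependents)
#     # -> [{"working-directory": "libs/text-splitters", "python-version": "3.9"},
#     #     {"working-directory": "libs/text-splitters", "python-version": "3.13"},
#     #     ...configs for dependent packages...]
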
if __name__ == "__main__":
    files = sys.argv[1:]

    dirs_to_run: Dict[str, set] = {
        "lint": set(),
        "test": set(),
        "extended-test": set(),
        "codspeed": set(),
    }
    docs_edited = False

    if len(files) >= 300:
        # max diff length is 300 files - there are likely files missing
        dirs_to_run["lint"] = all_package_dirs()
        dirs_to_run["test"] = all_package_dirs()
        dirs_to_run["extended-test"] = set(LANGCHAIN_DIRS)

    for file in files:
        if any(
            file.startswith(dir_)
            for dir_ in (
                ".github/workflows",
                ".github/tools",
                ".github/actions",
                ".github/scripts/check_diff.py",
            )
        ):
            # add all LANGCHAIN_DIRS for infra changes
            dirs_to_run["extended-test"].update(LANGCHAIN_DIRS)
            dirs_to_run["lint"].add(".")

        if file.startswith("libs/core"):
            dirs_to_run["codspeed"].add("libs/core")

        if any(file.startswith(dir_) for dir_ in LANGCHAIN_DIRS):
            # add that dir and all dirs after it in LANGCHAIN_DIRS
            # for extended testing
            found = False
            for dir_ in LANGCHAIN_DIRS:
                if dir_ == "libs/core" and IGNORE_CORE_DEPENDENTS:
                    dirs_to_run["extended-test"].add(dir_)
                    continue
                if file.startswith(dir_):
                    found = True
                if found:
                    dirs_to_run["extended-test"].add(dir_)
        elif file.startswith("libs/standard-tests"):
            # TODO: update to include all packages that rely on standard-tests
            # (all partner packages)
            # Note: won't run on external repo partners
            dirs_to_run["lint"].add("libs/standard-tests")
            dirs_to_run["test"].add("libs/standard-tests")
            dirs_to_run["lint"].add("libs/cli")
            dirs_to_run["test"].add("libs/cli")
            dirs_to_run["test"].add("libs/partners/mistralai")
            dirs_to_run["test"].add("libs/partners/openai")
            dirs_to_run["test"].add("libs/partners/anthropic")
            dirs_to_run["test"].add("libs/partners/fireworks")
            dirs_to_run["test"].add("libs/partners/groq")
        elif file.startswith("libs/cli"):
            dirs_to_run["lint"].add("libs/cli")
            dirs_to_run["test"].add("libs/cli")
        elif file.startswith("libs/partners"):
            partner_dir = file.split("/")[2]
            if os.path.isdir(f"libs/partners/{partner_dir}") and [
                filename
                for filename in os.listdir(f"libs/partners/{partner_dir}")
                if not filename.startswith(".")
            ] != ["README.md"]:
                dirs_to_run["test"].add(f"libs/partners/{partner_dir}")
                dirs_to_run["codspeed"].add(f"libs/partners/{partner_dir}")
            # Skip if the directory was deleted or is just a tombstone README
        elif file == "libs/packages.yml":
            continue
        elif file.startswith("libs/"):
            raise ValueError(
                f"Unknown lib: {file}. check_diff.py likely needs "
                "an update for this new library!"
            )
        elif file.startswith("docs/") or file in [
            "pyproject.toml",
            "uv.lock",
        ]:  # docs or root uv files
            docs_edited = True
            dirs_to_run["lint"].add(".")

    dependents = dependents_graph()

    # we now have dirs_by_job
    # todo: clean this up
    map_job_to_configs = {
        job: _get_configs_for_multi_dirs(job, dirs_to_run, dependents)
        for job in [
            "lint",
            "test",
            "extended-tests",
            "compile-integration-tests",
            "dependencies",
            "test-pydantic",
            "codspeed",
        ]
    }
    map_job_to_configs["test-doc-imports"] = (
        [{"python-version": "3.12"}] if docs_edited else []
    )

    for key, value in map_job_to_configs.items():
        json_output = json.dumps(value)
        print(f"{key}={json_output}")
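# Example invocation, mirroring how the check_diffs workflow calls this script
# (the file list comes from the computed diff; output lines below are abridged
# and illustrative):
#
#     $ python .github/scripts/check_diff.py libs/core/README.md
#     lint=[{"working-directory": "libs/core", "python-version": "3.9"}, ...]
#     test=[...]
#     ...
#     test-doc-imports=[]
#
# Each `<job>=<json>` line is consumed by GitHub Actions as a matrix config.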