gpu: Add embedding service

For a simple RAG pipeline, add an embedding service.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>

Update tests/integration/kubernetes/k8s-nvidia-nim.bats

Co-authored-by: Steve Horsman <steven@uk.ibm.com>
This commit is contained in:
Zvonko Kaiser
2025-05-09 14:57:47 +00:00
parent 3ae6d48ce7
commit f07fe13330
3 changed files with 245 additions and 4 deletions

View File

@@ -111,7 +111,6 @@ jobs:
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ inputs.target-branch }}
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
- name: Parse OCI image name and digest
id: parse-oci-segments
@@ -220,7 +219,6 @@ jobs:
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ inputs.target-branch }}
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
- name: store-artifact ${{ matrix.asset }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2

View File

@@ -37,7 +37,6 @@ jobs:
k8s:
- kubeadm
runs-on: amd64-nvidia-a100
environment: nvidia
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}

View File

@@ -28,6 +28,7 @@ setup_file() {
[[ -d ${PYENV_ROOT}/bin ]] && export PATH="${PYENV_ROOT}/bin:${PATH}"
eval "$(pyenv init - bash)"
# shellcheck disable=SC1091 # Virtual environment will be created during test execution
python3 -m venv "${HOME}"/.cicd/venv
get_pod_config_dir
@@ -35,20 +36,36 @@ setup_file() {
pod_instruct_yaml_in="${pod_config_dir}/${POD_NAME_INSTRUCT}.yaml.in"
pod_instruct_yaml="${pod_config_dir}/${POD_NAME_INSTRUCT}.yaml"
pod_embedqa_yaml_in="${pod_config_dir}/${POD_NAME_EMBEDQA}.yaml.in"
pod_embedqa_yaml="${pod_config_dir}/${POD_NAME_EMBEDQA}.yaml"
envsubst <"${pod_instruct_yaml_in}" >"${pod_instruct_yaml}"
envsubst <"${pod_embedqa_yaml_in}" >"${pod_embedqa_yaml}"
export POD_INSTRUCT_YAML="${pod_instruct_yaml}"
export POD_EMBEDQA_YAML="${pod_embedqa_yaml}"
}
@test "NVIDIA NIM Llama 3.1-8b Instruct" {
@test "NVIDIA NIM Llama 3.1-8b Instruct & NVIDIA NIM Llama 3.2 EmbedQA-1b-v2" {
	# Deploy both NIM pods from the manifests rendered in setup_file.
	kubectl apply -f "${POD_INSTRUCT_YAML}"
	kubectl apply -f "${POD_EMBEDQA_YAML}"

	# Block until each pod reports Ready (or the 500s timeout expires).
	kubectl wait --for=condition=Ready --timeout=500s pod "${POD_NAME_INSTRUCT}"
	kubectl wait --for=condition=Ready --timeout=500s pod "${POD_NAME_EMBEDQA}"

	# Resolve the pod IPs; an empty IP fails the test.
	# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
	POD_IP_INSTRUCT="$(kubectl get pod "${POD_NAME_INSTRUCT}" -o jsonpath='{.status.podIP}')"
	[[ -n "${POD_IP_INSTRUCT}" ]]

	# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
	POD_IP_EMBEDQA="$(kubectl get pod "${POD_NAME_EMBEDQA}" -o jsonpath='{.status.podIP}')"
	[[ -n "${POD_IP_EMBEDQA}" ]]

	# Later tests source this file to learn the pod IPs, so (re)create it
	# here with both assignments.
	{
		echo "POD_IP_INSTRUCT=${POD_IP_INSTRUCT}"
		echo "POD_IP_EMBEDQA=${POD_IP_EMBEDQA}"
	} >"${BATS_SUITE_TMPDIR}/env"

	# Surface the addresses in the test log.
	echo "# POD_IP_INSTRUCT=${POD_IP_INSTRUCT}" >&3
	echo "# POD_IP_EMBEDQA=${POD_IP_EMBEDQA}" >&3
}
@test "List of models available for inference" {
@@ -94,6 +111,233 @@ setup_file() {
echo "# ANSWER: ${ANSWER}" >&3
}
@test "Setup the LangChain flow" {
	# Pinned Python dependencies, installed in this order. Each entry is
	# "<package>==<version>".
	local -a requirements=(
		"langchain==0.2.5"
		"langchain-nvidia-ai-endpoints==0.1.2"
		"faiss-gpu==1.7.2"
		"langchain-community==0.2.5"
		"beautifulsoup4==4.13.4"
	)
	local requirement package wanted

	# shellcheck disable=SC1091 # Sourcing virtual environment activation script
	source "${HOME}"/.cicd/venv/bin/activate

	pip install --upgrade pip

	# Only call "pip install" when the installed version (if any) differs
	# from the pinned one.
	for requirement in "${requirements[@]}"; do
		package="${requirement%%==*}"
		wanted="${requirement#*==}"
		if [[ "$(pip show "${package}" 2>/dev/null | awk '/^Version:/{print $2}')" != "${wanted}" ]]; then
			pip install "${requirement}"
		fi
	done
}
@test "LangChain NVIDIA AI Endpoints" {
	# Location of the generated LangChain client script.
	local script_path="${HOME}/.cicd/venv/langchain_nim.py"

	# Pick up the values recorded by earlier tests.
	# shellcheck disable=SC1091 # File is created by previous test
	source "${BATS_SUITE_TMPDIR}/env"

	# Both values are interpolated into the generated script below, so
	# fail early when either one is missing.
	# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
	[[ -n "${POD_IP_INSTRUCT}" ]]
	# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
	[[ -n "${MODEL_NAME}" ]]

	QUESTION="What is the capital of France?"
	ANSWER="The capital of France is Paris."

	# shellcheck disable=SC1091 # Sourcing virtual environment activation script
	source "${HOME}"/.cicd/venv/bin/activate

	# Unquoted heredoc on purpose: the shell fills in the endpoint, model
	# name and question before Python ever sees the file.
	# shellcheck disable=SC2031 # Variables are used in heredoc, not subshell
	cat <<EOF >"${script_path}"
from langchain_nvidia_ai_endpoints import ChatNVIDIA
llm = ChatNVIDIA(base_url="http://${POD_IP_INSTRUCT}:8000/v1", model="${MODEL_NAME}", temperature=0.1, max_tokens=1000, top_p=1.0)
result = llm.invoke("${QUESTION}")
print(result.content)
EOF

	run python3 "${script_path}"

	# The script must succeed and print exactly the expected sentence.
	[[ "${status}" -eq 0 ]]
	[[ "${output}" == "${ANSWER}" ]]
}
@test "Kata Documentation RAG" {
	# End-to-end retrieval-augmented-generation check: scrape a few Kata
	# pages, embed them through the EmbedQA service into a FAISS index, and
	# ask the instruct model a question backed by that index.

	# shellcheck disable=SC1091 # File is created by previous test
	source "${BATS_SUITE_TMPDIR}/env"
	# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
	[[ -n "${POD_IP_EMBEDQA}" ]]
	# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
	[[ -n "${POD_IP_INSTRUCT}" ]]
	# Use the model name recorded in the env file, as the
	# "LangChain NVIDIA AI Endpoints" test does, instead of a hard-coded one.
	# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
	[[ -n "${MODEL_NAME}" ]]

	# shellcheck disable=SC1091 # Sourcing virtual environment activation script
	source "${HOME}"/.cicd/venv/bin/activate

	# The Python script is assembled from several heredocs. Sections that
	# are pure Python use a quoted delimiter ('EOF') so the shell cannot
	# expand anything inside them; sections that need shell variables use
	# an unquoted delimiter.
	cat <<'EOF' >"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
import os
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferMemory
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
EOF

	# shellcheck disable=SC2129 # Multiple heredocs are intentional for building the Python script
	cat <<'EOF' >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
import re
from typing import List, Union

import requests
from bs4 import BeautifulSoup


def html_document_loader(url: Union[str, bytes]) -> str:
    """
    Loads the HTML content of a document from a given URL and returns its content.

    Args:
        url: The URL of the document.

    Returns:
        The content of the document, or an empty string if loading or
        parsing failed.
    """
    try:
        response = requests.get(url)
        html_content = response.text
    except Exception as e:
        print(f"Failed to load {url} due to exception {e}")
        return ""

    try:
        # Create a Beautiful Soup object to parse html
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style tags
        for script in soup(["script", "style"]):
            script.extract()

        # Get the plain text from the HTML document
        text = soup.get_text()

        # Remove excess whitespace and newlines. Raw string: "\s" is not a
        # valid escape sequence in a normal Python string literal.
        text = re.sub(r"\s+", " ", text).strip()

        return text
    except Exception as e:
        print(f"Exception {e} while loading document")
        return ""
EOF

	cat <<'EOF' >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py


def create_embeddings(embedding_path: str = "./data/nv_embedding"):
    print(f"Storing embeddings to {embedding_path}")

    # List of web pages containing Kata technical documentation
    urls = [
        "https://katacontainers.io/",
        "https://katacontainers.io/learn",
        "https://github.com/kata-containers/kata-containers/releases",
    ]

    documents = []
    for url in urls:
        document = html_document_loader(url)
        documents.append(document)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    texts = text_splitter.create_documents(documents)
    index_docs(url, text_splitter, texts, embedding_path)
    print("Generated embedding successfully")
EOF

	# shellcheck disable=SC2031 # POD_IP_EMBEDQA is shared via file between BATS tests
	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py


def index_docs(url: Union[str, bytes], splitter, documents: List[str], dest_embed_dir) -> None:
    """
    Split the document into chunks and create embeddings for the document

    Args:
        url: Source url for the document.
        splitter: Splitter used to split the document
        documents: list of documents whose embeddings needs to be created
        dest_embed_dir: destination directory for embeddings

    Returns:
        None
    """
    embeddings = NVIDIAEmbeddings(base_url="http://${POD_IP_EMBEDQA}:8000/v1", model="nvidia/llama-3.2-nv-embedqa-1b-v2")

    for document in documents:
        texts = splitter.split_text(document.page_content)

        # metadata to attach to document
        metadatas = [document.metadata]

        # create embeddings and add to vector store
        if os.path.exists(dest_embed_dir):
            update = FAISS.load_local(folder_path=dest_embed_dir, embeddings=embeddings, allow_dangerous_deserialization=True)
            update.add_texts(texts, metadatas=metadatas)
            update.save_local(folder_path=dest_embed_dir)
        else:
            docsearch = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
            docsearch.save_local(folder_path=dest_embed_dir)
EOF

	# shellcheck disable=SC2031 # POD_IP_EMBEDQA is shared via file between BATS tests
	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py


create_embeddings()

embedding_model = NVIDIAEmbeddings(base_url="http://${POD_IP_EMBEDQA}:8000/v1", model="nvidia/llama-3.2-nv-embedqa-1b-v2")
EOF

	cat <<'EOF' >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py

# Embed documents
embedding_path = "./data/nv_embedding"
docsearch = FAISS.load_local(folder_path=embedding_path, embeddings=embedding_model, allow_dangerous_deserialization=True)
EOF

	# shellcheck disable=SC2031 # Variables are used in heredoc, not subshell
	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py

llm = ChatNVIDIA(base_url="http://${POD_IP_INSTRUCT}:8000/v1", model="${MODEL_NAME}", temperature=0.1, max_tokens=1000, top_p=1.0)

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_prompt=QA_PROMPT

doc_chain = load_qa_chain(llm, chain_type="stuff", prompt=QA_PROMPT)

qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=docsearch.as_retriever(),
    chain_type="stuff",
    memory=memory,
    combine_docs_chain_kwargs={'prompt': qa_prompt},
)
EOF

	QUESTION="What is the latest Kata Containers release?"

	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py

query = "${QUESTION}"
# invoke() replaces the deprecated direct call of the chain object.
result = qa.invoke({"question": query})
# Emit the answer as ONE line prefixed with '#', so the shell side can pick
# it out of the surrounding progress output.
print("#" + " ".join(result.get("answer").split()))
EOF

	run python3 "${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
	[[ "${status}" -eq 0 ]]

	# -s drops lines that contain no '#' (the script's progress prints and
	# any other non-answer output), so ANSWER is non-empty only when the
	# script really printed an answer line. -f2- keeps any further '#'
	# characters that are part of the answer itself.
	ANSWER=$(cut -s -d '#' -f2- <<<"${output}")
	[[ -n "${ANSWER}" ]]

	echo "# QUESTION: ${QUESTION}" >&3
	echo "# ANSWER: ${ANSWER}" >&3
}
# Remove every pod this test file deployed.
#
# Globals:  POD_INSTRUCT_YAML (read), POD_EMBEDQA_YAML (read)
# Returns:  0 when both deletions succeed, otherwise the exit status of the
#           last deletion that failed.
teardown_file() {
	local rc=0

	# Attempt both deletions even if the first one fails, so a problem with
	# one pod does not leave the other one running.
	kubectl delete -f "${POD_INSTRUCT_YAML}" || rc=$?
	kubectl delete -f "${POD_EMBEDQA_YAML}" || rc=$?

	return "${rc}"
}