From a06727a44d3d901803817f3b1e75b5350ea4b670 Mon Sep 17 00:00:00 2001
From: Zvonko Kaiser
Date: Fri, 9 May 2025 14:57:47 +0000
Subject: [PATCH] gpu: Add embedding service

For a simple RAG pipeline add an embedding service

Signed-off-by: Zvonko Kaiser
---
 .../run-k8s-tests-on-nvidia-gpu.yaml          |   1 +
 .../kubernetes/k8s-nvidia-nim.bats            | 228 +++++++++++++++++-
 .../nvidia-nim-llama-3-1-8b-instruct.yaml.in  |  11 +-
 ...dia-nim-llama-3-2-nv-embedqa-1b-v2.yaml.in |  10 +-
 4 files changed, 236 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml b/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
index c6e285a41b..8e250c3d17 100644
--- a/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
+++ b/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
@@ -37,6 +37,7 @@ jobs:
         k8s:
           - kubeadm
     runs-on: amd64-nvidia-a100
+    environment: nvidia
     env:
       DOCKER_REGISTRY: ${{ inputs.registry }}
       DOCKER_REPO: ${{ inputs.repo }}
diff --git a/tests/integration/kubernetes/k8s-nvidia-nim.bats b/tests/integration/kubernetes/k8s-nvidia-nim.bats
index 3181f68d2a..52b2111a9f 100644
--- a/tests/integration/kubernetes/k8s-nvidia-nim.bats
+++ b/tests/integration/kubernetes/k8s-nvidia-nim.bats
@@ -10,10 +10,13 @@ load "${BATS_TEST_DIRNAME}/../../common.bash"
 # shellcheck disable=SC1091
 load "${BATS_TEST_DIRNAME}/tests_common.sh"
 
+RUNTIME_CLASS_NAME=${RUNTIME_CLASS_NAME:-kata-qemu-nvidia-gpu}
+export RUNTIME_CLASS_NAME
+
 export POD_NAME_INSTRUCT="nvidia-nim-llama-3-1-8b-instruct"
 export POD_NAME_EMBEDQA="nvidia-nim-llama-3-2-nv-embedqa-1b-v2"
-export POD_SECRET_INSTRUCT="ngc-secret-instruct"
+export LOCAL_NIM_CACHE="/opt/nim/.cache"
 
 DOCKER_CONFIG_JSON=$(
 	echo -n "{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"${NGC_API_KEY}\",\"auth\":\"$(echo -n "\$oauthtoken:${NGC_API_KEY}" | base64 -w0)\"}}}" |
 		base64 -w0
@@ -28,6 +31,7 @@ setup_file() {
 	[[ -d ${PYENV_ROOT}/bin ]] && export PATH="${PYENV_ROOT}/bin:${PATH}"
 	eval "$(pyenv init - bash)"
 
+	# shellcheck disable=SC1091 # Virtual environment will be created during test execution
 	python3 -m venv "${HOME}"/.cicd/venv
 
 	get_pod_config_dir
@@ -35,20 +39,36 @@ setup_file() {
 	pod_instruct_yaml_in="${pod_config_dir}/${POD_NAME_INSTRUCT}.yaml.in"
 	pod_instruct_yaml="${pod_config_dir}/${POD_NAME_INSTRUCT}.yaml"
 
+	pod_embedqa_yaml_in="${pod_config_dir}/${POD_NAME_EMBEDQA}.yaml.in"
+	pod_embedqa_yaml="${pod_config_dir}/${POD_NAME_EMBEDQA}.yaml"
+
 	envsubst <"${pod_instruct_yaml_in}" >"${pod_instruct_yaml}"
+	envsubst <"${pod_embedqa_yaml_in}" >"${pod_embedqa_yaml}"
 
 	export POD_INSTRUCT_YAML="${pod_instruct_yaml}"
+	export POD_EMBEDQA_YAML="${pod_embedqa_yaml}"
 }
 
-@test "NVIDIA NIM Llama 3.1-8b Instruct" {
+@test "NVIDIA NIM Llama 3.1-8b Instruct & NVIDIA NIM Llama 3.2 EmbedQA-1b-v2" {
 	kubectl apply -f "${POD_INSTRUCT_YAML}"
+	kubectl apply -f "${POD_EMBEDQA_YAML}"
+
+	kubectl wait --for=condition=Ready --timeout=500s pod "${POD_NAME_INSTRUCT}"
+	kubectl wait --for=condition=Ready --timeout=500s pod "${POD_NAME_EMBEDQA}"
 
 	# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
 	POD_IP_INSTRUCT=$(kubectl get pod "${POD_NAME_INSTRUCT}" -o jsonpath='{.status.podIP}')
 	[[ -n "${POD_IP_INSTRUCT}" ]]
+
+	# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
+	POD_IP_EMBEDQA=$(kubectl get pod "${POD_NAME_EMBEDQA}" -o jsonpath='{.status.podIP}')
+	[[ -n "${POD_IP_EMBEDQA}" ]]
 
 	echo "POD_IP_INSTRUCT=${POD_IP_INSTRUCT}" >"${BATS_SUITE_TMPDIR}/env"
 	echo "# POD_IP_INSTRUCT=${POD_IP_INSTRUCT}" >&3
+
+	echo "POD_IP_EMBEDQA=${POD_IP_EMBEDQA}" >>"${BATS_SUITE_TMPDIR}/env"
+	echo "# POD_IP_EMBEDQA=${POD_IP_EMBEDQA}" >&3
 }
 
 @test "List of models available for inference" {
@@ -94,6 +114,210 @@
 	echo "# ANSWER: ${ANSWER}" >&3
 }
 
+@test "Setup the LangChain flow" {
+	# shellcheck disable=SC1091 # Sourcing virtual environment activation script
+	source "${HOME}"/.cicd/venv/bin/activate
+
+	pip install --upgrade pip
+	[[ "$(pip show langchain 2>/dev/null | awk '/^Version:/{print $2}')" = "0.2.5" ]] || pip install langchain==0.2.5
+	[[ "$(pip show langchain-nvidia-ai-endpoints 2>/dev/null | awk '/^Version:/{print $2}')" = "0.1.2" ]] || pip install langchain-nvidia-ai-endpoints==0.1.2
+	[[ "$(pip show faiss-gpu 2>/dev/null | awk '/^Version:/{print $2}')" = "1.7.2" ]] || pip install faiss-gpu==1.7.2
+	[[ "$(pip show langchain-community 2>/dev/null | awk '/^Version:/{print $2}')" = "0.2.5" ]] || pip install langchain-community==0.2.5
+	[[ "$(pip show beautifulsoup4 2>/dev/null | awk '/^Version:/{print $2}')" = "4.13.4" ]] || pip install beautifulsoup4==4.13.4
+}
+
+@test "LangChain NVIDIA AI Endpoints" {
+	# shellcheck disable=SC1091 # File is created by previous test
+	source "${BATS_SUITE_TMPDIR}/env"
+	# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
+	[[ -n "${POD_IP_INSTRUCT}" ]]
+	# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
+	[[ -n "${MODEL_NAME}" ]]
+
+	QUESTION="What is the capital of France?"
+	ANSWER="The capital of France is Paris."
+
+	# shellcheck disable=SC1091 # Sourcing virtual environment activation script
+	source "${HOME}"/.cicd/venv/bin/activate
+	# shellcheck disable=SC2031 # Variables are used in heredoc, not subshell
+	cat <<EOF >"${HOME}"/.cicd/venv/langchain_nim.py
+from langchain_nvidia_ai_endpoints import ChatNVIDIA
+
+llm = ChatNVIDIA(base_url="http://${POD_IP_INSTRUCT}:8000/v1", model="${MODEL_NAME}", temperature=0.1, max_tokens=1000, top_p=1.0)
+
+result = llm.invoke("${QUESTION}")
+print(result.content)
+EOF
+
+	run python3 "${HOME}"/.cicd/venv/langchain_nim.py
+
+	[[ "${status}" -eq 0 ]]
+	[[ "${output}" = "${ANSWER}" ]]
+
+	echo "# QUESTION: ${QUESTION}" >&3
+	echo "# ANSWER: ${ANSWER}" >&3
+}
+
+@test "Kata Documentation RAG" {
+	# shellcheck disable=SC1091 # File is created by previous test
+	source "${BATS_SUITE_TMPDIR}/env"
+	# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
+	[[ -n "${POD_IP_EMBEDQA}" ]]
+	# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
+	[[ -n "${POD_IP_INSTRUCT}" ]]
+
+	# shellcheck disable=SC1091 # Sourcing virtual environment activation script
+	source "${HOME}"/.cicd/venv/bin/activate
+	cat <<EOF >"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
+import os
+from langchain.chains import ConversationalRetrievalChain, LLMChain
+from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
+from langchain.chains.question_answering import load_qa_chain
+from langchain.memory import ConversationBufferMemory
+from langchain_community.vectorstores import FAISS
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_nvidia_ai_endpoints import ChatNVIDIA
+from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
+EOF
+
+	# shellcheck disable=SC2129 # Multiple heredocs are intentional for building the Python script
+	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
+import re
+from typing import List, Union
+
+import requests
+from bs4 import BeautifulSoup
+
+def html_document_loader(url: Union[str, bytes]) -> str:
+    try:
+        response = requests.get(url)
+        html_content = response.text
+    except Exception as e:
+        print(f"Failed to load {url} due to exception {e}")
+        return ""
+
+    try:
+        # Create a Beautiful Soup object to parse html
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove script and style tags
+        for script in soup(["script", "style"]):
+            script.extract()
+
+        # Get the plain text from the HTML document
+        text = soup.get_text()
+
+        # Remove excess whitespace and newlines
+        text = re.sub(r"\s+", " ", text).strip()
+
+        return text
+    except Exception as e:
+        print(f"Exception {e} while loading document")
+        return ""
+
+EOF
+
+	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
+def create_embeddings(embedding_path: str = "./data/nv_embedding"):
+
+    embedding_path = "./data/nv_embedding"
+    print(f"Storing embeddings to {embedding_path}")
+
+    # List of web pages containing Kata technical documentation
+    urls = [
+        "https://github.com/kata-containers/kata-containers/releases",
+    ]
+
+    documents = []
+    for url in urls:
+        document = html_document_loader(url)
+        documents.append(document)
+
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=0,
+        length_function=len,
+    )
+    texts = text_splitter.create_documents(documents)
+    index_docs(url, text_splitter, texts, embedding_path)
+    print("Generated embedding successfully")
+EOF
+
+	# shellcheck disable=SC2031 # POD_IP_EMBEDQA is shared via file between BATS tests
+	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
+def index_docs(url: Union[str, bytes], splitter, documents: List[str], dest_embed_dir) -> None:
+    embeddings = NVIDIAEmbeddings(base_url="http://${POD_IP_EMBEDQA}:8000/v1", model="nvidia/llama-3.2-nv-embedqa-1b-v2")
+
+    for document in documents:
+        texts = splitter.split_text(document.page_content)
+
+        # metadata to attach to document
+        metadatas = [document.metadata]
+
+        # create embeddings and add to vector store
+        if os.path.exists(dest_embed_dir):
+            update = FAISS.load_local(folder_path=dest_embed_dir, embeddings=embeddings, allow_dangerous_deserialization=True)
+            update.add_texts(texts, metadatas=metadatas)
+            update.save_local(folder_path=dest_embed_dir)
+        else:
+            docsearch = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
+            docsearch.save_local(folder_path=dest_embed_dir)
+EOF
+
+	# shellcheck disable=SC2031 # POD_IP_EMBEDQA is shared via file between BATS tests
+	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
+create_embeddings()
+
+embedding_model = NVIDIAEmbeddings(base_url="http://${POD_IP_EMBEDQA}:8000/v1", model="nvidia/llama-3.2-nv-embedqa-1b-v2")
+EOF
+
+	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
+# Embed documents
+embedding_path = "./data/nv_embedding"
+docsearch = FAISS.load_local(folder_path=embedding_path, embeddings=embedding_model, allow_dangerous_deserialization=True)
+EOF
+
+	# shellcheck disable=SC2031 # Variables are used in heredoc, not subshell
+	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
+llm = ChatNVIDIA(base_url="http://${POD_IP_INSTRUCT}:8000/v1", model="meta/llama3-8b-instruct", temperature=0.1, max_tokens=1000, top_p=1.0)
+
+memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+
+qa_prompt=QA_PROMPT
+
+doc_chain = load_qa_chain(llm, chain_type="stuff", prompt=QA_PROMPT)
+
+qa = ConversationalRetrievalChain.from_llm(
+    llm=llm,
+    retriever=docsearch.as_retriever(),
+    chain_type="stuff",
+    memory=memory,
+    combine_docs_chain_kwargs={'prompt': qa_prompt},
+)
+
+EOF
+
+	QUESTION="What is the latest Kata Containers release?"
+
+	cat <<EOF >>"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
+query = "${QUESTION}"
+result = qa.invoke({"question": query})
+print("#"+ result.get("answer"))
+
+EOF
+
+	run python3 "${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
+	[[ "${status}" -eq 0 ]]
+
+	ANSWER=$(echo "${output}" | cut -d '#' -f2)
+	[[ -n "${ANSWER}" ]]
+
+	echo "# QUESTION: ${QUESTION}" >&3
+	echo "# ANSWER: ${ANSWER}" >&3
+}
+
 teardown_file() {
 	kubectl delete -f "${POD_INSTRUCT_YAML}"
+	kubectl delete -f "${POD_EMBEDQA_YAML}"
 }
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct.yaml.in
index 761d46158e..f505278be3 100644
--- a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct.yaml.in
+++ b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct.yaml.in
@@ -1,4 +1,3 @@
-
 # Copyright (c) 2025 NVIDIA Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
@@ -20,13 +19,13 @@ metadata:
     app: ${POD_NAME_INSTRUCT}
 spec:
   restartPolicy: Never
-  runtimeClassName: kata-qemu-nvidia-gpu
+  runtimeClassName: "${RUNTIME_CLASS_NAME}"
   imagePullSecrets:
     - name: ngc-secret-instruct
   securityContext:
-    runAsUser: 0
-    runAsGroup: 0
-    fsGroup: 0
+    runAsUser: 1000
+    runAsGroup: 1000
+    fsGroup: 1000
   containers:
     - name: ${POD_NAME_INSTRUCT}
       image: nvcr.io/nim/meta/llama3-8b-instruct:1.0.0
@@ -84,6 +83,6 @@ spec:
   volumes:
     - name: nim-cache
       hostPath:
-        path: "/opr/nim/.cache"
+        path: "${LOCAL_NIM_CACHE}"
         type: DirectoryOrCreate
 
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2.yaml.in
index 890564da9f..afca3a7839 100644
--- a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2.yaml.in
+++ b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2.yaml.in
@@ -7,7 +7,6 @@ apiVersion: v1
 kind: Secret
 metadata:
   name: ngc-secret-embedqa
-  namespace: nim-embedqa
 type: kubernetes.io/dockerconfigjson
 data:
   .dockerconfigjson: ${DOCKER_CONFIG_JSON}
@@ -16,7 +15,6 @@ apiVersion: v1
 kind: Pod
 metadata:
   name: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
-  namespace: nim-embedqa
   labels:
     app: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
 spec:
@@ -26,9 +24,9 @@ spec:
   imagePullSecrets:
     - name: ngc-secret-embedqa
   securityContext:
-    fsGroup: 0
-    runAsGroup: 0
-    runAsUser: 0
+    fsGroup: 1000
+    runAsGroup: 1000
+    runAsUser: 1000
   containers:
     - name: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
       image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.5.0
@@ -91,5 +89,5 @@ spec:
   volumes:
     - name: nim-cache
       hostPath:
-        path: "/opr/nim/.cache"
+        path: "${LOCAL_NIM_CACHE}"
         type: DirectoryOrCreate