From a046fa478d33d17657bb3a240c714610c3d80ae4 Mon Sep 17 00:00:00 2001
From: Gurvinder Singh
Date: Fri, 10 Jul 2015 11:19:55 +0200
Subject: [PATCH] modified spark example to use kubectl exec to interact with the cluster and create a spark driver pod

---
 examples/spark/README.md                | 57 +++++++++++--------
 examples/spark/images/driver/Dockerfile |  4 ++
 examples/spark/images/driver/README.md  |  0
 examples/spark/images/driver/start.sh   |  9 ++++
 examples/spark/spark-driver.json        | 23 ++++++++++
 5 files changed, 61 insertions(+), 32 deletions(-)
 create mode 100644 examples/spark/images/driver/Dockerfile
 create mode 100644 examples/spark/images/driver/README.md
 create mode 100755 examples/spark/images/driver/start.sh
 create mode 100644 examples/spark/spark-driver.json

diff --git a/examples/spark/README.md b/examples/spark/README.md
index c401311269a..d2ac4575cde 100644
--- a/examples/spark/README.md
+++ b/examples/spark/README.md
@@ -110,44 +110,35 @@ $ kubectl logs spark-master
 15/06/26 14:15:55 INFO Master: Registering worker 10.244.1.15:44839 with 1 cores, 2.6 GB RAM
 15/06/26 14:15:55 INFO Master: Registering worker 10.244.0.19:60970 with 1 cores, 2.6 GB RAM
 ```
-## Step Three: Do something with the cluster
-Get the address and port of the Master service.
+## Step Three: Start your Spark driver to launch jobs on your Spark cluster
+
+The Spark driver is used to launch jobs into the Spark cluster. You can read more about it in
+[Spark architecture](http://spark.apache.org/docs/latest/cluster-overview.html).
 
 ```shell
-$ kubectl get service spark-master
-NAME           LABELS              SELECTOR            IP(S)          PORT(S)
-spark-master   name=spark-master   name=spark-master   10.0.204.187   7077/TCP
+$ kubectl create -f examples/spark/spark-driver.json
+```
+The Spark driver needs the Master service to be running.
+
+### Check to see if the driver is running
+
+```shell
+$ kubectl get pods
+NAME           READY     REASON    RESTARTS   AGE
+[...]
+spark-master   1/1       Running   0          14m
+spark-driver   1/1       Running   0          10m
 ```
 
-SSH to one of your cluster nodes. On GCE/GKE you can either use [Developers Console](https://console.developers.google.com)
-(more details [here](https://cloud.google.com/compute/docs/ssh-in-browser))
-or run `gcloud compute ssh <name>` where the name can be taken from `kubectl get nodes`
-(more details [here](https://cloud.google.com/compute/docs/gcloud-compute/#connecting)).
+## Step Four: Do something with the cluster
+
+Use `kubectl exec` to connect to the Spark driver:
 
 ```
-$ kubectl get nodes
-NAME                     LABELS                                          STATUS
-kubernetes-minion-5jvu   kubernetes.io/hostname=kubernetes-minion-5jvu   Ready
-kubernetes-minion-6fbi   kubernetes.io/hostname=kubernetes-minion-6fbi   Ready
-kubernetes-minion-8y2v   kubernetes.io/hostname=kubernetes-minion-8y2v   Ready
-kubernetes-minion-h0tr   kubernetes.io/hostname=kubernetes-minion-h0tr   Ready
-
-$ gcloud compute ssh kubernetes-minion-5jvu --zone=us-central1-b
-Linux kubernetes-minion-5jvu 3.16.0-0.bpo.4-amd64 #1 SMP Debian 3.16.7-ckt9-3~deb8u1~bpo70+1 (2015-04-27) x86_64
-
-=== GCE Kubernetes node setup complete ===
-
-me@kubernetes-minion-5jvu:~$
-```
-
-Once logged in run spark-base image. Inside of the image there is a script
-that sets up the environment based on the provided IP and port of the Master.
-
-```
-cluster-node $ sudo docker run -it gcr.io/google_containers/spark-base
-root@f12a6fec45ce:/# . /setup_client.sh 10.0.204.187 7077
-root@f12a6fec45ce:/# pyspark
+$ kubectl exec spark-driver -it bash
+root@spark-driver:/#
+root@spark-driver:/# pyspark
 Python 2.7.9 (default, Mar  1 2015, 12:57:24)
 [GCC 4.9.2] on linux2
 Type "help", "copyright", "credits" or "license" for more information.
@@ -166,7 +157,7 @@ SparkContext available as sc, HiveContext available as sqlContext.
 ```
 
 ## Result
-You now have services, replication controllers, and pods for the Spark master and Spark workers.
+You now have services, replication controllers, and pods for the Spark master, the Spark driver, and the Spark workers.
 You can take this example to the next step and start using the Apache Spark cluster you just created, see
 [Spark documentation](https://spark.apache.org/documentation.html) for more information.
 
@@ -181,4 +172,6 @@ Make sure the Master Pod is running (use: ```kubectl get pods```).
 
 ```kubectl create -f spark-worker-controller.json```
 
+```kubectl create -f spark-driver.json```
+
 [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/examples/spark/README.md?pixel)]()
diff --git a/examples/spark/images/driver/Dockerfile b/examples/spark/images/driver/Dockerfile
new file mode 100644
index 00000000000..cfb1dad7df3
--- /dev/null
+++ b/examples/spark/images/driver/Dockerfile
@@ -0,0 +1,4 @@
+FROM gcr.io/google_containers/spark-base
+ADD start.sh /start.sh
+ADD log4j.properties /opt/spark/conf/log4j.properties
+CMD ["/start.sh"]
diff --git a/examples/spark/images/driver/README.md b/examples/spark/images/driver/README.md
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/examples/spark/images/driver/start.sh b/examples/spark/images/driver/start.sh
new file mode 100755
index 00000000000..495194dc38b
--- /dev/null
+++ b/examples/spark/images/driver/start.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+echo "$SPARK_MASTER_SERVICE_HOST spark-master" >> /etc/hosts
+echo "SPARK_LOCAL_HOSTNAME=$(hostname -i)" >> /opt/spark/conf/spark-env.sh
+echo "MASTER=spark://spark-master:$SPARK_MASTER_SERVICE_PORT" >> /opt/spark/conf/spark-env.sh
+
+while true; do
+sleep 100
+done
diff --git a/examples/spark/spark-driver.json b/examples/spark/spark-driver.json
new file mode 100644
index 00000000000..ee695eeabcd
--- /dev/null
+++ b/examples/spark/spark-driver.json
@@ -0,0 +1,23 @@
+{
+  "kind": "Pod",
+  "apiVersion": "v1",
+  "metadata": {
+    "name": "spark-driver",
+    "labels": {
+      "name": "spark-driver"
+    }
+  },
+  "spec": {
+    "containers": [
+      {
+        "name": "spark-driver",
+        "image": "gurvin/spark-driver",
+        "resources": {
+          "limits": {
+            "cpu": "100m"
+          }
+        }
+      }
+    ]
+  }
+}
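
Once the patch is applied and the driver pod is running, a quick way to confirm that the driver can actually schedule work on the cluster is to run a tiny job from the pyspark shell reached via `kubectl exec spark-driver -it bash` and `pyspark`, as shown in the README hunk above. The snippet below is a minimal sketch, not part of this patch; it assumes only the `sc` SparkContext that the pyspark shell already provides (the REPL banner says "SparkContext available as sc") and estimates pi by random sampling.

```python
# Minimal smoke-test job to paste into the pyspark shell inside the driver pod.
# Assumes `sc` is the SparkContext created by the pyspark shell; nothing here
# comes from the patch itself.
import random

def inside(_):
    # Pick a random point in the unit square; True if it lands inside the unit circle.
    x, y = random.random(), random.random()
    return x * x + y * y < 1.0

SAMPLES = 100000
hits = sc.parallelize(range(SAMPLES)).filter(inside).count()
print("Pi is roughly %f" % (4.0 * hits / SAMPLES))
```

If the workers registered with the master correctly, the job is distributed across the worker pods and prints an approximation of pi; if the driver cannot reach the cluster, the `count()` action will hang or fail, which makes this a convenient end-to-end check.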