From bfc4ee789df61c99b6304003c12dfd6398c7446a Mon Sep 17 00:00:00 2001
From: Marcin Wielgus
Date: Fri, 26 Jun 2015 18:00:42 +0200
Subject: [PATCH] Updated spark examples with docker images moved to gcr.io/google_containers
---
 examples/spark/README.md                      | 162 ++++++++++--------
 examples/spark/images/base/Dockerfile         |  17 ++
 examples/spark/images/base/log4j.properties   |  12 ++
 examples/spark/images/base/setup_client.sh    |  24 +++
 examples/spark/images/master/Dockerfile       |   7 +
 examples/spark/images/master/log4j.properties |  12 ++
 examples/spark/images/master/start.sh         |  19 ++
 examples/spark/images/worker/Dockerfile       |   7 +
 examples/spark/images/worker/log4j.properties |  12 ++
 examples/spark/images/worker/start.sh         |  28 +++
 examples/spark/spark-master.json              |   2 +-
 examples/spark/spark-worker-controller.json   |   2 +-
 12 files changed, 226 insertions(+), 78 deletions(-)
 create mode 100644 examples/spark/images/base/Dockerfile
 create mode 100644 examples/spark/images/base/log4j.properties
 create mode 100755 examples/spark/images/base/setup_client.sh
 create mode 100644 examples/spark/images/master/Dockerfile
 create mode 100644 examples/spark/images/master/log4j.properties
 create mode 100755 examples/spark/images/master/start.sh
 create mode 100644 examples/spark/images/worker/Dockerfile
 create mode 100644 examples/spark/images/worker/log4j.properties
 create mode 100755 examples/spark/images/worker/start.sh

diff --git a/examples/spark/README.md b/examples/spark/README.md
index 8f85f5d1488..c401311269a 100644
--- a/examples/spark/README.md
+++ b/examples/spark/README.md
@@ -12,9 +12,7 @@ section.
 
 ### Sources
 
-Source is freely available at:
-* Docker image - https://github.com/mattf/docker-spark
-* Docker Trusted Build - https://registry.hub.docker.com/search?q=mattf/spark
+The Docker images are heavily based on https://github.com/mattf/docker-spark
 
 ## Step Zero: Prerequisites
 
@@ -36,7 +34,7 @@ the Master service.
 ```
 $ kubectl create -f examples/spark/spark-master.json
 ```
 
-Then, use the `examples/spark/spark-master-service.json` file to
+Then, use the [`examples/spark/spark-master-service.json`](spark-master-service.json) file to
 create a logical service endpoint that Spark workers can use to access
 the Master pod.
 
@@ -44,38 +42,42 @@ the Master pod.
 ```
 $ kubectl create -f examples/spark/spark-master-service.json
 ```
 
-Ensure that the Master service is running and functional.
-
 ### Check to see if Master is running and accessible
 
 ```shell
-$ kubectl get pods,services
-POD            IP             CONTAINER(S)   IMAGE(S)             HOST                        LABELS              STATUS
-spark-master   192.168.90.14  spark-master   mattf/spark-master   172.18.145.8/172.18.145.8   name=spark-master   Running
-NAME           LABELS                                    SELECTOR            IP               PORT
-kubernetes     component=apiserver,provider=kubernetes                       10.254.0.2       443
-spark-master   name=spark-master                         name=spark-master   10.254.125.166   7077
+$ kubectl get pods
+NAME           READY     REASON    RESTARTS   AGE
+[...]
+spark-master   1/1       Running   0          25s
+
 ```
 
-Connect to http://192.168.90.14:8080 to see the status of the master.
+Check logs to see the status of the master.
 
 ```shell
-$ links -dump 192.168.90.14:8080
-   [IMG] 1.2.1 Spark Master at spark://spark-master:7077
+$ kubectl logs spark-master
 
-    * URL: spark://spark-master:7077
-    * Workers: 0
-    * Cores: 0 Total, 0 Used
-    * Memory: 0.0 B Total, 0.0 B Used
-    * Applications: 0 Running, 0 Completed
-    * Drivers: 0 Running, 0 Completed
-    * Status: ALIVE
-...
+starting org.apache.spark.deploy.master.Master, logging to /opt/spark-1.4.0-bin-hadoop2.6/sbin/../logs/spark--org.apache.spark.deploy.master.Master-1-spark-master.out +Spark Command: /usr/lib/jvm/java-7-openjdk-amd64/jre/bin/java -cp /opt/spark-1.4.0-bin-hadoop2.6/sbin/../conf/:/opt/spark-1.4.0-bin-hadoop2.6/lib/spark-assembly-1.4.0-hadoop2.6.0.jar:/opt/spark-1.4.0-bin-hadoop2.6/lib/datanucleus-api-jdo-3.2.6.jar:/opt/spark-1.4.0-bin-hadoop2.6/lib/datanucleus-rdbms-3.2.9.jar:/opt/spark-1.4.0-bin-hadoop2.6/lib/datanucleus-core-3.2.10.jar -Xms512m -Xmx512m -XX:MaxPermSize=128m org.apache.spark.deploy.master.Master --ip spark-master --port 7077 --webui-port 8080 +======================================== +15/06/26 14:01:49 INFO Master: Registered signal handlers for [TERM, HUP, INT] +15/06/26 14:01:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable +15/06/26 14:01:51 INFO SecurityManager: Changing view acls to: root +15/06/26 14:01:51 INFO SecurityManager: Changing modify acls to: root +15/06/26 14:01:51 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(root); users with modify permissions: Set(root) +15/06/26 14:01:51 INFO Slf4jLogger: Slf4jLogger started +15/06/26 14:01:51 INFO Remoting: Starting remoting +15/06/26 14:01:52 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkMaster@spark-master:7077] +15/06/26 14:01:52 INFO Utils: Successfully started service 'sparkMaster' on port 7077. +15/06/26 14:01:52 INFO Utils: Successfully started service on port 6066. +15/06/26 14:01:52 INFO StandaloneRestServer: Started REST server for submitting applications on port 6066 +15/06/26 14:01:52 INFO Master: Starting Spark master at spark://spark-master:7077 +15/06/26 14:01:52 INFO Master: Running Spark version 1.4.0 +15/06/26 14:01:52 INFO Utils: Successfully started service 'MasterUI' on port 8080. +15/06/26 14:01:52 INFO MasterWebUI: Started MasterWebUI at http://10.244.2.34:8080 +15/06/26 14:01:53 INFO Master: I have been elected leader! New state: ALIVE ``` -(Pull requests welcome for an alternative that uses the service IP and -port) - ## Step Two: Start your Spark workers The Spark workers do the heavy lifting in a Spark cluster. They @@ -94,71 +96,80 @@ $ kubectl create -f examples/spark/spark-worker-controller.json ### Check to see if the workers are running ```shell -$ links -dump 192.168.90.14:8080 - [IMG] 1.2.1 Spark Master at spark://spark-master:7077 +$ kubectl get pods +NAME READY REASON RESTARTS AGE +[...] +spark-master 1/1 Running 0 14m +spark-worker-controller-hifwi 1/1 Running 0 33s +spark-worker-controller-u40r2 1/1 Running 0 33s +spark-worker-controller-vpgyg 1/1 Running 0 33s - * URL: spark://spark-master:7077 - * Workers: 3 - * Cores: 12 Total, 0 Used - * Memory: 20.4 GB Total, 0.0 B Used - * Applications: 0 Running, 0 Completed - * Drivers: 0 Running, 0 Completed - * Status: ALIVE - - Workers - -Id Address State Cores Memory - 4 (0 6.8 GB -worker-20150318151745-192.168.75.14-46422 192.168.75.14:46422 ALIVE Used) (0.0 B - Used) - 4 (0 6.8 GB -worker-20150318151746-192.168.35.17-53654 192.168.35.17:53654 ALIVE Used) (0.0 B - Used) - 4 (0 6.8 GB -worker-20150318151746-192.168.90.17-50490 192.168.90.17:50490 ALIVE Used) (0.0 B - Used) -... +$ kubectl logs spark-master +[...] 
+15/06/26 14:15:43 INFO Master: Registering worker 10.244.2.35:46199 with 1 cores, 2.6 GB RAM
+15/06/26 14:15:55 INFO Master: Registering worker 10.244.1.15:44839 with 1 cores, 2.6 GB RAM
+15/06/26 14:15:55 INFO Master: Registering worker 10.244.0.19:60970 with 1 cores, 2.6 GB RAM
 ```
-
-(Pull requests welcome for an alternative that uses the service IP and
-port)
-
 ## Step Three: Do something with the cluster
 
+Get the address and port of the Master service.
+
 ```shell
-$ kubectl get pods,services
-POD                             IP              CONTAINER(S)   IMAGE(S)             HOST                          LABELS                                STATUS
-spark-master                    192.168.90.14   spark-master   mattf/spark-master   172.18.145.8/172.18.145.8     name=spark-master                     Running
-spark-worker-controller-51wgg   192.168.75.14   spark-worker   mattf/spark-worker   172.18.145.9/172.18.145.9     name=spark-worker,uses=spark-master   Running
-spark-worker-controller-5v48c   192.168.90.17   spark-worker   mattf/spark-worker   172.18.145.8/172.18.145.8     name=spark-worker,uses=spark-master   Running
-spark-worker-controller-ehq23   192.168.35.17   spark-worker   mattf/spark-worker   172.18.145.12/172.18.145.12   name=spark-worker,uses=spark-master   Running
-NAME           LABELS                                    SELECTOR            IP               PORT
-kubernetes     component=apiserver,provider=kubernetes                       10.254.0.2       443
-spark-master   name=spark-master                         name=spark-master   10.254.125.166   7077
+$ kubectl get service spark-master
+NAME           LABELS              SELECTOR            IP(S)          PORT(S)
+spark-master   name=spark-master   name=spark-master   10.0.204.187   7077/TCP
+```
 
-$ sudo docker run -it mattf/spark-base sh
+SSH to one of your cluster nodes. On GCE/GKE you can either use [Developers Console](https://console.developers.google.com)
+(more details [here](https://cloud.google.com/compute/docs/ssh-in-browser))
+or run `gcloud compute ssh <node>`, where the node name can be taken from `kubectl get nodes`
+(more details [here](https://cloud.google.com/compute/docs/gcloud-compute/#connecting)).
-sh-4.2# echo "10.254.125.166 spark-master" >> /etc/hosts
+
+```
+$ kubectl get nodes
+NAME                     LABELS                                           STATUS
+kubernetes-minion-5jvu   kubernetes.io/hostname=kubernetes-minion-5jvu   Ready
+kubernetes-minion-6fbi   kubernetes.io/hostname=kubernetes-minion-6fbi   Ready
+kubernetes-minion-8y2v   kubernetes.io/hostname=kubernetes-minion-8y2v   Ready
+kubernetes-minion-h0tr   kubernetes.io/hostname=kubernetes-minion-h0tr   Ready
-sh-4.2# export SPARK_LOCAL_HOSTNAME=$(hostname -i)
+
+$ gcloud compute ssh kubernetes-minion-5jvu --zone=us-central1-b
+Linux kubernetes-minion-5jvu 3.16.0-0.bpo.4-amd64 #1 SMP Debian 3.16.7-ckt9-3~deb8u1~bpo70+1 (2015-04-27) x86_64
-sh-4.2# MASTER=spark://spark-master:7077 pyspark
-Python 2.7.5 (default, Jun 17 2014, 18:11:42)
-[GCC 4.8.2 20140120 (Red Hat 4.8.2-16)] on linux2
+
+=== GCE Kubernetes node setup complete ===
+
+me@kubernetes-minion-5jvu:~$
+```
+
+Once logged in, run the spark-base image. Inside the image there is a script
+that sets up the environment using the given IP address and port of the Master.
+
+```
+cluster-node $ sudo docker run -it gcr.io/google_containers/spark-base
+root@f12a6fec45ce:/# . /setup_client.sh 10.0.204.187 7077
+root@f12a6fec45ce:/# pyspark
+Python 2.7.9 (default, Mar 1 2015, 12:57:24)
+[GCC 4.9.2] on linux2
 Type "help", "copyright", "credits" or "license" for more information.
+15/06/26 14:25:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
 Welcome to
       ____              __
      / __/__  ___ _____/ /__
     _\ \/ _ \/ _ `/ __/  '_/
-   /__ / .__/\_,_/_/ /_/\_\   version 1.2.1
+   /__ / .__/\_,_/_/ /_/\_\   version 1.4.0
       /_/
 
-Using Python version 2.7.5 (default, Jun 17 2014 18:11:42)
-SparkContext available as sc.
->>> import socket, resource
->>> sc.parallelize(range(1000)).map(lambda x: (socket.gethostname(), resource.getrlimit(resource.RLIMIT_NOFILE))).distinct().collect()
-[('spark-worker-controller-ehq23', (1048576, 1048576)), ('spark-worker-controller-5v48c', (1048576, 1048576)), ('spark-worker-controller-51wgg', (1048576, 1048576))]
+Using Python version 2.7.9 (default, Mar 1 2015 12:57:24)
+SparkContext available as sc, HiveContext available as sqlContext.
+>>> import socket
+>>> sc.parallelize(range(1000)).map(lambda x:socket.gethostname()).distinct().collect()
+['spark-worker-controller-u40r2', 'spark-worker-controller-hifwi', 'spark-worker-controller-vpgyg']
 ```
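+
+As a slightly bigger smoke test, you can estimate Pi with a Monte Carlo job from
+the same pyspark session. This is only a sketch, not part of the original example:
+the `inside` helper is an illustrative name, and the printed estimate will vary
+around 3.14 from run to run.
+
+```
+>>> import random
+>>> def inside(_):
+...     x, y = random.random(), random.random()
+...     return x*x + y*y < 1.0
+...
+>>> count = sc.parallelize(range(100000)).filter(inside).count()
+>>> print "Pi is roughly %f" % (4.0 * count / 100000)
+```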
+
+## Result
+
+You now have services, replication controllers, and pods for the Spark master and Spark workers.
+You can take this example to the next step and start using the Apache Spark cluster
+you just created; see the [Spark documentation](https://spark.apache.org/documentation.html)
+for more information.
 
 ## tl;dr
 
@@ -170,5 +181,4 @@
 ```kubectl create -f spark-master.json```
 
 Make sure the Master Pod is running (use: ```kubectl get pods```).
 
 ```kubectl create -f spark-worker-controller.json```
 
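+When you are done with the cluster, something along these lines should tear it
+down (`rc` is the usual kubectl shorthand for replicationcontroller):
+
+```shell
+$ kubectl delete rc spark-worker-controller
+$ kubectl delete pod spark-master
+$ kubectl delete service spark-master
+```
+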
-
 [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/examples/spark/README.md?pixel)]()
diff --git a/examples/spark/images/base/Dockerfile b/examples/spark/images/base/Dockerfile
new file mode 100644
index 00000000000..ffadeca11dc
--- /dev/null
+++ b/examples/spark/images/base/Dockerfile
@@ -0,0 +1,17 @@
+FROM java:latest
+
+RUN apt-get update -y
+RUN apt-get install -y scala
+
+# Get Spark from some apache mirror.
+RUN mkdir -p /opt && \
+    cd /opt && \
+    wget http://apache.mirrors.pair.com/spark/spark-1.4.0/spark-1.4.0-bin-hadoop2.6.tgz && \
+    tar -zvxf spark-1.4.0-bin-hadoop2.6.tgz && \
+    rm spark-1.4.0-bin-hadoop2.6.tgz && \
+    ln -s spark-1.4.0-bin-hadoop2.6 spark && \
+    echo Spark installed in /opt
+
+ADD log4j.properties /opt/spark/conf/log4j.properties
+ADD setup_client.sh /
+ENV PATH $PATH:/opt/spark/bin
diff --git a/examples/spark/images/base/log4j.properties b/examples/spark/images/base/log4j.properties
new file mode 100644
index 00000000000..b146f8a7841
--- /dev/null
+++ b/examples/spark/images/base/log4j.properties
@@ -0,0 +1,12 @@
+# Set everything to be logged to the console
+log4j.rootCategory=WARN, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.spark-project.jetty=WARN
+log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
diff --git a/examples/spark/images/base/setup_client.sh b/examples/spark/images/base/setup_client.sh
new file mode 100755
index 00000000000..7fb47258eab
--- /dev/null
+++ b/examples/spark/images/base/setup_client.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [[ $# != 2 || $1 == "" || $2 == "" ]]; then
+  echo "Usage: . ./setup_client.sh master_address master_port"
+  return 1  # this script is sourced, so use return rather than exit to avoid killing the caller's shell
+fi
+
+echo "$1 spark-master" >> /etc/hosts
+export SPARK_LOCAL_HOSTNAME=$(hostname -i)
+export MASTER=spark://spark-master:$2
diff --git a/examples/spark/images/master/Dockerfile b/examples/spark/images/master/Dockerfile
new file mode 100644
index 00000000000..6a894c2d70d
--- /dev/null
+++ b/examples/spark/images/master/Dockerfile
@@ -0,0 +1,7 @@
+FROM gcr.io/google_containers/spark-base
+
+ADD start.sh /
+ADD log4j.properties /opt/spark/conf/log4j.properties
+EXPOSE 7077
+
+ENTRYPOINT ["/start.sh"]
diff --git a/examples/spark/images/master/log4j.properties b/examples/spark/images/master/log4j.properties
new file mode 100644
index 00000000000..3a2a8821981
--- /dev/null
+++ b/examples/spark/images/master/log4j.properties
@@ -0,0 +1,12 @@
+# Set everything to be logged to the console
+log4j.rootCategory=INFO, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.spark-project.jetty=WARN
+log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
diff --git a/examples/spark/images/master/start.sh b/examples/spark/images/master/start.sh
new file mode 100755
index 00000000000..59b225f77e7
--- /dev/null
+++ b/examples/spark/images/master/start.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
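+
+# SPARK_MASTER_SERVICE_PORT is injected into the pod's environment by the
+# spark-master Kubernetes service; fall back to Spark's default port 7077
+# when it is not set.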
+ +export SPARK_MASTER_PORT=${SPARK_MASTER_SERVICE_PORT:-7077} +/opt/spark/sbin/start-master.sh +tail -F /opt/spark/logs/* diff --git a/examples/spark/images/worker/Dockerfile b/examples/spark/images/worker/Dockerfile new file mode 100644 index 00000000000..30db1895ad7 --- /dev/null +++ b/examples/spark/images/worker/Dockerfile @@ -0,0 +1,7 @@ +FROM gcr.io/google_containers/spark-base + +ADD start.sh / +ADD log4j.properties /opt/spark/conf/log4j.properties + +EXPOSE 8080 +ENTRYPOINT ["/start.sh"] diff --git a/examples/spark/images/worker/log4j.properties b/examples/spark/images/worker/log4j.properties new file mode 100644 index 00000000000..3a2a8821981 --- /dev/null +++ b/examples/spark/images/worker/log4j.properties @@ -0,0 +1,12 @@ +# Set everything to be logged to the console +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.spark-project.jetty=WARN +log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO diff --git a/examples/spark/images/worker/start.sh b/examples/spark/images/worker/start.sh new file mode 100755 index 00000000000..3643ea0a3da --- /dev/null +++ b/examples/spark/images/worker/start.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Copyright 2015 The Kubernetes Authors All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [[ ${SPARK_MASTER_SERVICE_HOST} == "" ]]; then + echo "Spark Master service must be created before starting any workers" + sleep 30 # To postpone pod restart + exit 1 +fi + +echo "${SPARK_MASTER_SERVICE_HOST} spark-master" >> /etc/hosts +export SPARK_LOCAL_HOSTNAME=$(hostname -i) + +/opt/spark/sbin/start-slave.sh spark://spark-master:${SPARK_MASTER_SERVICE_PORT} + +tail -F /opt/spark/logs/* diff --git a/examples/spark/spark-master.json b/examples/spark/spark-master.json index e903c6b0df3..69044030c32 100644 --- a/examples/spark/spark-master.json +++ b/examples/spark/spark-master.json @@ -11,7 +11,7 @@ "containers": [ { "name": "spark-master", - "image": "mattf/spark-master", + "image": "gcr.io/google_containers/spark-master", "ports": [ { "containerPort": 7077 diff --git a/examples/spark/spark-worker-controller.json b/examples/spark/spark-worker-controller.json index a89322d51cb..4beeacfcc4e 100644 --- a/examples/spark/spark-worker-controller.json +++ b/examples/spark/spark-worker-controller.json @@ -23,7 +23,7 @@ "containers": [ { "name": "spark-worker", - "image": "mattf/spark-worker", + "image": "gcr.io/google_containers/spark-worker", "ports": [ { "hostPort": 8888,