diff --git a/examples/examples_test.go b/examples/examples_test.go
index 20a798d2d9c..f75f9d604e6 100644
--- a/examples/examples_test.go
+++ b/examples/examples_test.go
@@ -355,11 +355,12 @@ func TestExampleObjectSchemas(t *testing.T) {
 			"secret": &api.Secret{},
 		},
 		"../examples/spark": {
-			"spark-driver-controller": &api.ReplicationController{},
 			"spark-master-controller": &api.ReplicationController{},
 			"spark-master-service":    &api.Service{},
 			"spark-webui":             &api.Service{},
 			"spark-worker-controller": &api.ReplicationController{},
+			"zeppelin-controller":     &api.ReplicationController{},
+			"zeppelin-service":        &api.Service{},
 		},
 		"../examples/spark/spark-gluster": {
 			"spark-master-service":    &api.Service{},
diff --git a/examples/spark/README.md b/examples/spark/README.md
index 63d942edd60..b14b56152e0 100644
--- a/examples/spark/README.md
+++ b/examples/spark/README.md
@@ -120,8 +120,8 @@ Spark Command: /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java -cp /opt/spark-1.5
 15/10/27 21:25:07 INFO Master: I have been elected leader! New state: ALIVE
 ```
 
-After you know the master is running, you can use the (cluster
-proxy)[../../docs/user-guide/accessing-the-cluster.md#using-kubectl-proxy] to
+After you know the master is running, you can use the [cluster
+proxy](../../docs/user-guide/accessing-the-cluster.md#using-kubectl-proxy) to
 connect to the Spark WebUI:
 
 ```console
@@ -129,7 +129,7 @@ kubectl proxy --port=8001
 ```
 
 At which point the UI will be available at
-http://localhost:8001/api/v1/proxy/namespaces/default/services/spark-webui/
+[http://localhost:8001/api/v1/proxy/namespaces/default/services/spark-webui/](http://localhost:8001/api/v1/proxy/namespaces/default/services/spark-webui/).
 
 ## Step Two: Start your Spark workers
@@ -172,32 +172,40 @@ you should now see the workers in the UI as well.
 *Note:* The UI will have links to worker Web UIs. The worker UI links do not
 work (the links will attempt to connect to cluster IPs, which Kubernetes won't
 proxy automatically).
 
-## Step Three: Start your Spark driver to launch jobs on your Spark cluster
+## Step Three: Start the Zeppelin UI to launch jobs on your Spark cluster
 
-The Spark driver is used to launch jobs into Spark cluster. You can read more about it in
-[Spark architecture](https://spark.apache.org/docs/latest/cluster-overview.html).
+The Zeppelin UI pod can be used to launch jobs into the Spark cluster either via
+a web notebook frontend or the traditional Spark command line. See
+[Zeppelin](https://zeppelin.incubator.apache.org/) and
+[Spark architecture](https://spark.apache.org/docs/latest/cluster-overview.html)
+for more details.
 
 ```console
-$ kubectl create -f examples/spark/spark-driver-controller.yaml
-replicationcontrollers/spark-driver-controller
+$ kubectl create -f examples/spark/zeppelin-controller.yaml
+replicationcontrollers/zeppelin-controller
 ```
 
-The Spark driver needs the Master service to be running.
+Zeppelin needs the Master service to be running.
 
-### Check to see if the driver is running
+### Check to see if Zeppelin is running
 
 ```console
-$ kubectl get pods -lcomponent=spark-driver
-NAME                            READY     STATUS    RESTARTS   AGE
-spark-driver-controller-vwb9c   1/1       Running   0          1m
+$ kubectl get pods -lcomponent=zeppelin
+NAME                        READY     STATUS    RESTARTS   AGE
+zeppelin-controller-ja09s   1/1       Running   0          53s
 ```
 
 ## Step Four: Do something with the cluster
 
-Use the kubectl exec to connect to Spark driver and run a pipeline.
+Now you have two choices, depending on your predilections. You can do something
+graphical with the Spark cluster, or you can stay in the CLI.
+
+### Do something fast with pyspark!
+
+Use `kubectl exec` to connect to the Zeppelin driver and run a pipeline.
 
 ```console
-$ kubectl exec spark-driver-controller-vwb9c -it pyspark
+$ kubectl exec zeppelin-controller-ja09s -it pyspark
 Python 2.7.9 (default, Mar  1 2015, 12:57:24)
 [GCC 4.9.2] on linux2
 Type "help", "copyright", "credits" or "license" for more information.
@@ -217,6 +225,24 @@ SparkContext available as sc, HiveContext available as sqlContext.
 Congratulations, you just counted all of the words in all of the plays of
 Shakespeare.
 
+### Do something graphical and shiny!
+
+Take the Zeppelin pod from above and port-forward the WebUI port:
+
+```console
+$ kubectl port-forward zeppelin-controller-ja09s 8080:8080
+```
+
+This forwards `localhost` port 8080 to container port 8080. You can then find
+Zeppelin at [http://localhost:8080/](http://localhost:8080/).
+
+Create a "New Notebook". In there, type:
+
+```
+%pyspark
+print sc.textFile("gs://dataflow-samples/shakespeare/*").map(lambda s: len(s.split())).sum()
+```
+
 ## Result
 
 You now have services and replication controllers for the Spark master, Spark
@@ -235,10 +261,46 @@ After it's setup:
 
 ```console
 kubectl get pods # Make sure everything is running
-kubectl proxy --port=8001 # Start an application proxy, if you want to see the Spark WebUI
-kubectl get pods -lcomponent=spark-driver # Get the driver pod to interact with.
+kubectl proxy --port=8001 # Start an application proxy, if you want to see the Spark Master WebUI
+kubectl get pods -lcomponent=zeppelin # Get the driver pod to interact with.
 ```
 
+At which point the Master UI will be available at
+[http://localhost:8001/api/v1/proxy/namespaces/default/services/spark-webui/](http://localhost:8001/api/v1/proxy/namespaces/default/services/spark-webui/).
+
+You can either interact with the Spark cluster using the traditional
+`spark-shell` / `spark-submit` / `pyspark` commands via `kubectl exec` against
+the `zeppelin-controller` pod, or if you want to interact with Zeppelin:
+
+```console
+kubectl port-forward zeppelin-controller-abc123 8080:8080 &
+```
+
+Then visit [http://localhost:8080/](http://localhost:8080/).
+
+## Known Issues With Spark
+
+* This provides a Spark configuration that is restricted to the cluster network,
+  meaning the Spark master is only available as a cluster service. If you need
+  to submit jobs using an external client other than Zeppelin or `spark-submit`
+  on the `zeppelin` pod, you will need to provide a way for your clients to
+  reach the service defined in
+  [`examples/spark/spark-master-service.yaml`](spark-master-service.yaml). See
+  [Services](../../docs/user-guide/services.md) for more information.
+
+## Known Issues With Zeppelin
+
+* The Zeppelin pod is large, so it may take a while to pull depending on your
+  network. The size of the Zeppelin pod is something we're working on; see
+  issue #17231.
+
+* Zeppelin may take some time (about a minute) to run this pipeline the first
+  time. It seems to take considerable time to load.
+
+* On GKE, `kubectl port-forward` may not be stable over long periods of time. If
+  you see Zeppelin go into `Disconnected` state (there will be a red dot on the
+  top right as well), the `port-forward` probably failed and needs to be
+  restarted; a sketch of a restart loop follows this list. See #12179.
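+
+If the `port-forward` fails often, one workaround (only a sketch, not part of
+the example proper; `zeppelin-controller-abc123` is the same placeholder pod
+name used above, so substitute your actual Zeppelin pod) is to wrap it in a
+loop that restarts it whenever it exits:
+
+```console
+while true; do
+  # port-forward blocks until it dies; restart it after a short pause
+  kubectl port-forward zeppelin-controller-abc123 8080:8080
+  sleep 1
+done
+```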
+
 [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/examples/spark/README.md?pixel)]()
diff --git a/examples/spark/images/Makefile b/examples/spark/images/Makefile
index fd6341105f9..66364924c70 100644
--- a/examples/spark/images/Makefile
+++ b/examples/spark/images/Makefile
@@ -1,12 +1,20 @@
 all: push
+push: push-spark push-zeppelin
+.PHONY: push push-spark push-zeppelin spark zeppelin
 
 # To bump the Spark version, bump the version in base/Dockerfile, bump
-# this tag and reset to v1. You should also double check the native
-# Hadoop libs at that point (we grab the 2.6.1 libs, which are
-# appropriate for 1.5.1-with-2.6).
+# the version in zeppelin/Dockerfile, bump this tag and reset to
+# v1. You should also double check the native Hadoop libs at that
+# point (we grab the 2.6.1 libs, which are appropriate for
+# 1.5.1-with-2.6). Note that you'll need to re-test Zeppelin (and it
+# may not have caught up to the newest Spark).
 TAG = 1.5.1_v2
 
-containers:
+# To bump the Zeppelin version, bump the version in
+# zeppelin/Dockerfile and bump this tag and reset to v1.
+ZEPPELIN_TAG = v0.5.5_v1
+
+spark:
 	docker build -t gcr.io/google_containers/spark-base base
 	docker tag gcr.io/google_containers/spark-base gcr.io/google_containers/spark-base:$(TAG)
 	docker build -t gcr.io/google_containers/spark-worker worker
@@ -16,7 +24,11 @@ containers:
 	docker build -t gcr.io/google_containers/spark-driver driver
 	docker tag gcr.io/google_containers/spark-driver gcr.io/google_containers/spark-driver:$(TAG)
 
-push: containers
+zeppelin:
+	docker build -t gcr.io/google_containers/zeppelin zeppelin
+	docker tag -f gcr.io/google_containers/zeppelin gcr.io/google_containers/zeppelin:$(ZEPPELIN_TAG)
+
+push-spark: spark
 	gcloud docker push gcr.io/google_containers/spark-base
 	gcloud docker push gcr.io/google_containers/spark-base:$(TAG)
 	gcloud docker push gcr.io/google_containers/spark-worker
@@ -26,4 +38,8 @@ push: containers
 	gcloud docker push gcr.io/google_containers/spark-driver
 	gcloud docker push gcr.io/google_containers/spark-driver:$(TAG)
 
+push-zeppelin: zeppelin
+	gcloud docker push gcr.io/google_containers/zeppelin
+	gcloud docker push gcr.io/google_containers/zeppelin:$(ZEPPELIN_TAG)
+
 clean:
diff --git a/examples/spark/images/base/Dockerfile b/examples/spark/images/base/Dockerfile
index 4217013db3a..9de03e54279 100644
--- a/examples/spark/images/base/Dockerfile
+++ b/examples/spark/images/base/Dockerfile
@@ -1,4 +1,4 @@
-FROM java:latest
+FROM java:openjdk-8-jdk
 
 ENV hadoop_ver 2.6.1
 ENV spark_ver 1.5.1
diff --git a/examples/spark/images/zeppelin/Dockerfile b/examples/spark/images/zeppelin/Dockerfile
new file mode 100644
index 00000000000..57c8d6d2cbc
--- /dev/null
+++ b/examples/spark/images/zeppelin/Dockerfile
@@ -0,0 +1,66 @@
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Based heavily on
+# https://github.com/dylanmei/docker-zeppelin/blob/master/Dockerfile
+# (which is similar to many others out there), but rebased onto maven
+# image.
+#
+# This image is a composition of the official docker-maven
+# Docker image from https://github.com/carlossg/docker-maven/ and
+# spark-base.
+
+FROM gcr.io/google_containers/spark-base:latest
+
+ENV ZEPPELIN_TAG v0.5.5
+ENV MAVEN_VERSION 3.3.3
+ENV SPARK_MINOR 1.5
+ENV SPARK_PATCH 1
+ENV SPARK_VER ${SPARK_MINOR}.${SPARK_PATCH}
+ENV HADOOP_MINOR 2.6
+ENV HADOOP_PATCH 1
+ENV HADOOP_VER ${HADOOP_MINOR}.${HADOOP_PATCH}
+
+RUN curl -fsSL http://archive.apache.org/dist/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz | tar xzf - -C /usr/share \
+    && mv /usr/share/apache-maven-${MAVEN_VERSION} /usr/share/maven \
+    && ln -s /usr/share/maven/bin/mvn /usr/bin/mvn
+
+ENV MAVEN_HOME /usr/share/maven
+
+# libfontconfig is a workaround for
+# https://github.com/karma-runner/karma/issues/1270, which caused a
+# build break similar to
+# https://www.mail-archive.com/users@zeppelin.incubator.apache.org/msg01586.html
+
+RUN apt-get update \
+    && apt-get install -y net-tools build-essential git wget unzip python python-setuptools python-dev python-numpy libfontconfig \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN git clone https://github.com/apache/incubator-zeppelin.git --branch ${ZEPPELIN_TAG} /opt/zeppelin
+RUN cd /opt/zeppelin && \
+    mvn clean package \
+      -Pspark-${SPARK_MINOR} -Dspark.version=${SPARK_VER} \
+      -Phadoop-${HADOOP_MINOR} -Dhadoop.version=${HADOOP_VER} \
+      -Ppyspark \
+      -DskipTests && \
+    rm -rf /root/.m2 && \
+    rm -rf /root/.npm && \
+    echo "Successfully built Zeppelin"
+
+ADD zeppelin-log4j.properties /opt/zeppelin/conf/log4j.properties
+ADD zeppelin-env.sh /opt/zeppelin/conf/zeppelin-env.sh
+ADD docker-zeppelin.sh /opt/zeppelin/bin/docker-zeppelin.sh
+EXPOSE 8080
+ENTRYPOINT ["/opt/zeppelin/bin/docker-zeppelin.sh"]
diff --git a/examples/spark/images/zeppelin/docker-zeppelin.sh b/examples/spark/images/zeppelin/docker-zeppelin.sh
new file mode 100755
index 00000000000..99cbde1660c
--- /dev/null
+++ b/examples/spark/images/zeppelin/docker-zeppelin.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export ZEPPELIN_HOME=/opt/zeppelin
+export ZEPPELIN_CONF_DIR="${ZEPPELIN_HOME}/conf"
+
+echo "=== Launching Zeppelin under Docker ==="
+/opt/zeppelin/bin/zeppelin.sh "${ZEPPELIN_CONF_DIR}"
diff --git a/examples/spark/images/zeppelin/zeppelin-env.sh b/examples/spark/images/zeppelin/zeppelin-env.sh
new file mode 100644
index 00000000000..3fa8a024867
--- /dev/null
+++ b/examples/spark/images/zeppelin/zeppelin-env.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright 2015 The Kubernetes Authors All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export MASTER="spark://spark-master:7077"
+export SPARK_HOME=/opt/spark
+export ZEPPELIN_JAVA_OPTS="-Dspark.jars=/opt/spark/lib/gcs-connector-latest-hadoop2.jar"
+# TODO(zmerlynn): Setting global CLASSPATH *should* be unnecessary,
+# but ZEPPELIN_JAVA_OPTS isn't enough here. :(
+export CLASSPATH="/opt/spark/lib/gcs-connector-latest-hadoop2.jar"
+export ZEPPELIN_NOTEBOOK_DIR="${ZEPPELIN_HOME}/notebook"
+export ZEPPELIN_MEM=-Xmx1024m
+export ZEPPELIN_PORT=8080
+export PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip"
diff --git a/examples/spark/images/zeppelin/zeppelin-log4j.properties b/examples/spark/images/zeppelin/zeppelin-log4j.properties
new file mode 100644
index 00000000000..5fb18104eed
--- /dev/null
+++ b/examples/spark/images/zeppelin/zeppelin-log4j.properties
@@ -0,0 +1,6 @@
+# Set everything to be logged to the console.
+log4j.rootCategory=INFO, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%5p [%d] ({%t} %F[%M]:%L) - %m%n
diff --git a/examples/spark/spark-driver-controller.yaml b/examples/spark/zeppelin-controller.yaml
similarity index 50%
rename from examples/spark/spark-driver-controller.yaml
rename to examples/spark/zeppelin-controller.yaml
index 1e77910016a..9ef4a36787d 100644
--- a/examples/spark/spark-driver-controller.yaml
+++ b/examples/spark/zeppelin-controller.yaml
@@ -1,19 +1,21 @@
 kind: ReplicationController
 apiVersion: v1
 metadata:
-  name: spark-driver-controller
+  name: zeppelin-controller
 spec:
   replicas: 1
   selector:
-    component: spark-driver
+    component: zeppelin
   template:
     metadata:
       labels:
-        component: spark-driver
+        component: zeppelin
     spec:
       containers:
-        - name: spark-driver
-          image: gcr.io/google_containers/spark-driver:1.5.1_v2
+        - name: zeppelin
+          image: gcr.io/google_containers/zeppelin:v0.5.5_v1
+          ports:
+            - containerPort: 8080
           resources:
             requests:
               cpu: 100m
diff --git a/examples/spark/zeppelin-service.yaml b/examples/spark/zeppelin-service.yaml
new file mode 100644
index 00000000000..9296297f168
--- /dev/null
+++ b/examples/spark/zeppelin-service.yaml
@@ -0,0 +1,10 @@
+kind: Service
+apiVersion: v1
+metadata:
+  name: zeppelin
+spec:
+  ports:
+    - port: 8080
+      targetPort: 8080
+  selector:
+    component: zeppelin
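A quick smoke test for the new `zeppelin` Service once everything is created.
This is only a sketch, with two assumptions not in the change itself: cluster
DNS resolves the `zeppelin` service name, and `curl` is available in the
Zeppelin image (the Dockerfile above uses it at build time).
`zeppelin-controller-abc123` is the README's placeholder pod name:

```console
kubectl create -f examples/spark/zeppelin-service.yaml
# From inside the cluster, the Zeppelin UI should answer on the service endpoint:
kubectl exec zeppelin-controller-abc123 -- curl -s http://zeppelin:8080/
```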