From 397ae5231243c3fb6713bc14784f7825d78831a1 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 12 Apr 2023 13:53:30 +0200 Subject: [PATCH 01/11] usage guide restructuring --- .../spark-k8s/pages/crd-reference.adoc | 106 ++++++ .../pages/getting_started/installation.adoc | 2 +- docs/modules/spark-k8s/pages/index.adoc | 39 ++- docs/modules/spark-k8s/pages/rbac.adoc | 11 - .../history-server.adoc} | 18 +- .../spark-k8s/pages/usage-guide/index.adoc | 89 +++++ .../job-dependencies.adoc} | 1 + .../pod-placement.adoc} | 0 .../pages/usage-guide/resources.adoc | 36 ++ .../spark-k8s/pages/usage-guide/s3.adoc | 48 +++ docs/modules/spark-k8s/pages/usage.adoc | 308 ------------------ docs/modules/spark-k8s/partials/nav.adoc | 12 +- 12 files changed, 318 insertions(+), 352 deletions(-) create mode 100644 docs/modules/spark-k8s/pages/crd-reference.adoc delete mode 100644 docs/modules/spark-k8s/pages/rbac.adoc rename docs/modules/spark-k8s/pages/{history_server.adoc => usage-guide/history-server.adoc} (89%) create mode 100644 docs/modules/spark-k8s/pages/usage-guide/index.adoc rename docs/modules/spark-k8s/pages/{job_dependencies.adoc => usage-guide/job-dependencies.adoc} (99%) rename docs/modules/spark-k8s/pages/{pod_placement.adoc => usage-guide/pod-placement.adoc} (100%) create mode 100644 docs/modules/spark-k8s/pages/usage-guide/resources.adoc create mode 100644 docs/modules/spark-k8s/pages/usage-guide/s3.adoc delete mode 100644 docs/modules/spark-k8s/pages/usage.adoc diff --git a/docs/modules/spark-k8s/pages/crd-reference.adoc b/docs/modules/spark-k8s/pages/crd-reference.adoc new file mode 100644 index 00000000..abe4d1bc --- /dev/null +++ b/docs/modules/spark-k8s/pages/crd-reference.adoc @@ -0,0 +1,106 @@ += CRD reference + +Below are listed the CRD fields that can be defined by the user: + +|=== +|CRD field |Remarks + +|`apiVersion` +|`spark.stackable.tech/v1alpha1` + +|`kind` +|`SparkApplication` + +|`metadata.name` +| Job name + +|`spec.version` +|"1.0" + +|`spec.mode` +| `cluster` or `client`. Currently only `cluster` is supported + +|`spec.image` +|User-supplied image containing spark-job dependencies that will be copied to the specified volume mount + +|`spec.sparkImage` +| Spark image which will be deployed to driver and executor pods, which must contain spark environment needed by the job e.g. `docker.stackable.tech/stackable/spark-k8s:3.3.0-stackable0.3.0` + +|`spec.sparkImagePullPolicy` +| Optional Enum (one of `Always`, `IfNotPresent` or `Never`) that determines the pull policy of the spark job image + +|`spec.sparkImagePullSecrets` +| An optional list of references to secrets in the same namespace to use for pulling any of the images used by a `SparkApplication` resource. Each reference has a single property (`name`) that must contain a reference to a valid secret + +|`spec.mainApplicationFile` +|The actual application file that will be called by `spark-submit` + +|`spec.mainClass` +|The main class i.e. entry point for JVM artifacts + +|`spec.args` +|Arguments passed directly to the job artifact + +|`spec.s3connection` +|S3 connection specification. See the <> for more details. 
+ +|`spec.sparkConf` +|A map of key/value strings that will be passed directly to `spark-submit` + +|`spec.deps.requirements` +|A list of python packages that will be installed via `pip` + +|`spec.deps.packages` +|A list of packages that is passed directly to `spark-submit` + +|`spec.deps.excludePackages` +|A list of excluded packages that is passed directly to `spark-submit` + +|`spec.deps.repositories` +|A list of repositories that is passed directly to `spark-submit` + +|`spec.volumes` +|A list of volumes + +|`spec.volumes.name` +|The volume name + +|`spec.volumes.persistentVolumeClaim.claimName` +|The persistent volume claim backing the volume + +|`spec.job.resources` +|Resources specification for the initiating Job + +|`spec.driver.resources` +|Resources specification for the driver Pod + +|`spec.driver.volumeMounts` +|A list of mounted volumes for the driver + +|`spec.driver.volumeMounts.name` +|Name of mount + +|`spec.driver.volumeMounts.mountPath` +|Volume mount path + +|`spec.driver.nodeSelector` +|A dictionary of labels to use for node selection when scheduling the driver N.B. this assumes there are no implicit node dependencies (e.g. `PVC`, `VolumeMount`) defined elsewhere. + +|`spec.executor.resources` +|Resources specification for the executor Pods + +|`spec.executor.instances` +|Number of executor instances launched for this job + +|`spec.executor.volumeMounts` +|A list of mounted volumes for each executor + +|`spec.executor.volumeMounts.name` +|Name of mount + +|`spec.executor.volumeMounts.mountPath` +|Volume mount path + +|`spec.executor.nodeSelector` +|A dictionary of labels to use for node selection when scheduling the executors N.B. this assumes there are no implicit node dependencies (e.g. `PVC`, `VolumeMount`) defined elsewhere. +|=== diff --git a/docs/modules/spark-k8s/pages/getting_started/installation.adoc b/docs/modules/spark-k8s/pages/getting_started/installation.adoc index ef526259..b9133a7e 100644 --- a/docs/modules/spark-k8s/pages/getting_started/installation.adoc +++ b/docs/modules/spark-k8s/pages/getting_started/installation.adoc @@ -8,7 +8,7 @@ Spark applications almost always require dependencies like database drivers, RES More information about the different ways to define Spark jobs and their dependencies is given on the following pages: -- xref:usage.adoc[] +- xref:usage-guide/index.adoc[] - xref:job_dependencies.adoc[] == Stackable Operators diff --git a/docs/modules/spark-k8s/pages/index.adoc b/docs/modules/spark-k8s/pages/index.adoc index 9c49d8e5..dc2b8a6c 100644 --- a/docs/modules/spark-k8s/pages/index.adoc +++ b/docs/modules/spark-k8s/pages/index.adoc @@ -1,18 +1,37 @@ -= Stackable Operator for Apache Spark on Kubernetes += Stackable Operator for Apache Spark +:description: The Stackable Operator for Apache Spark is a Kubernetes operator that can manage Apache Spark clusters. Learn about its features, resources, dependencies and demos, and see the list of supported Spark versions. +:keywords: Stackable Operator, Apache Spark, Kubernetes, operator, data science, engineer, big data, CRD, StatefulSet, ConfigMap, Service, S3, demo, version -This is an operator for Kubernetes that can manage https://spark.apache.org/[Apache Spark] kubernetes clusters. +This is an operator for Kubernetes that can manage https://spark.apache.org/[Apache Spark] Kubernetes clusters. Apache Spark is a powerful open-source big data processing framework that allows for efficient and flexible distributed computing. 
Its in-memory processing and fault-tolerant architecture make it ideal for a variety of use cases, including batch processing, real-time streaming, machine learning, and graph processing. -WARNING: This operator only works with images from the https://repo.stackable.tech/#browse/browse:docker:v2%2Fstackable%2Fspark[Stackable] repository +== Getting Started + +Follow the xref:getting_started/index.adoc[] guide to get started with Apache Spark using the Stackable Operator. The guide will lead you through the installation of the Operator and running your first Spark job on Kubernetes. + +== RBAC + +The https://spark.apache.org/docs/latest/running-on-kubernetes.html#rbac[Spark-Kubernetes RBAC documentation] describes what is needed for `spark-submit` jobs to run successfully: minimally a role/cluster-role to allow the driver pod to create and manage executor pods. + +However, to add security, each `spark-submit` job launched by the spark-k8s operator will be assigned its own service account. + +When the spark-k8s operator is installed via Helm, a cluster role named `spark-k8s-clusterrole` is created with pre-defined permissions. + +When a new Spark application is submitted, the operator creates a new service account with the same name as the application and binds this account to the cluster role `spark-k8s-clusterrole` created by Helm. + +== Integrations + +- Kafka +- S3 +- loading custom dependencies + +== Demos + +The xref:stackablectl::demos/data-lakehouse-iceberg-trino-spark.adoc[] demo connects multiple components and datasets into a data Lakehouse. A Spark application with https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html[structured streaming] is used to stream data from Apache Kafka into the Lakehouse. + +In the xref:stackablectl::demos/spark-k8s-anomaly-detection-taxi-data.adoc[] demo Spark is used to read training data from S3 and train an anomaly detection model on the data. The model is then stored in a Trino table. == Supported Versions The Stackable Operator for Apache Spark on Kubernetes currently supports the following versions of Spark: include::partial$supported-versions.adoc[] - -== Getting the Docker image - -[source] ----- -docker pull docker.stackable.tech/stackable/spark-k8s: ----- diff --git a/docs/modules/spark-k8s/pages/rbac.adoc b/docs/modules/spark-k8s/pages/rbac.adoc deleted file mode 100644 index ed9de94d..00000000 --- a/docs/modules/spark-k8s/pages/rbac.adoc +++ /dev/null @@ -1,11 +0,0 @@ -= RBAC - -== Overview - -The https://spark.apache.org/docs/latest/running-on-kubernetes.html#rbac[Spark-Kubernetes RBAC documentation] describes what is needed for `spark-submit` jobs to run successfully: minimally a role/cluster-role to allow the driver pod to create and manage executor pods. - -However, to add security, each `spark-submit` job launched by the spark-k8s operator will be assigned its own service account. - -When the spark-k8s operator is installed via Helm, a cluster role named `spark-k8s-clusterrole` is created with pre-defined permissions. - -When a new Spark application is submitted, the operator creates a new service account with the same name as the application and binds this account to the cluster role `spark-k8s-clusterrole` created by Helm. 
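The RBAC objects themselves are created by the operator and are not shown in this patch. As a rough sketch, and assuming an application named `spark-pi` in the `default` namespace (both illustrative), the per-application ServiceAccount and its binding to the Helm-created ClusterRole could look like the following; whether the operator uses a RoleBinding or a ClusterRoleBinding is an assumption here, only the ClusterRole name `spark-k8s-clusterrole` comes from the description above:

[source,yaml]
----
# Sketch only: names and namespace are illustrative assumptions.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: spark-pi        # same name as the SparkApplication
  namespace: default
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding       # assumed binding kind, referencing the ClusterRole created by Helm
metadata:
  name: spark-pi
  namespace: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: spark-k8s-clusterrole
subjects:
  - kind: ServiceAccount
    name: spark-pi
    namespace: default
----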
diff --git a/docs/modules/spark-k8s/pages/history_server.adoc b/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc similarity index 89% rename from docs/modules/spark-k8s/pages/history_server.adoc rename to docs/modules/spark-k8s/pages/usage-guide/history-server.adoc index 65e387e3..f89464c7 100644 --- a/docs/modules/spark-k8s/pages/history_server.adoc +++ b/docs/modules/spark-k8s/pages/usage-guide/history-server.adoc @@ -1,4 +1,5 @@ = Spark History Server +:page-aliases: history_server.adoc == Overview @@ -48,23 +49,6 @@ include::example$example-history-app.yaml[] <6> Credentials used to write event logs. These can, of course, differ from the credentials used to process data. -== Log aggregation - -The logs can be forwarded to a Vector log aggregator by providing a discovery -ConfigMap for the aggregator and by enabling the log agent: - -[source,yaml] ----- -spec: - vectorAggregatorConfigMapName: vector-aggregator-discovery - nodes: - config: - logging: - enableVectorAgent: true ----- - -Further information on how to configure logging, can be found in -xref:home:concepts:logging.adoc[]. == History Web UI diff --git a/docs/modules/spark-k8s/pages/usage-guide/index.adoc b/docs/modules/spark-k8s/pages/usage-guide/index.adoc new file mode 100644 index 00000000..1da42747 --- /dev/null +++ b/docs/modules/spark-k8s/pages/usage-guide/index.adoc @@ -0,0 +1,89 @@ += Usage guide + +== Examples + +The following examples have the following `spec` fields in common: + +- `version`: the current version is "1.0" +- `sparkImage`: the docker image that will be used by job, driver and executor pods. This can be provided by the user. +- `mode`: only `cluster` is currently supported +- `mainApplicationFile`: the artifact (Java, Scala or Python) that forms the basis of the Spark job. +- `args`: these are the arguments passed directly to the application. In the examples below it is e.g. the input path for part of the public New York taxi dataset. +- `sparkConf`: these list spark configuration settings that are passed directly to `spark-submit` and which are best defined explicitly by the user. Since the `SparkApplication` "knows" that there is an external dependency (the s3 bucket where the data and/or the application is located) and how that dependency should be treated (i.e. what type of credential checks are required, if any), it is better to have these things declared together. +- `volumes`: refers to any volumes needed by the `SparkApplication`, in this case an underlying `PersistentVoulmeClaim`. +- `driver`: driver-specific settings, including any volume mounts. +- `executor`: executor-specific settings, including any volume mounts. + +Job-specific settings are annotated below. 
+ +=== Pyspark: externally located artifact and dataset + +[source,yaml] +---- +include::example$example-sparkapp-external-dependencies.yaml[] +---- + +<1> Job python artifact (external) +<2> Job argument (external) +<3> List of python job requirements: these will be installed in the pods via `pip` +<4> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in s3) +<5> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing +<6> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors + +=== Pyspark: externally located dataset, artifact available via PVC/volume mount + +[source,yaml] +---- +include::example$example-sparkapp-image.yaml[] +---- + +<1> Job image: this contains the job artifact that will be retrieved from the volume mount backed by the PVC +<2> Job python artifact (local) +<3> Job argument (external) +<4> List of python job requirements: these will be installed in the pods via `pip` +<5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in an S3 store) + +=== JVM (Scala): externally located artifact and dataset + +[source,yaml] +---- +include::example$example-sparkapp-pvc.yaml[] +---- + +<1> Job artifact located on S3. +<2> Job main class +<3> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in an S3 store, accessed without credentials) +<4> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing +<5> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors + +=== JVM (Scala): externally located artifact accessed with credentials + +[source,yaml] +---- +include::example$example-sparkapp-s3-private.yaml[] +---- + +<1> Job python artifact (located in an S3 store) +<2> Artifact class +<3> S3 section, specifying the existing secret and S3 end-point (in this case, MinIO) +<4> Credentials referencing a secretClass (not shown in is example) +<5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources... 
+<6> ...in this case, in an S3 store, accessed with the credentials defined in the secret + +=== JVM (Scala): externally located artifact accessed with job arguments provided via configuration map + +[source,yaml] +---- +include::example$example-configmap.yaml[] +---- +[source,yaml] +---- +include::example$example-sparkapp-configmap.yaml[] +---- +<1> Name of the configuration map +<2> Argument required by the job +<3> Job scala artifact that requires an input argument +<4> The volume backed by the configuration map +<5> The expected job argument, accessed via the mounted configuration map file +<6> The name of the volume backed by the configuration map that will be mounted to the driver/executor +<7> The mount location of the volume (this will contain a file `/arguments/job-args.txt`) diff --git a/docs/modules/spark-k8s/pages/job_dependencies.adoc b/docs/modules/spark-k8s/pages/usage-guide/job-dependencies.adoc similarity index 99% rename from docs/modules/spark-k8s/pages/job_dependencies.adoc rename to docs/modules/spark-k8s/pages/usage-guide/job-dependencies.adoc index 4601188d..0bf4192c 100644 --- a/docs/modules/spark-k8s/pages/job_dependencies.adoc +++ b/docs/modules/spark-k8s/pages/usage-guide/job-dependencies.adoc @@ -1,4 +1,5 @@ = Job Dependencies +:page-aliases: job_dependencies.adoc == Overview diff --git a/docs/modules/spark-k8s/pages/pod_placement.adoc b/docs/modules/spark-k8s/pages/usage-guide/pod-placement.adoc similarity index 100% rename from docs/modules/spark-k8s/pages/pod_placement.adoc rename to docs/modules/spark-k8s/pages/usage-guide/pod-placement.adoc diff --git a/docs/modules/spark-k8s/pages/usage-guide/resources.adoc b/docs/modules/spark-k8s/pages/usage-guide/resources.adoc new file mode 100644 index 00000000..e4939b93 --- /dev/null +++ b/docs/modules/spark-k8s/pages/usage-guide/resources.adoc @@ -0,0 +1,36 @@ += Resource Requests + +include::home:concepts:stackable_resource_requests.adoc[] + +If no resources are configured explicitly, the operator uses the following defaults: + +[source,yaml] +---- +job: + resources: + cpu: + min: '500m' + max: "1" + memory: + limit: '1Gi' +driver: + resources: + cpu: + min: '1' + max: "2" + memory: + limit: '2Gi' +executor: + resources: + cpu: + min: '1' + max: "4" + memory: + limit: '4Gi' +---- +WARNING: The default values are _most likely_ not sufficient to run a proper cluster in production. Please adapt according to your requirements. +For more details regarding Kubernetes CPU limits see: https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/[Assign CPU Resources to Containers and Pods]. + +Spark allocates a default amount of non-heap memory based on the type of job (JVM or non-JVM). This is taken into account when defining memory settings based exclusively on the resource limits, so that the "declared" value is the actual total value (i.e. including memory overhead). This may result in minor deviations from the stated resource value due to rounding differences. + +NOTE: It is possible to define Spark resources either directly by setting configuration properties listed under `sparkConf`, or by using resource limits. If both are used, then `sparkConf` properties take precedence. It is recommended for the sake of clarity to use *_either_* one *_or_* the other. 
diff --git a/docs/modules/spark-k8s/pages/usage-guide/s3.adoc b/docs/modules/spark-k8s/pages/usage-guide/s3.adoc new file mode 100644 index 00000000..c668328a --- /dev/null +++ b/docs/modules/spark-k8s/pages/usage-guide/s3.adoc @@ -0,0 +1,48 @@ += S3 bucket specification + +You can specify S3 connection details directly inside the `SparkApplication` specification or by referring to an external `S3Bucket` custom resource. + +To specify S3 connection details directly as part of the `SparkApplication` resource you add an inline connection configuration as shown below. + +[source,yaml] +---- +s3connection: # <1> + inline: + host: test-minio # <2> + port: 9000 # <3> + accessStyle: Path + credentials: + secretClass: s3-credentials-class # <4> +---- +<1> Entry point for the S3 connection configuration. +<2> Connection host. +<3> Optional connection port. +<4> Name of the `Secret` object expected to contain the following keys: `ACCESS_KEY_ID` and `SECRET_ACCESS_KEY` + +It is also possible to configure the connection details as a separate Kubernetes resource and only refer to that object from the `SparkApplication` like this: + +[source,yaml] +---- +s3connection: + reference: s3-connection-resource # <1> +---- +<1> Name of the connection resource with connection details. + +The resource named `s3-connection-resource` is then defined as shown below: + +[source,yaml] +---- +--- +apiVersion: s3.stackable.tech/v1alpha1 +kind: S3Connection +metadata: + name: s3-connection-resource +spec: + host: test-minio + port: 9000 + accessStyle: Path + credentials: + secretClass: minio-credentials-class +---- + +This has the advantage that one connection configuration can be shared across `SparkApplications` and reduces the cost of updating these details. diff --git a/docs/modules/spark-k8s/pages/usage.adoc b/docs/modules/spark-k8s/pages/usage.adoc deleted file mode 100644 index 64e6aa5c..00000000 --- a/docs/modules/spark-k8s/pages/usage.adoc +++ /dev/null @@ -1,308 +0,0 @@ -= Usage - -== Examples - -The following examples have the following `spec` fields in common: - -- `version`: the current version is "1.0" -- `sparkImage`: the docker image that will be used by job, driver and executor pods. This can be provided by the user. -- `mode`: only `cluster` is currently supported -- `mainApplicationFile`: the artifact (Java, Scala or Python) that forms the basis of the Spark job. -- `args`: these are the arguments passed directly to the application. In the examples below it is e.g. the input path for part of the public New York taxi dataset. -- `sparkConf`: these list spark configuration settings that are passed directly to `spark-submit` and which are best defined explicitly by the user. Since the `SparkApplication` "knows" that there is an external dependency (the s3 bucket where the data and/or the application is located) and how that dependency should be treated (i.e. what type of credential checks are required, if any), it is better to have these things declared together. -- `volumes`: refers to any volumes needed by the `SparkApplication`, in this case an underlying `PersistentVoulmeClaim`. -- `driver`: driver-specific settings, including any volume mounts. -- `executor`: executor-specific settings, including any volume mounts. - -Job-specific settings are annotated below. 
- -=== Pyspark: externally located artifact and dataset - -[source,yaml] ----- -include::example$example-sparkapp-external-dependencies.yaml[] ----- - -<1> Job python artifact (external) -<2> Job argument (external) -<3> List of python job requirements: these will be installed in the pods via `pip` -<4> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in s3) -<5> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing -<6> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors - -=== Pyspark: externally located dataset, artifact available via PVC/volume mount - -[source,yaml] ----- -include::example$example-sparkapp-image.yaml[] ----- - -<1> Job image: this contains the job artifact that will be retrieved from the volume mount backed by the PVC -<2> Job python artifact (local) -<3> Job argument (external) -<4> List of python job requirements: these will be installed in the pods via `pip` -<5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in an S3 store) - -=== JVM (Scala): externally located artifact and dataset - -[source,yaml] ----- -include::example$example-sparkapp-pvc.yaml[] ----- - -<1> Job artifact located on S3. -<2> Job main class -<3> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in an S3 store, accessed without credentials) -<4> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing -<5> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors - -=== JVM (Scala): externally located artifact accessed with credentials - -[source,yaml] ----- -include::example$example-sparkapp-s3-private.yaml[] ----- - -<1> Job python artifact (located in an S3 store) -<2> Artifact class -<3> S3 section, specifying the existing secret and S3 end-point (in this case, MinIO) -<4> Credentials referencing a secretClass (not shown in is example) -<5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources... -<6> ...in this case, in an S3 store, accessed with the credentials defined in the secret - -=== JVM (Scala): externally located artifact accessed with job arguments provided via configuration map - -[source,yaml] ----- -include::example$example-configmap.yaml[] ----- -[source,yaml] ----- -include::example$example-sparkapp-configmap.yaml[] ----- -<1> Name of the configuration map -<2> Argument required by the job -<3> Job scala artifact that requires an input argument -<4> The volume backed by the configuration map -<5> The expected job argument, accessed via the mounted configuration map file -<6> The name of the volume backed by the configuration map that will be mounted to the driver/executor -<7> The mount location of the volume (this will contain a file `/arguments/job-args.txt`) - -== S3 bucket specification - -You can specify S3 connection details directly inside the `SparkApplication` specification or by referring to an external `S3Bucket` custom resource. 
- -To specify S3 connection details directly as part of the `SparkApplication` resource you add an inline connection configuration as shown below. - -[source,yaml] ----- -s3connection: # <1> - inline: - host: test-minio # <2> - port: 9000 # <3> - accessStyle: Path - credentials: - secretClass: s3-credentials-class # <4> ----- -<1> Entry point for the S3 connection configuration. -<2> Connection host. -<3> Optional connection port. -<4> Name of the `Secret` object expected to contain the following keys: `ACCESS_KEY_ID` and `SECRET_ACCESS_KEY` - -It is also possible to configure the connection details as a separate Kubernetes resource and only refer to that object from the `SparkApplication` like this: - -[source,yaml] ----- -s3connection: - reference: s3-connection-resource # <1> ----- -<1> Name of the connection resource with connection details. - -The resource named `s3-connection-resource` is then defined as shown below: - -[source,yaml] ----- ---- -apiVersion: s3.stackable.tech/v1alpha1 -kind: S3Connection -metadata: - name: s3-connection-resource -spec: - host: test-minio - port: 9000 - accessStyle: Path - credentials: - secretClass: minio-credentials-class ----- - -This has the advantage that one connection configuration can be shared across `SparkApplications` and reduces the cost of updating these details. - -== Resource Requests - -// The "nightly" version is needed because the "include" directive searches for -// files in the "stable" version by default. -// TODO: remove the "nightly" version after the next platform release (current: 22.09) -include::nightly@home:concepts:stackable_resource_requests.adoc[] - -If no resources are configured explicitly, the operator uses the following defaults: - -[source,yaml] ----- -job: - resources: - cpu: - min: '500m' - max: "1" - memory: - limit: '1Gi' -driver: - resources: - cpu: - min: '1' - max: "2" - memory: - limit: '2Gi' -executor: - resources: - cpu: - min: '1' - max: "4" - memory: - limit: '4Gi' ----- -WARNING: The default values are _most likely_ not sufficient to run a proper cluster in production. Please adapt according to your requirements. -For more details regarding Kubernetes CPU limits see: https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/[Assign CPU Resources to Containers and Pods]. - -Spark allocates a default amount of non-heap memory based on the type of job (JVM or non-JVM). This is taken into account when defining memory settings based exclusively on the resource limits, so that the "declared" value is the actual total value (i.e. including memory overhead). This may result in minor deviations from the stated resource value due to rounding differences. - -NOTE: It is possible to define Spark resources either directly by setting configuration properties listed under `sparkConf`, or by using resource limits. If both are used, then `sparkConf` properties take precedence. It is recommended for the sake of clarity to use *_either_* one *_or_* the other. - -== Log aggregation - -The logs can be forwarded to a Vector log aggregator by providing a discovery -ConfigMap for the aggregator and by enabling the log agent: - -[source,yaml] ----- -spec: - vectorAggregatorConfigMapName: vector-aggregator-discovery - job: - logging: - enableVectorAgent: true - driver: - logging: - enableVectorAgent: true - executor: - logging: - enableVectorAgent: true ----- - -Further information on how to configure logging, can be found in -xref:home:concepts:logging.adoc[]. 
- -== CRD argument coverage - -Below are listed the CRD fields that can be defined by the user: - -|=== -|CRD field |Remarks - -|`apiVersion` -|`spark.stackable.tech/v1alpha1` - -|`kind` -|`SparkApplication` - -|`metadata.name` -| Job name - -|`spec.version` -|"1.0" - -|`spec.mode` -| `cluster` or `client`. Currently only `cluster` is supported - -|`spec.image` -|User-supplied image containing spark-job dependencies that will be copied to the specified volume mount - -|`spec.sparkImage` -| Spark image which will be deployed to driver and executor pods, which must contain spark environment needed by the job e.g. `docker.stackable.tech/stackable/spark-k8s:3.3.0-stackable0.3.0` - -|`spec.sparkImagePullPolicy` -| Optional Enum (one of `Always`, `IfNotPresent` or `Never`) that determines the pull policy of the spark job image - -|`spec.sparkImagePullSecrets` -| An optional list of references to secrets in the same namespace to use for pulling any of the images used by a `SparkApplication` resource. Each reference has a single property (`name`) that must contain a reference to a valid secret - -|`spec.mainApplicationFile` -|The actual application file that will be called by `spark-submit` - -|`spec.mainClass` -|The main class i.e. entry point for JVM artifacts - -|`spec.args` -|Arguments passed directly to the job artifact - -|`spec.s3connection` -|S3 connection specification. See the <> for more details. - -|`spec.sparkConf` -|A map of key/value strings that will be passed directly to `spark-submit` - -|`spec.deps.requirements` -|A list of python packages that will be installed via `pip` - -|`spec.deps.packages` -|A list of packages that is passed directly to `spark-submit` - -|`spec.deps.excludePackages` -|A list of excluded packages that is passed directly to `spark-submit` - -|`spec.deps.repositories` -|A list of repositories that is passed directly to `spark-submit` - -|`spec.volumes` -|A list of volumes - -|`spec.volumes.name` -|The volume name - -|`spec.volumes.persistentVolumeClaim.claimName` -|The persistent volume claim backing the volume - -|`spec.job.resources` -|Resources specification for the initiating Job - -|`spec.driver.resources` -|Resources specification for the driver Pod - -|`spec.driver.volumeMounts` -|A list of mounted volumes for the driver - -|`spec.driver.volumeMounts.name` -|Name of mount - -|`spec.driver.volumeMounts.mountPath` -|Volume mount path - -|`spec.driver.nodeSelector` -|A dictionary of labels to use for node selection when scheduling the driver N.B. this assumes there are no implicit node dependencies (e.g. `PVC`, `VolumeMount`) defined elsewhere. - -|`spec.executor.resources` -|Resources specification for the executor Pods - -|`spec.executor.instances` -|Number of executor instances launched for this job - -|`spec.executor.volumeMounts` -|A list of mounted volumes for each executor - -|`spec.executor.volumeMounts.name` -|Name of mount - -|`spec.executor.volumeMounts.mountPath` -|Volume mount path - -|`spec.executor.nodeSelector` -|A dictionary of labels to use for node selection when scheduling the executors N.B. this assumes there are no implicit node dependencies (e.g. `PVC`, `VolumeMount`) defined elsewhere. 
-|=== diff --git a/docs/modules/spark-k8s/partials/nav.adoc b/docs/modules/spark-k8s/partials/nav.adoc index 143c44ca..ef0e7aa6 100644 --- a/docs/modules/spark-k8s/partials/nav.adoc +++ b/docs/modules/spark-k8s/partials/nav.adoc @@ -2,8 +2,10 @@ ** xref:spark-k8s:getting_started/installation.adoc[] ** xref:spark-k8s:getting_started/first_steps.adoc[] * xref:spark-k8s:configuration.adoc[] -* xref:spark-k8s:usage.adoc[] -* xref:spark-k8s:job_dependencies.adoc[] -* xref:spark-k8s:rbac.adoc[] -* xref:spark-k8s:history_server.adoc[] -* xref:spark-k8s:pod_placement.adoc[] \ No newline at end of file +* xref:spark-k8s:usage-guide/index.adoc[] +** xref:spark-k8s:usage-guide/pod-placement.adoc[] +** xref:spark-k8s:usage-guide/job-dependencies.adoc[] +** xref:spark-k8s:usage-guide/resources.adoc[] +** xref:spark-k8s:usage-guide/s3.adoc[] +** xref:spark-k8s:usage-guide/history-server.adoc[] +* xref:spark-k8s:crd-reference.adoc[] \ No newline at end of file From 50cad07835e7d9746386ea29d76927a8e4461edc Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Wed, 12 Apr 2023 15:25:21 +0200 Subject: [PATCH 02/11] Added dummy image --- .../spark-k8s/images/spark_overview.drawio.svg | 4 ++++ docs/modules/spark-k8s/pages/index.adoc | 18 +++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 docs/modules/spark-k8s/images/spark_overview.drawio.svg diff --git a/docs/modules/spark-k8s/images/spark_overview.drawio.svg b/docs/modules/spark-k8s/images/spark_overview.drawio.svg new file mode 100644 index 00000000..6124ff94 --- /dev/null +++ b/docs/modules/spark-k8s/images/spark_overview.drawio.svg @@ -0,0 +1,4 @@ + + + +
[drawio SVG markup stripped during extraction; the dummy diagram only carries the labels "Operator", "SparkApplication", "Job", "Driver Pod?", "Executor Pod?" and "Pod"]
\ No newline at end of file diff --git a/docs/modules/spark-k8s/pages/index.adoc b/docs/modules/spark-k8s/pages/index.adoc index dc2b8a6c..58368011 100644 --- a/docs/modules/spark-k8s/pages/index.adoc +++ b/docs/modules/spark-k8s/pages/index.adoc @@ -8,7 +8,23 @@ This is an operator for Kubernetes that can manage https://spark.apache.org/[Apa Follow the xref:getting_started/index.adoc[] guide to get started with Apache Spark using the Stackable Operator. The guide will lead you through the installation of the Operator and running your first Spark job on Kubernetes. -== RBAC +== How the Operator works + +The Stackable Operator for Apache Spark reads a _SparkApplication custom resource_ which you use to define your spark job/application. The Operator creates the relevant Kubernetes resources for the job to run. + +=== SparkApplication custom resource + +The SparkApplication resource is the main point of interaction with the Operator. An exhaustive list of options is given on the xref:crd-reference.adoc[] page. You can specify a custom image etc.etc (TODO) + +=== Kubernetes resources + +For every SparkApplication deployed to the cluster the Operator creates a Job + +image::spark_overview.drawio.svg[A diagram depicting the Kubernetes resources created by the operator] + +The Job ... + +=== RBAC The https://spark.apache.org/docs/latest/running-on-kubernetes.html#rbac[Spark-Kubernetes RBAC documentation] describes what is needed for `spark-submit` jobs to run successfully: minimally a role/cluster-role to allow the driver pod to create and manage executor pods. From a9b69979e92a235a62f5e3bb439bd8a06cc398a0 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 13 Apr 2023 11:40:46 +0200 Subject: [PATCH 03/11] updates --- .../images/spark_overview.drawio.svg | 2 +- docs/modules/spark-k8s/pages/index.adoc | 18 ++-- .../spark-k8s/pages/usage-guide/examples.adoc | 87 ++++++++++++++++++ .../spark-k8s/pages/usage-guide/index.adoc | 88 +------------------ docs/modules/spark-k8s/partials/nav.adoc | 1 + 5 files changed, 100 insertions(+), 96 deletions(-) create mode 100644 docs/modules/spark-k8s/pages/usage-guide/examples.adoc diff --git a/docs/modules/spark-k8s/images/spark_overview.drawio.svg b/docs/modules/spark-k8s/images/spark_overview.drawio.svg index 6124ff94..c74b32ca 100644 --- a/docs/modules/spark-k8s/images/spark_overview.drawio.svg +++ b/docs/modules/spark-k8s/images/spark_overview.drawio.svg @@ -1,4 +1,4 @@ -
[drawio SVG markup stripped; the removed line contained the dummy diagram labels "Operator", "SparkApplication", "Job", "Driver Pod?", "Executor Pod?" and "Pod"]
\ No newline at end of file +
[drawio SVG markup stripped; the added diagram shows the Spark Operator reading a SparkApplication &lt;name&gt; and creating a Job &lt;name&gt;, a spark-submit Pod, a driver Pod (&lt;name&gt;-...-driver-pod), executor Pods (&lt;name&gt;-...-exec-1/2/3), ConfigMaps (&lt;name&gt;-driver-pod-template, &lt;name&gt;-executor-pod-template, &lt;name&gt;-submit-job, &lt;name&gt;-...-driver-pod-spec-conf-map) and a ServiceAccount &lt;name&gt;, plus a legend for Operator, Resource and Custom Resource symbols]
\ No newline at end of file diff --git a/docs/modules/spark-k8s/pages/index.adoc b/docs/modules/spark-k8s/pages/index.adoc index 58368011..26a056d9 100644 --- a/docs/modules/spark-k8s/pages/index.adoc +++ b/docs/modules/spark-k8s/pages/index.adoc @@ -14,21 +14,25 @@ The Stackable Operator for Apache Spark reads a _SparkApplication custom resourc === SparkApplication custom resource -The SparkApplication resource is the main point of interaction with the Operator. An exhaustive list of options is given on the xref:crd-reference.adoc[] page. You can specify a custom image etc.etc (TODO) +The SparkApplication resource is the main point of interaction with the Operator. An exhaustive list of options is given on the xref:crd-reference.adoc[] page. + +Unlike other Operators, the Spark Operator does not have xref:concepts:roles-and-role-groups.adoc[roles]. === Kubernetes resources -For every SparkApplication deployed to the cluster the Operator creates a Job +For every SparkApplication deployed to the cluster the Operator creates a Job, A ServiceAccout and a few ConfigMaps. image::spark_overview.drawio.svg[A diagram depicting the Kubernetes resources created by the operator] -The Job ... +The Job runs `spark-submit` in a Pod which then creates a Spark driver Pod. The driver creates its own Executors based on the configuration in the SparkApplication. The Job, driver and executors all use the same image, which is configured in the SparkApplication resource. + +The two main ConfigMaps are the `-driver-pod-template` and `-executor-pod-template` which define how the driver and executor Pods should be created. === RBAC The https://spark.apache.org/docs/latest/running-on-kubernetes.html#rbac[Spark-Kubernetes RBAC documentation] describes what is needed for `spark-submit` jobs to run successfully: minimally a role/cluster-role to allow the driver pod to create and manage executor pods. -However, to add security, each `spark-submit` job launched by the spark-k8s operator will be assigned its own service account. +However, to add security, each `spark-submit` job launched by the spark-k8s operator will be assigned its own ServiceAccount. When the spark-k8s operator is installed via Helm, a cluster role named `spark-k8s-clusterrole` is created with pre-defined permissions. @@ -36,11 +40,9 @@ When a new Spark application is submitted, the operator creates a new service ac == Integrations -- Kafka -- S3 -- loading custom dependencies +You can read and write data from xref:usage-guide/s3.adoc[s3 buckets], load xref:usage-guide/job-dependencies[custom job dependencies]. Spark also supports easy integration with Apache Kafka which is also supported xref:kafka:index.adoc[on the Stackable Data Platform]. Have a look at the demos below to see it in action. -== Demos +== [[demos]]Demos The xref:stackablectl::demos/data-lakehouse-iceberg-trino-spark.adoc[] demo connects multiple components and datasets into a data Lakehouse. A Spark application with https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html[structured streaming] is used to stream data from Apache Kafka into the Lakehouse. 
diff --git a/docs/modules/spark-k8s/pages/usage-guide/examples.adoc b/docs/modules/spark-k8s/pages/usage-guide/examples.adoc new file mode 100644 index 00000000..916def85 --- /dev/null +++ b/docs/modules/spark-k8s/pages/usage-guide/examples.adoc @@ -0,0 +1,87 @@ += Examples + +The following examples have the following `spec` fields in common: + +- `version`: the current version is "1.0" +- `sparkImage`: the docker image that will be used by job, driver and executor pods. This can be provided by the user. +- `mode`: only `cluster` is currently supported +- `mainApplicationFile`: the artifact (Java, Scala or Python) that forms the basis of the Spark job. +- `args`: these are the arguments passed directly to the application. In the examples below it is e.g. the input path for part of the public New York taxi dataset. +- `sparkConf`: these list spark configuration settings that are passed directly to `spark-submit` and which are best defined explicitly by the user. Since the `SparkApplication` "knows" that there is an external dependency (the s3 bucket where the data and/or the application is located) and how that dependency should be treated (i.e. what type of credential checks are required, if any), it is better to have these things declared together. +- `volumes`: refers to any volumes needed by the `SparkApplication`, in this case an underlying `PersistentVolumeClaim`. +- `driver`: driver-specific settings, including any volume mounts. +- `executor`: executor-specific settings, including any volume mounts. + +Job-specific settings are annotated below. + +== Pyspark: externally located artifact and dataset + +[source,yaml] +---- +include::example$example-sparkapp-external-dependencies.yaml[] +---- + +<1> Job python artifact (external) +<2> Job argument (external) +<3> List of python job requirements: these will be installed in the pods via `pip` +<4> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in s3) +<5> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing +<6> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors + +== Pyspark: externally located dataset, artifact available via PVC/volume mount + +[source,yaml] +---- +include::example$example-sparkapp-image.yaml[] +---- + +<1> Job image: this contains the job artifact that will be retrieved from the volume mount backed by the PVC +<2> Job python artifact (local) +<3> Job argument (external) +<4> List of python job requirements: these will be installed in the pods via `pip` +<5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in an S3 store) + +== JVM (Scala): externally located artifact and dataset + +[source,yaml] +---- +include::example$example-sparkapp-pvc.yaml[] +---- + +<1> Job artifact located on S3. 
+<2> Job main class +<3> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in an S3 store, accessed without credentials) +<4> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing +<5> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors + +== JVM (Scala): externally located artifact accessed with credentials + +[source,yaml] +---- +include::example$example-sparkapp-s3-private.yaml[] +---- + +<1> Job python artifact (located in an S3 store) +<2> Artifact class +<3> S3 section, specifying the existing secret and S3 end-point (in this case, MinIO) +<4> Credentials referencing a secretClass (not shown in is example) +<5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources... +<6> ...in this case, in an S3 store, accessed with the credentials defined in the secret + +== JVM (Scala): externally located artifact accessed with job arguments provided via configuration map + +[source,yaml] +---- +include::example$example-configmap.yaml[] +---- +[source,yaml] +---- +include::example$example-sparkapp-configmap.yaml[] +---- +<1> Name of the configuration map +<2> Argument required by the job +<3> Job scala artifact that requires an input argument +<4> The volume backed by the configuration map +<5> The expected job argument, accessed via the mounted configuration map file +<6> The name of the volume backed by the configuration map that will be mounted to the driver/executor +<7> The mount location of the volume (this will contain a file `/arguments/job-args.txt`) diff --git a/docs/modules/spark-k8s/pages/usage-guide/index.adoc b/docs/modules/spark-k8s/pages/usage-guide/index.adoc index 1da42747..1d3463ec 100644 --- a/docs/modules/spark-k8s/pages/usage-guide/index.adoc +++ b/docs/modules/spark-k8s/pages/usage-guide/index.adoc @@ -1,89 +1,3 @@ = Usage guide -== Examples - -The following examples have the following `spec` fields in common: - -- `version`: the current version is "1.0" -- `sparkImage`: the docker image that will be used by job, driver and executor pods. This can be provided by the user. -- `mode`: only `cluster` is currently supported -- `mainApplicationFile`: the artifact (Java, Scala or Python) that forms the basis of the Spark job. -- `args`: these are the arguments passed directly to the application. In the examples below it is e.g. the input path for part of the public New York taxi dataset. -- `sparkConf`: these list spark configuration settings that are passed directly to `spark-submit` and which are best defined explicitly by the user. Since the `SparkApplication` "knows" that there is an external dependency (the s3 bucket where the data and/or the application is located) and how that dependency should be treated (i.e. what type of credential checks are required, if any), it is better to have these things declared together. -- `volumes`: refers to any volumes needed by the `SparkApplication`, in this case an underlying `PersistentVoulmeClaim`. -- `driver`: driver-specific settings, including any volume mounts. -- `executor`: executor-specific settings, including any volume mounts. - -Job-specific settings are annotated below. 
- -=== Pyspark: externally located artifact and dataset - -[source,yaml] ----- -include::example$example-sparkapp-external-dependencies.yaml[] ----- - -<1> Job python artifact (external) -<2> Job argument (external) -<3> List of python job requirements: these will be installed in the pods via `pip` -<4> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in s3) -<5> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing -<6> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors - -=== Pyspark: externally located dataset, artifact available via PVC/volume mount - -[source,yaml] ----- -include::example$example-sparkapp-image.yaml[] ----- - -<1> Job image: this contains the job artifact that will be retrieved from the volume mount backed by the PVC -<2> Job python artifact (local) -<3> Job argument (external) -<4> List of python job requirements: these will be installed in the pods via `pip` -<5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in an S3 store) - -=== JVM (Scala): externally located artifact and dataset - -[source,yaml] ----- -include::example$example-sparkapp-pvc.yaml[] ----- - -<1> Job artifact located on S3. -<2> Job main class -<3> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources (in this case, in an S3 store, accessed without credentials) -<4> the name of the volume mount backed by a `PersistentVolumeClaim` that must be pre-existing -<5> the path on the volume mount: this is referenced in the `sparkConf` section where the extra class path is defined for the driver and executors - -=== JVM (Scala): externally located artifact accessed with credentials - -[source,yaml] ----- -include::example$example-sparkapp-s3-private.yaml[] ----- - -<1> Job python artifact (located in an S3 store) -<2> Artifact class -<3> S3 section, specifying the existing secret and S3 end-point (in this case, MinIO) -<4> Credentials referencing a secretClass (not shown in is example) -<5> Spark dependencies: the credentials provider (the user knows what is relevant here) plus dependencies needed to access external resources... -<6> ...in this case, in an S3 store, accessed with the credentials defined in the secret - -=== JVM (Scala): externally located artifact accessed with job arguments provided via configuration map - -[source,yaml] ----- -include::example$example-configmap.yaml[] ----- -[source,yaml] ----- -include::example$example-sparkapp-configmap.yaml[] ----- -<1> Name of the configuration map -<2> Argument required by the job -<3> Job scala artifact that requires an input argument -<4> The volume backed by the configuration map -<5> The expected job argument, accessed via the mounted configuration map file -<6> The name of the volume backed by the configuration map that will be mounted to the driver/executor -<7> The mount location of the volume (this will contain a file `/arguments/job-args.txt`) +Learn how to load your own xref:usage-guide/job-dependencies.adoc[] or configure an xref:usage-guide/s3.adoc[S3 connection]. Have a look at the xref:usage-guide/examples.adoc[] to learn more about different operatoring modes. 
diff --git a/docs/modules/spark-k8s/partials/nav.adoc b/docs/modules/spark-k8s/partials/nav.adoc index ef0e7aa6..1bd0ead8 100644 --- a/docs/modules/spark-k8s/partials/nav.adoc +++ b/docs/modules/spark-k8s/partials/nav.adoc @@ -8,4 +8,5 @@ ** xref:spark-k8s:usage-guide/resources.adoc[] ** xref:spark-k8s:usage-guide/s3.adoc[] ** xref:spark-k8s:usage-guide/history-server.adoc[] +** xref:spark-k8s:usage-guide/examples.adoc[] * xref:spark-k8s:crd-reference.adoc[] \ No newline at end of file From 14e9a1d8f68506b2748ca214f48df7806a22ed9a Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Thu, 13 Apr 2023 13:58:06 +0200 Subject: [PATCH 04/11] Update the CRD reference. --- .../spark-k8s/pages/crd-reference.adoc | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/docs/modules/spark-k8s/pages/crd-reference.adoc b/docs/modules/spark-k8s/pages/crd-reference.adoc index abe4d1bc..5144f8b2 100644 --- a/docs/modules/spark-k8s/pages/crd-reference.adoc +++ b/docs/modules/spark-k8s/pages/crd-reference.adoc @@ -12,10 +12,10 @@ Below are listed the CRD fields that can be defined by the user: |`SparkApplication` |`metadata.name` -| Job name +|Application name |`spec.version` -|"1.0" +|Application version |`spec.mode` | `cluster` or `client`. Currently only `cluster` is supported @@ -42,7 +42,7 @@ Below are listed the CRD fields that can be defined by the user: |Arguments passed directly to the job artifact |`spec.s3connection` -|S3 connection specification. See the <> for more details. +|S3 connection specification. See the xref:concepts:s3.adoc[] for more details. |`spec.sparkConf` |A map of key/value strings that will be passed directly to `spark-submit` @@ -83,8 +83,11 @@ Below are listed the CRD fields that can be defined by the user: |`spec.driver.volumeMounts.mountPath` |Volume mount path -|`spec.driver.nodeSelector` -|A dictionary of labels to use for node selection when scheduling the driver N.B. this assumes there are no implicit node dependencies (e.g. `PVC`, `VolumeMount`) defined elsewhere. +|`spec.driver.affinity` +|Driver Pod placement affinity. See xref:pod-placement.adoc[] for details + +|`spec.driver.logging` +|Logging aggregation for the driver Pod. See xref:concepts:logging.adoc[] for details |`spec.executor.resources` |Resources specification for the executor Pods @@ -101,6 +104,16 @@ Below are listed the CRD fields that can be defined by the user: |`spec.executor.volumeMounts.mountPath` |Volume mount path -|`spec.executor.nodeSelector` -|A dictionary of labels to use for node selection when scheduling the executors N.B. this assumes there are no implicit node dependencies (e.g. `PVC`, `VolumeMount`) defined elsewhere. +|`spec.executor.affinity` +|Driver Pod placement affinity. See xref:pod-placement.adoc[] for details. + +|`spec.executor.logging` +|Logging aggregation for the executor Pods. See xref:concepts:logging.adoc[] for details + +|`spec.logFileDirectory.bucket` +|S3 bucket definition where applications should publish events for the Spark History server. + +|`spec.logFileDirectory.prefix` +|Prefix to use when storing events for the Spark History server. 
+ |=== From 071480ecb25d457293c86283d3d070cbc2dbd32e Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 13 Apr 2023 15:18:16 +0200 Subject: [PATCH 05/11] Update docs/modules/spark-k8s/pages/index.adoc Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> --- docs/modules/spark-k8s/pages/index.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/spark-k8s/pages/index.adoc b/docs/modules/spark-k8s/pages/index.adoc index 26a056d9..44916cfe 100644 --- a/docs/modules/spark-k8s/pages/index.adoc +++ b/docs/modules/spark-k8s/pages/index.adoc @@ -2,7 +2,7 @@ :description: The Stackable Operator for Apache Spark is a Kubernetes operator that can manage Apache Spark clusters. Learn about its features, resources, dependencies and demos, and see the list of supported Spark versions. :keywords: Stackable Operator, Apache Spark, Kubernetes, operator, data science, engineer, big data, CRD, StatefulSet, ConfigMap, Service, S3, demo, version -This is an operator for Kubernetes that can manage https://spark.apache.org/[Apache Spark] Kubernetes clusters. Apache Spark is a powerful open-source big data processing framework that allows for efficient and flexible distributed computing. Its in-memory processing and fault-tolerant architecture make it ideal for a variety of use cases, including batch processing, real-time streaming, machine learning, and graph processing. +This is an operator manages https://spark.apache.org/[Apache Spark] on Kubernetes clusters. Apache Spark is a powerful open-source big data processing framework that allows for efficient and flexible distributed computing. Its in-memory processing and fault-tolerant architecture make it ideal for a variety of use cases, including batch processing, real-time streaming, machine learning, and graph processing. == Getting Started From 9b45161fcefe8d3bd6fbe64571b74602714aa2b0 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 13 Apr 2023 15:18:32 +0200 Subject: [PATCH 06/11] Update docs/modules/spark-k8s/pages/usage-guide/index.adoc Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> --- docs/modules/spark-k8s/pages/usage-guide/index.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/spark-k8s/pages/usage-guide/index.adoc b/docs/modules/spark-k8s/pages/usage-guide/index.adoc index 1d3463ec..545c3432 100644 --- a/docs/modules/spark-k8s/pages/usage-guide/index.adoc +++ b/docs/modules/spark-k8s/pages/usage-guide/index.adoc @@ -1,3 +1,3 @@ = Usage guide -Learn how to load your own xref:usage-guide/job-dependencies.adoc[] or configure an xref:usage-guide/s3.adoc[S3 connection]. Have a look at the xref:usage-guide/examples.adoc[] to learn more about different operatoring modes. +Learn how to load your own xref:usage-guide/job-dependencies.adoc[] or configure an xref:usage-guide/s3.adoc[S3 connection] to access data. Have a look at the xref:usage-guide/examples.adoc[] to learn more about different usage scenarios. 
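Taken together, the fields from the CRD reference combine into manifests along the lines of the following sketch. The image tag is the example given in the reference table; the application name, main class and artifact location are placeholders and not taken from this patch:

[source,yaml]
----
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkApplication
metadata:
  name: spark-pi                                          # placeholder application name
spec:
  version: "1.0"
  mode: cluster
  sparkImage: docker.stackable.tech/stackable/spark-k8s:3.3.0-stackable0.3.0
  mainClass: org.apache.spark.examples.SparkPi            # placeholder entry point for a JVM artifact
  mainApplicationFile: s3a://my-bucket/spark-examples.jar # placeholder artifact location
  args:
    - "100"                                               # arguments passed directly to the job artifact
  executor:
    instances: 3
----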
From 0114054a8d783c11d7d5885e9d257e5a45eccd2c Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 13 Apr 2023 15:18:42 +0200 Subject: [PATCH 07/11] Update docs/modules/spark-k8s/pages/index.adoc Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> --- docs/modules/spark-k8s/pages/index.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/spark-k8s/pages/index.adoc b/docs/modules/spark-k8s/pages/index.adoc index 44916cfe..737b5a76 100644 --- a/docs/modules/spark-k8s/pages/index.adoc +++ b/docs/modules/spark-k8s/pages/index.adoc @@ -6,7 +6,7 @@ This is an operator manages https://spark.apache.org/[Apache Spark] on Kubernete == Getting Started -Follow the xref:getting_started/index.adoc[] guide to get started with Apache Spark using the Stackable Operator. The guide will lead you through the installation of the Operator and running your first Spark job on Kubernetes. +Follow the xref:getting_started/index.adoc[] guide to get started with Apache Spark using the Stackable Operator. The guide will lead you through the installation of the Operator and running your first Spark application on Kubernetes. == How the Operator works From 44f055e2ff873066cad926f2f2e4ee3b63ada942 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Thu, 13 Apr 2023 15:24:18 +0200 Subject: [PATCH 08/11] Removed some clutter from the diagram --- docs/modules/spark-k8s/images/spark_overview.drawio.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/spark-k8s/images/spark_overview.drawio.svg b/docs/modules/spark-k8s/images/spark_overview.drawio.svg index c74b32ca..567626e1 100644 --- a/docs/modules/spark-k8s/images/spark_overview.drawio.svg +++ b/docs/modules/spark-k8s/images/spark_overview.drawio.svg @@ -1,4 +1,4 @@ -
From 44f055e2ff873066cad926f2f2e4ee3b63ada942 Mon Sep 17 00:00:00 2001
From: Felix Hennig
Date: Thu, 13 Apr 2023 15:24:18 +0200
Subject: [PATCH 08/11] Removed some clutter from the diagram

---
 docs/modules/spark-k8s/images/spark_overview.drawio.svg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/modules/spark-k8s/images/spark_overview.drawio.svg b/docs/modules/spark-k8s/images/spark_overview.drawio.svg
index c74b32ca..567626e1 100644
--- a/docs/modules/spark-k8s/images/spark_overview.drawio.svg
+++ b/docs/modules/spark-k8s/images/spark_overview.drawio.svg
@@ -1,4 +1,4 @@
 [XML/SVG preamble lines unchanged, omitted]
-[single-line drawio SVG markup, omitted: overview diagram showing the Spark Operator, the SparkApplication <name> custom resource, the spark-submit Job and Pod, the driver Pod, the executor Pods <name>-...-exec-1/2/3, the ServiceAccount <name>, and the ConfigMaps <name>-driver-pod-template, <name>-executor-pod-template, <name>-submit-job and <name>-...-driver-pod-spec-conf-map]
\ No newline at end of file
+[single-line drawio SVG markup, omitted: the same diagram with the <name>-submit-job and <name>-...-driver-pod-spec-conf-map ConfigMaps removed]
\ No newline at end of file

From aa27408e18599e5242023fbeb0b8b84a08675305 Mon Sep 17 00:00:00 2001
From: Felix Hennig
Date: Mon, 17 Apr 2023 09:33:45 +0200
Subject: [PATCH 09/11] fixed links

---
 docs/modules/spark-k8s/pages/crd-reference.adoc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/modules/spark-k8s/pages/crd-reference.adoc b/docs/modules/spark-k8s/pages/crd-reference.adoc
index 5144f8b2..682b36e1 100644
--- a/docs/modules/spark-k8s/pages/crd-reference.adoc
+++ b/docs/modules/spark-k8s/pages/crd-reference.adoc
@@ -84,7 +84,7 @@ Below are listed the CRD fields that can be defined by the user:
 |Volume mount path
 
 |`spec.driver.affinity`
-|Driver Pod placement affinity. See xref:pod-placement.adoc[] for details
+|Driver Pod placement affinity. See xref:usage-guide/pod-placement.adoc[] for details
 
 |`spec.driver.logging`
 |Logging aggregation for the driver Pod. See xref:concepts:logging.adoc[] for details
@@ -105,7 +105,7 @@ Below are listed the CRD fields that can be defined by the user:
 |Volume mount path
 
 |`spec.executor.affinity`
-|Driver Pod placement affinity. See xref:pod-placement.adoc[] for details.
+|Executor Pod placement affinity. See xref:usage-guide/pod-placement.adoc[] for details.
 
 |`spec.executor.logging`
 |Logging aggregation for the executor Pods. See xref:concepts:logging.adoc[] for details
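The `affinity` fields whose links are fixed above are assumed to accept standard Kubernetes affinity terms; the pod-placement page remains the authoritative reference for the exact nesting. A rough sketch of pinning the driver to labelled nodes and spreading executors, with placeholder label keys and values:

[source,yaml]
----
spec:
  driver:
    affinity:                           # assumed to mirror the Kubernetes affinity schema
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: node-type        # placeholder node label
                  operator: In
                  values:
                    - spark
  executor:
    affinity:
      podAntiAffinity:                  # prefer spreading executor Pods across nodes
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 70
            podAffinityTerm:
              labelSelector:
                matchLabels:
                  app.kubernetes.io/instance: example-sparkapp   # placeholder selector
              topologyKey: kubernetes.io/hostname
----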

From e783cfc86b4d330bd0fb13e2170da4cc2de17cdd Mon Sep 17 00:00:00 2001
From: Felix Hennig
Date: Mon, 17 Apr 2023 11:57:11 +0200
Subject: [PATCH 10/11] Updated diagram to include Spark History Server

---
 docs/modules/spark-k8s/images/spark_overview.drawio.svg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/modules/spark-k8s/images/spark_overview.drawio.svg b/docs/modules/spark-k8s/images/spark_overview.drawio.svg
index 567626e1..57368ea4 100644
--- a/docs/modules/spark-k8s/images/spark_overview.drawio.svg
+++ b/docs/modules/spark-k8s/images/spark_overview.drawio.svg
@@ -1,4 +1,4 @@
 [XML/SVG preamble lines unchanged, omitted]
-[single-line drawio SVG markup, omitted: the Spark application overview diagram without the history server]
\ No newline at end of file
+[single-line drawio SVG markup, omitted: the diagram now split into a "Spark application" section (labels renamed from <name> to <app>) and a new "Spark history server" section showing a SparkHistoryServer <hist> custom resource, a role group <rg1> with a StatefulSet, Service, ConfigMap and Pods <hist>-node-<rg1>-0/1, and an S3 store that the application writes its logs to and the history server reads from]
\ No newline at end of file
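To relate the new diagram section to a manifest, a SparkHistoryServer with one role group might be declared roughly as follows. This is a sketch under assumptions: the field names under `logFileDirectory`, the bucket reference style and the `nodes` role key should be verified against the history-server usage guide, and all names, versions and replica counts are placeholders.

[source,yaml]
----
---
apiVersion: spark.stackable.tech/v1alpha1
kind: SparkHistoryServer
metadata:
  name: spark-history                         # the <hist> placeholder in the diagram
spec:
  image:
    productVersion: 3.3.0                     # placeholder Spark version
  logFileDirectory:                           # assumed layout: S3 location holding the event logs
    s3:
      prefix: eventlogs/
      bucket:
        reference: spark-history-s3-bucket    # placeholder S3 bucket definition
  nodes:                                      # the single history server role
    roleGroups:
      default:                                # the <rg1> placeholder in the diagram
        replicas: 1
----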

From 7bda95e8e6d0a1e4a96d6d433b73fa43480dab48 Mon Sep 17 00:00:00 2001
From: Felix Hennig
Date: Mon, 17 Apr 2023 12:23:05 +0200
Subject: [PATCH 11/11] Updated spark history text

---
 docs/modules/spark-k8s/pages/index.adoc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/modules/spark-k8s/pages/index.adoc b/docs/modules/spark-k8s/pages/index.adoc
index 737b5a76..0c12eec3 100644
--- a/docs/modules/spark-k8s/pages/index.adoc
+++ b/docs/modules/spark-k8s/pages/index.adoc
@@ -12,11 +12,13 @@ Follow the xref:getting_started/index.adoc[] guide to get started with Apache Sp
 
 The Stackable Operator for Apache Spark reads a _SparkApplication custom resource_ which you use to define your spark job/application. The Operator creates the relevant Kubernetes resources for the job to run.
 
-=== SparkApplication custom resource
+=== Custom resources
 
-The SparkApplication resource is the main point of interaction with the Operator. An exhaustive list of options is given on the xref:crd-reference.adoc[] page.
-
-Unlike other Operators, the Spark Operator does not have xref:concepts:roles-and-role-groups.adoc[roles].
+The Operator manages two custom resource kinds: The _SparkApplication_ and the _SparkHistoryServer_.
+
+The SparkApplication resource is the main point of interaction with the Operator. Unlike other Stackable Operator custom resources, the SparkApplication does not have xref:concepts:roles-and-role-groups.adoc[roles]. An exhaustive list of options is given on the xref:crd-reference.adoc[] page.
+
+The xref:usage-guide/history-server.adoc[SparkHistoryServer] does have a single `node` role. It is used to deploy a https://spark.apache.org/docs/latest/monitoring.html#viewing-after-the-fact[Spark history server]. It reads data from an S3 bucket that you configure. Your applications need to write their logs to the same bucket.
 
 === Kubernetes resources
 
@@ -28,6 +30,8 @@ The Job runs `spark-submit` in a Pod which then creates a Spark driver Pod. The
 
 The two main ConfigMaps are the `<name>-driver-pod-template` and `<name>-executor-pod-template` which define how the driver and executor Pods should be created.
 
+The Spark history server deploys like other Stackable-supported applications: A StatefulSet is created for every role group. A role group can have multiple replicas (Pods). A ConfigMap supplies the necessary configuration, and there is a Service to connect to.
+
 === RBAC
 
 The https://spark.apache.org/docs/latest/running-on-kubernetes.html#rbac[Spark-Kubernetes RBAC documentation] describes what is needed for `spark-submit` jobs to run successfully: minimally a role/cluster-role to allow the driver pod to create and manage executor pods.
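In plain Kubernetes terms, the minimal RBAC mentioned in that last paragraph amounts to something like the sketch below: a ServiceAccount for the driver plus a Role and RoleBinding that let it manage executor Pods and the ConfigMaps and Services Spark creates alongside them. Names, namespace and the exact verb/resource list are illustrative only.

[source,yaml]
----
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: spark-driver-sa                 # placeholder name
  namespace: default
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: spark-driver-role
  namespace: default
rules:
  - apiGroups: [""]
    resources: ["pods", "configmaps", "services"]
    verbs: ["create", "get", "list", "watch", "delete", "deletecollection"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: spark-driver-rb
  namespace: default
subjects:
  - kind: ServiceAccount
    name: spark-driver-sa
    namespace: default
roleRef:
  kind: Role
  name: spark-driver-role
  apiGroup: rbac.authorization.k8s.io
----

If you were wiring this up by hand rather than through the operator, the ServiceAccount would be handed to Spark via the `spark.kubernetes.authenticate.driver.serviceAccountName` property.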