Skip to content

Commit

Permalink
feat(ci): add spark smoke test (#4158)
Browse files Browse the repository at this point in the history
  • Loading branch information
MugdhaHardikar-GSLab authored Feb 23, 2022
1 parent 78cb194 commit ede31c4
Show file tree
Hide file tree
Showing 50 changed files with 2,761 additions and 2 deletions.
55 changes: 55 additions & 0 deletions .github/workflows/spark-smoke-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
name: spark smoke test
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
  push:
    branches:
      - master
    paths:
      - "metadata_models/**"
      - "metadata-integration/java/datahub-client/**"
      # NOTE: needs the /** glob like its siblings — a bare directory path
      # never matches a changed file in GitHub path filters.
      - "metadata-integration/java/spark-lineage/**"
  pull_request:
    branches:
      - master
    paths:
      - "metadata_models/**"
      - "metadata-integration/java/datahub-client/**"
      - "metadata-integration/java/spark-lineage/**"
  release:
    types: [published, edited]

jobs:
  spark-smoke-test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up JDK 1.8
        uses: actions/setup-java@v1
        with:
          # Quoted: an unquoted 1.8 is parsed as a YAML float.
          java-version: "1.8"
      - uses: actions/setup-python@v2
        with:
          python-version: "3.6"
      - name: Install dependencies
        run: ./metadata-ingestion/scripts/install_deps.sh
      - name: Gradle build
        # Skip checks, docs and unit tests — this job only runs the smoke test.
        run: ./gradlew build -x check -x docs-website:build -x test
      - name: Smoke test
        run: ./gradlew metadata-integration:java:spark-lineage:integrationTest
      # Upload test reports even when the smoke test fails.
      - uses: actions/upload-artifact@v2
        if: always()
        with:
          name: Test Results (smoke tests)
          path: |
            **/build/reports/tests/test/**
            **/build/test-results/test/**
            **/junit.*.xml
      - name: Slack failure notification
        if: failure() && github.event_name == 'push'
        uses: kpritam/slack-job-status-action@v1
        with:
          job-status: ${{ job.status }}
          slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }}
          channel: github-activities

6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,14 @@ MANIFEST
**/spark-lineage/**/out.csv/
.vscode

# spark smoke test
smoke-test/spark-smoke-test/docker/workspace/
smoke-test/spark-smoke-test/__pycache__/

# cypress integration test generated files
**/cypress/videos
**/cypress/screenshots
**/cypress/node_modules

# Metadata Ingestion Generated
metadata-ingestion/generated/**

.remote*
4 changes: 4 additions & 0 deletions metadata-integration/java/spark-lineage/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ assemble {
dependsOn shadowJar
}

// Runs the dockerized Spark smoke test; the script path is resolved
// relative to this project's directory (Exec's default working dir).
task integrationTest(type: Exec) {
  commandLine "spark-smoke-test/smoke.sh"
}

task sourcesJar(type: Jar) {
classifier 'sources'
from sourceSets.main.allJava
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
**/metastore_db/
**/derby.log
spark-warehouse
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Base image for the Spark smoke-test cluster: JDK 8 + Python on top of
# which Spark 2.4.8 (Hadoop 2.7 binaries) is installed under /usr/bin.
FROM rappdw/docker-java-python:openjdk1.8.0_171-python3.6.6

ARG shared_workspace=/opt/workspace

ENV SHARED_WORKSPACE=${shared_workspace}

# -- Layer: Apache Spark

ARG spark_version=2.4.8
ARG hadoop_version=2.7

# Single layer: install curl, fetch and unpack Spark, then clean the apt
# lists so the intermediate downloads don't bloat the image.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends curl && \
    curl https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz && \
    tar -xf spark.tgz && \
    mv spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/ && \
    mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs && \
    rm spark.tgz && \
    rm -rf /var/lib/apt/lists/*

ENV SPARK_HOME=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}
ENV SPARK_MASTER_HOST=spark-master
ENV SPARK_MASTER_PORT=7077
# NOTE(review): base image advertises python3.6.6 yet PySpark is pointed at
# python2.7 — confirm python2.7 exists in the image / is intended.
ENV PYSPARK_PYTHON=python2.7
ENV PATH=$PATH:$SPARK_HOME/bin

COPY workspace $SHARED_WORKSPACE

WORKDIR ${SPARK_HOME}

Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM spark-base

# -- Runtime

ARG spark_master_web_ui=8080

# Expose the master web UI and the cluster port (7077, set in spark-base).
EXPOSE ${spark_master_web_ui} ${SPARK_MASTER_PORT}
# Shell form is required here so the stdout redirect to the log file works.
ENTRYPOINT bin/spark-class org.apache.spark.deploy.master.Master >> logs/spark-master.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM spark-base

# -- Runtime

ARG spark_worker_web_ui=8081

EXPOSE ${spark_worker_web_ui}
# Worker registers against the master address baked into spark-base
# (SPARK_MASTER_HOST:SPARK_MASTER_PORT); shell form allows the log redirect.
ENTRYPOINT bin/spark-class org.apache.spark.deploy.worker.Worker spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT} >> logs/spark-worker.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
FROM spark-base

# -- Runtime

WORKDIR ${SHARED_WORKSPACE}

# NOTE(review): fixed 30s sleep is a fragile wait for the master/workers to
# come up — consider a readiness poll instead. Runs the Python lineage test
# then the Java lineage test against the shared spark-docker.conf.
ENTRYPOINT sleep 30 && \
    cd python-spark-lineage-test && \
    ./python_test_run.sh $SPARK_HOME ../spark-docker.conf && \
    cd ../java-spark-lineage-test && ./java_test_run.sh $SPARK_HOME ../spark-docker.conf



Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

#Remove old configuration
rm -rf workspace

#Copy needed files
mkdir workspace

ls ../../

cp ../../build/libs/datahub-spark-lineage* workspace/
cp ../spark-docker.conf workspace/
cp -a ../python-spark-lineage-test workspace/
mkdir workspace/java-spark-lineage-test
cp ../test-spark-lineage/java_test_run.sh workspace/java-spark-lineage-test/

mkdir -p workspace/java-spark-lineage-test/build/libs/
cp ../test-spark-lineage/build/libs/test-spark-lineage.jar workspace/java-spark-lineage-test/build/libs/

cp -a ../resources workspace

# create docker images
docker build -f SparkBase.Dockerfile -t spark-base .
docker build -f SparkMaster.Dockerfile -t spark-master .
docker build -f SparkSlave.Dockerfile -t spark-slave .
docker build -f SparkSubmit.Dockerfile -t spark-submit .

Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
version: "3.6"

services:
  spark-master:
    image: spark-master
    container_name: spark-master
    ports:
      # Quoted: unquoted HOST:CONTAINER pairs hit YAML's number parsing.
      - "8090:8080"
      - "7077:7077"
  spark-worker-1:
    image: spark-slave
    container_name: spark-worker-1
    ports:
      - "8091:8081"
    depends_on:
      - spark-master
  spark-worker-2:
    image: spark-slave
    container_name: spark-worker-2
    ports:
      - "8092:8081"
    depends_on:
      - spark-master

# Join the pre-existing datahub network so the tests can reach GMS.
networks:
  default:
    external: true
    name: datahub_network

Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{
"urn:li:dataFlow:(spark,JavaHdfsIn2HdfsOut1,spark_spark-master_7077)": {
"value": {
"com.linkedin.metadata.snapshot.DataFlowSnapshot": {
"urn": "urn:li:dataFlow:(spark,JavaHdfsIn2HdfsOut1,spark_spark-master_7077)",
"aspects": [
{
"com.linkedin.metadata.key.DataFlowKey": {
"orchestrator": "spark",
"cluster": "spark_spark-master_7077",
"flowId": "JavaHdfsIn2HdfsOut1"
}
},
{
"com.linkedin.datajob.DataFlowInfo": {
"name": "JavaHdfsIn2HdfsOut1",
"customProperties": {
"sparkUser": "root",
"appName": "JavaHdfsIn2HdfsOut1"
}
}
},
{
"com.linkedin.common.DataPlatformInstance": {
"platform": "urn:li:dataPlatform:spark"
}
},
{
"com.linkedin.common.BrowsePaths": {
"paths": [
"/spark/spark_spark-master_7077/javahdfsin2hdfsout1"
]
}
}
]
}
}
},
"urn:li:dataJob:(urn:li:dataFlow:(spark,JavaHdfsIn2HdfsOut1,spark_spark-master_7077),QueryExecId_4)": {
"value": {
"com.linkedin.metadata.snapshot.DataJobSnapshot": {
"urn": "urn:li:dataJob:(urn:li:dataFlow:(spark,JavaHdfsIn2HdfsOut1,spark_spark-master_7077),QueryExecId_4)",
"aspects": [
{
"com.linkedin.metadata.key.DataJobKey": {
"jobId": "QueryExecId_4",
"flow": "urn:li:dataFlow:(spark,JavaHdfsIn2HdfsOut1,spark_spark-master_7077)"
}
},
{
"com.linkedin.common.DataPlatformInstance": {
"platform": "urn:li:dataPlatform:spark"
}
},
{
"com.linkedin.datajob.DataJobInfo": {
"name": "csv at HdfsIn2HdfsOut1.java:27",
"type": {
"string": "sparkJob"
},
"customProperties": {
"SQLQueryId": "4",
"appName": "JavaHdfsIn2HdfsOut1",
"description": "csv at HdfsIn2HdfsOut1.java:27",
"queryPlan": "InsertIntoHadoopFsRelationCommand file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut1/out.csv, false, CSV, Map(path -> ../resources/data/JavaHdfsIn2HdfsOut1/out.csv), Overwrite, [a, b, c, d]\n+- Project [c1#11 AS a#32, c2#12 AS b#33, c1#27 AS c#34, c2#28 AS d#35]\n +- Join Inner, (id#10 = id#26)\n :- Filter isnotnull(id#10)\n : +- Relation[id#10,c1#11,c2#12] csv\n +- Filter isnotnull(id#26)\n +- Relation[id#26,c1#27,c2#28] csv\n"
}
}
},
{
"com.linkedin.common.BrowsePaths": {
"paths": [
"/spark/javahdfsin2hdfsout1/queryexecid_4"
]
}
},
{
"com.linkedin.datajob.DataJobInputOutput": {
"inputDatasets": [
"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
],
"outputDatasets": [
"urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut1/out.csv,PROD)"
]
}
}
]
}
}
}
}
Loading

0 comments on commit ede31c4

Please sign in to comment.