feat: Update Analytics Workload to Latest Version
* feat: hadoop 2.10.2 and 3.3.4

* feat: spark 3.3.2

* rename: move bench-analytics/4.0 to bench-analytics/

* update: data-analytics: separate dataset, separate master and slave, allow parameters

* update: graph-analytics spark ver 2.4.5 -> 3.3.2

* update: in-memory-analytics spark ver 2.4.5 -> 3.3.2, scala 2.11 -> 2.13

* rename: `bench-analytics` to `bench-analytics/latest`

* rm: old hadoop

* fix: CI

* fix: hadoop3 remove files from hadoop2

* doc: update data-analytics new launch parameters

* fix: graph-analytics sbt spark version

* fix: in-memory-analytics spark version and COPY in Dockerfile

* chore: rename the image tag

* chore: Enable CI for wikimedia page dataset

* fix(graph-analytics): entrypoint JAR path

* fix(analytics): revive the build container

* fix(graph-analytics): entrypoint is missing

* fix(graph-analytics): shell permission

* fix(graph-analytics): wrong application [no ci]

* doc(analytics): correct the document description

* doc: additional information

* doc(data-analytics): master ip address and wait

* doc: fix grammar

---------

Co-authored-by: Cyan Lin <[email protected]>
UlisesLuzius and xusine authored Mar 16, 2023
1 parent e7dda14 · commit b45ce2c
Showing 59 changed files with 1,109 additions and 460 deletions.
38 changes: 31 additions & 7 deletions .github/workflows/build-images.yaml
@@ -157,7 +157,7 @@ jobs:
is_parent_modified: ${{ steps.set_is_parent_modified.outputs.is_parent_modified }}
strategy:
matrix:
tag: ["2.4.5"]
tag: ["3.3.2"]
platform: ["linux/amd64,linux/arm64,linux/riscv64"]
steps:
- name: checkout
@@ -211,7 +211,7 @@ jobs:
is_parent_modified: ${{ steps.set_is_parent_modified.outputs.is_parent_modified }}
strategy:
matrix:
tag: ["2.10.1"]
tag: ["2.10.2", "3.3.4"]
platform: ["linux/amd64,linux/arm64,linux/riscv64"]
steps:
- name: checkout
@@ -236,7 +236,7 @@ jobs:
DH_REPO: "cloudsuite/${{ github.job }}"
strategy:
matrix:
tag: ["4.0"]
tag: ["latest"]
platform: ["linux/amd64,linux/arm64,linux/riscv64"]
steps:
- name: checkout
@@ -308,7 +308,7 @@ jobs:
DH_REPO: "cloudsuite/${{ github.job }}"
strategy:
matrix:
tag: ["4.0"]
tag: ["latest"]
platform: ["linux/amd64,linux/arm64,linux/riscv64"]
steps:
- name: checkout
@@ -332,7 +332,7 @@ jobs:
DH_REPO: "cloudsuite/${{ github.job }}"
strategy:
matrix:
tag: ["4.0"]
tag: ["latest"]
platform: ["linux/amd64,linux/arm64,linux/riscv64"]
steps:
- name: checkout
@@ -426,7 +426,7 @@ jobs:
DH_REPO: "cloudsuite/${{ github.job }}"
strategy:
matrix:
tag: ["4.0"]
tag: ["latest"]
platform: ["linux/amd64,linux/arm64,linux/riscv64"]
steps:
- name: checkout
@@ -449,7 +449,31 @@ jobs:
DH_REPO: "cloudsuite/${{ github.job }}"
strategy:
matrix:
tag: ["4.0"]
tag: ["latest"]
platform: ["linux/amd64,linux/arm64,linux/riscv64"]
steps:
- name: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- if: ${{ needs.base-os.outputs.is_parent_modified == 'true' }}
run: echo "IS_PARENT_MODIFIED=true" >> $GITHUB_ENV
- name: build and push
run: "./.github/scripts/build-images.sh"
env:
IMG_TAG: "${{ matrix.tag }}"
DF_PATH: "./datasets/${{ github.job }}/${{ matrix.tag }}"
DBX_PLATFORM: ${{ matrix.platform }}


wikimedia-pages-dataset:
runs-on: ubuntu-latest
needs: base-os
env:
DH_REPO: "cloudsuite/${{ github.job }}"
strategy:
matrix:
tag: ["latest"]
platform: ["linux/amd64,linux/arm64,linux/riscv64"]
steps:
- name: checkout
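For orientation, the matrix values above reach the build script through each job's env block. A minimal sketch of the likely buildx step follows; the actual ./.github/scripts/build-images.sh is not shown in this diff, so the exact invocation is an assumption:

#!/bin/bash
# Hedged sketch of the build step (assumed, not taken from this diff).
# IMG_TAG, DF_PATH, DBX_PLATFORM, and DH_REPO arrive via the workflow env block,
# e.g. IMG_TAG=latest, DF_PATH=./benchmarks/data-analytics/latest,
# DH_REPO=cloudsuite/data-analytics, DBX_PLATFORM=linux/amd64,linux/arm64,linux/riscv64.
docker buildx build \
  --platform "${DBX_PLATFORM}" \
  --tag "${DH_REPO}:${IMG_TAG}" \
  --push \
  "${DF_PATH}"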
1 change: 1 addition & 0 deletions .wordlist.txt
@@ -49,6 +49,7 @@ github
GraphX
grouplens
hadoop
+ HDFS
HHVM
HipHop
hostname
30 changes: 0 additions & 30 deletions benchmarks/data-analytics/4.0/Dockerfile

This file was deleted.

35 changes: 0 additions & 35 deletions benchmarks/data-analytics/4.0/files/benchmark.sh

This file was deleted.

20 changes: 0 additions & 20 deletions benchmarks/data-analytics/4.0/files/yarn-site.xml

This file was deleted.

20 changes: 20 additions & 0 deletions benchmarks/data-analytics/latest/Dockerfile
@@ -0,0 +1,20 @@
FROM cloudsuite/hadoop:2.10.2

ENV MAHOUT_VERSION 14.1
ENV MAHOUT_HOME /opt/mahout-${MAHOUT_VERSION}
RUN mkdir ${MAHOUT_HOME} /user

# Install Mahout
RUN set -x \
&& URL=https://downloads.apache.org/mahout/${MAHOUT_VERSION}/apache-mahout-distribution-${MAHOUT_VERSION}.tar.gz \
&& curl ${URL} | tar -xzC ${MAHOUT_HOME}

COPY files/benchmark.sh /root/
COPY files/docker-entrypoint.py /root/
COPY files/mahout-examples-0.13.0-job.jar ${MAHOUT_HOME}/.
COPY files/categories /user/

RUN chmod +x /root/benchmark.sh /root/docker-entrypoint.py \
&& ln -s /root/benchmark.sh /bin/benchmark

ENTRYPOINT ["/root/docker-entrypoint.py"]
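To build the image locally from the renamed directory, something along these lines should work; the cloudsuite/data-analytics tag mirrors the CI's DH_REPO convention and is an assumption here:

# Build from the repository root; the tag is illustrative:
docker build -t cloudsuite/data-analytics:latest benchmarks/data-analytics/latest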
37 changes: 37 additions & 0 deletions benchmarks/data-analytics/latest/files/benchmark.sh
@@ -0,0 +1,37 @@
#!/bin/bash

RED='\033[0;31m'
RESET='\033[0m'

source ~/.bashrc

# Upload the wiki dataset into HDFS on the first run; skip if it is already there.
echo -e "Mahout: check for dataset in HDFS"
${HADOOP_HOME}/bin/hdfs dfs -test -e /user/data/wiki
if [ $? -ne 0 ]; then
  echo -e "Mahout: make dir for /user/data/wiki"
  ${HADOOP_HOME}/bin/hdfs dfs -mkdir -p /user/data
  ${HADOOP_HOME}/bin/hdfs dfs -put /data/wiki /user/data/
fi

START=$(($(date +"%s%N")/1000000))

# Create sequence files from wiki
echo -e "${RED}Mahout: seqwiki${RESET}"
${MAHOUT_HOME}/bin/mahout seqwiki -c /user/categories -i /user/data/wiki -o /user/data/wiki-seq
# Convert sequence files to vectors using bigrams
echo -e "${RED}Mahout: seq2sparse${RESET}"
${MAHOUT_HOME}/bin/mahout seq2sparse -i /user/data/wiki-seq -o /user/data/wiki-vectors -lnorm -nv -wt tfidf -ow -ng 2
# Create training and holdout sets with a random 80-20 split of the generated vector dataset
echo -e "${RED}Mahout: split${RESET}"
${MAHOUT_HOME}/bin/mahout split -i /user/data/wiki-vectors/tfidf-vectors --trainingOutput /user/data/training \
    --testOutput /user/data/testing -rp 20 -ow -seq -xm sequential
# Train the naive Bayes model
echo -e "${RED}Mahout: trainnb${RESET}"
${MAHOUT_HOME}/bin/mahout trainnb -i /user/data/training -o /user/data/model -li /user/data/labelindex -ow -c
# Test on the holdout set
echo -e "${RED}Mahout: testnb${RESET}"
${MAHOUT_HOME}/bin/mahout testnb -i /user/data/testing -m /user/data/model -l /user/data/labelindex -ow -o /user/data/output -seq

END=$(($(date +"%s%N")/1000000))
TIME=$(($END - $START))
echo -e "\nBenchmark time: ${TIME}ms"

19 changes: 19 additions & 0 deletions benchmarks/data-analytics/latest/files/docker-entrypoint.py
@@ -0,0 +1,19 @@
#!/usr/bin/env python3

import sys
import subprocess
import argparse

# Forward all container arguments to hadoop-start.py, adding a derived --yarn-mem.
args = sys.argv[1:]
parser = argparse.ArgumentParser()
parser.add_argument("--yarn-cores", help="YARN: number of cores for yarn", default=8)
parser.add_argument("--mapreduce-mem", help="MAP_REDUCE: memory per mapreduce worker", default=2096)

# parse_known_args lets flags meant for hadoop-start.py pass through untouched.
args_parsed, unknown = parser.parse_known_args()

# Total YARN memory: per-worker memory times core count, plus fixed headroom.
yarn_max_mem = int(args_parsed.mapreduce_mem) * int(args_parsed.yarn_cores)
args.append("--yarn-mem=" + str(yarn_max_mem + 812))

print(str(args))
subprocess.call(['./hadoop-start.py'] + args)
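As a worked example of the memory calculation: with --yarn-cores=4 and --mapreduce-mem=2048, yarn_max_mem is 2048 * 4 = 8192, so the script forwards --yarn-mem=9004 (8192 plus 812 of headroom); the defaults (8 cores, 2096 MB) yield --yarn-mem=17580. A hypothetical launch follows; the image name and the --master flag echo the commit's master/slave split but are assumptions here:

# Hypothetical master launch; the entrypoint computes and appends --yarn-mem=9004
# before delegating everything to hadoop-start.py:
docker run -d --name data-master cloudsuite/data-analytics:latest \
  --master --yarn-cores=4 --mapreduce-mem=2048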
19 changes: 0 additions & 19 deletions benchmarks/graph-analytics/4.0/benchmark/run_benchmark.sh

This file was deleted.

7 changes: 0 additions & 7 deletions benchmarks/graph-analytics/4.0/benchmark/simple.sbt

This file was deleted.

This file was deleted.

3 changes: 0 additions & 3 deletions benchmarks/graph-analytics/4.0/files/entrypoint.sh

This file was deleted.

