feat: Update Analytics Workload to Latest Version
* feat: hadoop 2.10.2 and 3.3.4
* feat: spark 3.3.2
* rename: move bench-analytics/4.0 to bench-analytics/
* update: data-analytics, separate dataset, separate master and slave, allow parameters (see the launch sketch below)
* update: graph-analytics spark ver 2.4.5 -> 3.3.2
* update: in-memory-analytics spark ver 2.4.5 -> 3.3.2, scala 2.11 -> 2.13
* rename: `bench-analytics` to `bench-analytics/latest`
* rm: old hadoop
* fix: CI
* fix: hadoop3 remove files from hadoop2
* doc: update data-analytics new launch parameters
* fix: graph-analytics sbt spark version
* fix: in-memory-analytics spark version and COPY in Dockerfile
* chore: rename the image tag
* chore: Enable CI for wikimedia page dataset
* fix(graph-analytics): entrypoint JAR path
* fix(analytics): revive the build container
* fix(graph-analytics): entrypoint is missing
* fix(graph-analytics): shell permission
* fix(graph-analytics): wrong application [no ci]
* doc(analytics): correct the document description
* doc: additional information
* doc(data-analytics): master ip address and wait
* doc: fix grammar

Co-authored-by: Cyan Lin <[email protected]>
1 parent e7dda14 · commit b45ce2c · 59 changed files, 1,109 additions and 460 deletions.
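The data-analytics rework splits the deployment into a master and slaves with explicit launch parameters. A minimal launch sketch under that reading, where the --master, --slave, and --master-ip flags are assumptions drawn from the commit message rather than from the updated docs (--yarn-cores and --mapreduce-mem are defined in files/docker-entrypoint.py below):

# Hypothetical two-node launch; verify the master/slave flag names
# against the updated benchmark README before use.
docker run -d --net host --name data-master cloudsuite/data-analytics \
    --yarn-cores=8 --mapreduce-mem=2096 --master
docker run -d --net host --name data-slave01 cloudsuite/data-analytics \
    --slave --master-ip=<MASTER_IP>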
Spell-check word list (1 addition):

@@ -49,6 +49,7 @@ github
 GraphX
 grouplens
 hadoop
+HDFS
 HHVM
 HipHop
 hostname
3 files deleted (contents not shown).
benchmarks/data-analytics/latest/Dockerfile (new file, 20 additions; path inferred from this commit's file list):

@@ -0,0 +1,20 @@
FROM cloudsuite/hadoop:2.10.2

ENV MAHOUT_VERSION 14.1
ENV MAHOUT_HOME /opt/mahout-${MAHOUT_VERSION}
RUN mkdir ${MAHOUT_HOME} /user

# Install Mahout
RUN set -x \
    && URL=https://downloads.apache.org/mahout/${MAHOUT_VERSION}/apache-mahout-distribution-${MAHOUT_VERSION}.tar.gz \
    && curl ${URL} | tar -xzC ${MAHOUT_HOME}

COPY files/benchmark.sh /root/
COPY files/docker-entrypoint.py /root/
COPY files/mahout-examples-0.13.0-job.jar ${MAHOUT_HOME}/.
COPY files/categories /user/

RUN chmod +x /root/benchmark.sh /root/docker-entrypoint.py \
    && ln -s /root/benchmark.sh /bin/benchmark

ENTRYPOINT ["/root/docker-entrypoint.py"]
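A minimal local build sketch, assuming the Dockerfile sits in benchmarks/data-analytics/latest/ next to the files/ directory it copies from (the tag is illustrative):

# Build from the benchmark directory so the files/ COPY sources resolve.
cd benchmarks/data-analytics/latest
docker build -t cloudsuite/data-analytics:latest .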
benchmarks/data-analytics/latest/files/benchmark.sh (new file, 37 additions):

@@ -0,0 +1,37 @@
#!/bin/bash

RED='\033[0;31m'
RESET='\033[0m'

source ~/.bashrc

# Ingest the Wikipedia dataset into HDFS on the first run only.
echo -e "Mahout: checking HDFS for /user/data/wiki"
${HADOOP_HOME}/bin/hdfs dfs -test -e /user/data/wiki
if [ $? -ne 0 ]; then
    echo -e "Mahout: copying /data/wiki into HDFS under /user/data"
    ${HADOOP_HOME}/bin/hdfs dfs -mkdir -p /user/data
    ${HADOOP_HOME}/bin/hdfs dfs -put /data/wiki /user/data/
fi

START=$(($(date +"%s%N")/1000000))

# Create sequence files from wiki
echo -e "${RED}Mahout: seqwiki${RESET}"
${MAHOUT_HOME}/bin/mahout seqwiki -c /user/categories -i /user/data/wiki -o /user/data/wiki-seq
# Convert sequence files to vectors using bigrams
echo -e "${RED}Mahout: seq2sparse${RESET}"
${MAHOUT_HOME}/bin/mahout seq2sparse -i /user/data/wiki-seq -o /user/data/wiki-vectors -lnorm -nv -wt tfidf -ow -ng 2
# Create training and holdout sets with a random 80-20 split of the generated vector dataset
echo -e "${RED}Mahout: split${RESET}"
${MAHOUT_HOME}/bin/mahout split -i /user/data/wiki-vectors/tfidf-vectors --trainingOutput /user/data/training \
    --testOutput /user/data/testing -rp 20 -ow -seq -xm sequential
# Train the naive Bayes model
echo -e "${RED}Mahout: trainnb${RESET}"
${MAHOUT_HOME}/bin/mahout trainnb -i /user/data/training -o /user/data/model -li /user/data/labelindex -ow -c
# Test on the holdout set
echo -e "${RED}Mahout: testnb${RESET}"
${MAHOUT_HOME}/bin/mahout testnb -i /user/data/testing -m /user/data/model -l /user/data/labelindex -ow -o /user/data/output -seq

END=$(($(date +"%s%N")/1000000))
TIME=$(($END - $START))
echo -e "\nBenchmark time: ${TIME}ms"
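Because the Dockerfile above links this script to /bin/benchmark, the pipeline can be started from the host once a container is running; the container name data-master is a placeholder:

# 'benchmark' resolves to /root/benchmark.sh via the /bin/benchmark symlink.
docker exec -it data-master benchmark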
File renamed without changes.
benchmarks/data-analytics/latest/files/docker-entrypoint.py (new file, 19 additions):

@@ -0,0 +1,19 @@
#!/usr/bin/env python3

import os
import sys
import subprocess
import argparse

args = sys.argv[1:]
parser = argparse.ArgumentParser()
parser.add_argument("--yarn-cores", help="YARN: number of cores for YARN", default=8)
parser.add_argument("--mapreduce-mem", help="MAP_REDUCE: memory (MB) per MapReduce worker", default=2096)

args_parsed, unknown = parser.parse_known_args()

# Size the YARN memory pool for one MapReduce worker per core,
# plus a fixed 812 MB on top.
yarn_max_mem = int(args_parsed.mapreduce_mem) * int(args_parsed.yarn_cores)
args.append("--yarn-mem=" + str(yarn_max_mem + 812))

# Forward the original arguments, plus the computed --yarn-mem,
# to the Hadoop launch script.
print(str(args))
subprocess.call(['./hadoop-start.py'] + args)
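With the defaults above (8 cores, 2096 MB per MapReduce worker), the entrypoint forwards --yarn-mem=17580 to hadoop-start.py. A quick shell check of that arithmetic:

# 2096 MB per worker * 8 cores = 16768 MB; + 812 MB = 17580 MB
echo $((2096 * 8 + 812))    # prints 17580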
File renamed without changes.
2 files deleted (contents not shown).
benchmarks/graph-analytics/4.0/benchmark/src/main/scala/GraphAnalytics.scala: deleted (54 deletions).