Skip to content

Commit 348385f

Browse files
committed
Merge pull request #9 from SunZhaonan/master
Add Hive metadata ETL process
2 parents 059dfb1 + 0eddd8a commit 348385f

File tree

13 files changed

+789
-2
lines changed

13 files changed

+789
-2
lines changed

backend-service/app/actors/EtlJobFactory.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import java.util.Properties;
1717
import metadata.etl.EtlJob;
1818
import metadata.etl.dataset.hdfs.HdfsMetadataEtl;
19+
import metadata.etl.dataset.hive.HiveMetadataEtl;
1920
import metadata.etl.dataset.teradata.TeradataMetadataEtl;
2021
import metadata.etl.git.GitMetadataEtl;
2122
import metadata.etl.lineage.AzLineageMetadataEtl;
@@ -49,6 +50,8 @@ public static EtlJob getEtlJob(EtlJobName etlJobName, Integer refId, Long whExec
4950
return new LdapEtl(refId, whExecId, properties);
5051
case GIT_MEDATA_ETL:
5152
return new GitMetadataEtl(refId, whExecId, properties);
53+
case HIVE_DATASET_METADATA_ETL:
54+
return new HiveMetadataEtl(refId, whExecId, properties);
5255
default:
5356
throw new UnsupportedOperationException("Unsupported job type: " + etlJobName);
5457
}

backend-service/app/models/EtlJobName.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ public enum EtlJobName {
2525
HADOOP_DATASET_OWNER_ETL(EtlType.OWNER, RefIdType.DB),
2626
LDAP_USER_ETL(EtlType.LDAP, RefIdType.APP),
2727
GIT_MEDATA_ETL(EtlType.VCS, RefIdType.APP),
28+
HIVE_DATASET_METADATA_ETL(EtlType.DATASET, RefIdType.DB),
2829
;
2930

3031
EtlType etlType;

build.gradle

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,5 +69,7 @@ subprojects {
6969
"play" : "com.typesafe.play:play_2.10:2.2.4",
7070
"play_ebean" : "com.typesafe.play:play-java-ebean_2.10:2.2.4",
7171
"play_java_jdbc" : "com.typesafe.play:play-java-jdbc_2.10:2.2.4",
72-
"play_cache" : "com.typesafe.play:play-cache_2.10:2.2.4"]
72+
"play_cache" : "com.typesafe.play:play-cache_2.10:2.2.4",
73+
"hive_exec" : "org.apache.hive:hive-exec:1.2.1"
74+
]
7375
}

metadata-etl/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies {
2222
compile externalDependency.akka
2323
compile externalDependency.slf4j_api
2424
compile externalDependency.slf4j_log4j
25+
compile externalDependency.hive_exec
2526
compile files("extralibs/terajdbc4-15.00.00.20.jar")
2627
compile files("extralibs/tdgssconfig-15.00.00.20.jar")
2728
// compile externalDependency.jython
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package metadata.etl.dataset.hive;
2+
3+
import java.io.InputStream;
4+
import java.util.Properties;
5+
import metadata.etl.EtlJob;
6+
7+
8+
/**
9+
* Created by zsun on 11/16/15.
10+
*/
11+
public class HiveMetadataEtl extends EtlJob {
12+
13+
@Deprecated
14+
public HiveMetadataEtl(int dbId, long whExecId) {
15+
super(null, dbId, whExecId);
16+
}
17+
18+
public HiveMetadataEtl(int dbId, long whExecId, Properties prop) {
19+
super(null, dbId, whExecId, prop);
20+
}
21+
22+
23+
@Override
24+
public void extract()
25+
throws Exception {
26+
logger.info("In Hive metadata ETL, launch extract jython scripts");
27+
InputStream inputStream = classLoader.getResourceAsStream("jython/HiveExtract.py");
28+
//logger.info("before call scripts " + interpreter.getSystemState().argv);
29+
interpreter.execfile(inputStream);
30+
inputStream.close();
31+
}
32+
33+
@Override
34+
public void transform()
35+
throws Exception {
36+
logger.info("In Hive metadata ETL, launch transform jython scripts");
37+
InputStream inputStream = classLoader.getResourceAsStream("jython/HiveTransform.py");
38+
interpreter.execfile(inputStream);
39+
inputStream.close();
40+
}
41+
42+
@Override
43+
public void load()
44+
throws Exception {
45+
logger.info("In Hive metadata ETL, launch load jython scripts");
46+
InputStream inputStream = classLoader.getResourceAsStream("jython/HiveLoad.py");
47+
interpreter.execfile(inputStream);
48+
inputStream.close();
49+
}
50+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
package metadata.etl.dataset.hive;
2+
3+
import java.util.TreeSet;
4+
import org.apache.hadoop.hive.ql.tools.LineageInfo;
5+
import org.slf4j.Logger;
6+
import org.slf4j.LoggerFactory;
7+
8+
9+
/**
10+
* Created by zsun on 12/14/15.
11+
*/
12+
public class HiveViewDependency {
13+
final Logger logger = LoggerFactory.getLogger(getClass());
14+
LineageInfo lineageInfoTool;
15+
16+
public HiveViewDependency() {
17+
lineageInfoTool = new LineageInfo();
18+
}
19+
public String[] getViewDependency(String hiveQl) {
20+
try {
21+
lineageInfoTool.getLineageInfo(hiveQl);
22+
TreeSet<String> inputs = lineageInfoTool.getInputTableList();
23+
return inputs.toArray(new String[inputs.size()]);
24+
} catch (Exception e) {
25+
logger.error("Sql statements : \n" + hiveQl + "\n parse ERROR, will return an empty String array");
26+
logger.error(String.valueOf(e.getCause()));
27+
return new String[]{};
28+
}
29+
}
30+
}

metadata-etl/src/main/resources/application.properties.template

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,13 @@ ldap.group.search.return.attributes=
106106
git.host=
107107
git.project.whitelist=
108108

109+
# hive metastore
110+
hive.metastore.jdbc.url=
111+
hive.metastore.jdbc.driver=
112+
# NOTE(review): key is spelled "metstore" (sic), unlike the sibling hive.metastore.* keys —
# confirm what the jython scripts read before renaming to hive.metastore.username
hive.metstore.username=
113+
hive.metastore.password=
114+
115+
hive.schema_json_file=
116+
#hive.sample_csv=
117+
hive.schema_csv_file=
118+
hive.field_metadata=

metadata-etl/src/main/resources/jython/HdfsLoad.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def load_metadata(self):
3030
3131
LOAD DATA LOCAL INFILE '{source_file}'
3232
INTO TABLE stg_dict_dataset
33-
FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
33+
FIELDS TERMINATED BY '\Z' ESCAPED BY '\\'
3434
(`name`, `schema`, properties, fields, urn, source, sample_partition_full_path, source_created_time, source_modified_time)
3535
SET db_id = {db_id},
3636
-- TODO storage_type = 'Avro',

0 commit comments

Comments
 (0)