-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
723 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,3 +54,11 @@ License: Apache 2.0 | |
This product includes/uses JsonPath (http://goessner.net/articles/JsonPath/) | ||
Copyright (c) 2011 the original author or authors | ||
License: Apache 2.0 | ||
|
||
This product includes/uses jsoup (http://jsoup.org/) | ||
Copyright © 2009 - 2013 Jonathan Hedley (jonathan@hedley.net) | ||
License: MIT | ||
|
||
This product includes/uses JGit (https://eclipse.org/jgit/) | ||
Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors | ||
License: Eclipse Distribution License |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
-- Git commit metadata: one row per (repository, file path, commit).
-- created_time / modified_time are epoch seconds maintained by the load step.
CREATE TABLE `source_code_commit_info` (
  `app_id`          SMALLINT(5) UNSIGNED DEFAULT NULL,
  `repository_urn`  VARCHAR(300) CHARACTER SET latin1 NOT NULL COMMENT 'the git repo urn',
  `commit_id`       VARCHAR(50) CHARACTER SET latin1 NOT NULL COMMENT 'the sha-1 hash of the commit',
  `file_path`       VARCHAR(600) CHARACTER SET latin1 NOT NULL COMMENT 'the path to the file',
  `file_name`       VARCHAR(127) NOT NULL COMMENT 'the file name',
  `commit_time`     INT UNSIGNED COMMENT 'the commit time',
  `committer_name`  VARCHAR(128) NOT NULL COMMENT 'name of the committer',
  `committer_email` VARCHAR(128) DEFAULT NULL COMMENT 'email of the committer',
  `author_name`     VARCHAR(128) NOT NULL COMMENT 'name of the author',
  `author_email`    VARCHAR(128) NOT NULL COMMENT 'email of the author',
  `message`         VARCHAR(1024) NOT NULL COMMENT 'message of the commit',
  `created_time`    INT UNSIGNED COMMENT 'wherehows created time',
  `modified_time`   INT UNSIGNED COMMENT 'latest wherehows modified',
  `wh_etl_exec_id`  BIGINT COMMENT 'wherehows etl execution id that modified this record',
  PRIMARY KEY (`repository_urn`, `file_path`, `commit_id`),
  KEY (`commit_id`),
  KEY (`repository_urn`, `file_name`, `committer_email`)
)
  ENGINE = InnoDB
  DEFAULT CHARSET = utf8;
|
||
-- Staging table for git commit metadata. Same columns and keys as
-- source_code_commit_info, minus created_time / modified_time.
CREATE TABLE `stg_source_code_commit_info` (
  `app_id`          SMALLINT(5) UNSIGNED DEFAULT NULL,
  `repository_urn`  VARCHAR(300) CHARACTER SET latin1 NOT NULL COMMENT 'the git repo urn',
  `commit_id`       VARCHAR(50) CHARACTER SET latin1 NOT NULL COMMENT 'the sha-1 hash of the commit',
  `file_path`       VARCHAR(600) CHARACTER SET latin1 NOT NULL COMMENT 'the path to the file',
  `file_name`       VARCHAR(127) NOT NULL COMMENT 'the file name',
  `commit_time`     INT UNSIGNED COMMENT 'the commit time',
  `committer_name`  VARCHAR(128) NOT NULL COMMENT 'name of the committer',
  `committer_email` VARCHAR(128) DEFAULT NULL COMMENT 'email of the committer',
  `author_name`     VARCHAR(128) NOT NULL COMMENT 'name of the author',
  `author_email`    VARCHAR(128) NOT NULL COMMENT 'email of the author',
  `message`         VARCHAR(1024) NOT NULL COMMENT 'message of the commit',
  `wh_etl_exec_id`  BIGINT COMMENT 'wherehows etl execution id that modified this record',
  PRIMARY KEY (`repository_urn`, `file_path`, `commit_id`),
  KEY (`commit_id`),
  KEY (`repository_urn`, `file_name`, `committer_email`)
)
  ENGINE = InnoDB
  DEFAULT CHARSET = utf8;
|
99 changes: 99 additions & 0 deletions
99
metadata-etl/src/main/java/metadata/etl/git/GitMetadataEtl.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
/** | ||
* Copyright 2015 LinkedIn Corp. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
*/ | ||
package metadata.etl.git; | ||
|
||
import java.io.File; | ||
import java.io.InputStream; | ||
import java.util.Arrays; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Properties; | ||
import java.util.Set; | ||
import metadata.etl.EtlJob; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
import wherehows.common.Constant; | ||
import wherehows.common.schemas.GitCommitRecord; | ||
import wherehows.common.utils.GitUtil; | ||
import wherehows.common.writers.FileWriter; | ||
|
||
|
||
/** | ||
* Created by zechen on 12/7/15. | ||
*/ | ||
public class GitMetadataEtl extends EtlJob { | ||
|
||
public ClassLoader classLoader = getClass().getClassLoader(); | ||
protected final Logger logger = LoggerFactory.getLogger(getClass()); | ||
public static final String COMMIT_OUTPUT_FILE = "commit.csv"; | ||
|
||
public GitMetadataEtl(int appId, long whExecId) { | ||
super(appId, null, whExecId); | ||
} | ||
|
||
public GitMetadataEtl(int appId, long whExecId, Properties prop) { | ||
super(appId, null, whExecId, prop); | ||
} | ||
|
||
public void extract() throws Exception { | ||
logger.info("git extract"); | ||
String gitHost = this.prop.getProperty(Constant.GIT_HOST_KEY); | ||
String[] projects = (this.prop.getProperty(Constant.GIT_PROJECT_WHITELIST_KEY)).trim().split("\\s*,\\s*"); | ||
Set<String> blackCommitters = new HashSet<>( | ||
Arrays.asList(this.prop.getProperty(Constant.GIT_COMMITTER_BLACKLIST_KEY).trim().split("\\s*,\\s*"))); | ||
|
||
String localDir = this.prop.getProperty(Constant.WH_APP_FOLDER_KEY) + "/" + this.prop.getProperty(Constant.APP_ID_KEY); | ||
File dir = new File(localDir); | ||
if (!dir.exists()) { | ||
if (!dir.mkdirs()) { | ||
throw new Exception("can not create metadata directory"); | ||
} | ||
} | ||
FileWriter fw = new FileWriter(localDir + "/" + COMMIT_OUTPUT_FILE); | ||
for (String project : projects) { | ||
List<String> repos = GitUtil.getRepoListFromProject(GitUtil.getHttpsUrl(gitHost, project)); | ||
for (String repo : repos) { | ||
String repoUri = GitUtil.getGitUrl(gitHost, repo); | ||
String repoDir = localDir + "/" + repo; | ||
GitUtil.clone(repoUri, repoDir); | ||
List<GitUtil.CommitMetadata> commitMetadatas = GitUtil.getRepoMetadata(repoDir); | ||
for (GitUtil.CommitMetadata m : commitMetadatas) { | ||
fw.append(new GitCommitRecord(m, repoUri)); | ||
} | ||
} | ||
} | ||
fw.close(); | ||
} | ||
|
||
@Override | ||
public void transform() | ||
throws Exception { | ||
logger.info("git transform"); | ||
// call a python script to do the transformation | ||
InputStream inputStream = classLoader.getResourceAsStream("jython/GitTransform.py"); | ||
interpreter.execfile(inputStream); | ||
inputStream.close(); | ||
} | ||
|
||
@Override | ||
public void load() | ||
throws Exception { | ||
logger.info("ldap db load"); | ||
InputStream inputStream = classLoader.getResourceAsStream("jython/GitLoad.py"); | ||
interpreter.execfile(inputStream); | ||
inputStream.close(); | ||
} | ||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# | ||
# Copyright 2015 LinkedIn Corp. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# | ||
|
||
__author__ = 'zechen' | ||
|
||
from wherehows.common import Constant | ||
from com.ziclix.python.sql import zxJDBC | ||
import sys | ||
|
||
|
||
class GitLoad:
  """Copies staged git commit rows into ``source_code_commit_info``.

  Rows are read from ``stg_source_code_commit_info`` for this job's app id.
  Rows whose key already exists in the target table are updated in place.
  """

  def __init__(self, args):
    """Open the WhereHows database connection.

    :param args: mapping indexed by the ``Constant.*_KEY`` names used below
                 (DB url, username, password, driver and app id)
    """
    self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                 args[Constant.WH_DB_USERNAME_KEY],
                                 args[Constant.WH_DB_PASSWORD_KEY],
                                 args[Constant.WH_DB_DRIVER_KEY])
    self.wh_cursor = self.wh_con.cursor()
    self.app_id = int(args[Constant.APP_ID_KEY])

  def run(self):
    """Run the load, then close the cursor and connection.

    The cursor and connection are closed even when the load raises, so a
    failed query does not leak the database connection.
    """
    try:
      self.load_from_stg()
    finally:
      try:
        self.wh_cursor.close()
      finally:
        self.wh_con.close()

  def load_from_stg(self):
    """Upsert this app's staged rows into ``source_code_commit_info`` and commit.

    New rows get ``created_time`` set to the current unix timestamp; rows that
    hit a duplicate key get their commit fields refreshed and ``modified_time``
    set instead.
    """
    query = """
        INSERT INTO source_code_commit_info
        (
          app_id, repository_urn, commit_id, file_path, file_name, commit_time, committer_name, committer_email,
          author_name, author_email, message, created_time, wh_etl_exec_id
        )
        select app_id, repository_urn, commit_id, file_path, file_name, commit_time, committer_name, committer_email,
          author_name, author_email, message, unix_timestamp(NOW()), wh_etl_exec_id
        from stg_source_code_commit_info s
        where s.app_id = {app_id}
        on duplicate key update
          commit_time = s.commit_time,
          committer_name = s.committer_name,
          committer_email = s.committer_email,
          author_name = s.author_name,
          author_email = s.author_email,
          message = s.message,
          modified_time = unix_timestamp(NOW()),
          wh_etl_exec_id = s.wh_etl_exec_id
        """.format(app_id=self.app_id)
    # Parenthesised single-argument print behaves the same on Python 2 (Jython) and Python 3.
    print(query)
    self.wh_cursor.execute(query)
    self.wh_con.commit()
|
||
if __name__ == "__main__":
  # sys.argv[1] is handed to GitLoad as-is; GitLoad indexes it by Constant keys.
  GitLoad(sys.argv[1]).run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# | ||
# Copyright 2015 LinkedIn Corp. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# | ||
|
||
__author__ = 'zechen' | ||
|
||
from wherehows.common import Constant | ||
from com.ziclix.python.sql import zxJDBC | ||
import sys | ||
|
||
|
||
class OwnerTransform:
  """Loads the extracted git commit file into the staging table.

  Despite its name, this class only handles git commit data: it empties
  ``stg_source_code_commit_info`` and bulk-loads ``commit.csv`` from
  ``<app folder>/<app id>`` into it.
  """

  # Staging target description: column list of the file, file name and table name.
  _tables = {"source_code_commit": {"columns": "repository_urn, commit_id, file_path, file_name, commit_time, committer_name, committer_email, author_name, author_email, message",
                                    "file": "commit.csv",
                                    "table": "stg_source_code_commit_info"}
             }

  _clear_staging_template = """
    DELETE FROM {table}
    """

  # The escapes below are intentionally NOT in a raw string: Python expands them, so the
  # statement sent to the database contains the actual control characters.
  _read_file_template = """
    LOAD DATA LOCAL INFILE '{folder}/{file}'
    INTO TABLE {table}
    FIELDS TERMINATED BY '\x1a' ESCAPED BY '\0'
    LINES TERMINATED BY '\n'
    ({columns})
    SET app_id = {app_id},
    wh_etl_exec_id = {wh_etl_exec_id};
    """

  def __init__(self, args):
    """Open the WhereHows database connection and resolve the metadata folder.

    :param args: mapping indexed by the ``Constant.*_KEY`` names used below
                 (DB url, username, password, driver, app id, exec id, app folder)
    """
    self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                 args[Constant.WH_DB_USERNAME_KEY],
                                 args[Constant.WH_DB_PASSWORD_KEY],
                                 args[Constant.WH_DB_DRIVER_KEY])
    self.wh_cursor = self.wh_con.cursor()
    self.app_id = int(args[Constant.APP_ID_KEY])
    self.wh_etl_exec_id = int(args[Constant.WH_EXEC_ID_KEY])
    self.app_folder = args[Constant.WH_APP_FOLDER_KEY]
    self.metadata_folder = self.app_folder + "/" + str(self.app_id)

  def run(self):
    """Load the file into staging, then close the cursor and connection.

    The cursor and connection are closed even when a statement raises, so a
    failed load does not leak the database connection.
    """
    try:
      self.read_file_to_stg()
    finally:
      try:
        self.wh_cursor.close()
      finally:
        self.wh_con.close()

  def read_file_to_stg(self):
    """Empty the staging table and bulk-load the commit file into it.

    Each of the two statements is printed, executed and committed separately.
    """
    t = self._tables["source_code_commit"]

    # Clear staging table (all rows, not only this app's)
    query = self._clear_staging_template.format(table=t.get("table"))
    print(query)
    self.wh_cursor.execute(query)
    self.wh_con.commit()

    # Load file into staging table
    query = self._read_file_template.format(folder=self.metadata_folder,
                                            file=t.get("file"),
                                            table=t.get("table"),
                                            columns=t.get("columns"),
                                            app_id=self.app_id,
                                            wh_etl_exec_id=self.wh_etl_exec_id)
    print(query)
    self.wh_cursor.execute(query)
    self.wh_con.commit()
|
||
if __name__ == "__main__":
  # sys.argv[1] is handed to OwnerTransform as-is; it is indexed by Constant keys there.
  OwnerTransform(sys.argv[1]).run()
55 changes: 55 additions & 0 deletions
55
metadata-etl/src/test/java/metadata/etl/git/GitMetadataEtlTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/** | ||
* Copyright 2015 LinkedIn Corp. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
*/ | ||
package metadata.etl.git; | ||
|
||
import org.testng.annotations.BeforeMethod; | ||
import org.testng.annotations.Test; | ||
|
||
|
||
/** | ||
* Created by zechen on 12/8/15. | ||
*/ | ||
public class GitMetadataEtlTest { | ||
GitMetadataEtl git; | ||
|
||
@BeforeMethod | ||
public void setUp() | ||
throws Exception { | ||
this.git = new GitMetadataEtl(500, 0L); | ||
} | ||
|
||
@Test | ||
public void testExtract() | ||
throws Exception { | ||
git.extract(); | ||
} | ||
|
||
@Test | ||
public void testTransform() | ||
throws Exception { | ||
git.transform(); | ||
} | ||
|
||
@Test | ||
public void testLoad() | ||
throws Exception { | ||
git.load(); | ||
} | ||
|
||
@Test | ||
public void testRun() | ||
throws Exception { | ||
git.run(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.