Skip to content

Commit

Permalink
add git file commit history etl
Browse files Browse the repository at this point in the history
  • Loading branch information
czbernard committed Dec 11, 2015
1 parent ebbf9ec commit af04ff6
Show file tree
Hide file tree
Showing 13 changed files with 723 additions and 0 deletions.
8 changes: 8 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,11 @@ License: Apache 2.0
This product includes/uses JsonPath (http://goessner.net/articles/JsonPath/)
Copyright (c) 2011 the original author or authors
License: Apache 2.0

This product includes/uses jsoup (http://jsoup.org/)
Copyright © 2009 - 2013 Jonathan Hedley (jonathan@hedley.net)
License: MIT

This product includes/uses JGit (https://eclipse.org/jgit/)
Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors
License: Eclipse Distribution License
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ subprojects {
"hadoop_auth" : "org.apache.hadoop:hadoop-auth:2.7.1",
"json_path" : "com.jayway.jsonpath:json-path:2.0.0",
"akka" : "com.typesafe.akka:akka-actor_2.10:2.2.0",
"jgit" : "org.eclipse.jgit:org.eclipse.jgit:4.1.1.201511131810-r",
"jsoup" : "org.jsoup:jsoup:1.8.3",

"jackson_databind" : "com.fasterxml.jackson.core:jackson-databind:2.6.1",
"jackson_core" : "com.fasterxml.jackson.core:jackson-core:2.6.1",
Expand Down
38 changes: 38 additions & 0 deletions data-model/DDL/ETL_DDL/git_metadata.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
-- Final table of per-file git commit history.
-- One row per (repository_urn, file_path, commit_id), as enforced by the primary key.
-- repository_urn, commit_id and file_path are declared latin1 while the table default is utf8.
-- NOTE(review): presumably latin1 keeps the composite primary key within InnoDB's index size limit -- confirm.
-- created_time / modified_time are INT UNSIGNED; the git load script fills them with unix_timestamp(NOW()).
-- NOTE(review): commit_time is presumably epoch seconds as well -- confirm against the extractor.
CREATE TABLE `source_code_commit_info` (
`app_id` SMALLINT(5) UNSIGNED DEFAULT NULL,
`repository_urn` VARCHAR(300) CHAR SET latin1 NOT NULL COMMENT 'the git repo urn',
`commit_id` VARCHAR(50) CHAR SET latin1 NOT NULL COMMENT 'the sha-1 hash of the commit',
`file_path` VARCHAR(600) CHAR SET latin1 NOT NULL COMMENT 'the path to the file',
`file_name` VARCHAR(127) NOT NULL COMMENT 'the file name',
`commit_time` INT UNSIGNED COMMENT 'the commit time',
`committer_name` VARCHAR(128) NOT NULL COMMENT 'name of the committer',
`committer_email` VARCHAR(128) DEFAULT NULL COMMENT 'email of the committer',
`author_name` VARCHAR(128) NOT NULL COMMENT 'name of the author',
`author_email` VARCHAR(128) NOT NULL COMMENT 'email of the author',
`message` VARCHAR(1024) NOT NULL COMMENT 'message of the commit',
`created_time` INT UNSIGNED COMMENT 'wherehows created time',
`modified_time` INT UNSIGNED COMMENT 'latest wherehows modified',
`wh_etl_exec_id` BIGINT COMMENT 'wherehows etl execution id that modified this record',
PRIMARY KEY (repository_urn, file_path, commit_id),
KEY (commit_id),
KEY (repository_urn, file_name, committer_email)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;

-- Staging table for per-file git commit history.
-- Same columns, primary key and secondary keys as source_code_commit_info, minus
-- created_time / modified_time (those are set when rows are merged into the final table).
-- The git transform script empties this table and bulk-loads commit.csv into it;
-- the git load script then upserts its rows into source_code_commit_info.
CREATE TABLE `stg_source_code_commit_info` (
`app_id` SMALLINT(5) UNSIGNED DEFAULT NULL,
`repository_urn` VARCHAR(300) CHAR SET latin1 NOT NULL COMMENT 'the git repo urn',
`commit_id` VARCHAR(50) CHAR SET latin1 NOT NULL COMMENT 'the sha-1 hash of the commit',
`file_path` VARCHAR(600) CHAR SET latin1 NOT NULL COMMENT 'the path to the file',
`file_name` VARCHAR(127) NOT NULL COMMENT 'the file name',
`commit_time` INT UNSIGNED COMMENT 'the commit time',
`committer_name` VARCHAR(128) NOT NULL COMMENT 'name of the committer',
`committer_email` VARCHAR(128) DEFAULT NULL COMMENT 'email of the committer',
`author_name` VARCHAR(128) NOT NULL COMMENT 'name of the author',
`author_email` VARCHAR(128) NOT NULL COMMENT 'email of the author',
`message` VARCHAR(1024) NOT NULL COMMENT 'message of the commit',
`wh_etl_exec_id` BIGINT COMMENT 'wherehows etl execution id that modified this record',
PRIMARY KEY (repository_urn, file_path, commit_id),
KEY (commit_id),
KEY (repository_urn, file_name, committer_email)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;

99 changes: 99 additions & 0 deletions metadata-etl/src/main/java/metadata/etl/git/GitMetadataEtl.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/**
* Copyright 2015 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
package metadata.etl.git;

import java.io.File;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import metadata.etl.EtlJob;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import wherehows.common.Constant;
import wherehows.common.schemas.GitCommitRecord;
import wherehows.common.utils.GitUtil;
import wherehows.common.writers.FileWriter;


/**
 * ETL job that collects per-file git commit history.
 *
 * <p>{@link #extract()} walks the configured project whitelist and writes one
 * {@link GitCommitRecord} per commit metadata entry into {@value #COMMIT_OUTPUT_FILE};
 * {@link #transform()} and {@link #load()} execute the bundled Jython scripts
 * {@code jython/GitTransform.py} and {@code jython/GitLoad.py}.
 *
 * Created by zechen on 12/7/15.
 */
public class GitMetadataEtl extends EtlJob {

  public ClassLoader classLoader = getClass().getClassLoader();
  protected final Logger logger = LoggerFactory.getLogger(getClass());
  public static final String COMMIT_OUTPUT_FILE = "commit.csv";

  public GitMetadataEtl(int appId, long whExecId) {
    super(appId, null, whExecId);
  }

  public GitMetadataEtl(int appId, long whExecId, Properties prop) {
    super(appId, null, whExecId, prop);
  }

  /**
   * Clones every repository of every whitelisted project and dumps its commit metadata.
   *
   * <p>Output goes to {@code <wh app folder>/<app id>/commit.csv}; each repository is cloned
   * into a sub-directory of the same folder.
   *
   * <p>Only the git host and project whitelist properties are read. Committer blacklisting is
   * not implemented, so no blacklist property is required.
   *
   * @throws IllegalArgumentException if the project whitelist property is missing
   * @throws Exception if the metadata directory cannot be created, or if listing, cloning,
   *     reading or writing fails
   */
  public void extract() throws Exception {
    logger.info("git extract");
    String gitHost = this.prop.getProperty(Constant.GIT_HOST_KEY);
    String whitelist = this.prop.getProperty(Constant.GIT_PROJECT_WHITELIST_KEY);
    if (whitelist == null) {
      throw new IllegalArgumentException(
          "missing required property: " + Constant.GIT_PROJECT_WHITELIST_KEY);
    }
    String[] projects = whitelist.trim().split("\\s*,\\s*");

    String localDir = this.prop.getProperty(Constant.WH_APP_FOLDER_KEY) + "/" + this.prop.getProperty(Constant.APP_ID_KEY);
    File dir = new File(localDir);
    if (!dir.exists() && !dir.mkdirs()) {
      throw new Exception("can not create metadata directory");
    }

    FileWriter fw = new FileWriter(localDir + "/" + COMMIT_OUTPUT_FILE);
    try {
      for (String project : projects) {
        List<String> repos = GitUtil.getRepoListFromProject(GitUtil.getHttpsUrl(gitHost, project));
        for (String repo : repos) {
          String repoUri = GitUtil.getGitUrl(gitHost, repo);
          String repoDir = localDir + "/" + repo;
          GitUtil.clone(repoUri, repoDir);
          List<GitUtil.CommitMetadata> commitMetadatas = GitUtil.getRepoMetadata(repoDir);
          for (GitUtil.CommitMetadata m : commitMetadatas) {
            fw.append(new GitCommitRecord(m, repoUri));
          }
        }
      }
    } finally {
      // Always release the output file, even when a clone or metadata read fails midway.
      fw.close();
    }
  }

  /**
   * Runs {@code jython/GitTransform.py} through the Jython interpreter.
   *
   * @throws Exception if the script resource is missing or its execution fails
   */
  @Override
  public void transform()
      throws Exception {
    logger.info("git transform");
    // call a python script to do the transformation
    runJythonScript("jython/GitTransform.py");
  }

  /**
   * Runs {@code jython/GitLoad.py} through the Jython interpreter.
   *
   * @throws Exception if the script resource is missing or its execution fails
   */
  @Override
  public void load()
      throws Exception {
    logger.info("git db load");
    runJythonScript("jython/GitLoad.py");
  }

  /** Executes a classpath Jython script, closing its stream whether or not execution succeeds. */
  private void runJythonScript(String resourcePath)
      throws Exception {
    try (InputStream inputStream = classLoader.getResourceAsStream(resourcePath)) {
      if (inputStream == null) {
        throw new IllegalStateException("script resource not found on classpath: " + resourcePath);
      }
      interpreter.execfile(inputStream);
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,7 @@ ldap.group.context.security.credentials=
ldap.group.search.domains=
ldap.group.search.return.attributes=

# git
git.host=
git.project.whitelist=

65 changes: 65 additions & 0 deletions metadata-etl/src/main/resources/jython/GitLoad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#
# Copyright 2015 LinkedIn Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#

__author__ = 'zechen'

from wherehows.common import Constant
from com.ziclix.python.sql import zxJDBC
import sys


class GitLoad:
    """Merges staged git commit rows into the final ``source_code_commit_info`` table."""

    def __init__(self, args):
        """Open the WhereHows database connection.

        :param args: mapping indexed by ``Constant`` keys; must provide the WhereHows DB url,
                     username, password and driver, plus the app id.
        """
        self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                     args[Constant.WH_DB_USERNAME_KEY],
                                     args[Constant.WH_DB_PASSWORD_KEY],
                                     args[Constant.WH_DB_DRIVER_KEY])
        self.wh_cursor = self.wh_con.cursor()
        self.app_id = int(args[Constant.APP_ID_KEY])

    def run(self):
        """Run the load, releasing the cursor and connection even if the load raises."""
        try:
            self.load_from_stg()
        finally:
            try:
                self.wh_cursor.close()
            finally:
                self.wh_con.close()

    def load_from_stg(self):
        """Upsert this app's rows from ``stg_source_code_commit_info`` into the final table.

        New rows get ``created_time`` set to the current unix timestamp; rows whose primary key
        already exists have their commit/committer/author/message columns refreshed and
        ``modified_time`` set instead.
        """
        # app_id is forced to int in __init__, so formatting it into the statement is safe.
        query = """
        INSERT INTO source_code_commit_info
        (
        app_id, repository_urn, commit_id, file_path, file_name, commit_time, committer_name, committer_email,
        author_name, author_email, message, created_time, wh_etl_exec_id
        )
        select app_id, repository_urn, commit_id, file_path, file_name, commit_time, committer_name, committer_email,
        author_name, author_email, message, unix_timestamp(NOW()), wh_etl_exec_id
        from stg_source_code_commit_info s
        where s.app_id = {app_id}
        on duplicate key update
        commit_time = s.commit_time,
        committer_name = s.committer_name,
        committer_email = s.committer_email,
        author_name = s.author_name,
        author_email = s.author_email,
        message = s.message,
        modified_time = unix_timestamp(NOW()),
        wh_etl_exec_id = s.wh_etl_exec_id
        """.format(app_id=self.app_id)
        print(query)
        self.wh_cursor.execute(query)
        self.wh_con.commit()


if __name__ == "__main__":
    # The first argument is the job configuration, indexed by Constant keys in GitLoad.__init__.
    # NOTE(review): presumably injected by the Java ETL job as a mapping, not a plain string -- confirm.
    props = sys.argv[1]
    git = GitLoad(props)
    git.run()
81 changes: 81 additions & 0 deletions metadata-etl/src/main/resources/jython/GitTransform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#
# Copyright 2015 LinkedIn Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#

__author__ = 'zechen'

from wherehows.common import Constant
from com.ziclix.python.sql import zxJDBC
import sys


class OwnerTransform:
    """Bulk-loads the extracted git commit file into its staging table.

    NOTE(review): the class name looks inherited from the owner ETL script; it is kept because
    the entry point below instantiates it by this name.
    """

    # Describes the single staged data set: target table, source file and the file's column order.
    _tables = {"source_code_commit": {"columns": "repository_urn, commit_id, file_path, file_name, commit_time, committer_name, committer_email, author_name, author_email, message",
                                      "file": "commit.csv",
                                      "table": "stg_source_code_commit_info"}
               }

    # Removes every row of the staging table (not restricted to one app_id).
    _clear_staging_template = """
    DELETE FROM {table}
    """

    # The escapes below are interpreted by Python, so the statement sent to the database carries
    # the raw SUB (0x1a), NUL and newline characters as field terminator, escape and line terminator.
    _read_file_template = """
    LOAD DATA LOCAL INFILE '{folder}/{file}'
    INTO TABLE {table}
    FIELDS TERMINATED BY '\x1a' ESCAPED BY '\0'
    LINES TERMINATED BY '\n'
    ({columns})
    SET app_id = {app_id},
    wh_etl_exec_id = {wh_etl_exec_id};
    """

    def __init__(self, args):
        """Open the WhereHows database connection and resolve the metadata folder.

        :param args: mapping indexed by ``Constant`` keys; must provide the WhereHows DB url,
                     username, password and driver, the app id, the execution id and the
                     WhereHows app folder.
        """
        self.wh_con = zxJDBC.connect(args[Constant.WH_DB_URL_KEY],
                                     args[Constant.WH_DB_USERNAME_KEY],
                                     args[Constant.WH_DB_PASSWORD_KEY],
                                     args[Constant.WH_DB_DRIVER_KEY])
        self.wh_cursor = self.wh_con.cursor()
        self.app_id = int(args[Constant.APP_ID_KEY])
        self.wh_etl_exec_id = int(args[Constant.WH_EXEC_ID_KEY])
        self.app_folder = args[Constant.WH_APP_FOLDER_KEY]
        self.metadata_folder = self.app_folder + "/" + str(self.app_id)

    def run(self):
        """Stage the file, releasing the cursor and connection even if staging raises."""
        try:
            self.read_file_to_stg()
        finally:
            try:
                self.wh_cursor.close()
            finally:
                self.wh_con.close()

    def read_file_to_stg(self):
        """Empty the staging table, then load ``<metadata_folder>/commit.csv`` into it.

        Every loaded row is stamped with this job's ``app_id`` and ``wh_etl_exec_id``.
        Each statement is committed separately.
        """
        t = self._tables["source_code_commit"]

        # Clear staging table
        query = self._clear_staging_template.format(table=t.get("table"))
        print(query)
        self.wh_cursor.execute(query)
        self.wh_con.commit()

        # Load file into staging table
        query = self._read_file_template.format(folder=self.metadata_folder,
                                                file=t.get("file"),
                                                table=t.get("table"),
                                                columns=t.get("columns"),
                                                app_id=self.app_id,
                                                wh_etl_exec_id=self.wh_etl_exec_id)
        print(query)
        self.wh_cursor.execute(query)
        self.wh_con.commit()


if __name__ == "__main__":
    # The first argument is the job configuration, indexed by Constant keys in OwnerTransform.__init__.
    # NOTE(review): presumably injected by the Java ETL job as a mapping, not a plain string -- confirm.
    props = sys.argv[1]
    ot = OwnerTransform(props)
    ot.run()
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
* Copyright 2015 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
package metadata.etl.git;

import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;


/**
 * Smoke tests for {@link GitMetadataEtl}.
 *
 * <p>Each test simply invokes one phase (or the whole job) on a freshly built instance and
 * contains no assertions, so a test passes when the call returns without throwing.
 * NOTE(review): these presumably need a reachable git host and WhereHows database -- confirm.
 *
 * Created by zechen on 12/8/15.
 */
public class GitMetadataEtlTest {
  GitMetadataEtl git;

  /** Builds a new job for app id 500 and execution id 0 before every test method. */
  @BeforeMethod
  public void setUp() throws Exception {
    git = new GitMetadataEtl(500, 0L);
  }

  /** Exercises the extract phase on its own. */
  @Test
  public void testExtract() throws Exception {
    git.extract();
  }

  /** Exercises the transform phase on its own. */
  @Test
  public void testTransform() throws Exception {
    git.transform();
  }

  /** Exercises the load phase on its own. */
  @Test
  public void testLoad() throws Exception {
    git.load();
  }

  /** Exercises the whole job through {@code run()}. */
  @Test
  public void testRun() throws Exception {
    git.run();
  }
}
2 changes: 2 additions & 0 deletions wherehows-common/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ dependencies {
compile externalDependency.slf4j_api
compile externalDependency.slf4j_log4j
compile externalDependency.spring_jdbc
compile externalDependency.jgit
compile externalDependency.jsoup
testCompile externalDependency.testng
testCompile project(":metadata-etl")
}
4 changes: 4 additions & 0 deletions wherehows-common/src/main/java/wherehows/common/Constant.java
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,8 @@ public class Constant {
public static final String LDAP_GROUP_SEARCH_DOMAINS_KEY = "ldap.group.search.domains";
public static final String LDAP_GROUP_SEARCH_RETURN_ATTRS_KEY = "ldap.group.search.return.attributes";

// git
public static final String GIT_HOST_KEY = "git.host";
public static final String GIT_PROJECT_WHITELIST_KEY = "git.project.whitelist";

}
Loading

0 comments on commit af04ff6

Please sign in to comment.