Skip to content

Commit 4801203

Browse files
committed
added hive_tables_row_counts_any_nulls.py
1 parent e9178a0 commit 4801203

1 file changed

Lines changed: 106 additions & 0 deletions

File tree

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#!/usr/bin/env python
2+
# vim:ts=4:sts=4:sw=4:et
3+
#
4+
# Author: Hari Sekhon
5+
# Date: 2019-11-26 10:08:52 +0000 (Tue, 26 Nov 2019)
6+
#
7+
# https://github.com/harisekhon/devops-python-tools
8+
#
9+
# License: see accompanying Hari Sekhon LICENSE file
10+
#
11+
# If you're using my code you're welcome to connect with me on LinkedIn
12+
# and optionally send me feedback to help steer this or other code I publish
13+
#
14+
# https://www.linkedin.com/in/harisekhon
15+
#
16+
17+
"""
18+
19+
Connect to HiveServer2 and count number of rows with NULL in any column
20+
for each table in each database, or only those matching given db / table regexes
21+
22+
Useful for catching problems with data quality or subtle ETL bugs
23+
24+
Rewrite of a Perl version from 2013 from my DevOps Perl Tools repo
25+
26+
Tested on Hive 1.1.0 on CDH 5.10 with Kerberos
27+
28+
Due to a thrift / impyla bug this needs exactly thrift==0.9.3, see
29+
30+
https://github.com/cloudera/impyla/issues/286
31+
32+
If you get an error like this:
33+
34+
ERROR:impala.hiveserver2:Failed to open transport (tries_left=1)
35+
...
36+
TTransportException: TSocket read 0 bytes
37+
38+
then check your --kerberos and --ssl settings match the cluster's settings
39+
(Thrift and Kerberos have the worst error messages ever)
40+
41+
"""
42+
43+
from __future__ import absolute_import
44+
from __future__ import division
45+
from __future__ import print_function
46+
from __future__ import unicode_literals
47+
48+
import os
49+
import sys
50+
srcdir = os.path.abspath(os.path.dirname(__file__))
51+
pylib = os.path.join(srcdir, 'pylib')
52+
sys.path.append(pylib)
53+
try:
54+
# pylint: disable=wrong-import-position
55+
from harisekhon.utils import log
56+
from hive_foreach_table import HiveForEachTable
57+
except ImportError as _:
58+
print('module import failed: %s' % _, file=sys.stderr)
59+
print("Did you remember to build the project by running 'make'?", file=sys.stderr)
60+
print("Alternatively perhaps you tried to copy this program out without it's adjacent libraries?", file=sys.stderr)
61+
sys.exit(4)
62+
63+
64+
__author__ = 'Hari Sekhon'
65+
__version__ = '0.5.0'
66+
67+
class HiveTablesRowsWithNulls(HiveForEachTable):
68+
69+
def __init__(self):
70+
# Python 2.x
71+
super(HiveTablesRowsWithNulls, self).__init__()
72+
# Python 3.x
73+
# super().__init__()
74+
self.query = 'placeholder' # constructed later dynamically per table, here to suppress --query CLI option
75+
self.database = None
76+
self.table = None
77+
#self.partition = None
78+
self.ignore_errors = False
79+
80+
# discard last param query and construct our own based on the table DDL of cols
81+
def execute(self, conn, database, table, query):
82+
columns = []
83+
log.info('describing table %s.%s', database, table)
84+
with conn.cursor() as column_cursor:
85+
# doesn't support parameterized query quoting from dbapi spec
86+
#column_cursor.execute('use %(database)s', {'database': database})
87+
#column_cursor.execute('describe %(table)s', {'table': table})
88+
column_cursor.execute('use `{}`'.format(database))
89+
column_cursor.execute('describe `{}`'.format(table))
90+
for column_row in column_cursor:
91+
column = column_row[0]
92+
#column_type = column_row[1]
93+
columns.append(column)
94+
query = "SELECT count(*) FROM `{db}`.`{table}` WHERE `"\
95+
.format(db=database, table=table) + \
96+
"` IS NULL OR `".join(columns) + "` IS NULL"
97+
with conn.cursor() as table_cursor:
98+
log.debug('executing query: %s', query)
99+
table_cursor.execute(query)
100+
for result in table_cursor:
101+
count = result[0]
102+
print('{db}.{table}\t{count}'.format(db=database, table=table, count=count))
103+
104+
105+
if __name__ == '__main__':
106+
HiveTablesRowsWithNulls().main()

0 commit comments

Comments
 (0)