Skip to content

Commit 7a857e4

Browse files
committed
Added a Hive warehouse database/table path match, and tightened the domain\user format so that literal \n (newline) and \r (carriage return) escape sequences are not matched as a domain\user pair
1 parent a85da02 commit 7a857e4

1 file changed

Lines changed: 4 additions & 2 deletions

File tree

anonymize.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,7 @@
9090
sys.exit(4)
9191

9292
__author__ = 'Hari Sekhon'
93-
__version__ = '0.10.8'
93+
__version__ = '0.10.9'
9494

9595
ip_regex = r'(?!127\.0\.0\.)' + ip_regex
9696
subnet_mask_regex = r'(?!127\.0\.0\.)' + subnet_mask_regex
@@ -320,6 +320,7 @@ def __init__(self):
320320
id_or_name=id_or_name,
321321
switch_prefix=switch_prefix),
322322
'db4': r'(\s(?:in|of)\s+(column|table|database|schema)\s+[\'"])[^\'"]+',
323+
'db5': r'/+user/+hive/+warehouse/+([A-Za-z0-9_-]+/+)*[A-Za-z0-9_-]+.db/+[A-Za-z0-9_-]+',
323324
'generic': r'(\bfileb?)://{filename_regex}'.format(filename_regex=filename_regex),
324325
'generic2': r'({switch_prefix}key{id_or_name}?{arg_sep})\S+'\
325326
.format(arg_sep=arg_sep,
@@ -357,7 +358,7 @@ def __init__(self):
357358
'user': r'([-\.]{user_name}{sep})\S+'.format(user_name=user_name, sep=arg_sep),
358359
'user2': r'/(home|user)/{user}'.format(user=user_regex),
359360
'user3': r'({user_name}{sep}){user}'.format(user_name=user_name, sep=arg_sep, user=user_regex),
360-
'user4': r'(?<![\w\\]){NT_DOMAIN}(?!\\n\d\d\d\d-\d\d-\d\d)\\{user}(?!\\)'\
361+
'user4': r'(?<![\w\\]){NT_DOMAIN}(?!\\r|\\n)(?!\\n\d\d\d\d-\d\d-\d\d)\\{user}(?!\\)'\
361362
.format(NT_DOMAIN=r'\b[\w-]{1,15}\b', user=user_regex),
362363
'user5': r'for\s+user\s+{user}'.format(user=user_regex),
363364
# (?<!>/) exclude patterns '>/' where we have already matched and token replaced
@@ -456,6 +457,7 @@ def __init__(self):
456457
'db2': r'\1<database_instance>',
457458
'db3': r'\1<schema>',
458459
'db4': r'\1<\2>',
460+
'db5': r'/user/hive/warehouse/<database>.db/<table>',
459461
'generic': r'\1://<file>',
460462
'generic2': r'\1<key>',
461463
'generic3': r'\1<cluster>',

0 commit comments

Comments
 (0)