Skip to content

Commit 915e295

Browse files
committed
Refactor filter_date util, add support for max date
1 parent b4c5d84 commit 915e295

File tree

1 file changed

+25
-28
lines changed

1 file changed

+25
-28
lines changed

utils/filter_date.py

Lines changed: 25 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,46 @@
11
#!/usr/bin/env python
22
"""
3-
Given a minimum date, filter out all tweets after this date.
3+
Given a minimum and/or maximum date, keep only tweets created after the minimum date and before the maximum date.
44
55
For example, if a hashtag was used for another event before the one you're
66
interested in, you can filter out the old ones.
77
88
Example usage:
9-
utils\filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl
9+
utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl
1010
"""
1111
from __future__ import print_function
1212

1313
import sys
1414
import json
1515
import fileinput
16-
import dateutil.parser
16+
import argparse
17+
import datetime
18+
from dateutil.parser import parse
1719

1820

19-
# parse command-line args
20-
mindate = dateutil.parser.parse("1-January-2012")
21-
# if args include --mindate, get mindate and remove first two args,
22-
# leaving file name(s) (if any) in args
23-
if len(sys.argv) > 1:
24-
if sys.argv[1] == "--mindate":
25-
mindate = dateutil.parser.parse(sys.argv[2])
26-
del sys.argv[0]
27-
del sys.argv[0]
21+
def filter_input(mindate, maxdate, files):
22+
mindate = parse(mindate) if mindate is not None else datetime.datetime.min
23+
maxdate = parse(maxdate) if maxdate is not None else datetime.datetime.max
2824

29-
# fh = open('date_filtered.jsonl', 'w')
30-
# kept, discarded = 0, 0
25+
for line in fileinput.input(files):
26+
tweet = json.loads(line)
3127

32-
for line in fileinput.input():
33-
tweet = json.loads(line)
28+
created_at = parse(tweet["created_at"])
29+
created_at = created_at.replace(tzinfo=None)
3430

35-
created_at = dateutil.parser.parse(tweet["created_at"])
36-
created_at = created_at.replace(tzinfo=None)
31+
if mindate < created_at and maxdate > created_at:
32+
print(json.dumps(tweet))
3733

38-
# print(created_at, mindate, created_at >= mindate)
39-
if (created_at >= mindate):
40-
print(json.dumps(tweet))
41-
# fh.write(json.dumps(tweet))
42-
# fh.write("\n")
43-
# kept += 1
44-
# else:
45-
# discarded += 1
4634

47-
# print("Kept", kept, "tweets and discarded", discarded)
35+
def main():
36+
parser = argparse.ArgumentParser()
37+
parser.add_argument("--mindate", help="the minimum date", default=None)
38+
parser.add_argument("--maxdate", help="the maximum date", default=None)
39+
parser.add_argument("files", nargs="?", default=[])
40+
args = parser.parse_args()
4841

49-
# End of file
42+
filter_input(args.mindate, args.maxdate, args.files)
43+
44+
45+
if __name__ == "__main__":
46+
main()

0 commit comments

Comments
 (0)