|
1 | 1 | #!/usr/bin/env python |
2 | 2 | """ |
3 | | -Given a minimum date, filter out all tweets after this date. |
| 3 | +Given a minimum and/or maximum date, keep only tweets between those dates. |
4 | 4 |
|
5 | 5 | For example, if a hashtag was used for another event before the one you're |
6 | 6 | interested in, you can filter out the old ones. |
7 | 7 |
|
8 | 8 | Example usage: |
9 | | -utils\filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl |
| 9 | +utils/filter_date.py --mindate 1-may-2014 tweets.jsonl > filtered.jsonl |
10 | 10 | """ |
11 | 11 | from __future__ import print_function |
12 | 12 |
|
13 | 13 | import sys |
14 | 14 | import json |
15 | 15 | import fileinput |
16 | | -import dateutil.parser |
| 16 | +import argparse |
| 17 | +import datetime |
| 18 | +from dateutil.parser import parse |
17 | 19 |
|
18 | 20 |
|
19 | | -# parse command-line args |
20 | | -mindate = dateutil.parser.parse("1-January-2012") |
21 | | -# if args include --mindate, get mindate and remove first two args, |
22 | | -# leaving file name(s) (if any) in args |
23 | | -if len(sys.argv) > 1: |
24 | | - if sys.argv[1] == "--mindate": |
25 | | - mindate = dateutil.parser.parse(sys.argv[2]) |
26 | | - del sys.argv[0] |
27 | | - del sys.argv[0] |
| 21 | +def filter_input(mindate, maxdate, files): |
| 22 | + mindate = parse(mindate) if mindate is not None else datetime.datetime.min |
| 23 | + maxdate = parse(maxdate) if maxdate is not None else datetime.datetime.max |
28 | 24 |
|
29 | | -# fh = open('date_filtered.jsonl', 'w') |
30 | | -# kept, discarded = 0, 0 |
| 25 | + for line in fileinput.input(files): |
| 26 | + tweet = json.loads(line) |
31 | 27 |
|
32 | | -for line in fileinput.input(): |
33 | | - tweet = json.loads(line) |
| 28 | + created_at = parse(tweet["created_at"]) |
| 29 | + created_at = created_at.replace(tzinfo=None) |
34 | 30 |
|
35 | | - created_at = dateutil.parser.parse(tweet["created_at"]) |
36 | | - created_at = created_at.replace(tzinfo=None) |
| 31 | + if mindate < created_at and maxdate > created_at: |
| 32 | + print(json.dumps(tweet)) |
37 | 33 |
|
38 | | - # print(created_at, mindate, created_at >= mindate) |
39 | | - if (created_at >= mindate): |
40 | | - print(json.dumps(tweet)) |
41 | | - # fh.write(json.dumps(tweet)) |
42 | | - # fh.write("\n") |
43 | | - # kept += 1 |
44 | | - # else: |
45 | | - # discarded += 1 |
46 | 34 |
|
47 | | -# print("Kept", kept, "tweets and discarded", discarded) |
| 35 | +def main(): |
| 36 | + parser = argparse.ArgumentParser() |
| 37 | + parser.add_argument("--mindate", help="the minimum date", default=None) |
| 38 | + parser.add_argument("--maxdate", help="the maximum date", default=None) |
| 39 | + parser.add_argument("files", nargs="?", default=[]) |
| 40 | + args = parser.parse_args() |
48 | 41 |
|
49 | | -# End of file |
| 42 | + filter_input(args.mindate, args.maxdate, args.files) |
| 43 | + |
| 44 | + |
| 45 | +if __name__ == "__main__": |
| 46 | + main() |
0 commit comments