#!/usr/bin/env python
"""
This little utility uses twarc to write Twitter search results to a directory
of your choosing. It will use the previous results to determine when to stop
searching.
So for example if you want to search for tweets mentioning "ferguson" you can
run it:
% twarc-archive.py ferguson /mnt/tweets/ferguson
The first time you run this it will search twitter for tweets matching
"ferguson" and write them to a file:
/mnt/tweets/ferguson/tweets-0001.jsonl.gz
When you run the exact same command again:
% twarc-archive.py ferguson /mnt/tweets/ferguson
it will get the first tweet id in tweets-0001.jsonl.gz and use it to write
another file which includes any new tweets since that tweet:
/mnt/tweets/ferguson/tweets-0002.jsonl.gz
This functionality was initially part of twarc.py itself, but has been split out
into a separate utility.
"""
from __future__ import print_function
import os
import re
import sys
import gzip
import json
import twarc
import logging
import argparse

archive_file_fmt = "tweets-%04i.jsonl.gz"
archive_file_pat = r"tweets-(\d+)\.jsonl\.gz$"


def main():
    config = os.path.join(os.path.expanduser("~"), ".twarc")

    e = os.environ.get

    parser = argparse.ArgumentParser("archive")
    parser.add_argument(
        "search", action="store", help="search for tweets matching a query"
    )
    parser.add_argument(
        "archive_dir", action="store", help="a directory where results are stored"
    )
    parser.add_argument(
        "--consumer_key",
        action="store",
        default=e("CONSUMER_KEY"),
        help="Twitter API consumer key",
    )
    parser.add_argument(
        "--consumer_secret",
        action="store",
        default=e("CONSUMER_SECRET"),
        help="Twitter API consumer secret",
    )
    parser.add_argument(
        "--access_token",
        action="store",
        default=e("ACCESS_TOKEN"),
        help="Twitter API access token",
    )
    parser.add_argument(
        "--access_token_secret",
        action="store",
        default=e("ACCESS_TOKEN_SECRET"),
        help="Twitter API access token secret",
    )
    parser.add_argument("--profile", action="store", default="main")
    parser.add_argument(
        "-c",
        "--config",
        default=config,
        help="Config file containing Twitter keys and secrets. Overridden by environment config.",
    )
    parser.add_argument(
        "--tweet_mode",
        action="store",
        default="extended",
        dest="tweet_mode",
        choices=["compat", "extended"],
        help="set tweet mode",
    )
    parser.add_argument(
        "--twarc_command",
        action="store",
        default="search",
        choices=["search", "timeline"],
        help="select twarc command to be used for harvest, currently supports search and timeline",
    )
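    # Hedged usage sketch of the two harvest modes defined above; the query,
    # account, and paths are illustrative only:
    #
    #     twarc-archive.py ferguson /mnt/tweets/ferguson
    #     twarc-archive.py --twarc_command timeline ferguson /mnt/tweets/ferguson
    #
    # In timeline mode the positional "search" argument is treated as a numeric
    # user id or a screen name rather than a search query.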
    args = parser.parse_args()

    if not os.path.isdir(args.archive_dir):
        os.mkdir(args.archive_dir)

    logging.basicConfig(
        filename=os.path.join(args.archive_dir, "archive.log"),
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )
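    # A very simple single-instance guard: write this process's pid to
    # <archive_dir>/lockfile and refuse to start if that file already exists.
    # The lockfile is only removed at the end of a run, so after a crash or
    # interruption it has to be deleted by hand, as the error message below explains.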
    lockfile = os.path.join(args.archive_dir, "") + "lockfile"
    if not os.path.exists(lockfile):
        pid = os.getpid()
        lockfile_handle = open(lockfile, "w")
        lockfile_handle.write(str(pid))
        lockfile_handle.close()
    else:
        old_pid = "unknown"
        with open(lockfile, "r") as lockfile_handle:
            old_pid = lockfile_handle.read()
        sys.exit(
            "Another twarc-archive.py process with pid "
            + old_pid
            + " is running. If the process is no longer active then it may have been interrupted. In that case remove the 'lockfile' in "
            + args.archive_dir
            + " and run the command again."
        )

    logging.info("logging search for %s to %s", args.search, args.archive_dir)

    t = twarc.Twarc(
        consumer_key=args.consumer_key,
        consumer_secret=args.consumer_secret,
        access_token=args.access_token,
        access_token_secret=args.access_token_secret,
        profile=args.profile,
        config=args.config,
        tweet_mode=args.tweet_mode,
    )
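    # The first line of the newest archive holds the most recent tweet from the
    # previous harvest (tweets are written in the order the API returns them,
    # newest first), so its id is used as since_id to fetch only newer tweets.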
    last_archive = get_last_archive(args.archive_dir)
    if last_archive:
        last_id = json.loads(next(gzip.open(last_archive, "rt")))["id_str"]
    else:
        last_id = None

    if args.twarc_command == "search":
        tweets = t.search(args.search, since_id=last_id)
    elif args.twarc_command == "timeline":
        if re.match(r"^\d+$", args.search):
            tweets = t.timeline(user_id=args.search, since_id=last_id)
        else:
            tweets = t.timeline(screen_name=args.search, since_id=last_id)
    else:
        raise Exception("invalid twarc_command %s" % args.twarc_command)

    next_archive = get_next_archive(args.archive_dir)

    # we only create the file if there are new tweets to save
    # this prevents empty archive files
    fh = None

    for tweet in tweets:
        if not fh:
            fh = gzip.open(next_archive, "wt")
        logging.info("archived %s", tweet["id_str"])
        fh.write(json.dumps(tweet))
        fh.write("\n")

    if fh:
        fh.close()
    else:
        logging.info("no new tweets found for %s", args.search)

    if os.path.exists(lockfile):
        os.remove(lockfile)


def get_last_archive(archive_dir):
    count = 0
    for filename in os.listdir(archive_dir):
        m = re.match(archive_file_pat, filename)
        if m and int(m.group(1)) > count:
            count = int(m.group(1))
    if count != 0:
        return os.path.join(archive_dir, archive_file_fmt % count)
    else:
        return None


def get_next_archive(archive_dir):
    last_archive = get_last_archive(archive_dir)
    if last_archive:
        m = re.search(archive_file_pat, last_archive)
        count = int(m.group(1)) + 1
    else:
        count = 1
    return os.path.join(archive_dir, archive_file_fmt % count)


if __name__ == "__main__":
    main()