Skip to content

Commit

Permalink
main function
Browse files Browse the repository at this point in the history
This commit moves the majority of the processing code into a main
function which is only called when the program is run from the command
line and not when it is invoked as part of the multiprocessing queue.
This is to hopefully fix an error encountered on OS X with Python 3.9.
  • Loading branch information
edsu committed Feb 18, 2021
1 parent ec9d726 commit 596f429
Showing 1 changed file with 103 additions and 101 deletions.
204 changes: 103 additions & 101 deletions utils/youtubedl.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,57 +75,107 @@

parser.add_argument('files', action='append', help='json files to parse')

args = parser.parse_args()

# make download directory
download_dir = args.download_dir
if not os.path.isdir(download_dir):
    os.mkdir(download_dir)

# set up file logging inside the download directory
log_file = "{}/youtubedl.log".format(download_dir)
logging.basicConfig(filename=log_file, level=logging.INFO)
log = logging.getLogger()

# youtube_dl config: write media plus description/info-json/subtitles under
# <download_dir>/<extractor>/<id>/ and use an archive file so items already
# fetched in earlier runs are skipped
ydl_opts = {
    "format": "best",
    "logger": log,
    "restrictfilenames": True,
    "ignoreerrors": True,
    "nooverwrites": True,
    "writedescription": True,
    "writeinfojson": True,
    "writesubtitles": True,
    "writeautomaticsub": True,
    "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
    "download_archive": "{}/archive.txt".format(download_dir)
}
if args.ignore_livestreams:
    # NOTE(review): youtube_dl documents this option key as "match_filter";
    # confirm "matchfilter" is actually honored
    ydl_opts["matchfilter"] = match_filter_func("!is_live")
if args.max_downloads:
    ydl_opts['max_downloads'] = args.max_downloads
if args.max_filesize:
    ydl_opts['max_filesize'] = args.max_filesize

# keep track of domains to block
blocklist = []
if args.block:
    blocklist = args.block

# read in existing mapping file to know which urls we can ignore
seen = set()
mapping_file = os.path.join(download_dir, 'mapping.tsv')
if os.path.isfile(mapping_file):
    for line in open(mapping_file):
        url, path = line.split('\t')
        log.info('found %s in %s', url, mapping_file)
        seen.add(url)
# append handle used later by the tweet-processing loop; one "url<TAB>filename"
# line is written per processed url
results = open(mapping_file, 'a')


# a function to do the download
def download(url, q):
def main():
    """Entry point when the script is run from the command line.

    Parses arguments, configures logging and youtube_dl, then walks the
    tweet JSON files downloading any media found in tweet URLs.  Each
    download runs in a separate multiprocessing.Process so it can be
    terminated if it exceeds the configured timeout.  Results are
    appended to <download_dir>/mapping.tsv as "url<TAB>filename" lines.
    """
    args = parser.parse_args()

    # make the download directory if it isn't already there
    download_dir = args.download_dir
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    # set up file logging inside the download directory
    log_file = "{}/youtubedl.log".format(download_dir)
    logging.basicConfig(filename=log_file, level=logging.INFO)
    log = logging.getLogger()

    # youtube_dl config: write media plus description/info-json/subtitles
    # under <download_dir>/<extractor>/<id>/ and use an archive file so
    # items already fetched in earlier runs are skipped
    ydl_opts = {
        "format": "best",
        "logger": log,
        "restrictfilenames": True,
        "ignoreerrors": True,
        "nooverwrites": True,
        "writedescription": True,
        "writeinfojson": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
        "download_archive": "{}/archive.txt".format(download_dir)
    }
    if args.ignore_livestreams:
        # NOTE(review): youtube_dl documents this option key as
        # "match_filter"; confirm "matchfilter" is actually honored
        ydl_opts["matchfilter"] = match_filter_func("!is_live")
    if args.max_downloads:
        ydl_opts['max_downloads'] = args.max_downloads
    if args.max_filesize:
        ydl_opts['max_filesize'] = args.max_filesize

    # domains we should never fetch from
    blocklist = []
    if args.block:
        blocklist = args.block

    # read in existing mapping file to know which urls we can ignore
    seen = set()
    mapping_file = os.path.join(download_dir, 'mapping.tsv')
    if os.path.isfile(mapping_file):
        # use a context manager so the read handle is closed (the original
        # leaked it)
        with open(mapping_file) as existing:
            for line in existing:
                url, path = line.split('\t')
                log.info('found %s in %s', url, mapping_file)
                seen.add(url)

    # loop through the tweets, appending one result line per processed url;
    # the context manager guarantees the mapping file is flushed and closed
    with open(mapping_file, 'a') as results:
        for line in fileinput.input(args.files):
            tweet = json.loads(line)
            log.info('analyzing %s', tweet['id_str'])
            for e in tweet['entities']['urls']:
                # prefer the unshortened form when present
                url = e.get('unshortened_url') or e['expanded_url']

                # see if we can skip this one
                if not url:
                    continue
                if url in seen:
                    log.info('already processed %s', url)
                    continue
                seen.add(url)

                # check for blocked domains
                uri = urlparse(url)
                if uri.netloc in blocklist:
                    # was logging.warn (deprecated, root logger); use the
                    # configured logger's warning()
                    log.warning("%s in block list", url)
                    continue

                # run the download in its own process so it can be killed
                # if it exceeds the timeout
                log.info('processing %s', url)
                q = mp.Queue()
                p = mp.Process(target=download, args=(url, q, ydl_opts, log))
                p.start()

                started = datetime.now()
                while True:
                    # if we've exceeded the timeout terminate the process
                    if args.timeout and datetime.now() - started > timedelta(seconds=args.timeout):
                        log.warning('reached timeout %s', args.timeout)
                        p.terminate()
                        break
                    # if the process is done we can stop
                    elif not p.is_alive():
                        break
                    # otherwise sleep and then check again
                    time.sleep(1)

                # an empty queue means there either wasn't a download or it
                # timed out
                if q.empty():
                    filename = ''
                else:
                    filename = q.get()

                p.join()

                # write the result to the mapping file
                results.write("{}\t{}\n".format(url, filename))

def download(url, q, ydl_opts, log):
try:
ydl = youtube_dl.YoutubeDL(ydl_opts)
info = ydl.extract_info(url)
Expand All @@ -139,53 +189,5 @@ def download(url, q):
logging.warning('only %s downloads per url allowed', args.max_downloads)


# loop through the tweets, writing one "url<TAB>filename" line per
# processed url to the mapping file opened above
for line in fileinput.input(args.files):
    tweet = json.loads(line)
    log.info('analyzing %s', tweet['id_str'])
    for e in tweet['entities']['urls']:
        # prefer the unshortened form when present
        url = e.get('unshortened_url') or e['expanded_url']

        # see if we can skip this one
        if not url:
            continue
        if url in seen:
            log.info('already processed %s', url)
            continue
        seen.add(url)

        # check for blocked domains
        uri = urlparse(url)
        if uri.netloc in blocklist:
            # NOTE(review): logging.warn is deprecated and bypasses the
            # configured logger; prefer log.warning
            logging.warn("%s in block list", url)
            continue

        # run the download in its own process so it can be killed if it
        # exceeds the timeout
        log.info('processing %s', url)
        q = mp.Queue()
        p = mp.Process(target=download, args=(url, q))
        p.start()

        started = datetime.now()
        while True:
            # if we've exceeded the timeout terminate the process
            if args.timeout and datetime.now() - started > timedelta(seconds=args.timeout):
                log.warning('reached timeout %s', args.timeout)
                p.terminate()
                break
            # if the process is done we can stop
            elif not p.is_alive():
                break
            # otherwise sleep and then check again
            time.sleep(1)

        # an empty queue means there either wasn't a download or it timed out
        if q.empty():
            filename = ''
        else:
            filename = q.get()

        p.join()

        # write the result to the mapping file
        results.write("{}\t{}\n".format(url, filename))
# only run the pipeline when executed as a script — not when this module
# is re-imported by a multiprocessing child process
if __name__ == "__main__":
    main()

0 comments on commit 596f429

Please sign in to comment.