Skip to content

Commit

Permalink
main function
Browse files Browse the repository at this point in the history
This commit moves the majority of the processing code into a main
function which is only called when the program is run from the command
line and not when it is invoked as part of the multiprocessing queue.
This is to hopefully fix an error encountered on OS X with Python 3.9.
  • Loading branch information
edsu committed Feb 18, 2021
1 parent ec9d726 commit 596f429
Showing 1 changed file with 103 additions and 101 deletions.
204 changes: 103 additions & 101 deletions utils/youtubedl.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,57 +75,107 @@

parser.add_argument('files', action='append', help='json files to parse')

args = parser.parse_args()

# make download directory
download_dir = args.download_dir
if not os.path.isdir(download_dir):
    os.mkdir(download_dir)

# set up file logging inside the download directory
log_file = "{}/youtubedl.log".format(download_dir)
logging.basicConfig(filename=log_file, level=logging.INFO)
log = logging.getLogger()

# youtube_dl config: write media plus description/info-json/subtitles under
# <download_dir>/<extractor>/<id>/ and use an archive file so items already
# fetched in earlier runs are skipped
ydl_opts = {
    "format": "best",
    "logger": log,
    "restrictfilenames": True,
    "ignoreerrors": True,
    "nooverwrites": True,
    "writedescription": True,
    "writeinfojson": True,
    "writesubtitles": True,
    "writeautomaticsub": True,
    "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
    "download_archive": "{}/archive.txt".format(download_dir)
}
if args.ignore_livestreams:
    # NOTE(review): youtube_dl documents this option key as "match_filter";
    # confirm "matchfilter" is actually honored
    ydl_opts["matchfilter"] = match_filter_func("!is_live")
if args.max_downloads:
    ydl_opts['max_downloads'] = args.max_downloads
if args.max_filesize:
    ydl_opts['max_filesize'] = args.max_filesize

# keep track of domains to block
blocklist = []
if args.block:
    blocklist = args.block

# read in existing mapping file to know which urls we can ignore
seen = set()
mapping_file = os.path.join(download_dir, 'mapping.tsv')
if os.path.isfile(mapping_file):
    for line in open(mapping_file):
        url, path = line.split('\t')
        log.info('found %s in %s', url, mapping_file)
        seen.add(url)
# append handle used later by the tweet-processing loop; one "url<TAB>filename"
# line is written per processed url
results = open(mapping_file, 'a')


# a function to do the download
def download(url, q):
def main():
    """Entry point when the script is run from the command line.

    Parses arguments, configures logging and youtube_dl, then walks the
    tweet JSON files downloading any media found in tweet URLs.  Each
    download runs in a separate multiprocessing.Process so it can be
    terminated if it exceeds the configured timeout.  Results are
    appended to <download_dir>/mapping.tsv as "url<TAB>filename" lines.
    """
    args = parser.parse_args()

    # make the download directory if it isn't already there
    download_dir = args.download_dir
    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    # set up file logging inside the download directory
    log_file = "{}/youtubedl.log".format(download_dir)
    logging.basicConfig(filename=log_file, level=logging.INFO)
    log = logging.getLogger()

    # youtube_dl config: write media plus description/info-json/subtitles
    # under <download_dir>/<extractor>/<id>/ and use an archive file so
    # items already fetched in earlier runs are skipped
    ydl_opts = {
        "format": "best",
        "logger": log,
        "restrictfilenames": True,
        "ignoreerrors": True,
        "nooverwrites": True,
        "writedescription": True,
        "writeinfojson": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        "outtmpl": "{}/%(extractor)s/%(id)s/%(title)s.%(ext)s".format(download_dir),
        "download_archive": "{}/archive.txt".format(download_dir)
    }
    if args.ignore_livestreams:
        # NOTE(review): youtube_dl documents this option key as
        # "match_filter"; confirm "matchfilter" is actually honored
        ydl_opts["matchfilter"] = match_filter_func("!is_live")
    if args.max_downloads:
        ydl_opts['max_downloads'] = args.max_downloads
    if args.max_filesize:
        ydl_opts['max_filesize'] = args.max_filesize

    # domains we should never fetch from
    blocklist = []
    if args.block:
        blocklist = args.block

    # read in existing mapping file to know which urls we can ignore
    seen = set()
    mapping_file = os.path.join(download_dir, 'mapping.tsv')
    if os.path.isfile(mapping_file):
        # use a context manager so the read handle is closed (the original
        # leaked it)
        with open(mapping_file) as existing:
            for line in existing:
                url, path = line.split('\t')
                log.info('found %s in %s', url, mapping_file)
                seen.add(url)

    # loop through the tweets, appending one result line per processed url;
    # the context manager guarantees the mapping file is flushed and closed
    with open(mapping_file, 'a') as results:
        for line in fileinput.input(args.files):
            tweet = json.loads(line)
            log.info('analyzing %s', tweet['id_str'])
            for e in tweet['entities']['urls']:
                # prefer the unshortened form when present
                url = e.get('unshortened_url') or e['expanded_url']

                # see if we can skip this one
                if not url:
                    continue
                if url in seen:
                    log.info('already processed %s', url)
                    continue
                seen.add(url)

                # check for blocked domains
                uri = urlparse(url)
                if uri.netloc in blocklist:
                    # was logging.warn (deprecated, root logger); use the
                    # configured logger's warning()
                    log.warning("%s in block list", url)
                    continue

                # run the download in its own process so it can be killed
                # if it exceeds the timeout
                log.info('processing %s', url)
                q = mp.Queue()
                p = mp.Process(target=download, args=(url, q, ydl_opts, log))
                p.start()

                started = datetime.now()
                while True:
                    # if we've exceeded the timeout terminate the process
                    if args.timeout and datetime.now() - started > timedelta(seconds=args.timeout):
                        log.warning('reached timeout %s', args.timeout)
                        p.terminate()
                        break
                    # if the process is done we can stop
                    elif not p.is_alive():
                        break
                    # otherwise sleep and then check again
                    time.sleep(1)

                # an empty queue means there either wasn't a download or it
                # timed out
                if q.empty():
                    filename = ''
                else:
                    filename = q.get()

                p.join()

                # write the result to the mapping file
                results.write("{}\t{}\n".format(url, filename))

def download(url, q, ydl_opts, log):
try:
ydl = youtube_dl.YoutubeDL(ydl_opts)
info = ydl.extract_info(url)
Expand All @@ -139,53 +189,5 @@ def download(url, q):
logging.warning('only %s downloads per url allowed', args.max_downloads)


# loop through the tweets, writing one "url<TAB>filename" line per
# processed url to the mapping file opened above
for line in fileinput.input(args.files):
    tweet = json.loads(line)
    log.info('analyzing %s', tweet['id_str'])
    for e in tweet['entities']['urls']:
        # prefer the unshortened form when present
        url = e.get('unshortened_url') or e['expanded_url']

        # see if we can skip this one
        if not url:
            continue
        if url in seen:
            log.info('already processed %s', url)
            continue
        seen.add(url)

        # check for blocked domains
        uri = urlparse(url)
        if uri.netloc in blocklist:
            # NOTE(review): logging.warn is deprecated and bypasses the
            # configured logger; prefer log.warning
            logging.warn("%s in block list", url)
            continue

        # run the download in its own process so it can be killed if it
        # exceeds the timeout
        log.info('processing %s', url)
        q = mp.Queue()
        p = mp.Process(target=download, args=(url, q))
        p.start()

        started = datetime.now()
        while True:
            # if we've exceeded the timeout terminate the process
            if args.timeout and datetime.now() - started > timedelta(seconds=args.timeout):
                log.warning('reached timeout %s', args.timeout)
                p.terminate()
                break
            # if the process is done we can stop
            elif not p.is_alive():
                break
            # otherwise sleep and then check again
            time.sleep(1)

        # an empty queue means there either wasn't a download or it timed out
        if q.empty():
            filename = ''
        else:
            filename = q.get()

        p.join()

        # write the result to the mapping file
        results.write("{}\t{}\n".format(url, filename))
# only run the pipeline when executed as a script — not when this module
# is re-imported by a multiprocessing child process
if __name__ == "__main__":
    main()

0 comments on commit 596f429

Please sign in to comment.