# twarc/command.py

from __future__ import print_function

import os
import re
import sys
import json
import signal
import codecs
import logging
import datetime
import argparse
import fileinput

from twarc.client import Twarc
from twarc.version import version
from twarc.json2csv import csv, get_headings, get_row
from dateutil.parser import parse as parse_dt

# Python 2/3 compatibility shims: expose one set of names (pyv, get_input,
# str_type, configparser) regardless of the running interpreter.
if sys.version_info[:2] > (2, 7):
    # Python 3
    import configparser

    pyv = 3
    str_type = str
    get_input = input
else:
    # Python 2
    import ConfigParser as configparser

    pyv = 2
    str_type = unicode
    get_input = raw_input

# logger used by the command line tool
log = logging.getLogger("twarc")


# Sub-commands accepted as the first positional argument; the list is also
# printed, in this order, as part of the usage help.
commands = [
    "configure",
    "dehydrate",
    "filter",
    "followers",
    "friends",
    "help",
    "hydrate",
    "replies",
    "retweets",
    "sample",
    "search",
    "timeline",
    "trends",
    "tweet",
    "users",
    "listmembers",
    "version",
]


def _exit_with_usage(parser):
    """
    Print the argparse help, the list of known commands and a usage example,
    then exit with status 1.
    """
    parser.print_help()
    print("\nPlease use one of the following commands:\n")
    for cmd in commands:
        print(" - %s" % cmd)
    print("\nFor example:\n\n    twarc search blacklivesmatter")
    sys.exit(1)


def main():
    """
    Entry point for the twarc command line tool.

    Parses the command line, configures logging and a SIGINT handler, builds
    a Twarc client from the supplied options, runs the requested command and
    writes each returned item to standard output or to the --output file
    (optionally as CSV and/or split into numbered files).

    Exits through sys.exit() / parser.error() for the version, help and
    configure commands and for invalid input.
    """
    parser = get_argparser()
    args = parser.parse_args()

    command = args.command
    query = args.query or ""

    logging.basicConfig(
        filename=args.log,
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    # log and stop when process receives SIGINT
    def stop(signum, frame):
        log.warning("process received SIGINT, stopping")
        sys.exit(0)

    signal.signal(signal.SIGINT, stop)

    if command == "version":
        print("twarc v%s" % version)
        sys.exit()
    elif command == "help" or not command:
        _exit_with_usage(parser)

    # Don't validate the keys if the command is "configure"
    validate_keys = not (command == "configure" or args.skip_key_validation)

    t = Twarc(
        consumer_key=args.consumer_key,
        consumer_secret=args.consumer_secret,
        access_token=args.access_token,
        access_token_secret=args.access_token_secret,
        connection_errors=args.connection_errors,
        http_errors=args.http_errors,
        config=args.config,
        profile=args.profile,
        tweet_mode=args.tweet_mode,
        protected=args.protected,
        validate_keys=validate_keys,
        app_auth=args.app_auth,
        gnip_auth=args.gnip_auth,
    )

    # calls that return tweets
    if command == "search":
        # only the first --lang value is used for search
        lang = args.lang[0] if args.lang else None

        # if not using a premium endpoint do a standard search
        if not args.thirtyday and not args.fullarchive and not args.gnip_fullarchive:
            things = t.search(
                query,
                since_id=args.since_id,
                max_id=args.max_id,
                lang=lang,
                result_type=args.result_type,
                geocode=args.geocode,
            )
        else:
            # parse the dates if given
            from_date = parse_dt(args.from_date) if args.from_date else None
            to_date = parse_dt(args.to_date) if args.to_date else None
            if args.gnip_fullarchive:
                env = args.gnip_fullarchive
                product = "gnip_fullarchive"
            elif args.thirtyday:
                env = args.thirtyday
                product = "30day"
            else:
                env = args.fullarchive
                product = "fullarchive"
            things = t.premium_search(
                query,
                product,
                env,
                from_date=from_date,
                to_date=to_date,
                sandbox=args.sandbox,
                limit=args.limit,
            )

    elif command == "filter":
        things = t.filter(
            track=query, follow=args.follow, locations=args.locations, lang=args.lang
        )

    elif command == "dehydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode="r",
            openhook=fileinput.hook_compressed,
        )
        things = t.dehydrate(input_iterator)

    elif command == "hydrate":
        input_iterator = fileinput.FileInput(
            query,
            mode="r",
            openhook=fileinput.hook_compressed,
        )
        things = t.hydrate(input_iterator)

    elif command == "tweet":
        # same existence check as the "replies" command, so a missing tweet
        # gives a clean error instead of failing later in the output loop
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = [tweet]

    elif command == "sample":
        things = t.sample()

    elif command == "timeline":
        kwargs = {"max_id": args.max_id, "since_id": args.since_id}
        # an all-digit query is treated as a user id, anything else as a
        # screen name
        if re.match("^[0-9]+$", query):
            kwargs["user_id"] = query
        elif query:
            kwargs["screen_name"] = query
        things = t.timeline(**kwargs)

    elif command == "retweets":
        if os.path.isfile(query):
            iterator = fileinput.FileInput(
                query,
                mode="r",
                openhook=fileinput.hook_compressed,
            )
            things = t.retweets(tweet_ids=iterator)
        else:
            things = t.retweets(tweet_ids=query.split(","))

    elif command == "users":
        if os.path.isfile(query):
            # peek at the first line to decide whether the file holds user
            # ids or screen names; close the handle once it has been read
            with open(query) as id_file:
                first_line = id_file.readline()
            if re.match("^[0-9,]+$", first_line):
                id_type = "user_id"
            else:
                id_type = "screen_name"
            iterator = fileinput.FileInput(
                query,
                mode="r",
                openhook=fileinput.hook_compressed,
            )
            things = t.user_lookup(ids=iterator, id_type=id_type)
        elif re.match("^[0-9,]+$", query):
            things = t.user_lookup(ids=query.split(","))
        else:
            things = t.user_lookup(ids=query.split(","), id_type="screen_name")

    elif command == "followers":
        things = t.follower_ids(query)

    elif command == "friends":
        things = t.friend_ids(query)

    elif command == "trends":
        # lookup woeid for geo-coordinate if appropriate
        geo = re.match("^([0-9-.]+),([0-9-.]+)$", query)
        if geo:
            lat, lon = map(float, geo.groups())
            if lat > 180 or lat < -180 or lon > 180 or lon < -180:
                parser.error("LAT and LONG must be within [-180.0, 180.0]")
            places = list(t.trends_closest(lat, lon))
            if len(places) == 0:
                parser.error("Couldn't find WOE ID for %s" % query)
            query = places[0]["woeid"]

        if not query:
            things = t.trends_available()
        else:
            trends = t.trends_place(query)
            # no trends for the place means nothing to output
            things = trends[0]["trends"] if trends else []

    elif command == "replies":
        tweet = t.tweet(query)
        if not tweet:
            parser.error("tweet with id %s does not exist" % query)
        things = t.replies(tweet, args.recursive)

    elif command == "listmembers":
        list_parts = re.match("^https://twitter.com/(.+)/lists/(.+)$", query)
        if not list_parts:
            parser.error(
                "provide the url for the list, e.g., https://twitter.com/USAFacts/lists/us-armed-forces"
            )
        # group(1) is the list owner, group(2) is the list slug
        things = t.list_members(
            slug=list_parts.group(2), owner_screen_name=list_parts.group(1)
        )

    elif command == "configure":
        t.configure()
        sys.exit()

    else:
        _exit_with_usage(parser)

    # reject unsupported csv requests before any output file is created
    csv_requested = args.format in ("csv", "csv-excel")
    if csv_requested and command not in [
        "filter",
        "hydrate",
        "replies",
        "retweets",
        "sample",
        "search",
        "timeline",
        "tweet",
    ]:
        parser.error("csv output not available for %s" % command)

    # get the output filehandle
    if args.output:
        if pyv == 3:
            fh = codecs.open(args.output, "wb", "utf8")
        else:
            fh = open(args.output, "w")
    else:
        fh = sys.stdout

    # optionally create a csv writer
    csv_writer = None
    if csv_requested:
        csv_writer = csv.writer(fh)
        csv_writer.writerow(get_headings())

    line_count = 0
    file_count = 0
    try:
        for thing in things:

            # rotate the files if necessary
            if args.output and args.split and line_count % args.split == 0:
                file_count += 1
                # close the file being replaced so its contents are flushed
                fh.close()
                fh = codecs.open(
                    numbered_filepath(args.output, file_count), "wb", "utf8"
                )
                if csv_writer:
                    csv_writer = csv.writer(fh)
                    csv_writer.writerow(get_headings())

            line_count += 1

            # ready to output

            if isinstance(thing, str_type):
                # user or tweet IDs
                print(thing, file=fh)
                log.info("archived %s", thing)
            elif "id_str" in thing:
                # tweets and users
                if args.format == "json":
                    print(json.dumps(thing), file=fh)
                elif args.format == "csv":
                    csv_writer.writerow(get_row(thing))
                elif args.format == "csv-excel":
                    csv_writer.writerow(get_row(thing, excel=True))
                log.info("archived %s", thing["id_str"])
            elif "woeid" in thing:
                # places
                print(json.dumps(thing), file=fh)
            elif "tweet_volume" in thing:
                # trends
                print(json.dumps(thing), file=fh)
            elif "limit" in thing:
                # rate limits
                limit_time = datetime.datetime.utcfromtimestamp(
                    float(thing["limit"]["timestamp_ms"]) / 1000
                )
                limit_time = limit_time.isoformat("T") + "Z"
                log.warning(
                    "%s tweets undelivered at %s", thing["limit"]["track"], limit_time
                )
                if args.warnings:
                    print(json.dumps(thing), file=fh)
            elif "warning" in thing:
                # other warnings
                log.warning(thing["warning"]["message"])
                if args.warnings:
                    print(json.dumps(thing), file=fh)
            elif "data" in thing:
                # Labs style JSON schema.
                print(json.dumps(thing), file=fh)
    finally:
        # runs on normal completion and on SystemExit raised by the SIGINT
        # handler; never close stdout
        if fh is not sys.stdout:
            fh.close()


def get_argparser():
    """
    Get the command line argument parser.

    The parser takes a required positional command (one of the module-level
    commands list), an optional positional query, and the optional flags
    defined below.

    Returns the configured argparse.ArgumentParser.
    """

    parser = argparse.ArgumentParser("twarc")
    parser.add_argument("command", choices=commands)
    parser.add_argument("query", nargs="?", default=None)
    parser.add_argument("--log", dest="log", default="twarc.log", help="log file")
    parser.add_argument("--consumer_key", default=None, help="Twitter API consumer key")
    parser.add_argument(
        "--consumer_secret", default=None, help="Twitter API consumer secret"
    )
    parser.add_argument("--access_token", default=None, help="Twitter API access key")
    parser.add_argument(
        "--access_token_secret", default=None, help="Twitter API access token secret"
    )
    parser.add_argument(
        "--config", help="Config file containing Twitter keys and secrets"
    )
    parser.add_argument(
        "--profile", help="Name of a profile in your configuration file"
    )
    parser.add_argument(
        "--warnings", action="store_true", help="Include warning messages in output"
    )
    parser.add_argument(
        "--connection_errors",
        type=int,
        default=0,
        help="Number of connection errors before giving up",
    )
    parser.add_argument(
        "--http_errors",
        type=int,
        default=0,
        help="Number of http errors before giving up",
    )
    parser.add_argument(
        "--max_id", dest="max_id", help="maximum tweet id to search for"
    )
    parser.add_argument("--since_id", dest="since_id", help="smallest id to search for")
    parser.add_argument(
        "--result_type",
        dest="result_type",
        choices=["mixed", "recent", "popular"],
        default="recent",
        help="search result type",
    )
    # may be given several times; the values are collected into a list
    parser.add_argument(
        "--lang",
        dest="lang",
        action="append",
        default=[],
        help="limit to ISO 639-1 language code",
    )
    parser.add_argument(
        "--geocode", dest="geocode", help="limit by latitude,longitude,radius"
    )
    parser.add_argument(
        "--locations", dest="locations", help="limit filter stream to location(s)"
    )
    parser.add_argument(
        "--follow", dest="follow", help="limit filter to tweets from given user id(s)"
    )
    parser.add_argument(
        "--recursive",
        dest="recursive",
        action="store_true",
        help="also fetch replies to replies",
    )
    parser.add_argument(
        "--tweet_mode",
        action="store",
        default="extended",
        dest="tweet_mode",
        choices=["compat", "extended"],
        help="set tweet mode",
    )
    parser.add_argument(
        "--protected",
        dest="protected",
        action="store_true",
        help="include protected tweets",
    )
    parser.add_argument(
        "--output",
        action="store",
        default=None,
        dest="output",
        help="write output to file path",
    )
    parser.add_argument(
        "--format",
        action="store",
        default="json",
        dest="format",
        choices=["json", "csv", "csv-excel"],
        help="set output format",
    )
    parser.add_argument(
        "--split",
        action="store",
        type=int,
        default=0,
        help="used with --output to split into numbered files",
    )
    parser.add_argument(
        "--skip_key_validation",
        action="store_true",
        help="skip checking keys are valid on startup",
    )
    parser.add_argument(
        "--app_auth",
        action="store_true",
        default=False,
        help="run in App Auth mode instead of User Auth",
    )
    parser.add_argument(
        "--gnip_auth",
        action="store_true",
        default=False,
        help="run in Gnip Auth mode (for enterprise APIs)",
    )
    # "30day" is not a valid attribute name, so the value is stored as
    # args.thirtyday
    parser.add_argument(
        "--30day",
        action="store",
        dest="thirtyday",
        help="environment to use to search 30day premium endpoint",
    )
    parser.add_argument(
        "--fullarchive",
        action="store",
        help="environment to use to search fullarchive premium endpoint",
    )
    parser.add_argument(
        "--gnip_fullarchive",
        action="store",
        help="environment to use to search gnip fullarchive enterprise endpoint",
    )
    parser.add_argument(
        "--from_date",
        action="store",
        default=None,
        help="limit premium search to date e.g. 2012-05-01 03:04:01",
    )
    parser.add_argument(
        "--to_date",
        action="store",
        default=None,
        help="limit premium search to date e.g. 2012-05-01 03:04:01",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="limit number of tweets returned by Premium API",
    )
    parser.add_argument(
        "--sandbox",
        action="store_true",
        default=False,
        help="indicate that Premium API endpoint is a sandbox",
    )

    return parser


def numbered_filepath(filepath, num):
    """
    Return filepath with a zero-padded sequence number inserted before its
    extension, e.g. ("tweets.jsonl", 1) -> "tweets-001.jsonl".

    The number is left-padded with zeros to at least three characters;
    larger numbers are written out in full.
    """
    path, ext = os.path.splitext(filepath)
    return "{}-{:0>3}{}".format(path, num, ext)