#!/usr/bin/env python

# NOTE:
#
# This script has been ported to the twarc-network plugin for working
# with data collected with twarc2. Please see
# https://github.com/docnow/twarc-network for details.
#
# ---
#
# build a reply, quote, and retweet network from a file of tweets and write it
# out as a gexf, gml, dot, json, or html file. You will need to have networkx
# installed, and pydotplus if you want to use dot. The html presentation
# uses d3 to display the network graph in your browser.
#
# ./network.py tweets.jsonl network.html
#
# or
#
# ./network.py tweets.jsonl network.dot
#
# or
#
# ./network.py tweets.jsonl network.gexf
#
# if you would rather have the network oriented around nodes that are users
# instead of tweets, use the --users flag:
#
# ./network.py --users tweets.jsonl network.gexf
#
# if you would rather have the network oriented around nodes that are hashtags
# instead of tweets or users, use the --hashtags flag:
#
# ./network.py --hashtags tweets.jsonl network.gexf
#
# TODO: this is mostly here so someone can improve it :)

import json
import time
import optparse
import itertools

import networkx
from networkx import nx_pydot

usage = "network.py tweets.jsonl graph.html"
opt_parser = optparse.OptionParser(usage=usage)
opt_parser.add_option(
    "--retweets", dest="retweets", action="store_true", help="include retweets"
)
opt_parser.add_option(
    "--min_subgraph_size",
    dest="min_subgraph_size",
    type="int",
    help="remove any subgraphs with a size smaller than this number",
)
opt_parser.add_option(
    "--max_subgraph_size",
    dest="max_subgraph_size",
    type="int",
    help="remove any subgraphs with a size larger than this number",
)
opt_parser.add_option(
    "--users",
    dest="users",
    action="store_true",
    help="show user relations instead of tweet relations",
)
opt_parser.add_option(
    "--hashtags",
    dest="hashtags",
    action="store_true",
    help="show hashtag relations instead of tweet relations",
)
options, args = opt_parser.parse_args()

if len(args) != 2:
    opt_parser.error("must supply input and output file names")

tweets, output = args

G = networkx.DiGraph()


def add(from_user, from_id, to_user, to_id, type, created_at=None):
    "adds a relation to the graph"
    # storing start_date allows for timestamps in the gephi timeline, where
    # nodes appear on screen at their start date and remain visible afterwards
    if (options.users or options.hashtags) and to_user:
        G.add_node(from_user, screen_name=from_user, start_date=created_at)
        G.add_node(to_user, screen_name=to_user, start_date=created_at)

        if G.has_edge(from_user, to_user):
            weight = G[from_user][to_user]["weight"] + 1
        else:
            weight = 1
        G.add_edge(from_user, to_user, type=type, weight=weight)

    elif not options.users and to_id:
        G.add_node(from_id, screen_name=from_user, type=type)
        if to_user:
            G.add_node(to_id, screen_name=to_user)
        else:
            G.add_node(to_id)
        G.add_edge(from_id, to_id, type=type)


def to_json(g):
    j = {"nodes": [], "links": []}
    for node_id, node_attrs in g.nodes(data=True):
        j["nodes"].append(
            {
                "id": node_id,
                "type": node_attrs.get("type"),
                "screen_name": node_attrs.get("screen_name"),
            }
        )
    for source, target, attrs in g.edges(data=True):
        j["links"].append(
            {"source": source, "target": target, "type": attrs.get("type")}
        )
    return j


for line in open(tweets):
    try:
        t = json.loads(line)
    except ValueError:
        # skip lines that aren't valid JSON
        continue
    from_id = t["id_str"]
    from_user = t["user"]["screen_name"]
    from_user_id = t["user"]["id_str"]
    to_user = None
    to_id = None
    # standardize the raw created_at date to dd/MM/yyyy HH:mm:ss
    created_at_date = time.strftime(
        "%d/%m/%Y %H:%M:%S",
        time.strptime(t["created_at"], "%a %b %d %H:%M:%S +0000 %Y"),
    )
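    # e.g. a created_at value like "Wed Oct 10 20:19:24 +0000 2018" becomes
    # "10/10/2018 20:19:24", which gephi can be configured to parse as the
    # node's start_date on its timeline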
    if options.users:
        for u in t["entities"].get("user_mentions", []):
            # note: user mention edges are recorded with the "reply" type
            add(from_user, from_id, u["screen_name"], None, "reply", created_at_date)
    elif options.hashtags:
        hashtags = t["entities"].get("hashtags", [])
        # list of all possible hashtag pairs co-occurring in this tweet
        hashtag_pairs = list(itertools.combinations(hashtags, 2))
        for u in hashtag_pairs:
            # source hashtag: u[0]['text']
            # target hashtag: u[1]['text']
            add(
                "#" + u[0]["text"],
                None,
                "#" + u[1]["text"],
                None,
                "hashtag",
                created_at_date,
            )
    else:
        if t.get("in_reply_to_status_id_str"):
            to_id = t["in_reply_to_status_id_str"]
            to_user = t["in_reply_to_screen_name"]
            add(from_user, from_id, to_user, to_id, "reply")
        if t.get("quoted_status"):
            to_id = t["quoted_status"]["id_str"]
            to_user = t["quoted_status"]["user"]["screen_name"]
            to_user_id = t["quoted_status"]["user"]["id_str"]
            add(from_user, from_id, to_user, to_id, "quote")
        if options.retweets and t.get("retweeted_status"):
            to_id = t["retweeted_status"]["id_str"]
            to_user = t["retweeted_status"]["user"]["screen_name"]
            to_user_id = t["retweeted_status"]["user"]["id_str"]
            add(from_user, from_id, to_user, to_id, "retweet")

if options.min_subgraph_size or options.max_subgraph_size:
    g_copy = G.copy()
    # connected_component_subgraphs was removed from networkx, and the graph
    # is directed, so take the subgraph of each weakly connected component
    for component in networkx.weakly_connected_components(G):
        g = G.subgraph(component)
        if options.min_subgraph_size and len(g) < options.min_subgraph_size:
            g_copy.remove_nodes_from(g.nodes())
        elif options.max_subgraph_size and len(g) > options.max_subgraph_size:
            g_copy.remove_nodes_from(g.nodes())
    G = g_copy

if output.endswith(".gexf"):
    networkx.write_gexf(G, output)

elif output.endswith(".gml"):
    networkx.write_gml(G, output)

elif output.endswith(".dot"):
    nx_pydot.write_dot(G, output)

elif output.endswith(".json"):
    json.dump(to_json(G), open(output, "w"), indent=2)

elif output.endswith(".html"):
    graph_data = json.dumps(to_json(G), indent=2)
    html = (
        """

""" % graph_data ) open(output, "w").write(html)