Skip to content

Instantly share code, notes, and snippets.

@mrecachinas
Created May 7, 2017 16:08
Show Gist options
  • Save mrecachinas/a45cc590dce1e3a7e08978aac06f832e to your computer and use it in GitHub Desktop.
Save mrecachinas/a45cc590dce1e3a7e08978aac06f832e to your computer and use it in GitHub Desktop.

Revisions

  1. Michael Recachinas created this gist May 7, 2017.
    58 changes: 58 additions & 0 deletions wiki.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,58 @@
    #!/usr/bin/env python
    from __future__ import print_function
    import urllib2
    import httplib
    from bs4 import BeautifulSoup, SoupStrainer
    import string
    import time

    def get_wiki(link):
        """Open *link* and return the response object, or None on failure.

        This is a best-effort fetch: no exception escapes to the caller.

        Args:
            link: URL string handed to ``urllib2.urlopen``.

        Returns:
            The file-like response returned by ``urllib2.urlopen``, or
            ``None`` when the request could not be completed.
        """
        try:
            return urllib2.urlopen(link)
        except (urllib2.URLError, httplib.HTTPException) as e:
            # urllib2.HTTPError is a subclass of URLError, so a separate
            # handler for it placed after URLError could never run.
            import sys
            print("get_wiki: could not fetch %s: %s" % (link, e), file=sys.stderr)
        except Exception:
            # Anything unexpected: show the traceback but keep going.
            import traceback
            print(traceback.format_exc())
        # Make the failure value explicit instead of falling off the end.
        return None

    def get_random_wiki():
        """Fetch Wikipedia's ``Special:Random`` URL through ``get_wiki``.

        Returns:
            Whatever ``get_wiki`` returns for that URL.
        """
        return get_wiki("https://en.wikipedia.org/wiki/Special:Random")

    def first_lowercase_link(wiki):
        """Return the first paragraph link whose text starts with a lowercase letter.

        Only ``<a href=...>`` tags inside ``<p>`` tags under the element with
        id ``mw-content-text`` are considered, in document order.

        Args:
            wiki: HTML markup (string) or an open file-like object to parse.
                ``None`` is accepted and yields ``None``.

        Returns:
            A ``(absolute_url, link_text)`` tuple, or ``None`` when the input
            is ``None``, the content element is missing, or no link qualifies.
        """
        if wiki is None:
            return None
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        content = BeautifulSoup(wiki, "html.parser").find(id='mw-content-text')
        if content is None:
            # Empty or unexpected page: nothing to follow.
            return None
        prefix = "https://en.wikipedia.org"
        for para in content.find_all('p'):
            for link in para.find_all('a', href=True):
                # ASCII-only check: text starting with an uppercase letter,
                # a digit, punctuation or a non-ASCII character is skipped.
                if link.text and link.text[0] in string.ascii_lowercase:
                    # hrefs may be site-relative; resolve them against prefix.
                    return (urllib2.urlparse.urljoin(prefix, link["href"]), link.text)
        return None

    def links_to_topic(wiki, topic=""):
        """Follow first-lowercase links from *wiki* until *topic* is reached.

        Args:
            wiki: Starting page, in any form ``first_lowercase_link`` accepts.
            topic: Link text to stop at, compared case-insensitively. With the
                default ``""`` the chain is followed until it ends.

        Returns:
            List of ``(url, link_text)`` tuples in visiting order. The link
            matching *topic* is included as the last element when it is found.
            The list stops early if a page has no qualifying link, a fetch
            fails, or the chain revisits a URL.
        """
        target = topic.lower()
        links = []
        seen = set()
        link = first_lowercase_link(wiki)
        while link and link[1].lower() != target:
            url = link[0]
            if url in seen:
                # The chain loops back on itself and would never terminate.
                return links
            seen.add(url)
            links.append(link)
            next_wiki = get_wiki(url)
            if next_wiki is None:
                # Fetch failed; return what was collected so far.
                return links
            link = first_lowercase_link(next_wiki)
        if link:
            # Loop ended on the topic link itself; record it as well.
            links.append(link)
        return links

    def test():
        """Print the link chains toward "philosophy" for several random pages.

        Fetches up to ``MAX_ITER`` random pages, maps each page's
        ``firstHeading`` text to the result of ``links_to_topic`` and prints
        the resulting dict. Pages that cannot be fetched or have no heading
        are skipped.
        """
        MAX_ITER = 10
        links = {}
        for _ in range(MAX_ITER):
            random_wiki = get_random_wiki()
            if random_wiki is None:
                # Fetch failed; nothing to parse for this iteration.
                continue
            try:
                # Read the body once: a response can only be consumed a single
                # time, and both the title lookup and the link walk need it.
                html = random_wiki.read()
            finally:
                random_wiki.close()
            heading = BeautifulSoup(html, "html.parser").find(id="firstHeading")
            if heading is None:
                continue
            links[heading.text] = links_to_topic(html, topic="philosophy")
        print(links)

    # Script entry point: run the crawl only when executed directly.
    if __name__ == "__main__":
        test()