Skip to content

Commit

Permalink
Merge branch 'main' of github.com:DocNow/twarc into main
Browse files Browse the repository at this point in the history
  • Loading branch information
igorbrigadir committed May 31, 2021
2 parents acb89bb + 68c35bb commit 5461300
Show file tree
Hide file tree
Showing 9 changed files with 168 additions and 98 deletions.
4 changes: 2 additions & 2 deletions docs/api/client2.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# twarc.Client2
# twarc.expansions

::: twarc.client2
::: twarc.expansions
handler: python
2 changes: 2 additions & 0 deletions docs/twitter-developer-access.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ Now that you have your keys and tokens, you can start using the API. You may be

Be careful not to commit your keys into a public repository or make them visible to the public - do not include them in a client side js script for example. Most apps will ask for API Key and Secret, but "Consumer Key" is "API Key" and "Consumer Secret" is "API Secret".

For Academic Access, there is only one endpoint that takes Bearer (App Only) authentication, so in most cases, the Bearer Token is all you need to share.

## Step 5: Next Steps

Install `twarc`, and run `twarc2 configure` to set it up.
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ nav:
- Library API:
- api/client.md
- api/client2.md
- api/expansions.md

plugins:
- search
Expand Down
59 changes: 42 additions & 17 deletions test_twarc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import dotenv
import pytest
import logging
import pathlib
import datetime
import threading

Expand All @@ -15,6 +16,7 @@
access_token = os.environ.get('ACCESS_TOKEN')
access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET')

test_data = pathlib.Path('test-data')
logging.basicConfig(filename="test.log", level=logging.INFO)

# Implicitly test the constructor in application auth mode. This ensures that
Expand Down Expand Up @@ -292,6 +294,7 @@ def test_follows():
break
assert found >= 1000


def test_follows_username():
"""
Test followers and following by username.
Expand Down Expand Up @@ -330,16 +333,20 @@ def test_flattened():
found_referenced_tweets = False

event = threading.Event()
for count, result in enumerate(T.sample(event=event)):
result = twarc.expansions.flatten(result)
for count, response in enumerate(T.sample(event=event)):

# streaming api always returns a tweet at a time but flatten
# will put these in a list so they can be treated uniformly
tweets = twarc.expansions.flatten(response)
assert len(tweets) == 1
tweet = tweets[0]

tweet = result["data"]
assert "id" in tweet
logging.info("got sample tweet #%s %s", count, tweet["id"])

author_id = tweet["author_id"]
assert "author" in tweet
assert result["data"]["author"]["id"] == author_id
assert tweet["author"]["id"] == author_id

if "in_reply_to_user_id" in tweet:
assert "in_reply_to_user" in tweet
Expand All @@ -362,8 +369,11 @@ def test_flattened():
assert tweet["entities"]["mentions"][0]["username"]
found_entities_mentions = True

if "referenced_tweets" in tweet:
assert tweet["referenced_tweets"][0]["id"]
# need to ensure there are no errors because a referenced tweet
# might be protected or deleted in which case it would not have been
# included in the response and would not have been flattened
if "errors" not in response and "referenced_tweets" in tweet:
assert tweet["referenced_tweets"][0]["text"]
found_referenced_tweets = True

if found_geo and found_in_reply_to_user and found_attachments_media \
Expand All @@ -383,18 +393,33 @@ def test_flattened():
assert found_referenced_tweets, "found referenced tweets"


def test_flatten_noop():
"""
Flattening twice should be a no-op.
"""
resp = next(T.tweet_lookup(range(1000, 2000)))
def test_ensure_flattened():
resp = next(T.search_recent('twitter'))

# flatten a response
flat1 = twarc.expansions.ensure_flattened(resp)
assert isinstance(flat1, list)
assert len(flat1) > 1
assert 'author' in flat1[0]

# flatten the flattened list
flat2 = twarc.expansions.ensure_flattened(flat1)
assert isinstance(flat2, list)
assert len(flat2) == len(flat1)
assert 'author' in flat2[0]

flat1 = twarc.expansions.flatten(resp)
assert len(flat1) > 0
# flatten a tweet object which will force it into a list
flat3 = twarc.expansions.ensure_flattened(flat2[0])
assert isinstance(flat3, list)
assert len(flat3) == 1

flat2 = twarc.expansions.flatten(flat1)
assert len(flat2) > 0
assert len(flat1) == len(flat2)
with pytest.raises(ValueError):
twarc.expansions.ensure_flattened({'fake': 'tweet'})
with pytest.raises(ValueError):
twarc.expansions.ensure_flattened([{'fake': 'tweet'}])
with pytest.raises(ValueError):
flat1[0].pop('author')
twarc.expansions.ensure_flattened(flat1)


def test_twarc_metadata():
Expand All @@ -408,7 +433,7 @@ def test_twarc_metadata():

for response in T.tweet_lookup(range(1000, 2000)):
assert "__twarc" in response
assert "__twarc" in twarc.expansions.flatten(response)
assert "__twarc" in twarc.expansions.flatten(response)[0]

# Without metadata
T.metadata = False
Expand Down
1 change: 1 addition & 0 deletions twarc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .client import Twarc
from .client2 import Twarc2
from .version import version
from .expansions import ensure_flattened
26 changes: 15 additions & 11 deletions twarc/client2.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,17 +133,19 @@ def _search(
count += len(response['data'])
yield response

# Calculate the amount of time to sleep, accounting for any
# processing time used by the rest of the application.
# This is to satisfy the 1 request / 1 second rate limit
# on the search/all endpoint.

time.sleep(
max(0, sleep_between - (time.monotonic() - made_call))
)
made_call = time.monotonic()
else:
log.info(f'no more results for search')
log.info(f'Retrieved an empty page of results.')

# Calculate the amount of time to sleep, accounting for any
# processing time used by the rest of the application.
# This is to satisfy the 1 request / 1 second rate limit
# on the search/all endpoint.
time.sleep(
max(0, sleep_between - (time.monotonic() - made_call))
)
made_call = time.monotonic()

log.info(f'No more results for search {query}.')

def search_recent(
self, query, since_id=None, until_id=None, start_time=None,
Expand Down Expand Up @@ -497,7 +499,9 @@ def _timeline(
count += len(response['data'])
yield response
else:
log.info(f'no more results for timeline')
log.info(f'Retrieved an empty page of results for timeline {user_id}')

log.info(f'No more results for timeline {user_id}.')

def timeline(
self, user, since_id=None, until_id=None, start_time=None,
Expand Down
Loading

0 comments on commit 5461300

Please sign in to comment.