Merge branch 'main' of github.com:DocNow/twarc into main

DocNow · May 31, 2021 · 5461300 · 5461300
2 parents acb89bb + 68c35bb
commit 5461300
Show file tree

Hide file tree

Showing 9 changed files with 168 additions and 98 deletions.
diff --git a/docs/api/client2.md b/docs/api/client2.md
@@ -1,4 +1,4 @@
-# twarc.Client2
+# twarc.expansions
 
-::: twarc.client2
+::: twarc.expansions
   handler: python
diff --git a/docs/twitter-developer-access.md b/docs/twitter-developer-access.md
@@ -61,6 +61,8 @@ Now that you have your keys and tokens, you can start using the API. You may be
 
 Be careful not to commit your keys into a public repository or make them visible to the public - do not include them in a client side js script for example. Most apps will ask for API Key and Secret, but "Consumer Key" is "API Key" and "Consumer Secret" is "API Secret".
 
+For Academic Access, there is only one endpoint that takes Bearer (App Only) authentication, so in most cases, the Bearer Token is all you need to share.
+
 ## Step 5: Next Steps
 
 Install `twarc`, and run `twarc2 configure` to set it up.

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -28,6 +28,7 @@ nav:
   - Library API:
     - api/client.md
     - api/client2.md
+    - api/expansions.md
 
 plugins:
 - search

diff --git a/test_twarc2.py b/test_twarc2.py
@@ -5,6 +5,7 @@
 import dotenv
 import pytest
 import logging
+import pathlib
 import datetime
 import threading
 
@@ -15,6 +16,7 @@
 access_token = os.environ.get('ACCESS_TOKEN')
 access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET')
 
+test_data = pathlib.Path('test-data')
 logging.basicConfig(filename="test.log", level=logging.INFO)
 
 # Implicitly test the constructor in application auth mode. This ensures that
@@ -292,6 +294,7 @@ def test_follows():
             break
     assert found >= 1000
 
+
 def test_follows_username():
     """
     Test followers and following by username.
@@ -330,16 +333,20 @@ def test_flattened():
     found_referenced_tweets = False
 
     event = threading.Event()
-    for count, result in enumerate(T.sample(event=event)):
-        result = twarc.expansions.flatten(result)
+    for count, response in enumerate(T.sample(event=event)):
+
+        # streaming api always returns a tweet at a time but flatten
+        # will put these in a list so they can be treated uniformly
+        tweets = twarc.expansions.flatten(response)
+        assert len(tweets) == 1
+        tweet = tweets[0]
 
-        tweet = result["data"]
         assert "id" in tweet
         logging.info("got sample tweet #%s %s", count, tweet["id"])
 
         author_id = tweet["author_id"]
         assert "author" in tweet
-        assert result["data"]["author"]["id"] == author_id
+        assert tweet["author"]["id"] == author_id
 
         if "in_reply_to_user_id" in tweet:
             assert "in_reply_to_user" in tweet
@@ -362,8 +369,11 @@ def test_flattened():
             assert tweet["entities"]["mentions"][0]["username"]
             found_entities_mentions = True
 
-        if "referenced_tweets" in tweet:
-            assert tweet["referenced_tweets"][0]["id"]
+        # need to ensure there are no errors because a referenced tweet
+        # might be protected or deleted in which case it would not have been
+        # included in the response and would not have been flattened
+        if "errors" not in response and "referenced_tweets" in tweet:
+            assert tweet["referenced_tweets"][0]["text"]
             found_referenced_tweets = True
 
         if found_geo and found_in_reply_to_user and found_attachments_media \
@@ -383,18 +393,33 @@ def test_flattened():
     assert found_referenced_tweets, "found referenced tweets"
 
 
-def test_flatten_noop():
-    """
-    Flattening twice should be a no-op.
-    """
-    resp = next(T.tweet_lookup(range(1000, 2000)))
+def test_ensure_flattened():
+    resp = next(T.search_recent('twitter'))
+
+    # flatten a response
+    flat1 = twarc.expansions.ensure_flattened(resp)
+    assert isinstance(flat1, list)
+    assert len(flat1) > 1
+    assert 'author' in flat1[0]
+
+    # flatten the flattened list
+    flat2 = twarc.expansions.ensure_flattened(flat1)
+    assert isinstance(flat2, list)
+    assert len(flat2) == len(flat1)
+    assert 'author' in flat2[0]
 
-    flat1 = twarc.expansions.flatten(resp)
-    assert len(flat1) > 0
+    # flatten a tweet object which will force it into a list
+    flat3 = twarc.expansions.ensure_flattened(flat2[0])
+    assert isinstance(flat3, list)
+    assert len(flat3) == 1
 
-    flat2 = twarc.expansions.flatten(flat1)
-    assert len(flat2) > 0
-    assert len(flat1) == len(flat2)
+    with pytest.raises(ValueError):
+        twarc.expansions.ensure_flattened({'fake': 'tweet'})
+    with pytest.raises(ValueError):
+        twarc.expansions.ensure_flattened([{'fake': 'tweet'}])
+    with pytest.raises(ValueError):
+        flat1[0].pop('author')
+        twarc.expansions.ensure_flattened(flat1)
 
 
 def test_twarc_metadata():
@@ -408,7 +433,7 @@ def test_twarc_metadata():
 
     for response in T.tweet_lookup(range(1000, 2000)):
         assert "__twarc" in response
-        assert "__twarc" in twarc.expansions.flatten(response)
+        assert "__twarc" in twarc.expansions.flatten(response)[0]
 
     # Without metadata
     T.metadata = False

diff --git a/twarc/__init__.py b/twarc/__init__.py
@@ -1,3 +1,4 @@
 from .client import Twarc
 from .client2 import Twarc2
 from .version import version
+from .expansions import ensure_flattened
diff --git a/twarc/client2.py b/twarc/client2.py
@@ -133,17 +133,19 @@ def _search(
                 count += len(response['data'])
                 yield response
 
-                # Calculate the amount of time to sleep, accounting for any
-                # processing time used by the rest of the application.
-                # This is to satisfy the 1 request / 1 second rate limit
-                # on the search/all endpoint.
-
-                time.sleep(
-                    max(0, sleep_between - (time.monotonic() - made_call))
-                )
-                made_call = time.monotonic()
             else:
-                log.info(f'no more results for search')
+                log.info(f'Retrieved an empty page of results.')
+
+            # Calculate the amount of time to sleep, accounting for any
+            # processing time used by the rest of the application.
+            # This is to satisfy the 1 request / 1 second rate limit
+            # on the search/all endpoint.
+            time.sleep(
+                max(0, sleep_between - (time.monotonic() - made_call))
+            )
+            made_call = time.monotonic()
+
+        log.info(f'No more results for search {query}.')
 
     def search_recent(
             self, query, since_id=None, until_id=None, start_time=None,
@@ -497,7 +499,9 @@ def _timeline(
                 count += len(response['data'])
                 yield response
             else:
-                log.info(f'no more results for timeline')
+                log.info(f'Retrieved an empty page of results for timeline {user_id}')
+
+        log.info(f'No more results for timeline {user_id}.')
 
     def timeline(
         self, user, since_id=None, until_id=None, start_time=None,