add new batched endpoint TweetResultsByRestIds
trevorhobenshield committed Dec 7, 2023
1 parent f15d0e2 commit a4d8a83
Showing 9 changed files with 133 additions and 88 deletions.
20 changes: 17 additions & 3 deletions readme.md
@@ -215,8 +215,20 @@ account.change_password('old pwd','new pwd')

#### Get all user/tweet data

Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison; a rough throughput estimate follows it.

| Endpoint | Batch Size | Rate Limit |
|---------------|----------------|---------------|
| tweets_by_ids | ~220 | 500 / 15 mins |
| tweets_by_id | 1 | 50 / 15 mins |
| users_by_ids | ~220 | 100 / 15 mins |
| users_by_id | 1 | 500 / 15 mins |
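
A rough, illustrative throughput comparison implied by the figures above (the ~220 batch size is approximate, so treat these as order-of-magnitude numbers only):

```python
# illustrative arithmetic only, based on the approximate figures in the table above
batch_size, batched_requests = 220, 500   # tweets_by_ids: ~220 ids per request, 500 requests / 15 min
unbatched_requests = 50                   # tweets_by_id: 1 id per request, 50 requests / 15 min

print(batch_size * batched_requests)      # ~110,000 tweets per 15-minute window (batched)
print(unbatched_requests)                 # ~50 tweets per 15-minute window (unbatched)
```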


![](assets/scrape.gif)

*As of Fall 2023, login by username/password is unstable. Using cookies is now recommended.*

```python
from twitter.scraper import Scraper

@@ -236,7 +248,8 @@ scraper = Scraper(email, username, password)

# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
@@ -250,8 +263,9 @@ scraper.recommended_users()
scraper.recommended_users([123])

# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])

42 changes: 28 additions & 14 deletions setup.py
@@ -3,12 +3,13 @@
from pathlib import Path

install_requires = [
"aiofiles",
"websockets",
"nest_asyncio",
"httpx",
"tqdm",
"orjson",
'aiofiles',
'nest_asyncio',
'httpx',
'tqdm',
'orjson',
'm3u8',
'websockets',
'uvloop; platform_system != "Windows"',
]
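
Note that `uvloop` now carries a PEP 508 environment marker, so pip only installs it on non-Windows platforms; the runtime modules can then drop their explicit `platform.system()` checks and rely on a plain import guard. A minimal sketch of that pattern (not the library's exact code):

```python
# sketch of the optional-speedup pattern the environment marker enables
# (assumption: uvloop is simply absent on Windows installs)
try:
    import uvloop  # installed only where platform_system != "Windows"
    uvloop.install()
except ImportError:
    pass  # fall back to the default asyncio event loop
```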

@@ -239,7 +240,18 @@
### Scraping
#### Get all user/tweet data
Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison.

| Endpoint | Batch Size | Rate Limit |
|---------------|----------------|---------------|
| tweets_by_ids | ~220 | 500 / 15 mins |
| tweets_by_id | 1 | 50 / 15 mins |
| users_by_ids | ~220 | 100 / 15 mins |
| users_by_id | 1 | 500 / 15 mins |

*As of Fall 2023, login by username/password is unstable. Using cookies is now recommended.*

```python
from twitter.scraper import Scraper
@@ -259,32 +271,34 @@
# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
media = scraper.media([123, 234, 345])
following = scraper.following([123, 234, 345])
followers = scraper.followers([123, 234, 345])
scraper.tweet_stats([111111, 222222, 333333])
# get recommended users based on user
scraper.recommended_users()
scraper.recommended_users([123])
# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])
scraper.download_media([
111111,
222222,
333333,
444444,
])
# trends
scraper.trends()
```
2 changes: 1 addition & 1 deletion twitter/__version__.py
@@ -1,5 +1,5 @@
__title__ = "twitter-api-client"
__description__ = "Implementation of X/Twitter v1, v2, and GraphQL APIs."
__version__ = "0.10.12"
__version__ = "0.10.13"
__author__ = "Trevor Hobenshield"
__license__ = "MIT"
17 changes: 7 additions & 10 deletions twitter/account.py
@@ -3,7 +3,6 @@
import logging.config
import math
import mimetypes
import platform
from copy import deepcopy
from datetime import datetime
from string import ascii_letters
@@ -18,20 +17,18 @@
from .util import *

try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio

nest_asyncio.apply()
nest_asyncio.apply()
except:
...

if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop

uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...


class Account:
10 changes: 10 additions & 0 deletions twitter/constants.py
@@ -1,5 +1,10 @@
from dataclasses import dataclass

# todo: not an accurate measure; this value will decrease as new gql features/variables are required (the actual limitation is request size, i.e. new gql features and variables contribute to total request size)
MAX_GQL_CHAR_LIMIT = 4_200

MAX_ENDPOINT_LIMIT = 500 # 500/15 mins

MAX_IMAGE_SIZE = 5_242_880 # ~5 MB
MAX_GIF_SIZE = 15_728_640 # ~15 MB
MAX_VIDEO_SIZE = 536_870_912 # ~530 MB
@@ -105,6 +110,7 @@ class Operation:
UserMedia = {'userId': int}, 'YqiE3JL1KNgf9nSljYdxaA', 'UserMedia'
UserTweetsAndReplies = {'userId': int}, 'RIWc55YCNyUJ-U3HHGYkdg', 'UserTweetsAndReplies'
TweetResultByRestId = {'tweetId': int}, 'D_jNhjWZeRZT5NURzfJZSQ', 'TweetResultByRestId'
TweetResultsByRestIds = {'tweetIds': list[int | str]}, 'BWy5aoI-WvwbeSiHUIf2Hw', 'TweetResultsByRestIds'
TweetDetail = {'focalTweetId': int}, 'zXaXQgfyR4GxE21uwYQSyA', 'TweetDetail'
TweetStats = {'rest_id': int}, 'EvbTkPDT-xQCfupPu0rWMA', 'TweetStats'
Likes = {'userId': int}, 'nXEl0lfN_XSznVMlprThgQ', 'Likes'
@@ -360,6 +366,10 @@ class Operation:
'withMessages': True,
}
default_features = {
# new
'c9s_tweet_anatomy_moderator_badge_enabled': True,
'responsive_web_home_pinned_timelines_enabled': True,

'blue_business_profile_image_shape_enabled': True,
'creator_subscriptions_tweet_preview_api_enabled': True,
'freedom_of_speech_not_reach_fetch_enabled': True,
2 changes: 1 addition & 1 deletion twitter/login.py
@@ -2,7 +2,7 @@

from httpx import Client

from .constants import GREEN, YELLOW, RED, BOLD, RESET
from .constants import YELLOW, RED, BOLD, RESET
from .util import find_key


46 changes: 26 additions & 20 deletions twitter/scraper.py
@@ -1,9 +1,7 @@
import asyncio
import logging.config
import math
import platform

import aiofiles
import websockets
from httpx import AsyncClient, Limits, ReadTimeout, URL
from tqdm.asyncio import tqdm_asyncio
@@ -13,20 +11,18 @@
from .util import *

try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio

nest_asyncio.apply()
nest_asyncio.apply()
except:
...

if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop

uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...


class Scraper:
@@ -49,7 +45,7 @@ def users(self, screen_names: list[str], **kwargs) -> list[dict]:
"""
return self._run(Operation.UserByScreenName, screen_names, **kwargs)

def tweets_by_id(self, tweet_ids: list[int], **kwargs) -> list[dict]:
def tweets_by_id(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
@@ -59,6 +55,18 @@ def tweets_by_id(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
return self._run(Operation.TweetResultByRestId, tweet_ids, **kwargs)

def tweets_by_ids(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
Special batch query for tweet data. Most efficient way to get tweets.
@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.TweetResultsByRestIds, batch_ids(tweet_ids), **kwargs)
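
The id batching is delegated to a `batch_ids` helper (not shown in this diff); presumably it chunks the id list so each GraphQL request stays within the new `MAX_GQL_CHAR_LIMIT` budget added in `twitter/constants.py`. A hypothetical sketch of such a helper, for illustration only — the library's actual implementation may differ:

```python
# hypothetical sketch of an id-batching helper; the real batch_ids may differ
def batch_ids(ids: list[int | str], char_limit: int = 4_200) -> list[list[int | str]]:
    batches, current, size = [], [], 0
    for _id in ids:
        n = len(str(_id)) + 1                    # id length plus one separator character
        if current and size + n > char_limit:    # budget reached, start a new batch
            batches.append(current)
            current, size = [], 0
        current.append(_id)
        size += n
    if current:
        batches.append(current)
    return batches
```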

def tweets_details(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
Get tweet data by tweet ids.
@@ -230,8 +238,7 @@ def users_by_id(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
return self._run(Operation.UserByRestId, user_ids, **kwargs)

def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192,
stream: bool = False) -> None:
def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192, stream: bool = False) -> None:
"""
Download media from tweets by tweet ids.
@@ -515,12 +522,12 @@ async def process():

return asyncio.run(process())

def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | dict], **kwargs):
def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | list | dict], **kwargs):
keys, qid, name = operation
# stay within rate-limits
if (l := len(queries)) > 500:
if (l := len(queries)) > MAX_ENDPOINT_LIMIT:
self.logger.warning(f'Got {l} queries, truncating to first 500.')
queries = list(queries)[:500]
queries = list(queries)[:MAX_ENDPOINT_LIMIT]

if all(isinstance(q, dict) for q in queries):
data = asyncio.run(self._process(operation, list(queries), **kwargs))
@@ -542,14 +549,13 @@ async def _query(self, client: AsyncClient, operation: tuple, **kwargs) -> Respo
if self.debug:
log(self.logger, self.debug, r)
if self.save:
save_json(r, self.out, name, **kwargs)
await save_json(r, self.out, name, **kwargs)
return r

async def _process(self, operation: tuple, queries: list[dict], **kwargs):
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(limits=Limits(max_connections=MAX_ENDPOINT_LIMIT), headers=headers, cookies=cookies, timeout=20) as c:
tasks = (self._paginate(c, operation, **q, **kwargs) for q in queries)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc=operation[-1])
16 changes: 7 additions & 9 deletions twitter/search.py
@@ -19,20 +19,18 @@
colors = [f'\x1b[{i}m' for i in range(31, 37)]

try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio

nest_asyncio.apply()
nest_asyncio.apply()
except:
...

if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop

uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...


class Search: