add new batched endpoint TweetResultsByRestIds
trevorhobenshield committed Dec 7, 2023
1 parent f15d0e2 commit a4d8a83
Showing 9 changed files with 133 additions and 88 deletions.
20 changes: 17 additions & 3 deletions readme.md
@@ -215,8 +215,20 @@ account.change_password('old pwd','new pwd')

#### Get all user/tweet data

Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison; a rough throughput estimate follows it.

| Endpoint | Batch Size | Rate Limit |
|---------------|----------------|---------------|
| tweets_by_ids | ~220 | 500 / 15 mins |
| tweets_by_id | 1 | 50 / 15 mins |
| users_by_ids | ~220 | 100 / 15 mins |
| users_by_id | 1 | 500 / 15 mins |
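
A rough, illustrative throughput comparison implied by the figures above (the ~220 batch size is approximate, so treat these as order-of-magnitude numbers only):

```python
# illustrative arithmetic only, based on the approximate figures in the table above
batch_size, batched_requests = 220, 500   # tweets_by_ids: ~220 ids per request, 500 requests / 15 min
unbatched_requests = 50                   # tweets_by_id: 1 id per request, 50 requests / 15 min

print(batch_size * batched_requests)      # ~110,000 tweets per 15-minute window (batched)
print(unbatched_requests)                 # ~50 tweets per 15-minute window (unbatched)
```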


![](assets/scrape.gif)

*As of Fall 2023, login by username/password is unstable. Using cookies is now recommended.*

```python
from twitter.scraper import Scraper

@@ -236,7 +248,8 @@ scraper = Scraper(email, username, password)

# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
@@ -250,8 +263,9 @@ scraper.recommended_users()
scraper.recommended_users([123])

# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])

42 changes: 28 additions & 14 deletions setup.py
@@ -3,12 +3,13 @@
from pathlib import Path

install_requires = [
"aiofiles",
"websockets",
"nest_asyncio",
"httpx",
"tqdm",
"orjson",
'aiofiles',
'nest_asyncio',
'httpx',
'tqdm',
'orjson',
'm3u8',
'websockets',
'uvloop; platform_system != "Windows"',
]
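
Note that `uvloop` now carries a PEP 508 environment marker, so pip only installs it on non-Windows platforms; the runtime modules can then drop their explicit `platform.system()` checks and rely on a plain import guard. A minimal sketch of that pattern (not the library's exact code):

```python
# sketch of the optional-speedup pattern the environment marker enables
# (assumption: uvloop is simply absent on Windows installs)
try:
    import uvloop  # installed only where platform_system != "Windows"
    uvloop.install()
except ImportError:
    pass  # fall back to the default asyncio event loop
```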

@@ -239,7 +240,18 @@
### Scraping
#### Get all user/tweet data
Two special batch queries `scraper.tweets_by_ids` and `scraper.users_by_ids` should be preferred when applicable. These endpoints are much more efficient and have higher rate limits than their unbatched counterparts. See the table below for a comparison.

| Endpoint | Batch Size | Rate Limit |
|---------------|----------------|---------------|
| tweets_by_ids | ~220 | 500 / 15 mins |
| tweets_by_id | 1 | 50 / 15 mins |
| users_by_ids | ~220 | 100 / 15 mins |
| users_by_id | 1 | 500 / 15 mins |

*As of Fall 2023, login by username/password is unstable. Using cookies is now recommended.*

```python
from twitter.scraper import Scraper
@@ -259,32 +271,34 @@
# user data
users = scraper.users(['foo', 'bar', 'hello', 'world'])
users = scraper.users_by_ids([123, 234, 345]) # batch-request
users = scraper.users_by_ids([123, 234, 345]) # preferred
users = scraper.users_by_id([123, 234, 345])
tweets = scraper.tweets([123, 234, 345])
likes = scraper.likes([123, 234, 345])
tweets_and_replies = scraper.tweets_and_replies([123, 234, 345])
media = scraper.media([123, 234, 345])
following = scraper.following([123, 234, 345])
followers = scraper.followers([123, 234, 345])
scraper.tweet_stats([111111, 222222, 333333])
# get recommended users based on user
scraper.recommended_users()
scraper.recommended_users([123])
# tweet data
tweets_by_ids = scraper.tweets_by_id([987, 876, 754])
tweets_details = scraper.tweets_details([987, 876, 754])
tweets = scraper.tweets_by_ids([987, 876, 754]) # preferred
tweets = scraper.tweets_by_id([987, 876, 754])
tweet_details = scraper.tweets_details([987, 876, 754])
retweeters = scraper.retweeters([987, 876, 754])
favoriters = scraper.favoriters([987, 876, 754])
scraper.download_media([
111111,
222222,
333333,
444444,
])
# trends
scraper.trends()
```
2 changes: 1 addition & 1 deletion twitter/__version__.py
@@ -1,5 +1,5 @@
__title__ = "twitter-api-client"
__description__ = "Implementation of X/Twitter v1, v2, and GraphQL APIs."
__version__ = "0.10.12"
__version__ = "0.10.13"
__author__ = "Trevor Hobenshield"
__license__ = "MIT"
17 changes: 7 additions & 10 deletions twitter/account.py
@@ -3,7 +3,6 @@
import logging.config
import math
import mimetypes
import platform
from copy import deepcopy
from datetime import datetime
from string import ascii_letters
@@ -18,20 +17,18 @@
from .util import *

try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio

nest_asyncio.apply()
nest_asyncio.apply()
except:
...

if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop

uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...


class Account:
10 changes: 10 additions & 0 deletions twitter/constants.py
@@ -1,5 +1,10 @@
from dataclasses import dataclass

# todo: not an accurate measure; this value will decrease as new gql features/variables are required (the actual limitation is request size, i.e. new gql features and variables contribute to total request size)
MAX_GQL_CHAR_LIMIT = 4_200

MAX_ENDPOINT_LIMIT = 500 # 500/15 mins

MAX_IMAGE_SIZE = 5_242_880 # ~5 MB
MAX_GIF_SIZE = 15_728_640 # ~15 MB
MAX_VIDEO_SIZE = 536_870_912 # ~530 MB
@@ -105,6 +110,7 @@ class Operation:
UserMedia = {'userId': int}, 'YqiE3JL1KNgf9nSljYdxaA', 'UserMedia'
UserTweetsAndReplies = {'userId': int}, 'RIWc55YCNyUJ-U3HHGYkdg', 'UserTweetsAndReplies'
TweetResultByRestId = {'tweetId': int}, 'D_jNhjWZeRZT5NURzfJZSQ', 'TweetResultByRestId'
TweetResultsByRestIds = {'tweetIds': list[int | str]}, 'BWy5aoI-WvwbeSiHUIf2Hw', 'TweetResultsByRestIds'
TweetDetail = {'focalTweetId': int}, 'zXaXQgfyR4GxE21uwYQSyA', 'TweetDetail'
TweetStats = {'rest_id': int}, 'EvbTkPDT-xQCfupPu0rWMA', 'TweetStats'
Likes = {'userId': int}, 'nXEl0lfN_XSznVMlprThgQ', 'Likes'
@@ -360,6 +366,10 @@ class Operation:
'withMessages': True,
}
default_features = {
# new
'c9s_tweet_anatomy_moderator_badge_enabled': True,
'responsive_web_home_pinned_timelines_enabled': True,

'blue_business_profile_image_shape_enabled': True,
'creator_subscriptions_tweet_preview_api_enabled': True,
'freedom_of_speech_not_reach_fetch_enabled': True,
2 changes: 1 addition & 1 deletion twitter/login.py
@@ -2,7 +2,7 @@

from httpx import Client

from .constants import GREEN, YELLOW, RED, BOLD, RESET
from .constants import YELLOW, RED, BOLD, RESET
from .util import find_key


46 changes: 26 additions & 20 deletions twitter/scraper.py
@@ -1,9 +1,7 @@
import asyncio
import logging.config
import math
import platform

import aiofiles
import websockets
from httpx import AsyncClient, Limits, ReadTimeout, URL
from tqdm.asyncio import tqdm_asyncio
@@ -13,20 +11,18 @@
from .util import *

try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio

nest_asyncio.apply()
nest_asyncio.apply()
except:
...

if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop

uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...


class Scraper:
@@ -49,7 +45,7 @@ def users(self, screen_names: list[str], **kwargs) -> list[dict]:
"""
return self._run(Operation.UserByScreenName, screen_names, **kwargs)

def tweets_by_id(self, tweet_ids: list[int], **kwargs) -> list[dict]:
def tweets_by_id(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
@@ -59,6 +55,18 @@ def tweets_by_id(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
return self._run(Operation.TweetResultByRestId, tweet_ids, **kwargs)

def tweets_by_ids(self, tweet_ids: list[int | str], **kwargs) -> list[dict]:
"""
Get tweet metadata by tweet ids.
Special batch query for tweet data. Most efficient way to get tweets.
@param tweet_ids: list of tweet ids
@param kwargs: optional keyword arguments
@return: list of tweet data as dicts
"""
return self._run(Operation.TweetResultsByRestIds, batch_ids(tweet_ids), **kwargs)
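
The id batching is delegated to a `batch_ids` helper (not shown in this diff); presumably it chunks the id list so each GraphQL request stays within the new `MAX_GQL_CHAR_LIMIT` budget added in `twitter/constants.py`. A hypothetical sketch of such a helper, for illustration only — the library's actual implementation may differ:

```python
# hypothetical sketch of an id-batching helper; the real batch_ids may differ
def batch_ids(ids: list[int | str], char_limit: int = 4_200) -> list[list[int | str]]:
    batches, current, size = [], [], 0
    for _id in ids:
        n = len(str(_id)) + 1                    # id length plus one separator character
        if current and size + n > char_limit:    # budget reached, start a new batch
            batches.append(current)
            current, size = [], 0
        current.append(_id)
        size += n
    if current:
        batches.append(current)
    return batches
```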

def tweets_details(self, tweet_ids: list[int], **kwargs) -> list[dict]:
"""
Get tweet data by tweet ids.
@@ -230,8 +238,7 @@ def users_by_id(self, user_ids: list[int], **kwargs) -> list[dict]:
"""
return self._run(Operation.UserByRestId, user_ids, **kwargs)

def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192,
stream: bool = False) -> None:
def download_media(self, ids: list[int], photos: bool = True, videos: bool = True, chunk_size: int = 8192, stream: bool = False) -> None:
"""
Download media from tweets by tweet ids.
@@ -515,12 +522,12 @@ async def process():

return asyncio.run(process())

def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | dict], **kwargs):
def _run(self, operation: tuple[dict, str, str], queries: set | list[int | str | list | dict], **kwargs):
keys, qid, name = operation
# stay within rate-limits
if (l := len(queries)) > 500:
if (l := len(queries)) > MAX_ENDPOINT_LIMIT:
self.logger.warning(f'Got {l} queries, truncating to first 500.')
queries = list(queries)[:500]
queries = list(queries)[:MAX_ENDPOINT_LIMIT]

if all(isinstance(q, dict) for q in queries):
data = asyncio.run(self._process(operation, list(queries), **kwargs))
@@ -542,14 +549,13 @@ async def _query(self, client: AsyncClient, operation: tuple, **kwargs) -> Respo
if self.debug:
log(self.logger, self.debug, r)
if self.save:
save_json(r, self.out, name, **kwargs)
await save_json(r, self.out, name, **kwargs)
return r

async def _process(self, operation: tuple, queries: list[dict], **kwargs):
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(limits=Limits(max_connections=MAX_ENDPOINT_LIMIT), headers=headers, cookies=cookies, timeout=20) as c:
tasks = (self._paginate(c, operation, **q, **kwargs) for q in queries)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc=operation[-1])
16 changes: 7 additions & 9 deletions twitter/search.py
@@ -19,20 +19,18 @@
colors = [f'\x1b[{i}m' for i in range(31, 37)]

try:
if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
import nest_asyncio
import nest_asyncio

nest_asyncio.apply()
nest_asyncio.apply()
except:
...

if platform.system() != 'Windows':
try:
import uvloop
try:
import uvloop

uvloop.install()
except ImportError as e:
...
uvloop.install()
except:
...


class Search: