Skip to content

Commit

Permalink
Merge pull request #32 from markowanga/feature/add_media_url
Browse files Browse the repository at this point in the history
Add media_url
  • Loading branch information
markowanga authored Feb 6, 2021
2 parents 333cc14 + f61d00d commit 413cc5b
Show file tree
Hide file tree
Showing 12 changed files with 67 additions and 57 deletions.
9 changes: 3 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,7 @@ With class SearchRunner library can scrap tweets specified in SearchTweetsTask.
|Property|Type|Default value|Description|
|---|---|---|---|
|search_run_context|st.SearchRunContext|None, in \_\_init\_\_() assign SearchRunContext()|Search context, contains all important properties to make the next request to Twitter|
|search_tweets_task|st.SearchTweetsTask|**Obligatory
property**|Property specifies which tweets should be downloaded by the runner|
|search_tweets_task|st.SearchTweetsTask|**Obligatory property**|Property specifies which tweets should be downloaded by the runner|
|tweet_outputs|List[st.TweetOutput]|**Obligatory property**|List of objects to export downloaded tweets|
|web_client|st.WebClient|stweet.http_request.WebClientRequests()|Implementation of a WebClient, can be replaced for custom implementation|
|tweet_parser|st.TweetParser|stweet.parse.TwintBasedTweetParser()|Parser of tweets from web API response|
Expand All @@ -164,8 +163,7 @@ properties:

|Property|Type|Default value|Description|
|---|---|---|---|
|tweets_by_ids_task|st.TweetsByIdsTask|**Obligatory
property**|Property specifies which tweets should be downloaded by the runner|
|tweets_by_ids_task|st.TweetsByIdsTask|**Obligatory property**|Property specifies which tweets should be downloaded by the runner|
|tweet_outputs|List[st.TweetOutput]|**Obligatory property**|List of objects to export downloaded tweets|
|search_run_context|st.SearchRunContext|None, in \_\_init\_\_() assign SearchRunContext()|Search context, contains all important properties to make the next request to Twitter|
|web_client|st.WebClient|stweet.http_request.WebClientRequests()|Implementation of a WebClient, can be replaced for custom implementation|
Expand All @@ -186,8 +184,7 @@ With class GetUsersRunner library can scrap users specified in GetUsersTask. The

|Property|Type|Default value|Description|
|---|---|---|---|
|get_user_task|st.GetUsersTask|**Obligatory
property**|Property specifies which users should be downloaded by the runner|
|get_user_task|st.GetUsersTask|**Obligatory property**|Property specifies which users should be downloaded by the runner|
|user_outputs|List[st.UserOutput]|**Obligatory property**|List of objects to export downloaded users|
|get_user_context|st.GetUsersContext|None, in \_\_init\_\_() assign GetUsersContext()|Search context, contains all important properties to make the next request to Twitter|
|web_client|st.WebClient|stweet.http_request.WebClientRequests()|Implementation of a WebClient, can be replaced for custom implementation|
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setuptools.setup(
name="stweet",
version="1.1.2",
version="1.2.0",
author="Marcin Wątroba",
author_email="[email protected]",
description="Package to scrap tweets",
Expand Down
5 changes: 4 additions & 1 deletion stweet/import_data/tweets_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ def read_tweets_from_csv_file(file_path: str) -> List[Tweet]:
df = pd.read_csv(file_path, dtype={
'quoted_status_id_str': str,
'in_reply_to_status_id_str': str,
'in_reply_to_user_id_str': str
'in_reply_to_user_id_str': str,
'media_url': str
})
if 'media_url' not in df.columns:
df['media_url'] = ''
df.quoted_status_id_str.fillna('', inplace=True)
df.quoted_status_short_url.fillna('', inplace=True)
df.quoted_status_expand_url.fillna('', inplace=True)
Expand Down
1 change: 1 addition & 0 deletions stweet/mapper/tweet_dict_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def create_tweet_from_dict(dictionary: Dict[str, any]):
dictionary['user_verified'],
str(dictionary['in_reply_to_status_id_str']),
str(dictionary['in_reply_to_user_id_str']),
dictionary['media_url'],
dictionary['hashtags'],
dictionary['mentions'],
dictionary['urls']
Expand Down
2 changes: 2 additions & 0 deletions stweet/mapper/tweet_json_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ def tweet_to_json(tweet: Tweet) -> str:
def create_tweet_from_json(json_value: str) -> Tweet:
    """Build a Tweet from its JSON string representation.

    The string is decoded with ``json.loads``. When the decoded mapping has
    no ``media_url`` key, an empty string is stored under that key before the
    mapping is passed on to ``create_tweet_from_dict``.
    """
    parsed = json.loads(json_value)
    has_media_url = 'media_url' in parsed
    if not has_media_url:
        # Supply the default so the dict mapper always finds the key.
        parsed['media_url'] = ''
    return create_tweet_from_dict(parsed)
50 changes: 1 addition & 49 deletions stweet/model/tweet.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,55 +30,7 @@ class Tweet:
user_verified: bool
in_reply_to_status_id_str: str
in_reply_to_user_id_str: str
media_url: str
hashtags: List[str]
mentions: List[str]
urls: List[str]

# def to_json_string(self) -> str:
# """Method to prepare json of tweet. Used in JSON serialization."""
# return json.dumps(self, default=lambda o: str(o) if isinstance(o, Arrow) else o.__dict__)
#
# def to_flat_dict(self):
# """Method to prepare flat dict of tweet. Used in CSV serialization."""
# dictionary = dict(self.__dict__)
# dictionary['hashtags'] = _simple_string_list_to_string(dictionary['hashtags'])
# dictionary['mentions'] = _simple_string_list_to_string(dictionary['mentions'])
# dictionary['urls'] = _simple_string_list_to_string(dictionary['urls'])
# return dictionary
#
# @staticmethod
# def create_tweet_from_dict(dictionary: Dict[str, any]):
# """Method to create Tweet from dictionary."""
# return Tweet(
# arrow.get(dictionary['created_at']),
# str(dictionary['id_str']),
# str(dictionary['conversation_id_str']),
# dictionary['full_text'],
# dictionary['lang'],
# dictionary['favorited'],
# dictionary['retweeted'],
# dictionary['retweet_count'],
# dictionary['favorite_count'],
# dictionary['reply_count'],
# dictionary['quote_count'],
# dictionary['quoted_status_id_str'],
# dictionary['quoted_status_short_url'],
# dictionary['quoted_status_expand_url'],
# str(dictionary['user_id_str']),
# dictionary['user_name'],
# dictionary['user_full_name'],
# dictionary['user_verified'],
# str(dictionary['in_reply_to_status_id_str']),
# str(dictionary['in_reply_to_user_id_str']),
# dictionary['hashtags'],
# dictionary['mentions'],
# dictionary['urls']
# )
#
# @staticmethod
# def create_tweet_from_flat_dict(dictionary: Dict[str, any]):
# """Method to create Tweet from flat dictionary."""
# dictionary['hashtags'] = _string_to_simple_string_list(dictionary['hashtags'])
# dictionary['mentions'] = _string_to_simple_string_list(dictionary['mentions'])
# dictionary['urls'] = _string_to_simple_string_list(dictionary['urls'])
# return Tweet.create_tweet_from_dict(dictionary)
2 changes: 2 additions & 0 deletions stweet/search_runner/parse/base_tweet_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def parse_tweets(self, response_text: str) -> List[Tweet]:

@staticmethod
def _tweet_dict_to_tweet_object(tweet) -> Tweet:
print(json.dumps(tweet))
return Tweet(
created_at=Arrow.fromdatetime(parser.parse(tweet['created_at'])),
id_str=tweet['id_str'],
Expand All @@ -107,6 +108,7 @@ def _tweet_dict_to_tweet_object(tweet) -> Tweet:
in_reply_to_status_id_str=_default_string_value(tweet['in_reply_to_status_id_str'], ''),
in_reply_to_user_id_str=_default_string_value(
tweet['in_reply_to_user_id_str'], ''),
media_url=tweet['media_url'] if 'media_url' in tweet else '',
hashtags=['#' + it['text'] for it in tweet['entities']['hashtags']],
mentions=[it['screen_name'] for it in tweet['entities']['user_mentions']],
urls=[it['url'] for it in tweet['entities']['urls']]
Expand Down
1 change: 1 addition & 0 deletions stweet/search_runner/search_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def _is_end_of_scrapping(self) -> bool:
def _execute_next_tweets_request(self):
request_params = self._get_next_request_details()
response = self.web_client.run_request(request_params)
# print(response.text)
if response.is_token_expired():
self._refresh_token()
elif response.is_success():
Expand Down
Empty file added tests/__init__.py
Empty file.
16 changes: 16 additions & 0 deletions tests/integration/import_older_version_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import stweet as st
import os

_RESOURCES_PATH = 'tests/resources'


def test_tweets_csv_import_v1_1_2():
    """Importing the ``tweets_v1.1.2.csv`` fixture must yield nine tweets."""
    fixture_path = f'{_RESOURCES_PATH}/tweets_v1.1.2.csv'
    imported_tweets = st.read_tweets_from_csv_file(fixture_path)
    tweet_count = len(imported_tweets)
    assert tweet_count == 9


def test_tweets_json_import_v1_1_2():
    """Importing the ``tweets_v1.1.2.json`` fixture must yield nine tweets."""
    fixture_path = f'{_RESOURCES_PATH}/tweets_v1.1.2.json'
    # The fixture is read through the JSON-lines importer.
    imported_tweets = st.read_tweets_from_json_lines_file(fixture_path)
    tweet_count = len(imported_tweets)
    assert tweet_count == 9
27 changes: 27 additions & 0 deletions tests/resources/tweets_v1.1.2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
created_at,id_str,conversation_id_str,full_text,lang,favorited,retweeted,retweet_count,favorite_count,reply_count,quote_count,quoted_status_id_str,quoted_status_short_url,quoted_status_expand_url,user_id_str,user_name,user_full_name,user_verified,in_reply_to_status_id_str,in_reply_to_user_id_str,hashtags,mentions,urls
2021-02-05T00:54:28+00:00,1357492753215729666,1357492753215729666,"It no longer matters. The decision to accept casualties rather than accede to simple acts of social generosity has been made. Hundreds of thousands dead due to bumbling, both willful and unconscious, has been shrugged off by everyone other than, one presumes, the dead. #COVID19",en,False,False,0,0,0,0,1357490037773004802,,,3231804466,robert__gibbons,Robert Gibbons,False,,,#COVID19,,
2021-02-05T00:54:27+00:00,1357492749516292098,1357492749516292098,"Here in our hearts
Oh the answer is there
If we only would look there inside them
We can make it better, we can make it better #WhatIf #WorldCancerDay #COVID19",en,False,False,0,0,0,0,1357489296844992512,,,1007294353818226694,bigdan071288,Daniel,False,,,"#WhatIf , #WorldCancerDay , #COVID19",,
2021-02-05T00:54:26+00:00,1357492745514942466,1357492745514942466,"Hoy más que nunca, ciudadanos, ciudadanas y autoridades de la @AlcaldiaMHmx nos unimos con un solo propósito; reducir el numero de contagios por #COVID19. No es tiempo de fiestas ni reuniones, continuamos en #SemáforoRojo 🚦 por contingencia sanitaria.

#MHelCorazónDeLaCapital https://t.co/VRRcVdXjsx",es,False,False,0,0,0,0,,,,1065021849430618112,MHSUrbanos,Servicios Urbanos,False,,,"#COVID19 , #SemáforoRojo , #MHelCorazónDeLaCapital",AlcaldiaMHmx,
2021-02-05T00:54:25+00:00,1357492742943764480,1357492742943764480,発見遅れたCOCOA不具合、厚生労働省「実機テストせず」:#朝日新聞デジタル https://t.co/ng0nDNqw2M #新型コロナウイルス #COVID19,ja,False,False,0,0,0,0,,,,113366981,otaka_thursday,おたか 🍥,False,,,"#朝日新聞デジタル , #新型コロナウイルス , #COVID19",,https://t.co/ng0nDNqw2M
2021-02-05T00:54:14+00:00,1357492697901264897,1357492697901264897,"that moment when your patient decides to (stupidly) go to Turks and Caicos and returns with COVID (omg) and uses up precious resources, time, PPE, & ED personnel to take care of her while exposing a slew of other patients and staff in the ER. #COVID19 #canyounot #COVIDIOT https://t.co/Icy9EydgLg",en,False,False,0,0,0,0,,,,2462950457,paper_canyon,paper canyon,False,,,"#COVID19 , #canyounot , #COVIDIOT",,
2021-02-05T00:54:09+00:00,1357492674731913216,1357492674731913216,“hasta un 40% de las personas que fueron internadas por complicaciones asociadas al Covid-19 tuvieron secuelas en la función de sus pulmones”. Hay que mantener los cuidados y vacunarse cuando nos toque #COVID19,es,False,False,0,0,0,0,1357358481939050496,,,139287395,jota_leonr,José Julio León,False,,,#COVID19,,
2021-02-05T00:54:08+00:00,1357492671217086464,1357492671217086464,"Feeling sorry for corporations during the #covid19 #pandemic?

YOU MIGHT WANT TO EDUCATE YOURSELF on how American companies are treating front line employees during a pandemic? #MustRead",en,False,False,0,0,0,0,1357483359677550592,,,2316413918,GregCurtin,-v|v- 🍁 🇺🇸,False,,,"#covid19 , #pandemic , #MustRead",,
2021-02-05T00:54:07+00:00,1357492667064623104,1357492667064623104,"Yesterday hot topics:
#lka (16.88%)
#Srilanka (13.71%)
#IndependenceDaySL (5.68%)
#IndependenceDay (3.84%)
#adaderanasinhala (2.75%)
#slnews (2.75%)
#Covid19 (2.09%)
#IndependenceSL (1.58%)
#73rdIndependenceDay (1.17%)
#COVID19SL (1.08%)",en,False,False,0,0,0,0,,,,1343032398238208002,yuganOffcial,Yugan Narmathan 🇱🇰,False,,,"#lka , #Srilanka , #IndependenceDaySL , #IndependenceDay , #adaderanasinhala , #slnews , #Covid19 , #IndependenceSL , #73rdIndependenceDay , #COVID19SL",,
2021-02-05T00:54:06+00:00,1357492664363532288,1357492664363532288,#COVID19 #Impfung #Impfpflicht https://t.co/h2175ku7Zp https://t.co/UCJMOTj2Rj,und,False,False,0,0,0,0,,,,394643993,Caputschi,Pit Caputschi,False,,,"#COVID19 , #Impfung , #Impfpflicht",,https://t.co/h2175ku7Zp
Loading

0 comments on commit 413cc5b

Please sign in to comment.