Merge pull request #32 from markowanga/feature/add_media_url

Add media_url
markowanga · Feb 6, 2021 · 413cc5b
2 parents 333cc14 + f61d00d
commit 413cc5b
Show file tree

Hide file tree

Showing 12 changed files with 67 additions and 57 deletions.
diff --git a/README.md b/README.md
@@ -142,8 +142,7 @@ With class SearchRunner library can scrap tweets specified in SearchTweetsTask.
 |Property|Type|Default value|Description|
 |---|---|---|---|
 |search_run_context|st.SearchRunContext|None, in \_\_init\_\_() assign SearchRunContext()|Search context, contains all important properties to make the next request to Twitter|
-|search_tweets_task|st.SearchTweetsTask|**Obligatory
-property**|Property specifies which tweets should be downloaded by the runner|
+|search_tweets_task|st.SearchTweetsTask|**Obligatory property**|Property specifies which tweets should be downloaded by the runner|
 |tweet_outputs|List[st.TweetOutput]|**Obligatory property**|List of objects to export downloaded tweets|
 |web_client|st.WebClient|stweet.http_request.WebClientRequests()|Implementation of a WebClient, can be replaced for custom implementation|
 |tweet_parser|st.TweetParser|stweet.parse.TwintBasedTweetParser()|Parser of tweets from web API response|
@@ -164,8 +163,7 @@ properties:
 
 |Property|Type|Default value|Description|
 |---|---|---|---|
-|tweets_by_ids_task|st.TweetsByIdsTask|**Obligatory
-property**|Property specifies which tweets should be downloaded by the runner|
+|tweets_by_ids_task|st.TweetsByIdsTask|**Obligatory property**|Property specifies which tweets should be downloaded by the runner|
 |tweet_outputs|List[st.TweetOutput]|**Obligatory property**|List of objects to export downloaded tweets|
 |search_run_context|st.SearchRunContext|None, in \_\_init\_\_() assign SearchRunContext()|Search context, contains all important properties to make the next request to Twitter|
 |web_client|st.WebClient|stweet.http_request.WebClientRequests()|Implementation of a WebClient, can be replaced for custom implementation|
@@ -186,8 +184,7 @@ With class GetUsersRunner library can scrap users specified in GetUsersTask. The
 
 |Property|Type|Default value|Description|
 |---|---|---|---|
-|get_user_task|st.GetUsersTask|**Obligatory
-property**|Property specifies which users should be downloaded by the runner|
+|get_user_task|st.GetUsersTask|**Obligatory property**|Property specifies which users should be downloaded by the runner|
 |user_outputs|List[st.UserOutput]|**Obligatory property**|List of objects to export downloaded users|
 |get_user_context|st.GetUsersContext|None, in \_\_init\_\_() assign GetUsersContext()|Search context, contains all important properties to make the next request to Twitter|
 |web_client|st.WebClient|stweet.http_request.WebClientRequests()|Implementation of a WebClient, can be replaced for custom implementation|

diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
 
 setuptools.setup(
     name="stweet",
-    version="1.1.2",
+    version="1.2.0",
     author="Marcin Wątroba",
     author_email="[email protected]",
     description="Package to scrap tweets",

diff --git a/stweet/import_data/tweets_import.py b/stweet/import_data/tweets_import.py
@@ -14,8 +14,11 @@ def read_tweets_from_csv_file(file_path: str) -> List[Tweet]:
     df = pd.read_csv(file_path, dtype={
         'quoted_status_id_str': str,
         'in_reply_to_status_id_str': str,
-        'in_reply_to_user_id_str': str
+        'in_reply_to_user_id_str': str,
+        'media_url': str
     })
+    if 'media_url' not in df.columns:
+        df['media_url'] = ''
     df.quoted_status_id_str.fillna('', inplace=True)
     df.quoted_status_short_url.fillna('', inplace=True)
     df.quoted_status_expand_url.fillna('', inplace=True)

diff --git a/stweet/mapper/tweet_dict_mapper.py b/stweet/mapper/tweet_dict_mapper.py
@@ -40,6 +40,7 @@ def create_tweet_from_dict(dictionary: Dict[str, any]):
         dictionary['user_verified'],
         str(dictionary['in_reply_to_status_id_str']),
         str(dictionary['in_reply_to_user_id_str']),
+        dictionary['media_url'],
         dictionary['hashtags'],
         dictionary['mentions'],
         dictionary['urls']

diff --git a/stweet/mapper/tweet_json_mapper.py b/stweet/mapper/tweet_json_mapper.py
@@ -15,4 +15,6 @@ def tweet_to_json(tweet: Tweet) -> str:
 def create_tweet_from_json(json_value: str) -> Tweet:
     """Method creates tweet from json string."""
     tweet_dict = json.loads(json_value)
+    if 'media_url' not in tweet_dict:
+        tweet_dict['media_url'] = ''
     return create_tweet_from_dict(tweet_dict)
diff --git a/stweet/model/tweet.py b/stweet/model/tweet.py
@@ -30,55 +30,7 @@ class Tweet:
     user_verified: bool
     in_reply_to_status_id_str: str
     in_reply_to_user_id_str: str
+    media_url: str
     hashtags: List[str]
     mentions: List[str]
     urls: List[str]
-
-    # def to_json_string(self) -> str:
-    #     """Method to prepare json of tweet. Used in JSON serialization."""
-    #     return json.dumps(self, default=lambda o: str(o) if isinstance(o, Arrow) else o.__dict__)
-    #
-    # def to_flat_dict(self):
-    #     """Method to prepare flat dict of tweet. Used in CSV serialization."""
-    #     dictionary = dict(self.__dict__)
-    #     dictionary['hashtags'] = _simple_string_list_to_string(dictionary['hashtags'])
-    #     dictionary['mentions'] = _simple_string_list_to_string(dictionary['mentions'])
-    #     dictionary['urls'] = _simple_string_list_to_string(dictionary['urls'])
-    #     return dictionary
-    #
-    # @staticmethod
-    # def create_tweet_from_dict(dictionary: Dict[str, any]):
-    #     """Method to create Tweet from dictionary."""
-    #     return Tweet(
-    #         arrow.get(dictionary['created_at']),
-    #         str(dictionary['id_str']),
-    #         str(dictionary['conversation_id_str']),
-    #         dictionary['full_text'],
-    #         dictionary['lang'],
-    #         dictionary['favorited'],
-    #         dictionary['retweeted'],
-    #         dictionary['retweet_count'],
-    #         dictionary['favorite_count'],
-    #         dictionary['reply_count'],
-    #         dictionary['quote_count'],
-    #         dictionary['quoted_status_id_str'],
-    #         dictionary['quoted_status_short_url'],
-    #         dictionary['quoted_status_expand_url'],
-    #         str(dictionary['user_id_str']),
-    #         dictionary['user_name'],
-    #         dictionary['user_full_name'],
-    #         dictionary['user_verified'],
-    #         str(dictionary['in_reply_to_status_id_str']),
-    #         str(dictionary['in_reply_to_user_id_str']),
-    #         dictionary['hashtags'],
-    #         dictionary['mentions'],
-    #         dictionary['urls']
-    #     )
-    #
-    # @staticmethod
-    # def create_tweet_from_flat_dict(dictionary: Dict[str, any]):
-    #     """Method to create Tweet from flat dictionary."""
-    #     dictionary['hashtags'] = _string_to_simple_string_list(dictionary['hashtags'])
-    #     dictionary['mentions'] = _string_to_simple_string_list(dictionary['mentions'])
-    #     dictionary['urls'] = _string_to_simple_string_list(dictionary['urls'])
-    #     return Tweet.create_tweet_from_dict(dictionary)
diff --git a/stweet/search_runner/parse/base_tweet_parser.py b/stweet/search_runner/parse/base_tweet_parser.py
@@ -81,6 +81,7 @@ def parse_tweets(self, response_text: str) -> List[Tweet]:
 
     @staticmethod
     def _tweet_dict_to_tweet_object(tweet) -> Tweet:
+        print(json.dumps(tweet))
         return Tweet(
             created_at=Arrow.fromdatetime(parser.parse(tweet['created_at'])),
             id_str=tweet['id_str'],
@@ -107,6 +108,7 @@ def _tweet_dict_to_tweet_object(tweet) -> Tweet:
             in_reply_to_status_id_str=_default_string_value(tweet['in_reply_to_status_id_str'], ''),
             in_reply_to_user_id_str=_default_string_value(
                 tweet['in_reply_to_user_id_str'], ''),
+            media_url=tweet['media_url'] if 'media_url' in tweet else '',
             hashtags=['#' + it['text'] for it in tweet['entities']['hashtags']],
             mentions=[it['screen_name'] for it in tweet['entities']['user_mentions']],
             urls=[it['url'] for it in tweet['entities']['urls']]

diff --git a/stweet/search_runner/search_runner.py b/stweet/search_runner/search_runner.py
@@ -62,6 +62,7 @@ def _is_end_of_scrapping(self) -> bool:
     def _execute_next_tweets_request(self):
         request_params = self._get_next_request_details()
         response = self.web_client.run_request(request_params)
+        # print(response.text)
         if response.is_token_expired():
             self._refresh_token()
         elif response.is_success():

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/integration/import_older_version_test.py b/tests/integration/import_older_version_test.py
@@ -0,0 +1,16 @@
+import stweet as st
+import os
+
+_RESOURCES_PATH = 'tests/resources'
+
+
+def test_tweets_csv_import_v1_1_2():
+    csv_filename = f'{_RESOURCES_PATH}/tweets_v1.1.2.csv'
+    tweets_from_csv = st.read_tweets_from_csv_file(csv_filename)
+    assert len(tweets_from_csv) == 9
+
+
+def test_tweets_json_import_v1_1_2():
+    jl_filename = f'{_RESOURCES_PATH}/tweets_v1.1.2.json'
+    tweets_from_csv = st.read_tweets_from_json_lines_file(jl_filename)
+    assert len(tweets_from_csv) == 9
diff --git a/tests/resources/tweets_v1.1.2.csv b/tests/resources/tweets_v1.1.2.csv
@@ -0,0 +1,27 @@
+created_at,id_str,conversation_id_str,full_text,lang,favorited,retweeted,retweet_count,favorite_count,reply_count,quote_count,quoted_status_id_str,quoted_status_short_url,quoted_status_expand_url,user_id_str,user_name,user_full_name,user_verified,in_reply_to_status_id_str,in_reply_to_user_id_str,hashtags,mentions,urls
+2021-02-05T00:54:28+00:00,1357492753215729666,1357492753215729666,"It no longer matters. The decision to accept casualties rather than accede to simple acts of social generosity has been made. Hundreds of thousands dead due to bumbling, both willful and unconscious, has been shrugged off by everyone other than, one presumes, the dead. #COVID19",en,False,False,0,0,0,0,1357490037773004802,,,3231804466,robert__gibbons,Robert Gibbons,False,,,#COVID19,,
+2021-02-05T00:54:27+00:00,1357492749516292098,1357492749516292098,"Here in our hearts
+Oh the answer is there
+If we only would look there inside them
+We can make it better, we can make it better #WhatIf #WorldCancerDay #COVID19",en,False,False,0,0,0,0,1357489296844992512,,,1007294353818226694,bigdan071288,Daniel,False,,,"#WhatIf , #WorldCancerDay , #COVID19",,
+2021-02-05T00:54:26+00:00,1357492745514942466,1357492745514942466,"Hoy más que nunca, ciudadanos, ciudadanas y autoridades de la @AlcaldiaMHmx nos unimos con un solo propósito; reducir el numero de contagios por #COVID19. No es tiempo de fiestas ni reuniones, continuamos en #SemáforoRojo 🚦 por contingencia sanitaria.
+
+#MHelCorazónDeLaCapital https://t.co/VRRcVdXjsx",es,False,False,0,0,0,0,,,,1065021849430618112,MHSUrbanos,Servicios Urbanos,False,,,"#COVID19 , #SemáforoRojo , #MHelCorazónDeLaCapital",AlcaldiaMHmx,
+2021-02-05T00:54:25+00:00,1357492742943764480,1357492742943764480,発見遅れたCOCOA不具合、厚生労働省「実機テストせず」：#朝日新聞デジタル https://t.co/ng0nDNqw2M #新型コロナウイルス #COVID19,ja,False,False,0,0,0,0,,,,113366981,otaka_thursday,おたか 🍥,False,,,"#朝日新聞デジタル , #新型コロナウイルス , #COVID19",,https://t.co/ng0nDNqw2M
+2021-02-05T00:54:14+00:00,1357492697901264897,1357492697901264897,"that moment when your patient decides to (stupidly) go to Turks and Caicos and returns with COVID (omg) and uses up precious resources, time, PPE, &amp; ED personnel to take care of her while exposing a slew of other patients and staff in the ER. #COVID19 #canyounot #COVIDIOT https://t.co/Icy9EydgLg",en,False,False,0,0,0,0,,,,2462950457,paper_canyon,paper canyon,False,,,"#COVID19 , #canyounot , #COVIDIOT",,
+2021-02-05T00:54:09+00:00,1357492674731913216,1357492674731913216,“hasta un 40% de las personas que fueron internadas por complicaciones asociadas al Covid-19 tuvieron secuelas en la función de sus pulmones”. Hay que mantener los cuidados y vacunarse cuando nos toque #COVID19,es,False,False,0,0,0,0,1357358481939050496,,,139287395,jota_leonr,José Julio León,False,,,#COVID19,,
+2021-02-05T00:54:08+00:00,1357492671217086464,1357492671217086464,"Feeling sorry for corporations during the #covid19 #pandemic?
+
+YOU MIGHT WANT TO EDUCATE YOURSELF on how American companies are treating front line employees during a pandemic? #MustRead",en,False,False,0,0,0,0,1357483359677550592,,,2316413918,GregCurtin,-v|v- 🍁 🇺🇸,False,,,"#covid19 , #pandemic , #MustRead",,
+2021-02-05T00:54:07+00:00,1357492667064623104,1357492667064623104,"Yesterday hot topics:
+#lka (16.88%)
+#Srilanka (13.71%)
+#IndependenceDaySL (5.68%)
+#IndependenceDay (3.84%)
+#adaderanasinhala (2.75%)
+#slnews (2.75%)
+#Covid19 (2.09%)
+#IndependenceSL (1.58%)
+#73rdIndependenceDay (1.17%)
+#COVID19SL (1.08%)",en,False,False,0,0,0,0,,,,1343032398238208002,yuganOffcial,Yugan Narmathan 🇱🇰,False,,,"#lka , #Srilanka , #IndependenceDaySL , #IndependenceDay , #adaderanasinhala , #slnews , #Covid19 , #IndependenceSL , #73rdIndependenceDay , #COVID19SL",,
+2021-02-05T00:54:06+00:00,1357492664363532288,1357492664363532288,#COVID19  #Impfung #Impfpflicht  https://t.co/h2175ku7Zp https://t.co/UCJMOTj2Rj,und,False,False,0,0,0,0,,,,394643993,Caputschi,Pit Caputschi,False,,,"#COVID19 , #Impfung , #Impfpflicht",,https://t.co/h2175ku7Zp