From c7a87cdbfccab301cdd764ba2c71c5efafeb42f0 Mon Sep 17 00:00:00 2001 From: Recursing Date: Thu, 22 Mar 2018 14:49:10 +0100 Subject: [PATCH 1/2] Check for .tweet-text children before using them --- twitter_scraper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/twitter_scraper.py b/twitter_scraper.py index 7e71b96..306666e 100644 --- a/twitter_scraper.py +++ b/twitter_scraper.py @@ -32,7 +32,10 @@ def gen_tweets(pages): dot = "." tweets = [] for tweet in html.find('.stream-item'): - text = tweet.find('.tweet-text')[0].full_text + tweet_text_elements = tweet.find('.tweet-text') + if not tweet_text_elements: + continue + text = tweet_text_elements[0].full_text tweetId = tweet.find( '.js-permalink')[0].attrs['data-conversation-id'] time = datetime.fromtimestamp( From f6a87b42b33596481131d426632bb2b13bbba4ac Mon Sep 17 00:00:00 2001 From: Recursing Date: Thu, 22 Mar 2018 15:13:39 +0100 Subject: [PATCH 2/2] Remove hidden elements from tweet text Probably there's a better way to ignore them --- twitter_scraper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/twitter_scraper.py b/twitter_scraper.py index 306666e..b549f7f 100644 --- a/twitter_scraper.py +++ b/twitter_scraper.py @@ -35,7 +35,10 @@ def gen_tweets(pages): tweet_text_elements = tweet.find('.tweet-text') if not tweet_text_elements: continue - text = tweet_text_elements[0].full_text + text_container = tweet_text_elements[0] + for hidden_child in text_container.lxml.find_class('u-hidden'): + hidden_child.drop_tree() + text = text_container.full_text.strip() tweetId = tweet.find( '.js-permalink')[0].attrs['data-conversation-id'] time = datetime.fromtimestamp(