From 54f8938e3095633d8cf0d40855118774c49ddd8b Mon Sep 17 00:00:00 2001 From: Tim Chan Date: Sat, 7 Jul 2018 13:46:03 +1000 Subject: [PATCH 1/3] Add in basic twitter archive to Corpus functionality --- .gitignore | 3 +++ local_settings_example.py | 4 ++++ twittereater.py | 17 +++++++++++++++++ 3 files changed, 24 insertions(+) create mode 100644 twittereater.py diff --git a/.gitignore b/.gitignore index 390be07..fa0d0b8 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ .git /.idea __pycache__ +*.csv +.env/ +local_settings.py \ No newline at end of file diff --git a/local_settings_example.py b/local_settings_example.py index 1760b3d..fdaabce 100644 --- a/local_settings_example.py +++ b/local_settings_example.py @@ -34,3 +34,7 @@ DEBUG = True # Set this to False to start Tweeting live TWEET_ACCOUNT = "" # The name of the account you're tweeting to. + +#Configuration for Twitter parser. TEST_SOURCE will be re-used as the corpus location. +TWITTER_ARCHIVE_NAME = "tweets.csv" #Name of your twitter archive +IGNORE_RETWEETS = True #If you want to remove retweets \ No newline at end of file diff --git a/twittereater.py b/twittereater.py new file mode 100644 index 0000000..1ff8adb --- /dev/null +++ b/twittereater.py @@ -0,0 +1,17 @@ +import csv +from local_settings import TWITTER_ARCHIVE_NAME, TEST_SOURCE, IGNORE_RETWEETS + +f = open(TWITTER_ARCHIVE_NAME, 'r', encoding='utf-8') +tweets = [] +reader = csv.reader(f,quotechar='"') +next(reader) #get rid of the twitter header + + +tweetarchive = open(TEST_SOURCE, 'w') +for row in reader: + if IGNORE_RETWEETS: + if not row[8]: #9th column is the timestamp of the retweet + tweetarchive.write("'%s'," % (row[5])) + else: + tweetarchive.write("'%s'," % (row[5])) + From f1861f388118ac29cf399b64e2e6cbe89279ca44 Mon Sep 17 00:00:00 2001 From: Tim Chan Date: Tue, 31 Jul 2018 20:02:44 +1000 Subject: [PATCH 2/3] Update twittereater for python 2.7 compatibility Updated twittereater.py for python 2.7 compatibility Updated readme.md 
to include twitter archive instructions Updated contributors.md --- CONTRIBUTORS.md | 3 ++- README.md | 9 +++++++++ twittereater.py | 3 ++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 4ab5703..30fa420 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,4 +10,5 @@ * [varjmes](https://github.com/varjmes) * [meggle](https://github.com/meggle) * [superstrong](https://github.com/superstrong) -* [andrlik](https://github.com/andrlik) \ No newline at end of file +* [andrlik](https://github.com/andrlik) +* [TimLChan](https://github.com/TimLChan) diff --git a/README.md b/README.md index c6e4b22..ffe601b 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,15 @@ To scrape content from the web, set `SCRAPE_URL` to `True`. This bot makes use o __Note:__ Web scraping is experimental and may give you unexpected results. Make sure to test the bot in debugging mode before publishing. +#### Twitter archive +To use tweets from a Twitter account you have access to, you can download your Twitter Archive by following the steps from [Twitter's Help Center](https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive). + +1. Request your Twitter archive +2. Extract the CSV file and ensure it is named the same as the `TWITTER_ARCHIVE_NAME` in `local_settings.py` +3. In `local_settings.py`, retweets are ignored by default. If you want to include retweets in your corpus, change `IGNORE_RETWEETS` to `False`. +4. Once that is all set, run `twittereater.py` and it will automatically create a corpus file based on the `TEST_SOURCE` variable in `local_settings.py` + + ## Debugging If you want to test the script or to debug the tweet generation, you can skip the random number generation and not publish the resulting tweets to Twitter. 
diff --git a/twittereater.py b/twittereater.py index 1ff8adb..6f69008 100644 --- a/twittereater.py +++ b/twittereater.py @@ -1,7 +1,8 @@ +# -*- coding: utf-8 -*- import csv from local_settings import TWITTER_ARCHIVE_NAME, TEST_SOURCE, IGNORE_RETWEETS -f = open(TWITTER_ARCHIVE_NAME, 'r', encoding='utf-8') +f = open(TWITTER_ARCHIVE_NAME, 'r') tweets = [] reader = csv.reader(f,quotechar='"') next(reader) #get rid of the twitter header From fea6f3c4544454fd952f3db57fdc607570fd35bc Mon Sep 17 00:00:00 2001 From: Tim Chan Date: Tue, 31 Jul 2018 20:07:46 +1000 Subject: [PATCH 3/3] Update readme.md Add in an optional step to use the Twitter archive corpus as tweet source --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ffe601b..6d5416c 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,10 @@ To use tweets from a Twitter account you have access to, you can download your T 1. Request your Twitter archive 2. Extract the CSV file and ensure it is named the same as the `TWITTER_ARCHIVE_NAME` in `local_settings.py` 3. In `local_settings.py`, retweets are ignored by default. If you want to include retweets in your corpus, change `IGNORE_RETWEETS` to `False`. -4. Once that is all set, run `twittereater.py` and it will automatically create a corpus file based on the `TEST_SOURCE` variable in `local_settings.py` +4. Update `TEST_SOURCE` and specify the name of the parsed Twitter archive +5. Once that is all set, run `twittereater.py` and it will automatically create a corpus file based on the `TEST_SOURCE` variable in `local_settings.py` + +If you want to use the Twitter corpus to generate tweets, set `STATIC_TEST = True` ## Debugging