Merge pull request #53 from TimLChan/tweetdata

Add in basic twitter archive to Corpus functionality, thanks to @TimLChan.
tommeagher · Aug 19, 2018 · 3dfd3c1 · 3dfd3c1
2 parents 5771baa + fea6f3c
commit 3dfd3c1
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,6 @@
 .git
 /.idea
 __pycache__
+*.csv
+.env/
+local_settings.py
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -10,4 +10,5 @@
 * [varjmes](https://github.com/varjmes)
 * [meggle](https://github.com/meggle)
 * [superstrong](https://github.com/superstrong)
-* [andrlik](https://github.com/andrlik)
+* [andrlik](https://github.com/andrlik)
+* [TimLChan](https://github.com/TimLChan)
diff --git a/README.md b/README.md
@@ -60,6 +60,18 @@ To scrape content from the web, set `SCRAPE_URL` to `True`. This bot makes use o
 
 __Note:__ Web scraping is experimental and may give you unexpected results. Make sure to test the bot in debugging mode before publishing.
 
+#### Twitter archive
+To use tweets from a Twitter account you have access to, you can download your Twitter Archive by following the steps from [Twitter's Help Center](https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive).
+
+1. Request your Twitter archive
+2. Extract the CSV file and ensure it is named the same as the `TWITTER_ARCHIVE_NAME` in `local_settings.py`
+3. In `local_settings.py`, retweets are ignored by default. If you want to include retweets in your corpus, change `IGNORE_RETWEETS` to `False`.
+4. In `local_settings.py`, set `TEST_SOURCE` to the filename where the parsed Twitter corpus should be written
+5. Once that is all set, run `twittereater.py` and it will automatically create a corpus file based on the `TEST_SOURCE` variable in `local_settings.py`
+
+If you want to use the Twitter corpus to generate tweets, set `STATIC_TEST = True`
+
+
 ## Debugging
 
 If you want to test the script or to debug the tweet generation, you can skip the random number generation and not publish the resulting tweets to Twitter.

diff --git a/local_settings_example.py b/local_settings_example.py
@@ -34,3 +34,7 @@
 
 DEBUG = True  # Set this to False to start Tweeting live
 TWEET_ACCOUNT = ""  # The name of the account you're tweeting to.
+
+#Configuration for Twitter parser. TEST_SOURCE will be re-used as the corpus location.
+TWITTER_ARCHIVE_NAME = "tweets.csv" #Name of your twitter archive
+IGNORE_RETWEETS = True #If you want to remove retweets
diff --git a/twittereater.py b/twittereater.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+import csv
+from local_settings import TWITTER_ARCHIVE_NAME, TEST_SOURCE, IGNORE_RETWEETS
+
+f = open(TWITTER_ARCHIVE_NAME, 'r')
+tweets = []
+reader = csv.reader(f,quotechar='"')
+next(reader) #get rid of the twitter header
+
+
+tweetarchive = open(TEST_SOURCE, 'w')
+for row in reader:
+    if IGNORE_RETWEETS:
+        if not row[8]: #9th column is the timestamp of the retweet
+            tweetarchive.write("'%s'," % (row[5]))
+    else:
+        tweetarchive.write("'%s'," % (row[5]))
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,3 +2,6 @@ @@
     .git
     /.idea
     __pycache__
+    *.csv
+    .env/
+    local_settings.py