Skip to content

Commit 3ba9df8

Browse files
Stephanie KimStephanie Kim
Stephanie Kim
authored and
Stephanie Kim
committed Aug 1, 2016
add twitter profanity script to sample-apps
1 parent 3d8a728 commit 3ba9df8

File tree

5 files changed

+1984
-0
lines changed

5 files changed

+1984
-0
lines changed
 

‎Python/tweet-profanity-demo/data/Donald-Trump_OR-Trump.csv

+946
Large diffs are not rendered by default.

‎Python/tweet-profanity-demo/data/Hillary-Clinton-OR-Hillary.csv

+891
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
Analyze twitter data for profanity of presidential candidates.
3+
4+
Dependent on: stop word, profanity detection algorithms and twitter_pull.py
5+
"""
6+
7+
import os
8+
import re
9+
import csv
10+
import sys
11+
import Algorithmia as alg
12+
13+
14+
# Add in your Algorithmia API key
# NOTE(review): this is a placeholder — replace with a real key before running.
client = alg.client('your_algorithmia_api_key')
16+
17+
18+
def read_data():
    """Return the list of tweet texts for the query named in sys.argv[1].

    The query string is mapped to a CSV file under data/ (spaces replaced
    by dashes), in the format produced by twitter_pull.py.

    Returns:
        list[str]: the 'text' column of every row in the CSV.

    Exits with status 1 when no command-line argument was supplied or when
    the CSV file does not exist.
    """
    try:
        # os.path.join with a single argument was a no-op; the filename is
        # simply the query with spaces dashed.
        filename = sys.argv[1].replace(' ', '-')
        # newline='' is the documented way to open files for the csv module.
        with open('data/{0}.csv'.format(filename), newline='') as data_file:
            data_object = csv.DictReader(data_file, delimiter=',')
            return [row['text'] for row in data_object]
    except IndexError as ie:
        print(
            "Input error - did you remember to pass in your system argument?",
            ie)
        sys.exit(1)
    except FileNotFoundError as fe:
        print("File not found - check your directory and filename", fe)
        sys.exit(1)
36+
37+
38+
def process_text(tweets=None, query=None):
    """Return lowercased tweets stripped of noise for profanity analysis.

    Removes @mentions, URLs, non-alphanumeric characters, a leading "rt",
    and the search query term itself.

    Args:
        tweets: iterable of raw tweet strings; defaults to read_data().
        query: the search term to strip from each tweet; defaults to
            sys.argv[1]. Both defaults keep the original zero-argument
            call working.

    Returns:
        list[str]: one cleaned, stripped string per input tweet.
    """
    if tweets is None:
        tweets = read_data()
    if query is None:
        query = sys.argv[1]
    # BUG FIX: the original concatenated the query onto the "http.+?"
    # alternative with no "|" separator, so neither plain URLs nor the bare
    # query term were removed reliably. Build the alternation explicitly and
    # re.escape the query so regex metacharacters in it are treated literally.
    pattern = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http\S+|"
        + re.escape(query.lower()))
    return [pattern.sub('', tweet.lower()).strip() for tweet in tweets]
46+
47+
48+
def remove_stop_words():
    """Strip common stop words from the processed tweets.

    Fetches the stop-word list from the nlp/RetrieveStopWords algorithm
    (which takes an empty list as input), then joins every surviving word
    across all cleaned tweets into a single corpus string.

    Exits with status 1 if the Algorithmia call fails.
    """
    try:
        algo = client.algo('nlp/RetrieveStopWords/0.1.1')
        # Input is an empty list
        stop_word_list = algo.pipe([])
        stop_words = stop_word_list.result
        # Keep only words that are absent from the stop list.
        kept_words = []
        for sentence in process_text():
            for word in sentence.split(' '):
                if word not in stop_words:
                    kept_words.append(word)
        return ' '.join(kept_words)
    except Exception as e:
        print(e)
        sys.exit(1)
62+
63+
64+
def profanity():
    """Count swear words in the cleaned tweet corpus.

    Runs the nlp/ProfanityDetection algorithm over the stop-word-filtered
    corpus, prints the per-word counts and the grand total, and returns
    them as a dict with keys 'profanity_counts' and 'profanity_sum'.

    Exits with status 1 if the Algorithmia call fails.
    """
    try:
        algo = client.algo('nlp/ProfanityDetection/0.1.2')
        # Pass in the clean list of tweets combined into a single corpus
        corpus = remove_stop_words()
        counts = algo.pipe([corpus]).result
        # Total profanity in corpus
        total = sum(counts.values())
        print('Resulting swear words and counts: ', counts)
        print('total swear words: ', total)
        return {'profanity_counts': counts, 'profanity_sum': total}
    except Exception as e:
        print(e)
        sys.exit(1)
78+
79+
# Script entry point: run the full pipeline (read CSV -> clean -> remove
# stop words -> count profanity) and print the results.
if __name__ == '__main__':
    profanity()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
algorithmia==1.0.5
2+
enum34==1.1.6
3+
requests==2.10.0
4+
six==1.10.0
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import os
2+
import csv
3+
import sys
4+
import logging
5+
import Algorithmia
6+
7+
# Logging: module-level logger that writes INFO+ records to
# logs/twitter_pull_data.log.
# NOTE(review): assumes a logs/ directory already exists — FileHandler
# raises if it does not; confirm or create it before first run.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logFile = logging.FileHandler(
    'logs/twitter_pull_data.log')
logFile.setLevel(logging.INFO)

# Creating a custom log format for each line in the log file
formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
logFile.setFormatter(formatter)
logger.addHandler(logFile)


# Pass in string query as sys.argv
# NOTE(review): raises IndexError at import time when no argument is given.
q_input = sys.argv[1]
23+
24+
25+
def pull_tweets():
    """Pull up to 700 tweets matching the module-level query via Algorithmia.

    Calls the twitter/RetrieveTweetsWithKeyword algorithm and reduces each
    raw record to the fields the analysis needs.

    Returns:
        list[dict]: one dict per tweet with keys 'user_id',
        'retweet_count' and 'text'.

    NOTE(review): the auth values below are placeholders — fill in real
    Twitter and Algorithmia credentials before running.
    """
    # Renamed from `input`, which shadowed the builtin of the same name.
    query_params = {
        "query": q_input,
        "numTweets": "700",
        "auth": {
            "app_key": 'your_consumer_key',
            "app_secret": 'your_consumer_secret_key',
            "oauth_token": 'your_access_token',
            "oauth_token_secret": 'your_access_token_secret'
        }
    }
    client = Algorithmia.client('your_algorithmia_api_key')
    algo = client.algo('twitter/RetrieveTweetsWithKeyword/0.1.3')

    tweet_list = [{'user_id': record['user']['id'],
                   'retweet_count': record['retweet_count'],
                   'text': record['text']}
                  for record in algo.pipe(query_params).result]
    return tweet_list
45+
46+
47+
def write_data(records=None, query=None):
    """Write tweet records to a CSV under data/ for later processing.

    Args:
        records: iterable of dicts with keys 'user_id', 'retweet_count'
            and 'text'; defaults to pull_tweets(). The default keeps the
            original zero-argument call working.
        query: query string used to name the file (spaces become dashes);
            defaults to the module-level q_input.
    """
    if records is None:
        # Write tweet records to csv for later data processing
        records = pull_tweets()
    if query is None:
        query = q_input
    # os.path.join with a single argument was a no-op.
    filename = query.replace(' ', '-')
    try:
        # newline='' prevents doubled line endings from csv on Windows.
        with open('data/{0}.csv'.format(filename), 'w', newline='') as f:
            fieldnames = ['user_id', 'retweet_count', 'text']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(records)
    except Exception as e:
        # logger.exception records the traceback at ERROR level; the
        # original logger.info(e) hid failures below the WARNING threshold.
        logger.exception(e)
61+
62+
# Script entry point: fetch tweets for the query in sys.argv[1] and write
# them to data/<query>.csv.
if __name__ == '__main__':
    write_data()

0 commit comments

Comments
 (0)
Please sign in to comment.