Skip to content

Commit 3ba9df8

Browse files
Stephanie KimStephanie Kim
Stephanie Kim
authored and
Stephanie Kim
committed Aug 1, 2016
add twitter profanity script to sample-apps
1 parent 3d8a728 commit 3ba9df8

File tree

5 files changed

+1984
-0
lines changed

5 files changed

+1984
-0
lines changed
 

‎Python/tweet-profanity-demo/data/Donald-Trump_OR-Trump.csv

+946
Large diffs are not rendered by default.

‎Python/tweet-profanity-demo/data/Hillary-Clinton-OR-Hillary.csv

+891
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
Analyze twitter data for profanity of presidential candidates.
3+
4+
Dependent on: stop word, profanity detection algorithms and twitter_pull.py
5+
"""
6+
7+
import os
8+
import re
9+
import csv
10+
import sys
11+
import Algorithmia as alg
12+
13+
14+
# Add in your Algorithmia API key
# NOTE(review): this is a placeholder — replace with a real key before running.
client = alg.client('your_algorithmia_api_key')
16+
17+
18+
def read_data():
    """Return the list of tweet texts for the query named in sys.argv[1].

    The query string is mapped to a CSV file under data/ (spaces replaced
    by dashes), in the format produced by twitter_pull.py.

    Returns:
        list[str]: the 'text' column of every row in the CSV.

    Exits with status 1 when no command-line argument was supplied or when
    the CSV file does not exist.
    """
    try:
        # os.path.join with a single argument was a no-op; the filename is
        # simply the query with spaces dashed.
        filename = sys.argv[1].replace(' ', '-')
        # newline='' is the documented way to open files for the csv module.
        with open('data/{0}.csv'.format(filename), newline='') as data_file:
            data_object = csv.DictReader(data_file, delimiter=',')
            return [row['text'] for row in data_object]
    except IndexError as ie:
        print(
            "Input error - did you remember to pass in your system argument?",
            ie)
        sys.exit(1)
    except FileNotFoundError as fe:
        print("File not found - check your directory and filename", fe)
        sys.exit(1)
36+
37+
38+
def process_text(tweets=None, query=None):
    """Return lowercased tweets stripped of noise for profanity analysis.

    Removes @mentions, URLs, non-alphanumeric characters, a leading "rt",
    and the search query term itself.

    Args:
        tweets: iterable of raw tweet strings; defaults to read_data().
        query: the search term to strip from each tweet; defaults to
            sys.argv[1]. Both defaults keep the original zero-argument
            call working.

    Returns:
        list[str]: one cleaned, stripped string per input tweet.
    """
    if tweets is None:
        tweets = read_data()
    if query is None:
        query = sys.argv[1]
    # BUG FIX: the original concatenated the query onto the "http.+?"
    # alternative with no "|" separator, so neither plain URLs nor the bare
    # query term were removed reliably. Build the alternation explicitly and
    # re.escape the query so regex metacharacters in it are treated literally.
    pattern = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http\S+|"
        + re.escape(query.lower()))
    return [pattern.sub('', tweet.lower()).strip() for tweet in tweets]
46+
47+
48+
def remove_stop_words():
    """Strip common stop words from the processed tweets.

    Fetches the stop-word list from the nlp/RetrieveStopWords algorithm
    (which takes an empty list as input), then joins every surviving word
    across all cleaned tweets into a single corpus string.

    Exits with status 1 if the Algorithmia call fails.
    """
    try:
        algo = client.algo('nlp/RetrieveStopWords/0.1.1')
        # Input is an empty list
        stop_word_list = algo.pipe([])
        stop_words = stop_word_list.result
        # Keep only words that are absent from the stop list.
        kept_words = []
        for sentence in process_text():
            for word in sentence.split(' '):
                if word not in stop_words:
                    kept_words.append(word)
        return ' '.join(kept_words)
    except Exception as e:
        print(e)
        sys.exit(1)
62+
63+
64+
def profanity():
    """Count swear words in the cleaned tweet corpus.

    Runs the nlp/ProfanityDetection algorithm over the stop-word-filtered
    corpus, prints the per-word counts and the grand total, and returns
    them as a dict with keys 'profanity_counts' and 'profanity_sum'.

    Exits with status 1 if the Algorithmia call fails.
    """
    try:
        algo = client.algo('nlp/ProfanityDetection/0.1.2')
        # Pass in the clean list of tweets combined into a single corpus
        corpus = remove_stop_words()
        counts = algo.pipe([corpus]).result
        # Total profanity in corpus
        total = sum(counts.values())
        print('Resulting swear words and counts: ', counts)
        print('total swear words: ', total)
        return {'profanity_counts': counts, 'profanity_sum': total}
    except Exception as e:
        print(e)
        sys.exit(1)
78+
79+
# Script entry point: run the full pipeline (read CSV -> clean -> remove
# stop words -> count profanity) and print the results.
if __name__ == '__main__':
    profanity()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
algorithmia==1.0.5
2+
enum34==1.1.6
3+
requests==2.10.0
4+
six==1.10.0
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import os
2+
import csv
3+
import sys
4+
import logging
5+
import Algorithmia
6+
7+
# Logging: module-level logger that writes INFO+ records to
# logs/twitter_pull_data.log.
# NOTE(review): assumes a logs/ directory already exists — FileHandler
# raises if it does not; confirm or create it before first run.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logFile = logging.FileHandler(
    'logs/twitter_pull_data.log')
logFile.setLevel(logging.INFO)

# Creating a custom log format for each line in the log file
formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
logFile.setFormatter(formatter)
logger.addHandler(logFile)


# Pass in string query as sys.argv
# NOTE(review): raises IndexError at import time when no argument is given.
q_input = sys.argv[1]
23+
24+
25+
def pull_tweets():
    """Pull up to 700 tweets matching the module-level query via Algorithmia.

    Calls the twitter/RetrieveTweetsWithKeyword algorithm and reduces each
    raw record to the fields the analysis needs.

    Returns:
        list[dict]: one dict per tweet with keys 'user_id',
        'retweet_count' and 'text'.

    NOTE(review): the auth values below are placeholders — fill in real
    Twitter and Algorithmia credentials before running.
    """
    # Renamed from `input`, which shadowed the builtin of the same name.
    query_params = {
        "query": q_input,
        "numTweets": "700",
        "auth": {
            "app_key": 'your_consumer_key',
            "app_secret": 'your_consumer_secret_key',
            "oauth_token": 'your_access_token',
            "oauth_token_secret": 'your_access_token_secret'
        }
    }
    client = Algorithmia.client('your_algorithmia_api_key')
    algo = client.algo('twitter/RetrieveTweetsWithKeyword/0.1.3')

    tweet_list = [{'user_id': record['user']['id'],
                   'retweet_count': record['retweet_count'],
                   'text': record['text']}
                  for record in algo.pipe(query_params).result]
    return tweet_list
45+
46+
47+
def write_data(records=None, query=None):
    """Write tweet records to a CSV under data/ for later processing.

    Args:
        records: iterable of dicts with keys 'user_id', 'retweet_count'
            and 'text'; defaults to pull_tweets(). The default keeps the
            original zero-argument call working.
        query: query string used to name the file (spaces become dashes);
            defaults to the module-level q_input.
    """
    if records is None:
        # Write tweet records to csv for later data processing
        records = pull_tweets()
    if query is None:
        query = q_input
    # os.path.join with a single argument was a no-op.
    filename = query.replace(' ', '-')
    try:
        # newline='' prevents doubled line endings from csv on Windows.
        with open('data/{0}.csv'.format(filename), 'w', newline='') as f:
            fieldnames = ['user_id', 'retweet_count', 'text']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(records)
    except Exception as e:
        # logger.exception records the traceback at ERROR level; the
        # original logger.info(e) hid failures below the WARNING threshold.
        logger.exception(e)
61+
62+
# Script entry point: fetch tweets for the query in sys.argv[1] and write
# them to data/<query>.csv.
if __name__ == '__main__':
    write_data()

0 commit comments

Comments
 (0)
Please sign in to comment.