The code below will give you a start on processing text data from Twitter. There are some basic examples of how to pull down tweets for selected users and compare/contrast the sentiment of their posts.
#####################
# This script illustrates how to pull data from
# twitter and use default settings for English
# language sentiment analysis
#####################
library(twitteR)
library(rtweet)
library(syuzhet)
library(ngram)
library(reshape2)
require(dplyr)
library(timeDate)
library(ggplot2)
#####################
# This is just a crude string cleaning function for the purposes
# of illustration.
#####################
clean.string <- function(string){
# Lowercase
temp <- tolower(string)
# Remove everything that is not a number or letter (may want to keep more
# stuff in your actual analyses).
temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
# Shrink down to just one white space
temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
return(temp)
}
#####################
# this function returns a crude sentiment analysis of the tweets from a set of
# users' timelines. You must provide a vector of users.
#####################
twit.sentiment <- function(users, n.tweets=200, include.retweet=FALSE) {
sent.vars = c("anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust", "negative", "positive")
d.vars = c("user_id", "screen_name", "created_at", "retweet_count", "favorite_count", "followers_count", "friends_count", "text")
d = data.frame(get_timelines(users, n=n.tweets, parse=TRUE))
# do a very light text cleaning
d$text_clean = unlist(lapply(d$text, clean.string))
# count the clean words
d$n_words = unlist(lapply(d$text_clean, wordcount))
# Do the sentiment analysis using nrc. In a real production sentiment analysis, you would want
# to consider several different dictionaries. Check out the following page for a walkthrough of
# some of the different lexicons you might consider:
# https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html
d[,sent.vars] = bind_rows(lapply(d$text_clean, get_nrc_sentiment))
head(d)
# Get a percentage of pos/neg by number of words in the email
d$neg_pct = d$negative/d$n_words
d$pos_pct = d$positive/d$n_words
if(include.retweet) {
d.sub = d[,c(d.vars, sent.vars)]
} else {
d.sub = d[!(d$is_retweet),c(d.vars, sent.vars)]
}
return(d.sub)
}
#####################
# Explore the dictionaries, showing how different
# words are coded
#####################
nrc = get_sentiment_dictionary(dictionary = "nrc", language = "english")
syuzhet = get_sentiment_dictionary(dictionary = "syuzhet", language = "english")
nrc[nrc$word == "horrible", ]
syuzhet[syuzhet$word == "horrible", ]
nrc[nrc$word == "disastrous", ]
syuzhet[syuzhet$word == "disastrous", ]
#####################
# Exploring sentiment analysis
#####################
v1 = "Man, I am having the best day today. The sun is out and it is a beautiful day."
v2 = "So grateful to be part of this supportive community. This is an amazing place to work."
v3 = "What a horrible day. Not only is it storming, but I fell in the mud and broke my phone."
v4 = "Awful bosses and terrible co-workers. This is a ridiculously bad place to work."
v5 = "I am not having the best day today. The sun is not out and it is not a beautiful day."
v6 = "Some days are better than others. This is the latter."
v7 = "So, got my final back. Um, yeah. The professor sure knows how to give the gift of a great day."
v8 = "Great idea Olin...Make all the students swipe their cards just to get onto the 4th floor. Beautiful building that we can't access."
get_nrc_sentiment(clean.string(v1))
get_nrc_sentiment(clean.string(v2))
get_nrc_sentiment(clean.string(v3))
get_nrc_sentiment(clean.string(v4))
get_nrc_sentiment(clean.string(v5))
get_nrc_sentiment(clean.string(v6))
get_nrc_sentiment(clean.string(v7))
get_nrc_sentiment(clean.string(v8))
#####################
# The first thing you need to do is create an app for your twitter account
# you can find instructions here:
# https://developer.twitter.com/en/docs/basics/apps/overview.html
# Once you've created an app, then add the following information to this script
#####################
# twitter_consumer_key = "YOUR INFO HERE"
# twitter_consumer_secret = "YOUR INFO HERE"
# twitter_access_token = "YOUR INFO HERE"
# twitter_access_secret = "YOUR INFO HERE"
setup_twitter_oauth(twitter_consumer_key, twitter_consumer_secret, twitter_access_token, twitter_access_secret)
#####################
# Sample sentiment analysis on accounts where
# we have strong priors about their sentiment
#####################
sad_happy = c("sosadtoday", "angrymemorys", "gohappiest", "kindnessgirl")
d.sh = twit.sentiment(users=sad_happy, n.tweets=200, include.retweet=F)
boxplot(positive~screen_name, data=d.sh, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.sh, cex.axis=.7, las=2, main="negative")
#####################
# Illustrating the potential for looking at specific users and
# comparing / contrasting individual employees' sentiment
#####################
OlinPeeps = c("DeanTaylorWashU", "sjmalter", "LamarPierce1", "OrgStratProf")
BSchoolDeans = c("DeanTaylorWashU", "scottderue")
BSchools = c("OlinBusiness", "Wharton")
d.olin = twit.sentiment(users=OlinPeeps, n.tweets=300, include.retweet=F)
d.deans = twit.sentiment(users=BSchoolDeans, n.tweets=300, include.retweet=F)
d.schools = twit.sentiment(users=BSchools, n.tweets=300, include.retweet=F)
boxplot(positive~screen_name, data=d.olin, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.olin, cex.axis=.7, las=2, main="negative")
boxplot(positive~screen_name, data=d.deans, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.deans, cex.axis=.7, las=2, main="negative")
boxplot(positive~screen_name, data=d.schools, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.schools, cex.axis=.7, las=2, main="negative")
#####################
# Illustrating the potential for looking at trends over time
#####################
olin.all = c("DeanTaylorWashU", "sjmalter", "LamarPierce1", "OrgStratProf", "sethcarnahan", "peterboumgarden",
"jrobmartin", "milbourn_todd", "danbentle", "wustlbusiness", "drpatsportsbiz", "analisaortiz", "krwools")
d.lrg = twit.sentiment(users=olin.all, n.tweets=300, include.retweet=F)
d.lrg$date = as.Date(d.lrg$created_at)
d.lrg$year = as.numeric(strftime(d.lrg$date, format="%Y"))
d.lrg$month = as.numeric(strftime(d.lrg$date, format="%m"))
d.lrg$woy = as.numeric(strftime(d.lrg$date, format="%V"))
o = aggregate(d.lrg[,c("positive", "negative")], by=list(d.lrg$year, d.lrg$month), mean)
names(o)[1:2] = c("year", "month")
plot(o[o$year == 2018, "month"], o[o$year == 2018, "positive"], type="l", ylim=c(0,3), col="dark green", lwd=3, ylab="sentiment", xlab="month")
lines(o[o$year == 2017, "month"], o[o$year == 2017, "positive"], type="l", col="dark green", lwd=3, lty=2)
lines(o[o$year == 2018, "month"], o[o$year == 2018, "negative"], type="l", col="dark red", lwd=3)
lines(o[o$year == 2017, "month"], o[o$year == 2017, "negative"], type="l", col="dark red", lwd=3, lty=2)
boxplot(positive~screen_name, data=d.lrg, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.lrg, cex.axis=.7, las=2, main="negative")
d.lrg$name = as.factor(d.lrg$screen_name)
p <- ggplot(d.lrg, aes(x=name, y=positive)) + geom_violin()
p <- ggplot(d.lrg, aes(x=name, y=negative)) + geom_violin()
d.lrg[d.lrg$negative > 7, ]
# This script illustrates how to pull data from
# twitter and use default settings for English
# language sentiment analysis
#####################
library(twitteR)
library(rtweet)
library(syuzhet)
library(ngram)
library(reshape2)
require(dplyr)
library(timeDate)
library(ggplot2)
#####################
# This is just a crude string cleaning function for the purposes
# of illustration.
#####################
clean.string <- function(string){
# Lowercase
temp <- tolower(string)
# Remove everything that is not a number or letter (may want to keep more
# stuff in your actual analyses).
temp <- stringr::str_replace_all(temp,"[^a-zA-Z\\s]", " ")
# Shrink down to just one white space
temp <- stringr::str_replace_all(temp,"[\\s]+", " ")
return(temp)
}
#####################
# this function returns a crude sentiment analysis of the tweets from a set of
# users' timelines. You must provide a vector of users.
#####################
twit.sentiment <- function(users, n.tweets=200, include.retweet=FALSE) {
sent.vars = c("anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust", "negative", "positive")
d.vars = c("user_id", "screen_name", "created_at", "retweet_count", "favorite_count", "followers_count", "friends_count", "text")
d = data.frame(get_timelines(users, n=n.tweets, parse=TRUE))
# do a very light text cleaning
d$text_clean = unlist(lapply(d$text, clean.string))
# count the clean words
d$n_words = unlist(lapply(d$text_clean, wordcount))
# Do the sentiment analysis using nrc. In a real production sentiment analysis, you would want
# to consider several different dictionaries. Check out the following page for a walkthrough of
# some of the different lexicons you might consider:
# https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html
d[,sent.vars] = bind_rows(lapply(d$text_clean, get_nrc_sentiment))
head(d)
# Get a percentage of pos/neg by number of words in the email
d$neg_pct = d$negative/d$n_words
d$pos_pct = d$positive/d$n_words
if(include.retweet) {
d.sub = d[,c(d.vars, sent.vars)]
} else {
d.sub = d[!(d$is_retweet),c(d.vars, sent.vars)]
}
return(d.sub)
}
#####################
# Explore the dictionaries, showing how different
# words are coded
#####################
nrc = get_sentiment_dictionary(dictionary = "nrc", language = "english")
syuzhet = get_sentiment_dictionary(dictionary = "syuzhet", language = "english")
nrc[nrc$word == "horrible", ]
syuzhet[syuzhet$word == "horrible", ]
nrc[nrc$word == "disastrous", ]
syuzhet[syuzhet$word == "disastrous", ]
#####################
# Exploring sentiment analysis
#####################
v1 = "Man, I am having the best day today. The sun is out and it is a beautiful day."
v2 = "So grateful to be part of this supportive community. This is an amazing place to work."
v3 = "What a horrible day. Not only is it storming, but I fell in the mud and broke my phone."
v4 = "Awful bosses and terrible co-workers. This is a ridiculously bad place to work."
v5 = "I am not having the best day today. The sun is not out and it is not a beautiful day."
v6 = "Some days are better than others. This is the latter."
v7 = "So, got my final back. Um, yeah. The professor sure knows how to give the gift of a great day."
v8 = "Great idea Olin...Make all the students swipe their cards just to get onto the 4th floor. Beautiful building that we can't access."
get_nrc_sentiment(clean.string(v1))
get_nrc_sentiment(clean.string(v2))
get_nrc_sentiment(clean.string(v3))
get_nrc_sentiment(clean.string(v4))
get_nrc_sentiment(clean.string(v5))
get_nrc_sentiment(clean.string(v6))
get_nrc_sentiment(clean.string(v7))
get_nrc_sentiment(clean.string(v8))
#####################
# The first thing you need to do is create an app for your twitter account
# you can find instructions here:
# https://developer.twitter.com/en/docs/basics/apps/overview.html
# Once you've created an app, then add the following information to this script
#####################
# twitter_consumer_key = "YOUR INFO HERE"
# twitter_consumer_secret = "YOUR INFO HERE"
# twitter_access_token = "YOUR INFO HERE"
# twitter_access_secret = "YOUR INFO HERE"
setup_twitter_oauth(twitter_consumer_key, twitter_consumer_secret, twitter_access_token, twitter_access_secret)
#####################
# Sample sentiment analysis on accounts where
# we have strong priors about their sentiment
#####################
sad_happy = c("sosadtoday", "angrymemorys", "gohappiest", "kindnessgirl")
d.sh = twit.sentiment(users=sad_happy, n.tweets=200, include.retweet=F)
boxplot(positive~screen_name, data=d.sh, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.sh, cex.axis=.7, las=2, main="negative")
#####################
# Illustrating the potential for looking at specific users and
# comparing / contrasting individual employees' sentiment
#####################
OlinPeeps = c("DeanTaylorWashU", "sjmalter", "LamarPierce1", "OrgStratProf")
BSchoolDeans = c("DeanTaylorWashU", "scottderue")
BSchools = c("OlinBusiness", "Wharton")
d.olin = twit.sentiment(users=OlinPeeps, n.tweets=300, include.retweet=F)
d.deans = twit.sentiment(users=BSchoolDeans, n.tweets=300, include.retweet=F)
d.schools = twit.sentiment(users=BSchools, n.tweets=300, include.retweet=F)
boxplot(positive~screen_name, data=d.olin, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.olin, cex.axis=.7, las=2, main="negative")
boxplot(positive~screen_name, data=d.deans, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.deans, cex.axis=.7, las=2, main="negative")
boxplot(positive~screen_name, data=d.schools, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.schools, cex.axis=.7, las=2, main="negative")
#####################
# Illustrating the potential for looking at trends over time
#####################
olin.all = c("DeanTaylorWashU", "sjmalter", "LamarPierce1", "OrgStratProf", "sethcarnahan", "peterboumgarden",
"jrobmartin", "milbourn_todd", "danbentle", "wustlbusiness", "drpatsportsbiz", "analisaortiz", "krwools")
d.lrg = twit.sentiment(users=olin.all, n.tweets=300, include.retweet=F)
d.lrg$date = as.Date(d.lrg$created_at)
d.lrg$year = as.numeric(strftime(d.lrg$date, format="%Y"))
d.lrg$month = as.numeric(strftime(d.lrg$date, format="%m"))
d.lrg$woy = as.numeric(strftime(d.lrg$date, format="%V"))
o = aggregate(d.lrg[,c("positive", "negative")], by=list(d.lrg$year, d.lrg$month), mean)
names(o)[1:2] = c("year", "month")
plot(o[o$year == 2018, "month"], o[o$year == 2018, "positive"], type="l", ylim=c(0,3), col="dark green", lwd=3, ylab="sentiment", xlab="month")
lines(o[o$year == 2017, "month"], o[o$year == 2017, "positive"], type="l", col="dark green", lwd=3, lty=2)
lines(o[o$year == 2018, "month"], o[o$year == 2018, "negative"], type="l", col="dark red", lwd=3)
lines(o[o$year == 2017, "month"], o[o$year == 2017, "negative"], type="l", col="dark red", lwd=3, lty=2)
boxplot(positive~screen_name, data=d.lrg, cex.axis=.7, las=2, main="positive")
boxplot(negative~screen_name, data=d.lrg, cex.axis=.7, las=2, main="negative")
d.lrg$name = as.factor(d.lrg$screen_name)
p <- ggplot(d.lrg, aes(x=name, y=positive)) + geom_violin()
p <- ggplot(d.lrg, aes(x=name, y=negative)) + geom_violin()
d.lrg[d.lrg$negative > 7, ]