Tag: people analytics

  • Using R to Analyze Twitter

    The code below will give you a start on processing text data from Twitter. There are some basic examples of how to pull down tweets for selected users and compare/contrast the sentiment of their posts.

    #####################
    # This script illustrates how to pull data from
    # twitter and use default settings for English
    # language sentiment analysis
    #####################
    library(twitteR)
    library(rtweet)
    library(syuzhet)
    library(ngram)
    library(reshape2)
    require(dplyr)
    library(timeDate)
    library(ggplot2)

    #####################
    # This is just a crude string cleaning function for the purposes
    # of illustration.
    #####################

    # Crude text normaliser, for illustration only.
    #
    # Args:
    #   string: character vector of raw text.
    #
    # Returns:
    #   A character vector of the same length, lower-cased, in which every
    #   character that is not an ASCII letter or whitespace (so digits and
    #   punctuation too) has been replaced by a space, and each run of
    #   whitespace has been collapsed to a single space. Leading and trailing
    #   spaces are not trimmed. You may want to keep more characters in a
    #   real analysis.
    clean.string <- function(string){
        lowered <- tolower(string)
        # Anything other than a letter or whitespace becomes a space ...
        letters.only <- stringr::str_replace_all(lowered, "[^a-zA-Z\\s]", " ")
        # ... and every run of whitespace shrinks to exactly one space.
        stringr::str_replace_all(letters.only, "[\\s]+", " ")
    }

    #####################
    # this function returns a crude sentiment analysis of the tweets from a set of
    # users’ timelines. You must provide a vector of users.
    #####################

    # Pull recent tweets for a set of users and score every tweet with the NRC
    # emotion lexicon.
    #
    # Args:
    #   users:           vector of users, passed straight to get_timelines().
    #   n.tweets:        number of tweets to request (passed as `n`).
    #   include.retweet: if FALSE (the default) tweets flagged `is_retweet`
    #                    are removed from the result.
    #
    # Returns:
    #   A data.frame with one row per retained tweet holding the metadata
    #   columns listed in `d.vars`, the ten NRC scores listed in `sent.vars`,
    #   and `neg_pct` / `pos_pct`: the negative / positive score divided by
    #   the number of cleaned words in the tweet (NA when it has no words).
    #
    # Stops with an error when no tweets are returned.
    twit.sentiment <- function(users, n.tweets=200, include.retweet=FALSE) {
        sent.vars <- c("anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust", "negative", "positive")
        d.vars <- c("user_id", "screen_name", "created_at", "retweet_count", "favorite_count", "followers_count", "friends_count", "text")
        pct.vars <- c("neg_pct", "pos_pct")

        d <- data.frame(get_timelines(users, n=n.tweets, parse=TRUE))
        if (nrow(d) == 0) {
            stop("no tweets were returned for the requested users", call. = FALSE)
        }

        # Very light text cleaning; clean.string() works on whole vectors.
        d$text_clean <- clean.string(d$text)

        # Count the cleaned words, one count per tweet.
        d$n_words <- vapply(
            d$text_clean,
            function(txt) as.numeric(wordcount(txt)),
            numeric(1),
            USE.NAMES = FALSE
        )

        # Do the sentiment analysis using nrc. In a real production sentiment
        # analysis, you would want to consider several different dictionaries.
        # Check out the following page for a walkthrough of some of the
        # different lexicons you might consider:
        # https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html
        #
        # get_nrc_sentiment() takes the whole character vector and returns one
        # row per element, so a single call replaces the per-tweet loop. The
        # columns are matched by name rather than by position.
        scores <- get_nrc_sentiment(d$text_clean)
        d[, sent.vars] <- scores[, sent.vars]

        # Share of positive / negative hits relative to the tweet's word
        # count. Tweets with no words get NA instead of NaN/Inf.
        has.words <- d$n_words > 0
        d$neg_pct <- ifelse(has.words, d$negative / d$n_words, NA_real_)
        d$pos_pct <- ifelse(has.words, d$positive / d$n_words, NA_real_)

        if (include.retweet) {
            keep <- rep(TRUE, nrow(d))
        } else {
            # %in% never yields NA, so a missing flag cannot create all-NA
            # rows; only tweets explicitly flagged as retweets are dropped.
            keep <- !(d$is_retweet %in% TRUE)
        }
        d[keep, c(d.vars, sent.vars, pct.vars)]
    }

    #####################
    # Explore the dictionaries, showing how different
    # words are coded
    #####################

    # Load two of the English dictionaries exposed by
    # get_sentiment_dictionary() so their entries can be compared.
    nrc <- get_sentiment_dictionary(language = "english", dictionary = "nrc")
    syuzhet <- get_sentiment_dictionary(language = "english", dictionary = "syuzhet")

    # Entries for "horrible" in each dictionary.
    nrc[nrc$word == "horrible", ]
    syuzhet[syuzhet$word == "horrible", ]

    # Entries for "disastrous" in each dictionary.
    nrc[nrc$word == "disastrous", ]
    syuzhet[syuzhet$word == "disastrous", ]

    #####################
    # Exploring sentiment analysis
    #####################

    # v1-v4: plainly worded positive (v1, v2) and negative (v3, v4) statements.
    v1 <- "Man, I am having the best day today. The sun is out and it is a beautiful day."
    v2 <- "So grateful to be part of this supportive community. This is an amazing place to work."
    v3 <- "What a horrible day. Not only is it storming, but I fell in the mud and broke my phone."
    v4 <- "Awful bosses and terrible co-workers. This is a ridiculously bad place to work."

    # v5-v8: harder sentences that rely on negation, indirect reference or
    # sarcasm rather than on plainly negative words.
    v5 <- "I am not having the best day today. The sun is not out and it is not a beautiful day."
    v6 <- "Some days are better than others. This is the latter."
    v7 <- "So, got my final back. Um, yeah. The professor sure knows how to give the gift of a great day."
    v8 <- "Great idea Olin…Make all the students swipe their cards just to get onto the 4th floor. Beautiful building that we can’t access."

    # Score each sentence (after the crude cleaning step) with the NRC lexicon.
    get_nrc_sentiment(clean.string(v1))
    get_nrc_sentiment(clean.string(v2))
    get_nrc_sentiment(clean.string(v3))
    get_nrc_sentiment(clean.string(v4))
    get_nrc_sentiment(clean.string(v5))
    get_nrc_sentiment(clean.string(v6))
    get_nrc_sentiment(clean.string(v7))
    get_nrc_sentiment(clean.string(v8))

    #####################
    # The first thing you need to do is create an app for your Twitter account.
    # You can find instructions here:
    # https://developer.twitter.com/en/docs/basics/apps/overview.html

    # Once you’ve created an app, then add the following information to this script
    #####################
    # Uncomment the four lines below and fill in the credentials of your app.
    # The call that follows fails until all four variables exist.
    # twitter_consumer_key <- "YOUR INFO HERE"
    # twitter_consumer_secret <- "YOUR INFO HERE"
    # twitter_access_token <- "YOUR INFO HERE"
    # twitter_access_secret <- "YOUR INFO HERE"

    # NOTE(review): setup_twitter_oauth() appears to come from twitteR, while
    # the tweets in this script are fetched with get_timelines(), which looks
    # like an rtweet function - confirm that rtweet is authorised as well.
    setup_twitter_oauth(
        consumer_key = twitter_consumer_key,
        consumer_secret = twitter_consumer_secret,
        access_token = twitter_access_token,
        access_secret = twitter_access_secret
    )

    #####################
    # Sample sentiment analysis on accounts where
    # we have strong priors about their sentiment
    #####################

    # Four accounts for which we hold strong priors about their sentiment.
    sad_happy <- c("sosadtoday", "angrymemorys", "gohappiest", "kindnessgirl")
    d.sh <- twit.sentiment(users = sad_happy, n.tweets = 200, include.retweet = FALSE)

    # Per-account distribution of the positive, then the negative, score.
    boxplot(positive ~ screen_name, data = d.sh, main = "positive", las = 2, cex.axis = 0.7)
    boxplot(negative ~ screen_name, data = d.sh, main = "negative", las = 2, cex.axis = 0.7)

    #####################
    # Illustrating the potential for looking at specific users and
    # comparing / contrasting individual employees’ sentiment
    #####################

    # Three groups of accounts to compare: individuals, deans, and schools.
    OlinPeeps <- c("DeanTaylorWashU", "sjmalter", "LamarPierce1", "OrgStratProf")
    BSchoolDeans <- c("DeanTaylorWashU", "scottderue")
    BSchools <- c("OlinBusiness", "Wharton")

    # Score up to 300 tweets per request for each group, retweets excluded.
    d.olin <- twit.sentiment(users = OlinPeeps, n.tweets = 300, include.retweet = FALSE)
    d.deans <- twit.sentiment(users = BSchoolDeans, n.tweets = 300, include.retweet = FALSE)
    d.schools <- twit.sentiment(users = BSchools, n.tweets = 300, include.retweet = FALSE)

    # Individuals.
    boxplot(positive ~ screen_name, data = d.olin, main = "positive", las = 2, cex.axis = 0.7)
    boxplot(negative ~ screen_name, data = d.olin, main = "negative", las = 2, cex.axis = 0.7)

    # Deans.
    boxplot(positive ~ screen_name, data = d.deans, main = "positive", las = 2, cex.axis = 0.7)
    boxplot(negative ~ screen_name, data = d.deans, main = "negative", las = 2, cex.axis = 0.7)

    # Schools.
    boxplot(positive ~ screen_name, data = d.schools, main = "positive", las = 2, cex.axis = 0.7)
    boxplot(negative ~ screen_name, data = d.schools, main = "negative", las = 2, cex.axis = 0.7)

    #####################
    # Illustrating the potential for looking at trends over time
    #####################
    olin.all <- c("DeanTaylorWashU", "sjmalter", "LamarPierce1", "OrgStratProf", "sethcarnahan", "peterboumgarden",
        "jrobmartin", "milbourn_todd", "danbentle", "wustlbusiness", "drpatsportsbiz", "analisaortiz", "krwools")

    d.lrg <- twit.sentiment(users = olin.all, n.tweets = 300, include.retweet = FALSE)

    # Calendar fields derived from the tweet timestamp. `woy` is the ISO-8601
    # week number (%V); it belongs with the ISO year (%G), not with `year`.
    d.lrg$date <- as.Date(d.lrg$created_at)
    d.lrg$year <- as.numeric(strftime(d.lrg$date, format = "%Y"))
    d.lrg$month <- as.numeric(strftime(d.lrg$date, format = "%m"))
    d.lrg$woy <- as.numeric(strftime(d.lrg$date, format = "%V"))

    # Mean positive / negative score for every (year, month) combination.
    o <- aggregate(d.lrg[, c("positive", "negative")], by = list(d.lrg$year, d.lrg$month), mean)
    names(o)[1:2] <- c("year", "month")

    # Solid lines are 2018, dashed lines are 2017; green is positive, red is
    # negative. The x axis is fixed to the twelve months so that the 2017
    # lines are not clipped to the months observed in 2018, and so that the
    # frame can still be drawn when 2018 has no data.
    plot(o[o$year == 2018, "month"], o[o$year == 2018, "positive"], type = "l", xlim = c(1, 12), ylim = c(0, 3),
        col = "dark green", lwd = 3, ylab = "sentiment", xlab = "month")
    lines(o[o$year == 2017, "month"], o[o$year == 2017, "positive"], type = "l", col = "dark green", lwd = 3, lty = 2)

    lines(o[o$year == 2018, "month"], o[o$year == 2018, "negative"], type = "l", col = "dark red", lwd = 3)
    lines(o[o$year == 2017, "month"], o[o$year == 2017, "negative"], type = "l", col = "dark red", lwd = 3, lty = 2)

    boxplot(positive ~ screen_name, data = d.lrg, cex.axis = .7, las = 2, main = "positive")
    boxplot(negative ~ screen_name, data = d.lrg, cex.axis = .7, las = 2, main = "negative")

    d.lrg$name <- as.factor(d.lrg$screen_name)

    # A ggplot object is only drawn when it is printed. Assigning it to `p`
    # suppresses auto-printing, so each plot is printed explicitly before
    # `p` is reused for the next one.
    p <- ggplot(d.lrg, aes(x = name, y = positive)) + geom_violin()
    print(p)
    p <- ggplot(d.lrg, aes(x = name, y = negative)) + geom_violin()
    print(p)

    # Inspect the most negative tweets.
    d.lrg[d.lrg$negative > 7, ]