For today’s exploration, I wanted to connect to my gmail account, pull messages, and do a quick sentiment analysis on the text. The focus of this code is pulling and transforming the data from gmail’s api — not doing a precise and polished sentiment analysis. I wanted to learn a bit about the gmail api and the gmailr package (which right now is pretty thin on documentation).
There is much potential with this. The api would make everything from sentiment analysis to network analysis on your own gmail account possible.
##########################################
# This script gives an example of how to connect
# to a personal gmail account, extract a set of messages
# and do a quick-and-dirty sentiment analysis on the
# body of the messages.
# NOTE: This is not a pure or clean analysis of this text data.
# For production, you would want to make sure to clean up the
# body of the text data (e.g., ensuring that you don't have duplicate
# messages that are appended at the bottom of replies).
#
# However, this should give you a place to start for making sense of your email.
##########################################
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## Setup
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
# Setup your environment, marking a particular working directory where you'd like
# to output files and loading libraries that you'll use
# syuzhet has a set of functions for doing sentiment analysis
library(syuzhet)
# ngram is useful for breaking up and parsing text data
library(ngram)
# reshape2 is also helpful for parsing text data
library(reshape2)
# dplyr::bind_rows() is used to combine a list of per-message results into one data frame
require(dplyr)
# gmailr has a set of functions for connecting to gmail and parsing emails
library(gmailr)
## User-defined function for doing a quick-and-dirty clean-up on text
# You could add elements to this to create an even more precise set of
# text data to parse for your sentiment analysis. For a production
# text analysis, you would want to create a clean set of data.
clean.string <- function(string){
  # Normalize raw email text for a rough, word-level sentiment pass.
  # Note: the character class below keeps letters and whitespace ONLY,
  # so digits as well as punctuation end up replaced by spaces.
  lowered <- tolower(string)
  # Replace every character that is not a letter or whitespace with a space
  # (you may want to keep more characters in your actual analyses).
  letters.only <- stringr::str_replace_all(lowered, "[^a-zA-Z\\s]", " ")
  # Collapse each run of whitespace down to a single space and return
  stringr::str_replace_all(letters.only, "[\\s]+", " ")
}
## User-defined function for pulling a set of messages from gmail
# and doing a sentiment analysis on those messages. This will also retain the actual
# body of the messages in case you want to do something further with it down
# the line. The only input into the function is a vector of message ids
# that you want to pull and process.
gmail.sentiment = function(ids) {
  ## Pull a set of gmail messages by id and run an NRC sentiment analysis
  ## on their bodies.
  ##
  ## Args:
  ##   ids: character vector of gmail message ids, as returned by
  ##        gmailr::id(messages(...), what = "message_id").
  ##
  ## Returns: a data.frame with one row per message containing the header
  ##   fields (id, to, from, cc, bcc, date, subject), the raw and cleaned
  ##   body text, a word count, the ten NRC sentiment counts, pos/neg
  ##   percentages, and several parsed date variables for graphing.
  if (length(ids) == 0) {
    stop("gmail.sentiment: 'ids' is empty; nothing to pull.", call. = FALSE)
  }
  # a vector of the sentiment variables produced by syuzhet::get_nrc_sentiment()
  sent.vars = c("anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust", "negative", "positive")
  # a vector of the email header vars
  email.vars = c("id", "to", "from", "cc", "bcc", "date", "subject", "body")
  # put together and also add the number of words in the body
  all.vars = c(email.vars, "n_words", sent.vars)
  # Replace NULL with NA so a missing header field doesn't drop out of the row.
  # NOTE: the previous ifelse(is.null(x), NA, x) silently truncated any
  # multi-element x to its first element (ifelse returns length(test) values);
  # a plain if/else keeps the whole value.
  null.to.na = function(x) {
    if (is.null(x)) NA else x
  }
  # Loop through the message ids and build one row per message. Rows are
  # collected in a pre-allocated list and bound once at the end, rather than
  # growing a data.frame with rbind() inside the loop (which is O(n^2)).
  rows = vector("list", length(ids))
  for (i in seq_along(ids)) {
    id = ids[i]
    # Use the explicit namespace: gmailr::message() masks base::message()
    msg = gmailr::message(id)
    # A multipart body comes back as a list of parts; collapse them into a
    # single string (previously only the first part survived null.to.na)
    body.parts = unlist(body(msg))
    body.text = if (is.null(body.parts)) NA else paste(body.parts, collapse = " ")
    rows[[i]] = data.frame(
      id = id,
      to = null.to.na(to(msg)),
      from = null.to.na(from(msg)),
      cc = null.to.na(cc(msg)),
      bcc = null.to.na(bcc(msg)),
      date = null.to.na(date(msg)),
      subject = null.to.na(subject(msg)),
      body = body.text,
      stringsAsFactors = FALSE
    )
  }
  res.out = dplyr::bind_rows(rows)
  # do a very light text cleaning (clean.string is vectorized via stringr)
  res.out$body_clean = clean.string(res.out$body)
  # count the clean words (ngram::wordcount is not vectorized)
  res.out$n_words = unlist(lapply(res.out$body_clean, wordcount))
  # Do the sentiment analysis using the NRC lexicon. In a real production
  # sentiment analysis, you would want to consider several different
  # dictionaries; see:
  # https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html
  res.out[,sent.vars] = bind_rows(lapply(res.out$body_clean, get_nrc_sentiment))
  # Get a percentage of pos/neg by number of words in the email
  res.out$neg_pct = res.out$negative/res.out$n_words
  res.out$pos_pct = res.out$positive/res.out$n_words
  # Parse the date header ("Dow, dd Mon yyyy hh:mm:ss +zzzz" style — TODO
  # confirm format against your own messages) into graphing variables
  res.out$dow = substr(res.out$date, 1, 3)
  res.out$date_time = substr(res.out$date, 6, nchar(res.out$date))
  o = colsplit(trimws(res.out$date_time), " ", names=c("day", "month", "year", "time", "offset"))
  d = cbind(res.out, o)
  d$date_time_format = as.Date(paste(d$month, " ", as.numeric(d$day), " ", as.numeric(d$year), sep=""), format="%b %d %Y")
  d$month_num = as.numeric(substr(d$date_time_format, 6,7))
  d$day_num = as.numeric(substr(d$date_time_format, 9,10))
  return(d)
}
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## Connect to gmail
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## Note, you will need to create your own application to connect to gmail
## Here are some steps for doing this:
## 1. Go to https://console.developers.google.com/
## 2. Create a new project
## 3. Copy-and-paste the Client ID and Client Secret into the fields below
## 4. Add an authorized redirect URI: http://localhost:1410/
## OAuth client credentials for the project created in the steps above.
## NOTE(review): never commit real credentials to source control; load them
## from environment variables or a gitignored config file instead.
client_id = "{INSERT YOUR ID HERE}"
client_secret = "{INSERT YOUR SECRET HERE}"
# Running this will open a web browser and ask you to authenticate.
# If you are already authenticated into gmail, it will just give you a
# confirmation message, indicating that you are authenticated. You can then
# close the browser and begin using gmail.
# NOTE: After a period of time, your authentication will time out. When you
# next pass a request to gmail, you'll get an error. Just re-run the line
# below and you'll re-authenticate.
# NOTE(review): gmail_auth() is the legacy gmailr entry point; newer gmailr
# releases use gm_auth_configure()/gm_auth() — confirm against the installed
# package version.
gmail_auth(scope="read_only", id=client_id, secret=client_secret)
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## Request a set of message ids that match a given query.
## Gmail supports many search operators (from:, to:, subject:, before:,
## after:, label:, etc.); any of them can be used in the search=" " argument.
## For the full set of search options, check out this page:
## https://support.google.com/mail/answer/7190?hl=en
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## For this example, pull all messages that I sent (i.e., those that gmail
## auto-labeled as SENT), within a particular time window and with a maximum
## of 10k messages.
msgs = messages(search="before:2019/01/01 after:2005/12/01", num_results = 10000, label_ids="SENT")
# The messages() call above returns an object with thread and message ids.
# The id() call below extracts a vector of message-id strings that can be
# used in subsequent pulls.
# Note that because gmail.sentiment() has to fetch each message one at a
# time, this can take some time to process. For something like 4000 messages,
# expect it to take several minutes to finish running.
# Be patient! It's not efficient code.
ids = gmailr::id(msgs, what="message_id")
o = gmail.sentiment(ids)
# Because this took so long to produce, write the results out to disk so
# they can be re-loaded later without re-pulling from the API.
write.table(o, "./gmail_text_analysis.csv", sep=",", row.names=F)
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
# At this point, you can use your favorite graphing and analysis tools
# to analyze this dataset at different levels of analysis (e.g., time, day, day of week, month, year)
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
# NOTE(review): everything from here down is a verbatim duplicate of the
# script above (likely a copy-paste error in the post); consider removing
# one of the two copies.
# This script gives an example of how to connect
# to a personal gmail account, extract a set of messages
# and do a quick-and-dirty sentiment analysis on the
# body of the messages.
# NOTE: This is not a pure or clean analysis of this text data.
# For production, you would want to make sure to clean up the
# body of the text data (e.g., ensuring that you don't have duplicate
# messages that are appended at the bottom of replies).
#
# However, this should give you a place to start for making sense of your email.
##########################################
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## Setup
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
# Setup your environment, marking a particular working directory where you'd like
# to output files and loading libraries that you'll use
# syuzhet has a set of functions for doing sentiment analysis
library(syuzhet)
# ngram is useful for breaking up and parsing text data
library(ngram)
# reshape2 is also helpful for parsing text data
library(reshape2)
# dplyr::bind_rows() is used to combine a list of per-message results into one data frame
require(dplyr)
# gmailr has a set of functions for connecting to gmail and parsing emails
library(gmailr)
## User-defined function for doing a quick-and-dirty clean-up on text
# You could add elements to this to create an even more precise set of
# text data to parse for your sentiment analysis. For a production
# text analysis, you would want to create a clean set of data.
clean.string <- function(string){
  # Normalize raw email text for a rough, word-level sentiment pass.
  # Note: the character class below keeps letters and whitespace ONLY,
  # so digits as well as punctuation end up replaced by spaces.
  lowered <- tolower(string)
  # Replace every character that is not a letter or whitespace with a space
  # (you may want to keep more characters in your actual analyses).
  letters.only <- stringr::str_replace_all(lowered, "[^a-zA-Z\\s]", " ")
  # Collapse each run of whitespace down to a single space and return
  stringr::str_replace_all(letters.only, "[\\s]+", " ")
}
## User-defined function for pulling a set of messages from gmail
# and doing a sentiment analysis on those messages. This will also retain the actual
# body of the messages in case you want to do something further with it down
# the line. The only input into the function is a vector of message ids
# that you want to pull and process.
gmail.sentiment = function(ids) {
  ## Pull a set of gmail messages by id and run an NRC sentiment analysis
  ## on their bodies.
  ##
  ## Args:
  ##   ids: character vector of gmail message ids, as returned by
  ##        gmailr::id(messages(...), what = "message_id").
  ##
  ## Returns: a data.frame with one row per message containing the header
  ##   fields (id, to, from, cc, bcc, date, subject), the raw and cleaned
  ##   body text, a word count, the ten NRC sentiment counts, pos/neg
  ##   percentages, and several parsed date variables for graphing.
  if (length(ids) == 0) {
    stop("gmail.sentiment: 'ids' is empty; nothing to pull.", call. = FALSE)
  }
  # a vector of the sentiment variables produced by syuzhet::get_nrc_sentiment()
  sent.vars = c("anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust", "negative", "positive")
  # a vector of the email header vars
  email.vars = c("id", "to", "from", "cc", "bcc", "date", "subject", "body")
  # put together and also add the number of words in the body
  all.vars = c(email.vars, "n_words", sent.vars)
  # Replace NULL with NA so a missing header field doesn't drop out of the row.
  # NOTE: the previous ifelse(is.null(x), NA, x) silently truncated any
  # multi-element x to its first element (ifelse returns length(test) values);
  # a plain if/else keeps the whole value.
  null.to.na = function(x) {
    if (is.null(x)) NA else x
  }
  # Loop through the message ids and build one row per message. Rows are
  # collected in a pre-allocated list and bound once at the end, rather than
  # growing a data.frame with rbind() inside the loop (which is O(n^2)).
  rows = vector("list", length(ids))
  for (i in seq_along(ids)) {
    id = ids[i]
    # Use the explicit namespace: gmailr::message() masks base::message()
    msg = gmailr::message(id)
    # A multipart body comes back as a list of parts; collapse them into a
    # single string (previously only the first part survived null.to.na)
    body.parts = unlist(body(msg))
    body.text = if (is.null(body.parts)) NA else paste(body.parts, collapse = " ")
    rows[[i]] = data.frame(
      id = id,
      to = null.to.na(to(msg)),
      from = null.to.na(from(msg)),
      cc = null.to.na(cc(msg)),
      bcc = null.to.na(bcc(msg)),
      date = null.to.na(date(msg)),
      subject = null.to.na(subject(msg)),
      body = body.text,
      stringsAsFactors = FALSE
    )
  }
  res.out = dplyr::bind_rows(rows)
  # do a very light text cleaning (clean.string is vectorized via stringr)
  res.out$body_clean = clean.string(res.out$body)
  # count the clean words (ngram::wordcount is not vectorized)
  res.out$n_words = unlist(lapply(res.out$body_clean, wordcount))
  # Do the sentiment analysis using the NRC lexicon. In a real production
  # sentiment analysis, you would want to consider several different
  # dictionaries; see:
  # https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html
  res.out[,sent.vars] = bind_rows(lapply(res.out$body_clean, get_nrc_sentiment))
  # Get a percentage of pos/neg by number of words in the email
  res.out$neg_pct = res.out$negative/res.out$n_words
  res.out$pos_pct = res.out$positive/res.out$n_words
  # Parse the date header ("Dow, dd Mon yyyy hh:mm:ss +zzzz" style — TODO
  # confirm format against your own messages) into graphing variables
  res.out$dow = substr(res.out$date, 1, 3)
  res.out$date_time = substr(res.out$date, 6, nchar(res.out$date))
  o = colsplit(trimws(res.out$date_time), " ", names=c("day", "month", "year", "time", "offset"))
  d = cbind(res.out, o)
  d$date_time_format = as.Date(paste(d$month, " ", as.numeric(d$day), " ", as.numeric(d$year), sep=""), format="%b %d %Y")
  d$month_num = as.numeric(substr(d$date_time_format, 6,7))
  d$day_num = as.numeric(substr(d$date_time_format, 9,10))
  return(d)
}
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## Connect to gmail
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## Note, you will need to create your own application to connect to gmail
## Here are some steps for doing this:
## 1. Go to https://console.developers.google.com/
## 2. Create a new project
## 3. Copy-and-paste the Client ID and Client Secret into the fields below
## 4. Add an authorized redirect URI: http://localhost:1410/
## OAuth client credentials for the project created in the steps above.
## NOTE(review): never commit real credentials to source control; load them
## from environment variables or a gitignored config file instead.
client_id = "{INSERT YOUR ID HERE}"
client_secret = "{INSERT YOUR SECRET HERE}"
# Running this will open a web browser and ask you to authenticate.
# If you are already authenticated into gmail, it will just give you a
# confirmation message, indicating that you are authenticated. You can then
# close the browser and begin using gmail.
# NOTE: After a period of time, your authentication will time out. When you
# next pass a request to gmail, you'll get an error. Just re-run the line
# below and you'll re-authenticate.
# NOTE(review): gmail_auth() is the legacy gmailr entry point; newer gmailr
# releases use gm_auth_configure()/gm_auth() — confirm against the installed
# package version.
gmail_auth(scope="read_only", id=client_id, secret=client_secret)
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## Request a set of message ids that match a given query.
## Gmail supports many search operators (from:, to:, subject:, before:,
## after:, label:, etc.); any of them can be used in the search=" " argument.
## For the full set of search options, check out this page:
## https://support.google.com/mail/answer/7190?hl=en
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
## For this example, pull all messages that I sent (i.e., those that gmail
## auto-labeled as SENT), within a particular time window and with a maximum
## of 10k messages.
msgs = messages(search="before:2019/01/01 after:2005/12/01", num_results = 10000, label_ids="SENT")
# The messages() call above returns an object with thread and message ids.
# The id() call below extracts a vector of message-id strings that can be
# used in subsequent pulls.
# Note that because gmail.sentiment() has to fetch each message one at a
# time, this can take some time to process. For something like 4000 messages,
# expect it to take several minutes to finish running.
# Be patient! It's not efficient code.
ids = gmailr::id(msgs, what="message_id")
o = gmail.sentiment(ids)
# Because this took so long to produce, write the results out to disk so
# they can be re-loaded later without re-pulling from the API.
write.table(o, "./gmail_text_analysis.csv", sep=",", row.names=F)
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####
# At this point, you can use your favorite graphing and analysis tools
# to analyze this dataset at different levels of analysis (e.g., time, day, day of week, month, year)
#### -- ## -- ## -- ## -- ## -- ## -- ## -- ## -- ####