# Global chunk options: hide messages and warnings in the rendered document
# and give every figure a 6 x 4.5 inch canvas.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  fig.width = 6,
  fig.height = 9 / 2
)
library(tidyverse)
library(rvest)
library(pdftools)
library(dplyr)
library(glue)
library(httr)The data for this analysis comes from publicly available collegiate news postings hosted through the Smith College website. In order to communicate efficiently with students, families, faculty members, and the community, the College frequently opts to publish “letters”, or briefings related to current events on campus or on topics that are otherwise relevant to the workings of the College. Some of these letters come directly from Smith College President Kathleen McCartney, while others do not specify authorship and simply communicate information. The letters range from 56 words in length to 783 words*. These letters represent valuable data because of their centrality to the Smith Community; the publication of a letter is typically accompanied by an email to students, faculty, staff, and alumnx who choose to receive notifications about the College. Because they are designed to communicate the Smith administration’s position on current events and situations at the College, these letters are an excellent resource for assessing sentiment in the highest levels of the Smith administrative hierarchy.
In order to collect this data, we employed a series of web-scraping techniques through R, primarily using the rvest and httr packages. Initial collection involved scraping all available links on the landing page for the community letters. These links are each associated with a unique letter, and were then individually accessed through the construction of a function designed to iterate over each URL and scrape the contents of the related webpage. Once the contents of each webpage were scraped, we were able to summarise each letter in one row of a dataframe, with a single column containing all of the letter’s text. With help from the tidytext package, designed for sentiment analysis and text parsing, we were then able to separate out each individual word from each letter into its own row, leaving us with an expansive data set explicitly showcasing the entire textual composition of each letter. In order to perform sentiment analysis, we proceeded to download and join data sets associated with three separate sentiment lexicons (NRC, Afinn, and Bing) to our letters data set, producing additional dataframes wherein individual words are identified with their lexicon-specific sentiment rating. These data sets can be manipulated, summarized, and plotted as any dataframe in R, and their content can be used to assess a variety of sentiment constructs within the letters.
# Scrape urls from main webpage with all the letters
url <- read_html("https://www.smith.edu/president-kathleen-mccartney/letters-community")
# Read the href attribute of every letter anchor directly. Parsing the
# serialized <a ...> tag with regular expressions only works when href is the
# first attribute and leaves HTML entities (e.g. &amp;) encoded.
# Anchors without an href yield NA here.
links <- url %>%
  html_nodes(".field-item") %>%
  html_nodes("li") %>%
  html_nodes("a") %>%
  html_attr("href")
# Remove main domain to have clean data- most are missing the domain url.
# fixed() matches the domain literally, so the dots are not regex wildcards.
links_clean <- str_replace(links, fixed("https://www.smith.edu"), "")
# Add main domain page to each url (one entry per anchor, in page order)
links_url <- paste0("https://www.smith.edu", links_clean)
# # Testing purposes (less data)
# years = c("2019-20")
# Only include urls from 2017-18, 2018-19, and 2019-20
years <- c("2019-20", "2018-19", "2017-18")
# Build one alternation pattern ("2019-20|2018-19|2017-18") and keep every
# link whose URL contains at least one of the academic-year strings.
year_pattern <- paste0(years, collapse = "|")
links_years <- links_url[grepl(year_pattern, links_url)]
# Check if each of the urls exists
urls_exist <- c()

# Return a URL only when requesting it answers with HTTP status 200.
#
# Arguments:
#   links_years  A single URL to request.
#   urls_exist   Values to prepend to the URL in the returned vector (the
#                caller passes an empty vector).
# Returns:
#   c(urls_exist, links_years) when the status code is 200, otherwise NULL.
#   A request that fails outright (e.g. connection error) is reported with a
#   warning and treated as "does not exist" instead of aborting the scrape.
check_exists <- function(links_years, urls_exist) {
  response <- tryCatch(
    # NOTE(review): `body = FALSE` is carried over from the original call;
    # confirm whether it has any effect on a GET request.
    GET(links_years, body = FALSE),
    error = function(e) {
      warning(
        "Request failed for ", links_years, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  # Use the named accessor rather than positional indexing into the response.
  if (is.null(response) || status_code(response) != 200) {
    return(NULL)
  }
  c(urls_exist, links_years)
}
# Only include urls that are valid/exist: check_exists() yields NULL for a
# rejected URL, and unlist() drops those NULL entries.
working_urls <- lapply(
  links_years,
  function(candidate) check_exists(candidate, urls_exist)
)
links_actual <- unlist(working_urls)
# Scrape title of each letter
# Download and parse every letter page once. The title and the body text are
# both read from the same parsed document, so each page is requested a single
# time instead of once per field.
letter_pages <- lapply(links_actual, read_html)

# Title: text of the first ".page-title" node of each page.
# sub() removes only the first newline in the text.
title <- lapply(
  letter_pages,
  function(page) {
    page %>%
      html_node(".page-title") %>%
      html_text() %>%
      sub(pattern = "\n", replacement = "")
  }
)
# Scrape content of each letter: text of the first ".field-item.even" node
# inside ".left-column-text", again with the first newline removed.
content <- lapply(
  letter_pages,
  function(page) {
    page %>%
      html_node(".left-column-text") %>%
      html_node(".field-item.even") %>%
      html_text() %>%
      sub(pattern = "\n", replacement = "")
  }
)
# Get titles of each letter from the main webpage.
# Walk the landing page step by step: field items -> list items -> anchors,
# then take the visible text of every anchor.
field_items <- html_nodes(url, ".field-item")
list_items <- html_nodes(field_items, "li")
letter_anchors <- html_nodes(list_items, "a")
titles_dates <- html_text(letter_anchors)
# Flatten the anchor texts (title plus date) into a plain character vector
title_dates <- unlist(titles_dates)
# Date format to search for: capitalised month name, day, four-digit year
# (e.g. "March 5, 2019")
date_format <- "[A-Z]{1}[a-z]{1,10} [0-9]{1,2}, [0-9]{4}"
# Extract date using above format from titles. str_extract() is vectorised,
# so it returns one entry per title (NA where no date is found) without an
# explicit lapply()/unlist() round trip.
date <- str_extract(string = title_dates, pattern = date_format)
# Unlist links: links_url holds one URL per landing-page anchor
link <- unlist(links_url, use.names = FALSE)
# Create data frame with extracted dates and links. data.frame() keeps both
# columns as character on every R version and refuses incompatible lengths,
# whereas cbind() builds a character matrix and recycles on a mismatch.
df_dates <- data.frame(date = date, link = link, stringsAsFactors = FALSE)
# Unlist extracted text from each category
# Flatten the per-letter results into plain vectors (one entry per letter)
title <- unlist(title, use.names = FALSE)
content <- unlist(content, use.names = FALSE)
link <- unlist(links_actual, use.names = FALSE)
# Combine data into a data frame with titles, contents, and links.
# data.frame() keeps the columns as character on every R version and stops on
# incompatible lengths instead of recycling through a cbind() matrix.
df_content <- data.frame(
  title = title,
  content = content,
  link = link,
  stringsAsFactors = FALSE
)
# Join the content data frame with the dates data frame; only letters whose
# link appears in both are kept
df_full <- inner_join(df_content, df_dates, by = "link")
# Change data types: the three text columns become character vectors and the
# date column is parsed from "Month day, year" strings into Date values.
text_columns <- c("link", "title", "content")
df_full[text_columns] <- lapply(df_full[text_columns], as.character)
df_full$date <- as.Date(as.character(df_full$date), format = "%B %d, %Y")
# Add a running row number as the first column, named "ID"
df_full <- tibble::rowid_to_column(df_full, "ID")The table below is a small sample of the complete data set with the title of each community letter, the link, the date it was published, each word within the letter, and the word position in each letter.
# Transform to tidy text data (one row per word) and remove words such as
# "of, the, to, in"
library(tidytext)
df_tidy <- df_full %>%
  unnest_tokens(word, content) %>%
  # Name the join key explicitly instead of relying on an implicit natural
  # join over whatever columns happen to be shared.
  anti_join(stop_words, by = "word") %>%
  # Number the remaining words within each letter (stop words are already
  # removed, so positions count kept words only).
  group_by(ID) %>%
  mutate(word_number = row_number()) %>%
  ungroup()
head(df_tidy, n = 10)