packages.used=c("rvest", "tibble",
"sentimentr", "gplots", "dplyr",
"tm", "syuzhet", "factoextra",
"beeswarm", "scales", "RColorBrewer",
"RANN", "topicmodels", "stringr")
# check packages that need to be installed.
packages.needed=setdiff(packages.used,
intersect(installed.packages()[,1],
packages.used))
# install additional packages
if(length(packages.needed)>0){
install.packages(packages.needed, dependencies = TRUE)
}
# load packages
library("rvest")
library("tibble")
library("syuzhet")
library("sentimentr")
library("gplots")
library("dplyr")
library("tm")
library("syuzhet")
library("factoextra")
library("beeswarm")
library("scales")
library("RColorBrewer")
library("RANN")
library("tm")
library("topicmodels")
library("stringr")
source("../lib/plotstacked.R")
source("../lib/speechFuncs.R")
This notebook was prepared with the following environment settings.
print(R.version)
               _
platform       x86_64-apple-darwin15.6.0
arch           x86_64
os             darwin15.6.0
system         x86_64, darwin15.6.0
status
major          3
minor          6.0
year           2019
month          04
day            26
svn rev        76424
language       R
version.string R version 3.6.0 (2019-04-26)
nickname       Planting of a Tree
Following the example of Jerid Francom, we used SelectorGadget to choose the links we would like to scrape. For this project, we selected all inaugural addresses of past presidents, nomination speeches of major party candidates, and farewell addresses. We also included several public speeches by Donald Trump for our textual analysis of presidential speeches.
#### Inaugural speeches
main.page <- read_html(x = "http://www.presidency.ucsb.edu/inaugurals.php")
# Get link URLs
# f.speechlinks is a function for extracting links from the list of speeches.
inaug=f.speechlinks(main.page)
#head(inaug)
as.Date(inaug[,1], format="%B %e, %Y")
[1] "1789-04-30" "1793-03-04" "1797-03-04" "1801-03-04" "1805-03-04" "1809-03-04" "1813-03-04" "1817-03-04" "1821-03-04" "1825-03-04"
[11] "1829-03-04" "1833-03-04" "1837-03-04" "1841-03-04" "1845-03-04" "1849-03-05" "1853-03-04" "1857-03-04" "1861-03-04" "1865-03-04"
[21] "1869-03-04" "1873-03-04" "1877-03-05" "1881-03-04" "1885-03-04" "1889-03-04" "1893-03-04" "1897-03-04" "1901-03-04" "1905-03-04"
[31] "1909-03-04" "1913-03-04" "1917-03-04" "1921-03-04" "1925-03-04" "1929-03-04" "1933-03-04" "1937-01-20" "1941-01-20" "1945-01-20"
[41] "1949-01-20" "1953-01-20" "1957-01-21" "1961-01-20" "1965-01-20" "1969-01-20" "1973-01-20" "1977-01-20" "1981-01-20" "1985-01-21"
[51] "1989-01-20" "1993-01-20" "1997-01-20" "2001-01-20" "2005-01-20" "2009-01-20" "2013-01-21" "2017-01-20" NA
inaug=inaug[-nrow(inaug),] # remove the last row, whose date failed to parse (NA).
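f.speechlinks itself is defined in ../lib/speechFuncs.R. As a rough sketch only (not the actual implementation; the CSS selector here is an assumption of the kind SelectorGadget produces), such a link extractor built on rvest looks like this:
# hypothetical sketch of a speech-link extractor; the real f.speechlinks
# lives in ../lib/speechFuncs.R and its node selector may differ
f.speechlinks.sketch <- function(html.page, node="table a"){
  anchors <- html_nodes(html.page, node)       # anchor tags in the speech table
  data.frame(links=html_text(anchors),         # link text (here, the dates)
             urls=html_attr(anchors, "href"),  # transcript URLs
             stringsAsFactors=FALSE)
}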
#### Nomination speeches
main.page=read_html("http://www.presidency.ucsb.edu/nomination.php")
# Get link URLs
nomin <- f.speechlinks(main.page)
#head(nomin)
#
#### Farewell speeches
main.page=read_html("http://www.presidency.ucsb.edu/farewell_addresses.php")
# Get link URLs
farewell <- f.speechlinks(main.page)
#head(farewell)
inaug.list=read.csv("../data/inauglist.csv", stringsAsFactors = FALSE)
nomin.list=read.csv("../data/nominlist.csv", stringsAsFactors = FALSE)
farewell.list=read.csv("../data/farewelllist.csv", stringsAsFactors = FALSE)
We assemble all scraped speeches into one list. Note here that we don’t have the full text yet, only the links to full text transcripts.
speech.list=rbind(inaug.list, nomin.list, farewell.list)
speech.list$type=c(rep("inaug", nrow(inaug.list)),
rep("nomin", nrow(nomin.list)),
rep("farewell", nrow(farewell.list)))
# speech.url=rbind(inaug, nomin, farewell)
# speech.list=cbind(speech.list, speech.url[-126,])
Based on the list of speeches, we scrape the main text part of each transcript’s html page. For simple html pages of this kind, SelectorGadget is very convenient for identifying the html node that rvest
can use to scrape its content. For reproducibility, we also save the scraped speeches into our local folder as individual speech files.
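The scraping loop itself is not shown here; a minimal sketch of the step just described, assuming the speech.url table has been attached so that speech.list$urls holds the transcript links, and that ".displaytext" is the transcript node SelectorGadget identifies on these pages (both are assumptions):
# sketch of the scraping step described above (assumes speech.list$urls
# exists and that ".displaytext" selects the transcript body)
for(i in seq_len(nrow(speech.list))){
  text <- read_html(speech.list$urls[i]) %>%
    html_nodes(".displaytext") %>%   # main transcript body
    html_text() %>%
    paste(collapse=" ")              # collapse in case of multiple nodes
  speech.list$fulltext[i] <- text
  # save a local copy of each speech for reproducibility
  writeLines(text, paste0("../data/fulltext/", speech.list$type[i],
                          speech.list$File[i], speech.list$Term[i], ".txt"))
}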
Trump, as a president-elect who had not previously been a politician, does not have many formal speeches yet. For our textual analysis, we manually add several public transcripts from Trump:

+ [Transcript: Donald Trump’s full immigration speech, annotated. LA Times, 08/31/2016](http://www.latimes.com/politics/la-na-pol-donald-trump-immigration-speech-transcript-20160831-snap-htmlstory.html)
+ Transcript of Donald Trump’s speech on national security in Philadelphia - The Hill, 09/07/16
+ Transcript of President-elect Trump’s news conference - CNBC, 01/11/2017
speech1=paste(readLines("../data/fulltext/SpeechDonaldTrump-NA.txt",
n=-1, skipNul=TRUE),
collapse=" ")
speech2=paste(readLines("../data/fulltext/SpeechDonaldTrump-NA2.txt",
n=-1, skipNul=TRUE),
collapse=" ")
speech3=paste(readLines("../data/fulltext/PressDonaldTrump-NA.txt",
n=-1, skipNul=TRUE),
collapse=" ")
Trump.speeches=data.frame(
President=rep("Donald J. Trump", 3),
File=rep("DonaldJTrump", 3),
Term=rep(0, 3),
Party=rep("Republican", 3),
Date=c("August 31, 2016", "September 7, 2016", "January 11, 2017"),
Words=c(f.word_count(speech1), f.word_count(speech2), f.word_count(speech3)),
Win=rep("yes", 3),
type=rep("speeches", 3),
#links=rep(NA, 3),
#urls=rep(NA, 3),
fulltext=c(speech1, speech2, speech3)
)
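Before binding these rows onto speech.list, note that rbind() requires both data frames to share the same column names; a quick guard (our own sanity check, not part of the original pipeline) can make that explicit:
# sanity check: the manually built Trump rows must have exactly the
# columns of speech.list for rbind() to succeed
stopifnot(setequal(names(speech.list), names(Trump.speeches)))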
speech.list=rbind(speech.list, Trump.speeches)
We will use sentences as units of analysis for this project, as sentences are natural language units for organizing thoughts and ideas. For each extracted sentence, we apply sentiment analysis using the NRC sentiment lexicon. “The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). The annotations were manually done by crowdsourcing.”
We assign a sequential id to each sentence in a speech (sent.id) and also calculate the number of words in each sentence as sentence length (word.count).
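As a quick sanity check of this scoring (the example sentence is made up), sentimentr::emotion() returns one row per NRC emotion type, with a raw count and a length-normalized score:
# illustrate emotion() on a single made-up sentence
ex <- emotion("We will rebuild this great nation together.")
as.data.frame(ex)[, c("emotion_type", "emotion_count", "emotion")]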
sentence.list=NULL
for(i in 1:nrow(speech.list)){
sentences=syuzhet::get_sentences(speech.list$fulltext[i])
if(length(sentences)>0){
# reshape emotion() output into one row per sentence, one column per emotion type
emotions=matrix(emotion(sentences)$emotion,
                nrow=length(sentences),
                byrow=T)
colnames(emotions)=emotion(sentences[1])$emotion_type
emotions=data.frame(emotions)
emotions=select(emotions,
anticipation,
joy,
surprise,
trust,
anger,
disgust,
fear,
sadness)
word.count=f.word_count(sentences)
# colnames(emotions)=paste0("emo.", colnames(emotions))
# in case the word counts are zeros?
# emotions=diag(1/(word.count+0.01))%*%as.matrix(emotions)
sentence.list=rbind(sentence.list,
cbind(speech.list[i,-ncol(speech.list)],
sentences=as.character(sentences),
word.count,
emotions,
sent.id=1:length(sentences)
)
)
}
}
names(sentence.list)
[1] "President" "File" "Term" "Party" "Date" "Words" "Win"
[8] "type" "sentences" "word.count" "anticipation" "joy" "surprise" "trust"
[15] "anger" "disgust" "fear" "sadness" "sent.id"
Some non-sentences exist in the raw data due to erroneous extra end-of-sentence marks.
sentence.list=
sentence.list%>%
filter(!is.na(word.count))
For simpler visualization, we chose a subset of better-known presidents and presidential candidates on which to focus our analysis.
sel.comparison=c("DonaldJTrump","JohnMcCain", "GeorgeBush", "MittRomney", "GeorgeWBush",
"RonaldReagan","AlbertGore,Jr", "HillaryClinton","JohnFKerry",
"WilliamJClinton","HarrySTruman", "BarackObama", "LyndonBJohnson",
"GeraldRFord", "JimmyCarter", "DwightDEisenhower", "FranklinDRoosevelt",
"HerbertHoover","JohnFKennedy","RichardNixon","WoodrowWilson",
"AbrahamLincoln", "TheodoreRoosevelt", "JamesGarfield",
"JohnQuincyAdams", "UlyssesSGrant", "ThomasJefferson",
"GeorgeWashington", "WilliamHowardTaft", "AndrewJackson",
"WilliamHenryHarrison", "JohnAdams")
First, we look at nomination acceptance speeches at the major parties’ national conventions. For relevance to Trump’s speeches, we limit our attention to speeches for the first terms of former U.S. presidents. We noticed that a number of presidents use very short sentences in their nomination acceptance speeches.
par(mar=c(4, 11, 2, 2))
#sel.comparison=levels(sentence.list$FileOrdered)
sentence.list.sel=filter(sentence.list,
type=="nomin", Term==1, File%in%sel.comparison)
sentence.list.sel$File=factor(sentence.list.sel$File)
sentence.list.sel$FileOrdered=reorder(sentence.list.sel$File,
sentence.list.sel$word.count,
mean,
order=T)
beeswarm(word.count~FileOrdered,
data=sentence.list.sel,
horizontal = TRUE,
pch=16, col=alpha(brewer.pal(9, "Set1"), 0.6),
cex=0.55, cex.axis=0.8, cex.lab=0.8,
spacing=5/nlevels(sentence.list.sel$FileOrdered),
las=2, xlab="Number of words in a sentence.", ylab="",
main="Nomination speeches")
par(mar=c(4, 11, 2, 2))
#sel.comparison=levels(sentence.list$FileOrdered)
sentence.list.sel=filter(sentence.list,
type=="nomin", Term==2, File%in%sel.comparison)
sentence.list.sel$File=factor(sentence.list.sel$File)
sentence.list.sel$FileOrdered=reorder(sentence.list.sel$File,
sentence.list.sel$word.count,
mean,
order=T)
beeswarm(word.count~FileOrdered,
data=sentence.list.sel,
horizontal = TRUE,
pch=16, col=alpha(brewer.pal(9, "Set1"), 0.6),
cex=0.55, cex.axis=0.8, cex.lab=0.8,
spacing=1.2/nlevels(sentence.list.sel$FileOrdered),
las=2, xlab="Number of words in a sentence.", ylab="",
main="Nomination speeches, 2nd term")
What are these short sentences?
sentence.list%>%
filter(File=="DonaldJTrump",
type=="nomin",
word.count<=3)%>%
select(sentences)%>%sample_n(10)
sentence.list%>%
filter(File=="AlbertGore,Jr",
type=="nomin",
word.count<=3)%>%
select(sentences)%>%sample_n(10)
sentence.list%>%
filter(File=="HillaryClinton",
type=="nomin",
word.count<=3)%>%
select(sentences)
sentence.list%>%
filter(File=="WilliamJClinton",
type=="nomin", Term==1,
word.count<=3)%>%
select(sentences)
We notice that the sentences in inaugural speeches are longer than those in nomination acceptance speeches.
sentence.list.sel=sentence.list%>%filter(type=="inaug", File%in%sel.comparison, Term==1)
sentence.list.sel$File=factor(sentence.list.sel$File)
sentence.list.sel$FileOrdered=reorder(sentence.list.sel$File,
sentence.list.sel$word.count,
mean,
order=T)
par(mar=c(4, 11, 2, 2))
beeswarm(word.count~FileOrdered,
data=sentence.list.sel,
horizontal = TRUE,
pch=16, col=alpha(brewer.pal(9, "Set1"), 0.6),
cex=0.55, cex.axis=0.8, cex.lab=0.8,
spacing=5/nlevels(sentence.list.sel$FileOrdered),
las=2, ylab="", xlab="Number of words in a sentence.",
main="Inaugural Speeches")
Short sentences in inaugural speeches.
sentence.list%>%
filter(File=="BarackObama",
type=="inaug",
word.count<=3)%>%
select(sentences)
Next we look at how our presidents (or candidates) alternate between long and short sentences and how they shift between different sentiments in their speeches. It is interesting to note that some presidential candidates’ speeches are more colorful than others. Here we used the same color theme as in the movie “Inside Out.”
par(mfrow=c(4,1), mar=c(1,0,2,0), bty="n", xaxt="n", yaxt="n", font.main=1)
f.plotsent.len(In.list=sentence.list, InFile="HillaryClinton",
InType="nomin", InTerm=1, President="Hillary Clinton")
f.plotsent.len(In.list=sentence.list, InFile="DonaldJTrump",
InType="nomin", InTerm=1, President="Donald Trump")
f.plotsent.len(In.list=sentence.list, InFile="BarackObama",
InType="nomin", InTerm=1, President="Barack Obama")
f.plotsent.len(In.list=sentence.list, InFile="GeorgeWBush",
InType="nomin", InTerm=1, President="George W. Bush")
emotions.types=c("anticipation", "joy", "surprise", "trust",
"anger", "disgust", "fear", "sadness")
print("Hillary Clinton")
[1] "Hillary Clinton"
speech.df=tbl_df(sentence.list)%>%
filter(File=="HillaryClinton", type=="nomin", word.count>=4)%>%
select(sentences, anticipation:sadness)
speech.df=as.data.frame(speech.df)
as.character(speech.df$sentences[apply(speech.df[,-1], 2, which.max)])
[1] "It's a big deal."
[2] "So it's true."
[3] "It's a big deal."
[4] "So it's true."
[5] "Some of you are frustrated, even furious."
[6] "Powerful forces are threatening to pull us apart."
[7] "Powerful forces are threatening to pull us apart."
[8] "My mother, Dorothy, was abandoned by her parents as a young girl."
print("Barack Obama")
[1] "Barack Obama"
speech.df=tbl_df(sentence.list)%>%
filter(File=="BarackObama", type=="nomin", Term==1, word.count>=5)%>%
select(sentences, anticipation:sadness)
speech.df=as.data.frame(speech.df)
as.character(speech.df$sentences[apply(speech.df[,-1], 2, which.max)])
[1] "Ill invest in early childhood education."
[2] "They couldve heard words of anger and discord."
[3] "Thats not the judgment we need."
[4] "It should help us, not hurt us."
[5] "Thank you, God Bless you, and and God Bless the United States of America."
[6] "Ill invest in early childhood education."
[7] "That promise is our greatest inheritance."
[8] "Now let there be no doubt."
print("George W Bush")
[1] "George W Bush"
speech.df=tbl_df(sentence.list)%>%
filter(File=="GeorgeWBush", type=="nomin", Term==1, word.count>=4)%>%
select(sentences, anticipation:sadness)
speech.df=as.data.frame(speech.df)
as.character(speech.df$sentences[apply(speech.df[,-1], 2, which.max)])
[1] "[applause]The wait has been long, but it wont be long now."
[2] "So much promise to no great purpose."
[3] "The surplus is not the government's money; the surplus is the people's money."
[4] "On the other side of that wall are poverty and prison, addiction and despair."
[5] "I dont have enemies to fight."
[6] "Big government is not the answer, but the alternative to bureaucracy is not indifference."
[7] "Government cannot do this work."
[8] "And one replied, \"Were not worried, General."
print("Donald Trump")
[1] "Donald Trump"
speech.df=tbl_df(sentence.list)%>%
filter(File=="DonaldJTrump", type=="nomin", Term==1, word.count>=5)%>%
select(sentences, anticipation:sadness)
speech.df=as.data.frame(speech.df)
as.character(speech.df$sentences[apply(speech.df[,-1], 2, which.max)])
[1] "Not going to happen anymore."
[2] "These families have no special interests to represent them."
[3] "This time, the terrorist targeted LGBTQ community – no good and were going to stop it."
[4] "Once again, France is the victim of brutal Islamic terrorism."
[5] "My opponent will never meet with them, or share in their pain, believe me."
[6] "My opponent will never meet with them, or share in their pain, believe me."
[7] "I have joined the political arena so that the powerful can no longer beat up on people that cannot defend themselves."
[8] "And they are forgotten, but theyre not going to be forgotten long."
heatmap.2(cor(sentence.list%>%filter(type=="inaug")%>%select(anticipation:sadness)),
scale = "none",
col = bluered(100), margins=c(6, 6), key=F,
trace = "none", density.info = "none")
par(mar=c(4, 6, 2, 1))
# proportion of sentences in which each emotion is detected (score > 0.01)
emo.means=colMeans(select(sentence.list, anticipation:sadness)>0.01)
col.use=c("darkgoldenrod1", "darkgoldenrod1", "darkgoldenrod1", "darkgoldenrod1",
"red2", "chartreuse3", "blueviolet","dodgerblue3")
barplot(emo.means[order(emo.means)], las=2, col=col.use[order(emo.means)], horiz=T, main="Inaugural Speeches")
presid.summary=tbl_df(sentence.list)%>%
filter(type=="nomin", File%in%sel.comparison)%>%
#group_by(paste0(type, File))%>%
group_by(File)%>%
summarise(
anger=mean(anger),
anticipation=mean(anticipation),
disgust=mean(disgust),
fear=mean(fear),
joy=mean(joy),
sadness=mean(sadness),
surprise=mean(surprise),
trust=mean(trust)
#negative=mean(negative),
#positive=mean(positive)
)
presid.summary=as.data.frame(presid.summary)
rownames(presid.summary)=as.character((presid.summary[,1]))
km.res=kmeans(presid.summary[,-1], centers=5, iter.max=200)
fviz_cluster(km.res,
stand=F, repel= TRUE,
data = presid.summary[,-1], xlab="", xaxt="n",
show.clust.cent=FALSE)
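kmeans() starts from random centers, so the cluster assignments (and hence this plot) can change between runs; fixing a seed beforehand makes the result reproducible (the seed value below is arbitrary, our addition):
set.seed(123)  # arbitrary fixed seed for reproducible clustering
km.res=kmeans(presid.summary[,-1], centers=5, iter.max=200)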
For topic modeling, we prepare a corpus of sentence snippets as follows. For each speech, we start with the sentences and prepare a snippet comprising each sentence together with its flanking sentences.
corpus.list=sentence.list[2:(nrow(sentence.list)-1), ]
sentence.pre=sentence.list$sentences[1:(nrow(sentence.list)-2)]
sentence.post=sentence.list$sentences[3:nrow(sentence.list)]
corpus.list$snipets=paste(sentence.pre, corpus.list$sentences, sentence.post, sep=" ")
rm.rows=(1:nrow(corpus.list))[corpus.list$sent.id==1]
rm.rows=c(rm.rows, rm.rows-1)
corpus.list=corpus.list[-rm.rows, ]
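As a toy check of the construction above (a hypothetical four-sentence speech), each retained sentence is glued to its immediate neighbors:
# toy example: four sentences from one speech
s <- c("s1", "s2", "s3", "s4")
paste(s[1:2], s[2:3], s[3:4])  # yields "s1 s2 s3" and "s2 s3 s4"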
docs <- Corpus(VectorSource(corpus.list$snipets))
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
The joy and moral stimulation of work no longer must be forgotten in the mad chase of evanescent profits. These dark days will be worth all they cost us if they teach us that our true destiny is not to be ministered unto but to minister to ourselves and to our fellow men. Recognition of the falsity of material wealth as the standard of success goes hand in hand with the abandonment of the false belief that public office and high political position are to be valued only by the standards of pride of place and personal profit; and there must be an end to a conduct in banking and in business which too often has given to a sacred trust the likeness of callous and selfish wrongdoing.
Adapted from https://eight2late.wordpress.com/2015/09/29/a-gentle-introduction-to-topic-modeling-using-r/.
#remove potentially problematic symbols
docs <-tm_map(docs,content_transformer(tolower))
transformation drops documents
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
either of them are intolerable, and they are not the only ways out.now, our objective must be a sane solution, not a blind leap back to old evils. moreover, a step backwards would result in a chaos of new evils not yet experienced, because the local systems of prohibition and controls which were developed over generations have been in a large degree abandoned under this amendment. the republican platform recommends submission of the question to the states and that the people themselves may determine whether they desire a change, but insists that this submission shall propose a constructive and not a destructive change.
#remove punctuation
docs <- tm_map(docs, removePunctuation)
transformation drops documents
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
we the citizens of america are now joined in a great national effort to rebuild our country and restore its promise for all of our people together we will determine the course of america and the world for many many years to come we will face challenges we will confront hardships but we will get the job done
#Strip digits
docs <- tm_map(docs, removeNumbers)
transformation drops documents
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
we offer proven workable answers our opponents began this campaign hoping that america has a poor memory well lets take them on a little stroll down memory lane
#remove stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
transformation drops documents
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
father grew small community named possum hollow middle tennessee just went work teacher oneroom school great depression came along taught lesson couldnt found classroom
#remove whitespace
docs <- tm_map(docs, stripWhitespace)
transformation drops documents
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
party lincoln roosevelt reagan going get back basics applausein country believe everyone something contribute deserves opportunity reach godgiven potential boy whose descendants arrived mayflower latina daughter migrant workers gods children americans
#Stem document
docs <- tm_map(docs,stemDocument)
transformation drops documents
writeLines(as.character(docs[[sample(1:nrow(corpus.list), 1)]]))
mani citizen engag commerc navig certain degre depend prosper state mani engag fisheri interest expos invas war power disregard faith admonit experi expect
Generate the document-term matrix.
dtm <- DocumentTermMatrix(docs)
rowTotals <- apply(dtm, 1, sum)             # total word count per snippet
dtm <- dtm[rowTotals > 0, ]                 # drop empty documents
corpus.list <- corpus.list[rowTotals > 0, ] # keep corpus.list aligned with dtm
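Before fitting LDA, it is worth confirming the matrix dimensions (snippets by vocabulary terms), for example:
dim(dtm)                  # number of snippets, vocabulary size
# inspect(dtm[1:3, 1:6])  # optionally peek at a corner of the matrix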
Run LDA
#Set parameters for Gibbs sampling
burnin <- 4000    # iterations discarded before sampling
iter <- 2000      # sampling iterations after burn-in
thin <- 500       # keep every 500th draw
seed <-list(2003,5,63,100001,765)  # one seed per restart
nstart <- 5       # number of independent restarts
best <- TRUE      # keep only the best-fitting restart
#Number of topics
k <- 15
#Run LDA using Gibbs sampling
ldaOut <-LDA(dtm, k, method="Gibbs", control=list(nstart=nstart,
seed = seed, best=best,
burnin = burnin, iter = iter,
thin=thin))
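A quick post-fit diagnostic (not part of the original notebook) is the fitted model's log-likelihood, which topicmodels exposes via logLik() and which can be compared across choices of k:
logLik(ldaOut)  # higher (less negative) is better when comparing models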
#write out results
#docs to topics
ldaOut.topics <- as.matrix(topics(ldaOut))
table(c(1:k, ldaOut.topics)) # documents per topic (1:k prepended so every topic appears; each count is inflated by 1)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
994 1665 2134 1273 1769 1035 1073 686 1217 1470 1377 1126 900 1160 1377
write.csv(ldaOut.topics,file=paste("../out/LDAGibbs",k,"DocsToTopics.csv"))
#top 6 terms in each topic
ldaOut.terms <- as.matrix(terms(ldaOut,20))
write.csv(ldaOut.terms,file=paste("../out/LDAGibbs",k,"TopicsToTerms.csv"))
#probabilities associated with each topic assignment
topicProbabilities <- as.data.frame(ldaOut@gamma)
write.csv(topicProbabilities,file=paste("../out/LDAGibbs",k,"TopicProbabilities.csv"))
terms.beta=ldaOut@beta       # log term weights: one row per topic
terms.beta=scale(terms.beta) # standardize each term across topics to surface salient terms
topics.terms=NULL
for(i in 1:k){
topics.terms=rbind(topics.terms, ldaOut@terms[order(terms.beta[i,], decreasing = TRUE)[1:7]])
}
topics.terms
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] "sure" "wake" "hasten" "librari" "narcot" "taint" "woe"
[2,] "fought" "eye" "stori" "went" "dad" "mountain" "merci"
[3,] "wont" "cant" "didnt" "your" "everybodi" "laughter" "hack"
[4,] "amend" "enact" "judici" "restrain" "drawn" "jurisdict" "obey"
[5,] "pay" "budget" "colleg" "inflat" "senior" "gas" "bureaucraci"
[6,] "solv" "simpli" "enlist" "isol" "enthusiasm" "grim" "scandal"
[7,] "safeti" "portion" "tend" "ruin" "tendenc" "dominion" "discord"
[8,] "rich" "overcom" "emphasi" "folli" "dynam" "drill" "knit"
[9,] "liberti" "knowledg" "spiritu" "worship" "templ" "wrought" "lawabid"
[10,] "nuclear" "aggress" "communist" "terrorist" "coast" "asia" "afghanistan"
[11,] "product" "revenu" "market" "employ" "paper" "articl" "cent"
[12,] "distinguish" "cultiv" "partial" "tranquil" "disposit" "guidanc" "zeal"
[13,] "partisan" "bitter" "deliv" "faction" "sane" "redeem" "anger"
[14,] "bridg" "youth" "planet" "aliv" "drift" "sun" "threshold"
[15,] "hous" "audienc" "kennedi" "chairman" "goe" "eisenhow" "franklin"
ldaOut.terms
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9
[1,] "will" "see" "want" "law" "work" "can" "govern" "one" "right"
[2,] "everi" "man" "know" "congress" "famili" "must" "state" "time" "peopl"
[3,] "make" "day" "say" "shall" "tax" "nation" "power" "mani" "freedom"
[4,] "american" "men" "just" "may" "job" "peopl" "unit" "countri" "free"
[5,] "countri" "know" "way" "execut" "million" "believ" "union" "great" "nation"
[6,] "never" "hope" "think" "without" "help" "great" "constitut" "now" "human"
[7,] "good" "live" "dont" "constitut" "children" "respons" "citizen" "much" "liberti"
[8,] "need" "life" "like" "secur" "care" "problem" "limit" "place" "equal"
[9,] "peopl" "love" "back" "legisl" "pay" "mean" "general" "well" "life"
[10,] "now" "stand" "that" "upon" "school" "fail" "form" "ever" "progress"
[11,] "give" "heart" "put" "effect" "economi" "without" "principl" "done" "citizen"
[12,] "pledg" "god" "tell" "enforc" "program" "econom" "exist" "anoth" "justic"
[13,] "bring" "women" "ive" "order" "plan" "alon" "right" "part" "men"
[14,] "sure" "young" "thing" "administr" "educ" "leadership" "interest" "call" "civil"
[15,] "serv" "moment" "get" "subject" "creat" "valu" "within" "still" "seek"
[16,] "commit" "faith" "talk" "pass" "need" "set" "whole" "hand" "prosper"
[17,] "alway" "rememb" "immigr" "offic" "health" "task" "feder" "long" "hold"
[18,] "best" "land" "hard" "public" "home" "govern" "institut" "thing" "among"
[19,] "take" "citi" "someth" "practic" "cut" "lead" "protect" "take" "republ"
[20,] "whether" "across" "even" "servic" "save" "idea" "control" "less" "caus"
Topic 10 Topic 11 Topic 12 Topic 13 Topic 14 Topic 15
[1,] "world" "busi" "may" "parti" "america" "year"
[2,] "peac" "industri" "upon" "made" "new" "presid"
[3,] "nation" "increas" "public" "peopl" "american" "first"
[4,] "war" "product" "duti" "republican" "let" "last"
[5,] "forc" "system" "shall" "chang" "futur" "ago"
[6,] "defens" "trade" "confid" "polit" "world" "offic"
[7,] "polici" "labor" "interest" "democrat" "today" "thank"
[8,] "militari" "upon" "spirit" "differ" "better" "said"
[9,] "arm" "develop" "trust" "elect" "promis" "accept"
[10,] "strength" "use" "countri" "administr" "togeth" "proud"
[11,] "intern" "protect" "honor" "now" "live" "tonight"
[12,] "use" "tariff" "patriot" "question" "come" "next"
[13,] "danger" "import" "foreign" "continu" "build" "convent"
[14,] "strong" "revenu" "best" "decis" "home" "friend"
[15,] "europ" "market" "desir" "fact" "generat" "sinc"
[16,] "secur" "employ" "present" "support" "ask" "hous"
[17,] "nuclear" "price" "servic" "polici" "centuri" "day"
[18,] "threaten" "wage" "influenc" "choic" "challeng" "four"
[19,] "defend" "farmer" "experi" "record" "begin" "unit"
[20,] "maintain" "produc" "principl" "vote" "histori" "senat"
Based on the most popular terms and the most salient terms for each topic, we assign a hashtag to each topic. This part requires manual setup, as the topics are likely to change from run to run.
topics.hash=c("Unity", "Belief", "Reform", "Constitution", "WorkingFamilies",
"Leadership", "Speech", "Government", "Freedom", "ForeignRelations",
"Economy", "Patriotism", "Election", "America", "SpeechTemporal")
corpus.list$ldatopic=as.vector(ldaOut.topics)
corpus.list$ldahash=topics.hash[ldaOut.topics]
colnames(topicProbabilities)=topics.hash
corpus.list.df=cbind(corpus.list, topicProbabilities)
par(mar=c(1,1,1,1))
topic.summary=tbl_df(corpus.list.df)%>%
filter(type%in%c("nomin", "inaug"), File%in%sel.comparison)%>%
select(File, Unity:SpeechTemporal)%>%
group_by(File)%>%
summarise_each(funs(mean))
topic.summary=as.data.frame(topic.summary)
rownames(topic.summary)=topic.summary[,1]
#"Unity", "Belief", "Reform", "Constitution", "WorkingFamilies",
#"Leadership", "Speech", "Government", "Freedom", "ForeignRelations",
#"Economy", "Patriotism", "Election", "America", "SpeechTemporal"
topic.plot=c(2, 4, 5, 9, 10,11,12,14)
print(topics.hash[topic.plot])
[1] "Belief" "Constitution" "WorkingFamilies" "Freedom" "ForeignRelations"
[6] "Economy" "Patriotism" "America"
heatmap.2(as.matrix(topic.summary[,topic.plot+1]),
scale = "column", key=F,
col = bluered(100),
cexRow = 0.9, cexCol = 0.9, margins = c(8, 8),
trace = "none", density.info = "none")
#"Unity", "Belief", "Reform", "Constitution", "WorkingFamilies",
#"Leadership", "Speech", "Government", "Freedom", "ForeignRelations",
#"Economy", "Patriotism", "Election", "America", "SpeechTemporal"
par(mfrow=c(5, 1), mar=c(1,1,2,0), bty="n", xaxt="n", yaxt="n")
topic.plot=c(2, 4, 5, 9, 10,11,12,14)
print(topics.hash[topic.plot])
[1] "Belief" "Constitution" "WorkingFamilies" "Freedom" "ForeignRelations"
[6] "Economy" "Patriotism" "America"
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeBush", type=="nomin",Term==1)%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George Bush, Nomination")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="WilliamJClinton", type=="nomin", Term==1)%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Bill Clinton, Nomination")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeWBush", type=="nomin", Term==1)%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George W Bush, Nomination")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="BarackObama", type=="nomin", Term==1)%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Barack Obama, Nomination")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="DonaldJTrump", type=="nomin")%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Donald Trump, Nomination")
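The per-speech blocks above, and the inaugural and farewell blocks below, repeat the same five steps; a small wrapper (a hypothetical helper, not part of the original code) would remove the duplication:
# hypothetical helper collapsing the repeated steps above; relies on
# topic.plot, corpus.list.df, f.smooth.topic and plot.stacked from above
f.plot.topic.share <- function(file, tp, term=NULL, main=""){
  sel <- corpus.list.df%>%filter(File==file, type==tp)
  if(!is.null(term)) sel <- sel%>%filter(Term==term)
  m <- as.matrix(select(sel, sent.id, Unity:SpeechTemporal))
  m[,-1] <- replace(m[,-1], m[,-1]<1/15, 0.001)   # floor tiny topic shares
  m[,-1] <- f.smooth.topic(x=m[,1], y=m[,-1])     # smooth along sentences
  plot.stacked(m[,1], m[,topic.plot+1],
               xlab="Sentences", ylab="Topic share", main=main)
}
# e.g. f.plot.topic.share("BarackObama", "nomin", term=1, main="Barack Obama, Nomination")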
#"Unity", "Belief", "Reform", "Constitution", "WorkingFamilies",
#"Leadership", "Speech", "Government", "Freedom", "ForeignRelations",
#"Economy", "Patriotism", "Election", "America", "SpeechTemporal"
par(mfrow=c(5, 1), mar=c(1,1,2,0), bty="n", xaxt="n", yaxt="n")
topic.plot=c(2, 4, 5, 9, 10,11,12,14)
print(topics.hash[topic.plot])
[1] "Belief" "Constitution" "WorkingFamilies" "Freedom" "ForeignRelations"
[6] "Economy" "Patriotism" "America"
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeBush", type=="inaug", Term==1)%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George Bush, inaugural Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="WilliamJClinton", type=="inaug", Term==1)%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="William J Clinton, inaugural Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeWBush", type=="inaug", Term==1)%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George W. Bush, inaugural Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="BarackObama", type=="inaug", Term==1)%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Barack Obama, inaugural Speeches")
#"Unity", "Belief", "Reform", "Constitution", "WorkingFamilies",
#"Leadership", "Speech", "Government", "Freedom", "ForeignRelations",
#"Economy", "Patriotism", "Election", "America", "SpeechTemporal"
par(mfrow=c(5, 1))
topic.plot=c(2, 4, 5, 9, 10,11,12,14)
print(topics.hash[topic.plot])
[1] "Belief" "Constitution" "WorkingFamilies" "Freedom" "ForeignRelations"
[6] "Economy" "Patriotism" "America"
speech.df=tbl_df(corpus.list.df)%>%filter(File=="RonaldReagan", type=="farewell")%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Ronald Reagan, Farewell Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeBush", type=="farewell")%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George Bush, Farewell Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="WilliamJClinton", type=="farewell")%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="William J. Clinton, Farewell Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="GeorgeWBush", type=="farewell")%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="George W Bush, Farewell Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(File=="BarackObama", type=="farewell")%>%select(sent.id, Unity:SpeechTemporal)
speech.df=as.matrix(speech.df)
speech.df[,-1]=replace(speech.df[,-1], speech.df[,-1]<1/15, 0.001)
speech.df[,-1]=f.smooth.topic(x=speech.df[,1], y=speech.df[,-1])
plot.stacked(speech.df[,1], speech.df[,topic.plot+1],
xlab="Sentences", ylab="Topic share", main="Barack Obama, Farewell Speeches")
speech.df=tbl_df(corpus.list.df)%>%filter(type=="inaug", word.count<20)%>%select(sentences, Unity:SpeechTemporal)
print(paste(names(speech.df)[-1],
as.character(speech.df$sentences[apply(as.data.frame(speech.df[,-1]), 2, which.max)]),
sep=": "))
[1] "Unity: Good will begets good will."
[2] "Belief: Your voice, your hopes, and your dreams will define our American destiny."
[3] "Reform: Our greatest responsibility is to embrace a new spirit of community for a new century."
[4] "Constitution: It has also passed a model child-labor law for the District of Columbia."
[5] "WorkingFamilies: And we will transform our schools and colleges and universities to meet the demands of a new age."
[6] "Leadership: Government must learn to take less from people so that people can do more for themselves."
[7] "Speech: The Government of the Union, acting within the sphere of its delegated authority, is also a complete sovereignty."
[8] "Government: We always professed unselfish purpose and we covet the opportunity to prove our professions are sincere."
[9] "Freedom: There is no short road to the realization of these aspirations."
[10] "ForeignRelations: It must be built and in existence when the emergency arises which calls for its use and operation."
[11] "Economy: The prosperity of Porto Rico continues unabated."
[12] "Patriotism: Conscious of my own deficiency, I cannot enter on these duties without great anxiety for the result."
[13] "Election: It is alleged that in many communities negro citizens are practically denied the freedom of the ballot."
[14] "America: From the height of this place and the summit of this century, let us go forth."
[15] "SpeechTemporal: Thank you."
presid.summary=tbl_df(corpus.list.df)%>%
filter(type=="inaug", File%in%sel.comparison)%>%
select(File, Unity:SpeechTemporal)%>%
group_by(File)%>%
summarise_each(funs(mean))
presid.summary=as.data.frame(presid.summary)
rownames(presid.summary)=as.character((presid.summary[,1]))
km.res=kmeans(scale(presid.summary[,-1]), centers=5, iter.max=200)
fviz_cluster(km.res,
stand=T, repel= TRUE,
data = presid.summary[,-1],
show.clust.cent=FALSE)