Objectives: The goal of this kernel is to perform sentiment analysis based on the dialogues of main characters in Star Wars. The visualization section will be made interactive via plotly and highcharter. Hope the kernel is easy to read and enjoy!
If you like the kernel, please give me an upvote and thanks!
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, tm, plotly, highcharter, viridis,
wordcloud, wordcloud2, plotrix, tidytext,
reshape2, ggthemes, qdap, igraph, ggraph,
visNetwork)
ep4 <- read.table("input/SW_EpisodeIV.txt")
ep5 <- read.table("input/SW_EpisodeV.txt")
ep6 <- read.table("input/SW_EpisodeVI.txt")
combined <- bind_rows(ep4, ep5, ep6)
rm(ep4, ep5, ep6)
# clean corpus
cleanCorpus <- function(corpus){
corpus.tmp <- tm_map(corpus, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
v_stopwords <- c(stopwords("en"), c("thats","weve","hes","theres","ive","im",
"will","can","cant","dont","youve","us",
"youre","youll","theyre","whats","didnt"))
corpus.tmp <- tm_map(corpus.tmp, removeWords, v_stopwords)
corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
return(corpus.tmp)
}
# frequent terms
frequentTerms <- function(text){
s.cor <- Corpus(VectorSource(text))
s.cor.cl <- cleanCorpus(s.cor)
s.tdm <- TermDocumentMatrix(s.cor.cl)
s.tdm <- removeSparseTerms(s.tdm, 0.999)
m <- as.matrix(s.tdm)
word_freqs <- sort(rowSums(m), decreasing=TRUE)
dm <- data.frame(word=names(word_freqs), freq=word_freqs)
return(dm)
}
# clean by each character
clean_top_char <- function(dataset){
all_dialogue <- list()
namelist <- list()
for (i in 1:9){
name <- top_chars$character[i]
dialogue <- paste(dataset$dialogue[dataset$character == name], collapse = " ")
all_dialogue <- c(all_dialogue, dialogue)
namelist <- c(namelist, name)
}
all_clean <- all_dialogue %>%
VectorSource() %>%
Corpus() %>%
cleanCorpus() %>%
TermDocumentMatrix() %>%
as.matrix()
colnames(all_clean) <- namelist
assign("all_clean",all_clean,.GlobalEnv)
all_clean %>% head()
}
top_chars <- combined %>% count(character) %>% arrange(desc(n)) %>% head(20)
hchart(top_chars, type = 'treemap',hcaes(x = "character", value = 'n', color = 'n'))
clean_top_char(combined)
## Docs
## Terms LUKE HAN THREEPIO LEIA VADER BEN LANDO YODA EMPEROR
## aaargh 1 0 0 0 0 0 0 0 0
## academy 3 0 0 0 0 0 0 0 0
## accelerator 1 0 0 0 0 0 0 0 0
## accepted 1 0 0 0 2 0 0 0 0
## across 1 0 1 1 0 0 0 0 0
## action 1 0 0 0 0 0 0 0 0
Click Any Part to DrillDown
df1 <- combined %>%
count(name = character, drilldown = character) %>%
arrange(desc(n)) %>%
head(9) %>%
rename(y = n)
df2 <- all_clean %>%
as.data.frame() %>%
rownames_to_column(var = 'word') %>%
inner_join(get_sentiments("nrc"), by = 'word') %>%
select(-word) %>%
gather(char, value, -10) %>%
group_by(char, sentiment) %>%
summarise(y = sum(value)
# , colorByPoint = 1
) %>%
arrange(desc(y)) %>%
group_by(name = char, id = name
# , colorByPoint
) %>%
do(data = list_parse(
mutate(.,name = sentiment, drilldown = tolower(paste(char, sentiment,sep=": "))) %>%
group_by(name, drilldown) %>%
summarise(y=sum(y)) %>%
select(name, y, drilldown) %>%
arrange(desc(y))))
a <- highchart() %>%
hc_chart(type = 'pie') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of appearance', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Sentiment Pie Chart by Character") %>%
hc_add_theme(hc_theme_darkunica())
b <- highchart() %>%
hc_chart(type = 'bar') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of appearance', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Sentiment Bar Chart by Character") %>%
hc_add_theme(hc_theme_darkunica())
rm(df1, df2)
lst <- list(
a,
b
)
hw_grid(lst, rowheight = 400)
df1 <- combined %>%
count(name = character, drilldown = character) %>%
arrange(desc(n)) %>%
head(9) %>%
rename(y = n)
df2 <- all_clean %>%
as.data.frame() %>%
rownames_to_column(var = 'word') %>%
inner_join(get_sentiments("bing"), by = 'word') %>%
select(-word) %>%
gather(char, value, -10) %>%
group_by(char, sentiment) %>%
summarise(y = sum(value), colorByPoint = 1) %>%
arrange(desc(y)) %>%
group_by(name = char, id = name, colorByPoint) %>%
do(data = list_parse(
mutate(.,name = sentiment, drilldown = tolower(paste(char, sentiment,sep=": "))) %>%
group_by(name, drilldown) %>%
summarise(y=sum(y)) %>%
select(name, y, drilldown) %>%
arrange(desc(y))))
a <- highchart() %>%
hc_chart(type = 'pie') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Sentiment Pie Chart by Character") %>%
hc_add_theme(hc_theme_darkunica())
b <- highchart() %>%
hc_chart(type = 'bar') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Sentiment Bar Chart by Character") %>%
hc_add_theme(hc_theme_darkunica())
rm(df1, df2)
lst <- list(
a,
b
)
hw_grid(lst, rowheight = 400)
df1 <- combined %>%
count(name = character, drilldown = character) %>%
arrange(desc(n)) %>%
head(9) %>%
rename(y = n)
df2 <- all_clean %>%
as.data.frame() %>%
rownames_to_column(var = 'word') %>%
inner_join(get_sentiments("loughran"), by = 'word') %>%
select(-word) %>%
gather(char, value, -10) %>%
group_by(char, sentiment) %>%
summarise(y = sum(value), colorByPoint = 1) %>%
arrange(desc(y)) %>%
group_by(name = char, id = name, colorByPoint) %>%
do(data = list_parse(
mutate(.,name = sentiment, drilldown = tolower(paste(char, sentiment,sep=": "))) %>%
group_by(name, drilldown) %>%
summarise(y=sum(y)) %>%
select(name, y, drilldown) %>%
arrange(desc(y))))
a <- highchart() %>%
hc_chart(type = 'pie') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Sentiment Pie Chart by Character") %>%
hc_add_theme(hc_theme_darkunica())
b <- highchart() %>%
hc_chart(type = 'bar') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Sentiment Bar Chart by Character") %>%
hc_add_theme(hc_theme_darkunica())
rm(df1, df2)
lst <- list(
a,
b
)
hw_grid(lst, rowheight = 400)
combined$dialogue %>%
frequentTerms() %>%
# dim()
head(30) %>%
mutate(word = factor(word))%>%
plot_ly(x = ~reorder(word,-freq), y = ~freq, colors = viridis(10)) %>%
add_bars(color = ~word) %>%
layout(title = "Top 30 Words",
yaxis = list(title = " "),
xaxis = list(title = ""),
margin = list(l = 100))
As we can see from the following wordcloud, these are the words that Luke and Threepio have in common.
commonality.cloud(all_clean[,c("LUKE","THREEPIO")], colors = "steelblue1", at.least = 2, max.words = 100)
In the comparison cloud, the lower one is for Threepio and upper one is for Luke.
comparison.cloud(all_clean[,c("LUKE","THREEPIO")], colors = c("#F8766D", "#00BFC4"), max.words=50)
Then lets have a look what words that Luke and Threepio have in common. By analyzing the frequency that they are using the words, we can have a better understanding the character. For example, Threepio says much more “afraid” than Luke. It shows that Threepio is less confident.
common_words <- all_clean %>%
as.data.frame() %>%
rownames_to_column() %>%
filter(LUKE>0, THREEPIO>0) %>%
# select(LUKE, THREEPIO) %>%
mutate(difference = abs(LUKE - THREEPIO)) %>%
arrange(desc(difference))
common_words_25 <- common_words%>%
head(25)
# Create the pyramid plot
pyramid.plot(common_words_25$LUKE, common_words_25$THREEPIO,
labels = common_words_25$rowname, gap = 20,
top.labels = c("LUKE", "Words", "THREEPIO"),
main = "Words in Common", laxlab = NULL,
raxlab = NULL, unit = NULL)
## [1] 5.1 4.1 4.1 2.1
all_clean %>%
as.data.frame() %>%
rownames_to_column(var = 'word') %>%
inner_join(get_sentiments("loughran"), by = 'word') %>%
group_by(sentiment) %>%
summarise(number = sum(LUKE)) %>%
plot_ly(labels = ~sentiment, values = ~number) %>%
add_pie(hole = 0.6) %>%
layout(title = "LUKE Emotions", showlegend = T,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
all_clean %>%
as.data.frame() %>%
rownames_to_column(var = 'word') %>%
inner_join(get_sentiments("loughran"), by = 'word') %>%
group_by(sentiment) %>%
summarise(number = sum(THREEPIO)) %>%
plot_ly(labels = ~sentiment, values = ~number) %>%
add_pie(hole = 0.6) %>%
layout(title = "THREEPIO Emotions", showlegend = T,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
LUKE
all_clean %>%
as.data.frame() %>%
rownames_to_column(var = 'word') %>%
inner_join(get_sentiments("loughran"), by = 'word') %>%
select(word, LUKE, sentiment) %>%
filter(LUKE!=0) %>%
spread(sentiment, LUKE, fill = 0) %>%
column_to_rownames(var = 'word') %>%
comparison.cloud(colors = c( "firebrick", "#F8766D", "#00BFC4","steelblue"), max.words=50)
VADER
For Vader, he has much more negative words than the positive ones. From examing the graph below, we can see negative words such as “dark”, “Destroy”, “Attack” and “Betray” and the positive words such as “Master” .
all_clean %>%
as.data.frame() %>%
rownames_to_column(var = 'word') %>%
inner_join(get_sentiments("bing"), by = 'word') %>%
select(word, VADER, sentiment) %>%
spread(sentiment, VADER, fill = 0) %>%
column_to_rownames(var = 'word') %>%
comparison.cloud(colors = c("#F8766D", "#00BFC4"), max.words=150)
senti_LUKE_THREE <- all_clean %>%
as.data.frame() %>%
rownames_to_column(var = 'word') %>%
inner_join(get_sentiments("nrc"), by = 'word')%>%
select(LUKE, THREEPIO, sentiment) %>%
group_by(sentiment) %>%
summarise(sum_luke = sum(LUKE),
sum_threepio = sum(THREEPIO))
pyramid.plot(senti_LUKE_THREE$sum_luke, senti_LUKE_THREE$sum_threepio,
labels = senti_LUKE_THREE$sentiment, gap = 40,
top.labels = c("LUKE", "Sentiment", "THREEPIO"),
main = "Sentiment Comparison", laxlab = NULL,
raxlab = NULL, unit = NULL)
## [1] 5.1 4.1 4.1 2.1
Master Yoda
# Word association
word_associate(combined$dialogue, match.string = c("yoda"),
stopwords = c(stopwords("english"), c("thats","weve","hes","theres","ive","im",
"will","can","cant","dont","youve","us",
"youre","youll","theyre","whats","didnt")),
network.plot = TRUE, cloud.colors = c("gray85", "darkred"))
## row group unit text ...
## 1 1097 all 1097 There you will learn from Yoda, the Jedi Master who instructed me.
## 2 1107 all 1107 Yoda...
## 3 1359 all 1359 Now all I have to do is find this Yoda...if he even exists.
## 4 1388 all 1388 Oohhh. Jedi Master. Yoda. You seek Yoda.
## 5 1437 all 1437 Look, I'm sure it's delicious. I just don't understand why we can't see Yoda now.
## 6 1439 all 1439 How far away is Yoda? Will it take us long to get there?
## 7 1440 all 1440 Not far. Yoda not far. Patience. Soon you will be with him. Rootleaf, I cook. Why wish you become Jedi? Hm?
## 8 1449 all 1449 Yoda! I am ready. I...Ben! I can be a Jedi. Ben, tell him I'm ready.
## 9 1638 all 1638 Even Yoda cannot see their fate.
## 10 1642 all 1642 But I've learned so much since then. Master Yoda, I promise to return and finish what I've begun. You have my word.
## 11 2057 all 2057 Master Yoda, you can't die.
## 12 2063 all 2063 Master Yoda... is Darth Vader my father?
## 13 2065 all 2065 Yoda, I must know.
## 14 2076 all 2076 Yoda will always be with you.
## 15 2084 all 2084 When I first knew him, your father was already a great pilot. But I was amazed how strongly the Force was with him. I took it upon myself to train him as a Jedi. I thought that I could instruct him just as well as Yoda. I was wrong. My pride has had terrible consequences for the galaxy.
## 16 2090 all 2090 Vader humbled you when first you met him, Luke...but that experience was part of your training. It taught you, among other things, the value of patience. Had you not been so impatient to defeat Vader then, you could have finished your training here with Yoda. You would have been prepared.
## 17 2097 all 2097 Yoda spoke of another.
# Add title
title(main = "Master Yoda")
Vader’s Comment towards Rebel
# Word association
word_associate(combined$dialogue[combined$character == 'VADER'], match.string = c("rebel"),
stopwords = c(stopwords("english"), c("thats","weve","hes","theres","ive","im",
"will","can","cant","dont","youve","us",
"youre","youll","theyre","whats","didnt")),
network.plot = TRUE, cloud.colors = c("gray85", "darkred"))
## row group unit text
## 1 4 all 4 Don't play games with me, Your Highness. You weren't on any mercy mission this time. You passed directly through a restricted system. Several transmissions were beamed to this ship by Rebel spies. I want to know what happened to the plans they sent you.
## 2 5 all 5 You're a part of the Rebel Alliance... and a traitor. Take her away!
## 3 6 all 6 I have traced the Rebel spies to her. Now she is my only link to find their secret base!
## 4 13 all 13 And, now Your Highness, we will discuss the location of your hidden Rebel base.
## 5 17 all 17 I told you she would never consciously betray the Rebellion.
## 6 31 all 31 This will be a day long remembered. It has seen the end of Kenobi and it will soon see the end of the Rebellion.
## 7 43 all 43 That's it. The Rebels are there.
## 8 46 all 46 The Rebels are alerted to our presence. Admiral Ozzel came out of light-speed too close to the system.
## 9 108 all 108 What of the reports of the Rebel fleet massing near Sullust?
## 10 114 all 114 A small Rebel force has penetrated the shield and landed on Endor.
# Add title
title(main = "Vader Rebel Comment")
Relation Network
char <- colnames(all_clean) %>% tolower()
all_clean_dt <- all_clean %>%
as.data.frame() %>%
select(-10)
colnames(all_clean_dt) <- colnames(all_clean_dt) %>% tolower()
network <- all_clean_dt[rownames(all_clean) %in% char,]
network_matrix <- network %>% select(rownames(network)) %>% as.matrix()
network1 = graph_from_adjacency_matrix(network_matrix, mode='undirected', diag=F)
# plot it
plot(network1,
# === vertex
vertex.color = rgb(0.8,0.4,0.3,0.8), # Node color
vertex.frame.color = "white", # Node border color
vertex.shape="circle", # One of “none”, “circle”, “square”, “csquare”, “rectangle” “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere”
vertex.size=54, # Size of the node (default is 15)
vertex.size2=NA, # The second size of the node (e.g. for a rectangle)
# Character vector used to label the nodes
vertex.label.color="white",
vertex.label.family="Times", # Font family of the label (e.g.“Times”, “Helvetica”)
vertex.label.font=c(1,2,3,4), # Font: 1 plain, 2 bold, 3, italic, 4 bold italic, 5 symbol
vertex.label.cex=c(0.7,1,1.3), # Font size (multiplication factor, device-dependent)
vertex.label.dist=0, # Distance between the label and the vertex
vertex.label.degree=0, # The position of the label in relation to the vertex (use pi)ze2=NA) # The second size of the node (e.g. for a rectangle)
layout=layout.circle, main="Star Wars Relationship Network")
Relation Network2
This relation network solves the problem in the first one that the the connection between two nearby nodes is hard to see. In this network, the strength of the connection is illustrated through the alpha value of the edges.
network2 <- network %>%
rownames_to_column('word1') %>%
gather(word2, value, -1)
network2 %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = value), show.legend = FALSE) +
geom_node_point(color = "firebrick", size = 20, alpha = .5) +
geom_node_text(aes(label = name), col = "white") +
theme_solarized(light = F)
Relation Network3 - Interactive
This is an interactive approachs that shows the network connections. This time, I manipulated the data a little bit by limiting to the ones have more than 5 in frequency. Therefore, you can clearly see the main connections among the characters.
I learned this technique from the kernal, StackOverflow 2018 : DrillDown+Network+Motion Plot. It also shows you how to do very cool drilldown and motion plot. Don’t miss out!
network3_edg <- network2 %>%
filter(value>5) %>%
rename(from = word1, to = word2, weight = value, width = value)
network3_node <- network2 %>%
group_by(word1) %>%
summarise(size = sum(value)) %>%
rename(id = word1)
network3_node$label <- network3_node$id # Node label
visNetwork(network3_node, network3_edg, height = "500px", width = "100%") %>%
visIgraphLayout(layout = "layout_with_lgl") %>%
visEdges(shadow = TRUE,
color = list(color = "gray", highlight = "orange"))
Relation Network4 - Interactive
network3_edg <- network2 %>%
filter(value>5) %>%
rename(from = word1, to = word2)
network3_node <- network2 %>%
group_by(word1) %>%
summarise(size = sum(value)) %>%
rename(id = word1) %>%
select(id)
network3_node$label <- network3_node$id # Node label
visNetwork(network3_node, network3_edg, height = "500px", width = "100%") %>%
# visIgraphLayout(layout = "layout_with_lgl") %>%
visEdges(shadow = TRUE,
color = list(color = "gray", highlight = "orange"))
To be continued! If you like the kernel, don’t forget to upvote and thanks!