Up the Irons

master yeoda

A long, long time ago, in a galaxy far, far away

Today is born the seventh one

Born of woman the seventh son

And he in turn of a seventh son

He has the power to heal

He has the gift of second sight

He is the chosen one

So it shall be written

And so it shall be done

1. Load data

library(tidytext)
library(readtext)

# Read the lyrics file; the result has a `doc_id` and a `text` column.
seventhson <- readtext(file = "seventh_son_lyrics.txt")

summary(object = seventhson)
##     doc_id              text          
##  Length:1           Length:1          
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

2. Tokenisation

library(tidyr)

# Split the `text` column into one token per row, stored in column `word`.
seventhson_token <- unnest_tokens(seventhson, output = word, input = text)

summary(object = seventhson_token)
##     doc_id              word          
##  Length:1964        Length:1964       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

3. Initial exploration

Most frequent words.

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Count every token, keep those that occur more than 20 times, and order the
# factor levels by count so the bars appear ranked in the flipped chart.
seventhson_token %>%
  count(word, sort = TRUE) %>%
  filter(n > 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(
    title = "Frequent Words in Seventh Son of a Seventh Son",
    x = "Words",
    y = "Number"
  )

4. Most frequent words

4.1 Clean up

Libraries

library("tm")
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
library("syuzhet")
library("ggplot2")

Load the data as a corpus.

# Build the corpus from the lyrics text. Pass the `text` column explicitly:
# VectorSource() expects a vector of texts, and handing it the whole readtext
# data frame relies on implicit coercion of that object.
TextDoc <- Corpus(VectorSource(seventhson$text))
TextDoc
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1

Replace “/”, “@” and “|” with a space.

# Content transformer that replaces every match of `pattern` with one space.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, replacement = " ", x = x)
})
# Apply it to "/", "@" and "|"; the pipe is escaped because it is a regex
# metacharacter.
TextDoc <- tm_map(TextDoc, toSpace, "/")
TextDoc <- tm_map(TextDoc, toSpace, "@")
TextDoc <- tm_map(TextDoc, toSpace, "\\|")

Convert the text to lower case.

# Lower-case the corpus text; tolower() is wrapped so it acts on the content.
TextDoc <- tm_map(x = TextDoc, FUN = content_transformer(FUN = tolower))

Remove numbers.

# Drop digits from the corpus text.
TextDoc <- tm_map(x = TextDoc, FUN = removeNumbers)

Remove English common stopwords.

# Remove the standard English stop-word list supplied by tm.
TextDoc <- tm_map(TextDoc, removeWords, words = stopwords(kind = "english"))

Remove your own stop words (if any).

Specify your custom stop words as a character vector.

Just for illustration, and fun :-)

# Custom stop words are written in lower case: the corpus has already been
# lower-cased and removeWords() matches case-sensitively, so mixed-case
# entries such as "Star Trek" could never match anything.
TextDoc <- tm_map(
  TextDoc,
  removeWords,
  c("star trek", "green lantern", "bat man")
)

Remove punctuation marks.

# Strip punctuation characters from the corpus text.
TextDoc <- tm_map(x = TextDoc, FUN = removePunctuation)

Eliminate extra white spaces.

# Collapse the runs of whitespace left behind by the removals above.
TextDoc <- tm_map(x = TextDoc, FUN = stripWhitespace)

Stem the text, reducing each word to its root form.

# Stem every word in the corpus.
TextDoc <- tm_map(x = TextDoc, FUN = stemDocument)

4.2 The top 10 words

Build a term-document matrix.

# Term-document matrix of the cleaned corpus (terms in rows).
TextDoc_dtm <- TermDocumentMatrix(x = TextDoc)

# Convert to an ordinary matrix so base helpers such as rowSums() apply.
dtm_m <- as.matrix(TextDoc_dtm)

Sort by decreasing frequency.

# Total occurrences of each term, largest first.
dtm_v <- dtm_m %>%
  rowSums() %>%
  sort(decreasing = TRUE)
# Table of terms and their counts; row names are taken from the term names.
dtm_d <- data.frame(
  word = names(dtm_v),
  freq = dtm_v
)

Display the top 10 most frequent words.

# Show the ten rows at the top of the frequency-sorted table.
head(dtm_d, n = 10)
##            word freq
## live       live   35
## evil       evil   24
## time       time   22
## die         die   21
## seventh seventh   21
## seven     seven   18
## son         son   17
## know       know   15
## men         men   15
## one         one   14

Create a data frame of the top 10 words.

# Keep the ten most frequent terms. slice_max() replaces the superseded
# top_n(); like top_n() it keeps ties by default, and it returns the rows
# ordered from the highest to the lowest `freq`.
top_10_words <- slice_max(dtm_d, order_by = freq, n = 10)
top_10_words
##            word freq
## live       live   35
## evil       evil   24
## time       time   22
## die         die   21
## seventh seventh   21
## seven     seven   18
## son         son   17
## know       know   15
## men         men   15
## one         one   14

Plot the top 10 words.

library(ggdark)
# Horizontal bar chart of the top words, ordered and shaded by frequency.
ggplot(
  top_10_words,
  aes(x = reorder(word, freq), y = freq, fill = freq)
) +
  geom_col() +
  coord_flip() +
  labs(x = "Word", y = "Frequency", fill = "Frequency") +
  dark_theme_dark()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().

Plot a word cloud.

# Seed the RNG before drawing, presumably so the cloud layout is repeatable.
set.seed(seed = 666)
# Draw terms that occur at least 6 times, most frequent first.
wordcloud(
  words = dtm_d[["word"]],
  freq = dtm_d[["freq"]],
  min.freq = 6,
  max.words = 666,
  random.order = FALSE,
  rot.per = 0.66,
  colors = brewer.pal(n = 6, name = "Dark2")
)

5. Sentiment analysis

Expand the word counts into a character vector with one element per occurrence.

# Peek at the first six rows of the frequency table.
head(dtm_d, n = 6)
##            word freq
## live       live   35
## evil       evil   24
## time       time   22
## die         die   21
## seventh seventh   21
## seven     seven   18
# Repeat each word as many times as it occurs, giving a flat list of words.
dtm_d_data <- rep(dtm_d[["word"]], times = dtm_d[["freq"]])
head(dtm_d_data, n = 6)
## [1] "live" "live" "live" "live" "live" "live"
# Coerce to a plain vector before handing the words to the sentiment scorer.
dtm_d_data_vec <- as.vector(dtm_d_data, mode = "any")
head(dtm_d_data_vec, n = 6)
## [1] "live" "live" "live" "live" "live" "live"

Get sentiments.

# Score each word against the NRC lexicon: the result has one row per input
# word and one column per emotion / polarity category.
# NOTE(review): the words were stemmed earlier, so some stems may no longer
# match lexicon entries — confirm this is intended.
dtm_sentiment <- get_nrc_sentiment(char_v = dtm_d_data_vec)
## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## Please use `spread()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# First six rows of the per-word sentiment scores.
head(dtm_sentiment, n = 6)
##   anger anticipation disgust fear joy sadness surprise trust negative positive
## 1     0            0       0    0   0       0        0     0        0        0
## 2     0            0       0    0   0       0        0     0        0        0
## 3     0            0       0    0   0       0        0     0        0        0
## 4     0            0       0    0   0       0        0     0        0        0
## 5     0            0       0    0   0       0        0     0        0        0
## 6     0            0       0    0   0       0        0     0        0        0
# First ten rows of the per-word sentiment scores.
head(dtm_sentiment, n = 10)
##    anger anticipation disgust fear joy sadness surprise trust negative positive
## 1      0            0       0    0   0       0        0     0        0        0
## 2      0            0       0    0   0       0        0     0        0        0
## 3      0            0       0    0   0       0        0     0        0        0
## 4      0            0       0    0   0       0        0     0        0        0
## 5      0            0       0    0   0       0        0     0        0        0
## 6      0            0       0    0   0       0        0     0        0        0
## 7      0            0       0    0   0       0        0     0        0        0
## 8      0            0       0    0   0       0        0     0        0        0
## 9      0            0       0    0   0       0        0     0        0        0
## 10     0            0       0    0   0       0        0     0        0        0
# Number of scored words (rows in the sentiment table).
dim(dtm_sentiment)[1]
## [1] 958

Get the total.

# Sum every sentiment column to get one total per NRC category. The call is
# kept inline because the resulting column is named after this expression
# ("colSums(dtm_sentiment)"); it is renamed to "Count" further down.
dtm_sentiment_sum <- as.data.frame(colSums(dtm_sentiment))
dtm_sentiment_sum
##              colSums(dtm_sentiment)
## anger                            79
## anticipation                    103
## disgust                          68
## fear                            119
## joy                              59
## sadness                          98
## surprise                         55
## trust                            60
## negative                        138
## positive                         93

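Tidy the summary table: add a Sentiment column, reorder the columns, rename the count column and capitalise the labels.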

# Copy the category names out of the row names into a proper column.
dtm_sentiment_sum[["Sentiment"]] <- row.names(dtm_sentiment_sum)

# Swap the two columns so the label comes before its total.
dtm_sentiment_sum <- dtm_sentiment_sum[, 2:1]

dtm_sentiment_sum
##                 Sentiment colSums(dtm_sentiment)
## anger               anger                     79
## anticipation anticipation                    103
## disgust           disgust                     68
## fear                 fear                    119
## joy                   joy                     59
## sadness           sadness                     98
## surprise         surprise                     55
## trust               trust                     60
## negative         negative                    138
## positive         positive                     93
# Give the totals column (second position after the reorder) a readable name.
names(dtm_sentiment_sum)[2] <- "Count"
dtm_sentiment_sum
##                 Sentiment Count
## anger               anger    79
## anticipation anticipation   103
## disgust           disgust    68
## fear                 fear   119
## joy                   joy    59
## sadness           sadness    98
## surprise         surprise    55
## trust               trust    60
## negative         negative   138
## positive         positive    93
# Replace the lower-case NRC category names with capitalised display labels.
dtm_sentiment_sum$Sentiment <- recode(
  dtm_sentiment_sum$Sentiment,
  anger = "Anger",
  anticipation = "Anticipation",
  disgust = "Disgust",
  fear = "Fear",
  joy = "Joy",
  sadness = "Sadness",
  surprise = "Surprise",
  trust = "Trust",
  negative = "Negative",
  positive = "Positive"
)
dtm_sentiment_sum
##                 Sentiment Count
## anger               Anger    79
## anticipation Anticipation   103
## disgust           Disgust    68
## fear                 Fear   119
## joy                   Joy    59
## sadness           Sadness    98
## surprise         Surprise    55
## trust               Trust    60
## negative         Negative   138
## positive         Positive    93

Total sentiments.

library(ggdark)
# Horizontal bar chart of the total count per sentiment, ordered by count and
# coloured by sentiment.
ggplot(
  dtm_sentiment_sum,
  aes(x = reorder(Sentiment, Count), y = Count, fill = Sentiment)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Seventh Son of a Seventh Son Sentiments",
    x = "Sentiment",
    y = "Count of words",
    fill = "Sentiment"
  ) +
  dark_theme_dark()

Express as a percentage.

# Share of each category in the grand total, as a percentage with 2 decimals.
# The denominator sums all ten rows, i.e. the eight emotions together with
# the negative and positive polarity totals.
dtm_sentiment_sum$count_percentage <- with(
  dtm_sentiment_sum,
  round(Count / sum(Count) * 100, digits = 2)
)
dtm_sentiment_sum
##                 Sentiment Count count_percentage
## anger               Anger    79             9.06
## anticipation Anticipation   103            11.81
## disgust           Disgust    68             7.80
## fear                 Fear   119            13.65
## joy                   Joy    59             6.77
## sadness           Sadness    98            11.24
## surprise         Surprise    55             6.31
## trust               Trust    60             6.88
## negative         Negative   138            15.83
## positive         Positive    93            10.67
# Horizontal bar chart of each category's percentage share, ordered by count.
# The fill scale encodes the raw Count, so the legend is titled "Count"
# (it was previously mislabelled "Sentiment").
ggplot(
  dtm_sentiment_sum,
  aes(x = reorder(Sentiment, Count), y = count_percentage, fill = Count)
) +
  geom_col() +
  # Print the percentage just past the end of each bar.
  geom_text(aes(label = count_percentage), hjust = -0.5) +
  # Fixed 0-20 axis; presumably chosen to leave room for the labels.
  scale_y_continuous(limits = c(0, 20)) +
  coord_flip() +
  labs(
    title = "Seventh Son of a Seventh Son Sentiments",
    x = "Sentiment",
    y = "Percentage of Sentiments (%)",
    fill = "Count"
  ) +
  dark_theme_bw()