Today is born the seventh one
Born of woman the seventh son
And he in turn of a seventh son
He has the power to heal
He has the gift of second sight
He is the chosen one
So it shall be written
And so it shall be done
1. Load data
library(tidytext)
library(readtext)
# Read the lyrics file; the result has a doc_id and a text column.
seventhson <- readtext(file = "seventh_son_lyrics.txt")
summary(object = seventhson)
## doc_id text
## Length:1 Length:1
## Class :character Class :character
## Mode :character Mode :character
2. Tokenisation
library(tidyr)
# Split the text column into one lower-level token (word) per row.
seventhson_token <- unnest_tokens(seventhson, output = word, input = text)
summary(object = seventhson_token)
## doc_id word
## Length:1964 Length:1964
## Class :character Class :character
## Mode :character Mode :character
3. Initial exploration
Most frequent words.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Count every token, keep those seen more than 20 times, and draw a
# horizontal bar chart with the bars ordered by frequency.
seventhson_token %>%
  count(word, sort = TRUE) %>%
  filter(n > 20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(
    x = "Words",
    y = "Number",
    title = "Frequent Words in Seventh Son of a Seventh Son"
  )
4. Most frequent words
4.1 Clean up
Libraries
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
library("syuzhet")
library("ggplot2")
Load the data as a corpus.
# Build the corpus from the lyrics text column only. Passing the whole
# readtext data frame relies on implicit coercion and risks treating the
# doc_id (the file name) as document content.
TextDoc <- Corpus(VectorSource(seventhson$text))
TextDoc
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
Replacing “/”, “@” and “|” with space.
# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(
  function(x, pattern) {
    gsub(pattern = pattern, replacement = " ", x = x)
  }
)
# Apply it in turn to "/", "@" and "|" (the pipe is escaped for the regex).
TextDoc <- Reduce(
  function(corpus, separator) tm_map(corpus, toSpace, separator),
  c("/", "@", "\\|"),
  TextDoc
)
Convert the text to lower case.
# Wrap base tolower() so it can be applied to the corpus content.
TextDoc <- tm_map(x = TextDoc, FUN = content_transformer(FUN = tolower))
Remove numbers.
# Drop digits from the corpus.
TextDoc <- tm_map(x = TextDoc, FUN = removeNumbers)
Remove English common stopwords.
# Drop the stop words returned by stopwords("english").
TextDoc <- tm_map(
  x = TextDoc,
  FUN = removeWords,
  words = stopwords(kind = "english")
)
Remove your own stop words (if any).
Specify your custom stop words as a character vector.
Just for illustration, and fun :-)
# removeWords() matches case-sensitively and the corpus has already been
# lower-cased, so the custom stop words are lower-cased too; otherwise the
# capitalised entries could never match anything.
TextDoc <- tm_map(
  TextDoc,
  removeWords,
  tolower(c("Star Trek", "Green Lantern", "Bat Man"))
)
Remove punctuation marks.
# Strip punctuation characters from the corpus.
TextDoc <- tm_map(x = TextDoc, FUN = removePunctuation)
Eliminate extra white spaces.
# Collapse the extra white space left behind by the earlier removals.
TextDoc <- tm_map(x = TextDoc, FUN = stripWhitespace)
Text stemming - which reduces words to their root form.
# Stem every word in the corpus.
TextDoc <- tm_map(x = TextDoc, FUN = stemDocument)
4.2 The top 10 words
Build a term-document matrix.
# Terms become rows and documents become columns; convert to a plain
# matrix so base row sums can be taken.
TextDoc_dtm <- TermDocumentMatrix(x = TextDoc)
dtm_m <- as.matrix(TextDoc_dtm)
Sort by decreasing frequency.
# Per-term totals, largest first; the term names carry over as the
# data frame's row names.
dtm_v <- sort(x = rowSums(dtm_m), decreasing = TRUE)
dtm_d <- data.frame(
  word = names(dtm_v),
  freq = dtm_v
)
Display the top 10 most frequent words.
# First ten rows of the frequency table (already sorted, largest first).
head(dtm_d, n = 10)
## word freq
## live live 35
## evil evil 24
## time time 22
## die die 21
## seventh seventh 21
## seven seven 18
## son son 17
## know know 15
## men men 15
## one one 14
Create a data frame holding the top 10 words.
# slice_max() replaces the superseded top_n(). Like top_n(), it keeps rows
# tied with the 10th-highest frequency, so more than 10 rows are possible.
top_10_words <- slice_max(dtm_d, order_by = freq, n = 10)
top_10_words
## word freq
## live live 35
## evil evil 24
## time time 22
## die die 21
## seventh seventh 21
## seven seven 18
## son son 17
## know know 15
## men men 15
## one one 14
Plot the top 10 words.
library(ggdark)
# Horizontal bar chart of the top words, ordered and shaded by frequency.
ggplot(
  top_10_words,
  aes(x = reorder(word, freq), y = freq, fill = freq)
) +
  geom_col() +
  coord_flip() +
  labs(x = "Word", y = "Frequency", fill = "Frequency") +
  dark_theme_dark()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
Plot a word cloud.
# Fix the RNG seed so the word cloud layout is reproducible.
set.seed(seed = 666)
wordcloud(
  words = dtm_d$word,
  freq = dtm_d$freq,
  min.freq = 6,
  max.words = 666,
  random.order = FALSE,
  rot.per = 0.66,
  colors = brewer.pal(n = 6, name = "Dark2")
)
5. Sentiment analysis
Convert to a character vector.
# Peek at the first six rows of the frequency table.
head(dtm_d, n = 6L)
## word freq
## live live 35
## evil evil 24
## time time 22
## die die 21
## seventh seventh 21
## seven seven 18
# Repeat each word as many times as it occurs, giving one element per
# occurrence.
dtm_d_data <- rep(dtm_d[["word"]], times = dtm_d[["freq"]])
head(dtm_d_data, n = 6L)
## [1] "live" "live" "live" "live" "live" "live"
# Coerce to a plain vector before passing it to the sentiment function.
dtm_d_data_vec <- as.vector(x = dtm_d_data)
head(dtm_d_data_vec, n = 6L)
## [1] "live" "live" "live" "live" "live" "live"
Get sentiments.
# Score every word occurrence against the NRC lexicon: one row per element,
# one column per emotion plus negative/positive.
# NOTE(review): the words were stemmed earlier, so stems that are not
# dictionary words presumably score zero here - confirm this is intended.
dtm_sentiment <- get_nrc_sentiment(char_v = dtm_d_data_vec)
## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## Please use `spread()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# First six rows of the per-occurrence sentiment scores.
head(dtm_sentiment, n = 6L)
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
# First ten rows of the per-occurrence sentiment scores.
head(dtm_sentiment, n = 10)
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0
## 10 0 0 0 0 0 0 0 0 0 0
# Number of scored word occurrences (rows).
dim(dtm_sentiment)[1L]
## [1] 958
Get the total.
# Total each sentiment column. The expression passed to as.data.frame() also
# becomes the auto-generated name of the single column, and the sentiment
# names end up as row names, so the call is kept exactly as written.
dtm_sentiment_sum <- as.data.frame(colSums(dtm_sentiment))
dtm_sentiment_sum
## colSums(dtm_sentiment)
## anger 79
## anticipation 103
## disgust 68
## fear 119
## joy 59
## sadness 98
## surprise 55
## trust 60
## negative 138
## positive 93
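Clean up: copy the sentiment names from the row names into a column, rename the count column, and capitalise the labels.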
# Copy the row names into a real column, then swap the two columns so the
# sentiment name comes first.
dtm_sentiment_sum[["Sentiment"]] <- row.names(dtm_sentiment_sum)
dtm_sentiment_sum <- dtm_sentiment_sum[, 2:1]
print(dtm_sentiment_sum)
## Sentiment colSums(dtm_sentiment)
## anger anger 79
## anticipation anticipation 103
## disgust disgust 68
## fear fear 119
## joy joy 59
## sadness sadness 98
## surprise surprise 55
## trust trust 60
## negative negative 138
## positive positive 93
# Replace the auto-generated name of the second column with "Count".
names(dtm_sentiment_sum)[2L] <- "Count"
print(dtm_sentiment_sum)
## Sentiment Count
## anger anger 79
## anticipation anticipation 103
## disgust disgust 68
## fear fear 119
## joy joy 59
## sadness sadness 98
## surprise surprise 55
## trust trust 60
## negative negative 138
## positive positive 93
# Swap each lower-case sentiment name for its capitalised display label.
dtm_sentiment_sum[["Sentiment"]] <- recode(
  .x = dtm_sentiment_sum[["Sentiment"]],
  anger = "Anger",
  anticipation = "Anticipation",
  disgust = "Disgust",
  fear = "Fear",
  joy = "Joy",
  sadness = "Sadness",
  surprise = "Surprise",
  trust = "Trust",
  negative = "Negative",
  positive = "Positive"
)
print(dtm_sentiment_sum)
## Sentiment Count
## anger Anger 79
## anticipation Anticipation 103
## disgust Disgust 68
## fear Fear 119
## joy Joy 59
## sadness Sadness 98
## surprise Surprise 55
## trust Trust 60
## negative Negative 138
## positive Positive 93
Total sentiments.
library(ggdark)
# Horizontal bar chart of word counts per sentiment, ordered by count and
# coloured by sentiment.
ggplot(
  dtm_sentiment_sum,
  aes(x = reorder(Sentiment, Count), y = Count, fill = Sentiment)
) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Sentiment",
    y = "Count of words",
    fill = "Sentiment",
    title = "Seventh Son of a Seventh Son Sentiments"
  ) +
  dark_theme_dark()
Express as a percentage.
# Each row's share of the summed Count column, as a percentage rounded to
# two decimal places.
dtm_sentiment_sum[["count_percentage"]] <- with(
  dtm_sentiment_sum,
  round(Count / sum(Count) * 100, digits = 2)
)
print(dtm_sentiment_sum)
## Sentiment Count count_percentage
## anger Anger 79 9.06
## anticipation Anticipation 103 11.81
## disgust Disgust 68 7.80
## fear Fear 119 13.65
## joy Joy 59 6.77
## sadness Sadness 98 11.24
## surprise Surprise 55 6.31
## trust Trust 60 6.88
## negative Negative 138 15.83
## positive Positive 93 10.67
# Horizontal bar chart of each sentiment's percentage share, with the value
# printed beside each bar. The bars are shaded by Count, so the legend is
# titled "Count" (it was previously mislabelled "Sentiment").
ggplot(
  dtm_sentiment_sum,
  aes(x = reorder(Sentiment, Count), y = count_percentage, fill = Count)
) +
  geom_col() +
  # Negative hjust pushes each label past the end of its bar.
  geom_text(aes(label = count_percentage), hjust = -0.5) +
  # NOTE(review): the fixed 0-20 range presumably leaves headroom for the
  # labels; a bar above 20% would be dropped by these limits.
  scale_y_continuous(limits = c(0, 20)) +
  coord_flip() +
  labs(
    x = "Sentiment",
    y = "Percentage of Sentiments (%)",
    fill = "Count",
    title = "Seventh Son of a Seventh Son Sentiments"
  ) +
  dark_theme_bw()