Today is born the seventh one

Born of woman the seventh son

And he in turn of a seventh son

He has the power to heal

He has the gift of second sight

He is the chosen one

So it shall be written

And so it shall be done

1. Load data


# Read the lyrics file into a data frame with `doc_id` and `text` columns.
seventhson <- readtext(file = "seventh_son_lyrics.txt")

##     doc_id              text          
##  Length:1           Length:1          
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

2. Tokenisation


# Split the `text` column into one token per row, stored in a `word` column.
seventhson_token <- unnest_tokens(seventhson, output = word, input = text)

##     doc_id              word          
##  Length:1964        Length:1964       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

3. Initial exploration

Most frequent words.

# Bar chart of every word that occurs more than 20 times, ordered by count.
seventhson_token %>%
  count(word, sort = TRUE) %>%
  filter(n > 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(
    x = "Words",
    y = "Number",
    title = "Frequent Words in Seventh Son of a Seventh Son"
  )

4. Most frequent words

4.1 Clean up


Load the data as a corpus.

# Build the corpus from the lyrics text only. `seventhson` is a data frame
# with `doc_id` and `text` columns; handing the whole data frame to
# VectorSource() would also feed the `doc_id` column (the file name) into the
# corpus as if it were lyrics.
TextDoc <- Corpus(VectorSource(seventhson$text))
Replacing “/”, “@” and “|” with space.

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern = pattern, replacement = " ", x = x)
})
# Apply it to "/", "@" and "|" in turn ("|" is escaped because the pattern is
# a regular expression).
TextDoc <- TextDoc %>%
  tm_map(toSpace, "/") %>%
  tm_map(toSpace, "@") %>%
  tm_map(toSpace, "\\|")

Convert the text to lower case.

# Lower-case every document in the corpus.
TextDoc <- TextDoc %>% tm_map(content_transformer(tolower))

Remove numbers.

# Strip digits from every document.
TextDoc <- tm_map(x = TextDoc, FUN = removeNumbers)

Remove English common stopwords.

# Drop the standard English stop words.
TextDoc <- TextDoc %>% tm_map(removeWords, stopwords("english"))

Remove your own stop words (if any).

Specify your custom stop words as a character vector.

Just for illustration, and fun :-)

# Remove the custom stop words. The corpus has already been converted to lower
# case and removeWords() matches case-sensitively, so the custom entries are
# lower-cased too; otherwise capitalised entries could never match.
TextDoc <- tm_map(
  TextDoc,
  removeWords,
  tolower(c("Star Trek", "Green Lantern", "Bat Man"))
)

Remove punctuation marks.

# Strip punctuation characters from every document.
TextDoc <- tm_map(x = TextDoc, FUN = removePunctuation)

Eliminate extra white spaces.

# Collapse runs of whitespace left behind by the earlier removals.
TextDoc <- TextDoc %>% tm_map(stripWhitespace)

Text stemming - which reduces words to their root form.

# Stem each word so that inflected forms are counted together.
TextDoc <- tm_map(x = TextDoc, FUN = stemDocument)

4.2 The top 10 words

Build a term-document matrix.

# Term-document matrix of the cleaned corpus (terms in rows, documents in
# columns), then the same data as an ordinary dense matrix.
TextDoc_dtm <- TermDocumentMatrix(x = TextDoc)
dtm_m <- as.matrix(x = TextDoc_dtm)

Sort by decreasing frequency.

# Total count per term across all documents, most frequent first.
dtm_v <- dtm_m %>%
  rowSums() %>%
  sort(decreasing = TRUE)
# Two-column frequency table; the row names are taken from the names of dtm_v.
dtm_d <- data.frame(
  word = names(dtm_v),
  freq = dtm_v
)

Display the top 10 most frequent words.

# Print the first 10 rows of the frequency table.
head(dtm_d, n = 10)
Create a data frame of the top 10 words.

# Keep the 10 rows with the highest `freq`.
top_10_words <- dtm_d %>% top_n(n = 10, wt = freq)
Plot the top 10 words.

# Horizontal bar chart of the top 10 words, ordered and shaded by frequency.
ggplot(
  top_10_words,
  aes(x = reorder(word, freq), y = freq, fill = freq)
) +
  geom_col() +
  coord_flip() +
  labs(x = "Word", y = "Frequency", fill = "Frequency") +
  dark_theme_dark()
Plot a word cloud.

# Word cloud of all terms that occur at least 6 times; the most frequent words
# are placed first, and a share of the words is drawn rotated.
wordcloud(
  words = dtm_d$word,
  freq = dtm_d$freq,
  min.freq = 6,
  max.words = 666,
  random.order = FALSE,
  rot.per = 0.66,
  colors = brewer.pal(n = 6, name = "Dark2")
)

5. Sentiment analysis

Convert to a character vector.

Get sentiments.

# The character vector handed to get_nrc_sentiment() is not created anywhere
# in the visible part of this analysis, so build it from the cleaned words
# when it does not exist yet.
# NOTE(review): confirm that the words of `dtm_d` are the intended input.
if (!exists("dtm_d_data_vec")) {
  dtm_d_data_vec <- as.character(dtm_d$word)
}
# Score every element of the vector against the NRC lexicon, then preview the
# first 10 rows of the result.
dtm_sentiment <- get_nrc_sentiment(dtm_d_data_vec)
head(dtm_sentiment, 10)
Get the total.

# Sum each sentiment column and store the totals as a one-column data frame
# whose row names are the sentiment names. The column gets an automatically
# generated name here; it is renamed further down.
dtm_sentiment_sum <- as.data.frame(colSums(dtm_sentiment))
Clean up.

# Copy the row names (the sentiment names) into a regular column.
dtm_sentiment_sum[["Sentiment"]] <- row.names(dtm_sentiment_sum)
# Put `Sentiment` first and the totals second, then name the totals column.
dtm_sentiment_sum <- dtm_sentiment_sum[, 2:1]
names(dtm_sentiment_sum)[2] <- "Count"
# Capitalise the sentiment labels for display.
dtm_sentiment_sum$Sentiment <- dtm_sentiment_sum$Sentiment %>%
  recode(
    anger = "Anger",
    anticipation = "Anticipation",
    disgust = "Disgust",
    fear = "Fear",
    joy = "Joy",
    sadness = "Sadness",
    surprise = "Surprise",
    trust = "Trust",
    negative = "Negative",
    positive = "Positive"
  )
Total sentiments.

# Horizontal bar chart of the total count per sentiment, one colour each.
ggplot(
  dtm_sentiment_sum,
  aes(x = reorder(Sentiment, Count), y = Count, fill = Sentiment)
) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Sentiment",
    y = "Count of words",
    fill = "Sentiment",
    title = "Seventh Son of a Seventh Son Sentiments"
  ) +
  dark_theme_dark()

Express as a percentage.

# Share of each sentiment in the overall total, as a percentage (2 decimals).
dtm_sentiment_sum$count_percentage <- round(
  dtm_sentiment_sum$Count / sum(dtm_sentiment_sum$Count) * 100, 2
)
# Horizontal bar chart of the percentages with the value printed next to each
# bar.
# - The upper axis limit keeps the original 0-20 range as a minimum but grows
#   with the data: with a fixed limit of 20, any bar above 20 % lies outside
#   the scale and is dropped from the plot. The extra 15 % leaves room for the
#   value labels.
# - The bars are filled by `Count`, so the legend is titled "Count".
ggplot(dtm_sentiment_sum) +
  aes(x = reorder(Sentiment, Count), y = count_percentage, fill = Count) +
  geom_col() +
  xlab("Sentiment") +
  ylab("Percentage of Sentiments (%)") +
  labs(fill = "Count") +
  scale_y_continuous(
    limits = c(
      0,
      max(20, 1.15 * dtm_sentiment_sum$count_percentage, na.rm = TRUE)
    )
  ) +
  ggtitle("Seventh Son of a Seventh Son Sentiments") +
  geom_text(aes(label = count_percentage), hjust = -0.5) +
  coord_flip() +
  dark_theme_bw()