Back to the Spell Book

1. Libraries

library(stringdist)
library(tm)

2. Set up

Define the path to the folders

emails_folder <- "emails/"

keywords_folder <- "keywords/"

Load signs of phishing

phishing_keywords <- scan(paste0(keywords_folder, "phishing_keywords.txt"), 
                          what = "", sep = "\n", quiet = TRUE)

Define reference dictionary for misspelling

reference_dictionary <- tolower(stopwords("en"))
reference_dictionary <- c(reference_dictionary, "verify", "urgent", 
                          "account", "password", "click", "payment")

Read emails

read_emails <- function(folder_path) {
  email_files <- list.files(folder_path, full.names = TRUE)
  emails <- sapply(email_files, function(file) 
    paste(readLines(file, warn = FALSE), collapse = " "))
  return(emails)
}

3. Detection

3.1 Load Emails

emails <- read_emails(emails_folder)

3.2 Use the Force

Initialise a data frame to store results

results <- data.frame(
  Email = emails,
  stringsAsFactors = FALSE
)

Check for fuzzy matches of a keywords

check_fuzzy_match <- function(email, keyword, max_dist = 2) {
  words <- unlist(strsplit(tolower(email), "\\s+"))
  any(sapply(words, function(word) 
    stringdist(word, tolower(keyword), method = "lv") <= max_dist))
}

Calculate misspelling rate

calculate_misspelling_rate <- function(email, reference_dictionary, 
                                       max_dist = 2) {
  words <- unlist(strsplit(tolower(email), "\\s+"))
  total_words <- length(words)
  misspelled_count <- sum(sapply(words, function(word) {
    min_dist <- min(stringdist(word, reference_dictionary, method = "lv"))
    return(as.integer(min_dist > max_dist))
  }))
  return(misspelled_count / total_words)
}

Add phishing keyword columns using fuzzy matching

for (keyword in phishing_keywords) {
  col_name <- gsub(" ", "_", keyword)
  results[[col_name]] <- sapply(results$Email, function(email) {
    as.integer(check_fuzzy_match(email, keyword, max_dist = 2))
  })
}

Add misspelling rate column

results$Misspelling_Rate <- sapply(results$Email, function(email) {
  calculate_misspelling_rate(email, reference_dictionary, max_dist = 2)
})

Calculate Phishing Score (weighted sum of keyword matches + misspelling rate)

results$Phishing_Score <- rowSums(results[,-1]) + 
  results$Misspelling_Rate * 2
head(results)
##                                                                                                      Email
## emails/email_1.txt  Dear customer, urgnat action required! Please verfiy your acc0unt to avoid suspension.
## emails/email_10.txt                                  Your package has been shpped and will arive tomorrow.
## emails/email_11.txt                                       Happy birrthday! We have a special gift for you.
## emails/email_12.txt                                 Remindr: Your apointment is scheduled for next Monday.
## emails/email_13.txt                                         Join us for our anual charity event next week.
## emails/email_14.txt                      Your paymnt has been received. Thank you for your promt response.
##                     verify_your_account urgent_action_required click_here
## emails/email_1.txt                    0                      0          0
## emails/email_10.txt                   0                      0          0
## emails/email_11.txt                   0                      0          0
## emails/email_12.txt                   0                      0          0
## emails/email_13.txt                   0                      0          0
## emails/email_14.txt                   0                      0          0
##                     reset_your_password payment_information
## emails/email_1.txt                    0                   0
## emails/email_10.txt                   0                   0
## emails/email_11.txt                   0                   0
## emails/email_12.txt                   0                   0
## emails/email_13.txt                   0                   0
## emails/email_14.txt                   0                   0
##                     account_has_been_compromised sensitive_information
## emails/email_1.txt                             0                     0
## emails/email_10.txt                            0                     0
## emails/email_11.txt                            0                     0
## emails/email_12.txt                            0                     0
## emails/email_13.txt                            0                     0
## emails/email_14.txt                            0                     0
##                     unauthorized_transaction_detected
## emails/email_1.txt                                  0
## emails/email_10.txt                                 0
## emails/email_11.txt                                 0
## emails/email_12.txt                                 0
## emails/email_13.txt                                 0
## emails/email_14.txt                                 0
##                     update_your_billing_information security_alert
## emails/email_1.txt                                0              0
## emails/email_10.txt                               0              0
## emails/email_11.txt                               0              0
## emails/email_12.txt                               0              0
## emails/email_13.txt                               0              0
## emails/email_14.txt                               0              0
##                     validate_your_identity your_account_has_been_locked
## emails/email_1.txt                       0                            0
## emails/email_10.txt                      0                            0
## emails/email_11.txt                      0                            0
## emails/email_12.txt                      0                            0
## emails/email_13.txt                      0                            0
## emails/email_14.txt                      0                            0
##                     login_credentials
## emails/email_1.txt                  0
## emails/email_10.txt                 0
## emails/email_11.txt                 0
## emails/email_12.txt                 0
## emails/email_13.txt                 0
## emails/email_14.txt                 0
##                     failure_to_comply_will_result_in_account_suspension
## emails/email_1.txt                                                    0
## emails/email_10.txt                                                   0
## emails/email_11.txt                                                   0
## emails/email_12.txt                                                   0
## emails/email_13.txt                                                   0
## emails/email_14.txt                                                   0
##                     unusual_login_activity contact_customer_support
## emails/email_1.txt                       0                        0
## emails/email_10.txt                      0                        0
## emails/email_11.txt                      0                        0
## emails/email_12.txt                      0                        0
## emails/email_13.txt                      0                        0
## emails/email_14.txt                      0                        0
##                     confirm_your_subscription renew_your_membership
## emails/email_1.txt                          0                     0
## emails/email_10.txt                         0                     0
## emails/email_11.txt                         0                     0
## emails/email_12.txt                         0                     0
## emails/email_13.txt                         0                     0
## emails/email_14.txt                         0                     0
##                     unauthorized_access_attempt pending_account_closure
## emails/email_1.txt                            0                       0
## emails/email_10.txt                           0                       0
## emails/email_11.txt                           0                       0
## emails/email_12.txt                           0                       0
## emails/email_13.txt                           0                       0
## emails/email_14.txt                           0                       0
##                     Misspelling_Rate Phishing_Score
## emails/email_1.txt         0.5000000      1.5000000
## emails/email_10.txt        0.3333333      1.0000000
## emails/email_11.txt        0.3333333      1.0000000
## emails/email_12.txt        0.5000000      1.5000000
## emails/email_13.txt        0.4444444      1.3333333
## emails/email_14.txt        0.1818182      0.5454545

4. Different Settings

Different thresholds. Sometimes, clouded our vision, the dark side has.

thresholds <- seq(0.5, 3, by = 0.5)
detection_results <- data.frame()
for (threshold in thresholds) {
  results$Predicted_Label <- ifelse(results$Phishing_Score >= threshold, 
                                    "Phishing", "Legitimate")
  phishing_detected <- sum(results$Predicted_Label == "Phishing")
  detection_results <- rbind(detection_results, 
                             data.frame(Threshold = threshold, 
                                        Phishing_Detected = phishing_detected))
}

Detection results using Sir Nighteye’s Quirk

detection_results
##   Threshold Phishing_Detected
## 1       0.5                21
## 2       1.0                18
## 3       1.5                 7
## 4       2.0                 1
## 5       2.5                 0
## 6       3.0                 0
library(ggplot2)

detection_results_chart <- ggplot(detection_results, 
                                  aes(x = Threshold, 
                                      y = Phishing_Detected)) +
  geom_line(color = "#00bfff", size = 1.2) + 
  geom_point(color = "#ffffff", size = 3) +  
  theme_minimal() +
  labs(title = "Sir Nighteye's Foresight",
       x = "Threshold",
       y = "Number of Phishing Emails Detected") +
  theme(plot.background = element_rect(fill = "#2b2b2b", 
                                       color = NA),
        panel.background = element_rect(fill = "#2b2b2b", color = NA), 
        text = element_text(color = "white"),
        axis.text = element_text(color = "white"),
        axis.title = element_text(color = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        panel.grid.major = element_line(color = "#444444"),
        panel.grid.minor = element_blank())
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Added features… Sometimes Foresight reveals other things :-)

library(plotly)
interactive_chart <- ggplotly(detection_results_chart)

interactive_chart

5. Final Selection

Set the threshold to 1.5 (hypothetically). Change as needed or use the force :-)

threshold <- 1.5

Classify emails based on the selected threshold

results$Predicted_Label <- ifelse(results$Phishing_Score >= threshold, 
                                  "Phishing", "Legitimate")

Print phishing emails preview

phishing_emails <- results[results$Predicted_Label == "Phishing", ]
phishing_emails$Preview <- sapply(phishing_emails$Email, function(email) {
  paste0(strsplit(email, "\\s+")[[1]][1:5], collapse = " ")
})
phishing_emails <- phishing_emails[, c("Preview", 
                                       "Phishing_Score", "Predicted_Label")]
phishing_emails
##                                                      Preview Phishing_Score
## emails/email_1.txt    Dear customer, urgnat action required!           1.50
## emails/email_12.txt    Remindr: Your apointment is scheduled           1.50
## emails/email_15.txt Congratulatins on your recent prom0tion!           1.80
## emails/email_19.txt        Your subcription has been renewed           1.50
## emails/email_21.txt             Your feedbak is important to           1.50
## emails/email_4.txt             We noticed unsual activity in           1.75
## emails/email_5.txt     Imporrtant notice: Update your contct           2.40
##                     Predicted_Label
## emails/email_1.txt         Phishing
## emails/email_12.txt        Phishing
## emails/email_15.txt        Phishing
## emails/email_19.txt        Phishing
## emails/email_21.txt        Phishing
## emails/email_4.txt         Phishing
## emails/email_5.txt         Phishing
library(ggplot2)

phishing_bar_chart <- ggplot(phishing_emails) +
  aes(x = reorder(Preview, Phishing_Score),
      y = Phishing_Score) +
  geom_bar(stat = "identity", fill = "#444444", 
           color = "#222222") +
  theme_minimal(base_family = "sans") +
  labs(title = "The Phishing Ones",
       x = "Email Preview",
       y = "Phishing Score") +
  theme(plot.background = element_rect(fill = "#2b2b2b", 
                                       color = NA), 
        panel.background = element_rect(fill = "#2b2b2b", 
                                        color = NA), 
        text = element_text(color = "white"),
        axis.text = element_text(color = "white"),
        axis.text.x = element_text(hjust = 1),
        axis.title = element_text(color = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        panel.grid.major = element_line(color = "#444444"),
        panel.grid.minor = element_blank()) +
  coord_flip()


phishing_bar_chart

phishing_emails
##                                                      Preview Phishing_Score
## emails/email_1.txt    Dear customer, urgnat action required!           1.50
## emails/email_12.txt    Remindr: Your apointment is scheduled           1.50
## emails/email_15.txt Congratulatins on your recent prom0tion!           1.80
## emails/email_19.txt        Your subcription has been renewed           1.50
## emails/email_21.txt             Your feedbak is important to           1.50
## emails/email_4.txt             We noticed unsual activity in           1.75
## emails/email_5.txt     Imporrtant notice: Update your contct           2.40
##                     Predicted_Label
## emails/email_1.txt         Phishing
## emails/email_12.txt        Phishing
## emails/email_15.txt        Phishing
## emails/email_19.txt        Phishing
## emails/email_21.txt        Phishing
## emails/email_4.txt         Phishing
## emails/email_5.txt         Phishing
phishing_emails$file_name <- basename(rownames(phishing_emails))


phishing_bar_chart_2 <- ggplot(phishing_emails) +
  aes(x = reorder(file_name, Phishing_Score), y = Phishing_Score) +
  geom_bar(stat = "identity", fill = "#444444",
           color = "#222222") +
  theme_minimal(base_family = "sans") +
  labs(title = "The Phishing Ones",
       x = "Email Preview",
       y = "Phishing Score") +
  theme(plot.background = element_rect(fill = "#2b2b2b", 
                                       color = NA), 
        panel.background = element_rect(fill = "#2b2b2b", 
                                        color = NA), 
        text = element_text(color = "white"),
        axis.text = element_text(color = "white"),
        axis.text.x = element_text(hjust = 1),
        axis.title = element_text(color = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        panel.grid.major = element_line(color = "#444444"),
        panel.grid.minor = element_blank()) +
  coord_flip()


phishing_bar_chart_2

Added features…

interactive_chart_2 <- ggplotly(phishing_bar_chart_2)

interactive_chart_2

Decorative Image