Back to the Spell Book

1. Libraries

library(stringdist)
library(tm)
library(ggplot2)
library(plotly)

2. Set up

Define the path to the folders

# Folder containing the email text files to analyse (relative path).
emails_folder <- "emails_v2/"

# Folder containing the keyword list files (relative path).
keywords_folder <- "keywords/"

Load signs of phishing

# Read the keyword file into a character vector, one entry per line
# (sep = "\n"); quiet = TRUE suppresses scan()'s item-count message.
phishing_keywords <- scan(paste0(keywords_folder, "phishing_keywords.txt"), 
                          what = "", sep = "\n", quiet = TRUE)

Define reference dictionary for misspelling

# Reference vocabulary for the misspelling check: the lower-cased result of
# stopwords("en") followed by six domain-specific terms.
reference_dictionary <- c(
  tolower(stopwords("en")),
  "verify", "urgent", "account", "password", "click", "payment"
)

Read emails

# Read every file in a folder into one string per file.
#
# Args:
#   folder_path: Path to the directory that holds the email text files.
#
# Returns:
#   A character vector with one element per file (the file's lines joined
#   by a single space), named by the file path returned by list.files().
#   A folder without files yields a zero-length character vector.
read_emails <- function(folder_path) {
  email_files <- list.files(folder_path, full.names = TRUE)
  # list.files() also lists sub-directories, which readLines() cannot open.
  email_files <- email_files[!dir.exists(email_files)]
  # vapply() always returns a character vector; sapply() would return an
  # empty list when no files are found.
  vapply(
    email_files,
    function(file) paste(readLines(file, warn = FALSE), collapse = " "),
    character(1)
  )
}

3. Detection

3.1 Load Emails

# Load the contents of every file in emails_folder, one string per file.
emails <- read_emails(emails_folder)

3.2 Use the Force

Initialise a data frame to store results

# One row per email; the feature columns computed below are appended to
# this data frame. stringsAsFactors = FALSE keeps the text as character.
results <- data.frame(
  Email = emails,
  stringsAsFactors = FALSE
)

Check for fuzzy matches of a keyword

# Test whether an email contains a word or phrase close to a keyword.
#
# The email is lower-cased and split on whitespace. It is then compared
# with the keyword over windows holding as many consecutive words as the
# keyword has, so a multi-word keyword is matched against phrases of the
# email rather than against single tokens (which in practice are never
# within `max_dist` edits of a whole phrase).
#
# Args:
#   email:    A single string holding the email text.
#   keyword:  The keyword or phrase to look for; case is ignored.
#   max_dist: Largest Levenshtein distance that still counts as a match.
#
# Returns:
#   TRUE if any window is within `max_dist` edits of the keyword, otherwise
#   FALSE (also for an email or keyword without any words).
check_fuzzy_match <- function(email, keyword, max_dist = 2) {
  words <- unlist(strsplit(tolower(email), "\\s+"))
  # Leading whitespace produces an empty first token; it is not a word.
  words <- words[nzchar(words)]
  key_words <- unlist(strsplit(tolower(keyword), "\\s+"))
  key_words <- key_words[nzchar(key_words)]
  n_key <- length(key_words)
  if (n_key == 0L || length(words) < n_key) {
    return(FALSE)
  }
  if (n_key == 1L) {
    candidates <- words
  } else {
    starts <- seq_len(length(words) - n_key + 1L)
    candidates <- vapply(
      starts,
      function(i) paste(words[i:(i + n_key - 1L)], collapse = " "),
      character(1)
    )
  }
  target <- paste(key_words, collapse = " ")
  # stringdist() is vectorised over `candidates`; no per-word loop needed.
  any(stringdist(candidates, target, method = "lv") <= max_dist)
}

Calculate misspelling rate

# Share of an email's words that are far from every dictionary entry.
#
# The email is lower-cased and split on whitespace. A word counts as
# misspelled when its smallest Levenshtein distance to any entry of
# `reference_dictionary` is greater than `max_dist`.
#
# Args:
#   email:                A single string holding the email text.
#   reference_dictionary: Character vector of reference words.
#   max_dist:             Largest distance at which a word still counts as
#                         matching a dictionary entry.
#
# Returns:
#   A number between 0 and 1; 0 for an email that contains no words.
calculate_misspelling_rate <- function(email, reference_dictionary, 
                                       max_dist = 2) {
  words <- unlist(strsplit(tolower(email), "\\s+"))
  # Leading whitespace yields an empty first token that would otherwise be
  # counted as a word.
  words <- words[nzchar(words)]
  if (length(words) == 0L) {
    # Avoids 0 / 0 and a sum() over an empty list.
    return(0)
  }
  # Each distinct word is scored once and mapped back to every occurrence.
  unique_words <- unique(words)
  is_misspelled <- vapply(unique_words, function(word) {
    min(stringdist(word, reference_dictionary, method = "lv")) > max_dist
  }, logical(1), USE.NAMES = FALSE)
  sum(is_misspelled[match(words, unique_words)]) / length(words)
}

Add phishing keyword columns using fuzzy matching

# Add one 0/1 column per phishing keyword; spaces in the keyword become
# underscores in the column name.
for (keyword in phishing_keywords) {
  col_name <- gsub(" ", "_", keyword)
  # vapply() pins the result to one integer per email and, unlike sapply(),
  # does not label each value with the full email text.
  results[[col_name]] <- vapply(
    results$Email,
    function(email) as.integer(check_fuzzy_match(email, keyword, max_dist = 2)),
    integer(1),
    USE.NAMES = FALSE
  )
}

Add misspelling rate column

# Misspelling rate of every email against the reference dictionary.
# vapply() guarantees a numeric vector with exactly one value per email.
results$Misspelling_Rate <- vapply(
  results$Email,
  function(email) {
    calculate_misspelling_rate(email, reference_dictionary, max_dist = 2)
  },
  numeric(1),
  USE.NAMES = FALSE
)

Suspicious URL detection → AI vs AI

# Flag (1/0) emails that link to a suspicious domain.
# The domain patterns are anchored with `$`, so they are tested against the
# host part of each URL only. Tested against the whole URL, a path
# ("http://evil.xyz/login") or trailing punctuation ("http://evil.ru.")
# would hide the domain from the anchors.
results$Suspicious_URL <- vapply(results$Email, function(email) {
  urls <- stringr::str_extract_all(email, "http[s]?://[^\\s]+")[[1]]
  # Keep what follows the scheme up to the first "/", ":", "?" or "#".
  hosts <- tolower(sub("^https?://([^/:?#]+).*$", "\\1", urls))
  # Drop sentence punctuation that was glued to the end of the link.
  hosts <- sub("[[:punct:]]+$", "", hosts)
  as.integer(any(grepl("\\.xyz$|\\.ru$|phishing\\.com$", hosts)))
}, integer(1), USE.NAMES = FALSE)

head(results$Suspicious_URL)
## [1] 1 0 0 0 0 0

AI-like writing detection → AI vs AI

# Count, per email, the occurrences of a few "polished" marketing words.
# str_count() is vectorised over the patterns, so sum() adds up the hits of
# all three words. vapply() guarantees one integer per email.
results$AI_Like_Content <- vapply(results$Email, function(email) {
  polished_words <- c("unprecedented", "effortlessly", "exclusively")
  sum(stringr::str_count(email, polished_words))
}, integer(1), USE.NAMES = FALSE)

head(results$AI_Like_Content)
## [1] 0 0 0 0 0 0

Something phishy? Calculate phishing scores.

# Combine the features into one score per email.
# Feature columns are selected by name rather than by position, so running
# this statement again (once Phishing_Score or other bookkeeping columns
# exist) gives the same result instead of folding the old score back in or
# failing on a text column.
# NOTE(review): Misspelling_Rate, Suspicious_URL and AI_Like_Content are
# part of the row sum and are weighted again below, giving them effective
# weights of 3, 4 and 2.5 - confirm this is intended.
non_feature_cols <- c("Email", "Phishing_Score", "Predicted_Label",
                      "File_Name")
feature_cols <- setdiff(names(results), non_feature_cols)
results$Phishing_Score <- rowSums(results[, feature_cols, drop = FALSE]) +
                          results$Misspelling_Rate * 2 +
                          results$Suspicious_URL * 3 +
                          results$AI_Like_Content * 1.5

head(results$Phishing_Score)
## [1] 5.600000 1.928571 1.000000 1.000000 1.500000 1.333333

4. Different Settings

Different thresholds. Sometimes, clouded our vision, the dark side has.

# Count how many emails would be flagged at each candidate threshold.
# The table is built in one step instead of being grown with rbind() inside
# a loop, and no provisional Predicted_Label is written into `results`
# (the original loop left the labels for the last threshold, 3.0, behind).
thresholds <- seq(0.5, 3, by = 0.5)
detection_results <- data.frame(
  Threshold = thresholds,
  Phishing_Detected = vapply(
    thresholds,
    function(threshold) sum(results$Phishing_Score >= threshold),
    integer(1)
  )
)

Detection results using Sir Nighteye’s Quirk

# Print the number of flagged emails at each threshold.
detection_results
##   Threshold Phishing_Detected
## 1       0.5                22
## 2       1.0                19
## 3       1.5                 8
## 4       2.0                 2
## 5       2.5                 2
## 6       3.0                 1
library(ggplot2)

# Line chart of the number of flagged emails per threshold, drawn in light
# colours on a dark (#2b2b2b) background.
detection_results_chart <- ggplot(
  detection_results,
  aes(x = Threshold, y = Phishing_Detected)
) +
  geom_line(size = 1.2, color = "#00bfff") +
  geom_point(size = 3, color = "#ffffff") +
  labs(
    title = "Sir Nighteye's Foresight",
    x = "Threshold",
    y = "Number of Phishing Emails Detected"
  ) +
  theme_minimal() +
  theme(
    # Dark canvas for both the plot area and the panel.
    plot.background = element_rect(fill = "#2b2b2b", color = NA),
    panel.background = element_rect(fill = "#2b2b2b", color = NA),
    # Subtle major grid, no minor grid.
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank(),
    # White text throughout; centred bold title.
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
  )

Added features… Sometimes Foresight reveals other things :-)

library(plotly)
# Convert the ggplot threshold chart with plotly's ggplotly().
interactive_chart <- ggplotly(detection_results_chart)

# Display the converted chart.
interactive_chart

Visualise suspicious URLs.

# Label each row with its file name; the row names hold the file paths.
results$File_Name <- basename(rownames(results))
# Order the factor levels by the number embedded in "email_<n>..." so that
# email_2 comes before email_10 on the chart axis.
file_number <- as.numeric(gsub("email_(\\d+).*", "\\1", results$File_Name))
results$File_Name <- factor(results$File_Name,
                            levels = results$File_Name[order(file_number)])
# results$Suspicious_URL was already computed during detection and the
# email text has not changed since, so it is not recalculated here.


# Lollipop chart of the suspicious-URL flag per email file.
# File_Name is used directly: it is already a factor whose levels are in
# the desired order. reorder(File_Name, File_Name) took the mean() of a
# factor, which only raised warnings and left the order unchanged.
url_chart <- ggplot(results, aes(x = File_Name, y = Suspicious_URL)) +
  # The stem runs from 0 to the flag value; x is inherited from the plot.
  geom_segment(aes(xend = File_Name, y = 0, yend = Suspicious_URL),
               color = "#ff6347", size = 1) +
  geom_point(color = "#8b0000", size = 4) +
  # File names are easier to read on the vertical axis.
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Suspicious URLs Detected in Emails",
    x = "Email File Name",
    y = "Suspicious URL Flag (1 = Suspicious)"
  ) +
  theme(
    plot.background = element_rect(fill = "#2b2b2b", color = NA),
    panel.background = element_rect(fill = "#2b2b2b", color = NA),
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(color = "white"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank()
  )

url_chart

Visualise AI-like writing.

# Binary "AI-like writing" flag per email, used for this chart only.
# The flag is kept in its own data frame. results$AI_Like_Content holds the
# word counts that went into Phishing_Score; overwriting that column with a
# differently defined flag would leave `results` out of step with the
# scores already computed.
polished_words <- c("effortless", "exclusive", "unprecedented")
ai_chart_data <- data.frame(
  File_Name = results$File_Name,
  AI_Like_Content = vapply(results$Email, function(email) {
    as.integer(sum(stringr::str_count(email, polished_words)) > 0)
  }, integer(1), USE.NAMES = FALSE)
)

# Lollipop chart of the flag per email file. File_Name is already a factor
# in the desired order, so it is used directly instead of through
# reorder(File_Name, File_Name), which only raised warnings.
ai_chart <- ggplot(ai_chart_data, aes(x = File_Name, y = AI_Like_Content)) +
  geom_segment(aes(xend = File_Name, y = 0, yend = AI_Like_Content),
               color = "#4682b4", size = 1) +
  geom_point(color = "#1e90ff", size = 4) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "AI-Like Writing Detected in Emails",
    x = "Email File Name",
    y = "AI Writing Flag (1 = Detected)"
  ) +
  theme(
    plot.background = element_rect(fill = "#2b2b2b", color = NA),
    panel.background = element_rect(fill = "#2b2b2b", color = NA),
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(color = "white"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank()
  )
ai_chart

5. Final Selection

Set the threshold to 1.5 (hypothetically). Change as needed or use the force :-)

# Score at or above which an email is labelled "Phishing" below.
threshold <- 1.5

Classify emails based on the selected threshold

# Label each email: scores below the threshold are "Legitimate", all
# others "Phishing".
results$Predicted_Label <- ifelse(
  results$Phishing_Score < threshold,
  "Legitimate",
  "Phishing"
)

Export flagged emails (if desired).

# Keep the emails whose score reaches the selected threshold. Using
# `threshold` instead of a second literal 1.5 keeps this export in step
# with Predicted_Label when the threshold is changed.
flagged_emails <- results[results$Phishing_Score >= threshold, ]

# write.csv(flagged_emails, "flagged_phishing_emails.csv", row.names = FALSE)

Print phishing emails preview

# Build a compact table of the emails labelled "Phishing": a preview made of
# the first words of the text, the score and the label.
phishing_emails <- results[results$Predicted_Label == "Phishing", ]
phishing_emails$Preview <- vapply(phishing_emails$Email, function(email) {
  words <- strsplit(email, "\\s+")[[1]]
  # head() stops at the end of a short email; indexing with [1:5] would pad
  # the preview with literal "NA" words.
  paste(head(words, 5), collapse = " ")
}, character(1), USE.NAMES = FALSE)
phishing_emails <- phishing_emails[, c("Preview", 
                                       "Phishing_Score", "Predicted_Label")]
phishing_emails
##                                                         Preview Phishing_Score
## emails_v2/email_1.txt    Dear customer, urgent action required!       5.600000
## emails_v2/email_10.txt  Experience effortless shopping with our       1.928571
## emails_v2/email_13.txt    Remindr: Your apointment is scheduled       1.500000
## emails_v2/email_16.txt Congratulatins on your recent prom0tion!       1.800000
## emails_v2/email_20.txt        Your subcription has been renewed       1.500000
## emails_v2/email_22.txt             Your feedbak is important to       1.500000
## emails_v2/email_4.txt             We noticed unsual activity in       1.687500
## emails_v2/email_5.txt     Imporrtant notice: Update your contct       2.500000
##                        Predicted_Label
## emails_v2/email_1.txt         Phishing
## emails_v2/email_10.txt        Phishing
## emails_v2/email_13.txt        Phishing
## emails_v2/email_16.txt        Phishing
## emails_v2/email_20.txt        Phishing
## emails_v2/email_22.txt        Phishing
## emails_v2/email_4.txt         Phishing
## emails_v2/email_5.txt         Phishing
library(ggplot2)

# Horizontal bar chart of the phishing score per flagged email, with bars
# ordered by score and labelled with the email preview.
phishing_bar_chart <- ggplot(
  phishing_emails,
  aes(x = reorder(Preview, Phishing_Score), y = Phishing_Score)
) +
  # geom_col() draws bars whose height is the y value itself.
  geom_col(fill = "#444444", color = "#222222") +
  coord_flip() +
  labs(
    title = "The Phishing Ones",
    x = "Email Preview",
    y = "Phishing Score"
  ) +
  theme_minimal(base_family = "sans") +
  theme(
    # Dark canvas for both the plot area and the panel.
    plot.background = element_rect(fill = "#2b2b2b", color = NA),
    panel.background = element_rect(fill = "#2b2b2b", color = NA),
    # Subtle major grid, no minor grid.
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank(),
    # White text throughout; centred bold title.
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.text.x = element_text(hjust = 1),
    axis.title = element_text(color = "white"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
  )

phishing_bar_chart

# Print the table of emails labelled as phishing.
phishing_emails
##                                                         Preview Phishing_Score
## emails_v2/email_1.txt    Dear customer, urgent action required!       5.600000
## emails_v2/email_10.txt  Experience effortless shopping with our       1.928571
## emails_v2/email_13.txt    Remindr: Your apointment is scheduled       1.500000
## emails_v2/email_16.txt Congratulatins on your recent prom0tion!       1.800000
## emails_v2/email_20.txt        Your subcription has been renewed       1.500000
## emails_v2/email_22.txt             Your feedbak is important to       1.500000
## emails_v2/email_4.txt             We noticed unsual activity in       1.687500
## emails_v2/email_5.txt     Imporrtant notice: Update your contct       2.500000
##                        Predicted_Label
## emails_v2/email_1.txt         Phishing
## emails_v2/email_10.txt        Phishing
## emails_v2/email_13.txt        Phishing
## emails_v2/email_16.txt        Phishing
## emails_v2/email_20.txt        Phishing
## emails_v2/email_22.txt        Phishing
## emails_v2/email_4.txt         Phishing
## emails_v2/email_5.txt         Phishing
# Short file name of each flagged email; the row names hold the file paths.
phishing_emails$file_name <- basename(rownames(phishing_emails))

# Same bar chart as above, but labelled by file name instead of preview.
# The axis title now says so; it previously still read "Email Preview".
phishing_bar_chart_2 <- ggplot(phishing_emails) +
  aes(x = reorder(file_name, Phishing_Score), y = Phishing_Score) +
  geom_bar(stat = "identity", fill = "#444444",
           color = "#222222") +
  theme_minimal(base_family = "sans") +
  labs(title = "The Phishing Ones",
       x = "Email File Name",
       y = "Phishing Score") +
  theme(plot.background = element_rect(fill = "#2b2b2b", 
                                       color = NA), 
        panel.background = element_rect(fill = "#2b2b2b", 
                                        color = NA), 
        text = element_text(color = "white"),
        axis.text = element_text(color = "white"),
        axis.text.x = element_text(hjust = 1),
        axis.title = element_text(color = "white"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        panel.grid.major = element_line(color = "#444444"),
        panel.grid.minor = element_blank()) +
  # Horizontal bars leave room for the file names.
  coord_flip()

phishing_bar_chart_2

Added features…

# Convert the file-name bar chart with plotly's ggplotly().
interactive_chart_2 <- ggplotly(phishing_bar_chart_2)

# Display the converted chart.
interactive_chart_2

Decorative Image