Define the path to the folders
Load signs of phishing
# Read the phishing keyword list, one entry per line of the text file.
# NOTE(review): `keywords_folder` is defined outside this excerpt; the path is
# built with paste0(), so it presumably already ends in a path separator.
phishing_keywords <- scan(
  file = paste0(keywords_folder, "phishing_keywords.txt"),
  what = character(),
  sep = "\n",
  quiet = TRUE
)
Define reference dictionary for misspelling
# Vocabulary used as the "correctly spelled" reference: the lower-cased
# English stop-word list followed by a few extra domain terms.
reference_dictionary <- c(
  tolower(stopwords("en")),
  "verify", "urgent", "account", "password", "click", "payment"
)
Read emails
Initialise a data frame to store results
Check for fuzzy matches of a keyword
# Does `email` contain `keyword`, allowing for small typos?
#
# The email is lower-cased and split on whitespace. Each token is compared
# with the lower-cased keyword using Levenshtein distance. When the keyword
# consists of several words, every run of that many consecutive tokens is
# compared as a phrase too, so a keyword such as "verify your account" can
# actually be found (single tokens alone can practically never be within a
# couple of edits of a whole phrase).
#
# Args:
#   email:    Character scalar, the email text.
#   keyword:  Character scalar, the keyword or phrase to look for.
#   max_dist: Largest edit distance still counted as a match.
#
# Returns:
#   TRUE if any token or phrase is within `max_dist` edits of the keyword,
#   otherwise FALSE (including for empty or missing text).
check_fuzzy_match <- function(email, keyword, max_dist = 2) {
  words <- unlist(strsplit(tolower(email), "\\s+"))
  # Leading whitespace yields an empty first token; it is not a real word.
  words <- words[!is.na(words) & nzchar(words)]
  keyword <- tolower(keyword)

  candidates <- words
  n_terms <- length(strsplit(trimws(keyword), "\\s+")[[1]])
  if (n_terms > 1L && length(words) >= n_terms) {
    starts <- seq_len(length(words) - n_terms + 1L)
    phrases <- vapply(
      starts,
      function(i) paste(words[i:(i + n_terms - 1L)], collapse = " "),
      character(1)
    )
    candidates <- c(words, phrases)
  }

  if (length(candidates) == 0L) {
    return(FALSE)
  }
  # stringdist() is vectorised over its first argument, so one call suffices.
  any(stringdist(candidates, keyword, method = "lv") <= max_dist, na.rm = TRUE)
}
Calculate misspelling rate
# Share of words in `email` that are not close to any dictionary entry.
#
# The email is lower-cased and split on whitespace. A word counts as
# misspelled when its smallest Levenshtein distance to the entries of
# `reference_dictionary` is greater than `max_dist`.
#
# Args:
#   email:                Character scalar, the email text.
#   reference_dictionary: Character vector of reference words.
#   max_dist:             Largest edit distance still treated as "known".
#
# Returns:
#   A number in [0, 1]; 0 when the email contains no words at all.
calculate_misspelling_rate <- function(email, reference_dictionary,
                                       max_dist = 2) {
  words <- unlist(strsplit(tolower(email), "\\s+"))
  # Drop the empty token produced by leading whitespace so it does not
  # inflate the word count.
  words <- words[nzchar(words)]
  total_words <- length(words)
  if (total_words == 0L) {
    # Nothing to score; avoids dividing zero by zero.
    return(0)
  }

  is_misspelled <- vapply(
    words,
    function(word) {
      min(stringdist(word, reference_dictionary, method = "lv")) > max_dist
    },
    logical(1),
    USE.NAMES = FALSE
  )
  sum(is_misspelled) / total_words
}
Add phishing keyword columns using fuzzy matching
# Add one 0/1 column per phishing keyword. The column is named after the
# keyword with spaces replaced by underscores and holds 1 where the email
# fuzzily matches that keyword.
for (keyword in phishing_keywords) {
  col_name <- gsub(" ", "_", keyword, fixed = TRUE)
  # vapply() guarantees an integer vector even when `results` has no rows,
  # where sapply() would hand back an empty list.
  results[[col_name]] <- vapply(
    results$Email,
    function(email) {
      as.integer(check_fuzzy_match(email, keyword, max_dist = 2))
    },
    integer(1),
    USE.NAMES = FALSE
  )
}
Add misspelling rate column
# Per-email share of words that are far from every reference dictionary
# entry. vapply() pins the result to a numeric vector, including for a
# zero-row `results`.
results$Misspelling_Rate <- vapply(
  results$Email,
  function(email) {
    calculate_misspelling_rate(email, reference_dictionary, max_dist = 2)
  },
  numeric(1),
  USE.NAMES = FALSE
)
Suspicious URL detection –> AI vs AI
# Flag emails (1/0) that link to a suspicious host.
#
# The suspicious endings are anchored with `$`, so they must be tested
# against the host name, not the full URL: otherwise a link such as
# "http://evil.xyz/login" or one followed by a full stop is never flagged,
# while "http://good.com/file.xyz" is.
results$Suspicious_URL <- vapply(
  results$Email,
  function(email) {
    urls <- stringr::str_extract_all(email, "http[s]?://[^\\s]+")[[1]]
    # Keep only the authority part (everything before the first /, ? or #).
    hosts <- tolower(sub("^https?://([^/?#]*).*$", "\\1", urls))
    # Strip sentence punctuation glued to the link, then an explicit port.
    hosts <- sub("[[:punct:]]+$", "", hosts)
    hosts <- sub(":[0-9]+$", "", hosts)
    as.integer(any(grepl("\\.xyz$|\\.ru$|phishing\\.com$", hosts)))
  },
  integer(1),
  USE.NAMES = FALSE
)
head(results$Suspicious_URL)
## [1] 1 0 0 0 0 0
AI-like writing detection –> AI vs AI
# Count occurrences of "polished" marketing words in each email.
# The text is lower-cased first so that capitalised occurrences (for example
# at the start of a sentence) are counted as well.
results$AI_Like_Content <- vapply(
  results$Email,
  function(email) {
    polished_words <- c("unprecedented", "effortlessly", "exclusively")
    # str_count() is vectorised over the patterns; sum the per-word counts.
    sum(stringr::str_count(tolower(email), polished_words))
  },
  integer(1),
  USE.NAMES = FALSE
)
head(results$AI_Like_Content)
## [1] 0 0 0 0 0 0
Something phishy? Calculate phishing scores.
# Combine all features into a single phishing score.
#
# The base term sums every numeric/logical feature column. Columns are picked
# by type rather than by position so that the text column is skipped wherever
# it sits, and so that re-running this step does not fold a previous
# Phishing_Score (or later label / file-name columns) into the sum.
# NOTE(review): this assumes the only non-feature column present at this
# point is the email text - confirm against the data frame initialisation.
#
# NOTE(review): Misspelling_Rate, Suspicious_URL and AI_Like_Content are part
# of the base sum AND added again with weights, so their effective weights
# are 3, 4 and 2.5. Kept as is to preserve the published scores - confirm
# whether the double counting is intended.
score_cols <- setdiff(
  names(results)[vapply(
    results,
    function(column) is.numeric(column) || is.logical(column),
    logical(1)
  )],
  "Phishing_Score"
)
results$Phishing_Score <- rowSums(results[, score_cols, drop = FALSE]) +
  results$Misspelling_Rate * 2 +
  results$Suspicious_URL * 3 +
  results$AI_Like_Content * 1.5
head(results$Phishing_Score)
## [1] 5.600000 1.928571 1.000000 1.000000 1.500000 1.333333
Different thresholds. Sometimes, clouded our vision, the dark side has.
# Count, for every candidate threshold, how many emails score at or above it
# and append one row per threshold to `detection_results`.
# The counts are computed in one pass and bound once, instead of growing the
# data frame with rbind() inside a loop.
phishing_counts <- vapply(
  thresholds,
  function(threshold) sum(results$Phishing_Score >= threshold),
  integer(1),
  USE.NAMES = FALSE
)
detection_results <- rbind(
  detection_results,
  data.frame(
    Threshold = unname(thresholds),
    Phishing_Detected = phishing_counts
  )
)

# As before, Predicted_Label ends up reflecting the last threshold tried.
if (length(thresholds) > 0) {
  results$Predicted_Label <- ifelse(
    results$Phishing_Score >= thresholds[[length(thresholds)]],
    "Phishing", "Legitimate"
  )
}
Detection results using Sir Nighteye’s Quirk
## Threshold Phishing_Detected
## 1 0.5 22
## 2 1.0 19
## 3 1.5 8
## 4 2.0 2
## 5 2.5 2
## 6 3.0 1
library(ggplot2)

# Line chart of how many emails are flagged at each score threshold,
# drawn on a dark background.
detection_results_chart <- ggplot(detection_results) +
  aes(x = Threshold, y = Phishing_Detected) +
  # Line thickness is set with `linewidth`; `size` is deprecated for lines.
  geom_line(color = "#00bfff", linewidth = 1.2) +
  geom_point(color = "#ffffff", size = 3) +
  theme_minimal() +
  labs(
    title = "Sir Nighteye's Foresight",
    x = "Threshold",
    y = "Number of Phishing Emails Detected"
  ) +
  theme(
    plot.background = element_rect(fill = "#2b2b2b", color = NA),
    panel.background = element_rect(fill = "#2b2b2b", color = NA),
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank()
  )
Added features… Sometimes Foresight reveals other things :-)
Visualise suspicious URLs.
# Label every email by its file name (taken from the row names) and order the
# factor levels by the number embedded in "email_<n>...", so that email_2
# sorts before email_10.
results$File_Name <- basename(rownames(results))
email_number <- as.numeric(gsub("email_(\\d+).*", "\\1", results$File_Name))
results$File_Name <- factor(
  results$File_Name,
  levels = unique(results$File_Name[order(email_number)])
)

# The flag is normally already present from the scoring step; it is only
# computed here when missing, so the chart always shows the same values that
# went into the score.
if (!"Suspicious_URL" %in% names(results)) {
  results$Suspicious_URL <- vapply(
    results$Email,
    function(email) {
      urls <- stringr::str_extract_all(email, "http[s]?://[^\\s]+")[[1]]
      as.integer(any(grepl("\\.xyz$|\\.ru$|phishing\\.com$", urls)))
    },
    integer(1),
    USE.NAMES = FALSE
  )
}

# Lollipop chart of the suspicious-URL flag per email.
# File_Name is mapped directly: wrapping it in reorder(File_Name, File_Name)
# left the order unchanged and only produced warnings, because the mean of a
# factor is undefined.
url_chart <- ggplot(results, aes(x = File_Name, y = Suspicious_URL)) +
  geom_segment(
    aes(xend = File_Name, y = 0, yend = Suspicious_URL),
    color = "#ff6347",
    linewidth = 1
  ) +
  geom_point(color = "#8b0000", size = 4) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Suspicious URLs Detected in Emails",
    x = "Email File Name",
    y = "Suspicious URL Flag (1 = Suspicious)"
  ) +
  theme(
    plot.background = element_rect(fill = "#2b2b2b", color = NA),
    panel.background = element_rect(fill = "#2b2b2b", color = NA),
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(color = "white"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank()
  )
url_chart
Visualise AI-like writing.
# Binary flag: 1 when the email contains any of the "polished" word stems.
# The text is lower-cased first so capitalised occurrences are found too.
# NOTE(review): this overwrites the AI_Like_Content count that fed into
# Phishing_Score with a 0/1 flag based on a different word list - confirm
# that replacing the column (rather than adding a new one) is intended.
results$AI_Like_Content <- vapply(
  results$Email,
  function(email) {
    polished_words <- c("effortless", "exclusive", "unprecedented")
    as.integer(sum(stringr::str_count(tolower(email), polished_words)) > 0)
  },
  integer(1),
  USE.NAMES = FALSE
)

# Lollipop chart of the AI-writing flag per email.
# File_Name is mapped directly: reorder(File_Name, File_Name) did not change
# the order and only emitted warnings (the mean of a factor is undefined).
ai_chart <- ggplot(results) +
  aes(x = File_Name, y = AI_Like_Content) +
  geom_segment(
    aes(xend = File_Name, y = 0, yend = AI_Like_Content),
    color = "#4682b4",
    linewidth = 1
  ) +
  geom_point(color = "#1e90ff", size = 4) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "AI-Like Writing Detected in Emails",
    x = "Email File Name",
    y = "AI Writing Flag (1 = Detected)"
  ) +
  theme(
    plot.background = element_rect(fill = "#2b2b2b", color = NA),
    panel.background = element_rect(fill = "#2b2b2b", color = NA),
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(color = "white"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank()
  )
ai_chart
Set the threshold to 1.5 (hypothetically). Change as needed or use the force :-)
Classify emails based on the selected threshold
Export flagged emails (if desired).
# Keep every row whose phishing score reaches the 1.5 cut-off.
meets_cutoff <- results$Phishing_Score >= 1.5
flagged_emails <- results[meets_cutoff, ]
# Optional export of the flagged rows:
# write.csv(flagged_emails, "flagged_phishing_emails.csv", row.names = FALSE)
Print phishing emails preview
# Short preview of each flagged email: its first five whitespace-separated
# words joined by single spaces.
phishing_emails$Preview <- vapply(
  phishing_emails$Email,
  function(email) {
    words <- strsplit(email, "\\s+")[[1]]
    # head() stops at the end of the text; indexing with [1:5] padded emails
    # shorter than five words with literal "NA" entries.
    paste0(head(words, 5), collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)
## Preview Phishing_Score
## emails_v2/email_1.txt Dear customer, urgent action required! 5.600000
## emails_v2/email_10.txt Experience effortless shopping with our 1.928571
## emails_v2/email_13.txt Remindr: Your apointment is scheduled 1.500000
## emails_v2/email_16.txt Congratulatins on your recent prom0tion! 1.800000
## emails_v2/email_20.txt Your subcription has been renewed 1.500000
## emails_v2/email_22.txt Your feedbak is important to 1.500000
## emails_v2/email_4.txt We noticed unsual activity in 1.687500
## emails_v2/email_5.txt Imporrtant notice: Update your contct 2.500000
## Predicted_Label
## emails_v2/email_1.txt Phishing
## emails_v2/email_10.txt Phishing
## emails_v2/email_13.txt Phishing
## emails_v2/email_16.txt Phishing
## emails_v2/email_20.txt Phishing
## emails_v2/email_22.txt Phishing
## emails_v2/email_4.txt Phishing
## emails_v2/email_5.txt Phishing
library(ggplot2)

# Horizontal bar chart of the flagged emails, labelled by their text preview
# and sorted by phishing score, on a dark background.
dark_fill <- element_rect(fill = "#2b2b2b", color = NA)
white_text <- element_text(color = "white")

phishing_bar_chart <- ggplot(
  phishing_emails,
  aes(x = reorder(Preview, Phishing_Score), y = Phishing_Score)
) +
  geom_col(fill = "#444444", color = "#222222") +
  coord_flip() +
  labs(
    title = "The Phishing Ones",
    x = "Email Preview",
    y = "Phishing Score"
  ) +
  theme_minimal(base_family = "sans") +
  theme(
    plot.background = dark_fill,
    panel.background = dark_fill,
    text = white_text,
    axis.text = white_text,
    axis.text.x = element_text(hjust = 1),
    axis.title = white_text,
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank()
  )
phishing_bar_chart
## Preview Phishing_Score
## emails_v2/email_1.txt Dear customer, urgent action required! 5.600000
## emails_v2/email_10.txt Experience effortless shopping with our 1.928571
## emails_v2/email_13.txt Remindr: Your apointment is scheduled 1.500000
## emails_v2/email_16.txt Congratulatins on your recent prom0tion! 1.800000
## emails_v2/email_20.txt Your subcription has been renewed 1.500000
## emails_v2/email_22.txt Your feedbak is important to 1.500000
## emails_v2/email_4.txt We noticed unsual activity in 1.687500
## emails_v2/email_5.txt Imporrtant notice: Update your contct 2.500000
## Predicted_Label
## emails_v2/email_1.txt Phishing
## emails_v2/email_10.txt Phishing
## emails_v2/email_13.txt Phishing
## emails_v2/email_16.txt Phishing
## emails_v2/email_20.txt Phishing
## emails_v2/email_22.txt Phishing
## emails_v2/email_4.txt Phishing
## emails_v2/email_5.txt Phishing
# Same ranking as the preview chart, but labelled by file name (the base name
# of each row name) instead of the text preview.
phishing_emails$file_name <- basename(rownames(phishing_emails))

phishing_bar_chart_2 <- ggplot(phishing_emails) +
  aes(x = reorder(file_name, Phishing_Score), y = Phishing_Score) +
  geom_col(fill = "#444444", color = "#222222") +
  theme_minimal(base_family = "sans") +
  labs(
    title = "The Phishing Ones",
    # The axis shows file names here, not previews, so label it accordingly.
    x = "Email File Name",
    y = "Phishing Score"
  ) +
  theme(
    plot.background = element_rect(fill = "#2b2b2b", color = NA),
    panel.background = element_rect(fill = "#2b2b2b", color = NA),
    text = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    axis.text.x = element_text(hjust = 1),
    axis.title = element_text(color = "white"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    panel.grid.major = element_line(color = "#444444"),
    panel.grid.minor = element_blank()
  ) +
  coord_flip()
phishing_bar_chart_2
Added features…