library(chromote)
library(rvest)
library(xml2)
library(dplyr)
library(purrr)
library(stringr)
library(lubridate)
# ===============================
# Define Date Ranges (monthly windows, Jan 2018 - Dec 2024)
# ===============================
start_dates <- seq(ymd("2018-01-01"), ymd("2024-12-31"), by = "1 month")
end_dates <- c(start_dates[-1] - days(1), ymd("2024-12-31"))
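# Quick sanity check of the monthly windows (illustrative only, not part of
# the original pipeline; wrapped in if (FALSE) so it never runs on source).
if (FALSE) {
head(tibble(start = start_dates, end = end_dates))
}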
# ===============================
# Faster Chromote Article Scraper (reuses one browser session)
# ===============================
get_article_text_chromote <- function(browser, url, wait_time = 2) {
tryCatch({
message(" π₯ Loading: ", substr(url, 1, 60), "...")
# Navigate to URL
browser$Page$navigate(url, wait_ = FALSE)
Sys.sleep(wait_time) # Reduced wait time
# Get rendered HTML
result <- browser$Runtime$evaluate("document.documentElement.outerHTML")
page <- read_html(result$result$value)
# Extract text
text_nodes <- page %>%
html_nodes("article p, div[itemprop='articleBody'] p, div.article-body p, div.story-body p, main p, section p, p") %>%
html_text(trim = TRUE)
text_nodes <- text_nodes[nchar(text_nodes) > 0]
text <- paste(unique(text_nodes), collapse = " ")
message(" β ", length(text_nodes), " paragraphs, ", nchar(text), " chars")
if (nchar(text) < 400) {
message("β οΈ Too little text")
return(NA)
}
return(text)
}, error = function(e) {
message("β Error: ", e$message)
return(NA)
})
}
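# Illustrative one-off usage of the scraper (a sketch, not part of the
# pipeline; the URL below is a hypothetical placeholder). Wrapped in
# if (FALSE) so it never runs when the script is sourced.
if (FALSE) {
demo_browser <- ChromoteSession$new()
demo_text <- get_article_text_chromote(demo_browser,
  "https://example.com/some-article", wait_time = 2)
demo_browser$close()
}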
# ===============================
# Function to Fetch One RSS Window
# ===============================
get_rss_data <- function(start, end, query = "Your Search Request") {
rss_url <- paste0(
"https://news.google.com/rss/search?q=",
URLencode(query),
"+after:", start,
"+before:", end,
"&hl=en-US&gl=US&ceid=US:en"
)
xml <- tryCatch(read_xml(rss_url), error = function(e) NULL)
if (is.null(xml)) return(tibble())
tibble(
title = xml_text(xml_find_all(xml, "//item/title")),
link = xml_text(xml_find_all(xml, "//item/link")),
pubDate = xml_text(xml_find_all(xml, "//item/pubDate")),
range_start = start,
range_end = end
)
}
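# Example: fetch a single monthly window and inspect the columns (a sketch;
# the query is the same placeholder used in the function signature above).
if (FALSE) {
one_window <- get_rss_data(ymd("2024-01-01"), ymd("2024-01-31"),
  query = "Your Search Request")
glimpse(one_window)  # columns: title, link, pubDate, range_start, range_end
}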
# ===============================
# Save Text to PDF (Multi-page)
# ===============================
save_text_to_pdf <- function(title, date, link, article_text, pdf_path) {
tryCatch({
header <- paste0("π° ", title, "\n\nπ
", date, "\n\nπ ", link, "\n\n")
full_text <- paste0(header, article_text)
wrapped <- strwrap(full_text, width = 95)
pdf(pdf_path, width = 8.5, height = 11)
lines_per_page <- 50
num_pages <- ceiling(length(wrapped) / lines_per_page)
for (page in 1:num_pages) {
start_line <- (page - 1) * lines_per_page + 1
end_line <- min(page * lines_per_page, length(wrapped))
page_text <- wrapped[start_line:end_line]
plot.new()
par(mar = c(0, 0, 0, 0))
text(0.05, 0.95, paste(page_text, collapse = "\n"),
adj = c(0, 1), family = "mono", cex = 0.65)
}
dev.off()
return(TRUE)
}, error = function(e) {
message("β PDF failed: ", e$message)
return(FALSE)
})
}
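# Example: write dummy text to a temporary multi-page PDF to check the
# pagination (a sketch; all values below are made up).
if (FALSE) {
save_text_to_pdf(
  title = "Example headline",
  date = "Mon, 01 Jan 2024 12:00:00 GMT",
  link = "https://example.com/some-article",
  article_text = paste(rep("Lorem ipsum dolor sit amet.", 200), collapse = " "),
  pdf_path = file.path(tempdir(), "example_article.pdf")
)
}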
# ===============================
# Create Output Folder
# ===============================
dir.create("your_folder", showWarnings = FALSE)
# ===============================
# Crawl Loop (All Date Windows)
# ===============================
message("\n", paste(rep("=", 70), collapse = ""))
message("STEP 1: Fetching RSS feeds")
message(paste(rep("=", 70), collapse = ""))
news_data <- map2_df(start_dates, end_dates, function(s, e) {
message("\nποΈ ", s, " β ", e)
Sys.sleep(1) # Reduced from 2
get_rss_data(s, e)
})
message("\nπ Found ", nrow(news_data), " articles")
# Clean Google redirect links
news_data <- news_data %>%
mutate(real_link = map_chr(link, function(l) {
  if (str_detect(l, "google\\.com/url\\?q=")) {
    # Extract the q= parameter and decode it back to the original article URL
    URLdecode(str_extract(l, "(?<=q=)[^&]+"))
  } else {
    l
  }
})) %>%
distinct(real_link, .keep_all = TRUE)
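# Illustration of the redirect cleaning above (a sketch; the URL is a
# hypothetical example of the google.com/url?q= redirect format).
if (FALSE) {
redirect <- "https://www.google.com/url?q=https%3A%2F%2Fexample.com%2Fstory&sa=U"
URLdecode(str_extract(redirect, "(?<=q=)[^&]+"))  # "https://example.com/story"
}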
message("π Unique articles: ", nrow(news_data))
# ===============================
# Scrape Text + Save PDFs
# ===============================
message("\n", paste(rep("=", 70), collapse = ""))
message("STEP 2: Scraping with Chromote (FAST MODE)")
message(paste(rep("=", 70), collapse = ""))
# Initialize browser ONCE
message("π Starting browser...")
browser <- ChromoteSession$new()
success_count <- 0
fail_count <- 0
start_time <- Sys.time()
for (i in seq_len(nrow(news_data))) {
title <- news_data$title[i]
date <- news_data$pubDate[i]
link <- news_data$real_link[i]
message("\nπ [", i, "/", nrow(news_data), "] ", substr(title, 1, 60))
# Use the SAME browser session
article_text <- get_article_text_chromote(browser, link, wait_time = 2)
if (is.na(article_text)) {
fail_count <- fail_count + 1
next
}
file_title <- make.names(substr(title, 1, 60))
pdf_path <- file.path("your_folder", paste0(file_title, ".pdf"))
if (save_text_to_pdf(title, date, link, article_text, pdf_path)) {
message("β
Saved: ", basename(pdf_path))
success_count <- success_count + 1
} else {
fail_count <- fail_count + 1
}
Sys.sleep(0.5) # Reduced from 2 seconds
}
# Close browser at the end
browser$close()
end_time <- Sys.time()
elapsed <- round(as.numeric(difftime(end_time, start_time, units = "mins")), 2)
# ===============================
# πΎ Save Master CSV
# ===============================
write.csv(news_data, "your_csv.csv", row.names = FALSE)
message("\n", paste(rep("=", 70), collapse = ""))
message("π COMPLETE!")
message(" β
Success: ", success_count, " PDFs")
message(" β Failed: ", fail_count, " articles")
message(" β±οΈ Time: ", elapsed, " minutes")
message(" π Output: ny_article_pdfs/")
message(paste(rep("=", 70), collapse = ""))Web-crawler used to fetch sentiment Data Set
The Web-crawler: