Web crawler used to fetch the sentiment data set

The web crawler:

library(chromote)
library(rvest)
library(xml2)
library(dplyr)
library(purrr)
library(stringr)
library(lubridate)

# ===============================
# πŸ—“️ Define Date Ranges (Jan 2018 – Dec 2024, monthly windows)
# ===============================
start_dates <- seq(ymd("2018-01-01"), ymd("2024-12-31"), by = "1 month")
end_dates   <- c(start_dates[-1] - days(1), ymd("2024-12-31"))
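# This yields 84 monthly windows: 2018-01-01 → 2018-01-31, 2018-02-01 → 2018-02-28,
# ..., 2024-12-01 → 2024-12-31 (the final end date is pinned to 2024-12-31).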

# ===============================
# 🧠 FASTER Chromote Article Scraper (Reuses Browser)
# ===============================
get_article_text_chromote <- function(browser, url, wait_time = 2) {
  tryCatch({
    message("  πŸ“₯ Loading: ", substr(url, 1, 60), "...")
    
    # Navigate to URL
    browser$Page$navigate(url, wait_ = FALSE)
    Sys.sleep(wait_time)  # Reduced wait time
    
    # Get rendered HTML
    result <- browser$Runtime$evaluate("document.documentElement.outerHTML")
    page <- read_html(result$result$value)
    
    # Extract text
    text_nodes <- page %>%
      html_nodes("article p, div[itemprop='articleBody'] p, div.article-body p, 
                  div.story-body p, main p, section p, p") %>%
      html_text(trim = TRUE)
    
    text_nodes <- text_nodes[nchar(text_nodes) > 0]
    text <- paste(unique(text_nodes), collapse = " ")
    
    message("  β†’ ", length(text_nodes), " paragraphs, ", nchar(text), " chars")
    
    if (nchar(text) < 400) {
      message("⚠️ Too little text")
      return(NA)
    }
    
    return(text)
    
  }, error = function(e) {
    message("❌ Error: ", e$message)
    return(NA)
  })
}
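# Example usage on a single page (a standalone sketch, not run by the crawl loop
# below; the URL is a placeholder):
#   b   <- ChromoteSession$new()
#   txt <- get_article_text_chromote(b, "https://example.com/some-article", wait_time = 3)
#   b$close()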

# ===============================
# πŸ•·️ Function to Fetch One RSS Window
# ===============================
get_rss_data <- function(start, end, query = "Your Search Request") {
  rss_url <- paste0(
    "https://news.google.com/rss/search?q=",
    URLencode(query),
    "+after:", start,
    "+before:", end,
    "&hl=en-US&gl=US&ceid=US:en"
  )
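  # With the default query and a January 2024 window, the request URL looks
  # roughly like this (illustrative only):
  #   https://news.google.com/rss/search?q=Your%20Search%20Request+after:2024-01-01+before:2024-01-31&hl=en-US&gl=US&ceid=US:en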

  xml <- tryCatch(read_xml(rss_url), error = function(e) NULL)
  if (is.null(xml)) return(tibble())

  tibble(
    title = xml_text(xml_find_all(xml, "//item/title")),
    link  = xml_text(xml_find_all(xml, "//item/link")),
    pubDate = xml_text(xml_find_all(xml, "//item/pubDate")),
    range_start = start,
    range_end = end
  )
}
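# Example usage for a single window (a standalone sketch; the query string is a
# placeholder):
#   one_window <- get_rss_data(ymd("2024-01-01"), ymd("2024-01-31"), query = "Your Search Request")
#   glimpse(one_window)  # tibble with title, link, pubDate, range_start, range_end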

# ===============================
# πŸ’Ύ Save Text to PDF (Multi-page)
# ===============================
save_text_to_pdf <- function(title, date, link, article_text, pdf_path) {
  tryCatch({
    header <- paste0("πŸ“° ", title, "\n\nπŸ“… ", date, "\n\nπŸ”— ", link, "\n\n")
    full_text <- paste0(header, article_text)
    wrapped <- strwrap(full_text, width = 95)
    
    pdf(pdf_path, width = 8.5, height = 11)
    
    lines_per_page <- 50
    num_pages <- ceiling(length(wrapped) / lines_per_page)
    
    for (page in 1:num_pages) {
      start_line <- (page - 1) * lines_per_page + 1
      end_line <- min(page * lines_per_page, length(wrapped))
      page_text <- wrapped[start_line:end_line]
      
      par(mar = c(0, 0, 0, 0))  # set zero margins before opening the page
      plot.new()
      text(0.05, 0.95, paste(page_text, collapse = "\n"), 
           adj = c(0, 1), family = "mono", cex = 0.65)
    }
    
    dev.off()
    return(TRUE)
    
  }, error = function(e) {
    message("❌ PDF failed: ", e$message)
    return(FALSE)
  })
}
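# Example usage with placeholder arguments (a standalone sketch; writes to tempdir()):
#   save_text_to_pdf("Example headline", "Mon, 01 Jan 2024 00:00:00 GMT",
#                    "https://example.com/story", strrep("lorem ipsum ", 300),
#                    file.path(tempdir(), "example.pdf"))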

# ===============================
# πŸ“ Create Output Folder
# ===============================
dir.create("your_folder", showWarnings = FALSE)

# ===============================
# πŸ” Crawl Loop (All Date Windows)
# ===============================
message("\n", paste(rep("=", 70), collapse = ""))
message("STEP 1: Fetching RSS feeds")
message(paste(rep("=", 70), collapse = ""))

news_data <- map2_df(start_dates, end_dates, function(s, e) {
  message("\nπŸ—“οΈ ", s, " β†’ ", e)
  Sys.sleep(1)  # Reduced from 2
  get_rss_data(s, e)
})

message("\nπŸ“Š Found ", nrow(news_data), " articles")

# Clean Google redirect links: extract the target URL from the "url?q=" parameter
# and URL-decode it; links without a redirect are kept as-is
news_data <- news_data %>%
  mutate(real_link = map_chr(link, function(l) {
    if (str_detect(l, "google\\.com/url\\?q=")) {
      URLdecode(str_extract(l, "(?<=q=)[^&]+"))
    } else {
      l
    }
  })) %>%
  distinct(real_link, .keep_all = TRUE)
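# Illustrative example of what the cleaning step does (the URL below is made up):
#   https://www.google.com/url?q=https%3A%2F%2Fexample.com%2Fstory&sa=...
#   becomes https://example.com/story once the q= parameter is extracted and decoded.
# Items that link via news.google.com (no "url?q=" redirect) pass through unchanged.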

message("πŸ“Š Unique articles: ", nrow(news_data))

# ===============================
# 🧾 Scrape Text + Save PDFs
# ===============================
message("\n", paste(rep("=", 70), collapse = ""))
message("STEP 2: Scraping with Chromote (FAST MODE)")
message(paste(rep("=", 70), collapse = ""))

# Initialize browser ONCE
message("πŸš€ Starting browser...")
browser <- ChromoteSession$new()

success_count <- 0
fail_count <- 0
start_time <- Sys.time()

for (i in seq_len(nrow(news_data))) {
  title <- news_data$title[i]
  date <- news_data$pubDate[i]
  link <- news_data$real_link[i]
  
  message("\nπŸ”Ž [", i, "/", nrow(news_data), "] ", substr(title, 1, 60))
  
  # Use the SAME browser session
  article_text <- get_article_text_chromote(browser, link, wait_time = 2)
  
  if (is.na(article_text)) {
    fail_count <- fail_count + 1
    next
  }
  
  file_title <- make.names(substr(title, 1, 60))
  pdf_path <- file.path("your_folder", paste0(file_title, ".pdf"))
  
  if (save_text_to_pdf(title, date, link, article_text, pdf_path)) {
    message("βœ… Saved: ", basename(pdf_path))
    success_count <- success_count + 1
  } else {
    fail_count <- fail_count + 1
  }
  
  Sys.sleep(0.5)  # Reduced from 2 seconds
}

# Close browser at the end
browser$close()

end_time <- Sys.time()
elapsed <- round(as.numeric(difftime(end_time, start_time, units = "mins")), 2)

# ===============================
# πŸ’Ύ Save Master CSV
# ===============================
write.csv(news_data, "your_csv.csv", row.names = FALSE)

message("\n", paste(rep("=", 70), collapse = ""))
message("πŸŽ‰ COMPLETE!")
message("   βœ… Success: ", success_count, " PDFs")
message("   ❌ Failed: ", fail_count, " articles")
message("   ⏱️  Time: ", elapsed, " minutes")
message("   πŸ“ Output: ny_article_pdfs/")
message(paste(rep("=", 70), collapse = ""))