# Scrape a chain of blog posts (Blogger-style markup): for each page, extract
# the title, publication date, and body text, write them to a per-post CSV,
# and follow the "older post" link to the next page.
#
# Requires: rvest (read_html, html_nodes, html_attr, html_text) and a pipe
# (%>%, via magrittr) — load with library(rvest) before sourcing.

links <- "某网站链接"  # seed URL (placeholder: "some website link")

# Scrape one post page at URL `d`; write its CSV and return a one-row
# data frame with columns Title, Date, d, Post, web1 (next-page URL, NA if
# there is no older post).
GrabWebpage <- function(d) {
  # Parse the page. NOTE: the encoding name is "UTF-8" — the original
  # "UTF8" is not a valid IANA name and libxml2 may ignore it.
  web <- read_html(d, encoding = "UTF-8")

  # Next (older) post URL: locate the <a> whose class is
  # "blog-pager-older-link", then pick the href at the same position.
  a_class <- web %>% html_nodes("a") %>% html_attr("class")
  idx_next <- match("blog-pager-older-link", a_class)
  a_href <- web %>% html_nodes("a") %>% html_attr("href")
  next_url <- a_href[idx_next]  # NA when no older post exists

  # Publication date: text of <h2 class="date-header">.
  h2_class <- web %>% html_nodes("h2") %>% html_attr("class")
  idx_date <- match("date-header", h2_class)
  h2_text <- web %>% html_nodes("h2") %>% html_text()
  post_date <- h2_text[idx_date]

  # Title: text of <h3 class="post-title entry-title">.
  h3_class <- web %>% html_nodes("h3") %>% html_attr("class")
  idx_title <- match("post-title entry-title", h3_class)
  h3_text <- web %>% html_nodes("h3") %>% html_text()
  post_title <- h3_text[idx_title]

  # Body: text of <div class="post-body entry-content">.
  div_text <- web %>% html_nodes("div") %>% html_text()
  div_class <- web %>% html_nodes("div") %>% html_attr("class")
  idx_body <- match("post-body entry-content", div_class)
  post_body <- paste(as.character(div_text[idx_body]), collapse = ",,")

  # Build the record directly as a data frame (the original went through
  # cbind(), which coerces via a character matrix first).
  DF <- data.frame(
    Title = as.character(post_title),
    Date  = as.character(post_date),
    d     = d,
    Post  = post_body,
    web1  = next_url,
    stringsAsFactors = FALSE
  )

  # File name: first 8 characters of the title with newlines stripped
  # (titles scraped from <h3> often carry "\n").
  stem <- gsub(pattern = "\n", replacement = "",
               substr(as.character(DF$Title), start = 1, stop = 8))
  write.csv(DF, paste0("D:/", stem, ".csv"))

  Sys.sleep(0.5)  # throttle: be polite to the server

  # Kept for backward compatibility: the original exposed each record in
  # the global environment under the (raw) title text.
  assign(post_title, DF, envir = .GlobalEnv)
  invisible(DF)  # explicit return; the original relied on assign()'s value
}

# Walk the older-post chain, at most 1000 pages. Fixes two defects of the
# original driver: (1) the first page was fetched twice (once before the
# loop with the result discarded, then again inside it); (2) a missing
# next link (NA) was never checked, so the crawl crashed at the last post
# instead of stopping cleanly.
case <- 1
while (case < 1000 && !is.na(links) && nzchar(links)) {
  DF <- GrabWebpage(links)
  links <- as.character(DF$web1)
  case <- case + 1
}