# Scrape a chain of blog posts (Blogger-style markup): for each page, extract
# the title, publication date, and body text, write them to a per-post CSV,
# and follow the "older post" link to the next page.
#
# Requires: rvest (read_html, html_nodes, html_attr, html_text) and a pipe
# (%>%, via magrittr) — load with library(rvest) before sourcing.

links <- "某网站链接"  # seed URL (placeholder: "some website link")

# Scrape one post page at URL `d`; write its CSV and return a one-row
# data frame with columns Title, Date, d, Post, web1 (next-page URL, NA if
# there is no older post).
GrabWebpage <- function(d) {
  # Parse the page. NOTE: the encoding name is "UTF-8" — the original
  # "UTF8" is not a valid IANA name and libxml2 may ignore it.
  web <- read_html(d, encoding = "UTF-8")

  # Next (older) post URL: locate the <a> whose class is
  # "blog-pager-older-link", then pick the href at the same position.
  a_class <- web %>% html_nodes("a") %>% html_attr("class")
  idx_next <- match("blog-pager-older-link", a_class)
  a_href <- web %>% html_nodes("a") %>% html_attr("href")
  next_url <- a_href[idx_next]  # NA when no older post exists

  # Publication date: text of <h2 class="date-header">.
  h2_class <- web %>% html_nodes("h2") %>% html_attr("class")
  idx_date <- match("date-header", h2_class)
  h2_text <- web %>% html_nodes("h2") %>% html_text()
  post_date <- h2_text[idx_date]

  # Title: text of <h3 class="post-title entry-title">.
  h3_class <- web %>% html_nodes("h3") %>% html_attr("class")
  idx_title <- match("post-title entry-title", h3_class)
  h3_text <- web %>% html_nodes("h3") %>% html_text()
  post_title <- h3_text[idx_title]

  # Body: text of <div class="post-body entry-content">.
  div_text <- web %>% html_nodes("div") %>% html_text()
  div_class <- web %>% html_nodes("div") %>% html_attr("class")
  idx_body <- match("post-body entry-content", div_class)
  post_body <- paste(as.character(div_text[idx_body]), collapse = ",,")

  # Build the record directly as a data frame (the original went through
  # cbind(), which coerces via a character matrix first).
  DF <- data.frame(
    Title = as.character(post_title),
    Date  = as.character(post_date),
    d     = d,
    Post  = post_body,
    web1  = next_url,
    stringsAsFactors = FALSE
  )

  # File name: first 8 characters of the title with newlines stripped
  # (titles scraped from <h3> often carry "\n").
  stem <- gsub(pattern = "\n", replacement = "",
               substr(as.character(DF$Title), start = 1, stop = 8))
  write.csv(DF, paste0("D:/", stem, ".csv"))

  Sys.sleep(0.5)  # throttle: be polite to the server

  # Kept for backward compatibility: the original exposed each record in
  # the global environment under the (raw) title text.
  assign(post_title, DF, envir = .GlobalEnv)
  invisible(DF)  # explicit return; the original relied on assign()'s value
}

# Walk the older-post chain, at most 1000 pages. Fixes two defects of the
# original driver: (1) the first page was fetched twice (once before the
# loop with the result discarded, then again inside it); (2) a missing
# next link (NA) was never checked, so the crawl crashed at the last post
# instead of stopping cleanly.
case <- 1
while (case < 1000 && !is.na(links) && nzchar(links)) {
  DF <- GrabWebpage(links)
  links <- as.character(DF$web1)
  case <- case + 1
}