這禮拜也有同網站的內容要爬(https://heavenlyfood.cn/books/index.php?id=4000) ,其主要結構與上星期的篇章雷同,因此沿用上星期的code,只是在最後抓取文章文件的時候,有遇到一些問題,如下圖:
有爬蟲經驗的人應該可以看出,它的文章主要存放在 div#c 這個節點(node)底下。我在確認整頁結構後,便在R中用html_nodes去抓這些節點,但再經由文字提取的函數,卻抓不到任何文字。
後來發現,文章的文字內容並不在文章的這個連結內,而是頁面結構先載入,之後內容才進行加載。這一般稱為delay-load(延遲載入)的問題,主要指我們想爬取的內容並非第一時間就在網頁結構上,這會讓一般的爬蟲code失效。原先想要用python的套件來處理這個問題,後來觀察它network的情況後,發現了文章載入的連結,如下圖:
可以發現它的文章結構是在頁面開啟後接近4000毫秒才加載,因此我直接改成抓取這個連結內部的文字,便解決了這個網頁內容爬取的問題。
最終存儲的結果:
下面附上這結構的code:
import packages
if (!require(httr))install.packages("httr")
library(httr)
if (!require(rvest))install.packages("rvest")
library(rvest)
if (!require(ropencc))devtools::install_github("qinwf/ropencc")
library(ropencc)
# Build the ropencc converter used for every scraped string below. The S2TWP
# configuration is, per the original note, a Simplified -> Traditional Chinese
# conversion; it is applied later through run_convert(trans, ...).
trans <- converter(S2TWP)
# Browser-like User-Agent string sent with every request (passed to
# user_agent() when sessions are opened) instead of the library default.
uastring <- "Mozilla/5.0 (Macintosh Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
Contents processing
library(stringr)
# Clean up the text of a scraped article.
#
# txt: character vector holding the (already converted) article text.
# Returns a character vector of the same length with punctuation artefacts
# removed and line breaks normalised.
#
# The steps run in a fixed order because later patterns act on the output of
# earlier ones.
stringReplace <- function(txt) {
  # Punctuation + run of dots + digits + non-breaking space: keep the
  # punctuation, then move the number onto a new line of its own.
  txt <- str_replace_all(txt, "([。:、,])[.]+([0-9]+)\U00A0", "\\1\n \\2 ")
  # The multiplication sign is turned into a line break.
  txt <- str_replace_all(txt, "×", "\n")

  # Ordered pattern -> replacement pairs: doubled/conflicting punctuation,
  # remaining non-breaking spaces, and one character normalisation.
  fixes <- list(
    c("。,", "。"),
    c("。。", "。"),
    c("。:", "。:"),
    c(";。", ";"),
    c(",:", ","),
    c("\U00A0", " "),
    c("\U53C4", "叄")
  )
  for (fix in fixes) {
    txt <- gsub(fix[1], fix[2], txt)
  }

  # Any run of newlines becomes exactly one blank line between paragraphs.
  str_replace_all(txt, "\n+", "\n\n")
}
id頁抓取標題& link
# ---- id page: collect titles and links, write the two index files ----
# Numeric identifier of the collection and the URL of its landing page.
id <- 4000
view_link <- paste0("https://heavenlyfood.cn/books/index.php?id=", id)
id_link <- read_html(html_session(view_link, user_agent(uastring)))

# Every anchor in the menu is one sub-section; query the document once and
# derive titles, links and the count from that single node set.
id1 <- html_nodes(id_link, "div#menu a")
url <- id1
sub_title_list <- html_text(id1)
sub_title <- run_convert(trans, sub_title_list)
msg_num <- length(sub_title_list)
sub_links <- paste0("https://heavenlyfood.cn/", html_attr(id1, "href"))

# Pieces of the heading: fixed tag, the id taken back out of the URL, the
# converted page title, and the number of menu entries.
w1 <- "TOC"
w4 <- strsplit(view_link, split = "id=", fixed = TRUE)[[1]][2]
w5 <- run_convert(trans, html_text(html_nodes(id_link, "div#toptitle")))
w6 <- paste0(msg_num, "篇")
id_name <- paste(w4, w5, w6)
w <- paste(w1, id_name)
ex <- paste("說明", id_name)
bar <- "=========="

# Create the output folder for this id and make it the working directory;
# id_path is the absolute path that later code returns to.
folder <- paste0("./", id_name)
dir.create(folder)
id_path <- paste0(getwd(), "/", id_name)
setwd(id_path)

# Index file: heading, separator, source URL, sub-section titles, separators.
id_page <- c(w, bar, view_link, sub_title, bar, bar)
write.table(
  id_page,
  paste0(w, ".txt"),
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE,
  fileEncoding = "UTF-8"
)

# Description file: same layout under the "說明" heading.
ex_page <- c(ex, bar, view_link, sub_title, bar, bar)
write.table(
  ex_page,
  paste0(ex, ".txt"),
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE,
  fileEncoding = "UTF-8"
)
sub頁抓取標題&link&存msg
# Walk every sub-page listed on the id page. For each one: fetch the page,
# create a folder for it, write its index and description files, then download
# and save every article it lists.
#
# Reads from the surrounding script: sub_links, id_path, id, id_link, trans,
# uastring and stringReplace().
#
# seq_along() is used instead of 1:length() so that nothing runs (rather than
# iterating over c(1, 0)) when no links or titles were scraped.
for (ii in seq_along(sub_links)) {
  # Folder names below are relative, so start every pass from the id folder.
  setwd(id_path)
  sub_link <- sub_links[ii]
  link <- sub_link
  sub <- read_html(html_session(sub_link, user_agent(uastring)))

  # One div#title node per article; the count feeds the "N篇" label.
  url <- html_nodes(sub, "div#title")

  # Pieces of the branch heading.
  w1 <- "TOC"
  # Whatever follows "sub=" in the link (the original note gives 46 as an
  # example).
  w2 <- strsplit(link, split = "sub=", fixed = TRUE)[[1]][2]
  # Converted text of the sub-page's main heading link.
  w3 <- run_convert(trans, html_text(html_node(sub, "div#chap1 a#mainwhite")))
  # Converted title of the id page.
  w5 <- run_convert(trans, html_text(html_nodes(id_link, "div#toptitle")))
  # NOTE(review): a single "0" is prefixed for any count below 100, so widths
  # differ ("05篇" vs "045篇"). Kept as-is so existing folder names stay stable.
  w6 <- if (length(url) < 100) {
    paste0("0", length(url), "篇")
  } else {
    paste0(length(url), "篇")
  }
  w <- paste(w1, w2, w3, w5, w6)
  bar <- "=========="

  # Article titles.
  sub1 <- html_nodes(sub, "div#title")
  book_title_list <- html_text(sub1)
  book_title <- run_convert(trans, book_title_list)

  # Article page links. Not used for downloading: the text is fetched from
  # getContent.php below instead.
  sub2 <- html_nodes(sub, "div#title div a.content.link")
  urls <- paste0("https://heavenlyfood.cn/books/", html_attr(sub2, "href"))

  # Create the folder for this sub-page and switch into it.
  folder <- paste0("./", paste(w2, w3, w5, w6))
  dir.create(folder)
  path <- folder
  setwd(path)

  # Index file for the sub-page.
  sub_page <- c(w, bar, link, book_title, bar, bar)
  write.table(
    sub_page,
    paste0(w, ".txt"),
    row.names = FALSE,
    col.names = FALSE,
    quote = FALSE,
    fileEncoding = "UTF-8"
  )

  # Description file for the sub-page.
  ex <- paste("說明", w2, w3, w5, w6)
  ex_page <- c(ex, bar, link, book_title, bar, bar)
  write.table(
    ex_page,
    paste0(ex, ".txt"),
    row.names = FALSE,
    col.names = FALSE,
    quote = FALSE,
    fileEncoding = "UTF-8"
  )

  # Three-digit, zero-padded sub number. The previous if/else-if chain had no
  # branch for exactly 10 or for values >= 100, leaving the variable stale or
  # undefined; sprintf() covers every value.
  w22 <- sprintf("%03d", as.integer(w2))

  # Download and save every article of this sub-page.
  for (i in seq_along(book_title_list)) {
    # The article body is loaded separately from the page shell, so request
    # the content endpoint directly.
    msg_link <- paste0(
      "https://heavenlyfood.cn/books/getContent.php?id=", id,
      "&sub=", w2,
      "&message=", i,
      "&contentData=Spritualbooks&assist=&study=1&collect=&q="
    )
    msg1 <- read_html(html_session(msg_link, user_agent(uastring)))
    cont1 <- run_convert(trans, c(html_text(html_nodes(msg1, "div.modal-body"))))
    cont1 <- stringReplace(cont1)

    # Three-digit, zero-padded article number (same fix as w22: article 10 and
    # articles >= 100 previously reused the preceding article's number).
    wi <- sprintf("%03d", i)

    msg_w <- paste(w22, w3, wi, book_title[i])
    write.table(
      c(msg_w, "", link, "", bar, cont1, bar),
      paste0(msg_w, ".txt"),
      fileEncoding = "UTF-8",
      row.names = FALSE,
      col.names = FALSE,
      quote = FALSE
    )
  }
}