# 记录一下第一次用 R 语言爬取 LetPub 期刊信息的代码,以后还是尽量用 Python 吧。
library(XML)
library(RSelenium)
library(rvest)
library(stringr)
library(data.table)
library(magrittr)
library(xlsx)
#' Scrape the LetPub journal listing into one table
#'
#' Starts a local Selenium standalone server, opens a PhantomJS session,
#' reads the number of result pages from the first listing page, then visits
#' every listing page and stacks the per-page journal tables.
#'
#' @param delay Seconds to wait between two page requests. Defaults to 10,
#'   the pause used by the original script.
#' @return A data.table with the columns ISSN, Journal, IF2021, Division,
#'   Category, Discipline, Is_SCI, OA, Employment, Refereeing, View,
#'   Abbr_Name and Url; one row per journal.
letpub <- function(delay = 10) {
  # Keep strings as character vectors, but give the caller their own
  # setting back when the function exits (normally or on error).
  old_opts <- options(stringsAsFactors = FALSE)
  on.exit(options(old_opts), add = TRUE)

  # Launch the Selenium server in the background.
  # NOTE(review): the java process is not stopped by this function.
  system(
    "java -jar E:/study/python/selenium-server-standalone-2.50.1.jar",
    wait = FALSE, invisible = TRUE, minimized = TRUE
  )

  # Configure the browser
  eCap <- list(
    phantomjs.page.settings.userAgent =
      "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0"
  )
  # remDr <- remoteDriver(browserName = "chrome", extraCapabilities = eCap)
  remDr <- remoteDriver(browserName = "phantomjs", extraCapabilities = eCap)
  remDr$open()
  # Always release the browser session, even if a page fails to parse.
  on.exit(try(remDr$close(), silent = TRUE), add = TRUE)

  # Navigate to `url` and return the rendered page as a parsed XML document.
  fetch_page <- function(url) {
    remDr$navigate(url)
    remDr$getPageSource()[[1]] %>%
      read_html(encoding = "UTF-8") %>%
      htmlParse(encoding = "UTF-8")
  }

  # Keep the elements at odd positions (1st, 3rd, ...).
  odd_items <- function(x) x[seq_along(x) %% 2 == 1]

  # The query string must contain "&currentpage="; the earlier "¤tpage"
  # was "&curren" decoded as an HTML entity, which dropped the parameter.
  url_prefix <- paste0(
    "https://www.letpub.com.cn/index.php?page=journalapp",
    "&view=researchfield&fieldtag=&firstletter=&currentpage="
  )
  url_suffix <- "#journallisttable"

  # The first number found in the jump-page form is taken as the page count.
  first_page <- fetch_page(paste0(url_prefix, 1L, url_suffix))
  form_text <- xpathSApply(first_page, "//form[@name='jumppageform']", xmlValue)
  n_pages <- as.numeric(str_extract(as.character(form_text), "\\d+"))[1]
  if (is.na(n_pages)) {
    stop("Could not read the number of pages from the LetPub listing page.",
         call. = FALSE)
  }

  # Integer page numbers avoid scientific notation ("1e+05") in the URLs.
  page_urls <- paste0(url_prefix, seq_len(n_pages), url_suffix)
  n_total <- length(page_urls)

  table_cols <- c("ISSN", "Journal", "IF2021", "Division", "Category",
                  "Discipline", "Is_SCI", "OA", "Employment", "Refereeing",
                  "View")
  link_xpath <- "//table[@class='table_yjfx']/tbody/tr/td/a[@target='_blank']"

  # Collect one table per page and bind once at the end instead of growing
  # the result with rbind() inside the loop.
  pages <- vector("list", n_total)
  for (i in seq_len(n_total)) {
    page <- fetch_page(page_urls[i])

    # Second 'table_yjfx' table holds the listing. Its first row supplies
    # the column labels; the first and last rows and column 11 are dropped.
    page_tbl <- xpathSApply(page, "//table[@class='table_yjfx']")[[2]] %>%
      readHTMLTable(header = TRUE) %>%
      set_colnames(.[1, ]) %>%
      data.table() %>%
      .[c(-1, -.N), -11]
    setnames(page_tbl, table_cols)

    # The raw Journal cell carries more than the full name; the full name
    # itself is read from the link text (every other matching anchor).
    raw_journal <- page_tbl$Journal
    full_names <- odd_items(xpathSApply(page, link_xpath, xmlValue))
    page_tbl$Journal <- full_names
    # fixed(): journal names are literal text, not regular expressions.
    page_tbl$Abbr_Name <- str_remove(raw_journal, fixed(full_names))
    # Strip only a leading "./" before building the absolute URL.
    page_tbl$Url <- xpathSApply(page, paste0(link_xpath, "/@href")) %>%
      str_remove("^\\./") %>%
      paste0("https://www.letpub.com.cn/", .) %>%
      odd_items()
    page_tbl[Is_SCI == "SCISCIE", Is_SCI := "SCI/SCIE"]

    pages[[i]] <- page_tbl
    cat(sprintf("Summary page, [%d] pages has been crawled, a total of %d, [%.4f%%] completed", i, n_total, i / n_total * 100), "\n")

    # Throttle requests; no need to wait after the final page.
    if (i < n_total) {
      Sys.sleep(delay)
    }
  }

  rbindlist(pages, use.names = TRUE)
}
# Run the crawl and save the combined journal table as CSV.
dataset <- letpub()
# FALSE rather than F: `F` is an ordinary variable that can be reassigned.
write.csv(dataset, 'E:/dataset2.csv', row.names = FALSE)
# PS:这个方法太耗时间,而且比 Python 还复杂,就当作一次学习吧。