A record of the code from my first attempt at scraping journal information from Letpub with R. From now on I should really just use Python.

library(XML)
library(RSelenium)
library(rvest)
library(stringr)
library(data.table)
library(magrittr)
library(xlsx)

letpub <- function(){
  # Do not read strings as factors
  options(stringsAsFactors=F)
  # Launch the standalone Selenium server in the background
  system("java -jar E:/study/python/selenium-server-standalone-2.50.1.jar", wait=F, invisible=T, minimized=T)
  # Configure the browser
  eCap  <- list(phantomjs.page.settings.userAgent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0")
  # remDr <- remoteDriver(browserName="chrome", extraCapabilities=eCap)
  remDr <- remoteDriver(browserName="phantomjs", extraCapabilities=eCap)
  # Open and parse the initial page
  remDr$open()
  u <- 'https://www.letpub.com.cn/index.php?page=journalapp&view=researchfield&fieldtag=&firstletter=&currentpage=1#journallisttable'
  remDr$navigate(u)
  opage <- remDr$getPageSource()[[1]] %>% htmlParse(asText=TRUE, encoding='UTF-8')
  # Total number of result pages, read from the jump-to-page form
  nurl  <- xpathSApply(opage,"//form[@name='jumppageform']",xmlValue) %>% str_extract('\\d+') %>% as.numeric()
  nurl  <- 1:nurl
  aurl  <- paste0('https://www.letpub.com.cn/index.php?page=journalapp&view=researchfield&fieldtag=&firstletter=&currentpage=',nurl,'#journallisttable')
  tn <- 1
  an <- length(aurl)
  atabl1 <- NULL
  tablcol <- c('ISSN','Journal','IF2021','Division','Category','Discipline','Is_SCI','OA','Employment','Refereeing','View')
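  # For each results page: re-render it in the headless browser, read the second
  # 'table_yjfx' table into a data.table, then recover the full journal names and
  # detail-page URLs from the <a target='_blank'> links in that table (keeping
  # only the odd-numbered links). A 10-second pause keeps the crawl polite.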
  for(turl in aurl){
    remDr$navigate(turl)
    tpage <- remDr$getPageSource()[[1]] %>% htmlParse(asText=TRUE, encoding='UTF-8')
    ttabl <- xpathSApply(tpage,"//table[@class='table_yjfx']")[[2]] %>% readHTMLTable(header=T) %>% set_colnames(.[1,]) %>% data.table() %>% .[c(-1,-.N),-11]
    names(ttabl)  <- tablcol
    tabbr <- ttabl$Journal
    ttabl$Journal <- xpathSApply(tpage,"//table[@class='table_yjfx']/tbody/tr/td/a[@target='_blank']",xmlValue) %>% .[as.logical(1:length(.)%%2)]
    ttabl$Abbr_Name <- str_remove(tabbr,ttabl$Journal)
    ttabl$Url     <- xpathSApply(tpage,"//table[@class='table_yjfx']/tbody/tr/td/a[@target='_blank']/@href") %>% str_remove('./') %>% paste0('https://www.letpub.com.cn/',.) %>% .[as.logical(1:length(.)%%2)]
    ttabl[Is_SCI=='SCISCIE',Is_SCI:='SCI/SCIE']
    atabl1 <- rbind(atabl1,ttabl)
    cat(sprintf("Summary pages: [%d] of %d crawled, [%.4f%%] complete",tn,an,tn/an*100),'\n')
    tn <- tn+1
    Sys.sleep(10)
  }
  return(atabl1)
}


dataset <- letpub()
write.csv(dataset, 'E:/dataset2.csv', row.names=F)
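
Side note: PhantomJS is no longer maintained, so the same crawl can be driven with a headless Firefox instead. The sketch below is only an illustration under that assumption, not part of the original script; it relies on rsDriver() from current RSelenium releases starting the Selenium server by itself (no manual java -jar call), and the port number is arbitrary.

library(RSelenium)
library(XML)

# Start a Selenium server plus a headless Firefox session; rsDriver() manages
# the server binary itself. chromever=NULL skips the chromedriver check.
drv   <- rsDriver(browser="firefox", port=4545L, chromever=NULL, verbose=FALSE,
                  extraCapabilities=list("moz:firefoxOptions"=list(args=list("--headless"))))
remDr <- drv$client

# Same navigate-and-parse pattern as in letpub() above.
remDr$navigate('https://www.letpub.com.cn/index.php?page=journalapp&view=researchfield&fieldtag=&firstletter=&currentpage=1#journallisttable')
page  <- htmlParse(remDr$getPageSource()[[1]], asText=TRUE, encoding='UTF-8')

# Shut down the browser session and the Selenium server when finished.
remDr$close()
drv$server$stop()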

PS: This thing takes far too much time and is more convoluted than doing it in Python; I'll chalk it up as a learning exercise.
