2017-09-21 4 views
0

J'utilise Rselenium à la ferraille site Web suivant: http://plovila.pomorstvo.hr/Accélérez grattage web en utilisant les navigateurs MULTIPLIE de Rselenium

Chaque fois que je dois entrer dans le champ « NIB », exécuter et débris de toutes les données. J'utilise plusieurs fois la fonction Sys.time() pour que mon code soit lent (environ 12 secondes pour une NIB). Je dois mettre au rebut environ 200 000 numéros NIB ce qui donne 30 jours de raclage.

Je suis intéressé si je peux ouvrir plusieurs navigateurs localement ou d'une façon ou d'une autre dans le nuage et rendre mon script de grattage plus rapide.

Est-il possible d'utiliser l'informatique parallèle pour résoudre ce problème? Avez-vous des suggestions?

EDIT: J'ajoute le code:

library(XML) 
library(RCurl) 
library(RSelenium) 
library(png) 
library(imager) 
library(RMySQL) 
library(htmltab) 
library(jsonlite) 
library(rvest) 

# function for waiting instead Sys.sleep() 
waitLoad <- function (xpath_check = "//input[@id = 'ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]", 
         iterations = 5){ 
    counter <- 0 
    chk <- FALSE 
    while(!chk & counter <= iterations){ 
    wait <- tryCatch(
     remDr$findElement(using = "xpath", 
         xpath_check)$getElementText(), 
     # remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_Img1']")$clearElement(), 
     error = function(e) print(paste0("Trazi dalje")) 
    ) 
    if(wait == "Trazi dalje"){ 
     Sys.sleep(1L) 
     counter <- sum(counter, 1) 
    }else{ 
     chk <- TRUE 
    } 
    } 
} 

# Start Selenium Server 
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.0 
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome") 
remDr$open() 

# Simulate browser session and fill out form 
remDr$navigate("http://plovila.pomorstvo.hr/") 
remDr$findElement(using = "xpath", "//select[@id = 'ctl00_Content_FormContent_uiTipObjektaDropDown']/option[@value = '1']")$clickElement() 
remDr$screenshot(display = TRUE) 

# Scrap ! 
df <- list() 
Porivni_uredjaji <- list() 
Clanovi_posade <- list() 
Vlasnici <- list() 
Korisnici <- list() 
df_2 <- list() 
Tereti <- list() 
pocetak <- 100000 
kraj <- 100003 
system.time(
for (i in pocetak:kraj){ 
    remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$clearElement() 
    Sys.sleep(1L) 
    remDr$findElement(using = "xpath", 
        "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$sendKeysToElement(list(as.character(i), 
                             key = "enter")) 
    waitLoad() 
    remDr$screenshot(display = TRUE) 
    doc <- htmlParse(remDr$getPageSource()[[1]]) 
    Sys.sleep(1L) 
    Ime <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[1]", fun = xmlValue) 
    Oznaka <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]", fun = xmlValue) 
    NIB <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[3]", fun = xmlValue) 
    Vlasnik <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[4]", fun = xmlValue) 
    LK_LI <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[5]", fun = xmlValue) 
    br1 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[6]", fun = xmlValue) 
    br2 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[7]", fun = xmlValue) 
    x <- i-pocetak + 1 
    if (length(NIB)==0){ 
    Pozivni_znak <- NA 
    df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Pozivni_znak) 
    df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE) 
    }else{ 
    remDr$findElement(using = "xpath", "//input[@title = 'Detalji']")$clickElement() 
    waitLoad("//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']", 5) 
    doc <- htmlParse(remDr$getPageSource()[[1]], encoding = "UTF-8") 
    Sys.sleep(1L) 
    list_a <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/fieldset/h3[1]", fun = xmlValue) 
    if (length(list_a) >= 1){ 

     Namjena <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']/@value") 
     json <- paste0("[", '"', Namjena, '"', "]") 
     Namjena <- fromJSON(json) 
     Namjena <- as.data.frame(Namjena, stringsAsFactors = FALSE) 
     colnames(Namjena) <- "Namjena" 
     Vrsta_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVrstaPlovilaText']/@value") 
     json <- paste0("[", '"', Vrsta_plovila, '"', "]") 
     Vrsta_plovila <- fromJSON(json) 
     Vrsta_plovila <- as.data.frame(Vrsta_plovila, stringsAsFactors = FALSE) 
     colnames(Vrsta_plovila) <- "Vrsta_plovila" 
     Model_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiModelPlovilaText']/@value") 
     json <- paste0("[", '"', Model_plovila, '"', "]") 
     Model_plovila <- fromJSON(json) 
     Model_plovila <- as.data.frame(Model_plovila, stringsAsFactors = FALSE) 
     colnames(Model_plovila) <- "Model_plovila" 
     Duljina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiDuljinaTrupaText']/@value") 
     json <- paste0("[", '"', Duljina_trupa, '"', "]") 
     Duljina_trupa <- fromJSON(json) 
     Duljina_trupa <- as.data.frame(Duljina_trupa, stringsAsFactors = FALSE) 
     colnames(Duljina_trupa) <- "Duljina_trupa" 
     Sirina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiSirinaText']/@value") 
     json <- paste0("[", '"', Sirina_trupa, '"', "]") 
     Sirina_trupa <- fromJSON(json) 
     Sirina_trupa <- as.data.frame(Sirina_trupa, stringsAsFactors = FALSE) 
     colnames(Sirina_trupa) <- "Sirina_trupa" 
     Visina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVisinaText']/@value") 
     json <- paste0("[", '"', Visina_trupa, '"', "]") 
     Visina_trupa <- fromJSON(json) 
     Visina_trupa <- as.data.frame(Visina_trupa, stringsAsFactors = FALSE) 
     colnames(Visina_trupa) <- "Visina_trupa" 
     Gaz <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGazText']/@value") 
     json <- paste0("[", '"', Gaz, '"', "]") 
     Gaz <- fromJSON(json) 
     Gaz <- as.data.frame(Gaz, stringsAsFactors = FALSE) 
     colnames(Gaz) <- "Gaz" 
     Nosivost <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNosivostText']/@value") 
     json <- paste0("[", '"', Nosivost, '"', "]") 
     Nosivost <- fromJSON(json) 
     Nosivost <- as.data.frame(Nosivost, stringsAsFactors = FALSE) 
     colnames(Nosivost) <- "Nosivost" 
     GT <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGtText']/@value") 
     json <- paste0("[", '"', GT, '"', "]") 
     GT <- fromJSON(json) 
     GT <- as.data.frame(GT, stringsAsFactors = FALSE) 
     colnames(GT) <- "GT" 
     Snaga_motora <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiUkupnaSnagaText']/@value") 
     json <- paste0("[", '"', Snaga_motora, '"', "]") 
     Snaga_motora <- fromJSON(json) 
     Snaga_motora <- as.data.frame(Snaga_motora, stringsAsFactors = FALSE) 
     colnames(Snaga_motora) <- "Snaga_motora" 
     Brodogradiliste <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiBrodogradilisteText']/@value") 
     Brodogradiliste <- gsub("\"", "'", Brodogradiliste) 
     json <- paste0("[", '"', Brodogradiliste, '"', "]") 
     Brodogradiliste <- fromJSON(json) 
     Brodogradiliste <- as.data.frame(Brodogradiliste, stringsAsFactors = FALSE) 
     Encoding(Brodogradiliste[,c(1)]) <- "UTF-8" 
     colnames(Brodogradiliste) <- "Brodogradiliste" 
     Godina_gradnje <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGodGradnjeText']/@value") 
     json <- paste0("[", '"', Godina_gradnje, '"', "]") 
     Godina_gradnje <- fromJSON(json) 
     Godina_gradnje <- as.data.frame(Godina_gradnje, stringsAsFactors = FALSE) 
     colnames(Godina_gradnje) <- "Godina_gradnje" 
     Materijal <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaterijalGradnjeText']/@value") 
     json <- paste0("[", '"', Materijal, '"', "]") 
     Materijal <- fromJSON(json) 
     Materijal <- as.data.frame(Materijal, stringsAsFactors = FALSE) 
     colnames(Materijal) <- "Materijal" 
     Najveci_broj_osoba <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojOsobaText']/@value") 
     json <- paste0("[", '"', Najveci_broj_osoba, '"', "]") 
     Najveci_broj_osoba <- fromJSON(json) 
     Najveci_broj_osoba <- as.data.frame(Najveci_broj_osoba, stringsAsFactors = FALSE) 
     colnames(Najveci_broj_osoba) <- "Najveci_broj_osoba" 
     Najveci_broj_putnika <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojPutnikaText']/@value") 
     json <- paste0("[", '"', Najveci_broj_putnika, '"', "]") 
     Najveci_broj_putnika <- fromJSON(json) 
     Najveci_broj_putnika <- as.data.frame(Najveci_broj_putnika, stringsAsFactors = FALSE) 
     colnames(Najveci_broj_putnika) <- "Najveci_broj_putnika" 
     Najmanji_broj_posade <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMinBrojPosade']/@value") 
     json <- paste0("[", '"', Najmanji_broj_posade, '"', "]") 
     Najmanji_broj_posade <- fromJSON(json) 
     Najmanji_broj_posade <- as.data.frame(Najmanji_broj_posade, stringsAsFactors = FALSE) 
     colnames(Najmanji_broj_posade) <- "Najmanji_broj_posade" 
     Prethodna_oznaka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaOznakaText']/@value") 
     json <- paste0("[", '"', Prethodna_oznaka, '"', "]") 
     Prethodna_oznaka <- fromJSON(json) 
     Prethodna_oznaka <- as.data.frame(Prethodna_oznaka, stringsAsFactors = FALSE) 
     colnames(Prethodna_oznaka) <- "Prethodna_oznaka" 
     Prethodna_luka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaLukaUpisaText']/@value") 
     Prethodna_luka <- gsub("\"", "'", Prethodna_luka) 
     json <- paste0("[", '"', Prethodna_luka, '"', "]") 
     Prethodna_luka <- fromJSON(json) 
     Prethodna_luka <- as.data.frame(Prethodna_luka, stringsAsFactors = FALSE) 
     colnames(Prethodna_luka) <- "Prethodna_luka" 
     Prethodna_drĹľava <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaDrzavaUpisaText']/@value") 
     json <- paste0("[", '"', Prethodna_drĹľava, '"', "]") 
     Prethodna_drĹľava <- fromJSON(json) 
     Prethodna_drĹľava <- as.data.frame(Prethodna_drĹľava, stringsAsFactors = FALSE) 
     colnames(Prethodna_drĹľava) <- "Prethodna_drĹľava" 

     df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Namjena, Vrsta_plovila, 
         Model_plovila, Duljina_trupa, Sirina_trupa, Visina_trupa, Gaz, Nosivost, GT, 
         Snaga_motora, Brodogradiliste, Godina_gradnje, Materijal, Najveci_broj_osoba, 
         Najveci_broj_putnika, Najmanji_broj_posade, Prethodna_oznaka, 
         Prethodna_luka, Prethodna_drĹľava) 
     df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE) 

     df_2 <- readHTMLTable(doc) 
     Sys.sleep(2L) 

     Porivni_uredjaji[[x]] <- tryCatch(as.data.frame(cbind(df_2[[2]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB))) 
     Clanovi_posade[[x]] <- tryCatch(as.data.frame(cbind(df_2[[3]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB))) 
     Vlasnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[4]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB))) 
     Korisnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[5]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB))) 
     Tereti[[x]] <- cbind(remDr$findElement(using = "xpath", "//*/span[@id='ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiTeretiLabel']")$getElementText(), NIB) 
    }} 
} 
) 

# manipulate data after scraping 
for (i in 1:length(df)){ 
    if (length(df[[i]]) < 13){ 
    df[[i]] <- matrix(data = rep(NA, 26), nrow = 1, ncol = 26) 
    df[[i]] <- as.data.frame(df[[i]]) 
    colnames(df[[i]]) <- c("Ime", "Oznaka", "NIB", "Vlasnik", "LK_LI", "br1", "br2","Namjena", 
          "Vrsta_plovila", "Model_plovila", "Duljina_trupa", "Sirina_trupa", "Visina_trupa", 
          "Gaz", "Nosivost", "GT", "Snaga_motora", "Brodogradiliste", "Godina_gradnje", 
          "Materijal", "Najveci_broj_osoba", "Najveci_broj_putnika", "Najmanji_broj_posade", 
          "Prethodna_oznaka", "Prethodna_luka", "Prethodna_drĹľava") 
    } 
} 

df_final <- do.call(rbind, df) 
df_final_1 <- df_final[!is.na(df_final$NIB), ] 

EDIT 2: J'ai un problème avec le code ci-dessus vous avez publié. Si je lance:

(cl <- (detectCores() - 1) %>% makeCluster) %>% registerDoParallel 
# open a remoteDriver for each node on the cluster 
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3 
clusterEvalQ(cl, { 
    library(RSelenium) 
    remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome") 
    remDr$open() 
}) 
myTitles <- c() 
ws <- foreach(x = 1:length(urls), 
       .packages = c("rvest", "magrittr", "RSelenium", "jsonlite", "htmltab", "XML", "RCurl")) %dopar% { 
    remDr$navigate(urls[x]) 
    Sys.sleep(3L) 
    remDr$getTitle()[[1]] 
       } 

il renvoie une erreur

Error in { : task 1 failed - " Summary: UnknownError 
    Detail: An unknown server-side error occurred while processing the command. 
    Further Details: run errorDetails method" 
+0

La solution la plus simple serait de personnaliser ou de votre casse votre entrée en lots et exécuter 8 scripts parallèles dans un terminal différent. Cela signifie moins de changements de code. Et rappelez-vous ces navigateurs basés sur l'interface utilisateur, donc vous allez au-delà d'un nombre spécifique va commencer à détériorer les performances de votre système –

+0

@tarun Lalwani Que voulez-vous dire par l'entrée de pause dans les baches? J'ai utilisé oparallel et foreach package pour parallèle y calculer pour la boucle. Mais je ne suis pas sûr de savoir comment faire ça avec Rselenium. Ou est-ce même possible. – Mislav

+0

Postez votre code, ne pouvez pas commenter comme ceci –

Répondre

0

Peut-être un problème avec chrome: 3.5.0 image docker. Le suivant fonctionne pour moi sur gagner 10 avec docker toolbox:

library(RSelenium) 
library(rvest) 
library(magrittr) 
library(foreach) 
library(doParallel) 

# using docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3 
# in windows 
URLsPar <- c("https://stackoverflow.com/", "https://github.com/", 
      "http://www.bbc.com/", "http://www.google.com", 
      "https://www.r-project.org/", "https://cran.r-project.org", 
      "https://twitter.com/", "https://www.facebook.com/") 

appHTML <- c() 

(cl <- (detectCores() - 1) %>% makeCluster) %>% registerDoParallel 
# open a remoteDriver for each node on the cluster 
clusterEvalQ(cl, { 
    library(RSelenium) 
    remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, 
         browserName = "chrome") 
    remDr$open() 
}) 
ws <- foreach(x = 1:length(URLsPar), 
       .packages = c("rvest", "magrittr", "RSelenium")) %dopar% { 
       print(URLsPar[x]) 
       remDr$navigate(URLsPar[x]) 
       remDr$getTitle()[[1]] 
       } 
> ws 
[[1]] 
[1] "Stack Overflow - Where Developers Learn, Share, & Build Careers" 

[[2]] 
[1] "The world's leading software development platform · GitHub" 

[[3]] 
[1] "BBC - Homepage" 

[[4]] 
[1] "Google" 

[[5]] 
[1] "R: The R Project for Statistical Computing" 

[[6]] 
[1] "The Comprehensive R Archive Network" 

[[7]] 
[1] "Twitter. It's what's happening." 

[[8]] 
[1] "Facebook - Log In or Sign Up"  


# close browser on each node 
clusterEvalQ(cl, { 
    remDr$close() 
}) 

stopImplicitCluster() 
+0

mêmes URL dans le vecteur 'URLsPar'. Peut-être que c'est le problème. aussi, j'ai le pilote 3.5.3., pas 3.5.0. Si je cours le code de la manière habituelle, il woks parfaitement (remoteDriver sans paralell) – Mislav

+0

J'ai changé le nom 'urls' instad du nom de variable' URLsPar 'en premier lieu. Maintenant, j'ai changé pour 'URLsPar' et il donne la sortie. Pourquoi le nom de variable est-il important? Il donne ce qui suit: [1] "Stack Overflow - Où les développeurs apprennent, partagent, et construisent des carrières" et pour [2] et [3] "chrome inaccessible \ n (Information de session: chrome = 61.0.3163.91) \ n (Informations sur le pilote: chromedriver = 2.32.498513 (2c63aa53b2c658de596ed550eb5267ec5967b351), platform = Linux 4.4.86-boot2docker x86_64) "' La question est maintenant pourquoi le chrome n'est pas accessible. – Mislav

+1

J'ai changé l'exemple à 8 URL. Je ne vois pas le problème que vous avez. Il est peut-être lié à https://github.com/SeleniumHQ/docker-selenium/issues/547 et https: // github.com/SeleniumHQ/docker-selenium/issues/554 – jdharrison