J'utilise RSelenium pour extraire (« scraper ») les données du site Web suivant : http://plovila.pomorstvo.hr/
Accélérer le web scraping en utilisant plusieurs navigateurs avec RSelenium
À chaque fois, je dois saisir une valeur dans le champ « NIB », lancer la recherche et extraire toutes les données. J'utilise plusieurs fois la fonction Sys.sleep(), de sorte que mon code est lent (environ 12 secondes pour un NIB). Je dois extraire environ 200 000 numéros NIB, ce qui représente environ 30 jours de scraping.
Je voudrais savoir si je peux ouvrir plusieurs navigateurs, localement ou d'une manière ou d'une autre dans le cloud, afin de rendre mon script de scraping plus rapide.
Est-il possible d'utiliser le calcul parallèle pour résoudre ce problème ? Avez-vous des suggestions ?
EDIT: J'ajoute le code:
library(XML)
library(RCurl)
library(RSelenium)
library(png)
library(imager)
library(RMySQL)
library(htmltab)
library(jsonlite)
library(rvest)
# Poll the page until an element matching `xpath_check` can be read, instead
# of relying on a fixed Sys.sleep().
#
# Uses the `remDr` remote driver found in the calling environment chain (it is
# not passed as an argument).
#
# Arguments:
#   xpath_check  XPath of the element whose presence signals that the page has
#                loaded.
#   iterations   Number of retries after the first attempt; the element is
#                looked up at most `iterations + 1` times.
#   interval     Seconds to sleep after each failed attempt (default 1).
#
# Returns (invisibly) TRUE if the element was found, FALSE if every attempt
# failed.
#
# NOTE(review): the default XPath selects `td[2]` under an <input> element,
# while the scraping code reads the same id through `//*[@id=...]/td[n]`.
# If that id belongs to a table row, the default never matches and the call
# always waits for the full timeout -- confirm against the page.
waitLoad <- function(xpath_check = "//input[@id = 'ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]",
                     iterations = 5,
                     interval = 1L) {
  attempts <- 0
  found <- FALSE
  while (!found && attempts <= iterations) {
    # NULL marks a failed lookup. Comparing against a sentinel string would
    # misfire if the element's own text happened to equal that string.
    element_text <- tryCatch(
      remDr$findElement(using = "xpath", xpath_check)$getElementText(),
      error = function(e) NULL
    )
    if (is.null(element_text)) {
      message("Trazi dalje")
      Sys.sleep(interval)
      attempts <- attempts + 1
    } else {
      found <- TRUE
    }
  }
  invisible(found)
}
# The Selenium server has to be running before connecting, e.g. via Docker:
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.0
remDr <- remoteDriver(
  browserName = "chrome",
  remoteServerAddr = "192.168.99.100",
  port = 4445L
)
remDr$open()
# Load the search form, click the option with value '1' in the object-type
# drop-down, then display a screenshot of the page.
remDr$navigate("http://plovila.pomorstvo.hr/")
remDr$findElement(
  using = "xpath",
  value = "//select[@id = 'ctl00_Content_FormContent_uiTipObjektaDropDown']/option[@value = '1']"
)$clickElement()
remDr$screenshot(display = TRUE)
# Scrape ----
# Result containers, indexed by the position of the NIB within pocetak:kraj.
df <- list()
Porivni_uredjaji <- list()
Clanovi_posade <- list()
Vlasnici <- list()
Korisnici <- list()
df_2 <- list()
Tereti <- list()
# First and last NIB of the range to scrape.
pocetak <- 100000
kraj <- 100003

# XPaths that are reused on every iteration.
nib_box_xpath <- "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']"
result_row_xpath <- "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']"

# Read the `value` attribute of one <input> on the detail page and return it
# as a one-column data frame named `col_name`.
#
#   doc          parsed HTML document of the detail page
#   id_suffix    last part of the input's id, after
#                "ctl00_Content_FormContent_uiDetaljiPlovilaControl_"
#   col_name     column name of the returned data frame
#   swap_quotes  replace double quotes in the value with single quotes
#
# The value is taken as-is rather than being pasted into a JSON string and
# parsed back: that round-trip fails with a parse error as soon as a value
# contains a double quote or a backslash. A missing attribute yields "".
detail_field <- function(doc, id_suffix, col_name, swap_quotes = FALSE) {
  value <- xpathSApply(
    doc = doc,
    path = paste0(
      "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_",
      id_suffix, "']/@value"
    )
  )
  value <- as.character(unlist(value, use.names = FALSE))
  if (swap_quotes) {
    value <- gsub("\"", "'", value)
  }
  if (length(value) == 0L) {
    value <- ""
  }
  # The detail page is parsed with encoding = "UTF-8"; declare it on the text.
  Encoding(value) <- "UTF-8"
  out <- data.frame(value, stringsAsFactors = FALSE)
  colnames(out) <- col_name
  out
}

# Bind table number `index` of the detail page to its NIB. When that fails
# (e.g. the table is absent) the message "Error <NIB>" is printed and stored
# in place of the data frame.
detail_table <- function(tables, index, NIB) {
  tryCatch(
    as.data.frame(cbind(tables[[index]], NIB), stringsAsFactors = FALSE),
    error = function(e) print(paste0("Error ", NIB))
  )
}

# NOTE(review): the fixed Sys.sleep() calls below are kept unchanged so the
# timing against the site stays the same; several run after the page source
# has already been fetched and may be removable -- confirm before dropping.
system.time(
  for (i in pocetak:kraj) {
    # Submit the current NIB through the search box.
    remDr$findElement(using = "xpath", nib_box_xpath)$clearElement()
    Sys.sleep(1L)
    remDr$findElement(using = "xpath", nib_box_xpath)$sendKeysToElement(
      list(as.character(i), key = "enter")
    )
    waitLoad()
    remDr$screenshot(display = TRUE)
    doc <- htmlParse(remDr$getPageSource()[[1]])
    Sys.sleep(1L)

    # Cells of the first row of the result grid.
    Ime <- xpathSApply(doc = doc, path = paste0(result_row_xpath, "/td[1]"), fun = xmlValue)
    Oznaka <- xpathSApply(doc = doc, path = paste0(result_row_xpath, "/td[2]"), fun = xmlValue)
    NIB <- xpathSApply(doc = doc, path = paste0(result_row_xpath, "/td[3]"), fun = xmlValue)
    Vlasnik <- xpathSApply(doc = doc, path = paste0(result_row_xpath, "/td[4]"), fun = xmlValue)
    LK_LI <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[5]", fun = xmlValue)
    br1 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[6]", fun = xmlValue)
    br2 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[7]", fun = xmlValue)

    # Slot of this NIB in the result lists.
    x <- i - pocetak + 1

    if (length(NIB) == 0) {
      # No result row for this NIB: store a short record; the post-processing
      # step replaces records with fewer than 13 columns by an NA row.
      Pozivni_znak <- NA
      df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Pozivni_znak)
      df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)
    } else {
      # Open the detail page of the vessel and parse it.
      remDr$findElement(using = "xpath", "//input[@title = 'Detalji']")$clickElement()
      waitLoad("//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']", 5)
      doc <- htmlParse(remDr$getPageSource()[[1]], encoding = "UTF-8")
      Sys.sleep(1L)
      list_a <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/fieldset/h3[1]", fun = xmlValue)

      # Only read the fields when the first <h3> of the fieldset is present.
      if (length(list_a) >= 1) {
        Namjena <- detail_field(doc, "uiNamjenaText", "Namjena")
        Vrsta_plovila <- detail_field(doc, "uiVrstaPlovilaText", "Vrsta_plovila")
        Model_plovila <- detail_field(doc, "uiModelPlovilaText", "Model_plovila")
        Duljina_trupa <- detail_field(doc, "uiDuljinaTrupaText", "Duljina_trupa")
        Sirina_trupa <- detail_field(doc, "uiSirinaText", "Sirina_trupa")
        Visina_trupa <- detail_field(doc, "uiVisinaText", "Visina_trupa")
        Gaz <- detail_field(doc, "uiGazText", "Gaz")
        Nosivost <- detail_field(doc, "uiNosivostText", "Nosivost")
        GT <- detail_field(doc, "uiGtText", "GT")
        Snaga_motora <- detail_field(doc, "uiUkupnaSnagaText", "Snaga_motora")
        Brodogradiliste <- detail_field(doc, "uiBrodogradilisteText", "Brodogradiliste", swap_quotes = TRUE)
        Godina_gradnje <- detail_field(doc, "uiGodGradnjeText", "Godina_gradnje")
        Materijal <- detail_field(doc, "uiMaterijalGradnjeText", "Materijal")
        Najveci_broj_osoba <- detail_field(doc, "uiMaxBrojOsobaText", "Najveci_broj_osoba")
        Najveci_broj_putnika <- detail_field(doc, "uiMaxBrojPutnikaText", "Najveci_broj_putnika")
        Najmanji_broj_posade <- detail_field(doc, "uiMinBrojPosade", "Najmanji_broj_posade")
        Prethodna_oznaka <- detail_field(doc, "uiPrethodnaOznakaText", "Prethodna_oznaka")
        Prethodna_luka <- detail_field(doc, "uiPrethodnaLukaUpisaText", "Prethodna_luka", swap_quotes = TRUE)
        # The column label is kept byte-for-byte because the post-processing
        # step builds its placeholder rows with the same label.
        Prethodna_drzava <- detail_field(doc, "uiPrethodnaDrzavaUpisaText", "Prethodna_drĹľava")

        df[[x]] <- cbind(
          Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Namjena, Vrsta_plovila,
          Model_plovila, Duljina_trupa, Sirina_trupa, Visina_trupa, Gaz, Nosivost, GT,
          Snaga_motora, Brodogradiliste, Godina_gradnje, Materijal, Najveci_broj_osoba,
          Najveci_broj_putnika, Najmanji_broj_posade, Prethodna_oznaka,
          Prethodna_luka, Prethodna_drzava
        )
        df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)

        # Tables 2 to 5 of the detail page, each tagged with the NIB.
        df_2 <- readHTMLTable(doc)
        Sys.sleep(2L)
        Porivni_uredjaji[[x]] <- detail_table(df_2, 2, NIB)
        Clanovi_posade[[x]] <- detail_table(df_2, 3, NIB)
        Vlasnici[[x]] <- detail_table(df_2, 4, NIB)
        Korisnici[[x]] <- detail_table(df_2, 5, NIB)
        Tereti[[x]] <- cbind(
          remDr$findElement(
            using = "xpath",
            "//*/span[@id='ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiTeretiLabel']"
          )$getElementText(),
          NIB
        )
      }
    }
  }
)
# Post-process the scraped records ----
# Column layout of a complete record.
detail_columns <- c(
  "Ime", "Oznaka", "NIB", "Vlasnik", "LK_LI", "br1", "br2", "Namjena",
  "Vrsta_plovila", "Model_plovila", "Duljina_trupa", "Sirina_trupa", "Visina_trupa",
  "Gaz", "Nosivost", "GT", "Snaga_motora", "Brodogradiliste", "Godina_gradnje",
  "Materijal", "Najveci_broj_osoba", "Najveci_broj_putnika", "Najmanji_broj_posade",
  "Prethodna_oznaka", "Prethodna_luka", "Prethodna_drĹľava"
)
# Replace every short or missing record (fewer than 13 columns) with a single
# all-NA row in the full layout so that the records can be row-bound.
# seq_along() keeps the loop from running at all when `df` is empty, whereas
# 1:length(df) would iterate over c(1, 0) and fail on df[[1]].
for (i in seq_along(df)) {
  if (length(df[[i]]) < 13) {
    placeholder <- as.data.frame(
      matrix(data = NA, nrow = 1, ncol = length(detail_columns))
    )
    colnames(placeholder) <- detail_columns
    df[[i]] <- placeholder
  }
}
# Stack all records, then drop the placeholder rows (those without a NIB).
df_final <- do.call(rbind, df)
df_final_1 <- df_final[!is.na(df_final$NIB), ]
EDIT 2 : J'ai un problème avec le code que vous avez publié ci-dessus. Si je lance :
# Start one worker per core while leaving one core free. detectCores() may
# return NA, and on a single-core machine the subtraction gives 0; in both
# cases fall back to one worker so that makeCluster() does not fail.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
registerDoParallel(cl)
# open a remoteDriver for each node on the cluster
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3
# NOTE(review): every worker opens its session against the same server
# (192.168.99.100:4445). If that single container cannot serve several
# sessions at once, this may be what raises the UnknownError -- confirm, and
# consider one container/port per worker.
clusterEvalQ(cl, {
  library(RSelenium)
  remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome")
  remDr$open()
})
myTitles <- c()
# Visit every URL on whichever worker picks it up and collect the page titles.
# seq_along() avoids iterating over c(1, 0) when `urls` is empty.
ws <- foreach(
  x = seq_along(urls),
  .packages = c("rvest", "magrittr", "RSelenium", "jsonlite", "htmltab", "XML", "RCurl")
) %dopar% {
  remDr$navigate(urls[x])
  Sys.sleep(3L)
  remDr$getTitle()[[1]]
}
# Release the browser sessions and the worker processes once the work is done.
clusterEvalQ(cl, {
  remDr$close()
})
stopCluster(cl)
il renvoie l'erreur suivante :
Error in { : task 1 failed - " Summary: UnknownError
Detail: An unknown server-side error occurred while processing the command.
Further Details: run errorDetails method"
La solution la plus simple serait de découper votre entrée en lots et d'exécuter 8 scripts en parallèle, chacun dans un terminal différent. Cela implique moins de modifications du code. Et n'oubliez pas que ce sont des navigateurs avec interface graphique : au-delà d'un certain nombre, les performances de votre système commenceront à se dégrader –
@Tarun Lalwani Que voulez-vous dire par « découper l'entrée en lots » ? J'ai utilisé les packages doParallel et foreach pour paralléliser une boucle for. Mais je ne sais pas comment faire cela avec RSelenium, ni même si c'est possible. – Mislav
Publiez votre code ; il est impossible de commenter sans le voir –