0
Je suis en train de gratter une seule page en utilisant Scrapy et Sélénium.
import time
import scrapy
from selenium import webdriver
class SampleSpider(scrapy.Spider):
    """Spider that renders a JavaScript-heavy page with Selenium, then parses it.

    Bug fixed: the original ``parse()`` navigated the Selenium browser to the
    URL but then extracted links from ``response`` — Scrapy's *raw* HTTP
    response, which never contains the JavaScript-generated DOM.  That is why
    it only captured ``javascript:WebForm_DoPostBack...`` hrefs and blank
    text.  The links must be extracted from ``self.driver.page_source``
    (the rendered DOM) instead.
    """

    name = "sample"
    start_urls = ['url-to-scrape']

    def __init__(self):
        # Call the base initializer so Scrapy can set up the spider properly.
        super().__init__()
        self.driver = webdriver.Chrome()

    def parse(self, response):
        """Render the page in a real browser and yield one item per <a> tag.

        Yields dicts with the href(s) and text fragment(s) of each anchor
        found in the *rendered* DOM.
        """
        # Load the page in the real browser so its JavaScript executes.
        self.driver.get(response.url)
        # Crude fixed wait for the JS to finish.  For production, prefer
        # selenium's WebDriverWait with an expected condition — it returns
        # as soon as the content is ready instead of always blocking 30 s.
        time.sleep(30)
        # THE FIX: build a selector over the Selenium-rendered page source,
        # not over the raw `response` body.
        rendered = scrapy.Selector(text=self.driver.page_source)
        for page in rendered.css('a'):
            yield {
                'url-href': page.xpath('@href').extract(),
                'url-text': page.css('::text').extract(),
            }
        # Release the browser once the single page has been scraped.
        self.driver.quit()
L'araignée ne parvient pas à capturer les balises rendues et produit les sorties suivantes :
{"url-text": [" "], "url-href": ["javascript:WebForm_DoPostBackWithOptions(new WebForm_PostBackOptions(\"ctl00$PlaceHolderMain$ctl01$ctl00$ctl01\", \"\", true, \"\", \"\", false, true))"]},
{"url-text": [" "], "url-href": ["javascript:WebForm_DoPostBackWithOptions(new WebForm_PostBackOptions(\"ctl00$PlaceHolderMain$ctl01$ctl00$ctl02\", \"\", true, \"\", \"\", false, true))"]},
{"url-text": [" "], "url-href": ["javascript:WebForm_DoPostBackWithOptions(new WebForm_PostBackOptions(\"ctl00$PlaceHolderMain$ctl01$ctl00$ctl03\", \"\", true, \"\", \"\", false, true))"]}
Des idées ?