2011-11-06 1 views
1

Afin d'obtenir les données dont j'ai besoin d'un site Web en utilisant scrapy, je dois d'abord créer un objet de réponse afin que je puisse utiliser le HtmlXpathSelector sur lui. HtmlXpathSelector n'accepte pas une chaîne d'URL comme argument. Dans l'exemple de code suivant, la variable "response2" est vide car je ne sais pas comment le faire.comment instancier un scrapy.http.Response de chaîne d'URL?

from scrapy.selector import HtmlXPathSelector 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 
from scrapy.contrib.spiders import CrawlSpider, Rule 
from tarantula.items import OlgaItem 

class OlgaSpider(CrawlSpider): 
    """This crawler gets the physician's name and his homepage url.""" 

    name = 'Olga' 
    DOWNLOAD_DELAY = 6 #para tentar evitar ser banido 
    #ROBOTSTXT_OBEY = True 
    #CONCURRENT_REQUESTS = 1 
    FEED_URI = '/home/mercutio22/gitcode/MedicWebsites.csv' 
    FEED_FORMAT = 'csv' 
    USER_AGENT = "Googlebot/2.1 (http://www.google.com/bot.html)" 
    #allowed_domains = ['guiareunimedicos.med.br'] 
    start_urls = (
    'http://medial-saude.guiareunimedicos.med.br/index.pl?act=searc\ 
    h&_id_=172&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0\ 
    &q=oncologia#results/', 
    'http://www.guiareunimedicos.med.br/index.pl?act=search&_id_=17\ 
    #2&_ev_=Submit&_formSearchSubmit=%3Adefault%3A&type=0&country=0&q=cancer\ 
    #ologia#results/') 
    rules = (
    Rule(SgmlLinkExtractor(allow=r"V=", restrict_xpaths='//a[text()=">"]'), 
    callback='parse_item', follow=True), 
    ) 

    def parse_item(self, response): 

     hxs = HtmlXPathSelector(response) 
     mdata = hxs.select('//div[contains(@class, "mdata")]') 
     links = mdata.select('./a/@href').extract() 
     names = mdata.select('./a/text()').extract() 

     items = [] 
     for index in range(len(names)): 
      i = OlgaItem() 
      i['name'] = names[index] 
      i['link'] = links[index] 
      response2 = 
      hxs2 = HtmlXPathSelector(response2) ### 
      name = hxs2.select('//big/text()').extract() 
      i['clinics'] = hxs2.select('//h2/a/text()').extract() 
      data = hxs2.select('//div[contains(@class, "stab data")]') 
      addresses = [ x.select('./p/text()').extract() for x in data ] 
      addresses = [ ''.join(x) for x in addresses ] 
      addresses = [ x.replace('Telefone(s): \r\n\r\n\r\n', '') for x in addresses ] 
      addresses = [ x[2:] for x in addresses ] 
      i['addresses'] = addresses 
      i['phones'] = hxs2.select('//span[@id]/text()').extract() 

      items.append(i) 
     return items 

Répondre

3

Vous pouvez également créer HtmlXPathSelector en donnant à quelques-unes html:

hxs = HtmlXPathSelector(text= '<div>blah-blah</div>') 
Questions connexes