2017-08-16 1 views
0

J'essaie d'explorer une liste paginée de Catalogfonctionne bien.Scrapy comment ramper pagination dans le deuxième niveau ou pagination imbriquée

Mais pour chaqueCatalog il y a une liste paginé deDataSet mais seule la première page là-bas apparaît dans le résultat. J'essaye d'obtenir un résultat qui ressemble à ceci, mais tous les noeuds 24 devraient être là correspondant à 24 DataSet couvrant des pages de 6 articles chacun.

[{'data_sets_count': 24, 
    'description': 'The catalog contains data regarding various indicators of ' 
       'HMIS like Health, Abortions, Immunisation, AEFI, Adolescent, ' 
       'Bite, Sting, Disease, Diarrhoeal, Hypertension, HIV, AIDS, ' 
       'Malaria, Neurological, Stroke, Fever, Respiratory, ' 
       'Infection, suicide, Trauma, Accident, Burn, Tuberculosis, ' 
       'VHND, ASHA, JSY, CHC, PHC, SDH, DH, Hospital.', 
    'last_updated': '11/08/17', 
    'ministry_department': 'Ministry of Health and Family Welfare, Department of ' 
         'Health and Family Welfare', 
    'nodes': [{'node': '3183861', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'April-2014-15'}, 
      {'node': '3183881', 
      'title': 'Item-wise report for North Goa of Goa upto May-2014-15'}, 
      {'node': '3183981', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'October-2014-15'}, 
      {'node': '3184021', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'December-2014-15'}, 
      {'node': '3184061', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'February-2014-15'}, 
      {'node': '3183961', 
      'title': 'Item-wise report for North Goa of Goa upto ' 
         'September-2014-15'}], 
    'state_department': None, 
    'title': 'HMIS sub district level item-wise monthly report of Goa', 
    'url': '/catalog/hmis-sub-district-level-item-wise-monthly-report-goa'}] 

import scrapy 
class Category(scrapy.Item): 
    title = scrapy.Field() 
    url = scrapy.Field() 
    ministry_department = scrapy.Field() 
    description = scrapy.Field() 
    state_department = scrapy.Field() 
    last_updated = scrapy.Field() 
    data_sets_count = scrapy.Field() 
    data_sets = scrapy.Field() 
    item = scrapy.Field() 
    nodes = scrapy.Field() 

class CatalogSpider(scrapy.Spider): 
    name = 'catalogspider' 
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1'] 

    def parse(self, response): 
     for catalog in response.css('.view-catalogs > div > .views-row-6'): 
      category = Category() 
      category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first() 
      category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first() 
      category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first() 
      category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first() 
      category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first() 
      category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first() 
      category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0]) 
      category['nodes'] = [] 
      request = scrapy.Request(response.urljoin(category['url']), callback=self.parseDataSets) 
      request.meta['item'] = category 
      yield request 

     for next_page in response.css('li.pager-next > a'): 
      yield response.follow(next_page, self.parse) 


    def parseDataSets(self, response): 
     item = response.meta['item'] 

     for dataset in response.css('.view-resource-detail-popup > div > .views-row'): 
      item['nodes'].append({ 
       'node' : dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0], 
       'title' : dataset.css('.views-field-title .field-content .title-content::text').extract_first() 
       }) 

     for next_page in response.css('li.pager-next'): 
      print('here') 
      request = scrapy.Request(response.urljoin(next_page.css('a::attr(href)').extract_first()), callback=self.parseDataSets) 
      request.meta['item'] = item 

     yield item 
+0

Pouvez-vous poster journal d'analyse? Vous pouvez le faire via 'scrapy crawl spider --logfile output.log' ou' scrapy crawl spider 2> 1 | Commandes tee output.log' (la dernière met la sortie à l'écran et au fichier). – Granitosaurus

+0

@Granitosaurus Je viens de le faire travailler avec quelques changements de code, je vais poster le code de travail maintenant, mais je ne sais pas si c'est la bonne façon de le faire. – sabithpocker

+0

J'ajoute à une méta-variable, les éléments de chaque page enfant et donne zéro, à la fin donne la méta-variable quand c'est la dernière page. Cela semble un peu hacky, mais ça marche maintenant. – sabithpocker

Répondre

0

Je l'ai travail en utilisant le code ci-dessous, je ne suis pas sûr qu'il est ce la bonne façon de le faire. J'ajoute DataSet à une méta-variable category, et donne None, à la fin donne la méta-variable category quand c'est la dernière page. Cela semble un peu hacky, mais ça marche maintenant.

import scrapy 
class Category(scrapy.Item): 
    title = scrapy.Field() 
    url = scrapy.Field() 
    ministry_department = scrapy.Field() 
    description = scrapy.Field() 
    state_department = scrapy.Field() 
    last_updated = scrapy.Field() 
    data_sets_count = scrapy.Field() 
    data_sets_actual_count = scrapy.Field() 
    data_sets = scrapy.Field() 
    item = scrapy.Field() 
    nodes = scrapy.Field() 

class CatalogSpider(scrapy.Spider): 
    name = 'catalogspider' 
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1'] 

    def parse(self, response): 
     for catalog in response.css('.view-catalogs > div > .views-row-6'): 
      category = Category() 
      category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first() 
      category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first() 
      category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first() 
      category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first() 
      category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first() 
      category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first() 
      category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0]) 
      category['nodes'] = [] 
      request = scrapy.Request(response.urljoin(category['url']), callback=self.parse_data_sets) 
      request.meta['category'] = category 
      yield request 

     #for next_page in response.css('li.pager-next > a'): 
     # yield response.follow(next_page, self.parse) 


    def parse_data_sets(self, response): 
     category = response.meta['category'] 
     datasets = response.css('.view-resource-detail-popup > div > .views-row') 
     if datasets: 
      for dataset in datasets: 
       node = dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0] 
       title = dataset.css('.views-field-title .field-content .title-content::text').extract_first() 
       url = 'https://data.gov.in/node/' + node + '/download' 
       category['nodes'].append({ 
        'node' : node, 
        'title' : title, 
        'url' : url 
        }) 
       yield None 
     else: 
      yield category 

     if len(response.css('li.pager-next').extract()) == 0: 
      category['data_sets_actual_count'] = len(category['nodes']) 
      yield category 

     #pagination 
     for next_page in response.css('li.pager-next'): 
      request = scrapy.Request(response.urljoin(next_page.css('a::attr(href)').extract_first()), callback=self.parse_data_sets) 
      request.meta['category'] = category 
      yield request 

Un de mon problème a été mal réglé profondeur dans ma commande, qui je l'ai changé pour un plus grand nombre plus tard, des questions aléatoires lorsque dans des domaines inconnus:

scrapy parse --spider=catalogspider -d 60 'https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1'