
I'm trying to get the image from the detail page of a website. I'm using the feed entries' 'links' attribute to get the links. This is my code; when I try to scrape the detail page for the image I get an error.

@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1]  # shows what I can pull

    def make_soup(url):
        def swappo():
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '

            agent_list = [user_one, user_two, user_thr, user_for]
            a = random.choice(agent_list)
            return a

        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }
        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('a').get('src')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    try:
        live_entries = [{'href': live_leak.links[0]['href']} for live_leak in live_leaks]
        o = make_soup(live_entries)
    except IndexError:
        print('error check logs')
        live_entries = []

    return print(o)

but when I run it I get this error:

[2016-10-07 21:10:58,019: ERROR/MainProcess] Task blog.tasks.pan_task[f43ed360-c06e-4a4b-95ab-4f44a4564afa] raised unexpected: InvalidSchema("No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]'",) 
Traceback (most recent call last): 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 240, in trace_task 
    R = retval = fun(*args, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/celery/app/trace.py", line 438, in __protected_call__ 
    return self.run(*args, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 134, in pan_task 
    o = make_soup(live_entries) 
    File "/Users/ray/Desktop/myheroku/practice/src/blog/tasks.py", line 124, in make_soup 
    the_comments_page = requests.get(url, headers=headers) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 67, in get 
    return request('get', url, params=params, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/api.py", line 53, in request 
    return session.request(method=method, url=url, **kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 468, in request 
    resp = self.send(prep, **send_kwargs) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 570, in send 
    adapter = self.get_adapter(url=request.url) 
    File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/requests/sessions.py", line 644, in get_adapter 
    raise InvalidSchema("No connection adapters were found for '%s'" % url) 
requests.exceptions.InvalidSchema: No connection adapters were found for '[{'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/AA1uAIpygjQ/us-apple-samsung-elec-appeal-idUSKCN1271LF'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/Nz28cqiuS0Y/us-google-pixel-advertising-idUSKCN12721U'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/POLoFj22hc4/us-yahoo-nsa-order-idUSKCN12800D'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/eF-XlhlQl-s/us-fcc-dataservices-idUSKCN1271RB'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/hNf9IQ3rXjw/us-autonomous-nauto-idUSKCN1271FX'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/NXkk5WfWVhM/us-sony-sensors-idUSKCN1270EC'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/gdBvoarqQro/us-yahoo-discrimination-lawsuit-idUSKCN12800K'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/nt8K--27bDg/us-thomsonreuters-ceo-idUSKCN1271DQ'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/f8z3eQg2Fpo/us-snapchat-ipo-idUSKCN12627S'}, {'href': 'http://feeds.reuters.com/~r/reuters/technologyNews/~3/rr4vdLsC11Y/us-samsung-elec-results-idUSKCN1262NO'}]' 

Why doesn't this work? I use this function in another program.


`requests.get` takes a single URL, but you're passing it a list of dictionaries. – miah


I've used this function in a similar program. The only difference is that it doesn't use feedparser to get the URL. How can I make it work? – losee
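
A minimal sketch of what the comment above is pointing at (not from the original thread; it reuses the make_soup and live_entries names from the question): fetch each href string individually instead of handing requests.get the whole list of dicts.

    # Illustration only: requests.get expects a single URL string,
    # so call make_soup once per href rather than passing the list of dicts.
    soups = [make_soup(entry['href']) for entry in live_entries]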

Answer


You need to do something like this:

@app.task
def pan_task():
    url = 'http://feeds.example.com/reuters/technologyNews'
    name = 'noticiassin'
    live_leaks = [i for i in feedparser.parse(url).entries][:10]
    the_count = len(live_leaks)
    ky = feedparser.parse(url).keys()
    oky = [i.keys() for i in feedparser.parse(url).entries][1]  # shows what I can pull

    def make_soup(url):
        def swappo():
            user_one = ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" '
            user_two = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" '
            user_thr = ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" '
            user_for = ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" '

            agent_list = [user_one, user_two, user_thr, user_for]
            a = random.choice(agent_list)
            return a

        headers = {
            "user-agent": swappo(),
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "accept-encoding": "gzip,deflate,sdch",
            "accept-language": "en-US,en;q=0.8",
        }
        the_comments_page = requests.get(url, headers=headers)
        soupdata = BeautifulSoup(the_comments_page.text, 'html5lib')
        # comment = soupdata.find('div')
        # para = comment.find_all('p')
        # kids = [child.text for child in para]
        # blu = str(kids).strip('[]')
        return soupdata

    live_entries = []
    try:
        for live_leak in live_leaks:
            live_entries.append(make_soup(live_leak.links[0]['href']))
            # Do whatever you need to do with each soup here
    except IndexError:
        print('error check logs')
        live_entries = []
    return live_entries

That just returns an empty list [] – losee


What do you want it to return? – miah


I want it to grab the src from the detail page so I can use it – losee
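
A minimal sketch of that last step (not from the thread; it assumes the detail pages expose the article image through an og:image meta tag or a plain <img> tag, which would need to be confirmed against the real markup):

    def first_image_src(soupdata):
        # Hypothetical selectors: prefer the og:image meta tag, fall back
        # to the first <img> on the page; adjust to the actual page markup.
        og = soupdata.find('meta', property='og:image')
        if og and og.get('content'):
            return og['content']
        img = soupdata.find('img')
        return img.get('src') if img else None

    # Applied to the list of soups returned by the answer's pan_task:
    image_urls = [first_image_src(soup) for soup in live_entries]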