Est-ce que vous passez par des codes officiels pycurl? Le code suivant la mise en œuvre des choses multiples et j'ai essayé d'exécuter et j'ai pu ramper autour 10.000 urls en 300 secondes en parallèle. Je pense que cela est exactement ce que vous voulez atteindre? me corriger, si je me trompe.
#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# vi:ts=4:et
# $Id: retriever-multi.py,v 1.29 2005/07/28 11:04:13 mfx Exp $
#
# Usage: python retriever-multi.py <file with URLs to fetch> [<# of
# concurrent connections>]
#
import sys
import pycurl
# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
import signal
from signal import SIGPIPE, SIG_IGN
signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
pass
# Get args
num_conn = 10
try:
if sys.argv[1] == "-":
urls = sys.stdin.readlines()
else:
urls = open(sys.argv[1]).readlines()
if len(sys.argv) >= 3:
num_conn = int(sys.argv[2])
except:
print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
raise SystemExit
# Make a queue with (url, filename) tuples
queue = []
for url in urls:
url = url.strip()
if not url or url[0] == "#":
continue
filename = "doc_%03d.dat" % (len(queue) + 1)
queue.append((url, filename))
# Check args
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
# Pre-allocate a list of curl objects
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
c = pycurl.Curl()
c.fp = None
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.setopt(pycurl.CONNECTTIMEOUT, 30)
c.setopt(pycurl.TIMEOUT, 300)
c.setopt(pycurl.NOSIGNAL, 1)
m.handles.append(c)
# Main loop
freelist = m.handles[:]
num_processed = 0
while num_processed < num_urls:
# If there is an url to process and a free curl object, add to multi stack
while queue and freelist:
url, filename = queue.pop(0)
c = freelist.pop()
c.fp = open(filename, "wb")
c.setopt(pycurl.URL, url)
c.setopt(pycurl.WRITEDATA, c.fp)
m.add_handle(c)
# store some info
c.filename = filename
c.url = url
# Run the internal curl state machine for the multi stack
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
# Check for curl objects which have terminated, and add them to the freelist
while 1:
num_q, ok_list, err_list = m.info_read()
for c in ok_list:
c.fp.close()
c.fp = None
m.remove_handle(c)
print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
freelist.append(c)
for c, errno, errmsg in err_list:
c.fp.close()
c.fp = None
m.remove_handle(c)
print "Failed: ", c.filename, c.url, errno, errmsg
freelist.append(c)
num_processed = num_processed + len(ok_list) + len(err_list)
if num_q == 0:
break
# Currently no more I/O is pending, could do something in the meantime
# (display a progress bar, etc.).
# We just call select() to sleep until some more data is available.
m.select(1.0)
# Cleanup
for c in m.handles:
if c.fp is not None:
c.fp.close()
c.fp = None
c.close()
m.close()
Pouvez-vous expliquer peu? – Nisarg
cela devrait faire l'affaire – Buster