Here is the problem: my Python program freezes during execution after a few days
I am writing a Python program whose purpose is to continuously collect news from RSS feeds. I want the program to collect the data for one week. The problem is that the program never makes it to the end of the week. Sometimes it freezes after several days, sometimes after several hours, and sometimes after only a few minutes. It always freezes, with no errors. By freeze I mean that the interpreter still seems to be running, in the sense that I cannot give it any other commands. How can I solve this problem?
I will post the code below. Thanks guys!!
from goose import Goose
from requests import get
import urllib2
import feedparser
from urllib2 import urlopen
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import datetime as dt
import time
import os
Symbols=['AAPL','T','BA','XOM','GOOG','JPM','PG','WMT']
url='http://finance.yahoo.com/rss/headline?s='
for t in xrange(7):
    AAPL=pd.DataFrame()
    AAPL['Published']=""
    AAPL['Title']=""
    AAPL['link']=""
    AAPL['ID']=""
    AAPL['News']=""
    T=pd.DataFrame()
    T['Published']=""
    T['Title']=""
    T['link']=""
    T['ID']=""
    T['News']=""
    BA=pd.DataFrame()
    BA['Published']=""
    BA['Title']=""
    BA['link']=""
    BA['ID']=""
    BA['News']=""
    XOM=pd.DataFrame()
    XOM['Published']=""
    XOM['Title']=""
    XOM['link']=""
    XOM['ID']=""
    XOM['News']=""
    GOOG=pd.DataFrame()
    GOOG['Published']=""
    GOOG['Title']=""
    GOOG['link']=""
    GOOG['ID']=""
    GOOG['News']=""
    JPM=pd.DataFrame()
    JPM['Published']=""
    JPM['Title']=""
    JPM['link']=""
    JPM['ID']=""
    JPM['News']=""
    PG=pd.DataFrame()
    PG['Published']=""
    PG['Title']=""
    PG['link']=""
    PG['ID']=""
    PG['News']=""
    WMT=pd.DataFrame()
    WMT['Published']=""
    WMT['Title']=""
    WMT['link']=""
    WMT['ID']=""
    WMT['News']=""
    DaysIDsAAPL=[]
    DaysIDsT=[]
    DaysIDsBA=[]
    DaysIDsXOM=[]
    DaysIDsGOOG=[]
    DaysIDsJPM=[]
    DaysIDsPG=[]
    DaysIDsWMT=[]
    count=0
    AAPLCount=0
    TCount=0
    BACount=0
    XOMCount=0
    GOOGCount=0
    JPMCount=0
    PGCount=0
    WMTCount=0
    date=dt.date.today()
    newpathAAPL = r'D:\News Data\AAPL\\'+str(t)
    newpathT = r'D:\News Data\T\\'+str(t)
    newpathBA = r'D:\News Data\BA\\'+str(t)
    newpathXOM = r'D:\News Data\XOM\\'+str(t)
    newpathGOOG = r'D:\News Data\GOOG\\'+str(t)
    newpathJPM = r'D:\News Data\JPM\\'+str(t)
    newpathPG = r'D:\News Data\PG\\'+str(t)
    newpathWMT = r'D:\News Data\WMT\\'+str(t)
    os.makedirs(newpathAAPL)
    os.makedirs(newpathT)
    os.makedirs(newpathBA)
    os.makedirs(newpathXOM)
    os.makedirs(newpathGOOG)
    os.makedirs(newpathJPM)
    os.makedirs(newpathPG)
    os.makedirs(newpathWMT)
    while dt.date.today()==date:
        print "Loop"
        try:
            #AAPL inner most loop
            d1=feedparser.parse(url+Symbols[0])
            for x in xrange(len(d1['entries'])):
                if int(d1.entries[x]['id'][14:]) not in DaysIDsAAPL:
                    DaysIDsAAPL.append(int(d1.entries[x]['id'][14:]))
                    y = len(AAPL.index.tolist())
                    m=re.search(r'\*(.*)',d1.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    AAPL.loc[y,'Title'] =d1.entries[x]['title'].encode('utf8')
                    AAPL.loc[y,'link'] =m.encode('utf8')
                    AAPL.loc[y,'Published'] =d1.entries[x]['published'].encode('utf8')
                    AAPL.loc[y,'ID'] =int(d1.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            AAPL.loc[y,'News'] = AAPLCount
                            AAPLCount+=1
                            AAPL=AAPL.fillna("")
                            AAPL.to_csv(newpathAAPL+r'\Key.csv')
                        except:
                            print m
                            print "AAPL"
                    else:
                        Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        AAPL.loc[y,'News'] =AAPLCount
                        AAPLCount+=1
                        AAPL=AAPL.fillna("")
                        AAPL.to_csv(newpathAAPL+r'\Key.csv')
            print "AAPL"
            #T inner most loop
            d2=feedparser.parse(url+Symbols[1])
            for x in xrange(len(d2['entries'])):
                if int(d2.entries[x]['id'][14:]) not in DaysIDsT:
                    DaysIDsT.append(int(d2.entries[x]['id'][14:]))
                    y = len(T.index.tolist())
                    m=re.search(r'\*(.*)',d2.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    T.loc[y,'Title'] =d2.entries[x]['title'].encode('utf8')
                    T.loc[y,'link'] =m.encode('utf8')
                    T.loc[y,'Published'] =d2.entries[x]['published'].encode('utf8')
                    T.loc[y,'ID'] =int(d2.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            T.loc[y,'News'] = TCount
                            TCount+=1
                            T=T.fillna("")
                            T.to_csv(newpathT+r'\Key.csv')
                        except:
                            print m
                            print "T"
                    else:
                        Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        T.loc[y,'News'] =TCount
                        TCount+=1
                        T=T.fillna("")
                        T.to_csv(newpathT+r'\Key.csv')
            print "T"
            #BA inner most loop
            d3=feedparser.parse(url+Symbols[2])
            for x in xrange(len(d3['entries'])):
                if int(d3.entries[x]['id'][14:]) not in DaysIDsBA:
                    DaysIDsBA.append(int(d3.entries[x]['id'][14:]))
                    y = len(BA.index.tolist())
                    m=re.search(r'\*(.*)',d3.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    BA.loc[y,'Title'] =d3.entries[x]['title'].encode('utf8')
                    BA.loc[y,'link'] =m.encode('utf8')
                    BA.loc[y,'Published'] =d3.entries[x]['published'].encode('utf8')
                    BA.loc[y,'ID'] =int(d3.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            BA.loc[y,'News'] = BACount
                            BACount+=1
                            BA=BA.fillna("")
                            BA.to_csv(newpathBA+r'\Key.csv')
                        except:
                            print m
                            print "BA"
                    else:
                        Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        BA.loc[y,'News'] =BACount
                        BACount+=1
                        BA=BA.fillna("")
                        BA.to_csv(newpathBA+r'\Key.csv')
            print "BA"
            #XOM inner most loop
            d4=feedparser.parse(url+Symbols[3])
            for x in xrange(len(d4['entries'])):
                if int(d4.entries[x]['id'][14:]) not in DaysIDsXOM:
                    DaysIDsXOM.append(int(d4.entries[x]['id'][14:]))
                    y = len(XOM.index.tolist())
                    m=re.search(r'\*(.*)',d4.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    XOM.loc[y,'Title'] =d4.entries[x]['title'].encode('utf8')
                    XOM.loc[y,'link'] =m.encode('utf8')
                    XOM.loc[y,'Published'] =d4.entries[x]['published'].encode('utf8')
                    XOM.loc[y,'ID'] =int(d4.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            XOM.loc[y,'News'] = XOMCount
                            XOMCount+=1
                            XOM=XOM.fillna("")
                            XOM.to_csv(newpathXOM+r'\Key.csv')
                        except:
                            print m
                            print "XOM"
                    else:
                        Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        XOM.loc[y,'News'] =XOMCount
                        XOMCount+=1
                        XOM=XOM.fillna("")
                        XOM.to_csv(newpathXOM+r'\Key.csv')
            #GOOG inner most loop
            d5=feedparser.parse(url+Symbols[4])
            for x in xrange(len(d5['entries'])):
                if int(d5.entries[x]['id'][14:]) not in DaysIDsGOOG:
                    DaysIDsGOOG.append(int(d5.entries[x]['id'][14:]))
                    y = len(GOOG.index.tolist())
                    m=re.search(r'\*(.*)',d5.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    GOOG.loc[y,'Title'] =d5.entries[x]['title'].encode('utf8')
                    GOOG.loc[y,'link'] =m.encode('utf8')
                    GOOG.loc[y,'Published'] =d5.entries[x]['published'].encode('utf8')
                    GOOG.loc[y,'ID'] =int(d5.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            GOOG.loc[y,'News'] = GOOGCount
                            GOOGCount+=1
                            GOOG=GOOG.fillna("")
                            GOOG.to_csv(newpathGOOG+r'\Key.csv')
                        except:
                            print m
                            print "GOOG"
                    else:
                        Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        GOOG.loc[y,'News'] =GOOGCount
                        GOOGCount+=1
                        GOOG=GOOG.fillna("")
                        GOOG.to_csv(newpathGOOG+r'\Key.csv')
            print "GOOG"
            #JPM inner most loop
            d6=feedparser.parse(url+Symbols[5])
            for x in xrange(len(d6['entries'])):
                if int(d6.entries[x]['id'][14:]) not in DaysIDsJPM:
                    DaysIDsJPM.append(int(d6.entries[x]['id'][14:]))
                    y = len(JPM.index.tolist())
                    m=re.search(r'\*(.*)',d6.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    JPM.loc[y,'Title'] =d6.entries[x]['title'].encode('utf8')
                    JPM.loc[y,'link'] =m.encode('utf8')
                    JPM.loc[y,'Published'] =d6.entries[x]['published'].encode('utf8')
                    JPM.loc[y,'ID'] =int(d6.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            JPM.loc[y,'News'] = JPMCount
                            JPMCount+=1
                            JPM=JPM.fillna("")
                            JPM.to_csv(newpathJPM+r'\Key.csv')
                        except:
                            print m
                            print "JPM"
                    else:
                        Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        JPM.loc[y,'News'] =JPMCount
                        JPMCount+=1
                        JPM=JPM.fillna("")
                        JPM.to_csv(newpathJPM+r'\Key.csv')
            print "JPM"
            #PG inner most loop
            d7=feedparser.parse(url+Symbols[6])
            for x in xrange(len(d7['entries'])):
                if int(d7.entries[x]['id'][14:]) not in DaysIDsPG:
                    DaysIDsPG.append(int(d7.entries[x]['id'][14:]))
                    y = len(PG.index.tolist())
                    m=re.search(r'\*(.*)',d7.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    PG.loc[y,'Title'] =d7.entries[x]['title'].encode('utf8')
                    PG.loc[y,'link'] =m.encode('utf8')
                    PG.loc[y,'Published'] =d7.entries[x]['published'].encode('utf8')
                    PG.loc[y,'ID'] =int(d7.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == "":
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            PG.loc[y,'News'] = PGCount
                            PGCount+=1
                            PG=PG.fillna("")
                            PG.to_csv(newpathPG+r'\Key.csv')
                        except:
                            print m
                            print "PG"
                    else:
                        Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        PG.loc[y,'News'] =PGCount
                        PGCount+=1
                        PG=PG.fillna("")
                        PG.to_csv(newpathPG+r'\Key.csv')
            print "PG"
            #WMT inner most loop
            d8=feedparser.parse(url+Symbols[7])
            for x in xrange(len(d8['entries'])):
                if int(d8.entries[x]['id'][14:]) not in DaysIDsWMT:
                    DaysIDsWMT.append(int(d8.entries[x]['id'][14:]))
                    y = len(WMT.index.tolist())
                    m=re.search(r'\*(.*)',d8.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    WMT.loc[y,'Title'] =d8.entries[x]['title'].encode('utf8')
                    WMT.loc[y,'link'] =m.encode('utf8')
                    WMT.loc[y,'Published'] =d8.entries[x]['published'].encode('utf8')
                    WMT.loc[y,'ID'] =int(d8.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == "":
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            WMT.loc[y,'News'] = WMTCount
                            WMTCount+=1
                            WMT=WMT.fillna("")
                            WMT.to_csv(newpathWMT+r'\Key.csv')
                        except:
                            print m
                            print "WMT"
                    else:
                        Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        WMT.loc[y,'News'] =WMTCount
                        WMTCount+=1
                        WMT=WMT.fillna("")
                        WMT.to_csv(newpathWMT+r'\Key.csv')
            print "WMT"
            count+=1
            print count
            time.sleep(1)
        except:
            print "Error"
    AAPL=AAPL.fillna("")
    AAPL.to_csv(newpathAAPL+r'\Key.csv')
    T=T.fillna("")
    T.to_csv(newpathT+r'\Key.csv')
    BA=BA.fillna("")
    BA.to_csv(newpathBA+r'\Key.csv')
    XOM=XOM.fillna("")
    XOM.to_csv(newpathXOM+r'\Key.csv')
    GOOG=GOOG.fillna("")
    GOOG.to_csv(newpathGOOG+r'\Key.csv')
    JPM=JPM.fillna("")
    JPM.to_csv(newpathJPM+r'\Key.csv')
    PG=PG.fillna("")
    PG.to_csv(newpathPG+r'\Key.csv')
    WMT=WMT.fillna("")
    WMT.to_csv(newpathWMT+r'\Key.csv')
You have a lot of duplicated code; you could extract the redundant code into helper methods (small functions) that you pass arguments into –
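For example, the eight per-symbol sections do exactly the same work and differ only in the symbol, the DataFrame, the seen-ID list, the output folder, and the file counter, so they could collapse into one helper. A rough sketch of the idea, reusing the question's imports and url; the name collect_feed is made up here, and the per-request try/except from the original is omitted for brevity:

def collect_feed(symbol, frame, seen_ids, out_dir, file_count):
    # One polling pass over one symbol's RSS feed: parse, skip entries
    # already seen today, scrape the article text, save it to disk, and
    # checkpoint the key CSV.
    hdr = {'User-Agent': 'Mozilla/5.0'}
    d = feedparser.parse(url + symbol)
    for entry in d.entries:
        entry_id = int(entry['id'][14:])
        if entry_id in seen_ids:
            continue
        seen_ids.append(entry_id)
        y = len(frame.index)
        m = re.search(r'\*(.*)', entry['link']).group(1)
        m = re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m)
        frame.loc[y, 'Title'] = entry['title'].encode('utf8')
        frame.loc[y, 'link'] = m.encode('utf8')
        frame.loc[y, 'Published'] = entry['published'].encode('utf8')
        frame.loc[y, 'ID'] = entry_id
        page = get(m, headers=hdr)
        text = Goose().extract(raw_html=page.text).cleaned_text.encode('utf8')
        if text == '':
            # Fall back to a plain BeautifulSoup text dump of the page.
            req = urllib2.Request(m, None, hdr)
            text = BeautifulSoup(urlopen(req).read().decode('utf8'),
                                 "lxml").get_text().encode('utf8')
        out = open(out_dir + r"\\" + str(file_count) + ".txt", "w")
        out.write(text)
        out.close()
        frame.loc[y, 'News'] = file_count
        file_count += 1
        frame.fillna("").to_csv(out_dir + r'\Key.csv')
    return file_count

# Each duplicated section then shrinks to a single call, e.g.:
# AAPLCount = collect_feed(Symbols[0], AAPL, DaysIDsAAPL, newpathAAPL, AAPLCount)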
If your code has no bug, try disabling gc and collecting garbage manually (if you are using 2.6). – galaxyan
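That suggestion amounts to something like the sketch below; where exactly to call gc.collect() is a guess, and it only helps if the freeze really is collector-related:

import gc
import time

gc.disable()  # switch off the automatic cyclic garbage collector

for _ in xrange(10):  # stands in for the real week-long polling loop
    # ... one polling pass over all the feeds goes here ...
    gc.collect()  # reclaim garbage manually at a predictable point
    time.sleep(1)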
Maybe try breaking the code down so that you can isolate the problem. Split all those sections into different functions and add a decorator responsible for tracing, e.g. 'timestamp - func name started - parameters' and 'timestamp - func name finished - duration'. There could be a problem with the service you are calling that you are not handling gracefully, but that is hard to tell in a 500-line program! – fips
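Such a tracing decorator could look roughly like this (a sketch only; the names traced and parse_feed are made up, and parse_feed reuses feedparser and url from the question). The last line printed before a hang then tells you which call froze:

import datetime as dt
import functools

def traced(func):
    # Print when func starts and finishes, so the log shows exactly
    # where a run was when it froze, and with which arguments.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = dt.datetime.now()
        print "%s - %s started - args=%r kwargs=%r" % (
            start, func.__name__, args, kwargs)
        result = func(*args, **kwargs)
        print "%s - %s finished - duration=%s" % (
            dt.datetime.now(), func.__name__, dt.datetime.now() - start)
        return result
    return wrapper

@traced
def parse_feed(symbol):
    return feedparser.parse(url + symbol)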