I am indexing URL pages with PyLucene. Adding the field contents to the Document does not work
I got some errors while trying to add fields to the document, and I am not sure why. The error says:
JavaError: Java stacktrace:
java.lang.IllegalArgumentException: it doesn't make sense to have a field that is neither indexed nor stored
    at org.apache.lucene.document.Field.<init>(Field.java:249)
on the line where I do doc.add(Field("contents", text, t2)).
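As far as I can tell from the FieldType docs, a new FieldType starts out with IndexOptions.NONE, so a type with setStored(False) and no explicit index options describes a field that is neither indexed nor stored, which looks like exactly the combination the Field constructor rejects. A minimal sketch of the two cases (assuming the Lucene 6.x / PyLucene import paths):

from org.apache.lucene.document import Field, FieldType
from org.apache.lucene.index import IndexOptions

bad = FieldType()
bad.setStored(False)
bad.setTokenized(True)
# bad.indexOptions() is still IndexOptions.NONE: neither indexed nor stored,
# so this would raise the IllegalArgumentException above:
# Field("contents", "some text", bad)

ok = FieldType()
ok.setStored(False)
ok.setTokenized(True)
ok.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
Field("contents", "some text", ok)  # accepted: indexed, just not stored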
The Python code I used is:
import os
import re
import urllib2
import urlparse
import robotparser

import lucene
from bs4 import BeautifulSoup
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

def IndexerForUrl(start, number, domain):
    lucene.initVM()

    # join base dir and index dir
    path = os.path.abspath("paths")
    directory = SimpleFSDirectory(Paths.get(path))  # the index
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(directory, writerConfig)

    print "reading lines from sys.std..."

    # dictionary mapping each visited page to the links found on it
    D = {}
    D[start] = [start]

    numVisited = 0
    wordBool = False
    n = start
    queue = [start]
    visited = set()

    # t1: stored but not tokenized (e.g. for identifiers)
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)

    # t2: tokenized but not stored; no index options are set here,
    # which is what the IllegalArgumentException complains about
    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)

    while numVisited < number and queue and not wordBool:
        pg = queue.pop(0)

        if pg not in visited:
            visited.add(pg)

            htmlwebpg = urllib2.urlopen(pg).read()

            # robot exclusion standard: point the parser at the site's
            # robots.txt (not the page itself), then fetch and parse it
            rp = robotparser.RobotFileParser()
            rp.set_url(urlparse.urljoin(pg, "/robots.txt"))
            rp.read()

            soup = BeautifulSoup(htmlwebpg, 'html.parser')

            # drop script and style elements before extracting text
            for script in soup(["script", "style"]):
                script.extract()

            text = soup.get_text()

            # collapse whitespace: one non-empty chunk per line
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            text = '\n'.join(chunk for chunk in chunks if chunk)

            print text

            doc = Document()
            doc.add(Field("urlpath", pg, t2))
            if len(text) > 0:
                doc.add(Field("contents", text, t2))
            else:
                print "warning: no content in %s" % pg

            writer.addDocument(doc)

            numVisited = numVisited + 1

            linkset = set()
            # collect outgoing http links allowed by robots.txt;
            # can_fetch takes a user agent as its first argument
            for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
                if rp.can_fetch("*", link.get('href')):
                    linkset.add(link.get('href'))

            D[pg] = linkset
            queue.extend(D[pg] - visited)

    writer.commit()
    writer.close()
    directory.close()  # close the index
    return writer
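If the goal is an indexed-but-not-stored contents field, the change that seems to address the error is giving t2 explicit index options before any Field is built from it (again assuming Lucene 6.x / PyLucene naming). The urlpath field would plausibly use the stored t1 type instead, so the URL can be read back from a search hit; t1 is legal even without index options because it is at least stored:

from org.apache.lucene.index import IndexOptions

t2 = FieldType()
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # indexed, so the combination is legal

doc = Document()
doc.add(Field("urlpath", pg, t1))     # stored, untokenized: retrievable from hits
doc.add(Field("contents", text, t2))  # indexed full text, not stored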