Voici mon code pour la formation de mon modèle doc2vecdocuments gensim de doc2vec non trouvés par id
d'importation gensim.models.doc2vec Doc2Vec
from FileDocIterator import FileDocIterator
doc_file_name = 'doc_6million.txt'
docs = FileDocIterator(doc_file_name)
print "Fitting started"
model = Doc2Vec(docs, size=100, window=5, min_count=5, negative=20, workers=6, iter=4)
print "Saving model"
model.save("doc2vec_model")
print "model saved"
permet maintenant jeter un oeil à FileDocIterator
import json
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Phrases
class FileDocIterator(object):
def __init__(self, fileName):
self.fileName = fileName
self.phrase = Phrases.load("phrases")
def __iter__(self):
for line in open(self.fileName):
jsData = json.loads(line)
yield TaggedDocument(words=jsData["data"], tags=jsData["id"])
Maintenant, je comprends que les phrases ne sont pas utilisées dans cette implémentation, mais gardez-moi ici, regardons à quoi ressemblent les données. Voici le premier point de données
{"data":["strategic","and","analytical","technical","program","director","and","innovator","who","inspires","calculated","risk-taking","in","emerging","technologies",",","such","as","cyber","security",",","risk",",","analytics",",","big","data",",","cloud",",","mobility","and","3d","printing",".","known","for","growing","company","profit","through","innovative","thinking","aimed","at","improving","employee","productivity","and","providing","solutions","to","private","industry","and","government","customers",".","recognized","for","invigorating","creative","thinking","and","collaboration","within","large","companies","to","leverage","their","economies","of","scale","to","capture","market","share",".","successful","in","managing","the","risk","and","uncertainty","throughout","the","innovation","lifecycle","by","leveraging","an","innovation","management","framework","to","overcome","barriers",".","track","record","of","producing","results","in","competitive",",","rapidly","changing","environments","where","innovation","and","customer","satisfaction","is","the","business",".","competencies","include",":","innovation","management","cyber",",","risk",",","analytics",",","cloud","computing","and","mobility","technology","development","security","compliance",":","dod/ic","(","nispom",",","icd","503",",","fedramp",")","commercial","(","iso/iec","27002",",","pci","dss",")","relationship","management",":","dod",",","public","sector","and","intelligence","community","change","management","it","security","&","risk","management","(","cissp",")","program",",","product","&","portfolio","management","(","pmp",")","data","analytics","management","(","cchd",")","itil","service",
"management","(","itilv3-expert",")"],
"id":"55c37f730d03382935e12767"}
Ma compréhension est que l'identifiant, 55c37f730d03382935e12767
doit être l'identifiant du document, ce faisant, le suivant doit me rendre un docVector.
model.docvecs["55c37f730d03382935e12767"]
Au lieu de cela, c'est ce qui est édité.
>>> model.docvecs["55c37f730d03382935e12767"]
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/gensim/models/doc2vec.py", line 341, in __getitem__
return self.doctag_syn0[self._int_index(index)]
File "/usr/local/lib/python2.7/dist-packages/gensim/models/doc2vec.py", line 315, in _int_index
return self.max_rawint + 1 + self.doctags[index].offset
KeyError: '55c37f730d03382935e12767'
Essayer d'obtenir donne plus similaire les suivantes dos
>>> model.docvecs.most_similar("55c37f730d03382935e12767")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/gensim/models/doc2vec.py", line 450, in most_similar
raise KeyError("doc '%s' not in trained set" % doc)
KeyError: "doc '55c37f730d03382935e12767' not in trained set"
Ce que je suis en train de comprendre comment est sont des vecteurs de doc enregistrés et ce que id sont utilisés. Quelle partie de mon approche ne fonctionne pas ci-dessus?
Maintenant, voici quelque chose d'intéressant, si je fais ce qui suit, je récupère des vecteurs doc similaires, mais ils n'ont aucun sens pour moi.
>>> model.docvecs.most_similar(str(1))
[(u'8', 0.9000369906425476), (u'3', 0.8878246545791626), (u'7', 0.886141836643219), (u'2', 0.8834314942359924), (u'e', 0.8812381029129028), (u'a', 0.8648831248283386), (u'd', 0.8587037920951843), (u'0', 0.8413013219833374), (u'4', 0.8385311365127563), (u'c', 0.8290119767189026)]