2011-06-21 5 views
3

J'écris un code qui effectue un clustering k-means sur un ensemble de données. J'utilise en fait le code d'un livre appelé intelligence collective par O'Reilly. Tout fonctionne, mais dans son code il utilise la ligne de commande et je veux tout écrire dans le bloc-notes ++. En tant que référence sa ligne estComment imprimer des objets dans un tableau en python?

>>>kclust=clusters.kcluster(data,k=10) 
>>>[rownames[r] for r in k[0]] 

Voici mon code:

from PIL import Image,ImageDraw 
def readfile(filename): 
    lines=[line for line in file(filename)] 
    # First line is the column titles 
    colnames=lines[0].strip().split('\t')[1:] 
    rownames=[] 
    data=[] 
    for line in lines[1:]: 
     p=line.strip().split('\t') 
     # First column in each row is the rowname 
     rownames.append(p[0]) 
     # The data for this row is the remainder of the row 
     data.append([float(x) for x in p[1:]]) 
    return rownames,colnames,data 


from math import sqrt 
def pearson(v1,v2): 
    # Simple sums 
    sum1=sum(v1) 
    sum2=sum(v2) 

    # Sums of the squares 
    sum1Sq=sum([pow(v,2) for v in v1]) 
    sum2Sq=sum([pow(v,2) for v in v2]) 

    # Sum of the products 
    pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) 

    # Calculate r (Pearson score) 
    num=pSum-(sum1*sum2/len(v1)) 
    den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) 
    if den==0: return 0 

    return 1.0-num/den 

class bicluster: 
    def __init__(self,vec,left=None,right=None,distance=0.0,id=None): 
     self.left=left 
     self.right=right 
     self.vec=vec 
     self.id=id 
     self.distance=distance 

def hcluster(rows,distance=pearson): 
    distances={} 
    currentclustid=-1 

    # Clusters are initially just the rows 
    clust=[bicluster(rows[i],id=i) for i in range(len(rows))] 

    while len(clust)>1: 
     lowestpair=(0,1) 
     closest=distance(clust[0].vec,clust[1].vec) 

     # loop through every pair looking for the smallest distance 
     for i in range(len(clust)): 
      for j in range(i+1,len(clust)): 
       # distances is the cache of distance calculations 
       if (clust[i].id,clust[j].id) not in distances: 
        distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec) 
       #print 'i' 
       #print i  
       #print 
       #print 'j' 
       #print j  
       #print 

       d=distances[(clust[i].id,clust[j].id)] 
       if d<closest: 
        closest=d 
        lowestpair=(i,j) 

     # calculate the average of the two clusters 
     mergevec=[ 
     (clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 
     for i in range(len(clust[0].vec))] 

     # create the new cluster 
     newcluster=bicluster(mergevec,left=clust[lowestpair[0]], 
          right=clust[lowestpair[1]], 
          distance=closest,id=currentclustid) 

     # cluster ids that weren't in the original set are negative 
     currentclustid-=1 
     del clust[lowestpair[1]] 
     del clust[lowestpair[0]] 
     clust.append(newcluster) 

    return clust[0] 

def kcluster(rows,distance=pearson,k=4): 
    # Determine the minimum and maximum values for each point 
    ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) 
    for i in range(len(rows[0]))] 

    # Create k randomly placed centroids 
    clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] 
    for i in range(len(rows[0]))] for j in range(k)] 

    lastmatches=None 
    for t in range(100): 
     print 'Iteration %d' % t 
     bestmatches=[[] for i in range(k)] 

     # Find which centroid is the closest for each row 
     for j in range(len(rows)): 
      row=rows[j] 
      bestmatch=0 
      for i in range(k): 
       d=distance(clusters[i],row) 
       if d<distance(clusters[bestmatch],row): bestmatch=i 
      bestmatches[bestmatch].append(j) 

     # If the results are the same as last time, this is complete 
     if bestmatches==lastmatches: break 
     lastmatches=bestmatches 

     # Move the centroids to the average of their members 
     for i in range(k): 
      avgs=[0.0]*len(rows[0]) 
      if len(bestmatches[i])>0: 
       for rowid in bestmatches[i]: 
        for m in range(len(rows[rowid])): 
         avgs[m]+=rows[rowid][m] 
       for j in range(len(avgs)): 
        avgs[j]/=len(bestmatches[i]) 
       clusters[i]=avgs 
    return bestmatches 
+2

Il y a beaucoup de code ici pour ce qui se résume comme une question assez simple « Comment puis-je imprimer ... ». Pensez à poster moins de code, seulement la partie pertinente. –

Répondre

2

Je suppose que vous voulez exécuter cela comme un programme? simple est d'ajouter simplement les lignes en bas:

kclust=clusters.kcluster(data,k=10) 
print [rownames[r] for r in k[0]] 

Vous pouvez ajouter plus tard une fonction principale si vous envisagez d'élargir bien.

EDIT: Il y a une bonne description des fonctions principales de Guido here

Questions connexes