2017-10-14 5 views
2

Calculer la moyenne de colonnes en fonction d'une autre colonne

Mes données sont les suivantes :

0.5,4.96,0.724973,0.01481065 
0.5,5.11,0.726749,0.01140151 
0.5,4.99,0.893074,0.00910343 
0.5,4.14,0.734336,0.00835252 
0.5,1.69,0.755600,0.00422898 
0.6,4.43,0.733582,0.01796329 
0.6,4.47,0.740393,0.01399680 
0.6,4.49,0.885607,0.01095668 
0.6,3.69,0.720035,0.00992851 
0.6,1.60,0.748339,0.00456993 
0.7,4.03,0.756354,0.02086922 
0.7,3.99,0.771689,0.01705783 
0.7,4.02,0.854532,0.01319982 
0.7,3.33,0.725414,0.01170297 

Je veux calculer la moyenne des 2ᵉ, 3ᵉ et 4ᵉ colonnes pour chaque valeur distincte de la première colonne.

Par exemple, pour 0.5, le résultat attendu est :

0.5,4.18,0.766946,0.00957942

Répondre

4

La solution la plus courte avec GNU datamash:

# Sort the comma-separated records read from stdin, group them by field 1
# and print, for each group, the mean of fields 2, 3 and 4.
datamash -st, -g1 mean 2 mean 3 mean 4 <file 
  • -s : trier les enregistrements avant de les regrouper

  • -t, : utiliser la virgule (,) comme séparateur de champs

  • -g1 : regrouper les enregistrements selon le 1er champ


La sortie:

0.5,4.178,0.7669464,0.009579418 
0.6,3.736,0.7655912,0.011483042 
0.7,3.8425,0.77699725,0.01570746 
+0

Incroyable… je n'avais jamais entendu parler de cet utilitaire. Existe-t-il un moyen de conserver le même nombre de décimales que dans l'entrée ? Les zéros supplémentaires ne me dérangent pas. – user2650277

+0

@user2650277, non, il effectue les calculs et affiche les nombres réels dans leur intégralité – RomanPerekhrest

+1

Ou bien : `datamash -st, -g1 mean 2-4` – randomir

2

awk à la rescousse (en supposant que votre Input_file est trié ; sinon, vous pouvez placer sort -t, -k1 Input_file | awk ... devant le code suivant) :

# Mean of columns 2-4 for each run of identical values in column 1 of the
# comma-separated Input_file (which must already be sorted on column 1).
# Output: "key mean2,mean3,mean4", means rounded to two decimals, always in
# column order. A new group is detected with NR>1 rather than by testing prev,
# so a key of 0 or an empty key is flushed like any other; empty input prints nothing.
awk -F, 'function emit(){if(n)printf "%s %.2f,%.2f,%.2f\n",prev,s2/n,s3/n,s4/n;s2=s3=s4=n=0} NR>1&&$1!=prev{emit()} {s2+=$2;s3+=$3;s4+=$4;n++;prev=$1} END{emit()}' Input_file

La sortie sera la suivante :

0.5 4.18,0.77,0.01 
0.6 3.74,0.77,0.01 
0.7 3.84,0.78,0.02 

Ajout d'une version sur plusieurs lignes (non « one-liner ») :

# Mean of columns 2-4 for each run of identical values in column 1 of the
# comma-separated Input_file (which must already be sorted on column 1).
# Output lines look like: key mean2,mean3,mean4 (two decimals each).
awk -F, '
# Print the finished group in fixed column order, then reset the totals.
# Nothing is printed when no record was accumulated (e.g. empty input).
function emit() {
    if (n)
        printf "%s %.2f,%.2f,%.2f\n", prev, s2 / n, s3 / n, s4 / n
    s2 = s3 = s4 = n = 0
}

# The key changed: flush the previous group. NR > 1 is tested instead of
# the truthiness of prev so that a key of 0 or an empty key is not skipped.
NR > 1 && $1 != prev { emit() }

# Accumulate the running sums and the record count of the current group.
{
    s2 += $2
    s3 += $3
    s4 += $4
    n++
    prev = $1
}

# Flush the last group, which no key change follows.
END { emit() }
' Input_file

EDIT : ajout d'une explication détaillée de la commande.

awk -F, '
##-F, makes the comma the input field separator.

##emit prints the group that just ended as: key mean2,mean3,mean4
##The means are always written in column order and rounded to two decimals.
##Nothing is printed when no record was accumulated (for example empty input).
##After printing, the sums and the counter are reset for the next group.
function emit() {
    if (n)
        printf "%s %.2f,%.2f,%.2f\n", prev, s2 / n, s3 / n, s4 / n
    s2 = s3 = s4 = n = 0
}

##From the second record on, a first field that differs from the previous one
##means a new group starts, so the previous group is printed first.
##NR > 1 is tested instead of the value of prev so that a key of 0 or an
##empty key is handled like any other key.
NR > 1 && $1 != prev { emit() }

##For every record: add fields 2, 3 and 4 to their running sums, count the
##record, and remember its first field as the key of the current group.
{
    s2 += $2
    s3 += $3
    s4 += $4
    n++
    prev = $1
}

##No key change follows the last group, so it is printed from the END block.
END { emit() }
' file17
##file17 is the input file; it must already be sorted on its first column.
2

Voici un petit script Awk propre que vous pouvez utiliser à cet effet,

#!/usr/bin/awk -f

# Print, for every distinct value of column 1, the mean of columns 2, 3 and 4
# of a comma-separated input. The input does not need to be sorted.
#
# FS/OFS: a comma separates fields on both input and output.
# OFMT:   the format print applies to non-integer numbers. "%.6g" is the awk
#         default; set e.g. OFMT="%.2f" to print every mean with two decimals.
#         (CONVFMT only governs number-to-string conversion in concatenations
#         and array subscripts; it has no effect on what print writes.)

BEGIN { FS=OFS=","; OFMT="%.6g" }

# Only well-formed records with exactly four fields are taken into account.
NF == 4 {
    # Remember each key the first time it is seen so that the END block can
    # report the groups in input order; "for (key in array)" has no defined order.
    if (!($1 in count))
        keys[++nkeys] = $1

    # Per-key running sums of the three value columns and the record count.
    sumOfCol2[$1] += $2
    sumOfCol3[$1] += $3
    sumOfCol4[$1] += $4
    count[$1]++
}

END {
    # One output line per key: key, mean of col 2, mean of col 3, mean of col 4.
    for (k = 1; k <= nkeys; k++) {
        i = keys[k]
        print i, (sumOfCol2[i]/count[i]), (sumOfCol3[i]/count[i]), (sumOfCol4[i]/count[i])
    }
}

et exécutez le script comme

awk -f script.awk file 
0.5,4.178,0.766946,0.00957942 
0.6,3.736,0.765591,0.011483 
0.7,3.8425,0.776997,0.0157075