Calculer moyenne de la colonne basée sur une autre colonne

Mes données sont les suivantes :

0.5,4.96,0.724973,0.01481065 
0.5,5.11,0.726749,0.01140151 
0.5,4.99,0.893074,0.00910343 
0.5,4.14,0.734336,0.00835252 
0.5,1.69,0.755600,0.00422898 
0.6,4.43,0.733582,0.01796329 
0.6,4.47,0.740393,0.01399680 
0.6,4.49,0.885607,0.01095668 
0.6,3.69,0.720035,0.00992851 
0.6,1.60,0.748339,0.00456993 
0.7,4.03,0.756354,0.02086922 
0.7,3.99,0.771689,0.01705783 
0.7,4.02,0.854532,0.01319982 
0.7,3.33,0.725414,0.01170297

Je veux calculer la moyenne des 2ème, 3ème et 4ème colonnes en fonction de la valeur de la première colonne.

Par exemple, pour 0.5, le résultat attendu est :

0.5,4.18,0.766946,0.00957942

Source

2017-10-14 user2650277

La solution la plus courte avec GNU datamash:

# Sort the input (-s), split fields on commas (-t,), group rows by field 1 (-g1)
# and print the mean of fields 2, 3 and 4 for each group.
datamash -st, -g1 mean 2 mean 3 mean 4 <file

-s : trie les enregistrements avant le regroupement
-t, : définit la virgule (,) comme séparateur de champs
-g1 : regroupe les enregistrements selon le 1er champ

La sortie:

0.5,4.178,0.7669464,0.009579418 
0.6,3.736,0.7655912,0.011483042 
0.7,3.8425,0.77699725,0.01570746

Source

2017-10-14 09:30:56 RomanPerekhrest

Incroyable… je n'avais jamais entendu parler de cet utilitaire. Existe-t-il un moyen de conserver le même nombre de décimales que dans l'entrée ? Les zéros supplémentaires ne me dérangent pas. – user2650277

@user2650277, non : il effectue les calculs et affiche les nombres réels dans leur intégralité – RomanPerekhrest

Ou bien : `datamash -st, -g1 mean 2-4` – randomir

awk à la rescousse (en supposant que votre Input_file est trié ; sinon, vous pouvez le trier au préalable avec sort -t, -k1 Input_file | awk ... suivi du code ci-dessous) :

# Per-group means of fields 2-4 (two decimals), one line per run of rows sharing field 1.
# Fields are visited in fixed order 2,3,4 (for-in order is unspecified), the open group is
# tracked with a row counter so keys equal to 0 work, and empty input prints nothing.
# Rows of a group must be adjacent: sort the input on field 1 first if necessary.
awk -F, 'function emit(f,out){out="";for(f=2;f<=4;f++)out=(f>2?out FS:"") sprintf("%0.2f",sum[f]/rows);print prev,out;delete sum;rows=0} rows && $1!=prev{emit()} {for(f=2;f<=4;f++)sum[f]+=$f;rows++;prev=$1} END{if(rows)emit()}' Input_file

La sortie sera la suivante :

0.5 4.18,0.77,0.01 
0.6 3.74,0.77,0.01 
0.7 3.84,0.78,0.02

Ajout d'une version sur plusieurs lignes (non « one-liner ») :

awk -F, '
# Print one line per run of consecutive rows sharing the same first field:
# the key, a space, then the comma-joined means of fields 2, 3 and 4,
# each formatted with two decimals.
# Rows of a group must be adjacent, so sort the input on field 1 first if needed.

# Output the group currently being accumulated, then reset the accumulators.
function emit(    f, out) {
    out = ""
    # Walk the fields in fixed order 2,3,4: for-in traversal order is unspecified,
    # so it cannot be relied on to keep the output columns in place.
    for (f = 2; f <= 4; f++)
        out = (f > 2 ? out FS : "") sprintf("%0.2f", sum[f] / rows)
    print prev, out
    delete sum
    rows = 0
}

# A group is open (rows > 0) and the key changed: flush the previous group.
# Testing rows instead of prev keeps keys such as 0 or 0.0 working.
rows && $1 != prev { emit() }

# Accumulate the current row into the open group.
{
    for (f = 2; f <= 4; f++)
        sum[f] += $f
    rows++
    prev = $1
}

# Flush the last group; nothing is printed for empty input.
END { if (rows) emit() }
' Input_file

EDIT : ajout d'une explication de la commande.

awk -F, '
## -F, makes the comma the input field separator (also available as FS below).
function emit(    f, out) {
## Helper that prints the group currently being accumulated; f and out are local variables.
    out = ""
## Start from an empty output string for this group.
    for (f = 2; f <= 4; f++)
## Visit fields 2, 3 and 4 in fixed order, because for-in traversal order is unspecified.
        out = (f > 2 ? out FS : "") sprintf("%0.2f", sum[f] / rows)
## Append the mean of field f (sum divided by row count, two decimals), comma-separated after the first one.
    print prev, out
## Print the group key, a space (default OFS) and the joined means.
    delete sum
## Forget the sums so the next group starts from zero.
    rows = 0
## No group is open any more.
}
rows && $1 != prev { emit() }
## When a group is open and the first field changed, flush the previous group.
## rows is tested instead of prev so that a key equal to 0 is still handled.
{
    for (f = 2; f <= 4; f++)
        sum[f] += $f
## Add fields 2, 3 and 4 of the current row to the running sums of the open group.
    rows++
## Count the rows of the open group; this is the divisor of every mean.
    prev = $1
## Remember the key of the open group for the next comparison.
}
END { if (rows) emit() }
## Flush the last group, which no key change can trigger; empty input prints nothing.
' Input_file
## Input_file is the comma-separated data file; rows of a group must be adjacent (sort on field 1 first if needed).

Source

2017-10-14 09:27:02 RavinderSingh13

Voici un petit script Awk propre que vous pouvez utiliser à cet effet,

#!/usr/bin/awk -f

# Per-group column means: rows are grouped by field 1 and the arithmetic mean
# of fields 2, 3 and 4 is printed for every group.
#
# Input and output are comma-separated. Non-integer numbers written by print
# are formatted with OFMT; CONVFMT only governs number-to-string conversions
# such as concatenation or array subscripts, so it has no effect on this
# output. "%.6g" is the awk default; use e.g. "%.2f" for two fixed decimals.

BEGIN { FS = OFS = ","; OFMT = "%.6g" }

# Only well-formed 4-field records take part; blank or short lines are skipped.
NF == 4 {
    # Remember each key the first time it appears so that END can report the
    # groups in input order (for-in traversal order is unspecified in awk).
    if (!($1 in count))
        keys[++nkeys] = $1

    # Running per-key sums of the three value columns and the row count.
    sumOfCol2[$1] += $2
    sumOfCol3[$1] += $3
    sumOfCol4[$1] += $4
    count[$1]++
}

END {
    # One output line per key: key, mean of col 2, mean of col 3, mean of col 4.
    for (k = 1; k <= nkeys; k++) {
        key = keys[k]
        print key, sumOfCol2[key] / count[key], sumOfCol3[key] / count[key], sumOfCol4[key] / count[key]
    }
}

et exécutez le script ainsi :

awk -f script.awk file 
0.5,4.178,0.766946,0.00957942 
0.6,3.736,0.765591,0.011483 
0.7,3.8425,0.776997,0.0157075

Source

2017-10-14 09:28:23 Inian

Calculer moyenne de la colonne basée sur une autre colonne

Répondre

Questions connexes