The awk
below executes and is close (producing the first 4 columns in desired). However, when I add the sum of $7
, I get nothing returned. Basically, I am trying to combine all the matching $4
in f1
and output them with the average of $7
in each match. Thank you :).
f1
chr9 135804143 135804145 chr9:135804143-135804145 TSC1 1 128
chr9 135804143 135804145 chr9:135804143-135804145 TSC1 2 125
chr9 135819919 135819922 chr9:135819919-135819922 TSC1 1 0
chr9 135819919 135819922 chr9:135819919-135819922 TSC1 2 0
chr9 135819919 135819922 chr9:135819919-135819922 TSC1 3 0
chr16 2097885 2097890 chr16:2097885-2097890 TSC2 1 249
chr16 2097885 2097890 chr16:2097885-2097890 TSC2 2 245
chr16 2097885 2097890 chr16:2097885-2097890 TSC2 3 243
chr16 2097885 2097890 chr16:2097885-2097890 TSC2 4 237
chr16 2097885 2097890 chr16:2097885-2097890 TSC2 5 237
desired tab-delimeted
chr9 135804143 135804145 TSC1 126
chr9 135819919 135819922 TSC1 0
chr16 2097885 2097890 TSC2 242
awk
awk '
# print from stored values
function prt(){
print p1 ":" (p6start==1 ? p2 : p2+p6start) "-" p2+p6, "\t" p5
}
($4!=p4 || $6!=p6+1) {
# new sequence, print the previous sequence
if (NR>1) prt()
p6start=$6
}
{
# store the values that for later
p1=$1
p2=$2
p4=$4
p5=$5
p6=$6
}
END { prt() }
' f1 | awk -F"[:-]" ' { print $1 "\t" $2 "\t" $3 "\t" $4}' | awk -v N=7 '{ sum += $N } END { if (NR > 0) print sum / NR }' > out