RudiC
February 24, 2015, 6:11am
21
Please don't edit posts to modify the samples after I have answered; that pulls the rug out from under me.
However, the reason was that the group "gn" is not sorted after g1 and g2 but before them within awk's arrays. I should have added that to the limitations (had I known this...). Try this:
awk '
# Print every row in which, for at least one group, 80% or more of the
# columns belonging to that group hold a value greater than 0.1.
# Row 1 holds one group name per data column; header column i is read from
# field i+1 of the later rows (the i+1 offset used below).
# Run with -v debug=1 to trace the counting.
NR == 1 {
    for (i = 1; i <= NF; i++) {
        GRCNT[$i]++                          # number of columns in this group
        if (!GRMIN[$i]) GRMIN[$i] = i + 1    # first field of the group
        GRMAX[$i] = i + 1                    # last field of the group
    }
    if (debug)
        for (i in GRCNT) print "Verteilung: ", $1, i, GRCNT[i], GRMIN[i], GRMAX[i]
}
# No NR filter here: the header rows are evaluated exactly like data rows.
{
    for (gc in GRCNT) {
        TOT[gc] = 0
        for (i = GRMIN[gc]; i <= GRMAX[gc]; i++) {
            TOT[gc] += ($i > 0.1)
            if (debug) print $1, gc, i, $i, ($i > 0.1)
        }
    }
    for (gc in TOT) {
        if (debug) print NR, $1, gc, GRCNT[gc], TOT[gc]
        # Print the row once, as soon as any group reaches the threshold.
        if (TOT[gc] >= GRCNT[gc] * 0.8) { print; break }
    }
}
' file
g1 g1 g1 g1 g1 g1 g1 g1 g1 g1 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2
t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12
lnc2 0.2 0.1 0.2 0.2 0.2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0
lnc4 0 0 0 0 0 0 0 0 0 0 0.2 0.2 2 2 2 2 2 2 2 2 2 2
lnc5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 Like
sorry about that. I was correcting the names to avoid confusion.
When I run the script for 100% of the samples, it gives the wrong output, but it works fine with 90% or 80%. Any idea?
RudiC
March 6, 2015, 6:36am
24
Can't believe that. lnc4 had 100% in g2...
yes working. you are right. sorry about that
The code is working great, but is it also possible to print the number of rows that satisfy the given condition per group? Thanks.
ex:
g1 1(v2)
g2 1(v4)
gn 1(v5)
Here is the code, input and output
Input
g1 g1 g1 g1 g1 g1 g1 g1 g1 g1 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 gn gn gn gn gn
t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 t16 t17 t18 t19 t20 t1 t2 t3 t4 t5
v1 0 0 0 0 0 0 0 0 0 0.1 0.1 0.1 0.1 0.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
v2 0.2 0.1 0.2 0.2 0.2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
v3 0 0 0 0 0 0 0 0 0 0 1 2 3 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0
v4 0 0 0 0 0 0 0 0 0 0 0.2 0.2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0
v5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Script
awk '
# Print every row in which, for at least one group, 80% or more of the
# columns belonging to that group hold a value greater than 0.1.
# Row 1 holds one group name per data column; header column i is read from
# field i+1 of the later rows (the i+1 offset used below).
# Run with -v debug=1 to trace the counting.
NR == 1 {
    for (i = 1; i <= NF; i++) {
        GRCNT[$i]++                          # number of columns in this group
        if (!GRMIN[$i]) GRMIN[$i] = i + 1    # first field of the group
        GRMAX[$i] = i + 1                    # last field of the group
    }
    if (debug)
        for (i in GRCNT) print "Verteilung: ", $1, i, GRCNT[i], GRMIN[i], GRMAX[i]
}
# No NR filter here: the header rows are evaluated exactly like data rows.
{
    for (gc in GRCNT) {
        TOT[gc] = 0
        for (i = GRMIN[gc]; i <= GRMAX[gc]; i++) {
            TOT[gc] += ($i > 0.1)
            if (debug) print $1, gc, i, $i, ($i > 0.1)
        }
    }
    for (gc in TOT) {
        if (debug) print NR, $1, gc, GRCNT[gc], TOT[gc]
        # Print the row once, as soon as any group reaches the threshold.
        if (TOT[gc] >= GRCNT[gc] * 0.8) { print; break }
    }
}
' input
Output
g1 g1 g1 g1 g1 g1 g1 g1 g1 g1 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 g2 gn gn gn gn gn
t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12 t13 t14 t15 t16 t17 t18 t19 t20 t1 t2 t3 t4 t5
v2 0.2 0.1 0.2 0.2 0.2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
v4 0 0 0 0 0 0 0 0 0 0 0.2 0.2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0
v5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1
RudiC
April 6, 2015, 2:33pm
27
Try
awk '
# Print every data row (NR > 2) in which some group has at least 80% of its
# columns above 0.1, then report a per-group tally of the printed rows.
# Row 1 holds one group name per data column; header column i is read from
# field i+1 of the later rows (the i+1 offset used below).
# Run with -v debug=1 to trace the counting.
NR == 1 {
    for (i = 1; i <= NF; i++) {
        GRCNT[$i]++                          # number of columns in this group
        if (!GRMIN[$i]) GRMIN[$i] = i + 1    # first field of the group
        GRMAX[$i] = i + 1                    # last field of the group
    }
    if (debug)
        for (i in GRCNT) print "Verteilung: ", $1, i, GRCNT[i], GRMIN[i], GRMAX[i]
}
NR > 2 {
    for (gc in GRCNT) {
        TOT[gc] = 0
        for (i = GRMIN[gc]; i <= GRMAX[gc]; i++) {
            TOT[gc] += ($i > 0.1)
            if (debug) print $1, gc, i, $i, ($i > 0.1)
        }
    }
    for (gc in TOT) {
        if (debug) print NR, $1, gc, GRCNT[gc], TOT[gc]
        # Only the first qualifying group found is credited, because of the
        # break; the order of "for (gc in TOT)" is implementation-defined.
        if (TOT[gc] >= GRCNT[gc] * 0.8) { print; CNT[gc]++; break }
    }
}
END {
    for (c in CNT) print c, CNT[c]
}
' file
1 Like
I am getting the following errors. Any help would be greatly appreciated. Thanks.
g1_lpx 1. This one should be 2, as l1 and l5 satisfy the 80% condition.
g2_edfj should be 1, as l2 satisfies the 80% condition.
The header is not appearing in the output.
input
g1_lpx g1_lpx g1_lpx g1_lpx g2_edfj g2_edfj g2_edfj g3_pp g3_pp g3_pp g3_pp g4_x g4_x gn_m gn_m gn_m gn_m
qwe100 qwe101 qwe133 qwe44 qweq33 qweq44 qwe77 qwexc2 qwe34 qwe55 qwe77 qwe99 qwe88 qwer5 qwer6 qwer8 qwer9
l1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0
l2 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0
l3 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
l4 0 0 0 0 0 0 0 0 0 0 0 0 0 0.3 0.3 0.3 0.3
l5 0.4 0.4 0.4 0.4 0 0 0 0 0 0 0 0 0 0 0 0 0
Script
awk '
# Print every data row (NR > 2) in which some group has at least 80% of its
# columns above 0.1, then report a per-group tally of the printed rows.
# Row 1 holds one group name per data column; header column i is read from
# field i+1 of the later rows (the i+1 offset used below).
# Run with -v debug=1 to trace the counting.
NR == 1 {
    for (i = 1; i <= NF; i++) {
        GRCNT[$i]++                          # number of columns in this group
        if (!GRMIN[$i]) GRMIN[$i] = i + 1    # first field of the group
        GRMAX[$i] = i + 1                    # last field of the group
    }
    if (debug)
        for (i in GRCNT) print "Verteilung: ", $1, i, GRCNT[i], GRMIN[i], GRMAX[i]
}
NR > 2 {
    for (gc in GRCNT) {
        TOT[gc] = 0
        for (i = GRMIN[gc]; i <= GRMAX[gc]; i++) {
            TOT[gc] += ($i > 0.1)
            if (debug) print $1, gc, i, $i, ($i > 0.1)
        }
    }
    for (gc in TOT) {
        if (debug) print NR, $1, gc, GRCNT[gc], TOT[gc]
        # Only the first qualifying group found is credited, because of the
        # break; the order of "for (gc in TOT)" is implementation-defined.
        if (TOT[gc] >= GRCNT[gc] * 0.8) { print; CNT[gc]++; break }
    }
}
END {
    for (c in CNT) print c, CNT[c]
}
' "$1"
output
l1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0
l2 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0
l3 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
l4 0 0 0 0 0 0 0 0 0 0 0 0 0 0.3 0.3 0.3 0.3
l5 0.4 0.4 0.4 0.4 0 0 0 0 0 0 0 0 0 0 0 0 0
gn_m 2
g4_x 2
g1_lpx 1
RudiC
April 7, 2015, 10:22am
29
Sorry for the headers. For the group counts, I didn't consider that one line can have several groups fulfilling the requirements. Try
awk '
# Echo the two header rows, print every data row in which at least one group
# has 80% or more of its columns above 0.1, and finally report for each group
# how many rows qualified. A row is credited to every group it satisfies.
# Row 1 holds one group name per data column; header column i is read from
# field i+1 of the later rows (the i+1 offset used below).
# Run with -v debug=1 to trace the counting.
NR == 1 {
    for (i = 1; i <= NF; i++) {
        GRCNT[$i]++                          # number of columns in this group
        if (!GRMIN[$i]) GRMIN[$i] = i + 1    # first field of the group
        GRMAX[$i] = i + 1                    # last field of the group
    }
    if (debug)
        for (i in GRCNT) print "Verteilung: ", $1, i, GRCNT[i], GRMIN[i], GRMAX[i]
    print
}
NR == 2 { print }
NR > 2 {
    for (gc in GRCNT) {
        TOT[gc] = 0
        for (i = GRMIN[gc]; i <= GRMAX[gc]; i++) {
            TOT[gc] += ($i > 0.1)
            if (debug) print $1, gc, i, $i, ($i > 0.1)
        }
    }
}
# Runs for every row; TOT is still empty while the two header rows are read.
{
    for (gc in TOT) {
        if (debug) print NR, $1, gc, GRCNT[gc], TOT[gc]
        if (TOT[gc] >= GRCNT[gc] * 0.8) { Pr = 1; CNT[gc]++ }
    }
    if (Pr) { print; Pr = 0 }                # print the row once, reset the flag
}
END {
    for (c in CNT) print c, CNT[c]
}
' file
.
.
.
g4_x 2
g1_lpx 1
gn_m 2
g2_edfj 1
1 Like
This will be the final request. Is it possible to print the number of groups and the number of rows in common from the same input above? Many thanks.
output
no.of groups no.of rows in common
0 0
1 3
2 2
3 0
4 0
5 0
RudiC
April 8, 2015, 4:59pm
31
Not clear. Please elaborate.
Hope its clear .... please let me know if not. thanks
input
g1_lpx g1_lpx g1_lpx g1_lpx g2_edfj g2_edfj g2_edfj g3_pp g3_pp g3_pp g3_pp g4_x g4_x gn_m gn_m gn_m gn_m
qwe100 qwe101 qwe133 qwe44 qweq33 qweq44 qwe77 qwexc2 qwe34 qwe55 qwe77 qwe99 qwe88 qwer5 qwer6 qwer8 qwer9
l1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0
l2 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0
l3 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
l4 0 0 0 0 0 0 0 0 0 0 0 0 0 0.3 0.3 0.3 0.3
l5 0.4 0.4 0.4 0.4 0 0 0 0 0 0 0 0 0 0 0 0 0
output description
0 (none of the groups)
1 (1 of the 5 groups g1_lpx or g2_edfj or g3_pp or g4_x or gn_m)
2 (2 of the 5 groups)
3 (3 of the 5 groups)
4 (4 of the 5 groups)
5 (5 of the 5 groups)
calculate how many of keys (l1 or l2 or l3 or l4 or l5) satisfy the condition 80% in groups 1 or 2 or 3 or 4 or 5
therefore
no.of groups no.of rows in common
0 0
1 3 (l3/l4/l5)
2 2(l1/l2)
3 0
4 0
5 0
RudiC
April 9, 2015, 6:51am
33
awk '
# Echo the two header rows and print every data row in which at least one
# group has 80% or more of its columns above 0.1. At the end report:
#   - per group, how many rows qualified (GCNT)
#   - per row label, how many groups it qualified in (LCNT)
#   - for k = 0..number of groups, how many row labels qualified in exactly
#     k groups (NCNT)
# Row 1 holds one group name per data column; header column i is read from
# field i+1 of the later rows (the i+1 offset used below).
# Run with -v debug=1 to trace the counting.
NR == 1 {
    for (i = 1; i <= NF; i++) {
        if (!($i in GRCNT)) GR++             # count distinct group names
        GRCNT[$i]++                          # number of columns in this group
        if (!GRMIN[$i]) GRMIN[$i] = i + 1    # first field of the group
        GRMAX[$i] = i + 1                    # last field of the group
    }
    if (debug)
        for (i in GRCNT) print "Verteilung: ", $1, i, GRCNT[i], GRMIN[i], GRMAX[i]
    print
}
NR == 2 { print }
NR > 2 {
    for (gc in GRCNT) {
        TOT[gc] = 0
        for (i = GRMIN[gc]; i <= GRMAX[gc]; i++) {
            TOT[gc] += ($i > 0.1)
            if (debug) print $1, gc, i, $i, ($i > 0.1)
        }
    }
}
# Runs for every row; TOT is still empty while the two header rows are read.
{
    for (gc in TOT) {
        if (debug) print NR, $1, gc, GRCNT[gc], TOT[gc]
        if (TOT[gc] >= GRCNT[gc] * 0.8) { Pr = 1; GCNT[gc]++; LCNT[$1]++ }
    }
    if (Pr) { print; Pr = 0 }                # print the row once, reset the flag
}
END {
    for (g in GCNT) print g, GCNT[g]
    for (l in LCNT) {
        print l, LCNT[l]
        NCNT[LCNT[l]]++
    }
    print "no. grp\tno.of rows in common"
    # NOTE(review): rows that qualify in no group never enter LCNT, so the
    # bucket for 0 groups is always reported as 0.
    for (i = 0; i <= GR; i++) print i "\t" NCNT[i] + 0
}
' file
g1_lpx g1_lpx g1_lpx g1_lpx g2_edfj g2_edfj g2_edfj g3_pp g3_pp g3_pp g3_pp g4_x g4_x gn_m gn_m gn_m gn_m
qwe100 qwe101 qwe133 qwe44 qweq33 qweq44 qwe77 qwexc2 qwe34 qwe55 qwe77 qwe99 qwe88 qwer5 qwer6 qwer8 qwer9
l1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0
l2 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0
l3 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
l4 0 0 0 0 0 0 0 0 0 0 0 0 0 0.3 0.3 0.3 0.3
l5 0.4 0.4 0.4 0.4 0 0 0 0 0 0 0 0 0 0 0 0 0
g4_x 2
g1_lpx 1
gn_m 2
g2_edfj 1
l1 1
l2 2
l3 1
l4 1
l5 1
no. grp no.of rows in common
0 0
1 4
2 1
3 0
4 0
5 0
Please be aware that l1 only has one group satisfying the condition!
1 Like
But l1 satisfies the condition in 2 groups, i.e., g1_lpx and g4_x.
0, 1, 2, 3, 4, 5 don't represent l1, l2, l3, l4, l5; they represent the number of groups (g1_lpx ... gn_m).
RudiC
April 9, 2015, 9:14am
35
g1_lpx
has three out of four which is 75% not exceeding the required 80%. What do you mean by
0,1,2,3,4,5 doest represent l1, l2, l3, l4, l5. they represent groups (g1_lpx.....gn_m)
and how does it relate to my last proposal?
my bad. sorry. Your answer was correct.