Add specific string to last field of each line in perl based on value

I am trying to add a condition to the below perl that will capture the GT tag and place a specific string in the last field of each line. The problem is that the GT value used is not right after the tag rather it is a few fields away. The values should always be 0/1 or 1/2 and are in bold in the input . 0/1=het and 1/2=hom . Thank you :).

input

##
##
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##
##
#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO    FORMAT    SAMPLE
chr1    9324670    .    A    G    672.016    PASS    AF=0.528369;AO=148;DP=281;FAO=149;FDP=282;FR=.;FRO=133;FSAF=59;FSAR=90;FSRF=60;FSRR=73;FWDB=0.00343606;FXX=0;HRUN=1;LEN=1;MLLD=155.207;OALT=G;OID=.;OMAPALT=G;OPOS=9324670;OREF=A;PB=0.5;PBP=1;QD=9.53214;RBI=0.00594431;REFB=-0.0181827;REVB=0.00485061;RO=130;SAF=59;SAR=89;SRF=57;SRR=73;SSEN=0;SSEP=0;SSSB=-0.0352973;STB=0.526882;STBP=0.323;TYPE=snp;VARB=0.0184938;ANN=H6PD    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:527:281:282:130:133:148:149:0.528369:89:59:57:73:90:59:60:73 GOOD 282 reads
chr1    10318652    .    C    G    360.217    PASS    AF=0.566929;AO=72;DP=129;FAO=72;FDP=127;FR=.;FRO=55;FSAF=36;FSAR=36;FSRF=31;FSRR=24;FWDB=0.00760676;FXX=0.0155027;HRUN=2;LEN=1;MLLD=115.62;OALT=G;OID=.;OMAPALT=G;OPOS=10318652;OREF=C;PB=0.5;PBP=1;QD=11.3454;RBI=0.0125905;REFB=-0.0312889;REVB=-0.0100329;RO=55;SAF=36;SAR=36;SRF=31;SRR=24;SSEN=0;SSEP=0;SSSB=-0.0505108;STB=0.527551;STBP=0.492;TYPE=snp;VARB=0.0181889;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:203:129:127:55:55:72:72:0.566929:36:36:31:24:36:36:31:24 GOOD 127 reads
chr1    10355834    .    C    T    504.995    PASS    AF=0.456747;AO=132;DP=290;FAO=132;FDP=289;FR=.;FRO=157;FSAF=56;FSAR=76;FSRF=85;FSRR=72;FWDB=0.0634576;FXX=0.00344816;HRUN=2;LEN=1;MLLD=58.7971;OALT=T;OID=.;OMAPALT=T;OPOS=10355834;OREF=C;PB=0.5;PBP=1;QD=6.98956;RBI=0.0644815;REFB=0.00590624;REVB=-0.0114455;RO=156;SAF=56;SAR=76;SRF=85;SRR=71;SSEN=0;SSEP=0;SSSB=-0.118565;STB=0.563872;STBP=0.047;TYPE=snp;VARB=-0.00678859;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:504:290:289:156:157:132:132:0.456747:76:56:85:71:76:56:85:72 GOOD 289 reads
chr1    11090916    .    C    A    1855.57    PASS    AF=1;AO=193;DP=193;FAO=194;FDP=194;FR=.;FRO=0;FSAF=96;FSAR=98;FSRF=0;FSRR=0;FWDB=0.0243175;FXX=0;HRUN=1;LEN=1;MLLD=207.14;OALT=A;OID=.;OMAPALT=A;OPOS=11090916;OREF=C;PB=0.5;PBP=1;QD=38.2592;RBI=0.048476;REFB=0;REVB=0.0419355;RO=0;SAF=95;SAR=98;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=4.54468e-08;STB=0.5;STBP=1;TYPE=snp;VARB=3.87852e-05;ANN=MASP2    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:88:193:194:0:0:193:194:1:98:95:0:0:98:96:0:0 GOOD 194 reads
chr1    213068404    .    T    C    70.4374    PASS    AF=0.435897;AO=17;DP=39;FAO=17;FDP=39;FR=.;FRO=22;FSAF=16;FSAR=1;FSRF=11;FSRR=11;FWDB=-0.000611735;FXX=0;HRUN=1;LEN=1;MLLD=243.519;OALT=C;OID=.;OMAPALT=C;OPOS=213068404;OREF=T;PB=0.5;PBP=1;QD=7.22435;RBI=0.023516;REFB=0.00205571;REVB=0.023508;RO=22;SAF=16;SAR=1;SRF=11;SRR=11;SSEN=0;SSEP=0;SSSB=0.630793;STB=0.87614;STBP=0.001;TYPE=snp;VARB=-0.00167889;ANN=FLVCR1    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:70:39:39:22:22:17:17:0.435897:1:16:11:11:1:16:11:11 STRAND BIAS 39 reads

desired output

##
##
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##
##
#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO    FORMAT    SAMPLE
chr1    9324670    .    A    G    672.016    PASS    AF=0.528369;AO=148;DP=281;FAO=149;FDP=282;FR=.;FRO=133;FSAF=59;FSAR=90;FSRF=60;FSRR=73;FWDB=0.00343606;FXX=0;HRUN=1;LEN=1;MLLD=155.207;OALT=G;OID=.;OMAPALT=G;OPOS=9324670;OREF=A;PB=0.5;PBP=1;QD=9.53214;RBI=0.00594431;REFB=-0.0181827;REVB=0.00485061;RO=130;SAF=59;SAR=89;SRF=57;SRR=73;SSEN=0;SSEP=0;SSSB=-0.0352973;STB=0.526882;STBP=0.323;TYPE=snp;VARB=0.0184938;ANN=H6PD    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:527:281:282:130:133:148:149:0.528369:89:59:57:73:90:59:60:73 GOOD 282 reads het
chr1    10318652    .    C    G    360.217    PASS    AF=0.566929;AO=72;DP=129;FAO=72;FDP=127;FR=.;FRO=55;FSAF=36;FSAR=36;FSRF=31;FSRR=24;FWDB=0.00760676;FXX=0.0155027;HRUN=2;LEN=1;MLLD=115.62;OALT=G;OID=.;OMAPALT=G;OPOS=10318652;OREF=C;PB=0.5;PBP=1;QD=11.3454;RBI=0.0125905;REFB=-0.0312889;REVB=-0.0100329;RO=55;SAF=36;SAR=36;SRF=31;SRR=24;SSEN=0;SSEP=0;SSSB=-0.0505108;STB=0.527551;STBP=0.492;TYPE=snp;VARB=0.0181889;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:203:129:127:55:55:72:72:0.566929:36:36:31:24:36:36:31:24 GOOD 127 reads hom
chr1    10355834    .    C    T    504.995    PASS    AF=0.456747;AO=132;DP=290;FAO=132;FDP=289;FR=.;FRO=157;FSAF=56;FSAR=76;FSRF=85;FSRR=72;FWDB=0.0634576;FXX=0.00344816;HRUN=2;LEN=1;MLLD=58.7971;OALT=T;OID=.;OMAPALT=T;OPOS=10355834;OREF=C;PB=0.5;PBP=1;QD=6.98956;RBI=0.0644815;REFB=0.00590624;REVB=-0.0114455;RO=156;SAF=56;SAR=76;SRF=85;SRR=71;SSEN=0;SSEP=0;SSSB=-0.118565;STB=0.563872;STBP=0.047;TYPE=snp;VARB=-0.00678859;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:504:290:289:156:157:132:132:0.456747:76:56:85:71:76:56:85:72 GOOD 289 reads het
chr1    11090916    .    C    A    1855.57    PASS    AF=1;AO=193;DP=193;FAO=194;FDP=194;FR=.;FRO=0;FSAF=96;FSAR=98;FSRF=0;FSRR=0;FWDB=0.0243175;FXX=0;HRUN=1;LEN=1;MLLD=207.14;OALT=A;OID=.;OMAPALT=A;OPOS=11090916;OREF=C;PB=0.5;PBP=1;QD=38.2592;RBI=0.048476;REFB=0;REVB=0.0419355;RO=0;SAF=95;SAR=98;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=4.54468e-08;STB=0.5;STBP=1;TYPE=snp;VARB=3.87852e-05;ANN=MASP2    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:88:193:194:0:0:193:194:1:98:95:0:0:98:96:0:0 GOOD 194 reads hom
chr1    213068404    .    T    C    70.4374    PASS    AF=0.435897;AO=17;DP=39;FAO=17;FDP=39;FR=.;FRO=22;FSAF=16;FSAR=1;FSRF=11;FSRR=11;FWDB=-0.000611735;FXX=0;HRUN=1;LEN=1;MLLD=243.519;OALT=C;OID=.;OMAPALT=C;OPOS=213068404;OREF=T;PB=0.5;PBP=1;QD=7.22435;RBI=0.023516;REFB=0.00205571;REVB=0.023508;RO=22;SAF=16;SAR=1;SRF=11;SRR=11;SSEN=0;SSEP=0;SSSB=0.630793;STB=0.87614;STBP=0.001;TYPE=snp;VARB=-0.00167889;ANN=FLVCR1    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:70:39:39:22:22:17:17:0.435897:1:16:11:11:1:16:11:11 STRAND BIAS 39 reads het

perl

perl -ple '/^[^#].*FDP=(\d+);.*STB=(\d+\.\d+);.*GT=(\d+);  ($9:*)= 0/1?="het",= 1/1?="hom"/ and $_.=($2 >= 0.8?" STRAND BIAS ":" GOOD ").$1." reads"' input > results.txt 

You said: "1/2 equals hom", however, in your desired output 1/1 gets hom, and 1/2 is nowhere to be found.

Here's a line of Perl that produces what you posted as desired output, based on what you posted as input:

perl -ple 'BEGIN{%h=qw(0/1 het 1/1 hom)}; /([01]\/1)/ and $_ .= " $h{$1}"'  input > results.txt
1 Like

I apologize for the typo.... it may be easier to include all 0/0 or 1/1 or 2/2 as hom . The other condition is 0/1 or 1/2 is het . I think these are all the possibilities. Thank you :).

Modify it, accordingly.

perl -ple 'BEGIN{%h=qw(0/0 hom 0/1 het 1/1 hom 1/2 het 2/2 hom)} /([0-2]\/[0-2])/ and $_ .=" $h{$1}"'  input > results.txt
1 Like

If you showed us a representative sample of your real input, you could also try using awk (or, on Solaris/SunOS systems, nawk or /usr/xpg4/bin/awk ):

awk '$10~"^[012]"{$0=$0($10~"^(0/0|1/1|2/2)"?" hom":" het")}1' input > results.txt
1 Like

Thank you very much :slight_smile:

The below perl from @Aia works great and produces the desired result, but I don't think I understand how it works. Is it possible to explain a little, maybe that will help. Thank you :).

perl -ple 'BEGIN{%h=qw(0/0 hom 0/1 het 1/1 hom 1/2 het 2/2 hom)} /([0-2]\/[0-2])/ and $_ .=" $h{$1}"'

For example, how does STB get calculated or the reads extracted?

The input file is the one in the post. I have a follow-up question but I want to try and understand so I can make an attempt at a perl code. Thank you.

Followup question:
The below perl produces the current results . In the desired results the last fields below are added and need to be tab deliniated. The value in front of the string "score" is caculated from $6 or QUAL . That calculation is $6/33 .

GOOD 282 reads het 20 score  (672.016/33) no decimal, rounded up
GOOD 127 reads hom 11 score  (360.217/33) no decimal, rounded up
GOOD 289 reads het 15 score  (504.995/33) no decimal, rounded up
GOOD 194 reads hom 56 score  (1855.57/33) no decimal, rounded up
STRAND BIAS 39 reads het 2 score  (70.4374/33) no decimal, rounded up
perl -ple 'BEGIN{%h=qw(0/0 hom 0/1 het 1/1 hom 1/2 het 2/2 hom)} /([0-2]\/[0-2])/ and $_ .=" $h{$1}"'  input > results.txt

input

##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO    FORMAT    SAMPLE
chr1    9324670    .    A    G    672.016    PASS    AF=0.528369;AO=148;DP=281;FAO=149;FDP=282;FR=.;FRO=133;FSAF=59;FSAR=90;FSRF=60;FSRR=73;FWDB=0.00343606;FXX=0;HRUN=1;LEN=1;MLLD=155.207;OALT=G;OID=.;OMAPALT=G;OPOS=9324670;OREF=A;PB=0.5;PBP=1;QD=9.53214;RBI=0.00594431;REFB=-0.0181827;REVB=0.00485061;RO=130;SAF=59;SAR=89;SRF=57;SRR=73;SSEN=0;SSEP=0;SSSB=-0.0352973;STB=0.526882;STBP=0.323;TYPE=snp;VARB=0.0184938;ANN=H6PD    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:527:281:282:130:133:148:149:0.528369:89:59:57:73:90:59:60:73
chr1    10318652    .    C    G    360.217    PASS    AF=0.566929;AO=72;DP=129;FAO=72;FDP=127;FR=.;FRO=55;FSAF=36;FSAR=36;FSRF=31;FSRR=24;FWDB=0.00760676;FXX=0.0155027;HRUN=2;LEN=1;MLLD=115.62;OALT=G;OID=.;OMAPALT=G;OPOS=10318652;OREF=C;PB=0.5;PBP=1;QD=11.3454;RBI=0.0125905;REFB=-0.0312889;REVB=-0.0100329;RO=55;SAF=36;SAR=36;SRF=31;SRR=24;SSEN=0;SSEP=0;SSSB=-0.0505108;STB=0.527551;STBP=0.492;TYPE=snp;VARB=0.0181889;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:203:129:127:55:55:72:72:0.566929:36:36:31:24:36:36:31:24
chr1    10355834    .    C    T    504.995    PASS    AF=0.456747;AO=132;DP=290;FAO=132;FDP=289;FR=.;FRO=157;FSAF=56;FSAR=76;FSRF=85;FSRR=72;FWDB=0.0634576;FXX=0.00344816;HRUN=2;LEN=1;MLLD=58.7971;OALT=T;OID=.;OMAPALT=T;OPOS=10355834;OREF=C;PB=0.5;PBP=1;QD=6.98956;RBI=0.0644815;REFB=0.00590624;REVB=-0.0114455;RO=156;SAF=56;SAR=76;SRF=85;SRR=71;SSEN=0;SSEP=0;SSSB=-0.118565;STB=0.563872;STBP=0.047;TYPE=snp;VARB=-0.00678859;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:504:290:289:156:157:132:132:0.456747:76:56:85:71:76:56:85:72
chr1    11090916    .    C    A    1855.57    PASS    AF=1;AO=193;DP=193;FAO=194;FDP=194;FR=.;FRO=0;FSAF=96;FSAR=98;FSRF=0;FSRR=0;FWDB=0.0243175;FXX=0;HRUN=1;LEN=1;MLLD=207.14;OALT=A;OID=.;OMAPALT=A;OPOS=11090916;OREF=C;PB=0.5;PBP=1;QD=38.2592;RBI=0.048476;REFB=0;REVB=0.0419355;RO=0;SAF=95;SAR=98;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=4.54468e-08;STB=0.5;STBP=1;TYPE=snp;VARB=3.87852e-05;ANN=MASP2    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:88:193:194:0:0:193:194:1:98:95:0:0:98:96:0:0
chr1    213068404    .    T    C    70.4374    PASS    AF=0.435897;AO=17;DP=39;FAO=17;FDP=39;FR=.;FRO=22;FSAF=16;FSAR=1;FSRF=11;FSRR=11;FWDB=-0.000611735;FXX=0;HRUN=1;LEN=1;MLLD=243.519;OALT=C;OID=.;OMAPALT=C;OPOS=213068404;OREF=T;PB=0.5;PBP=1;QD=7.22435;RBI=0.023516;REFB=0.00205571;REVB=0.023508;RO=22;SAF=16;SAR=1;SRF=11;SRR=11;SSEN=0;SSEP=0;SSSB=0.630793;STB=0.87614;STBP=0.001;TYPE=snp;VARB=-0.00167889;ANN=FLVCR1    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:70:39:39:22:22:17:17:0.435897:1:16:11:11:1:16:11:11

current results

##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO    FORMAT    SAMPLE
chr1    9324670    .    A    G    672.016    PASS    AF=0.528369;AO=148;DP=281;FAO=149;FDP=282;FR=.;FRO=133;FSAF=59;FSAR=90;FSRF=60;FSRR=73;FWDB=0.00343606;FXX=0;HRUN=1;LEN=1;MLLD=155.207;OALT=G;OID=.;OMAPALT=G;OPOS=9324670;OREF=A;PB=0.5;PBP=1;QD=9.53214;RBI=0.00594431;REFB=-0.0181827;REVB=0.00485061;RO=130;SAF=59;SAR=89;SRF=57;SRR=73;SSEN=0;SSEP=0;SSSB=-0.0352973;STB=0.526882;STBP=0.323;TYPE=snp;VARB=0.0184938;ANN=H6PD    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:527:281:282:130:133:148:149:0.528369:89:59:57:73:90:59:60:73 GOOD 282 reads het
chr1    10318652    .    C    G    360.217    PASS    AF=0.566929;AO=72;DP=129;FAO=72;FDP=127;FR=.;FRO=55;FSAF=36;FSAR=36;FSRF=31;FSRR=24;FWDB=0.00760676;FXX=0.0155027;HRUN=2;LEN=1;MLLD=115.62;OALT=G;OID=.;OMAPALT=G;OPOS=10318652;OREF=C;PB=0.5;PBP=1;QD=11.3454;RBI=0.0125905;REFB=-0.0312889;REVB=-0.0100329;RO=55;SAF=36;SAR=36;SRF=31;SRR=24;SSEN=0;SSEP=0;SSSB=-0.0505108;STB=0.527551;STBP=0.492;TYPE=snp;VARB=0.0181889;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:203:129:127:55:55:72:72:0.566929:36:36:31:24:36:36:31:24 GOOD 127 reads hom
chr1    10355834    .    C    T    504.995    PASS    AF=0.456747;AO=132;DP=290;FAO=132;FDP=289;FR=.;FRO=157;FSAF=56;FSAR=76;FSRF=85;FSRR=72;FWDB=0.0634576;FXX=0.00344816;HRUN=2;LEN=1;MLLD=58.7971;OALT=T;OID=.;OMAPALT=T;OPOS=10355834;OREF=C;PB=0.5;PBP=1;QD=6.98956;RBI=0.0644815;REFB=0.00590624;REVB=-0.0114455;RO=156;SAF=56;SAR=76;SRF=85;SRR=71;SSEN=0;SSEP=0;SSSB=-0.118565;STB=0.563872;STBP=0.047;TYPE=snp;VARB=-0.00678859;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:504:290:289:156:157:132:132:0.456747:76:56:85:71:76:56:85:72 GOOD 289 reads het
chr1    11090916    .    C    A    1855.57    PASS    AF=1;AO=193;DP=193;FAO=194;FDP=194;FR=.;FRO=0;FSAF=96;FSAR=98;FSRF=0;FSRR=0;FWDB=0.0243175;FXX=0;HRUN=1;LEN=1;MLLD=207.14;OALT=A;OID=.;OMAPALT=A;OPOS=11090916;OREF=C;PB=0.5;PBP=1;QD=38.2592;RBI=0.048476;REFB=0;REVB=0.0419355;RO=0;SAF=95;SAR=98;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=4.54468e-08;STB=0.5;STBP=1;TYPE=snp;VARB=3.87852e-05;ANN=MASP2    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:88:193:194:0:0:193:194:1:98:95:0:0:98:96:0:0 GOOD 194 reads hom
chr1    213068404    .    T    C    70.4374    PASS    AF=0.435897;AO=17;DP=39;FAO=17;FDP=39;FR=.;FRO=22;FSAF=16;FSAR=1;FSRF=11;FSRR=11;FWDB=-0.000611735;FXX=0;HRUN=1;LEN=1;MLLD=243.519;OALT=C;OID=.;OMAPALT=C;OPOS=213068404;OREF=T;PB=0.5;PBP=1;QD=7.22435;RBI=0.023516;REFB=0.00205571;REVB=0.023508;RO=22;SAF=16;SAR=1;SRF=11;SRR=11;SSEN=0;SSEP=0;SSSB=0.630793;STB=0.87614;STBP=0.001;TYPE=snp;VARB=-0.00167889;ANN=FLVCR1    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:70:39:39:22:22:17:17:0.435897:1:16:11:11:1:16:11:11 STRAND BIAS 39 reads het

desired results

##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO    FORMAT    SAMPLE
chr1    9324670    .    A    G    672.016    PASS    AF=0.528369;AO=148;DP=281;FAO=149;FDP=282;FR=.;FRO=133;FSAF=59;FSAR=90;FSRF=60;FSRR=73;FWDB=0.00343606;FXX=0;HRUN=1;LEN=1;MLLD=155.207;OALT=G;OID=.;OMAPALT=G;OPOS=9324670;OREF=A;PB=0.5;PBP=1;QD=9.53214;RBI=0.00594431;REFB=-0.0181827;REVB=0.00485061;RO=130;SAF=59;SAR=89;SRF=57;SRR=73;SSEN=0;SSEP=0;SSSB=-0.0352973;STB=0.526882;STBP=0.323;TYPE=snp;VARB=0.0184938;ANN=H6PD    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:527:281:282:130:133:148:149:0.528369:89:59:57:73:90:59:60:73 GOOD 282 reads het 20 score
chr1    10318652    .    C    G    360.217    PASS    AF=0.566929;AO=72;DP=129;FAO=72;FDP=127;FR=.;FRO=55;FSAF=36;FSAR=36;FSRF=31;FSRR=24;FWDB=0.00760676;FXX=0.0155027;HRUN=2;LEN=1;MLLD=115.62;OALT=G;OID=.;OMAPALT=G;OPOS=10318652;OREF=C;PB=0.5;PBP=1;QD=11.3454;RBI=0.0125905;REFB=-0.0312889;REVB=-0.0100329;RO=55;SAF=36;SAR=36;SRF=31;SRR=24;SSEN=0;SSEP=0;SSSB=-0.0505108;STB=0.527551;STBP=0.492;TYPE=snp;VARB=0.0181889;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:203:129:127:55:55:72:72:0.566929:36:36:31:24:36:36:31:24 GOOD 127 reads hom 11 score
chr1    10355834    .    C    T    504.995    PASS    AF=0.456747;AO=132;DP=290;FAO=132;FDP=289;FR=.;FRO=157;FSAF=56;FSAR=76;FSRF=85;FSRR=72;FWDB=0.0634576;FXX=0.00344816;HRUN=2;LEN=1;MLLD=58.7971;OALT=T;OID=.;OMAPALT=T;OPOS=10355834;OREF=C;PB=0.5;PBP=1;QD=6.98956;RBI=0.0644815;REFB=0.00590624;REVB=-0.0114455;RO=156;SAF=56;SAR=76;SRF=85;SRR=71;SSEN=0;SSEP=0;SSSB=-0.118565;STB=0.563872;STBP=0.047;TYPE=snp;VARB=-0.00678859;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:504:290:289:156:157:132:132:0.456747:76:56:85:71:76:56:85:72 GOOD 289 reads het 15 score
chr1    11090916    .    C    A    1855.57    PASS    AF=1;AO=193;DP=193;FAO=194;FDP=194;FR=.;FRO=0;FSAF=96;FSAR=98;FSRF=0;FSRR=0;FWDB=0.0243175;FXX=0;HRUN=1;LEN=1;MLLD=207.14;OALT=A;OID=.;OMAPALT=A;OPOS=11090916;OREF=C;PB=0.5;PBP=1;QD=38.2592;RBI=0.048476;REFB=0;REVB=0.0419355;RO=0;SAF=95;SAR=98;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=4.54468e-08;STB=0.5;STBP=1;TYPE=snp;VARB=3.87852e-05;ANN=MASP2    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:88:193:194:0:0:193:194:1:98:95:0:0:98:96:0:0 GOOD 194 reads hom 56 score
chr1    213068404    .    T    C    70.4374    PASS    AF=0.435897;AO=17;DP=39;FAO=17;FDP=39;FR=.;FRO=22;FSAF=16;FSAR=1;FSRF=11;FSRR=11;FWDB=-0.000611735;FXX=0;HRUN=1;LEN=1;MLLD=243.519;OALT=C;OID=.;OMAPALT=C;OPOS=213068404;OREF=T;PB=0.5;PBP=1;QD=7.22435;RBI=0.023516;REFB=0.00205571;REVB=0.023508;RO=22;SAF=16;SAR=1;SRF=11;SRR=11;SSEN=0;SSEP=0;SSSB=0.630793;STB=0.87614;STBP=0.001;TYPE=snp;VARB=-0.00167889;ANN=FLVCR1    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:70:39:39:22:22:17:17:0.435897:1:16:11:11:1:16:11:11 STRAND BIAS 39 reads het 2 score

Hi, cmccabe.

The following liner can be in three major parts

perl -ple '   # -p will print every line in a loop.
              # -l will remove the end of line and add it after each print.
              # -e will tell the binary perl to interpret the following as code.
BEGIN{%h=qw(0/0 hom 0/1 het 1/1 hom 1/2 het 2/2 hom)}  # Before any line is processed, create a lookup hash table
 /([0-2]\/[0-2])/ and $_ .=" $h{$1}"   # capture any combination of digit/digit from 0 to 2 inclusive
                                       # and use it as the index to the lookup hash table {$1} to append to the current line
' input > results.txt

However, I do believe you are a bit confused, there is no calculation of STB in this liner. You used another command previously to calculate that. Please, see here

In fact, the latest posted input, will not produced the previously shown achieved output. You run that command over the input shown in post number 1 of this thread.

If you were to run the following modified version with the input posted in #1, you might get what you want.

perl -plae 'BEGIN{%h=qw(0/0 hom 0/1 het 1/1 hom 1/2 het 2/2 hom)} /([0-2]\/[0-2])/ and $_ .=join ("\t", "", $h{$1}, int($F[5]/33+0.5), "score")' inputs > results.txt

I am going to leave it up to you to combine both commands, if you want to apply to the input shown in the following thread.
However, I am going to explain as well the additions highlighted in red.

-a  # creates an array @F for each line separating it by blank spaces.
 $_ .=join ("\t", "", $h{$1}, int($F[5]/33+0.5), "score") # insert a tab to the list of elements passed as argument to join, append to the current line.
int($F[5]/33+0.5) # use the value in the 5th column (starts at zero) to make the calculation you asked for.
1 Like

My attempt at combining the two threads into one perl code:

perl -plae 'BEGIN{%h=qw(0/0 hom 0/1 het 1/1 hom 1/2 het 2/2 hom)} /([0-2]\/[0-2])/ and $_ .=join ("\t", "", $h{$1}, int($F[5]/33+0.5)) and $_ .=join ("\t", "", /^[^#].*FDP=(\d+);.*STB=(\d+\.\d+);/ and $_.=($2 >= 0.8?" STRAND BIAS ":" GOOD ")).$1."' input > result
Can't find string terminator '"' anywhere before EOF at -e line 1.

I appreciate all your help and explanations :slight_smile:

perl -plae '
    BEGIN{ %h = qw(0/0 hom 0/1 het 1/1 hom 1/2 het 2/2 hom) }
    /^[^#].*FDP=(\d+);.*STB=(\d+\.\d+);.*([0-2]\/[0-2])/ and
    $_ .= join "\t", ("", ($2 >= 0.8 ? "STRAND BIAS" : "GOOD"), $1, "reads", $h{$3}, int($F[5]/33+0.5), "score")
' input > result
1 Like

Why does the lines in bold repeat? I have tried different combinations with no luck in figuring it out. The lines in italics are correct, but the lines in bold are at the beginning and do not need to be there. Thank you very much :).

perl -plae '
    BEGIN{ %h = qw(0/0 hom 0/1 het 1/1 hom 1/2 het 2/2 hom) }
    /^[^#].*FDP=(\d+);.*STB=(\d+\.\d+);.*([0-2]\/[0-2])/ and
    $_ .= join "\t", ("", ($2 >= 0.8 ? "STRAND BIAS" : "GOOD"), $1, $h{$3}, int($F[5]/33+0.5))
' input > result

result

##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO    FORMAT    SAMPLE
chr1    9324670    .    A    G    672.016    PASS    AF=0.528369;AO=148;DP=281;FAO=149;FDP=282;FR=.;FRO=133;FSAF=59;FSAR=90;FSRF=60;FSRR=73;FWDB=0.00343606;FXX=0;HRUN=1;LEN=1;MLLD=155.207;OALT=G;OID=.;OMAPALT=G;OPOS=9324670;OREF=A;PB=0.5;PBP=1;QD=9.53214;RBI=0.00594431;REFB=-0.0181827;REVB=0.00485061;RO=130;SAF=59;SAR=89;SRF=57;SRR=73;SSEN=0;SSEP=0;SSSB=-0.0352973;STB=0.526882;STBP=0.323;TYPE=snp;VARB=0.0184938;ANN=H6PD    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    0/1:527:281:282:130:133:148:149:0.528369:89:59:57:73:90:59:60:73 GOOD 282 reads    GOOD    282    het    20
chr1    10318652    .    C    G    360.217    PASS    AF=0.566929;AO=72;DP=129;FAO=72;FDP=127;FR=.;FRO=55;FSAF=36;FSAR=36;FSRF=31;FSRR=24;FWDB=0.00760676;FXX=0.0155027;HRUN=2;LEN=1;MLLD=115.62;OALT=G;OID=.;OMAPALT=G;OPOS=10318652;OREF=C;PB=0.5;PBP=1;QD=11.3454;RBI=0.0125905;REFB=-0.0312889;REVB=-0.0100329;RO=55;SAF=36;SAR=36;SRF=31;SRR=24;SSEN=0;SSEP=0;SSSB=-0.0505108;STB=0.527551;STBP=0.492;TYPE=snp;VARB=0.0181889;ANN=KIF1B    GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR    1/1:203:129:127:55:55:72:72:0.566929:36:36:31:24:36:36:31:24 GOOD 127 reads    GOOD    127    hom    11
chr1    10355834    .    C    T    504.995    PASS   

Please, verify that your input does not terminate with the string GOOD number reads , already. The shown Perl run instance is not responsible for the highlighted bold string that you posted.

1 Like

I didn't even notice that.... thank you very much :).