awk to remove multiple values from a specific pattern, leaving a single value

In the awk below I am trying to remove all instances after a ; (semi-colon) or , (comma) in the ANN= pattern. I am using gsub
to substitute an empty string in these, so that ANN= is a single value (with only one value in it the one right after the ANN=). Thank you :).
I have commented my awk and included a description of each line as well.

input (tab-delimited)

chr1	987200	.	C	T	1217.2	PASS	AF=1;AO=127;DP=127;FAO=127;FDP=127;FR=.;FRO=0;FSAF=63;FSAR=64;FSRF=0;FSRR=0;FWDB=-0.0049104;FXX=0;HRUN=1;LEN=1;MLLD=167.668;OALT=T;OID=.;OMAPALT=T;OPOS=987200;OREF=C;PB=.;PBP=.;QD=38.3369;RBI=0.0213032;REFB=0;REVB=0.0207296;RO=0;SAF=63;SAR=64;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=-5.21543e-08;STB=0.5;STBP=1;TYPE=snp;VARB=2.41961e-05;ANN=AGRN	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:57:127:127:0:0:127:127:1:64:63:0:0:64:63:0:0:1	GOOD	127	hom	37
chr1	990280	.	C	T	2418.92	PASS	AF=1;AO=258;DP=264;FAO=260;FDP=260;FR=.;FRO=0;FSAF=120;FSAR=140;FSRF=0;FSRR=0;FWDB=0.0249502;FXX=0.0225555;HRUN=1;LEN=1;MLLD=92.2049;OALT=T;OID=.;OMAPALT=T;OPOS=990280;OREF=C;PB=.;PBP=.;QD=37.2141;RBI=0.0261262;REFB=-0.11255;REVB=-0.00775041;RO=0;SAF=118;SAR=140;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=0;STB=0.5;STBP=1;TYPE=snp;VARB=0.000526608;ANN=AGRN	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:86:264:260:0:0:258:260:1:140:118:0:0:140:120:0:0:1	GOOD	260	hom	73
chr2	48915871	.	A	G	1624.87	PASS	AF=1;AO=170;DP=171;FAO=172;FDP=172;FR=.;FRO=0;FSAF=92;FSAR=80;FSRF=0;FSRR=0;FWDB=0.0234407;FXX=0;HRUN=1;LEN=1;MLLD=70.9343;OALT=G;OID=.;OMAPALT=G;OPOS=48915871;OREF=A;PB=.;PBP=.;QD=37.7877;RBI=0.0331357;REFB=0;REVB=0.0234202;RO=1;SAF=91;SAR=79;SRF=0;SRR=1;SSEN=0;SSEP=0;SSSB=0.00598669;STB=0.5;STBP=1;TYPE=snp;VARB=0.000172399;ANN=LHCGR;STON1-GTF2A1L,LHCGR	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:76:171:172:1:0:170:172:1:79:91:0:1:80:92:0:0:1	GOOD	172	hom	49
chr2	48921375	.	T	C	481.192	PASS	AF=1;AO=51;DP=51;FAO=51;FDP=51;FR=.;FRO=0;FSAF=27;FSAR=24;FSRF=0;FSRR=0;FWDB=0.0171521;FXX=0;HRUN=2;LEN=1;MLLD=203.707;OALT=C;OID=.;OMAPALT=C;OPOS=48921375;OREF=T;PB=.;PBP=.;QD=37.7406;RBI=0.0241572;REFB=0;REVB=0.0170111;RO=0;SAF=27;SAR=24;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=0;STB=0.5;STBP=1;TYPE=snp;VARB=8.09379e-05;ANN=LHCGR,LHCGR;STON1-GTF2A1L	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:23:51:51:0:0:51:51:1:24:27:0:0:24:27:0:0:1	GOOD	51	hom	15
chr2	48925746	.	C	T	1144.07	PASS	AF=1;AO=114;DP=114;FAO=119;FDP=119;FR=.,REALIGNEDx0.958;FRO=0;FSAF=54;FSAR=65;FSRF=0;FSRR=0;FWDB=0.0374429;FXX=0;HRUN=1;LEN=1;MLLD=261.838;OALT=T;OID=.;OMAPALT=T;OPOS=48925746;OREF=C;PB=.;PBP=.;QD=38.456;RBI=0.0379673;REFB=0;REVB=0.00628838;RO=0;SAF=51;SAR=63;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=3.26593e-08;STB=0.5;STBP=1;TYPE=snp;VARB=2.12074e-05;ANN=LHCGR;STON1-GTF2A1L,LHCGR	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:54:114:119:0:0:114:119:1:63:51:0:0:65:54:0:0:1	GOOD	119	hom	35
chr2	49189921	.	C	T	570.875	PASS	AF=0.582474;AO=113;DP=193;FAO=113;FDP=194;FR=.,REALIGNEDx0.5825;FRO=81;FSAF=53;FSAR=60;FSRF=44;FSRR=37;FWDB=-0.00244613;FXX=0;HRUN=1;LEN=1;MLLD=239.763;OALT=T;OID=.;OMAPALT=T;OPOS=49189921;OREF=C;PB=.;PBP=.;QD=11.7706;RBI=0.0159301;REFB=-0.00100522;REVB=-0.0157412;RO=80;SAF=53;SAR=60;SRF=44;SRR=36;SSEN=0;SSEP=0;SSSB=-0.061858;STB=0.530968;STBP=0.315;TYPE=snp;VARB=0.000865756;ANN=FSHR	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	0/1:280:193:194:80:81:113:113:0.582474:60:53:44:36:60:53:44:37:1	GOOD	194	het	17

desired output (tab-delimited)

chr1	987200	.	C	T	1217.2	PASS	AF=1;AO=127;DP=127;FAO=127;FDP=127;FR=.;FRO=0;FSAF=63;FSAR=64;FSRF=0;FSRR=0;FWDB=-0.0049104;FXX=0;HRUN=1;LEN=1;MLLD=167.668;OALT=T;OID=.;OMAPALT=T;OPOS=987200;OREF=C;PB=.;PBP=.;QD=38.3369;RBI=0.0213032;REFB=0;REVB=0.0207296;RO=0;SAF=63;SAR=64;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=-5.21543e-08;STB=0.5;STBP=1;TYPE=snp;VARB=2.41961e-05;ANN=AGRN	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:57:127:127:0:0:127:127:1:64:63:0:0:64:63:0:0:1	GOOD	127	hom	37
chr1	990280	.	C	T	2418.92	PASS	AF=1;AO=258;DP=264;FAO=260;FDP=260;FR=.;FRO=0;FSAF=120;FSAR=140;FSRF=0;FSRR=0;FWDB=0.0249502;FXX=0.0225555;HRUN=1;LEN=1;MLLD=92.2049;OALT=T;OID=.;OMAPALT=T;OPOS=990280;OREF=C;PB=.;PBP=.;QD=37.2141;RBI=0.0261262;REFB=-0.11255;REVB=-0.00775041;RO=0;SAF=118;SAR=140;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=0;STB=0.5;STBP=1;TYPE=snp;VARB=0.000526608;ANN=AGRN	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:86:264:260:0:0:258:260:1:140:118:0:0:140:120:0:0:1	GOOD	260	hom	73
chr2	48915871	.	A	G	1624.87	PASS	AF=1;AO=170;DP=171;FAO=172;FDP=172;FR=.;FRO=0;FSAF=92;FSAR=80;FSRF=0;FSRR=0;FWDB=0.0234407;FXX=0;HRUN=1;LEN=1;MLLD=70.9343;OALT=G;OID=.;OMAPALT=G;OPOS=48915871;OREF=A;PB=.;PBP=.;QD=37.7877;RBI=0.0331357;REFB=0;REVB=0.0234202;RO=1;SAF=91;SAR=79;SRF=0;SRR=1;SSEN=0;SSEP=0;SSSB=0.00598669;STB=0.5;STBP=1;TYPE=snp;VARB=0.000172399;ANN=LHCGR	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:76:171:172:1:0:170:172:1:79:91:0:1:80:92:0:0:1	GOOD	172	hom	49
chr2	48921375	.	T	C	481.192	PASS	AF=1;AO=51;DP=51;FAO=51;FDP=51;FR=.;FRO=0;FSAF=27;FSAR=24;FSRF=0;FSRR=0;FWDB=0.0171521;FXX=0;HRUN=2;LEN=1;MLLD=203.707;OALT=C;OID=.;OMAPALT=C;OPOS=48921375;OREF=T;PB=.;PBP=.;QD=37.7406;RBI=0.0241572;REFB=0;REVB=0.0170111;RO=0;SAF=27;SAR=24;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=0;STB=0.5;STBP=1;TYPE=snp;VARB=8.09379e-05;ANN=LHCGR	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:23:51:51:0:0:51:51:1:24:27:0:0:24:27:0:0:1	GOOD	51	hom	15
chr2	48925746	.	C	T	1144.07	PASS	AF=1;AO=114;DP=114;FAO=119;FDP=119;FR=.,REALIGNEDx0.958;FRO=0;FSAF=54;FSAR=65;FSRF=0;FSRR=0;FWDB=0.0374429;FXX=0;HRUN=1;LEN=1;MLLD=261.838;OALT=T;OID=.;OMAPALT=T;OPOS=48925746;OREF=C;PB=.;PBP=.;QD=38.456;RBI=0.0379673;REFB=0;REVB=0.00628838;RO=0;SAF=51;SAR=63;SRF=0;SRR=0;SSEN=0;SSEP=0;SSSB=3.26593e-08;STB=0.5;STBP=1;TYPE=snp;VARB=2.12074e-05;ANN=LHCGR	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	1/1:54:114:119:0:0:114:119:1:63:51:0:0:65:54:0:0:1	GOOD	119	hom	35
chr2	49189921	.	C	T	570.875	PASS	AF=0.582474;AO=113;DP=193;FAO=113;FDP=194;FR=.,REALIGNEDx0.5825;FRO=81;FSAF=53;FSAR=60;FSRF=44;FSRR=37;FWDB=-0.00244613;FXX=0;HRUN=1;LEN=1;MLLD=239.763;OALT=T;OID=.;OMAPALT=T;OPOS=49189921;OREF=C;PB=.;PBP=.;QD=11.7706;RBI=0.0159301;REFB=-0.00100522;REVB=-0.0157412;RO=80;SAF=53;SAR=60;SRF=44;SRR=36;SSEN=0;SSEP=0;SSSB=-0.061858;STB=0.530968;STBP=0.315;TYPE=snp;VARB=0.000865756;ANN=FSHR	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT	0/1:280:193:194:80:81:113:113:0.582474:60:53:44:36:60:53:44:37:1	GOOD	194	het	17

description

line1 is good as ANN= has no ; or , in it ANN=AGRN portion after the = is a single value
line2 is good as ANN= has no ; or , in it ANN=AGRN portion after the = is a single value
line3 ANN=LHCGR;STON1-GTF2A1L,LHCGR has both ; and , in it so everything after the first value is removed
line4 ANN=LHCGR,LHCGR;STON1-GTF2A1L has both ; and , in it so everything after the first value is removed
line5 ANN=LHCGR;STON1-GTF2A1L,LHCGR has both ; and , in it so everything after the first value is removed
line6 is good as ANN= has no ; or , in it ANN=FSHR portion after the = is a single value

awk

awk -F'\t' -v OFS="\t" '   # define input and output FS as tab
                      {if(/ANN=/); = search each line for pattern ANN=
                      {sub(/;,*/,""); print}}' input  if ANN= has a ; or ' in it substitute values after with null values/empty strings (removing them)

sub can't remove part of a string like that, it has no backreferences. Instead I use match to figure out exactly where the good part is, and keep only exactly that.

# Trim the ANN= entry of each tab-separated record to its first value.
# match() locates "ANN=" plus everything up to the next ; or , and the field
# is cut right after that match. substr() positions are 1-based; a start of 0
# is not portable (implementations such as gawk and mawk then return one
# character fewer, clipping the kept value), so the cut starts at 1.
awk -F"\t" -v OFS="\t" '{ for(N=1; N<=NF; N++) if(match($N, /ANN=[^;,]*/)) $N=substr($N, 1, RLENGTH+RSTART-1) ; } 1' inputfile > outputfile
1 Like

Hello cmccabe,

Not sure if I got this 100%, could you please try following and let me know if this helps you.

# Records with no ; or , anywhere after "ANN" are printed unchanged. Otherwise the
# text from "ANN" up to the first colon is split on whitespace, the first piece is
# cut at its first ; or , and the first two pieces are written back joined by a tab.
awk '!/ANN.*[;,]/{print;next} {match($0,/ANN[^:]*/);VAL=substr($0,RSTART,RLENGTH);if(VAL){split(VAL, A," ");sub(/[;,].*/,"",A[1]);sub(/ANN[^:]*/,A[1] "\t" A[2],$0)}} 1'  Input_file

Adding a non-one liner form of solution too now.

# Multi-line layout of the one-liner: keep only the first value of the ANN entry.
awk '!/ANN.*[;,]/{
                        # no ; or , after ANN on this record: print it untouched
                        print;
                        next
                 }
                 {
                        # grab the text from ANN up to the first colon
                        match($0,/ANN[^:]*/);
                        VAL=substr($0,RSTART,RLENGTH);
                        if(VAL){
                                # split on whitespace: A[1] is the ANN entry, A[2] the piece after it
                                split(VAL, A," ");
                                # drop everything from the first ; or , onwards
                                sub(/[;,].*/,"",A[1]);
                                # write both pieces back, joined by a tab
                                sub(/ANN[^:]*/,A[1] "\t" A[2],$0)
                                }
                 }
      # always-true pattern: print the (possibly rewritten) record
      1
    '    Input_file

Thanks,
R. Singh

1 Like
perl -pe 's/(ANN=\w+)[;,][^\s]*/$1/' input
1 Like

where [^\s] can be shortened \S .

1 Like

Thank you all very much :slight_smile:

Corona688 in the portion of code below:

$N=substr($N, 0, RLENGTH+RSTART-1)

is RLENGTH+RSTART-1 the entire string after ANN= , but the index of only the first value? So, basically captures everything but only print index 1. I am just trying to grasp this concept and think the explanation by RavinderSingh13 previously helped. Thank you :).

Hello cmccabe,

You could refer the following URL for explanation on this function too.
awk to combine lines if fields match in lines Post: 302997499

Thanks,
R. Singh

1 Like

Hi cmccabe,

It is the combination of both functions match() and substr(), working together that do the work of where to start and what to remove.

match() assigns values to two predefined variables: RSTART which gets the start of the matched portion and RLENGTH which gets the length of the match.

Using those two variables like relative coordinates in the whole string, you can extract that substring portion and keep in a variable using substr()

1 Like

Thank you very much :slight_smile:

Hi Gurus... good day;

Currently I am trying to run the df -g command with awk to convert its output into an SQL statement, but I get some errors;

# Label the last field and fields 1-4 of every line that df -g prints (the header line included, since nothing skips it).
df -g | awk '{print "This is the FileSystem: " $NF, " This is LV: "$1, "This is SIZE: "$2, "This is FREE: " $3, "This is the USED% "$4}'

This is on the AIX operating system;

But I need to replace the words "This is the...", but I don't know how to...

Can someone help me?

best regards;

Wilmer

Please don't hijack other folks' threads - open your own.
Thanks

Moderator comments were removed during original forum migration.