In the awk
below, what I am attempting to do is check each line in the tab-delimeted
input
, which has ~20 lines in it, for a keyword
SVTYPE=Fusion
. If the keyword is found I am splitting $3
using the .
(dot) and reading the portion before and after the dot in an array a
.
If it does have that keyword is in line 1 then the it will also be in the line directly below it will (always and in the same location/format). The same process is performed on that
line except the split is read into array b
.
Then if array b
matches array a
the desired output is printed tab-delimeted
.
If the line does not have the word in it then that line is skipped and appears as-is in the output (nothing needs to be done to it, but it is printed in the output). I hope the awk
is close, I included comments as to what I think is happening in each line. The two lines in bold and the two lines in italics should produce the desired output, the ones in color do not match and can be skipped.Thank you :).
input
chr12 12006495 ETV6-NTRK3.E4N15.COSF823.1_1 G G]chr15:88483984] . PASS SVTYPE=Fusion;READ_COUNT=1868;GENE_NAME=ETV6;EXON_NUM=4;RPM=1.5825e-09;NORM_COUNT=0.001582480886121524;ANNOTATION=COSF823;FUNC=[{'gene':'ETV6','exon':'4','oncomineGeneClass':'Gain-of-Function','oncomineVariantClass':'Fusion'}] GT:GQ ./.:.
chr15 88483984 ETV6-NTRK3.E4N15.COSF823.1_2 T ]chr12:12006495]T . PASS SVTYPE=Fusion;READ_COUNT=1868;GENE_NAME=NTRK3;EXON_NUM=15;RPM=1.5825e-09;NORM_COUNT=0.001582480886121524;ANNOTATION=COSF823;FUNC=[{'gene':'NTRK3','exon':'15','oncomineGeneClass':'Gain-of-Function','oncomineVariantClass':'Fusion'}] GT:GQ ./.:.
chr12 12022903 ETV6-NTRK3.E5N15.COSF571.1_1 G G]chr15:88483984] . PASS SVTYPE=Fusion;READ_COUNT=414833;GENE_NAME=ETV6;EXON_NUM=5;RPM=3.5143e-07;NORM_COUNT=0.3514268166126607;ANNOTATION=COSF571;FUNC=[{'gene':'ETV6','exon':'5','oncomineGeneClass':'Gain-of-Function','oncomineVariantClass':'Fusion'}] GT:GQ ./.:.
chr15 88483984 ETV6-NTRK3.E5N15.COSF571.1_2 T ]chr12:12022903]T . PASS SVTYPE=Fusion;READ_COUNT=414833;GENE_NAME=NTRK3;EXON_NUM=15;RPM=3.5143e-07;NORM_COUNT=0.3514268166126607;ANNOTATION=COSF571;FUNC=[{'gene':'NTRK3','exon':'15','oncomineGeneClass':'Gain-of-Function','oncomineVariantClass':'Fusion'}] GT:GQ ./.:.
chr17 7577108 COSM10749;COSM43737 C A,T 149.594 PASS AF=0.0830415,0.0;AO=372,2;DP=4420;FAO=166,0;FDP=1999;FR=.,.,REALIGNEDx0.0865;FRO=1833;FSAF=82,0;FSAR=84,0;FSRF=952;FSRR=881;FWDB=0.0072184,-0.0207142;FXX=4.99998E-4;HRUN=1,1;LEN=1,1;MLLD=293.795,80.5366;OALT=A,T;OID=COSM10749,COSM43737;OMAPALT=A,T;OPOS=7577108,7577108;OREF=C,C;PB=.,.;PBP=.,.;QD=0.299338;RBI=0.00721997,0.02565;REFB=1.40155E-4,-7.81395E-4;REVB=1.50579E-4,0.0151276;RO=4043;SAF=187,1;SAR=185,1;SRF=2118;SRR=1925;SSEN=0,0;SSEP=0,0;SSSB=-0.0251826,-5.12306E-4;STB=0.52327,0.5;STBP=0.541,1.0;TYPE=snp,snp;VARB=-0.00153404,0.0;HS;FUNC=[{'origPos':'7577108','origRef':'C','normalizedRef':'C','gene':'TP53','normalizedPos':'7577108','normalizedAlt':'A','polyphen':'1.0','gt':'pos','codon':'TTT','coding':'c.830G>T','sift':'0.0','grantham':'205.0','transcript':'NM_000546.5','function':'missense','protein':'p.Cys277Phe','location':'exonic','origAlt':'A','exon':'8','oncomineGeneClass':'Loss-of-Function','oncomineVariantClass':'Hotspot'}] GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT 0/1:149:4420:1999:4043:1833:372,2:166,0:0.0830415,0.0:185,1:187,1:2118:1925:84,0:82,0:952:881:1
chr10 89624278 . G T 62.8836 PASS AF=0.0785393;AO=297;DP=4155;FAO=157;FDP=1999;FR=.;FRO=1842;FSAF=77;FSAR=80;FSRF=908;FSRR=934;FWDB=0.0113997;FXX=4.99998E-4;HRUN=1;LEN=1;MLLD=117.237;OALT=T;OID=.;OMAPALT=T;OPOS=89624278;OREF=G;PB=.;PBP=.;QD=0.12583;RBI=0.040843;REFB=5.39678E-4;REVB=-0.0392199;RO=3844;SAF=150;SAR=147;SRF=1936;SRR=1908;SSEN=0;SSEP=0;SSSB=0.00159791;STB=0.502301;STBP=0.96;TYPE=snp;VARB=-0.00676678;FUNC=[{'origPos':'89624278','origRef':'G','normalizedRef':'G','gene':'PTEN','normalizedPos':'89624278','normalizedAlt':'T','gt':'pos','codon':'TAG','coding':'c.52G>T','transcript':'NM_000314.4','function':'nonsense','protein':'p.Glu18Ter','location':'exonic','origAlt':'T','exon':'1'}] GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT 0/1:62:4155:1999:3844:1842:297:157:0.0785393:147:150:1936:1908:80:77:908:934:1
awk with comments
awk -F'\t' -v OFS="\t"' # call awk and set FS and OFS as tab
{
match($0,/SVTYPE=[^;]*/); # look in line for SVTYPE
SVTYPE_VALUE=substr($0,RSTART+7,RLENGTH-7); # index SVTYPE for Fusion (value to look for)
if(SVTYPE_VALUE == "Fusion"){ # if SVTYPE is Fusion
{split($3,a,"."); a=a[1]"."a[2]}' # split the string in $3 at using the . and store the portion before the . and after (ETV6-NTRK3.E4N15) in array
a else if(SVTYPE_VALUE !~ "Fusion") # if SVTYPE not Fusion
next; # skip to next line
}
}
}
{
match($0,/SVTYPE=[^;]*/); # look in line for SVTYPE
SVTYPE_VALUE=substr($0,RSTART+7,RLENGTH-7); # index SVTYPE for Fusion (value to look for)
if(SVTYPE_VALUE == "Fusion"){ # if SVTYPE is Fusion
{split($3,b,"."); b=b[1]"."b[2]}' # split the string in $3 at using the . and store the portion before the . and after (ETV6-NTRK3.E4N15) in array
b else if(SVTYPE_VALUE !~ "Fusion") # if SVTYPE not Fusion
next; # skip to next line
}
}
}
}
}
}
{
for(j in b){ # define loop
for(i=1;i<=length(a);i++){ # iterate through loop
if(b[j] ~ a){ # compare array b to array a for match
print --- desired output ----
}
}
}' input | awk 'BEGIN {print "Position\tLocus\tClass\tFunction\tGene\tCount"}' > output # add header
desired output
Locus Class Function Gene Count
chr12:12006495-chr15:88483984 Fusion Gain-of-function ETV6E4-NTRK3E15 1868
chr12:12022903-chr15:88483984Fusion Gain-of-function ETV6E5-NTRK3E15 414833
chr17 7577108 COSM10749;COSM43737 C A,T 149.594 PASS AF=0.0830415,0.0;AO=372,2;DP=4420;FAO=166,0;FDP=1999;FR=.,.,REALIGNEDx0.0865;FRO=1833;FSAF=82,0;FSAR=84,0;FSRF=952;FSRR=881;FWDB=0.0072184,-0.0207142;FXX=4.99998E-4;HRUN=1,1;LEN=1,1;MLLD=293.795,80.5366;OALT=A,T;OID=COSM10749,COSM43737;OMAPALT=A,T;OPOS=7577108,7577108;OREF=C,C;PB=.,.;PBP=.,.;QD=0.299338;RBI=0.00721997,0.02565;REFB=1.40155E-4,-7.81395E-4;REVB=1.50579E-4,0.0151276;RO=4043;SAF=187,1;SAR=185,1;SRF=2118;SRR=1925;SSEN=0,0;SSEP=0,0;SSSB=-0.0251826,-5.12306E-4;STB=0.52327,0.5;STBP=0.541,1.0;TYPE=snp,snp;VARB=-0.00153404,0.0;HS;FUNC=[{'origPos':'7577108','origRef':'C','normalizedRef':'C','gene':'TP53','normalizedPos':'7577108','normalizedAlt':'A','polyphen':'1.0','gt':'pos','codon':'TTT','coding':'c.830G>T','sift':'0.0','grantham':'205.0','transcript':'NM_000546.5','function':'missense','protein':'p.Cys277Phe','location':'exonic','origAlt':'A','exon':'8','oncomineGeneClass':'Loss-of-Function','oncomineVariantClass':'Hotspot'}] GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT 0/1:149:4420:1999:4043:1833:372,2:166,0:0.0830415,0.0:185,1:187,1:2118:1925:84,0:82,0:952:881:1
chr10 89624278 . G T 62.8836 PASS AF=0.0785393;AO=297;DP=4155;FAO=157;FDP=1999;FR=.;FRO=1842;FSAF=77;FSAR=80;FSRF=908;FSRR=934;FWDB=0.0113997;FXX=4.99998E-4;HRUN=1;LEN=1;MLLD=117.237;OALT=T;OID=.;OMAPALT=T;OPOS=89624278;OREF=G;PB=.;PBP=.;QD=0.12583;RBI=0.040843;REFB=5.39678E-4;REVB=-0.0392199;RO=3844;SAF=150;SAR=147;SRF=1936;SRR=1908;SSEN=0;SSEP=0;SSSB=0.00159791;STB=0.502301;STBP=0.96;TYPE=snp;VARB=-0.00676678;FUNC=[{'origPos':'89624278','origRef':'G','normalizedRef':'G','gene':'PTEN','normalizedPos':'89624278','normalizedAlt':'T','gt':'pos','codon':'TAG','coding':'c.52G>T','transcript':'NM_000314.4','function':'nonsense','protein':'p.Glu18Ter','location':'exonic','origAlt':'T','exon':'1'}] GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR:QT 0/1:62:4155:1999:3844:1842:297:157:0.0785393:147:150:1936:1908:80:77:908:934:1
description of output
header line
$1:$2 of line 1 - $1:$2 of line 2 SVTYPE=(value) oncomineGeneClass=(value) array a and array b match (value) READ_Count=(value)
lines that were nor processed