File2
is tab-delimeted
and I am trying to use $2
in file1
(space delimeted) as a search term in file2
. If it is found then the AF=
in and the FDP=
values from file2
are extracted and printed next to the file1
line. I commented the awk
before I added the lines in bold the current output resulted. I also can not figure out how to change the FDP=
to READS=
. Thank you :).
script.awk
BEGIN { OFS="\t" }
# Read $2 search terms from file1 into 's'
FNR==NR {
s[$2]
next
}
{ match($8,/'"'"'normalizedPos'"'"':'"'"'/, pos) # extract and read into pos value after :
}
{ match($8,/(AF=[^;]+).*(FDP=[^;]+)/, info) # extract and read into info value after AF= and FDP=
print $1":"pos[1],gene[1],info[2],info[1] ($1 in s ? s[$1] : "NOT DETECTED") # print desired output
}
awk -f script.awk file FS='\t' file2
file1 space delimited
AKT1 c.49G>A p.E17K
AKT1 c.155T>G p.L52R
APC c.4033G>T p.E1345*
EGFR c.2237_2255delAATTAAGAGAAGCAACATCinsT p.E746_S752delinsV
file2 tab-delimited
chr5 112175324 COSM18759 G T 5141.14 PASS AF=0.248124;AO=1844;DP=7607;FAO=496;FDP=1999;FDVR=10;FR=.;FRO=1503;FSAF=248;FSAR=248;FSRF=764;FSRR=739;FWDB=0.00324498;FXX=4.99998E-4;HRUN=1;HS_ONLY=0;LEN=1;MLLD=196.753;OALT=T;OID=COSM18759;OMAPALT=T;OPOS=112175324;OREF=G;PB=.;PBP=.;QD=10.2874;RBI=0.0620718;REFB=-6.9359E-4;REVB=-0.0619869;RO=5728;SAF=921;SAR=923;SRF=2898;SRR=2830;SSEN=0;SSEP=0;SSSB=-0.0082562;STB=0.506253;STBP=0.772;TYPE=snp;VARB=0.00209058;HS;FUNC=[{'origPos':'112175324','origRef':'G','normalizedRef':'G','gene':'APC','normalizedPos':'112175324','normalizedAlt':'T','gt':'pos','codon':'TAA','coding':'c.4033G>T','transcript':'NM_000038.5','function':'nonsense','protein':'p.Glu1345Ter','location':'exonic','origAlt':'T','exon':'16'}] GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR 0/1:5141:7607:1999:5728:1503:1844:496:0.248124:923:921:2898:2830:248:248:764:739
chr7 55242467 COSM23571;COSM6254;COSM12369;COSM12386;COSM6220;COSM12384 AATTAAGAGAAGCAACATC AATC,TATC,A,T 2127.65 PASS AF=0.0,0.0,0.0,0.139582;AO=0,0,0,0;DP=6091;FAO=0,0,0,274;FDP=1963;FDVR=10,10,10,10;FR=.,.,.,.;FRO=1689;FSAF=0,0,0,173;FSAR=0,0,0,101;FSRF=1017;FSRR=672;FWDB=-0.00907011,-0.0112182,-0.0151195,-0.0172792;FXX=0.0184999;HRUN=2,2,2,2;HS_ONLY=0;LEN=4,4,18,18;MLLD=539.227,630.185,613.658,627.795;OALT=-,T,-,T,-,-;OID=COSM23571,COSM12386,COSM6220,COSM12384,COSM6254,COSM12369;OMAPALT=AATC,TATC,A,T,AATC,AATC;OPOS=55242468,55242467,55242468,55242467,55242469,55242470;OREF=ATTAAGAGAAGCAAC,AATTAAGAGAAGCAAC,ATTAAGAGAAGCAACATC,AATTAAGAGAAGCAACATC,TTAAGAGAAGCAACA,TAAGAGAAGCAACAT;PB=.,.,.,.;PBP=.,.,.,.;QD=4.33551;RBI=0.00953953,0.0170328,0.0229596,0.0226107;REFB=0.00154032,-0.0127049,0.00689381,0.0103482;REVB=0.00295562,-0.0128166,-0.0172784,-0.0145834;RO=5942;SAF=0,0,0,0;SAR=0,0,0,0;SRF=3671;SRR=2271;SSEN=0,0,0,0;SSEP=0,0,0,0;SSSB=0,0,0,0;STB=0.5,0.5,0.5,0.526658;STBP=1.0,1.0,1.0,0.373;SUBSET=.,.,4,.;TYPE=complex,complex,del,del;VARB=0.0,0.0,0.0,-0.0673064;HS;FUNC=[{'origPos':'55242467','origRef':'AATTAAGAGAAGCAACATC','normalizedRef':'AATTAAGAGAAGCAACATC','gene':'EGFR','normalizedPos':'55242467','normalizedAlt':'T','gt':'pos','coding':'c.2237_2255delAATTAAGAGAAGCAACATCinsT','transcript':'NM_005228.3','function':'nonframeshiftBlockSubstitution','protein':'p.Glu746_Ser752delinsVal','location':'exonic','origAlt':'T','exon':'19'}] GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR 0/4:2127:6091:1963:5942:1689:0,0,0,0:0,0,0,274:0.0,0.0,0.0,0.139582:0,0,0,0:0,0,0,0:3671:2271:0,0,0,101:0,0,0,173:1017:672
current output
chr5: FDP=1999 AF=0.248124
chr7: FDP=1963 AF=0.0,0.0,0.0,0.139582
desiered
AKT1 c.49G>A p.E17K chr5:112175324 NOT DETECTED
AKT1 c.155T>G p.L52R chr7:55242467 NOT DETECTED
APC c.4033G>T p.E1345* READS=1999 AF=0.248124
EGFR c.2237_2255delAATTAAGAGAAGCAACATCinsT p.E746_S752delinsV READS=1963 AF=0.0,0.0,0.0,0.139582