awk to match file1 and extract specific tag values

cmccabe · January 2, 2019, 4:11pm

File2 is tab-delimeted and I am trying to use $2 in file1 (space delimeted) as a search term in file2 . If it is found then the AF= in and the FDP= values from file2 are extracted and printed next to the file1 line. I commented the awk before I added the lines in bold the current output resulted. I also can not figure out how to change the FDP= to READS= . Thank you :).

script.awk

BEGIN { OFS="\t" }
    # Read $2 search terms from file1 into 's'
      FNR==NR {
          s[$2]
          next
}
    { match($8,/'"'"'normalizedPos'"'"':'"'"'/, pos)  # extract and read into pos value after :
    }
     { match($8,/(AF=[^;]+).*(FDP=[^;]+)/, info) # extract and read into info value after AF= and FDP=
      print $1":"pos[1],gene[1],info[2],info[1] ($1 in s ? s[$1] : "NOT DETECTED")  # print desired output
     }
	 
awk -f script.awk file FS='\t' file2

file1  space delimited

AKT1 c.49G>A p.E17K
AKT1 c.155T>G p.L52R
APC c.4033G>T p.E1345*
EGFR c.2237_2255delAATTAAGAGAAGCAACATCinsT p.E746_S752delinsV

file2  tab-delimited

chr5	112175324	COSM18759	G	T	5141.14	PASS	AF=0.248124;AO=1844;DP=7607;FAO=496;FDP=1999;FDVR=10;FR=.;FRO=1503;FSAF=248;FSAR=248;FSRF=764;FSRR=739;FWDB=0.00324498;FXX=4.99998E-4;HRUN=1;HS_ONLY=0;LEN=1;MLLD=196.753;OALT=T;OID=COSM18759;OMAPALT=T;OPOS=112175324;OREF=G;PB=.;PBP=.;QD=10.2874;RBI=0.0620718;REFB=-6.9359E-4;REVB=-0.0619869;RO=5728;SAF=921;SAR=923;SRF=2898;SRR=2830;SSEN=0;SSEP=0;SSSB=-0.0082562;STB=0.506253;STBP=0.772;TYPE=snp;VARB=0.00209058;HS;FUNC=[{'origPos':'112175324','origRef':'G','normalizedRef':'G','gene':'APC','normalizedPos':'112175324','normalizedAlt':'T','gt':'pos','codon':'TAA','coding':'c.4033G>T','transcript':'NM_000038.5','function':'nonsense','protein':'p.Glu1345Ter','location':'exonic','origAlt':'T','exon':'16'}]	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR	0/1:5141:7607:1999:5728:1503:1844:496:0.248124:923:921:2898:2830:248:248:764:739
chr7	55242467	COSM23571;COSM6254;COSM12369;COSM12386;COSM6220;COSM12384	AATTAAGAGAAGCAACATC	AATC,TATC,A,T	2127.65	PASS	AF=0.0,0.0,0.0,0.139582;AO=0,0,0,0;DP=6091;FAO=0,0,0,274;FDP=1963;FDVR=10,10,10,10;FR=.,.,.,.;FRO=1689;FSAF=0,0,0,173;FSAR=0,0,0,101;FSRF=1017;FSRR=672;FWDB=-0.00907011,-0.0112182,-0.0151195,-0.0172792;FXX=0.0184999;HRUN=2,2,2,2;HS_ONLY=0;LEN=4,4,18,18;MLLD=539.227,630.185,613.658,627.795;OALT=-,T,-,T,-,-;OID=COSM23571,COSM12386,COSM6220,COSM12384,COSM6254,COSM12369;OMAPALT=AATC,TATC,A,T,AATC,AATC;OPOS=55242468,55242467,55242468,55242467,55242469,55242470;OREF=ATTAAGAGAAGCAAC,AATTAAGAGAAGCAAC,ATTAAGAGAAGCAACATC,AATTAAGAGAAGCAACATC,TTAAGAGAAGCAACA,TAAGAGAAGCAACAT;PB=.,.,.,.;PBP=.,.,.,.;QD=4.33551;RBI=0.00953953,0.0170328,0.0229596,0.0226107;REFB=0.00154032,-0.0127049,0.00689381,0.0103482;REVB=0.00295562,-0.0128166,-0.0172784,-0.0145834;RO=5942;SAF=0,0,0,0;SAR=0,0,0,0;SRF=3671;SRR=2271;SSEN=0,0,0,0;SSEP=0,0,0,0;SSSB=0,0,0,0;STB=0.5,0.5,0.5,0.526658;STBP=1.0,1.0,1.0,0.373;SUBSET=.,.,4,.;TYPE=complex,complex,del,del;VARB=0.0,0.0,0.0,-0.0673064;HS;FUNC=[{'origPos':'55242467','origRef':'AATTAAGAGAAGCAACATC','normalizedRef':'AATTAAGAGAAGCAACATC','gene':'EGFR','normalizedPos':'55242467','normalizedAlt':'T','gt':'pos','coding':'c.2237_2255delAATTAAGAGAAGCAACATCinsT','transcript':'NM_005228.3','function':'nonframeshiftBlockSubstitution','protein':'p.Glu746_Ser752delinsVal','location':'exonic','origAlt':'T','exon':'19'}]	GT:GQ:DP:FDP:RO:FRO:AO:FAO:AF:SAR:SAF:SRF:SRR:FSAR:FSAF:FSRF:FSRR	0/4:2127:6091:1963:5942:1689:0,0,0,0:0,0,0,274:0.0,0.0,0.0,0.139582:0,0,0,0:0,0,0,0:3671:2271:0,0,0,101:0,0,0,173:1017:672

current output

chr5:		FDP=1999	AF=0.248124
chr7:		FDP=1963	AF=0.0,0.0,0.0,0.139582

desiered

AKT1 c.49G>A p.E17K	chr5:112175324 	NOT DETECTED
AKT1 c.155T>G p.L52R	chr7:55242467	NOT DETECTED
APC c.4033G>T p.E1345*	READS=1999	AF=0.248124
EGFR c.2237_2255delAATTAAGAGAAGCAACATCinsT p.E746_S752delinsV	READS=1963	AF=0.0,0.0,0.0,0.139582

MadeInGermany · January 3, 2019, 10:41am

Your first error is that '"'"' is the sequence to have a literal ' within a 'string' in the shell.
You need it when you have an embedded awk code in a shell script or shell command.
Within a pure awk script it is simply a '

MadeInGermany · January 4, 2019, 4:42am

I think you want a lookup table from file1: $1 is the lookup key and $2 the returned value.

# in file 1 store the $2 values in s[$1] (s[] indexed by $1)
FNR==NR {
  s[$1]=$2
  next
}

Now you can test existence with (key in s) or get the value with s[key] or loop through the keys with for (key in s) .

I think the following captures the wanted values in file2:

{
  match($8,/'normalizedPos':'([^']+)'/, pos)  # extract value after : into pos[]
  match((";" $8),/;(AF=[^;]+).*;(FDP=[^;]+)/, info) # extract values after AF= and FDP= into info[]
  match($8,/'gene':'([^']+)'/,gene) # extract the value after gene into gene[]
  print "pos[1]="pos[1], "info[1]="info[1], "info[2]="info[2], "gene[1]="gene[1]
}

Note1: the target array as 3rd argument is an extension in GNU-awk. (Maybe you want to consider perl? Then it works on all platforms that have perl.)
Note2: for a most exact string match use a "border character". I have even prepended a ";" to the input string for match() in order to ensure it exists even at the beginning.

durden_tyler · January 4, 2019, 12:19pm

If Perl is an option, then here's a sample program:

$
$ cat -n process_files.pl
     1  #!/usr/bin/perl
     2  use strict;
     3
     4  # Set the file names
     5  my $search_file = "file1";
     6  my $data_file = "file2";
     7  my $temp_file = "file1.tmp";
     8
     9  # $sf = The search file; the file from where the search terms are read.
    10  # $df = The data file; the file to be scanned with the search term to fetch
    11  #       values of AF and FDP
    12  # $tf = The temporary file; where the final results are stored temporarily.
    13  #       After all processing is done, you can either move or copy the temp file
    14  #       to the search file. This is much safer than in-place editing of search
    15  #       file.
    16  open(my $sf, "<", $search_file) or die "Can't open $search_file: $!";
    17  open(my $df, "<", $data_file) or die "Can't open $data_file: $!";
    18  open(my $tf, ">", $temp_file) or die "Can't open $temp_file: $!";
    19
    20  # Loop through the search file
    21  while (<$sf>) {
    22      chomp(my $line1 = $_);
    23      # For each line in the search file, extract the search term, which is
    24      # the second "word" from left.
    25      (my $search_term) = $line1 =~ /^\S+\s+(\S+).*$/;
    26
    27      # Initialize AF and FDP to zero-length strings. If the search term is
    28      # not found in data file, we use this fact to print "NOT DETECTED".
    29      my ($af, $fdp) = ("", "");
    30
    31      # Now loop through the data file looking for the search term.
    32      while (<$df>) {
    33          chomp(my $line2 = $_);
    34          # If the search term was found in a particular line of data file,
    35          # set AF and FDP values and stop processing the data file.
    36          if ($line2 =~ /$search_term/) {
    37              ($af, $fdp) = $line2 =~ /AF=(.*?);.*FDP=(.*?);/;
    38              last;
    39          }
    40      }
    41
    42      # If AF and FDP were set, print their values, else "NOT DETECTED".
    43      if ($af eq "" and $fdp eq "") {
    44          print $tf "$line1 NOT DETECTED\n";
    45      } else {
    46          print $tf "$line1 READS=[$fdp] AF=[$af]\n";
    47      }
    48
    49      # Now rewind; set the pointer back to the beginning of data file.
    50      # For the next search term, we will start searching from the top of the
    51      # data file.
    52      seek($df, 0, 0);
    53  }
    54
    55  close($sf) or die "Can't close $search_file: $!";
    56  close($df) or die "Can't close $data_file: $!";
    57  close($tf) or die "Can't close $temp_file: $!";
    58
    59  # At this point, we have the original file "file1" and a temp file "file1.tmp"
    60  # with the desired output. The following statements retain a backup of the
    61  # original file.
    62  my $sf_orig = "file1.orig";
    63  rename($search_file, $sf_orig) or die "Can't rename $search_file to $sf_orig: $!";
    64  rename($temp_file, $search_file) or die "Can't rename $temp_file to $search_file: $!";
    65

$
$ perl process_files.pl
$
$ cat file1
AKT1 c.49G>A p.E17K NOT DETECTED
AKT1 c.155T>G p.L52R NOT DETECTED
APC c.4033G>T p.E1345* READS=[1999] AF=[0.248124]
EGFR c.2237_2255delAATTAAGAGAAGCAACATCinsT p.E746_S752delinsV READS=[1963] AF=[0.0,0.0,0.0,0.139582]

$
$

cmccabe · January 5, 2019, 8:52am

Thank you both for the help :).

Don_Cragun · January 8, 2019, 3:22pm

Hi cmccabe ,
In post #1, you said that the desired output from your two sample input files is:

AKT1 c.49G>A p.E17K	chr5:112175324 	NOT DETECTED
AKT1 c.155T>G p.L52R	chr7:55242467	NOT DETECTED
APC c.4033G>T p.E1345*	READS=1999	AF=0.248124
EGFR c.2237_2255delAATTAAGAGAAGCAACATCinsT p.E746_S752delinsV	READS=1963	AF=0.0,0.0,0.0,0.139582

Given that there is no record in file2 that contains the file1 field 2 values as a coding value in the single-quoted sub-subfields of field 8 in file2 why were the values show in red in the above output selected as the desired output? (I note that the output produced by the perl script durden_tyler suggested omits that field from the output and you didn't comment on that discrepancy.)

Other than having no idea how the 3rd output field in the first two records of your desired output is supposed to be selected, I think I can come up with an awk script that does what you want just using standard awk features (without the gawk extensions your current code is trying to use).

cmccabe · January 12, 2019, 9:39am

@Don Cragun the fields in red are a typo on my part (they are not needed). I apologize and thank you :). A standard awk would be great as I was trying but could not figure it out.

Don_Cragun · January 12, 2019, 3:47pm

Hi cmccabe,
One might try:

#!/bin/ksh
awk -v sq="'" '	# Externally set a variable to a single quote character for use
		# in the script.
# Set initial subfield and sub-subfield search values and set the output file
# field separator:
BEGIN {	sf8AF = "^AF="
	sf8READS = "^FDP="	# Subfield name FDP will be changed to READS
				# when we save the field in the READS[] array.
	ssf8coding = sq "coding" sq
	OFS = "\t"
}

# Gather search terms from the "coding" sub-subfield of the last subfield of
# field 8 in the second operand input file (default file2).  Fields in this file
# are separated by tabs.  Subfields in field 8 are separated by semicolons and
# (except for the last subfield) are of form:
#	name=value
# The last subfield of field 8 is a list sub-subfields separated by commas and
# are of the form:
#	'name':'value'
#
# The value associated with the "coding" sub-subfield will be used as the index
# into the arrays AF[] and READS[].  The AF[] array values will be collected
# from the AF subfield associated with this record and the READS[] array values
# will be collected from the FDP subfield associated with this record with the
# name of that field changed from "FDP" to "READS".
FNR == NR {
	# Split field 8 subfields into array f8sf[] keeping number of subfields
	# found in nf8sf:
	nf8sf = split($8, f8sf, /;/)

	# And split the last field 8 subfield into sub-subfields into array
	# s8ssf[] keeping the number of sub-subfields found in nf8ssf with odd
	# elements being a single-quoted sub-subfield name and even elements
	# being the corresponding single-quoted sub-subfield value:
	nf8ssf = split(f8sf[nf8sf], f8ssf, /[:,]/)

	# Look for the sub-subfield with name "coding":
	for(i = 1; i < nf8ssf; i += 2)
		if(f8ssf == ssf8coding) {
			# Coding sub-subfield found.  Save the value for this
			# sub-subfield (without the surrounding single-quotes)
			# as the index for the AP[] and READS[] arrays:
			key = substr(f8ssf[i + 1], 2, length(f8ssf[i + 1]) - 2)
			break
		}
	if(i > nf8ssf) {
		# No coding sub-subfield was found:
		print "WARNING: No coding sub-subfield found in record #" NR
		next
	}

	# We found a coding sub-subfield...  Look for field 8 subfields that
	# have AF and FDP field names.  If found save AF=... and READS=...
	# output field values.  Default values will be set to "none" in case no
	# matching subfields are found:
	AF[key] = "AF=none"
	READS[key] = "READS=none"
	AFfound = READSfound = 0
	for(i = 1; i < nf8sf && !(AFfound && READSfound); i++)
		if(f8sf ~ sf8AF) {
			AF[key] = f8sf
			AFfound = 1
		} else if(f8sf ~ sf8READS) {
			READS[key] = f8sf
			READSfound = 1
			sub(/FDP/, "READS", READS[key])
		}
	next
}

# Process records from the file named by the first operand (default: file1):
{	if($2 in AF)
		print $0, READS[$2], AF[$2]
	else	print $0, "NOT DETECTED"
}' FS='\t' "${2:-file2}" FS=' ' "${1:-file1}"

This was written and tested using a Korn shell on macOS Mojave on a MacBook Pro.

It should work with any shell that performs standard POSIX shell variable expansions. I hope the comments help explain what it is doing. If something isn't clear, ask...

If you want to try this on a Solaris/SunOS system, change awk to /usr/xpg4/bin/awk or nawk .

When invoked with no operands in a directory containing the sample file1 and file2 that you provided, it produces the output:

AKT1 c.49G>A p.E17K	NOT DETECTED
AKT1 c.155T>G p.L52R	NOT DETECTED
APC c.4033G>T p.E1345*	READS=1999	AF=0.248124
EGFR c.2237_2255delAATTAAGAGAAGCAACATCinsT p.E746_S752delinsV	READS=1963	AF=0.0,0.0,0.0,0.139582

which seems to match what you want.