awk to update value based on pattern match in another file

In the awk , thanks you @RavinderSingh13, for the help in below, hopefully it is close as I am trying to update the value in $12 of the tab-delimeted file2 with the matching value in $1 of the space delimeted file1 . I have added comments for each line as well. Thank you :).

awk

awk '$12 == /NM_/{                                              # search $12 for pattern NM_
            match($12,/p..*/);                                  # using match the regex will match in $12 from the p. to the end
            VAL=substr($12,RSTART+1,RLENGTH-2);                 # Putting the values of subtring whose value starts from RSTART+1 to RSTART-2
            for(i=1;i<=num;i++){                                # Starting a loop which will start from value 1 of variable i to till value of variable num
            awk 'FNR==NR {a[$1]=$2; next} a[$i]{VAL=a[$i]}      # Store $2 value from file1 in array a and update array i matching VAL
                }                                               # close block
                    next                                        # process next line
                    }1' OFS="\t" file1 FS'\t' file2             # define file1 as space delimited and file2 as tab-delimited with the output also being tab-delimited

file1

C Cys
D Asp
V Val
W Trp
Y Tyr

file2

R_Index	Chr	Start	End	Ref	Alt	Func.refGene	Gene.refGene	GeneDetail.refGene	Inheritence	ExonicFunc.refGene	AAChange.refGene	avsnp147
1	chr1	948846	948846	-	A	upstream	ISG15	dist=1	.	.	.	rs3841266
2	chr1	948870	948870	C	G	UTR5	ISG15	NM_005101:c.-84C>G	.	.	.	rs4615788
3	chr1	948921	948921	T	C	UTR5	ISG15	NM_005101:c.-33T>C	.	.	.	rs15842
4	chr1	949597	949597	C	T	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.237C>T:p.D79D	rs61766284
5	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.V98V	rs8997
6	chr1	1269554	1269554	T	C	exonic	TAS1R3	.	.	nonsynonymous SNV	TAS1R3:NM_152228:exon6:c.2269T>C:p.C757R	rs307377

desired output the 3 lines in bold are updated because the one-letter code before and after the digit matched $1 in file1 , so it is updated to the three-letter code in $2 .

R_Index	Chr	Start	End	Ref	Alt	Func.refGene	Gene.refGene	GeneDetail.refGene	Inheritence	ExonicFunc.refGene	AAChange.refGene	avsnp147
1	chr1	948846	948846	-	A	upstream	ISG15	dist=1	.	.	.	rs3841266
2	chr1	948870	948870	C	G	UTR5	ISG15	NM_005101:c.-84C>G	.	.	.	rs4615788
3	chr1	948921	948921	T	C	UTR5	ISG15	NM_005101:c.-33T>C	.	.	.	rs15842
4	chr1	949597	949597	C	T	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp	rs61766284
5	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.Val98Val	rs8997
6	chr1	1269554	1269554	T	C	exonic	TAS1R3	.	.	nonsynonymous SNV	TAS1R3:NM_152228:exon6:c.2269T>C:p.Cys757Arg	rs307377

To get you started

awk '
  # BEGIN runs before any of the input files is opened
  BEGIN { FS=OFS="\t" }
  # The input files are processed one by one and the following code runs for each line
  # FNR is equal to NR when processing file1
  # a[ ] is indexed by the one letter code, its value is the three letter code
  FNR==NR { a[$1]=$2; next }
  # The next goes to the next input cycle
  # The following code runs for file2 (and further files)
  $12 ~ /:NM_/{                                                 # search $12 for pattern :NM_
            match($12,/p..*/)                                   # using match the regex will match in $12 from the p. to the 
            VAL=substr($12,RSTART+1,RLENGTH-2)                  # Put the values of substring whose value starts from RSTART+1 to RSTART-2
  # more to come
  }
  { print }
' file1 file2
1 Like

Both awk command below execute but I do not get the intended result:

I added some more comments to both as well. Thank you :).

awk 1

awk '
  # BEGIN runs before any of the input files is opened
  BEGIN { FS=OFS="\t" }
  # The input files are processed one by one and the following code runs for each line
  # FNR is equal to NR when processing file1
  # a[ ] is indexed by the one letter code, its value is the three letter code
  FNR==NR { a[$1]=$2; next }
  # The next goes to the next input cycle
  # The following code runs for file2 (and further files)
  $12 ~ /:NM_/{                                                 # search $12 for pattern :NM_
            match($12,/p..*/)                                   # using match the regex will match in $12 from the p. to the 
            VAL=substr($12,RSTART+1,RLENGTH-2)                  # Put the values of substring whose value starts from RSTART+1 to RSTART-2
  # update one letter code to three letter by storing the value of $12 in array then updating if it matches file1
  { if(a[$12]){$12=a[$12] };
  }
  print}' file1 file2 > out

out

4	chr1	949597	949597	C	T	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.237C>T:p.D79D	rs61766284
5	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.V98V	rs8997

awk 2

# store value of $2 in file1 in array A and update each sub-patter of p. matching $12 in file2
awk 'FNR==NR {A[$1]=$2; next}  $12 in A {sub ($12, $12 "p." A[$12]) }1' OFS="\t" file1 FS='\t' file2 > out

out

R_Index	Chr	Start	End	Ref	Alt	Func.refGene	Gene.refGene	GeneDetail.refGene	Inheritence	ExonicFunc.refGene	AAChange.refGene	avsnp147
1	chr1	948846	948846	-	A	upstream	ISG15	dist=1	.	.	.	rs3841266
2	chr1	948870	948870	C	G	UTR5	ISG15	NM_005101:c.-84C>G	.	.	.	rs4615788
3	chr1	948921	948921	T	C	UTR5	ISG15	NM_005101:c.-33T>C	.	.	.	rs15842
4	chr1	949597	949597	C	T	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.237C>T:p.D79D	rs61766284
5	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.V98V	rs8997

You need to cycle through each character, as you maybe intended in post#1; you need substr() to cut out a single character.

awk '
  BEGIN { OFS="\t" }
  # The input files are processed one by one and the following code runs for each line
  # FNR is equal to NR when processing file1
  # a[ ] is indexed by the one letter code, its value is the three letter code
  FNR==NR { a[$1]=$2; next }
  # The next goes to the next input cycle
  # The following code runs for file2 (and further files)
  ($12 ~ /:NM_/ && match($12,/p..*/)) {  # search for :NM_ and p..*
    # Get the substring after p.
    VAL=substr($12,RSTART+2)
    # Get its length
    lenVAL=length(VAL)
    ostring=""
    # Cycle through each character, append to ostring, if in a[ ] replace by its value
    for (i=1; i<=lenVAL; i++) {
      c=substr(VAL,i,1)
      ostring=(ostring ((c in a) ? a[c] : c))
    }
    # copy ostring back to $12 (unconditionally), retaining the part up to p.
    $12=(substr($12,1,RSTART+1) ostring)
  }
  # always print
  { print }
' file1 FS="\t" file2

I have reverted to define FS after reading file1, because file1 might not be TAB-separated.

1 Like

Thank you very much :).

---------- Post updated 09-12-17 at 07:45 AM ---------- Previous update was 09-11-17 at 05:19 PM ----------

A line with multiple NM_ values in $12 seperated by a ; seems to chage a matching c. as well. I have tried adding in ; as a FS , but that splits eaach into multiple tabs in the output. I also added tried a break after the ostring=(ostring ((c in a) ? a[c] : c)) , thinking that would process each, then break, and loop to the next... but that only processed one and then stopped. Maybe I added it in the wrong place or is there a better way? Thank you :slight_smile:

line

R_Index	Chr	Start	End	Ref	Alt	Func.refGene	Gene.refGene	GeneDetail.refGene	Inheritence	ExonicFunc.refGene	AAChange.refGene	avsnp147
1	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.V98V;ISG15:NM_005101:exon2:c.237C>T:p.D79D	rs8997

current output

R_Index	Chr	Start	End	Ref	Alt	Func.refGene	Gene.refGene	GeneDetail.refGene	Inheritence	ExonicFunc.refGene	AAChange.refGene	avsnp147
1	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.Val98Val;ISG15:NM_005101:exon2:c.237Cys>T:p.Asp79Asp	rs8997

Hello cmccabe,

Could you please try following and let me know if this helps you.

awk 'FNR==NR{b[$1]=$2;next} {if($13 ~ /NM_/){va=$13;match($13,/.*\./);sub(/.*\./,"",va);num1=split(va,c,"");for(i=1;i<=num1;i++){val=c in b?val b[c]:val c};$13=substr($13,RSTART,RLENGTH) val;val=""}} 1' OFS="\t" FILE1 FILE2

EDIT: Adding a non-one liner form of solution too now.

awk '
FNR==NR{
  b[$1]=$2;
  next
}
{
  if($13 ~ /NM_/){
     va=$13;
     match($13,/.*\./);
     sub(/.*\./,"",va);
     num1=split(va,c,"");
     for(i=1;i<=num1;i++){
         val=c in b?val b[c]:val c
};
  $13=substr($13,RSTART,RLENGTH) val;
  val=""
}
}
1' OFS="\t" FILE1 FILE2
 

Thanks,
R. Singh

1 Like

Im not sure I follow completly, but it is close... line 3 is the multiple NM_ , but only the second p . looks to be updated. Can you add comments if possible? Thank you :).

1	chr1	949597	949597	C	T	exonic	ISG15	.	.	synonymous	SNV	ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp	rs61766284
2	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous	SNV	ISG15:NM_005101:exon2:c.294A>G:p.Val98Val	rs8997
3	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous	SNV	ISG15:NM_005101:exon2:c.294A>G:p.V98V;ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp	rs8997

Hello cmccabe,

In your post#5 you haven't mentioned that we need to change p.D79D after : , so I had written my logic to change D79D not before DOT, could you please post exact output you need so that I could try to help you.

Thanks,
R. Singh

1 Like

I apologize and below is the complete file2 is below, it is tab-delimeted and $12 may have multiple (up to 30) NM_ in it. In this file2 line 6 has multiple NM_ in $12 seperated by a ; . All the matching p. and both p. are updated if it matches file1 . So using line 6 as an example the V in p.V98V and D in p.D79D .
The last thing I am going to do, though I have not thought that far ahead yet :)... is since the V in p.V98V and D in p.D79D are the same, that is the letter before and after the digit are the same, print p.= , but only if the letters before and after the digit are the same. So in this case line 6 would look like

6	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.=;ISG15:NM_005101:exon2:c.237C>T:p.=

this is the ultimate goal but the last step and maybe it is easier to do at a later time. Thanks :).

file2

R_Index	Chr	Start	End	Ref	Alt	Func.refGene	Gene.refGene	GeneDetail.refGene	Inheritence	ExonicFunc.refGene	AAChange.refGene	avsnp147
1	chr1	948846	948846	-	A	upstream	ISG15	dist=1	.	.	.	rs3841266
2	chr1	948870	948870	C	G	UTR5	ISG15	NM_005101:c.-84C>G	.	.	.	rs4615788
3	chr1	948921	948921	T	C	UTR5	ISG15	NM_005101:c.-33T>C	.	.	.	rs15842
4	chr1	949597	949597	C	T	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.237C>T:p.D79D	rs61766284
5	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.V98V	rs8997
6	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.V98V;ISG15:NM_005101:exon2:c.237C>T:p.D79D	rs8997

desired output

R_Index	Chr	Start	End	Ref	Alt	Func.refGene	Gene.refGene	GeneDetail.refGene	Inheritence	ExonicFunc.refGene	AAChange.refGene	avsnp147
1	chr1	948846	948846	-	A	upstream	ISG15	dist=1	.	.	.	rs3841266
2	chr1	948870	948870	C	G	UTR5	ISG15	NM_005101:c.-84C>G	.	.	.	rs4615788
3	chr1	948921	948921	T	C	UTR5	ISG15	NM_005101:c.-33T>C	.	.	.	rs15842
4	chr1	949597	949597	C	T	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp	rs61766284
5	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.Val98Val	rs8997
6	chr1	949654	949654	A	G	exonic	ISG15	.	.	synonymous SNV	ISG15:NM_005101:exon2:c.294A>G:p.Val98Val;ISG15:NM_005101:exon2:c.237C>T:p.Asp79Asp	rs8997

Have you made any progress?
Here is my attempt.
I have added another loop over the ";"-separated parts in $12.
In each part it still cycles through all characters after "p.".

awk '
  BEGIN { OFS="\t" }
  # The input files are processed one by one and the following code runs for each line
  # FNR is equal to NR when processing file1
  # a[ ] is indexed by the one letter code, its value is the three letter code
  FNR==NR { a[$1]=$2; next }
  # The next goes to the next input cycle
  # The following code runs for file2 (and further files)
  $12 ~ /:NM_/ {
    ostring=""
    # split $12 by ";" and cycle through them
    nNM=split($12,NM,";")
    for (n=1; n<=nNM; n++) {
      if (n>1) ostring=(ostring ";") # append ";"
      if (match(NM[n],/p[.].*/)) {
        # copy up to "p."
        ostring=(ostring substr(NM[n],1,RSTART+1))
        # Get the substring after "p."
        VAL=substr(NM[n],RSTART+2)
        # Get its length
        lenVAL=length(VAL)
        # Cycle through each character, append to ostring, if in a[ ] replace by its value
        for (i=1; i<=lenVAL; i++) {
          c=substr(VAL,i,1)
          ostring=(ostring ((c in a) ? a[c] : c))
        }
      } else {
        # append the unchanged string
        ostring=(ostring NM[n])   
      }
    }
    # copy ostring back to $12 (unconditionally)
    $12=ostring
  }
  # always print
  { print }
' file1 FS="\t" file2
1 Like

I was having no luck and now I see why (or think I do anyway).... I was spiting but not cycling though and appending the new value. Thank you very much :).